将dplyr连接语法转换为纯数据表语法 [英] convert dplyr join syntax into pure data.table syntax
问题描述
我正在学习data.table。我无法转换dplyr连接语法。您能否推荐以下测试用例的data.table等价?
library(data.table)
库(dplyr)
dtProduct< - data.table(
ProductID = c(6,33,17,88,44,51),
ProductName = c ,Helmet,Gloves,Towel,Chair,Detergent),
Price = c(25,60,10,7.5,135,16),
key = 'ProductID'
)
set.seed(20141216)
dtOrder< - data.table(
OrderID = sample(1001:9999,12),
CustomerID = sample(271:279,12,replace = TRUE),
#注意:一些不存在的ProductID有意引入
ProductID = sample(c(dtProduct [,ProductID] 439),12,replace = TRUE),
Qty = sample(1:3,12,replace = TRUE),
key ='OrderID'
)
$ b b。 tables()
NAME NROW NCOL MB COLS KEY
[1,] dtOrder 12 4 1 OrderID,CustomerID,ProductID,Qty OrderID
[2,] dtProduct 6 3 1 ProductID,ProductName,Price ProductID
> dtProduct
ProductID ProductName价格
1:6衬衫25.0
2:17手套10.0
3:33头盔60.0
4:44椅子135.0
5: 51洗涤剂16.0
6:88毛巾7.5
> dtOrder
OrderID CustomerID ProductID数量
1:1651 275 6 3
2:2726 272 88 2
3:3079 275 88 2
4:3168 274 17 1
5:4816 277 88 1
6:4931 278 51 1
7:5134 274 439 2
8:5265 272 33 3
9:7702 275 33 2
10:7727 279 155 2
11:8412 273 88 2
12:9130 271 17 3
Case1:显示订单明细,隐藏不匹配的ProductID
dtOrder%>%
inner_join(dtProduct,by =ProductID)%>%
transfute(OrderID,ProductID,ProductName,Qty,Price,ExtPrice = Qty * Price)
OrderID产品编号产品名称数量价格价格
1 1651 6衬衫3 25.0 75.0
2 3168 17手套1 10.0 10.0
3 9130 17手套3 10.0 30.0
4 5265 33头盔3 60.0 180.0
5 7702 33头盔2 60.0 120.0
6 4931 51洗涤剂1 16.0 16.0
7 2726 88毛巾2 7.5 15.0
8 3079 88毛巾2 7.5 15.0
9 4816 88毛巾1 7.5 7.5
10 8412 88毛巾2 7.5 15.0
Case2:显示订单详情,包括不匹配的ProductID
dtOrder%>%
left_join(dtProduct,by =ProductID)%>%
transfute(OrderID,ProductID,ProductName,Qty,Price,ExtPrice = Qty * Price)
OrderID ProductID ProductName
1 1651 6衬衫3 25.0 75.0
2 3168 17手套1 10.0 10.0
3 9130 17手套3 10.0 30.0
4 5265 33头盔3 60.0 180.0
5 7702 33头盔2 60.0 120.0
6 4931 51洗涤剂1 16.0 16.0
7 2726 88毛巾2 7.5 15.0
8 3079 88毛巾2 7.5 15.0
9 4816 88毛巾1 7.5 7.5
10 8412 88毛巾2 7.5 15.0
11 7727 155 NA 2 NA NA
12 5134 439 NA 2 NA NA
情况3:显示订单错误(只有不匹配的产品ID)
dtOrder%>%
left_join(dtProduct,by =ProductID)%>%
过滤器(is.na(ProductName))%>%
select OrderID,ProductID,ProductName,Qty)
OrderID ProductID ProductName数量
1 7727 155 NA 2
2 5134 439 NA 2
Case4:按ProductID排序的多种汇总,按TotalSales递减排序结果
dtOrder%>%
inner_join(dtProduct,by =ProductID)%>%
group_by(ProductID)%>%
(OrderCount = n(),TotalQty = sum(Qty),TotalSales = sum(Qty * Price))%>%
arrange(desc(TotalSales))
ProductID OrderCount TotalQty TotalSales
1 33 2 5 300.0
2 6 1 3 75.0
3 88 4 7 52.5
4 17 2 4 40.0
5 51 1 1 16.0
案例5:按产品ID列出不同总计,按总销售额递减排序结果
- NOTE1:此时ProductName与ProductID一起显示
-
按降序TotalSales不再工作(BUG?)
dtOrder%>%
inner_join(dtProduct,by = ProductID)%>%
group_by(ProductID,ProductName)%>%
summarize(OrderCount = n(),TotalQty = sum(Qty),TotalSales = sum %b&b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b b a b $ b 3 33头盔2 5 300.0
4 51洗涤剂1 1 16.0
5 88毛巾4 7 52.5
非常感谢您提供任何帮助。
setkey(dtOrder,ProductID)
(1-2)
#这将是你写的
pre>
dtProduct [dtOrder,
list(OrderID,ProductID,ProductName,Qty,Price,ExtPrice = Qty * Price),
nomatch = 0或者省略这个获得(2)
]
#但我认为你最好用这个
dtProduct [dtOrder] [,ExtPrice:= Qty * Price] []
(3)
#你可以再次采取字面的方向:
dtProduct [dtOrder] [!is.na(ProductName)] [,
list(OrderID,ProductID,ProductName,Qty)]
#但我再次认为你会更好的
dtOrder [!dtProduct]
(4-5)
dtProduct [dtOrder,nomatch = 0] [,
list(OrderCount = .N,TotalQty = sum(Qty),TotalSales = sum(Qty * Price)),
by = list(ProductID,ProductName)] [
order(-TotalSales)]
I am learning data.table. I have difficulty converting the dplyr join syntax. Can you please recommend the data.table equivalence for the following test cases?
library(data.table) library(dplyr) dtProduct <- data.table( ProductID = c(6, 33, 17, 88, 44, 51), ProductName= c("Shirt", "Helmet", "Gloves", "Towel", "Chair", "Detergent"), Price= c(25, 60, 10, 7.5, 135, 16), key = 'ProductID' ) set.seed(20141216) dtOrder <- data.table( OrderID = sample(1001:9999, 12), CustomerID = sample(271:279, 12, replace=TRUE), # NOTE: some non-existent ProductID intentionally introduced ProductID = sample(c(dtProduct[, ProductID], 155, 439), 12, replace=TRUE), Qty = sample(1:3, 12, replace=TRUE), key = 'OrderID' ) > tables() NAME NROW NCOL MB COLS KEY [1,] dtOrder 12 4 1 OrderID,CustomerID,ProductID,Qty OrderID [2,] dtProduct 6 3 1 ProductID,ProductName,Price ProductID > dtProduct ProductID ProductName Price 1: 6 Shirt 25.0 2: 17 Gloves 10.0 3: 33 Helmet 60.0 4: 44 Chair 135.0 5: 51 Detergent 16.0 6: 88 Towel 7.5 > dtOrder OrderID CustomerID ProductID Qty 1: 1651 275 6 3 2: 2726 272 88 2 3: 3079 275 88 2 4: 3168 274 17 1 5: 4816 277 88 1 6: 4931 278 51 1 7: 5134 274 439 2 8: 5265 272 33 3 9: 7702 275 33 2 10: 7727 279 155 2 11: 8412 273 88 2 12: 9130 271 17 3
Case1: Show Order Details, no-match ProductID are hidden
dtOrder %>% inner_join(dtProduct, by="ProductID") %>% transmute(OrderID, ProductID, ProductName, Qty, Price, ExtPrice=Qty*Price) OrderID ProductID ProductName Qty Price ExtPrice 1 1651 6 Shirt 3 25.0 75.0 2 3168 17 Gloves 1 10.0 10.0 3 9130 17 Gloves 3 10.0 30.0 4 5265 33 Helmet 3 60.0 180.0 5 7702 33 Helmet 2 60.0 120.0 6 4931 51 Detergent 1 16.0 16.0 7 2726 88 Towel 2 7.5 15.0 8 3079 88 Towel 2 7.5 15.0 9 4816 88 Towel 1 7.5 7.5 10 8412 88 Towel 2 7.5 15.0
Case2: Show Order Details, INCLUDING no-match ProductID
dtOrder %>% left_join(dtProduct, by="ProductID") %>% transmute(OrderID, ProductID, ProductName, Qty, Price, ExtPrice=Qty*Price) OrderID ProductID ProductName Qty Price ExtPrice 1 1651 6 Shirt 3 25.0 75.0 2 3168 17 Gloves 1 10.0 10.0 3 9130 17 Gloves 3 10.0 30.0 4 5265 33 Helmet 3 60.0 180.0 5 7702 33 Helmet 2 60.0 120.0 6 4931 51 Detergent 1 16.0 16.0 7 2726 88 Towel 2 7.5 15.0 8 3079 88 Towel 2 7.5 15.0 9 4816 88 Towel 1 7.5 7.5 10 8412 88 Towel 2 7.5 15.0 11 7727 155 NA 2 NA NA 12 5134 439 NA 2 NA NA
Case3: Show Order Errors (Only no-match ProductID)
dtOrder %>% left_join(dtProduct, by="ProductID") %>% filter(is.na(ProductName)) %>% select(OrderID, ProductID, ProductName, Qty) OrderID ProductID ProductName Qty 1 7727 155 NA 2 2 5134 439 NA 2
Case4: Various Aggregates by ProductID, sort result by TotalSales descending
dtOrder %>% inner_join(dtProduct, by="ProductID") %>% group_by(ProductID) %>% summarize(OrderCount=n(), TotalQty=sum(Qty), TotalSales=sum(Qty*Price)) %>% arrange(desc(TotalSales)) ProductID OrderCount TotalQty TotalSales 1 33 2 5 300.0 2 6 1 3 75.0 3 88 4 7 52.5 4 17 2 4 40.0 5 51 1 1 16.0
Case5: Various Aggregates by ProductID, sort result by TotalSales descending
- NOTE1: This time, ProductName is displayed along with ProductID
NOTE2: sort by descending TotalSales no longer working (BUG?)
dtOrder %>% inner_join(dtProduct, by="ProductID") %>% group_by(ProductID, ProductName) %>% summarize(OrderCount=n(), TotalQty=sum(Qty), TotalSales=sum(Qty*Price)) %>% arrange(desc(TotalSales)) ProductID ProductName OrderCount TotalQty TotalSales 1 6 Shirt 1 3 75.0 2 17 Gloves 2 4 40.0 3 33 Helmet 2 5 300.0 4 51 Detergent 1 1 16.0 5 88 Towel 4 7 52.5
Thank you very much in advance for any help.
setkey(dtOrder, ProductID)
(1-2)
# this will be literally what you wrote
dtProduct[dtOrder,
list(OrderID, ProductID, ProductName, Qty, Price, ExtPrice=Qty*Price),
nomatch = 0 # or omit this to get (2)
]
# but I think you'd be better off with this
dtProduct[dtOrder][, ExtPrice := Qty*Price][]
(3)
# you can again take the literal direction:
dtProduct[dtOrder][!is.na(ProductName)][,
list(OrderID, ProductID, ProductName, Qty)]
# but again I think you'd be better off with
dtOrder[!dtProduct]
(4-5)
dtProduct[dtOrder, nomatch = 0][,
list(OrderCount=.N, TotalQty=sum(Qty), TotalSales=sum(Qty*Price)),
by = list(ProductID, ProductName)][
order(-TotalSales)]
这篇关于将dplyr连接语法转换为纯数据表语法的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!