加入4个变量,然后使用data.table对较少的变量进行分组 [英] Join on 4 variables then group on fewer variables using data.table
问题描述
此主题是我之前的主题的延续加入使用无中间表的data.table进行mutate 。
在该话题中,我使用查找表更改收入和数量,然后将结果除以 .N
使得当我聚合产品,我没有看到膨胀的值。
根据该线程的专家的建议,我不想指望用于连接的所有四个变量 PO_ID
,
SO_ID
, F_Year
, Product_ID
但只有 SO_ID
, F_Year
, Product_ID
。
问题:如何使用 data.table
? >
这里是我的数据和代码:
这里是我的数据和解决方案使用 dplyr
输入
DFI =结构(列表(PO_ID = c(P1234,P1234,P1234,P1234,
P1234,P1234,P2345,P2345,P3456 ,S2,S3,S3,S7,S2,S2,S2 ,S10),F_Year = c(2012,
2012,2013,2013,2013,2011,2011,2014),Product_ID = c(385X,
385X ,450X,450X,450X,900X,3700,3700,A11U,
2700),Revenue = c(1,2,3,34,位置1 = c(MA,位置1,位置3),位置1 = c(1,
2,3,8,8,6,7,8,9,40) NY,WA,
NY,WA,NY,IL,IL,MN,CA)).Names = c(PO_ID ,
SO_ID,F_Year,Product_ID,Revenue,Quantity,Location1
),row.names = c(NA,10L)框架)
查找表
DF_Lookup = structure(list(PO_ID = c(P1234,P1234,P1234,P2345,
P2345, P3456,P4567),SO_ID = c(S1,S2,S2,S3,
S4,S7,S10),F_Year = c (385X,450X,900X,3700,3700,
A11U ,2700),Revenue = c(50,70,35,100,-50,50,100),
Quantity = c(3,20,20,20,-10,20,40) ,.Names = c(PO_ID,
SO_ID,F_Year,Product_ID,Revenue,Quantity),row.names = c(NA,
7L) class =data.frame)
这里是我修改的代码使用 dplyr
:
DF_Generated< - DFI%>%
left_join(DF_Lookup,by = c(PO_ID,SO_ID,F_Year,Product_ID))%>%
dplyr :: group_by(SO_ID,F_Year,Product_ID)%>%
dplyr :: mutate (Count = n())%>%
dplyr :: ungroup()%>%
dplyr :: mutate(Revenue = Revenue.y / Count,Quantity = Quantity.y / Count) %>%
dplyr :: select(PO_ID:Product_ID,Location1,Revenue,Quantity)
请注意,对 group_by
的输入已更改。
预期输出:
DF_Generated =结构(列表(PO_ID = c(P1234,P1234,P1234,P1234,
P1234 S1,S2,S2,S2,S2,S2,S2,S2,S2,P212,P1234,P2345,P2345,P3456,P4567 S2,S3,S3,S7,S10),F_Year = c(2012,
2012,2013,2013,2013,2013,2011,2014,2015),Product_ID = c(385X,
385X,450X,450X,450X,900X,3700,3700,A11U,
2700 ),Location1 = c(MA,NY,WA,NY,WA,NY,IL,
IL,MN,CA) ,Revenue = c(25,25,23.3333333333333,23.3333333333333,
23.3333333333333,35,50,50,50,100),数量= c(1.5,1.5,
6.66666666666667,6.666666666666667,6.666666666666667, 10,
10,20,40)),class = c(tbl_df,tbl,data.frame),row.names = c(NA,
-10L) Names = c(PO_ID,SO_ID,F_Year,Product_ID,Location1,
Revenue,Quantity))
pre> 注意:请注意,我不想创建中间变量,因为实际的数据大小太大,
这应该是你正在寻找的
library(data.table)
setDT(DFI)
DFI [,c(Revenue,Quantity):= NULL]
b $ b setDT(DF_Lookup)
dat = merge(DF_Lookup,DFI,by = c(PO_ID,SO_ID,F_Year,Product_ID))
dat = dat [,。(Revenue = Revenue / .N,Quantity = Quantity / .N,Location1),by =。(PO_ID,SO_ID,F_Year,Product_ID)]
dat
PO_ID SO_ID F_Year Product_ID收入数量Location1
1:P1234 S1 2012 385X 25.00000 1.500000 MA
2:P1234 S1 2012 385X 25.00000 1.500000 NY
3:P1234 S2 2013 450X 23.33333 6.666667 WA
4: P1234 S2 2013 450X 23.33333 6.666667 NY
5:P1234 S2 2013 450X 23.33333 6.666667 WA
6:P1234 S2 2013 900X 35.00000 20.000000 NY
7:P2345 S3 2011 3700 50.00000 10.000000 IL
8:P2345 S3 2011 3700 50.00000 10.000000 IL
9:P3456 S7 2014 A11U 50.00000 20.000000 MN
10:P4567 S10 2015 2700 100.00000 40.000000 CA
This thread is a continuation of my earlier thread Join then mutate using data.table without intermediate table.
In that thread, I am using look-up table to change revenue and quantity and then dividing the result by .N
so that when I aggregate the products, I don't see inflated values.
As per recommendation from the expert on that thread, I don't want to count on all the four variables used for join i.e. PO_ID
, SO_ID
, F_Year
, Product_ID
but only SO_ID
, F_Year
, Product_ID
.
Question: how can I do this using data.table
?
Here are my data and code:
Here are my data and solution using dplyr
Input
DFI = structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P2345", "P2345", "P3456", "P4567"), SO_ID = c("S1",
"S1", "S2", "S2", "S2", "S2", "S3", "S3", "S7", "S10"), F_Year = c(2012,
2012, 2013, 2013, 2013, 2013, 2011, 2011, 2014, 2015), Product_ID = c("385X",
"385X", "450X", "450X", "450X", "900X", "3700", "3700", "A11U",
"2700"), Revenue = c(1, 2, 3, 34, 34, 6, 7, 88, 9, 100), Quantity = c(1,
2, 3, 8, 8, 6, 7, 8, 9, 40), Location1 = c("MA", "NY", "WA",
"NY", "WA", "NY", "IL", "IL", "MN", "CA")), .Names = c("PO_ID",
"SO_ID", "F_Year", "Product_ID", "Revenue", "Quantity", "Location1"
), row.names = c(NA, 10L), class = "data.frame")
Look Up Table
DF_Lookup = structure(list(PO_ID = c("P1234", "P1234", "P1234", "P2345",
"P2345", "P3456", "P4567"), SO_ID = c("S1", "S2", "S2", "S3",
"S4", "S7", "S10"), F_Year = c(2012, 2013, 2013, 2011, 2011,
2014, 2015), Product_ID = c("385X", "450X", "900X", "3700", "3700",
"A11U", "2700"), Revenue = c(50, 70, 35, 100, -50, 50, 100),
Quantity = c(3, 20, 20, 20, -10, 20, 40)), .Names = c("PO_ID",
"SO_ID", "F_Year", "Product_ID", "Revenue", "Quantity"), row.names = c(NA,
7L), class = "data.frame")
Here's my modified code using dplyr
:
DF_Generated <- DFI %>%
left_join(DF_Lookup,by = c("PO_ID", "SO_ID", "F_Year", "Product_ID")) %>%
dplyr::group_by(SO_ID, F_Year, Product_ID) %>%
dplyr::mutate(Count = n()) %>%
dplyr::ungroup()%>%
dplyr::mutate(Revenue = Revenue.y/Count, Quantity = Quantity.y/Count) %>%
dplyr::select(PO_ID:Product_ID,Location1,Revenue,Quantity)
Please note that input to group_by
has changed.
Expected output:
DF_Generated = structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P2345", "P2345", "P3456", "P4567"), SO_ID = c("S1",
"S1", "S2", "S2", "S2", "S2", "S3", "S3", "S7", "S10"), F_Year = c(2012,
2012, 2013, 2013, 2013, 2013, 2011, 2011, 2014, 2015), Product_ID = c("385X",
"385X", "450X", "450X", "450X", "900X", "3700", "3700", "A11U",
"2700"), Location1 = c("MA", "NY", "WA", "NY", "WA", "NY", "IL",
"IL", "MN", "CA"), Revenue = c(25, 25, 23.3333333333333, 23.3333333333333,
23.3333333333333, 35, 50, 50, 50, 100), Quantity = c(1.5, 1.5,
6.66666666666667, 6.66666666666667, 6.66666666666667, 20, 10,
10, 20, 40)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L), .Names = c("PO_ID", "SO_ID", "F_Year", "Product_ID", "Location1",
"Revenue", "Quantity"))
NOTE: Please note that I don't want to create intermediate variable because the actual data size is so large that this may not be feasible.
This should do what you're looking for
library(data.table)
setDT(DFI)
DFI[ , c("Revenue", "Quantity") := NULL]
setDT(DF_Lookup)
dat = merge(DF_Lookup, DFI, by = c("PO_ID", "SO_ID", "F_Year", "Product_ID"))
dat = dat[ , .(Revenue = Revenue/.N, Quantity = Quantity/.N, Location1), by = .(PO_ID, SO_ID, F_Year, Product_ID)]
dat
PO_ID SO_ID F_Year Product_ID Revenue Quantity Location1
1: P1234 S1 2012 385X 25.00000 1.500000 MA
2: P1234 S1 2012 385X 25.00000 1.500000 NY
3: P1234 S2 2013 450X 23.33333 6.666667 WA
4: P1234 S2 2013 450X 23.33333 6.666667 NY
5: P1234 S2 2013 450X 23.33333 6.666667 WA
6: P1234 S2 2013 900X 35.00000 20.000000 NY
7: P2345 S3 2011 3700 50.00000 10.000000 IL
8: P2345 S3 2011 3700 50.00000 10.000000 IL
9: P3456 S7 2014 A11U 50.00000 20.000000 MN
10: P4567 S10 2015 2700 100.00000 40.000000 CA
这篇关于加入4个变量,然后使用data.table对较少的变量进行分组的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!