加入4个变量，然后使用data.table对较少的变量进行分组 [英] Join on 4 variables then group on fewer variables using data.table

查看：144 发布时间：2017/7/13 22:31:54 r data.table dplyr

本文介绍了加入4个变量，然后使用data.table对较少的变量进行分组的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

此线程是我早期线程的延续然后加入使用无中间表的data.table进行变异。

在该线程中，我使用查找表来更改收入和数量，然后将结果除以 .N 所以当我聚合产品时，我看不到膨胀的价值。

根据专家对该线程的建议，我不想指望用于连接的所有四个变量，即 PO_ID ， SO_ID ， F_Year ， Product_ID 但只有 SO_ID ， F_Year ， Product_ID 。

问题：如何使用 data.table ？ >

这是我的数据和代码：

这是我的数据和解决方案，使用 dplyr

输入

  DFI =结构（列表（PO_ID = c（P1234，P1234，P1234，P1234，
P1234，P1234，P2345，P2345 ，S2，S3，S3，S7，S2 ，S10），F_Year = c（2012，
 2012，2013，2013，2013，2013，2011，2011，2014，2015），Product_ID = c（385X，
385X ，450X，450X，450X，900X，3700，3700，A11U，
2700），Revenue = c（1,2,3,4，数量= c（1，
 2，3，8，8，6，7，8，9，40），Location1 = c（MA NY，WA，
NY，WA，NY，IL，IL，MN，CA）），.Names = c（PO_ID ，
SO_ID，F_Year，Product_ID，收入，数量，位置1
），row.names = c（NA，10L），class =data帧 ）

查找表

  DF_Lookup = structure（list（PO_ID = c（P1234，P1234，P1234，P2345，
P2345，P3456 ，S4567），SO_ID = c（S1，S2，S2，S3，
S4，S7，S10），F_Year = Product_ID = c（385X，450X，900X，3700，3700，
A11U， 2700），收入= c（50，70，35，100，-50，50，100），
数量= c（3，20，20，20，-10，20，40））。 names = c（PO_ID，
SO_ID，F_Year，Product_ID，收入，数量），row.names = c（NA，
 7L） data.frame）

这是我使用 dplyr

  DF_Generated<  -  DFI％>％
 left_join（DF_Lookup，by = c PO_ID，SO_ID，F_Year，Product_ID））％>％
 dplyr :: group_by（SO_ID，F_Year，Product_ID）％>％
 dplyr :: mutate = n（））％>％
 dplyr :: ungroup（）％>％
 dplyr :: mutate（Revenue = Revenue.y / Count，Quantity = Quantity.y / Count）％>％
 dplyr :: select（PO_ID：Product_ID，Location1，Revenue，Quantity）

请注意，输入到 group_by 已更改。

预期输出：

  DF_Generated =结构（列表（PO_ID = c（P1234，P1234 ，P1234，P1234，
P1234，P1234，P2345，P2345，P3456，P4567），SO_ID = c（S1 bS1，S2，S2，S2，S2，S3，S3，S7，S10），F_Year = c（2012，
 2012， 2013，2013，2013，2013），Product_ID = c（385X，
385X，450X，450X，450X，900X 3700，3700，A11U，
2700），Location1 = c（MA，NY，WA，NY，WA，NY ，
IL，MN，CA），Revenue = c（25,25,23.3333333333333,23.3333333333333，
 23.3333333333333,35,50,50,50,100），Quantity = c（1.5,1.5，
 6.66666666666667,6.666666666666667,6.666666666666667,20，10，
 10，20，40）），class = c（t bl_df，tbl，data.frame），row.names = c（NA，
 -10L），.Names = c（PO_ID，SO_ID，F_Year，Product_ID ，Location1，
收入，数量））

注意：请注意，我不想创建中间变量，因为实际的数据大小太大，这可能不可行。

解决方案

这应该是你想要的东西

  library（data.table）
 setDT（DFI）
 DFI [，c（收入，数量）：= NULL] 
 
 setDT（DF_Lookup）
 
 dat = merge DF_Lookup，DFI，by = c（PO_ID，SO_ID，F_Year，Product_ID））
 dat = dat [，。（Revenue = Revenue / .N，Quantity = Quantity /位置1），by =。（PO_ID，SO_ID，F_Year，Product_ID）] 
 
 dat 
 PO_ID SO_ID F_Year Product_ID收入数量Location1 
 1：P1234 S1 2012 385X 25.00000 1.500000 MA 
 2：P1234 S1 2012 385X 25.00000 1.500000 NY 
 3：P1234 S2 2013 450X 23.33333 6.666667 WA 
 4：P1234 S2 2013 450X 23.33333 6.666667 NY 
 5：P1234 S2 2013 450X 23.33333 6.666667 WA 
 6：P1234 S2 2013 900X 35.00000 20.000000 NY 
 7：P2345 S3 2011 3700 50.00000 10.000000 IL 
 8：P2345 S3 2011 3700 50.00000 10.000000 IL 
 9：P3456 S7 2014 A11U 50.00000 20.000000 MN 
 10：P4567 S10 2015 2700 100.00000 40.000000 CA

This thread is a continuation of my earlier thread Join then mutate using data.table without intermediate table.

In that thread, I am using look-up table to change revenue and quantity and then dividing the result by .N so that when I aggregate the products, I don't see inflated values.

As per recommendation from the expert on that thread, I don't want to count on all the four variables used for join i.e. PO_ID, SO_ID, F_Year, Product_ID but only SO_ID, F_Year, Product_ID.

Question: how can I do this using data.table?

Here are my data and code:

Here are my data and solution using dplyr

Input

DFI = structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234", 
"P1234", "P1234", "P2345", "P2345", "P3456", "P4567"), SO_ID = c("S1", 
"S1", "S2", "S2", "S2", "S2", "S3", "S3", "S7", "S10"), F_Year = c(2012, 
2012, 2013, 2013, 2013, 2013, 2011, 2011, 2014, 2015), Product_ID = c("385X", 
"385X", "450X", "450X", "450X", "900X", "3700", "3700", "A11U", 
"2700"), Revenue = c(1, 2, 3, 34, 34, 6, 7, 88, 9, 100), Quantity = c(1, 
2, 3, 8, 8, 6, 7, 8, 9, 40), Location1 = c("MA", "NY", "WA", 
"NY", "WA", "NY", "IL", "IL", "MN", "CA")), .Names = c("PO_ID", 
"SO_ID", "F_Year", "Product_ID", "Revenue", "Quantity", "Location1"
), row.names = c(NA, 10L), class = "data.frame")

Look Up Table

DF_Lookup = structure(list(PO_ID = c("P1234", "P1234", "P1234", "P2345", 
"P2345", "P3456", "P4567"), SO_ID = c("S1", "S2", "S2", "S3", 
"S4", "S7", "S10"), F_Year = c(2012, 2013, 2013, 2011, 2011, 
2014, 2015), Product_ID = c("385X", "450X", "900X", "3700", "3700", 
"A11U", "2700"), Revenue = c(50, 70, 35, 100, -50, 50, 100), 
    Quantity = c(3, 20, 20, 20, -10, 20, 40)), .Names = c("PO_ID", 
"SO_ID", "F_Year", "Product_ID", "Revenue", "Quantity"), row.names = c(NA, 
7L), class = "data.frame")

Here's my modified code using dplyr:

DF_Generated <- DFI %>% 
  left_join(DF_Lookup,by = c("PO_ID", "SO_ID", "F_Year", "Product_ID")) %>%
  dplyr::group_by(SO_ID, F_Year, Product_ID) %>%
  dplyr::mutate(Count = n()) %>%
  dplyr::ungroup()%>%
  dplyr::mutate(Revenue = Revenue.y/Count, Quantity = Quantity.y/Count) %>%
  dplyr::select(PO_ID:Product_ID,Location1,Revenue,Quantity)

Please note that input to group_by has changed.

Expected output:

DF_Generated = structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234", 
"P1234", "P1234", "P2345", "P2345", "P3456", "P4567"), SO_ID = c("S1", 
"S1", "S2", "S2", "S2", "S2", "S3", "S3", "S7", "S10"), F_Year = c(2012, 
2012, 2013, 2013, 2013, 2013, 2011, 2011, 2014, 2015), Product_ID = c("385X", 
"385X", "450X", "450X", "450X", "900X", "3700", "3700", "A11U", 
"2700"), Location1 = c("MA", "NY", "WA", "NY", "WA", "NY", "IL", 
"IL", "MN", "CA"), Revenue = c(25, 25, 23.3333333333333, 23.3333333333333, 
23.3333333333333, 35, 50, 50, 50, 100), Quantity = c(1.5, 1.5, 
6.66666666666667, 6.66666666666667, 6.66666666666667, 20, 10, 
10, 20, 40)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L), .Names = c("PO_ID", "SO_ID", "F_Year", "Product_ID", "Location1", 
"Revenue", "Quantity"))

NOTE: Please note that I don't want to create intermediate variable because the actual data size is so large that this may not be feasible.

解决方案

This should do what you're looking for

library(data.table)
setDT(DFI)
DFI[ , c("Revenue", "Quantity") := NULL]

setDT(DF_Lookup)

dat = merge(DF_Lookup, DFI, by = c("PO_ID", "SO_ID", "F_Year", "Product_ID"))
dat = dat[ , .(Revenue = Revenue/.N, Quantity = Quantity/.N, Location1), by = .(PO_ID, SO_ID, F_Year, Product_ID)]

dat
    PO_ID SO_ID F_Year Product_ID   Revenue  Quantity Location1
 1: P1234    S1   2012       385X  25.00000  1.500000        MA
 2: P1234    S1   2012       385X  25.00000  1.500000        NY
 3: P1234    S2   2013       450X  23.33333  6.666667        WA
 4: P1234    S2   2013       450X  23.33333  6.666667        NY
 5: P1234    S2   2013       450X  23.33333  6.666667        WA
 6: P1234    S2   2013       900X  35.00000 20.000000        NY
 7: P2345    S3   2011       3700  50.00000 10.000000        IL
 8: P2345    S3   2011       3700  50.00000 10.000000        IL
 9: P3456    S7   2014       A11U  50.00000 20.000000        MN
10: P4567   S10   2015       2700 100.00000 40.000000        CA

这篇关于加入4个变量，然后使用data.table对较少的变量进行分组的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

加入4个变量，然后使用data.table对较少的变量进行分组 [英] Join on 4 variables then group on fewer variables using data.table

问题描述

相关文章

其他开发语言最新文章

热门教程

热门工具

登录关闭

加入4个变量，然后使用data.table对较少的变量进行分组 [英] Join on 4 variables then group on fewer variables using data.table

问题描述

相关文章

其他开发语言最新文章

热门教程

热门工具

登录 关闭

登录关闭