R中的另一个变量的滚动和 [英] Rolling Sum by Another Variable in R

查看:145
本文介绍了R中的另一个变量的滚动和的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想通过ID获取滚动的7天总计。假设我的数据如下所示:

  data <-as.data.frame(matrix(NA,42,3))
data $ V1< -seq(as.Date(2014-05-01),as.Date(2014-09-01),by = 3)
data $ V2< (1:6,7)
data $ V3< -rep(c(1,2),21)
colnames(data)<-c(Date,USD,ID )

日期美元ID
1 2014-05-01 1 1
2 2014-05-04 2 2
3 2014-05-07 3 1
4 2014-05-10 4 2
5 2014-05-13 5 1
6 2014-05-16 6 2
7 2014-05-19 1 1
8 2014-05-22 2 2
9 2014-05-25 3 1
10 2014-05-28 4 2

如何添加一个新的列,该列将包含按ID编号的滚动7天总和?

解决方案

OP提供的数据集不会暴露任务的复杂性。到目前为止,在解决OP问题方面,Mike的答案是正确的。

事实上,由于 d <= 0,8个滚动日而不是7个滚动日& d> = -7

zoo Grothendieck几乎有效,只有合并会分配给 ID 的每个组。

下面的第二个data.table解决方案,这个时间有效的结果,使用dev RcppRoll允许 na.rm = TRUE

和微格式化Mike的解决方案输出。

  data< -as.data.frame(matrix(NA,42,3))
data $ V1< -seq(as.Date(2014-05-01),as.Date(2014-09-01),by = 3)
data $ V2< (1:6,7)
data $ V3< -rep(c(1,2),21)
colnames(data)<-c(Date,USD,ID )

库(microbenchmark)
库(RcppRoll)#install_github(kevinushey / RcppRoll)
库(data.table)#install_github(Rdatatable / data.table
correct_jan_dt = function(n,partial = TRUE){
DT = as.data.table(data)#这可以通过setDT()加速
date.range = DT [ ,range(Date)]
all.dates = seq.Date(date.range [1],date.range [2],by = 1)
setkey(DT,ID,Date)
r = DT [CJ(unique(ID),all.dates)] [,c(roll):= as.integer(roll_sumr(USD,n,normalize = FALSE,na.rm = TRUE)),by =ID] [!is.na(USD)]
#当在[kevinushey / RcppRoll](https://github.com/kevinushey/RcppRoll)中实现'partial'
if(isTRUE(partial)){
r [is.na(roll),roll:= cumsum(USD),by =ID] []
}
return (r [order(Date,ID)])
}
correct_mike_dt = function(){
data = as.data.table(data)[,ID2: c(ID)]
#Build引用表
Ref < - data [,list(Compare_Value = list(I(USD)),Compare_Date = list = c(ID2)]
#使用mapply通过id获取最近7天的值
data [,c(roll):= mapply(RD = Date,NUM = ID2,function (RD,NUM){
d< - as.numeric(Ref $ Compare_Date [[NUM]] -RD)
sum((d <= 0& d = 7)* Ref $ Compare_Value [[NUM]])})] [,ID2:= NULL] []
}
identical(correct_mike_dt(),correct_jan_dt partial = TRUE))
#[1] TRUE
microbenchmark(unit =relative,times = 5L,correct_mike_dt(),correct_jan_dt(8))
# b#expr min lq mean median uq max neval
#correct_mike_dt()274.0699 273.9892 267.2886 266.6009 266.2254 256.7296 5
#correct_jan_dt(8)1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 5



期待来自@Khashaa的更新。



编辑(20150122.2):低于基准>

定时在一个更大(仍然很小)的数据集上,5439行:

  library(zoo)
library(data.table)
library(dplyr)
库(RcppRoll)
库(microbenchmark)
data< -as.data.frame(matrix(NA,5439,3))
data $ V1 <-seq(as.Date(1970-01-01), as.Date(2014-09-01),by = 3)
data $ V2< -sample(1:6,5439,TRUE)
data $ V3< -sample ,2),5439,TRUE)
colnames(data)<-c(Date,USD,ID)
zoo_f = function(){
z& read.zoo(data)
z0 < - merge(z,zoo(,seq(start(z),end(z),day)),fill = 0)#expand to daily
roll transform(data,roll = ave(z0 $ USD,z0 $ ID,FUN = roll)[time(z)])
}
dt_f = function(){
DT = as.data.table(data)#这可以通过setDT()加速
date.range = DT [ (日期)]
all.dates = seq.Date(date.range [1],date.range [2],by = 1)
setkey(DT,Date)
DT [ 。(all.dates)
] [order(Date),c(roll):= rowSums(setDT(shift,USD,0:6,NA,lag)),na.rm = FALSE ),by =ID
] [!is.na(ID)]
}
dp_f = function(){
data%>%group_by >%
mutate(roll = roll_sum(c(rep(NA,6),USD),7))
}
dt2_f = function(){
#通过setDT()
as.data.table(data)[,c(roll):= roll_sum(c(rep(NA,6),USD),7) ]
相同(setDT(as.data。))
相同(as.data.table(zoo_f()),dt_f())
#框架(dp_f())),dt_f())
#[1] TRUE
相同(dt2_f(),dt_f())
#[1] TRUE
microbenchmark unit =relative,times = 20L,zoo_f(),dt_f(),dp_f(),dt2_f())
#单位:relative
#expr min lq mean median uq max neval
#zoo_f()140.331889 141.891917 138.064126 139.381336 136.029019 137.730171 20
#dt_f()14.917166 14.464199 15.210757 16.898931 16.543811 14.221987 20
#dp_f()1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
#dt2_f 1.536896 1.521983 1.500392 1.518641 1.629916 1.337903 20

但我不知道我的data.table代码是否已经最佳。



以上功能没有回答OP问题。阅读帖子的顶部以进行更新。迈克的解决方案是正确的。


I want to get the rolling 7-day sum by ID. Suppose my data looks like this:

data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")

         Date USD ID
1  2014-05-01   1  1
2  2014-05-04   2  2
3  2014-05-07   3  1
4  2014-05-10   4  2
5  2014-05-13   5  1
6  2014-05-16   6  2
7  2014-05-19   1  1
8  2014-05-22   2  2
9  2014-05-25   3  1
10 2014-05-28   4  2

How can I add a new column that will contain the rolling 7-day sum by ID?

解决方案

Dataset provided by OP does not expose the complexity of the task. In terms of addressing OP question so far only Mike's answer was the correct one.
In fact for a 8 rolling days, instead of 7 rolling days, due to d <= 0 & d >= -7.
zoo solution by @G. Grothendieck is almost valid, only if merge would be made to each group of ID.
Below second data.table solution, this time valid results, using dev RcppRoll which allows na.rm=TRUE.
And slightly formatted Mike's solution output.

data<-as.data.frame(matrix(NA,42,3))
data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3)
data$V2<-rep(1:6,7)
data$V3<-rep(c(1,2),21)
colnames(data)<-c("Date","USD","ID")

library(microbenchmark)
library(RcppRoll) # install_github("kevinushey/RcppRoll")
library(data.table) # install_github("Rdatatable/data.table")
correct_jan_dt = function(n, partial=TRUE){
  DT = as.data.table(data) # this can be speedup by setDT()
  date.range = DT[,range(Date)]
  all.dates = seq.Date(date.range[1],date.range[2],by=1)
  setkey(DT,ID,Date)
  r = DT[CJ(unique(ID),all.dates)][, c("roll") := as.integer(roll_sumr(USD, n, normalize = FALSE, na.rm = TRUE)), by="ID"][!is.na(USD)]
  # This could be simplified when `partial` arg will be implemented in [kevinushey/RcppRoll](https://github.com/kevinushey/RcppRoll)
  if(isTRUE(partial)){
    r[is.na(roll), roll := cumsum(USD), by="ID"][]
  }
  return(r[order(Date,ID)])
}
correct_mike_dt = function(){
  data = as.data.table(data)[,ID2:=.GRP,by=c("ID")]
  #Build reference table
  Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")]
  #Use mapply to get last seven days of value by id
  data[, c("roll") := mapply(RD = Date,NUM=ID2, function(RD, NUM){
    d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
    sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})][,ID2:=NULL][]
}
identical(correct_mike_dt(), correct_jan_dt(n=8,partial=TRUE))
# [1] TRUE
microbenchmark(unit="relative", times=5L, correct_mike_dt(), correct_jan_dt(8))
# Unit: relative
#               expr      min       lq     mean   median       uq      max neval
#  correct_mike_dt() 274.0699 273.9892 267.2886 266.6009 266.2254 256.7296     5
#  correct_jan_dt(8)   1.0000   1.0000   1.0000   1.0000   1.0000   1.0000     5

Looking forward for update from @Khashaa.

Edit (20150122.2): Below benchmarks do not answer OP question.

Timing on a bigger (still very tiny) dataset, 5439 rows:

library(zoo)
library(data.table)
library(dplyr)
library(RcppRoll)
library(microbenchmark)
data<-as.data.frame(matrix(NA,5439,3))
data$V1<-seq(as.Date("1970-01-01"),as.Date("2014-09-01"),by=3)
data$V2<-sample(1:6,5439,TRUE)
data$V3<-sample(c(1,2),5439,TRUE)
colnames(data)<-c("Date","USD","ID")
zoo_f = function(){
    z <- read.zoo(data)
    z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily
    roll <- function(x) rollsumr(x, 7, fill = NA)
    transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)])
}
dt_f = function(){
    DT = as.data.table(data) # this can be speedup by setDT()
    date.range = DT[,range(Date)]
    all.dates = seq.Date(date.range[1],date.range[2],by=1)
    setkey(DT,Date)
    DT[.(all.dates)
       ][order(Date), c("roll") := rowSums(setDT(shift(USD, 0:6, NA, "lag")),na.rm=FALSE), by="ID"
         ][!is.na(ID)]
}
dp_f = function(){
  data %>% group_by(ID) %>% 
    mutate(roll=roll_sum(c(rep(NA,6), USD), 7))
} 
dt2_f = function(){
  # this can be speedup by setDT()
  as.data.table(data)[, c("roll") := roll_sum(c(rep(NA,6), USD), 7), by="ID"][]
}
identical(as.data.table(zoo_f()),dt_f())
# [1] TRUE
identical(setDT(as.data.frame(dp_f())),dt_f())
# [1] TRUE
identical(dt2_f(),dt_f())
# [1] TRUE
microbenchmark(unit="relative", times=20L, zoo_f(), dt_f(), dp_f(), dt2_f())
# Unit: relative
#     expr        min         lq       mean     median         uq        max neval
#  zoo_f() 140.331889 141.891917 138.064126 139.381336 136.029019 137.730171    20
#   dt_f()  14.917166  14.464199  15.210757  16.898931  16.543811  14.221987    20
#   dp_f()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    20
#  dt2_f()   1.536896   1.521983   1.500392   1.518641   1.629916   1.337903    20

Yet I'm not sure if my data.table code is already optimal.

Above functions did not answer OP question. Read the top of post for update. Mike's solution was the correct one.

这篇关于R中的另一个变量的滚动和的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆