如何在每个组中创建滞后变量? [英] How to create a lag variable within each group?
问题描述
我有一个data.table:
set.seed(1)
data< - data。表(时间= c(1:3,1:4),
组= c(rep(c(b,a),c(3,4))) = rnorm(7))
data
#组时间值
#1:b 1 -0.6264538
#2:b 2 0.1836433
#3 :b 3 -0.8356286
#4:a 1 1.5952808
#5:a 2 0.3295078
#6:a 3 -0.8204684
#7:a 4 0.4874291
我想计算每个级别下的value列的滞后版本groups。
结果应类似于
#groups时间价值滞后价值
#1 a 1 1.5952808 NA
#2 a 2 0.3295078 1.5952808
#3 a 3 -0.8204684 0.3295078
#4 a 4 0.4874291 -0.8204684
#5 b 1 -0.6264538 NA
#6 b 2 0.1836433 -0.6264538
#7 b 3 -0.8356286 0.1836433
我试图直接使用 lag
:
data $ lag.value< - lag(data $ value)
...这显然不会工作。
我也尝试过:
value,data $ groups,lag))
a1 a2 a3 a4 b1 b2 b3
NA -0.1162932 0.4420753 2.1505440 NA 0.5894583 -0.2890288
这几乎是我想要的。但是,生成的向量的排序与数据表中的排序有所不同,这是有问题的。
在base R,plyr,dplyr中执行此操作的最有效的方法是什么,and data.table?
您可以在 data.table
library(data.table)
data [,lag.value:= c(NA,value [ -.N]),by = groups]
data
#时间组值lag.value
#1:1 a 0.02779005 NA
#2:2 a 0.88029938 0.02779005
#3:3 a -1.69514201 0.88029938
#4:1 b -1.27560288 NA
#5:2 b -0.65976434 -1.27560288
#6:3 b -1.37804943 -0.65976434
#7:4 b 0.12041778 -1.37804943
对于多个列:
nm1 < - grep(^ value,colnames(data),value = TRUE)
nm2 < ,nm1,sep =。)
data [,(nm2):= lapply(.SD,function(x)c(NA,x [ nm1]
data
#时间组值value1 value2 lag.value lag.value1
#1:1 b -0.6264538 0.7383247 1.12493092 NA NA
#2:2 b 0.1836433 0.5757814 - 0.04493361 -0.6264538 0.7383247
#3:3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4:1 a 1.5952808 1.5117812 0.94383621 NA NA
#5:2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6:3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7:4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
#lag.value2
#1:NA
#2:1.12493092
#3:-0.04493361
#4:NA
#5:0.94383621
#6:0.82122120
#7:0.59390132
更新
从 .table
versions> = v1.9.5
,我们可以使用 shift
$ c> type 为 lag
或 lead
。默认情况下,类型为 lag
。
data [,(nm2):= shift(.SD)by by groups,.SDcols = nm1]
pre>
#时间组值value1 value2 lag.value lag.value1
#1:1 b -0.6264538 0.7383247 1.12493092 NA NA
#2:2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3 :3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4:1 a 1.5952808 1.5117812 0.94383621 NA NA
#5:2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6:3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7:4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
#lag.value2
#1:NA
#2:1.12493092
#3:-0.04493361
#4:NA
#5:0.94383621
#6:0.82122120
#7:0.59390132
如果需要反向,使用
type = lead
$ b bnm3 < - paste(lead,nm1,sep =。)
使用原始数据集
data [,(nm3):= shift ,type ='lead'),by = groups,.SDcols = nm1]
#时间组值value1 value2 lead.value lead.value1
#1:1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814
#2:2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884
#3:3 b -0.8356286 -0.3053884 -0.01619026 NA NA
#4:1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432
#5:2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406
#6:3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999
#7:4 a 0.4874291 -2.2146999 0.91897737 NA NA
#lead .value2
#1:-0.04493361
#2:-0.01619026
#3:NA
#4:0.82122120
#5:0.59390132
#6 :0.91897737
#7:NA
data
set.seed(1)
data < - data.table(time = c(1:3,1:4),groups = c (7),value1 = rnorm(7),value2 = rnorm(7))
(7)
I have a data.table:
set.seed(1) data <- data.table(time = c(1:3, 1:4), groups = c(rep(c("b", "a"), c(3, 4))), value = rnorm(7)) data # groups time value # 1: b 1 -0.6264538 # 2: b 2 0.1836433 # 3: b 3 -0.8356286 # 4: a 1 1.5952808 # 5: a 2 0.3295078 # 6: a 3 -0.8204684 # 7: a 4 0.4874291
I want to compute a lagged version of the "value" column, within each level of "groups".
The result should look like
# groups time value lag.value # 1 a 1 1.5952808 NA # 2 a 2 0.3295078 1.5952808 # 3 a 3 -0.8204684 0.3295078 # 4 a 4 0.4874291 -0.8204684 # 5 b 1 -0.6264538 NA # 6 b 2 0.1836433 -0.6264538 # 7 b 3 -0.8356286 0.1836433
I have tried to use
lag
directly:data$lag.value <- lag(data$value)
...which clearly wouldn't work.
I have also tried:
unlist(tapply(data$value, data$groups, lag)) a1 a2 a3 a4 b1 b2 b3 NA -0.1162932 0.4420753 2.1505440 NA 0.5894583 -0.2890288
Which is almost what I want. However the vector generated is ordered differently from the ordering in the data.table which is problematic.
What is the most efficient way to do this in base R, plyr, dplyr, and data.table?
解决方案You could do this within
data.table
library(data.table) data[, lag.value:=c(NA, value[-.N]), by=groups] data # time groups value lag.value #1: 1 a 0.02779005 NA #2: 2 a 0.88029938 0.02779005 #3: 3 a -1.69514201 0.88029938 #4: 1 b -1.27560288 NA #5: 2 b -0.65976434 -1.27560288 #6: 3 b -1.37804943 -0.65976434 #7: 4 b 0.12041778 -1.37804943
For multiple columns:
nm1 <- grep("^value", colnames(data), value=TRUE) nm2 <- paste("lag", nm1, sep=".") data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1] data # time groups value value1 value2 lag.value lag.value1 #1: 1 b -0.6264538 0.7383247 1.12493092 NA NA #2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247 #3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814 #4: 1 a 1.5952808 1.5117812 0.94383621 NA NA #5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812 #6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432 #7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406 # lag.value2 #1: NA #2: 1.12493092 #3: -0.04493361 #4: NA #5: 0.94383621 #6: 0.82122120 #7: 0.59390132
Update
From
data.table
versions >=v1.9.5
, we can useshift
withtype
aslag
orlead
. By default, the type islag
.data[, (nm2) := shift(.SD), by=groups, .SDcols=nm1] # time groups value value1 value2 lag.value lag.value1 #1: 1 b -0.6264538 0.7383247 1.12493092 NA NA #2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247 #3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814 #4: 1 a 1.5952808 1.5117812 0.94383621 NA NA #5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812 #6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432 #7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406 # lag.value2 #1: NA #2: 1.12493092 #3: -0.04493361 #4: NA #5: 0.94383621 #6: 0.82122120 #7: 0.59390132
If you need the reverse, use
type=lead
nm3 <- paste("lead", nm1, sep=".")
Using the original dataset
data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1] # time groups value value1 value2 lead.value lead.value1 #1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814 #2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884 #3: 3 b -0.8356286 -0.3053884 -0.01619026 NA NA #4: 1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432 #5: 2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406 #6: 3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999 #7: 4 a 0.4874291 -2.2146999 0.91897737 NA NA # lead.value2 #1: -0.04493361 #2: -0.01619026 #3: NA #4: 0.82122120 #5: 0.59390132 #6: 0.91897737 #7: NA
data
set.seed(1) data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))), value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
这篇关于如何在每个组中创建滞后变量?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!