为用户定义的函数构建高效循环:data.table [英] building efficient for loop for user defined function: data.table
问题描述
相关问题的答案 Data.table:如何获得它承诺的快速子集,并应用于第二个data.table ,这可以通过使用 data.table
来实现 中的更新来解决。
与链接问题的区别在于,我们需要创建所有独特的交叉连接 CJ()
ID
s加入 lsr
之前我们自己的日期向量。
OP提供了一系列日期 time.months
其定义
time.months< - as.Date(2013-02-01)+(365.25 / 12)*(0:192)#dates to evaluate at
导致歪歪日期,只有强制为数字或POSIXct时才可见:
< pre $ head(lubridate :: as_datetime(time.months))
< blockquote>
[1]2013-02-01 00:00:00 UTC2013-03-03 10:30:00 UTC2013- 04-02 21:00:00 UTC
[4]2013-05-03 07:30:00 UTC2013-06-02 18:00:00 UTC2013-07-03 04 :30:00 UTC
问题在于这些日期与午夜不一致,而是从白天开始。为避免这些含糊之处,可以使用 seq()
函数
日期< - seq(as.Date(2013-02-01),length.out = 193,by =month)
从每个月的第一天开始创建一系列日期。
另外,数据使用.table
的 IDate
类,它将日期存储为整数(4个字节)而不是double(8个字节)。这可以节省内存和处理时间,因为通常可以使用更快的整数算术。
#coerce标识日期
idates< - as.IDate(dates)
setDT(lsr)[,eksd:= as.IDate(eksd)] [,ENDDATE:= as.IDate(ENDDATE)]
#交叉连接带日期的唯一ID
CJ(ID = lsr $ ID,date = idates,unique = TRUE)[
#intialize结果列
,AH:= 0L] [
$ b lsr,on =。(ID,日期> = eksd,日期< ENDDATE),
#...只更新匹配的行
$:as.integer(ENDDATE - x.date)] [
#从长格式转换为宽格式
,dcast(.SD,ID〜date)]
$ block $
$ $ $ $ $ $ $ c $ ID 2013-02-01 2013-03-01 2013- 04-01 2013-05-01 2013-06-01 2013-07-01 2013-08-01 [...]
1:1 64 36 5 0 0 0 0
2:2 0 0 110 80 49 19 0
3:3 63 35 4 0 0 0 0
<请注意,上面的代码假设每个<$ c $>间隔 [eksd,ENDDATE)
c> ID do 不是重叠。这可以通过
lsr [order(eksd),all(eksd - shift(ENDDATE,fill = 0)> 0),keyby = ID]
ID V1
1:1 TRUE
2:2 TRUE
3:3 TRUE
在有重叠的情况下,可以修改上面的代码,使用 在另一个相关问题 data.table by = xx当我没有找到任何匹配项时,我如何保存长度为0的组合,OP指出由于他的生产规模数据。 根据, by = .EACHI $ c
是几百mio 。行。
基准
lsr
has-group-of-length-0-when-i-returns-no-match / 48401280#comment83662275_48336742> 20 mio行和12列, adherence
数据集,我试图不使用有2列1.5 mio行。在另一个中,OP提到
@minem已经通过在他的回答。我们可以使用这个基准测试数据来比较不同的答案。
$ $ $ $ $ $ $ $ b $ lsr< - data。 (
ID = c(1,1,1,2,2,2,3,3),
eksd = as 。日(c(2012-03-01,2012-08-02,2013-01-06,2012-08-25,2013-03-22,2013-09- (60,90,90,60,2011-01-01,2013-01-05)),
DDD = as.integer (120,60,30,90)),
stringsAsFactors = FALSE)
lsr $ ENDDATE < - lsr $ eksd + lsr $ DDD
n < - 5e4 (lsr2,use.names = T,fill = T,idcol = T)
lsr2 [,ID:= as.integer(paste0(.id,ID))]
基准数据集包含400 k行和150 k独特 ID
s:
<$ c $
$ block $
$ pre>N V2
1:400000 150000
#数据准备从基准
lsr2i< - copy(lsr2)[,eksd:= as.IDate(eksd)] [,ENDDATE:= as.IDate(ENDDATE)]
lsr2 [ ,ENDDATE2:= as.numeric(ENDDATE)]
#define date series
dates < - seq(as.Date(2013-02-01),length.out = (as.IDate(2013-02-01),length.out = 193,by =month)
#运行基准
library(microbenchmark)
bm< - microbenchmark(
minem = {
dt< - copy(lsr2)
xtot< - lapply (日期,函数(d){
d < - as.numeric(d)
x< - dt [eksd< = d& ENDDATE> d,sum(ENDDATE2-d),keyby = ID]
uid < - unique(dt $ ID)
id2 < - setdiff(uid,x $ ID)
id2< (ID = id2,V1 = 0)$ (x,seq_along(xtot))中的b $ bx <-rbind(x,x2)
}
setkey(x,ID)
x
})
{
setnames(xtot [[x]],c(ID,paste0(V,x)))
}
xtot< - Reduce(function(... )合并(...,all = TRUE,by =ID),xtot)
xtot
},
uwe = {
dt < - copy(lsr2i)
CJ(ID = dt $ ID,date = idates,unique = TRUE)[,AH:= 0L] [
dt,on =。(ID,日期> = eksd,日期< ENDDATE ),
AH:= as.integer(ENDDATE - x.date)] [,dcast(.SD,ID〜date)]
},
times = 1L
)
print(bm)
一次运行的结果显示 equi join 比 lapply()快4倍以上()
单位:秒
表达式最小值lq平均值中值uq max neval
minem 27.654703 27.654703 27.654703 27.654703 27.654703 27.654703 1
uwe 5.958907 5.958907 5.958907 5.958907 5.958907 5.958907 1
I'm trying to build an efficient for loop for this function proposed by minem here: (Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table)
My data are:
library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)
adherence <- cbind.data.frame(c("1", "2", "3", "1", "2", "3"), c("2013-01-01", "2013-01-01", "2013-01-01", "2013-02-01", "2013-02-01", "2013-02-01"))
names(adherence)[1] <- "ID"
names(adherence)[2] <- "year"
adherence$year <- ymd(adherence$year)
lsr <- cbind.data.frame(
c("1", "1", "1", "2", "2", "2", "3", "3"), #ID
c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05"), #eksd
c("60", "90", "90", "60", "120", "60", "30", "90") # DDD
)
names(lsr)[1] <- "ID"
names(lsr)[2] <- "eksd"
names(lsr)[3] <- "DDD"
lsr$eksd <- as.Date((lsr$eksd))
lsr$DDD <- as.numeric(as.character(lsr$DDD))
lsr$ENDDATE <- lsr$eksd + lsr$DDD
lsr <- as.data.table(lsr)
adherence <- as.data.table(adherence)
The Function proposed by minem are:
by_minem2 <- function(dt = lsr2) {
d <- as.numeric(as.Date("2013-02-01"))
dt[, ENDDATE2 := as.numeric(ENDDATE)]
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
setkey(x, ID)
x
}
This returns:
> by_minem2(lsr)
ID V1
1: 1 64
2: 2 0
3: 3 63
For the loop i need to include information about which time I evaluated at so the ideal repeated output looks like this:
cbind(as.Date("2013-02-01"),by_minem2(lsr))
I then want to repeat this for different dates a few hundred times putting everything into the same data.table:
time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at
I'm trying to do this with a for loop like this:
for (d in min(time.months):max(time.months))
{
by_minem <- function(dt = lsr2) {
d <- as.numeric(d)
dt[, ENDDATE2 := as.numeric(ENDDATE)]
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
setkey(x, ID)
xtot <- append(xtot,x)
xtot <- cbind(d, xtot) # i need to know time of evaluation
xtot
}
}
As indicated in the answer to the related question Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table, this can be solved by updating in a non-equi join which is possible with data.table
.
The difference to the linked question is that here we need to create the cross join CJ()
of all unique ID
s with the vector of dates on our own before joining with lsr
.
The OP has provided a series of dates time.months
whose defintion
time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at
leads to "crooked" dates which is only visible if coerced to numeric or POSIXct:
head(lubridate::as_datetime(time.months))
[1] "2013-02-01 00:00:00 UTC" "2013-03-03 10:30:00 UTC" "2013-04-02 21:00:00 UTC" [4] "2013-05-03 07:30:00 UTC" "2013-06-02 18:00:00 UTC" "2013-07-03 04:30:00 UTC"
The issue is that these "dates" are not aligned with midnight but start somewhere during the day. To avoid these ambiguities, the seq()
function can be used
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
which creates a series of dates starting on the first day of each month.
In addition, data.table
's IDate
class is used which stores dates as integers (4 bytes) instead of double (8 bytes). This saves memory as well as processing time because the usually faster integer arithmetic can be used.
# coerce Date to IDate
idates <- as.IDate(dates)
setDT(lsr)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
# cross join unique IDs with dates
CJ(ID = lsr$ID, date = idates, unique = TRUE)[
# intialize result column
, AH := 0L][
# non-equi join and ...
lsr, on = .(ID, date >= eksd, date < ENDDATE),
# ... update only matching rows
AH := as.integer(ENDDATE - x.date)][
# reshape from long to wide format
, dcast(.SD, ID ~ date)]
ID 2013-02-01 2013-03-01 2013-04-01 2013-05-01 2013-06-01 2013-07-01 2013-08-01 [...] 1: 1 64 36 5 0 0 0 0 2: 2 0 0 110 80 49 19 0 3: 3 63 35 4 0 0 0 0
Caveat
Note that above code assumes that the intervals [eksd, ENDDATE)
for each ID
do not overlap. This can be verified by
lsr[order(eksd), all(eksd - shift(ENDDATE, fill = 0) > 0), keyby = ID]
ID V1 1: 1 TRUE 2: 2 TRUE 3: 3 TRUE
In case there are overlaps, the above code can be modified to aggregate within the non-equi join using by = .EACHI
.
Benchmark
In another related question data.table by = xx How do i keep the groups of length 0 when i returns no match, the OP has pointed out that performance is crucial due to the size of his production data.
According to OP's comment, lsr
has 20 mio rows and 12 columns, the adherence
dataset, that I'm trying not to use has 1,5 mio rows of 2 columns. In another question, the OP mentions that lsr
is a few hundred mio. rows.
@minem has responded to this by providing a benchmark in his answer. We can use this benchmark data to compare the different answers.
# create benchmark data
lsr <- data.frame(
ID = c("1", "1", "1", "2", "2", "2", "3", "3"),
eksd = as.Date(c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05")),
DDD = as.integer(c("60", "90", "90", "60", "120", "60", "30", "90")),
stringsAsFactors = FALSE)
lsr$ENDDATE <- lsr$eksd + lsr$DDD
n <- 5e4
lsr2 <- lapply(1:n, function(x) lsr)
lsr2 <- rbindlist(lsr2, use.names = T, fill = T, idcol = T)
lsr2[, ID := as.integer(paste0(.id, ID))]
Thus, the benchmark dataset consists of 400 k rows and 150 k unique ID
s:
lsr2[, .(.N, uniqueN(ID))]
N V2 1: 400000 150000
# pull data preparation out of the benchmark
lsr2i <- copy(lsr2)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
lsr2[, ENDDATE2 := as.numeric(ENDDATE)]
# define date series
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
idates <- seq(as.IDate("2013-02-01"), length.out = 193, by = "month")
# run benchmark
library(microbenchmark)
bm <- microbenchmark(
minem = {
dt <- copy(lsr2)
xtot <- lapply(dates, function(d) {
d <- as.numeric(d)
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
if (length(id2) > 0) {
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
}
setkey(x, ID)
x
})
for (x in seq_along(xtot)) {
setnames(xtot[[x]], c("ID", paste0("V", x)))
}
xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
xtot
},
uwe = {
dt <- copy(lsr2i)
CJ(ID = dt$ID, date = idates, unique = TRUE)[, AH := 0L][
dt, on = .(ID, date >= eksd, date < ENDDATE),
AH := as.integer(ENDDATE - x.date)][, dcast(.SD, ID ~ date)]
},
times = 1L
)
print(bm)
The result for one run shows that the non-equi join is more than 4 times faster than the lapply()
approach.
Unit: seconds expr min lq mean median uq max neval minem 27.654703 27.654703 27.654703 27.654703 27.654703 27.654703 1 uwe 5.958907 5.958907 5.958907 5.958907 5.958907 5.958907 1
这篇关于为用户定义的函数构建高效循环:data.table的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!