矢量化操作的速度取决于数据列的数量 [英] Speed of vectorized operation dependent of number of columns of data.frame
问题描述
f.df< - function(df,x = 0.5){
df < - df - x
df [df < 0]< - 0
return(df)
}
df1 < - data.frame(matrix(runif(1e5),nrow = 1e2) )
df2< - data.frame(matrix(runif(1e5),nrow = 1e3))
df3< - data.frame(matrix(runif(1e5),nrow = 1e4))
要求(微基准)
微基准(f.df(df1),f.df(df2),f.df(df3),times = 10L)
#Unit:milliseconds
#expr min lq median uq max neval
#f.df(df1)1562.66827 1568.21097 1595.07005 1674.91726 1680.90092 10
#f.df(df2)95.77452 98.12557 101.31215 190.46906 198.23927 10
#f.df(df3)16.25295 16.42373 16.74989 17.95621 18.69218 10
一些分析表明,大部分时间花在 [< - 。data.frame
中。
因此,缩放问题来自于 Ops.data.frame
和 [< -.dataframe
工作以及 [< - 。data.frame
副本以及 [[< -
一个命名列表的副本。
Ops.data.frame
中的相关代码我的意见)
#cn是您的data.frame
的名称(se在seq_along(cn) ){
left< - if(!lscalar)
e1 [[j]]
else e1
right< - if(!rscalar)
e2 [ [j]]
else e2
value [[j]]< - eval(f)
}
#有时返回一个data.frame
if(。 %c(+, - ,*,/,%%,%/%)中的通用%){
名称(值)< - cn
data.frame(value,row.names = rn,check.names = FALSE,
check.rows = FALSE)
}#有时返回一个矩阵
else矩阵(unlist(value, recursive = FALSE,use.names = FALSE),
nrow = nr,dimnames = list(rn,cn))
当您使用 Ops.data.frame
时,它将循环使用for循环中的列,使用 [[< -
每次更换。
这意味着随着列数的增加,所需的时间将增加(因为将会有一些保护性内部复制,因为它是一个数据。框架被命名为列表) - 因此它将以列数线性缩放
#例如,只有这部分将按列数扩展
f.df.1< - function (df,x = 0.5){
df < - df - x
return(df)
}
微基准(f.df.1(df1) ,f.df.1(df2),f.df.1(df3),times = 10L)
#单位:毫秒
#expr最小lq中位数uq max neval
#f。 df.1(df1)96.739646 97.143298 98.36253 172.937100 175.539239 10
#f.df.1(df2)11.697373 11.955173 12.12206 12.304543 281.055865 10
#f.df.1(df3)3.114089 3.149682 3.41174 3.575835 3.640467 10
[< - 。data.frame
当 i
是与...相同维度的逻辑矩阵时, code> x
if(is.logical(i)& is.matrix(i)&&所有(dim(i)== dim(x))){
pre>
nreplace< - sum(i,na.rm = TRUE)
if(!nreplace)return(x)
##允许复制length(value)> 1 in 1.8.0
N < - length(value)
if(N> 1L&& N< nreplace&(nreplace %% N)== 0L)
值< - rep(value,length.out = nreplace)
if(N> 1L&&(length(value)!= nreplace))
stop('value '是错误的长度')
n < - 0L
nv< - nrow(x)
for(v in seq_len(dim(i)[2L])){
thisvar< - i [,v,drop = TRUE]
nv< - sum(thisvar,na.rm = TRUE)
if(nv){
if(is.matrix (x [[v]]))
x [[v]] [thisvar,]< - if(N> 1L)value [n + seq_len(nv)] else value
else
x [[v]] [thisvar]< - if(N> 1L)value [n + seq_len(nv)] else value
}
n < - n + nv
}
return(x)
f.df.2< - fu (df,x = 0.5){
df [df < 0] < - 0
return(df)
}
微基准(f.df.2(df1),f.df.2(df2),f.df .2(df3),times = 10L)
#单位:毫秒
#expr最小lq中位数uq max neval
#f.df.2(df1)20.500873 20.575801 20.699469 20.993723 84.825607 10
#f.df.2(df2)3.143228 3.149111 3.173265 3.353779 3.409068 10
#f.df.2(df3)1.581727 1.634463 1.707337 1.876240 1.887746 10
[< -
data.frame code>< - )将同时复制
如何改进。您可以从
data.table
中使用lapply
或设置
包library(data.table)
sdf< - function(df,x = 0.5){
#显式副本,所以没有更改原始
dd< - copy(df)
for(j in names(df)){
set(dd,j = j, value = dd [[j]] - 0.5)
#这是很慢的(必须)重复完成或许这个
#可以从循环中出来,进入lapply或vapply b $ b whi < - (dd [j]]< 0)
if(length(whi)){
set(dd,j = j,i = whi,value = )
}
}
return(dd)
}
微基准(sdf(df1),sdf(df2),sdf(df3) = 10L)
#单位:毫秒
#expr最小lq中位数uq最大neval
#sdf(df1)87.471560 88.323686 89.880685 92.659141 153.218536 10
#sdf(df2)6.235951 6.531192 6.630981 6.786801 7.230825 10
#sdf(df3)2.631641 2.729612 2.7 75762 2.884807 2.970556 10
#使用lapply
的基础R方法ldf< - function(df,x = 0.5){
as.data.frame( lapply(df,function(xx,x){xxx
}
#挺好的对于大的data.frames
微基准(ldf(df1),ldf(df2),ldf(df3),times = 10L)
#单位:毫秒
#expr min lq中位数uq最大净额
#ldf(df1)84.380144 84.659572 85.987488 159.928249 161.720599 10
#ldf(df2)11.507918 11.793418 11.948194 12.175975 86.186517 10
#ldf(df3)4.237206 4.368717 4.449018 4.627336 5.081222 10
#他们都生成相同的
dd< - sdf(df1)
ddf1< - f.df(df1)
ldf1 < - ldf(df1)
相同(dd,ddf1)
## [1] TRUE
相同(ddf1,ldf1)
## [1] TRUE
#sdf和ldf与许多列相当,
#见下面的基准。
微基准(sdf(df1),ldf(df1),f.df(df1),times = 10L)
#单位:毫秒
#expr最小lq中位数uq max neval
#sdf(df1)85.75355 86.47659 86.76647 87.88829 172.0589 10
#ldf(df1)84.73023 85.27622 85.61528 172.02897 356.4318 10
#f.df(df1)3689.83135 3730.20084 3768.44067 3905.69565 3949.3532 10
#sdf对于较小的data.frames
microbenchmark(sdf(df2),ldf(df2),f.df(df2),times = 10L)
#单位:毫秒
#expr最小lq中位数uq最大值
#sdf(df2)6.46860 6.557955 6.603772 6.927785 7.019567 10
#ldf(df2)12.26376 12.551905 12.576802 12.667775 12.982594 10
#f.df(df2)268.42042 273.800762 278.435929 346.112355 503.551387 10
微基准(sdf(df3),ldf(df3),f.df(df3),times = 10L)
#单位:毫秒
#expr min lq median uq max neval
#sdf(df3)2.538830 2.911310 3.020998 3.120961 74.980466 10
#ldf(df3)4.698771 5.202121 5.272721 5.407351 5.424124 10
#f.df(df3)17.819254 18.039089 18.158069 19.692038 90.620645 10
#复制较大的对象较慢,重复调用的速度较慢。
microbenchmark(copy(df1),copy(df2),copy(df3),times = 10L)
#单位:微秒
#expr min lq median uq max neval
#copy(df1)369.926 407.218 480.5710 527.229 618.698 10
#copy(df2)165.402 224.626 279.5445 296.215 519.773 10
#copy(df3)150.148 180.625 214.9140 276.035 467.972 10
Why does it take longer to operate a comparison on a data.frame with the same number of elements, but arranged in more columns on vectorized operations? Take this simple example, where we subtract 0.5 from each element and then compare it to see if it is < 0 ( related to this question ):
f.df <- function( df , x = 0.5 ){ df <- df - x df[ df < 0 ] <- 0 return( df ) } df1 <- data.frame( matrix( runif(1e5) , nrow = 1e2 ) ) df2 <- data.frame( matrix( runif(1e5) , nrow = 1e3 ) ) df3 <- data.frame( matrix( runif(1e5) , nrow = 1e4 ) ) require( microbenchmark ) microbenchmark( f.df( df1 ) , f.df( df2 ) , f.df( df3 ) , times = 10L ) #Unit: milliseconds # expr min lq median uq max neval # f.df(df1) 1562.66827 1568.21097 1595.07005 1674.91726 1680.90092 10 # f.df(df2) 95.77452 98.12557 101.31215 190.46906 198.23927 10 # f.df(df3) 16.25295 16.42373 16.74989 17.95621 18.69218 10
解决方案A bit of profiling shows that most of your time is spent in
[<-.data.frame
.The scaling issues therefore come from how
Ops.data.frame
and[<-.dataframe
work and how[<-.data.frame
copies, and[[<-
copies for a named list,.The relevant code in
Ops.data.frame
(with my comments)# cn is the names of your data.frame for (j in seq_along(cn)) { left <- if (!lscalar) e1[[j]] else e1 right <- if (!rscalar) e2[[j]] else e2 value[[j]] <- eval(f) } # sometimes return a data.frame if (.Generic %in% c("+", "-", "*", "/", "%%", "%/%")) { names(value) <- cn data.frame(value, row.names = rn, check.names = FALSE, check.rows = FALSE) } # sometimes return a matrix else matrix(unlist(value, recursive = FALSE, use.names = FALSE), nrow = nr, dimnames = list(rn, cn))
When you use
Ops.data.frame
it will cycle through your columns in the for loop using[[<-
to replace each time. This means as the number of columns increases, the time required will increase (as there will be some protective internal copying as it is a data.frame is named list ) -- hence it will scale linearly with the number of columns# for example only this part will scale with the number of columns f.df.1 <- function( df , x = 0.5 ){ df <- df - x return( df ) } microbenchmark(f.df.1(df1),f.df.1(df2),f.df.1(df3), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # f.df.1(df1) 96.739646 97.143298 98.36253 172.937100 175.539239 10 # f.df.1(df2) 11.697373 11.955173 12.12206 12.304543 281.055865 10 # f.df.1(df3) 3.114089 3.149682 3.41174 3.575835 3.640467 10
[<-.data.frame
has a similar loop through columns wheni
is a logical matrix of the same dimension asx
if(is.logical(i) && is.matrix(i) && all(dim(i) == dim(x))) { nreplace <- sum(i, na.rm=TRUE) if(!nreplace) return(x) # nothing to replace ## allow replication of length(value) > 1 in 1.8.0 N <- length(value) if(N > 1L && N < nreplace && (nreplace %% N) == 0L) value <- rep(value, length.out = nreplace) if(N > 1L && (length(value) != nreplace)) stop("'value' is the wrong length") n <- 0L nv <- nrow(x) for(v in seq_len(dim(i)[2L])) { thisvar <- i[, v, drop = TRUE] nv <- sum(thisvar, na.rm = TRUE) if(nv) { if(is.matrix(x[[v]])) x[[v]][thisvar, ] <- if(N > 1L) value[n+seq_len(nv)] else value else x[[v]][thisvar] <- if(N > 1L) value[n+seq_len(nv)] else value } n <- n+nv } return(x) f.df.2 <- function( df , x = 0.5 ){ df[df < 0 ] <- 0 return( df ) } microbenchmark(f.df.2(df1), f.df.2(df2), f.df.2(df3), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # f.df.2(df1) 20.500873 20.575801 20.699469 20.993723 84.825607 10 # f.df.2(df2) 3.143228 3.149111 3.173265 3.353779 3.409068 10 # f.df.2(df3) 1.581727 1.634463 1.707337 1.876240 1.887746 10
[<-
data.frame (and<-
) will both copy as well
How to improve. You can use
lapply
orset
from thedata.table
packagelibrary(data.table) sdf <- function(df, x = 0.5){ # explicit copy so there are no changes to original dd <- copy(df) for(j in names(df)){ set(dd, j= j, value = dd[[j]] - 0.5) # this is slow when (necessarily) done repeatedly perhaps this # could come out of the loop and into a `lapply` or `vapply` statment whi <- which(dd[[j]] < 0 ) if(length(whi)){ set(dd, j= j, i = whi, value = 0.0) } } return(dd) } microbenchmark(sdf(df1), sdf(df2), sdf(df3), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # sdf(df1) 87.471560 88.323686 89.880685 92.659141 153.218536 10 # sdf(df2) 6.235951 6.531192 6.630981 6.786801 7.230825 10 # sdf(df3) 2.631641 2.729612 2.775762 2.884807 2.970556 10 # a base R approach using lapply ldf <- function(df, x = 0.5){ as.data.frame(lapply(df, function(xx,x){ xxx <- xx-x;replace(xxx, xxx<0,0)}, x=x)) } # pretty good. Does well with large data.frames microbenchmark(ldf(df1), ldf(df2), ldf(df3), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # ldf(df1) 84.380144 84.659572 85.987488 159.928249 161.720599 10 # ldf(df2) 11.507918 11.793418 11.948194 12.175975 86.186517 10 # ldf(df3) 4.237206 4.368717 4.449018 4.627336 5.081222 10 # they all produce the same dd <- sdf(df1) ddf1 <- f.df(df1) ldf1 <- ldf(df1) identical(dd,ddf1) ## [1] TRUE identical(ddf1, ldf1) ## [1] TRUE # sdf and ldf comparable with lots of columns # see benchmarking below. microbenchmark(sdf(df1), ldf(df1), f.df(df1), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # sdf(df1) 85.75355 86.47659 86.76647 87.88829 172.0589 10 # ldf(df1) 84.73023 85.27622 85.61528 172.02897 356.4318 10 # f.df(df1) 3689.83135 3730.20084 3768.44067 3905.69565 3949.3532 10 # sdf ~ twice as fast with smaller data.frames microbenchmark(sdf(df2), ldf(df2), f.df(df2), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # sdf(df2) 6.46860 6.557955 6.603772 6.927785 7.019567 10 # ldf(df2) 12.26376 12.551905 12.576802 12.667775 12.982594 10 # f.df(df2) 268.42042 273.800762 278.435929 346.112355 503.551387 10 microbenchmark(sdf(df3), ldf(df3), f.df(df3), times = 10L) # Unit: milliseconds # expr min lq median uq max neval # sdf(df3) 2.538830 2.911310 3.020998 3.120961 74.980466 10 # ldf(df3) 4.698771 5.202121 5.272721 5.407351 5.424124 10 # f.df(df3) 17.819254 18.039089 18.158069 19.692038 90.620645 10 # copying of larger objects is slower, repeated calls to which are slow. microbenchmark(copy(df1), copy(df2), copy(df3), times = 10L) # Unit: microseconds # expr min lq median uq max neval # copy(df1) 369.926 407.218 480.5710 527.229 618.698 10 # copy(df2) 165.402 224.626 279.5445 296.215 519.773 10 # copy(df3) 150.148 180.625 214.9140 276.035 467.972 10
这篇关于矢量化操作的速度取决于数据列的数量的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!