矢量化操作的速度取决于数据列的数量 [英] Speed of vectorized operation dependent of number of columns of data.frame

查看:121
本文介绍了矢量化操作的速度取决于数据列的数量的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

为什么对元素数量相同、但排列在更多列中的 data.frame 执行矢量化操作需要更长时间?以这个简单的例子为例:我们从每个元素中减去 0.5,然后比较结果是否 < 0(与此问题相关):

  # Subtract `x` from every element of `df`, then replace negative results
  # with 0.
  #
  # df: a data.frame; x: offset to subtract (default 0.5).
  # Returns the modified data.frame.
  f.df <- function(df, x = 0.5) {
    df <- df - x
    df[df < 0] <- 0
    df
  }


# Three data.frames with the same number of random values (1e5 each),
# arranged as 100 x 1000, 1000 x 100 and 10000 x 10 (rows x columns).
df1 <- data.frame(matrix(runif(1e5), nrow = 1e2))
df2 <- data.frame(matrix(runif(1e5), nrow = 1e3))
df3 <- data.frame(matrix(runif(1e5), nrow = 1e4))

# library() stops with an error if the package is missing
library(microbenchmark)
microbenchmark(f.df(df1), f.df(df2), f.df(df3), times = 10L)


#Unit:milliseconds
#expr min lq median uq max neval
#f.df(df1)1562.66827 1568.21097 1595.07005 1674.91726 1680.90092 10
#f.df(df2)95.77452 98.12557 101.31215 190.46906 198.23927 10
#f.df(df3)16.25295 16.42373 16.74989 17.95621 18.69218 10


解决方案

一些性能分析表明,大部分时间花在 [<-.data.frame 中。



因此,缩放问题来自 Ops.data.frame 和 [<-.data.frame 的工作方式,以及 [<-.data.frame 的复制行为和 [[<- 对命名列表的复制。



Ops.data.frame 中的相关代码(附有我的注释)

 # Excerpt from the body of Ops.data.frame (annotated).
 # cn is the names of your data.frame
 for (j in seq_along(cn)) {
     # use the j-th column of an operand unless that operand is a scalar
     left <- if (!lscalar)
         e1[[j]]
     else e1
     right <- if (!rscalar)
         e2[[j]]
     else e2
     value[[j]] <- eval(f)
 }
 # the arithmetic operators return a data.frame
 if (.Generic %in% c("+", "-", "*", "/", "%%", "%/%")) {
     names(value) <- cn
     data.frame(value, row.names = rn, check.names = FALSE,
         check.rows = FALSE)
 } # every other operator returns a matrix
 else matrix(unlist(value, recursive = FALSE, use.names = FALSE),
     nrow = nr, dimnames = list(rn, cn))

当您使用 Ops.data.frame 时,它会在 for 循环中逐列处理,每次都用 [[<- 进行替换。
这意味着随着列数的增加,所需时间也会增加(因为 data.frame 是命名列表,会发生一些保护性的内部复制)——因此耗时与列数成线性关系。

 #例如,只有这部分将按列数扩展
# Only the subtraction step of f.df.
#
# df: a data.frame; x: offset to subtract (default 0.5).
# Returns `df - x`.
f.df.1 <- function(df, x = 0.5) {
  df <- df - x
  df
}
microbenchmark(f.df.1(df1), f.df.1(df2), f.df.1(df3), times = 10L)
#单位:毫秒
#expr最小lq中位数uq max neval
#f。 df.1(df1)96.739646 97.143298 98.36253 172.937100 175.539239 10
#f.df.1(df2)11.697373 11.955173 12.12206 12.304543 281.055865 10
#f.df.1(df3)3.114089 3.149682 3.41174 3.575835 3.640467 10

当 i 是与 x 维度相同的逻辑矩阵时,[<-.data.frame 也有类似的按列循环

  # Excerpt from the body of `[<-.data.frame`: the branch taken when `i` is a
  # logical matrix with the same dimensions as `x`.
  if(is.logical(i) && is.matrix(i) && all(dim(i) == dim(x))) {
      nreplace <- sum(i, na.rm=TRUE)
      if(!nreplace) return(x) # nothing to replace
      ## allow replication of length(value) > 1 in 1.8.0
      N <- length(value)
      if(N > 1L && N < nreplace && (nreplace %% N) == 0L)
          value <- rep(value, length.out = nreplace)
      if(N > 1L && (length(value) != nreplace))
          stop("'value' is the wrong length")
      n <- 0L
      nv <- nrow(x)
      # one pass per column: this is the loop that scales with ncol(x)
      for(v in seq_len(dim(i)[2L])) {
          thisvar <- i[, v, drop = TRUE]
          nv <- sum(thisvar, na.rm = TRUE)
          if(nv) {
              if(is.matrix(x[[v]]))
                  x[[v]][thisvar, ] <- if(N > 1L) value[n+seq_len(nv)] else value
              else
                  x[[v]][thisvar] <- if(N > 1L) value[n+seq_len(nv)] else value
          }
          n <- n+nv
      }
      return(x)
  }


# Only the replacement step of f.df: set negative elements of `df` to 0.
#
# df: a data.frame; x: unused, kept so the signature matches f.df.
# Returns the modified data.frame.
f.df.2 <- function(df, x = 0.5) {
  df[df < 0] <- 0
  df
}
microbenchmark(f.df.2(df1), f.df.2(df2), f.df.2(df3), times = 10L)
#单位:毫秒
#expr最小lq中位数uq max neval
#f.df.2(df1)20.500873 20.575801 20.699469 20.993723 84.825607 10
#f.df.2(df2)3.143228 3.149111 3.173265 3.353779 3.409068 10
#f.df.2(df3)1.581727 1.634463 1.707337 1.876240 1.887746 10
pre>




[<-.data.frame(以及 <-)也都会进行复制






如何改进:您可以使用 lapply,或者使用 data.table 包中的 set

  library(data.table)
# Subtract `x` from every column of `df` and set negative results to 0,
# updating the columns of a copy with data.table's set().
#
# df: a data.frame (presumably all-numeric columns; confirm for other input).
# x:  offset subtracted from every element (default 0.5).
# Returns the modified copy.
sdf <- function(df, x = 0.5) {
  # explicit copy so there are no changes to original
  dd <- copy(df)
  for (j in names(df)) {
    # use the caller-supplied offset rather than a hard-coded 0.5
    set(dd, j = j, value = dd[[j]] - x)
    # this is slow when (necessarily) done repeatedly; perhaps this
    # could come out of the loop and into a `lapply` or `vapply` statement
    whi <- which(dd[[j]] < 0)
    if (length(whi) > 0) {
      set(dd, j = j, i = whi, value = 0.0)
    }
  }
  dd
}

microbenchmark(sdf(df1), sdf(df2), sdf(df3), times = 10L)
#单位:毫秒
#expr最小lq中位数uq最大neval
#sdf(df1)87.471560 88.323686 89.880685 92.659141 153.218536 10
#sdf(df2)6.235951 6.531192 6.630981 6.786801 7.230825 10
#sdf(df3)2.631641 2.729612 2.7 75762 2.884807 2.970556 10

# A base R approach using lapply: subtract `x` from each column, replace
# negative results with 0, and rebuild a data.frame from the columns.
#
# df: a data.frame; x: offset to subtract (default 0.5).
# Returns a data.frame built from the transformed columns.
ldf <- function(df, x = 0.5) {
  as.data.frame(lapply(df, function(xx, x) {
    xxx <- xx - x
    replace(xxx, xxx < 0, 0)
  }, x = x))
}

#挺好的对于大的data.frames
microbenchmark(ldf(df1), ldf(df2), ldf(df3), times = 10L)
#单位:毫秒
#expr min lq中位数uq最大净额
#ldf(df1)84.380144 84.659572 85.987488 159.928249 161.720599 10
#ldf(df2)11.507918 11.793418 11.948194 12.175975 86.186517 10
#ldf(df3)4.237206 4.368717 4.449018 4.627336 5.081222 10

#他们都生成相同的
dd <- sdf(df1)
ddf1 <- f.df(df1)
ldf1 <- ldf(df1)
identical(dd, ddf1)
## [1] TRUE
identical(ddf1, ldf1)
## [1] TRUE

#sdf和ldf与许多列相当,
#见下面的基准。
microbenchmark(sdf(df1), ldf(df1), f.df(df1), times = 10L)
#单位:毫秒
#expr最小lq中位数uq max neval
#sdf(df1)85.75355 86.47659 86.76647 87.88829 172.0589 10
#ldf(df1)84.73023 85.27622 85.61528 172.02897 356.4318 10
#f.df(df1)3689.83135 3730.20084 3768.44067 3905.69565 3949.3532 10
#sdf对于较小的data.frames
microbenchmark(sdf(df2),ldf(df2),f.df(df2),times = 10L)
#单位:毫秒
#expr最小lq中位数uq最大值
#sdf(df2)6.46860 6.557955 6.603772 6.927785 7.019567 10
#ldf(df2)12.26376 12.551905 12.576802 12.667775 12.982594 10
#f.df(df2)268.42042 273.800762 278.435929 346.112355 503.551387 10
microbenchmark(sdf(df3), ldf(df3), f.df(df3), times = 10L)
#单位:毫秒
#expr min lq median uq max neval
#sdf(df3)2.538830 2.911310 3.020998 3.120961 74.980466 10
#ldf(df3)4.698771 5.202121 5.272721 5.407351 5.424124 10
#f.df(df3)17.819254 18.039089 18.158069 19.692038 90.620645 10

#复制较大的对象较慢,重复调用的速度较慢。

microbenchmark(copy(df1),copy(df2),copy(df3),times = 10L)
#单位:微秒
#expr min lq median uq max neval
#copy(df1)369.926 407.218 480.5710 527.229 618.698 10
#copy(df2)165.402 224.626 279.5445 296.215 519.773 10
#copy(df3)150.148 180.625 214.9140 276.035 467.972 10


Why does a vectorized operation take longer on a data.frame with the same number of elements when those elements are arranged in more columns? Take this simple example, where we subtract 0.5 from each element and then compare the result to see if it is < 0 (related to this question):

# Subtract `x` from every element of `df`, then replace negative results
# with 0.
#
# df: a data.frame; x: offset to subtract (default 0.5).
# Returns the shifted data.frame with negative entries set to 0.
f.df <- function(df, x = 0.5) {
  shifted <- df - x
  is_negative <- shifted < 0
  shifted[is_negative] <- 0
  shifted
}


# Three data.frames with the same number of random values (1e5 each),
# arranged as 100 x 1000, 1000 x 100 and 10000 x 10 (rows x columns).
df1 <- data.frame(matrix(runif(1e5), nrow = 1e2))
df2 <- data.frame(matrix(runif(1e5), nrow = 1e3))
df3 <- data.frame(matrix(runif(1e5), nrow = 1e4))

# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(microbenchmark)
microbenchmark(f.df(df1), f.df(df2), f.df(df3), times = 10L)


#Unit: milliseconds
#     expr        min         lq     median         uq        max neval
# f.df(df1) 1562.66827 1568.21097 1595.07005 1674.91726 1680.90092    10
# f.df(df2)   95.77452   98.12557  101.31215  190.46906  198.23927    10
# f.df(df3)   16.25295   16.42373   16.74989   17.95621   18.69218    10

解决方案

A bit of profiling shows that most of your time is spent in [<-.data.frame.

The scaling issues therefore come from how Ops.data.frame and [<-.data.frame work, and from how [<-.data.frame copies and how [[<- copies for a named list.

The relevant code in Ops.data.frame (with my comments)

 # cn is the names of your data.frame 
 for (j in seq_along(cn)) {
         left <- if (!lscalar) 
             e1[[j]]
         else e1
         right <- if (!rscalar) 
             e2[[j]]
         else e2
         value[[j]] <- eval(f)
     }
    # sometimes return a data.frame
     if (.Generic %in% c("+", "-", "*", "/", "%%", "%/%")) {
         names(value) <- cn
         data.frame(value, row.names = rn, check.names = FALSE, 
             check.rows = FALSE)
     } # sometimes return a matrix
     else matrix(unlist(value, recursive = FALSE, use.names = FALSE), 
         nrow = nr, dimnames = list(rn, cn))

When you use Ops.data.frame it will cycle through your columns in the for loop, using [[<- to replace each one in turn. This means that as the number of columns increases, the time required will increase (there will be some protective internal copying, because a data.frame is a named list) -- hence it will scale linearly with the number of columns.

# for example  only this part will scale with the number of columns
# Only the subtraction step of f.df.
#
# df: a data.frame; x: offset to subtract (default 0.5).
# Returns `df - x`.
f.df.1 <- function(df, x = 0.5) {
  shifted <- df - x
  shifted
}
microbenchmark(f.df.1(df1),f.df.1(df2),f.df.1(df3), times = 10L)
# Unit: milliseconds
#        expr       min        lq   median         uq        max neval
# f.df.1(df1) 96.739646 97.143298 98.36253 172.937100 175.539239    10
# f.df.1(df2) 11.697373 11.955173 12.12206  12.304543 281.055865    10
# f.df.1(df3)  3.114089  3.149682  3.41174   3.575835   3.640467    10

[<-.data.frame has a similar loop through columns when i is a logical matrix of the same dimension as x

 if(is.logical(i) && is.matrix(i) && all(dim(i) == dim(x))) {
            nreplace <- sum(i, na.rm=TRUE)
            if(!nreplace) return(x) # nothing to replace
            ## allow replication of length(value) > 1 in 1.8.0
            N <- length(value)
            if(N > 1L && N < nreplace && (nreplace %% N) == 0L)
                value <- rep(value, length.out = nreplace)
            if(N > 1L && (length(value) != nreplace))
                stop("'value' is the wrong length")
            n <- 0L
            nv <- nrow(x)
            for(v in seq_len(dim(i)[2L])) {
                thisvar <- i[, v, drop = TRUE]
                nv <- sum(thisvar, na.rm = TRUE)
                if(nv) {
                    if(is.matrix(x[[v]]))
                        x[[v]][thisvar, ] <- if(N > 1L) value[n+seq_len(nv)] else value
                    else
                        x[[v]][thisvar] <- if(N > 1L) value[n+seq_len(nv)] else value
                }
                n <- n+nv
            }
            return(x)


# Only the replacement step of f.df: set negative elements of `df` to 0.
#
# df: a data.frame; x: unused, kept so the signature matches f.df.
# Returns the modified data.frame.
f.df.2 <- function(df, x = 0.5) {
  is_negative <- df < 0
  df[is_negative] <- 0
  df
}
 microbenchmark(f.df.2(df1), f.df.2(df2), f.df.2(df3), times = 10L)
# Unit: milliseconds
#        expr       min        lq    median        uq       max neval
# f.df.2(df1) 20.500873 20.575801 20.699469 20.993723 84.825607    10
# f.df.2(df2)  3.143228  3.149111  3.173265  3.353779  3.409068    10
# f.df.2(df3)  1.581727  1.634463  1.707337  1.876240  1.887746    10


[<-.data.frame (and <-) will both copy as well


How to improve. You can use lapply or set from the data.table package

library(data.table)
# Subtract `x` from every column of `df` and set negative results to 0,
# updating the columns of a copy with data.table's set().
#
# df: a data.frame (presumably all-numeric columns; confirm for other input).
# x:  offset subtracted from every element (default 0.5).
# Returns the modified copy.
sdf <- function(df, x = 0.5) {
  # explicit copy so there are no changes to original
  dd <- copy(df)
  for (j in names(df)) {
    # use the caller-supplied offset rather than a hard-coded 0.5
    set(dd, j = j, value = dd[[j]] - x)
    # this is slow when (necessarily) done repeatedly; perhaps this
    # could come out of the loop and into a `lapply` or `vapply` statement
    whi <- which(dd[[j]] < 0)
    if (length(whi) > 0) {
      set(dd, j = j, i = whi, value = 0.0)
    }
  }
  dd
}

 microbenchmark(sdf(df1), sdf(df2), sdf(df3), times = 10L)
# Unit: milliseconds
# expr       min        lq    median        uq        max neval
# sdf(df1) 87.471560 88.323686 89.880685 92.659141 153.218536    10
# sdf(df2)  6.235951  6.531192  6.630981  6.786801   7.230825    10
# sdf(df3)  2.631641  2.729612  2.775762  2.884807   2.970556    10

# a base R approach using lapply
# Base R variant: subtract `x` from each column with lapply(), replace
# negative results with 0, and rebuild a data.frame from the columns.
#
# df: a data.frame; x: offset to subtract (default 0.5).
# Returns a data.frame built from the transformed columns.
ldf <- function(df, x = 0.5) {
  clamp_shifted <- function(column, offset) {
    shifted <- column - offset
    shifted[shifted < 0] <- 0
    shifted
  }
  columns <- lapply(df, clamp_shifted, offset = x)
  as.data.frame(columns)
}

# pretty good. Does well with large data.frames
microbenchmark(ldf(df1), ldf(df2), ldf(df3), times = 10L)
# Unit: milliseconds
# expr       min        lq    median         uq        max neval
# ldf(df1) 84.380144 84.659572 85.987488 159.928249 161.720599    10
# ldf(df2) 11.507918 11.793418 11.948194  12.175975  86.186517    10
# ldf(df3)  4.237206  4.368717  4.449018   4.627336   5.081222    10

# they all produce the same
dd <- sdf(df1)
ddf1 <- f.df(df1)
ldf1 <- ldf(df1)
identical(dd,ddf1)
## [1] TRUE
identical(ddf1, ldf1)
## [1] TRUE

# sdf and ldf comparable with lots of columns
# see benchmarking below.
microbenchmark(sdf(df1), ldf(df1), f.df(df1),  times = 10L)
# Unit: milliseconds
# expr        min         lq     median         uq       max neval
# sdf(df1)   85.75355   86.47659   86.76647   87.88829  172.0589    10
# ldf(df1)   84.73023   85.27622   85.61528  172.02897  356.4318    10
# f.df(df1) 3689.83135 3730.20084 3768.44067 3905.69565 3949.3532    10
# sdf ~ twice as fast with smaller data.frames
 microbenchmark(sdf(df2), ldf(df2), f.df(df2),  times = 10L)
# Unit: milliseconds
# expr       min         lq     median         uq        max neval
# sdf(df2)   6.46860   6.557955   6.603772   6.927785   7.019567    10
# ldf(df2)  12.26376  12.551905  12.576802  12.667775  12.982594    10
# f.df(df2) 268.42042 273.800762 278.435929 346.112355 503.551387    10
microbenchmark(sdf(df3), ldf(df3), f.df(df3),  times = 10L)
# Unit: milliseconds
# expr       min        lq    median        uq       max neval
# sdf(df3)  2.538830  2.911310  3.020998  3.120961 74.980466    10
# ldf(df3)  4.698771  5.202121  5.272721  5.407351  5.424124    10
# f.df(df3) 17.819254 18.039089 18.158069 19.692038 90.620645    10

# copying of larger objects is slower, repeated calls to which are slow.

microbenchmark(copy(df1), copy(df2), copy(df3), times = 10L)
# Unit: microseconds
# expr     min      lq   median      uq     max neval
# copy(df1) 369.926 407.218 480.5710 527.229 618.698    10
# copy(df2) 165.402 224.626 279.5445 296.215 519.773    10
# copy(df3) 150.148 180.625 214.9140 276.035 467.972    10

这篇关于矢量化操作的速度取决于数据列的数量的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆