数据表中的逐行操作和更新 [英] row-by-row operations and updates in data.table

查看：75 发布时间：2017/3/12 10:17:35 r data.table

本文介绍了数据表中的逐行操作和更新的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我结束了一个大数据表，我必须做每行操作。（是...我知道这显然不是什么data.table是）

  R）set.seed ）
 R）DT = data.table（matrix（rnorm（100），nrow = 10））
 R）DT [，c（'a'，'b'）： 10,2：11）] 
 R）DT 
 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ab 
 1：-0.6264538107 1.51178116845 0.91897737161 1.35867955153 -0.1645235963 0.3981058804 2.40161776050 0.475509528900 -0.5686687328 -0.5425200310 1 2 
 2：0.1836433242 0.38984323641 0.78213630073 -0.10278772734 -0.2533616801 -0.6120263933 -0.03924000273 -0.709946430922 -0.1351786151 1.2078678060 2 3 
 3：-0.8356286124 -0.62124058054 0.07456498337 0.38767161156 0.6969633754 0.3411196914 0.68973936245 0.610726353489 1.1780869966 1.1604026157 3 4 
 4 ：1.5952808021 -2.21469988718 -1.98935169586 -0.05380504058 0.5566631987 -1.1293630961 0.02800215878 -0.934097631644 -1.5235668004 0.7002136495 4 5 
 5：0.3295077718 1.12493091814 0.61982574789 -1.37705955683 -0.6887556945 1.4330237017 -0.74327320888 -1.253633400239 0.5939461876 1.5868334545 5 6 
 6：-0.8204683841  - 0.04493360902 -0.05612873953 -0.41499456330 -0.7074951570 1.9803998985 0.18879229951 0.291446235517 0.3329503712 0.5584864256 6 7 
 7：0.4874290524 -0.01619026310 -0.15579550671 -0.39428995371 0.3645819621 -0.3672214765 -1.80495862889 -0.443291873218 1.0630998373 -1.2765922085 7 8 
 8：0.7383247051 0.94383621069 -1.47075238390  - 0.05931339671 0.7685329245 -1.0441346263 1.46555486156 0.001105351632 -0.3041839236 -0.5732654142 8 9 
 9：0.5757813517 0.82122119510 -0.47815005511 1.10002537198 -0.1123462122 0.5697196274 0.15325333821 0.074341324152 0.3700188099 -1.2246126149 9 10 
 10：-0.3053883872 0.59390132122 0.41794156020 0.76317574846 0.8811077265 -0.1350546039 2.17261167036 -0.589520946188 0.2670987908 -0.4734006364 10 11

说我想要 min 所有的 Vi 列，我使用 apply 当我使用 data.frame 。

  ：10），with = FALSE]，FUN = min，MAR = 1）
 [1] -0.6264538107 -0.7099464309 -0.8356286124 -2.2146998872 -1.3770595568 -0.8204683841 -1.8049586289 -1.4707523839 -1.2246126149 -0.5895209462 
  
 
 
 所以我可以轻松更新。
 
 
 想立即更新 min 和 max （当然这是一个例子，所以我只需要2个东西，但在现实生活将是10 ...）
  f = function（x）{return（c（max = max ），min = min（x））} 
 new = apply（DT [，paste0（'V'，1:10），with = FALSE]，FUN = f，MAR = 1）
 $，
 max 2.4016177605 1.2078678060 1.1780869966 1.595280802 1.586833455 [，1] [，2] [，3] [，4] [，5] 1.9803998985 1.063099837 1.465554862 1.100025372 2.1726116704 
 min -0.6264538107 -0.7099464309 -0.8356286124 -2.214699887 -1.377059557 -0.8204683841 -1.804958629 -1.470752384 -1.224612615 -0.5895209462 
  
我想写
  DT [，rownames（new）：= new] 
  
但这不起作用，所以这里是我的问题
 
 使用我的方法，如何转换 new ，以便我可以更新 DT   
 
  
 是否有更好的方法（允许我一次更新多个列， b 
   EDIT  ：我发现一个解决方案1，但是UGLY，实际上很奇怪，：= 不处理 matrix ，我很确定以前是这样的情况
  DT [，c（'a1'，'a2'）：= data.table（matrix（apply（DT [，paste0（'V'，1:10） ，with = FALSE]，FUN = f，MAR = 1），byrow = T，nrow = 10）] 
 R）DT 
 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ab 
 1分配：-0.6264538107 1.51178116845 0.91897737161 1.35867955153 -0.1645235963 0.3981058804 2.40161776050 0.475509528900 -0.5686687328 -0.5425200310 1 2 
 2：0.1836433242 0.38984323641 0.78213630073 -0.10278772734 -0.2533616801 -0.6120263933 -0.03924000273 -0.709946430922 -0.1351786151 1.2078678060 2 3 
 3： - 0.8356286124 -0.62124058054 0.07456498337 0.38767161156 0.6969633754 0.3411196914 0.68973936245 0.610726353489 1.1780869966 1.1604026157 3 4 
 4：1.5952808021 -2.21469988718 -1.98935169586 -0.05380504058 0.5566631987 -1.1293630961 0.02800215878 -0.934097631644 -1.5235668004 0.7002136495 4 5 
 5：0.3295077718 1.12493091814 0.61982574789 -1.37705955683 -0.6887556945 1.4330237017 -0.74327320888 -1.253633400239 0.5939461876 1.5868334545 5 6 
 6：-0.8204683841 -0.04493360902 -0.05612873953 -0.41499456330 -0.7074951570 1.9803998985 0.18879229951 0.291446235517 0.3329503712 0.5584864256 6 7 
 7：0.4874290524 -0.01619026310 -0.15579550671 -0.39428995371 0.3645819621 -0.3672214765 -1.80495862889 -0.443291873218 1.0630998373 -1.2765922085 7 8 
 8：0.7383247051 0.94383621069 -1.47075238390 -0.05931339671 0.7685329245 -1.0441346263 1.46555486156 0.001105351632 -0.3041839236 -0.5732654142 8 9 
 9：0.5757813517 0.82122119510 -0.47815005511 1.10002537198 -0.1123462122 0.5697196274 0.15325333821 0.074341324152 0.3700188099 -1.2246126149 9 10 
 10：-0.3053883872 0.59390132122 0.41794156020 0.76317574846 0.8811077265 -0.1350546039 2.17261167036 -0.589520946188 0.2670987908 -0.4734006364 10 11 
 a1 a2 
 1：2.401617761 -0.6264538107 
 2：1.207867806 -0.7099464309 
 3：1.178086997 -0.8356286124 
 4：1.595280802 -2.2146998872 
 5：1.586833455 -1.3770595568 
 6：1.980399899 -0.8204683841 
 7：1.063099837 -1.8049586289 
 8：1.465554862  - 1.4707523839 
 9：1.100025372 -1.2246126149 
 10：2.172611670 -0.5895209462 
  
  >  EDIT2  ：使用 DT [，（newColnames）：= f（.DT），by = IDX，.SDcols = someIdx]  $ 
 
创建 .SD 可能是一个非常昂贵的操作，特别是如果你的data.table包含 rows>>列。我建议在使用循环的列中使用 pmin 和 pmax 。 
 
 
 数据：
 
 
 
  set.seed（1）
 require（data.table）
 DT1 < -  data.table（matrix（rnorm（1e6），ncol = 10））
 DT1 [，a：= 1：1e5] 
 DT2 < -  copy（DT1）
 DT3   
 
 
 
功能：
 
 
 
  arun<  -  function（DT）{
＃指定第一列（虚拟）
 DT [，`：=`（min = DT [，V1]，max = DT [，V1]）] 
＃ pmin和pmax 
＃并替换最小和最大列
 cols < -  names（DT）[2:10] 
 for（i in cols）{
 DT [ ：=`（min = pmin（min，DT [[i]]），max = pmax（max，DT [[i]]））
} 
 DT 
} 
 
 eddi<  -  function（DT）{
 DT [，`：=`（min = min（.SD），max = max（.SD）），by = a，.SDcols = paste0（V，1:10）] 
} 
 
 frank<  -  function（DT）{
 cols& ^ V [[：digit：]] + $'，名称（DT））] 
 newcols<  -  c（min，max）
 myfun<  -  range 
 DT [，（newcols）：= as.list（myfun（.SD）），。SDcols = cols，by = 1：nrow（DT）] 
} 
  / pre> 
 
 基准化：
 
 
 
  require（microbenchmark）
微基准（o1 < -  arun（DT1），o2  
单位：毫秒
 expr （DT1）204.4417 204.4417 250.5205 296.5992 296.5992 2 
 o2 < -  eddi（DT2）92343.5321 92343.5321 96706.1622 101068.7923 101068.7923 2 
 o3 < - frank（DT3）49083.7000 49083.7000 49521.9296 49960.1592 49960.1592 2 
 
 identical（o1，o2）＃TRUE 
 identical（o1，o3）＃TRUE 
  / pre> 
 
   -  
 
 
 由于@Frank在注释下指出，  do.call  as：
  DT [，c ，max）：= {z  list（do.call（pmin，z），do.call（pmax，z））}] 
  
 
I ended up with a big data.table and I have to do operations per row. (yes... I know that this is clearly not what data.table are for)
R) set.seed(1)
R) DT=data.table(matrix(rnorm(100),nrow=10))
R) DT[,c('a','b'):=list(1:10,2:11)]
R) DT
               V1             V2             V3             V4            V5            V6             V7              V8            V9           V10  a  b
 1: -0.6264538107  1.51178116845  0.91897737161  1.35867955153 -0.1645235963  0.3981058804  2.40161776050  0.475509528900 -0.5686687328 -0.5425200310  1  2
 2:  0.1836433242  0.38984323641  0.78213630073 -0.10278772734 -0.2533616801 -0.6120263933 -0.03924000273 -0.709946430922 -0.1351786151  1.2078678060  2  3
 3: -0.8356286124 -0.62124058054  0.07456498337  0.38767161156  0.6969633754  0.3411196914  0.68973936245  0.610726353489  1.1780869966  1.1604026157  3  4
 4:  1.5952808021 -2.21469988718 -1.98935169586 -0.05380504058  0.5566631987 -1.1293630961  0.02800215878 -0.934097631644 -1.5235668004  0.7002136495  4  5
 5:  0.3295077718  1.12493091814  0.61982574789 -1.37705955683 -0.6887556945  1.4330237017 -0.74327320888 -1.253633400239  0.5939461876  1.5868334545  5  6
 6: -0.8204683841 -0.04493360902 -0.05612873953 -0.41499456330 -0.7074951570  1.9803998985  0.18879229951  0.291446235517  0.3329503712  0.5584864256  6  7
 7:  0.4874290524 -0.01619026310 -0.15579550671 -0.39428995371  0.3645819621 -0.3672214765 -1.80495862889 -0.443291873218  1.0630998373 -1.2765922085  7  8
 8:  0.7383247051  0.94383621069 -1.47075238390 -0.05931339671  0.7685329245 -1.0441346263  1.46555486156  0.001105351632 -0.3041839236 -0.5732654142  8  9
 9:  0.5757813517  0.82122119510 -0.47815005511  1.10002537198 -0.1123462122  0.5697196274  0.15325333821  0.074341324152  0.3700188099 -1.2246126149  9 10
10: -0.3053883872  0.59390132122  0.41794156020  0.76317574846  0.8811077265 -0.1350546039  2.17261167036 -0.589520946188  0.2670987908 -0.4734006364 10 11
Say I want the min across of all the Vi columns row by row, I used to use apply when I was using data.frame.
apply(DT[,paste0('V',1:10),with=FALSE],FUN=min,MAR=1)
 [1] -0.6264538107 -0.7099464309 -0.8356286124 -2.2146998872 -1.3770595568 -0.8204683841 -1.8049586289 -1.4707523839 -1.2246126149 -0.5895209462
So I can update easily.

Ok, now say that I want to update the min and max at once (off course this is an example so I took just 2 things but in real life that would be 10...)
 f = function(x){return(c(max=max(x),min=min(x)))}
 new=apply(DT[,paste0('V',1:10),with=FALSE],FUN=f,MAR=1)
             [,1]          [,2]          [,3]         [,4]         [,5]          [,6]         [,7]         [,8]         [,9]         [,10]
max  2.4016177605  1.2078678060  1.1780869966  1.595280802  1.586833455  1.9803998985  1.063099837  1.465554862  1.100025372  2.1726116704
min -0.6264538107 -0.7099464309 -0.8356286124 -2.214699887 -1.377059557 -0.8204683841 -1.804958629 -1.470752384 -1.224612615 -0.5895209462
I would like to write
DT[,rownames(new):=new]
but this does not work, so here are my questions

Using my method, how can I transform new such that I can update DT at once ?
Are there some better approach (that would allow me to update multiple columns at once, with the result of a by-row calculation)
EDIT: I found a solution for 1 but that's UGLY, actually It is strange that := do not handle matrix, I am pretty sure it used to be the case
DT[,c('a1','a2'):=data.table(matrix(apply(DT[,paste0('V',1:10),with=FALSE],FUN=f,MAR=1),byrow=T,nrow=10))]
R) DT
               V1             V2             V3             V4            V5            V6             V7              V8            V9           V10  a  b
 1: -0.6264538107  1.51178116845  0.91897737161  1.35867955153 -0.1645235963  0.3981058804  2.40161776050  0.475509528900 -0.5686687328 -0.5425200310  1  2
 2:  0.1836433242  0.38984323641  0.78213630073 -0.10278772734 -0.2533616801 -0.6120263933 -0.03924000273 -0.709946430922 -0.1351786151  1.2078678060  2  3
 3: -0.8356286124 -0.62124058054  0.07456498337  0.38767161156  0.6969633754  0.3411196914  0.68973936245  0.610726353489  1.1780869966  1.1604026157  3  4
 4:  1.5952808021 -2.21469988718 -1.98935169586 -0.05380504058  0.5566631987 -1.1293630961  0.02800215878 -0.934097631644 -1.5235668004  0.7002136495  4  5
 5:  0.3295077718  1.12493091814  0.61982574789 -1.37705955683 -0.6887556945  1.4330237017 -0.74327320888 -1.253633400239  0.5939461876  1.5868334545  5  6
 6: -0.8204683841 -0.04493360902 -0.05612873953 -0.41499456330 -0.7074951570  1.9803998985  0.18879229951  0.291446235517  0.3329503712  0.5584864256  6  7
 7:  0.4874290524 -0.01619026310 -0.15579550671 -0.39428995371  0.3645819621 -0.3672214765 -1.80495862889 -0.443291873218  1.0630998373 -1.2765922085  7  8
 8:  0.7383247051  0.94383621069 -1.47075238390 -0.05931339671  0.7685329245 -1.0441346263  1.46555486156  0.001105351632 -0.3041839236 -0.5732654142  8  9
 9:  0.5757813517  0.82122119510 -0.47815005511  1.10002537198 -0.1123462122  0.5697196274  0.15325333821  0.074341324152  0.3700188099 -1.2246126149  9 10
10: -0.3053883872  0.59390132122  0.41794156020  0.76317574846  0.8811077265 -0.1350546039  2.17261167036 -0.589520946188  0.2670987908 -0.4734006364 10 11
             a1            a2
 1: 2.401617761 -0.6264538107
 2: 1.207867806 -0.7099464309
 3: 1.178086997 -0.8356286124
 4: 1.595280802 -2.2146998872
 5: 1.586833455 -1.3770595568
 6: 1.980399899 -0.8204683841
 7: 1.063099837 -1.8049586289
 8: 1.465554862 -1.4707523839
 9: 1.100025372 -1.2246126149
10: 2.172611670 -0.5895209462
EDIT2: It looks on my data that using DT[, (newColnames):=f(.DT), by=IDX, .SDcols=someIdx] is much slower than the apply way, is that expected ?
 解决方案 
Creating .SD on each row could be a very costly operation, especially if your data.table consists of rows >> columns. I'd advice using pmin and pmax across columns with a loop. I'll illustrate this with a bigger data (along the rows).

Data:

set.seed(1)
require(data.table)
DT1 <- data.table(matrix(rnorm(1e6),ncol=10))
DT1[, a := 1:1e5]
DT2 <- copy(DT1)
DT3 <- copy(DT1)


Functions:

arun <- function(DT) {
    # assign first column (dummy)
    DT[, `:=`(min = DT[, V1], max = DT[, V1])]
    # get all other column names and use pmin and pmax 
    # and replace min and max columns
    cols <- names(DT)[2:10]
    for (i in cols) {
        DT[, `:=`(min = pmin(min, DT[[i]]), max = pmax(max, DT[[i]]))]
    }
    DT
}

eddi <- function(DT) {
    DT[, `:=`(min = min(.SD), max = max(.SD)), by = a, .SDcols = paste0("V", 1:10)]
}

frank <- function(DT) {
    cols    <- names(DT)[grepl('^V[[:digit:]]+$',names(DT))]
    newcols <- c("min","max")
    myfun   <- range
    DT[,(newcols):=as.list(myfun(.SD)),.SDcols=cols,by=1:nrow(DT)]
}


Benchmarking:

require(microbenchmark)
microbenchmark(o1 <- arun(DT1), o2 <- eddi(DT2), o3 <- frank(DT3), times=2)

Unit: milliseconds
             expr        min         lq     median          uq         max neval
  o1 <- arun(DT1)   204.4417   204.4417   250.5205    296.5992    296.5992     2
  o2 <- eddi(DT2) 92343.5321 92343.5321 96706.1622 101068.7923 101068.7923     2
 o3 <- frank(DT3) 49083.7000 49083.7000 49521.9296  49960.1592  49960.1592     2

identical(o1, o2) # TRUE
identical(o1, o3) # TRUE
-- 

As @Frank points out under comments, you could replace the for-loop with do.call as:
DT[, c("min", "max") := { z <- dt[, 1:10, with=FALSE]; 
             list(do.call(pmin, z), do.call(pmax, z))}]


                        
这篇关于数据表中的逐行操作和更新的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！


                    
                        查看全文

数据表中的逐行操作和更新 [英] row-by-row operations and updates in data.table

问题描述

数据：

功能：

基准化：

Data:

Functions:

Benchmarking:

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

数据表中的逐行操作和更新 [英] row-by-row operations and updates in data.table

问题描述

数据：

功能：

基准化：

Data:

Functions:

Benchmarking:

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭