快速/优雅的方式来统一多列 [英] Speedy/elegant way to unite many pairs of columns

查看:94
本文介绍了快速/优雅的方式来统一多列的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述



有没有一种优雅/快速的 R 方法,可以把 data.frame 中所有成对的列合并起来?例如,使用 mapply() 和 paste(),我们可以把这个 data.frame:

  # Example data: two (letter, number) column pairs, 26 rows.
  mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
  head(mydf)
a.1 a.2 b.1 b.2
1 a 26 a 1
2 b 25 b 2
3 c 24 c 3
4 d 23 d 4
5 e 22 e 5
6 f 21 f 6

转换成这个 data.frame:

  # Paste every odd-numbered column to the column on its right ("left.right").
  mydf2 <- mapply(
    function(x, y) paste(x, y, sep = "."),
    mydf[, seq(1, ncol(mydf), by = 2)],
    mydf[, seq(2, ncol(mydf), by = 2)]
  )
  head(mydf2)
     a.1    b.1
[1,] "a.26" "a.1"
[2,] "b.25" "b.2"
[3,] "c.24" "c.3"
[4,] "d.23" "d.4"
[5,] "e.22" "e.5"
[6,] "f.21" "f.6"

然而,这种做法感觉很笨拙,应用于大型数据集时也有点慢。有什么建议吗?也许可以使用 Hadley 的软件包?



编辑:
理想的解决方案将轻松扩展到大量的列,以便列的名称不需要包含在函数调用中。谢谢!

解决方案

有趣的是,OP的解决方案似乎是最快的:

  # Combine each adjacent pair of columns (1 with 2, 3 with 4, ...) into one
  # "left.right" string column using mapply() and paste().
  #
  # mydf: data frame whose columns come in adjacent pairs.
  # Returns whatever mapply() simplifies the pasted columns to: a character
  # matrix with one column per pair, named after the first column of the pair.
  f1 <- function(mydf) {
    first_cols <- seq(1, ncol(mydf), by = 2)
    second_cols <- seq(2, ncol(mydf), by = 2)
    # drop = FALSE keeps both selections as data frames even when there is
    # only one pair; without it `[` collapses them to plain vectors and
    # mapply() would iterate over rows instead of columns.
    mapply(
      function(x, y) paste(x, y, sep = "."),
      mydf[, first_cols, drop = FALSE],
      mydf[, second_cols, drop = FALSE]
    )
  }

# Pair up columns with recycled logical masks: c(TRUE, FALSE) selects columns
# 1, 3, 5, ... and c(FALSE, TRUE) selects columns 2, 4, 6, ...; mapply() then
# pastes each left column to its right neighbour with "." in between.
#
# mydf: data frame whose columns come in adjacent pairs.
f.thelatemail <- function(mydf) {
  left_cols <- mydf[c(TRUE, FALSE)]
  right_cols <- mydf[c(FALSE, TRUE)]
  mapply(paste, left_cols, right_cols, sep = ".")
}

require(dplyr)

# dplyr variant: names the source columns explicitly (a.1/a.2 and b.1/b.2)
# and builds the combined columns x1 and x2 with transmute().
#
# mydf: data frame containing columns a.1, a.2, b.1 and b.2.
f.on_the_shores_of_linux_sea <- function(mydf) {
  transmute(
    mydf,
    x1 = paste0(a.1, ".", a.2),
    x2 = paste0(b.1, ".", b.2)
  )
}

# Paste each odd-numbered column to its right-hand neighbour and bind the
# resulting character vectors column-wise.
#
# mydf: data frame whose columns come in adjacent pairs.
# Returns the cbind() of the pasted columns (a character matrix).
f.jazurro <- function(mydf) {
  odd <- seq(1, ncol(mydf), 2)
  # `[[` always extracts a single column as a vector, whereas `mydf[, x]`
  # only does so for a plain data.frame (not for a tibble or data.table).
  pasted <- lapply(odd, function(x) {
    paste(mydf[[x]], mydf[[x + 1]], sep = ".")
  })
  do.call(cbind, pasted)
}

library(data.table)

# data.table variant: preallocates a result table with one column per pair
# and fills each column in place with set().
#
# mydf: data frame (or data.table) whose columns come in adjacent pairs.
# Returns a data.table with ncol(mydf) / 2 columns of pasted strings.
f.akrun <- function(mydf) {
  res <- as.data.table(matrix(NA, ncol = ncol(mydf) / 2, nrow = nrow(mydf)))
  indx <- seq(1, ncol(mydf), 2)
  # Deliberately no setDT(mydf): setDT() converts its argument by reference,
  # which would turn the caller's data frame into a data.table and change how
  # `mydf[, j]` behaves in any code run afterwards. `[[` extracts a column
  # from a data.frame and a data.table alike, so no conversion is needed.
  for (j in seq_along(indx)) {
    set(res, i = NULL, j = j,
        value = paste(mydf[[indx[j]]], mydf[[indx[j] + 1]], sep = "."))
  }
  res
}

# Benchmark input: the 26-row example repeated 5000 times (130,000 rows).
mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
mydf <- mydf[rep(seq_len(nrow(mydf)), 5000), ]


library(rbenchmark)
# Time every candidate on the same input data.
benchmark(
  f1(mydf),
  f.thelatemail(mydf),
  f.on_the_shores_of_linux_sea(mydf),
  f.jazurro(mydf),
  f.akrun(mydf)
)

结果:

#test replications elapsed relative user.self sys.self user.child sys.child
#5 f.akrun(mydf)100 14.000 75.269 13.673 0.296 0 0
#4 f.jazurro(mydf)100 0.388 2.086 0.314 0.071 0 0
#3 f.on_the_shores_of_linux_sea(mydf) 100 15.585 83.790 15.293 0.280 0 0
#2 f.thelatemail(mydf)100 26.416 142.022 25.736 0.639 0 0
#1 f1(mydf)100 0.186 1.000 0.169 0.017 0 0

[更新基准]



我添加了一个来自 @thelatemail 的解决方案(我在原来的答案中漏掉了它),以及一个来自 @akrun 的解决方案:

  # Same column pairing as f.thelatemail, but Map() keeps the pasted columns
  # as a list, which data.frame() turns into a data frame instead of a matrix.
  #
  # mydf: data frame whose columns come in adjacent pairs.
  f.thelatemail2 <- function(mydf) {
    left_cols <- mydf[c(TRUE, FALSE)]
    right_cols <- mydf[c(FALSE, TRUE)]
    data.frame(Map(paste, left_cols, right_cols, sep = "."))
  }

# data.table variant that overwrites the first column of each pair in a copy
# of the input and then returns only those overwritten columns.
#
# mydf: data frame (or data.table) whose columns come in adjacent pairs.
# Returns a data.table holding one pasted column per pair.
f.akrun2 <- function(mydf) {
  indx <- as.integer(seq(1, ncol(mydf), 2))
  # Convert a private copy rather than calling setDT(mydf): setDT() works by
  # reference and would turn the caller's data frame into a data.table.
  mydf2 <- setDT(copy(mydf))
  for (j in indx) {
    set(mydf2, i = NULL, j = j,
        value = paste(mydf2[[j]], mydf2[[j + 1]], sep = "."))
  }
  mydf2[, indx, with = FALSE]
}

基准:

 library(rbenchmark)

 # Time every candidate, including the two added ones, on the same input.
 benchmark(
   f1(mydf),
   f.thelatemail(mydf),
   f.thelatemail2(mydf),
   f.on_the_shores_of_linux_sea(mydf),
   f.jazurro(mydf),
   f.akrun(mydf),
   f.akrun2(mydf)
 )
#test replications elapsed relative user.self sys.self user.child sys.child
#6 f.akrun(mydf)100 13.247 69.356 12.897 0.340 0 0
#7 f.akrun2(mydf)100 12.746 66.733 12.405 0.339 0 0
#5 f.jazurro(mydf) 100 0.327 1.712 0.254 0.073 0 0
#4 f.on_the_shores_of_linux_sea(mydf)100 16.347 85.586 15.838 0.445 0 0
#2 f.thelatemail(mydf)100 26.307 137.733 25.536 0.708 0 0
#3 f.thelatemail2(mydf)100 15.938 83.445 15.136 0.750 0 0
#1 f1(mydf)100 0.191 1.000 0.156 0.036 0 0


Is there an elegant/fast R way to combine all pairs of columns in a data.frame?

For example, using mapply() and paste() we can turn this data.frame:

# Example data: two (letter, number) column pairs, 26 rows.
mydf <- data.frame(
  a.1 = letters,
  a.2 = 26:1,
  b.1 = letters,
  b.2 = 1:26
)
head(mydf)
  a.1 a.2 b.1 b.2
1   a  26   a   1
2   b  25   b   2
3   c  24   c   3
4   d  23   d   4
5   e  22   e   5
6   f  21   f   6

into this data.frame:

# Paste every odd-numbered column to the column on its right ("left.right").
mydf2 <- mapply(
  function(x, y) paste(x, y, sep = "."),
  mydf[, seq(1, ncol(mydf), by = 2)],
  mydf[, seq(2, ncol(mydf), by = 2)]
)
head(mydf2)
     a.1    b.1  
[1,] "a.26" "a.1"
[2,] "b.25" "b.2"
[3,] "c.24" "c.3"
[4,] "d.23" "d.4"
[5,] "e.22" "e.5"
[6,] "f.21" "f.6"

However, this feels clumsy and is a bit slow when applied to big datasets. Any suggestions, perhaps using a Hadley package?

EDIT: The ideal solution would easily scale to large numbers of columns, such that the names of the columns would not need to be included in the function call. Thanks!

解决方案

It's amusing to note that the OP's solution appears to be the fastest one:

# Combine each adjacent pair of columns (1 with 2, 3 with 4, ...) into one
# "left.right" string column using mapply() and paste().
#
# mydf: data frame whose columns come in adjacent pairs.
# Returns whatever mapply() simplifies the pasted columns to: a character
# matrix with one column per pair, named after the first column of the pair.
f1 <- function(mydf) {
  first_cols <- seq(1, ncol(mydf), by = 2)
  second_cols <- seq(2, ncol(mydf), by = 2)
  # drop = FALSE keeps both selections as data frames even when there is only
  # one pair; without it `[` collapses them to plain vectors and mapply()
  # would iterate over rows instead of columns.
  mapply(
    function(x, y) paste(x, y, sep = "."),
    mydf[, first_cols, drop = FALSE],
    mydf[, second_cols, drop = FALSE]
  )
}

# Pair up columns with recycled logical masks: c(TRUE, FALSE) selects columns
# 1, 3, 5, ... and c(FALSE, TRUE) selects columns 2, 4, 6, ...; mapply() then
# pastes each left column to its right neighbour with "." in between.
#
# mydf: data frame whose columns come in adjacent pairs.
f.thelatemail <- function(mydf) {
  left_cols <- mydf[c(TRUE, FALSE)]
  right_cols <- mydf[c(FALSE, TRUE)]
  mapply(paste, left_cols, right_cols, sep = ".")
}

require(dplyr)

# dplyr variant: names the source columns explicitly (a.1/a.2 and b.1/b.2)
# and builds the combined columns x1 and x2 with transmute().
#
# mydf: data frame containing columns a.1, a.2, b.1 and b.2.
f.on_the_shores_of_linux_sea <- function(mydf) {
  mydf %>%
    transmute(
      x1 = paste0(a.1, ".", a.2),
      x2 = paste0(b.1, ".", b.2)
    )
}

# Paste each odd-numbered column to its right-hand neighbour and bind the
# resulting character vectors column-wise.
#
# mydf: data frame whose columns come in adjacent pairs.
# Returns the cbind() of the pasted columns (a character matrix).
f.jazurro <- function(mydf) {
  odd <- seq(1, ncol(mydf), 2)
  # `[[` always extracts a single column as a vector, whereas `mydf[, x]`
  # only does so for a plain data.frame (not for a tibble or data.table).
  pasted <- lapply(odd, function(x) {
    paste(mydf[[x]], mydf[[x + 1]], sep = ".")
  })
  do.call(cbind, pasted)
}

library(data.table) 
# data.table variant: preallocates a result table with one column per pair
# and fills each column in place with set().
#
# mydf: data frame (or data.table) whose columns come in adjacent pairs.
# Returns a data.table with ncol(mydf) / 2 columns of pasted strings.
f.akrun <- function(mydf) {
  res <- as.data.table(matrix(NA, ncol = ncol(mydf) / 2, nrow = nrow(mydf)))
  indx <- seq(1, ncol(mydf), 2)
  # Deliberately no setDT(mydf): setDT() converts its argument by reference,
  # which would turn the caller's data frame into a data.table and change how
  # `mydf[, j]` behaves in any code run afterwards. `[[` extracts a column
  # from a data.frame and a data.table alike, so no conversion is needed.
  for (j in seq_along(indx)) {
    set(res, i = NULL, j = j,
        value = paste(mydf[[indx[j]]], mydf[[indx[j] + 1]], sep = "."))
  }
  res
}

# Benchmark input: the 26-row example repeated 5000 times (130,000 rows).
mydf <- data.frame(
  a.1 = letters,
  a.2 = 26:1,
  b.1 = letters,
  b.2 = 1:26
)
row_idx <- rep(seq_len(nrow(mydf)), 5000)
mydf <- mydf[row_idx, ]


library(rbenchmark)
# Time every candidate on the same input data.
benchmark(
  f1(mydf),
  f.thelatemail(mydf),
  f.on_the_shores_of_linux_sea(mydf),
  f.jazurro(mydf),
  f.akrun(mydf)
)

Results:

#                                 test replications elapsed relative user.self sys.self user.child sys.child
# 5                      f.akrun(mydf)          100  14.000   75.269    13.673    0.296          0         0
# 4                    f.jazurro(mydf)          100   0.388    2.086     0.314    0.071          0         0
# 3 f.on_the_shores_of_linux_sea(mydf)          100  15.585   83.790    15.293    0.280          0         0
# 2                f.thelatemail(mydf)          100  26.416  142.022    25.736    0.639          0         0
# 1                           f1(mydf)          100   0.186    1.000     0.169    0.017          0         0

[Updated Benchmark]

I've added one solution from @thelatemail, which I missed in the original answer, and one solution from @akrun:

# Same column pairing as f.thelatemail, but Map() keeps the pasted columns as
# a list, which data.frame() turns into a data frame instead of a matrix.
#
# mydf: data frame whose columns come in adjacent pairs.
f.thelatemail2 <- function(mydf) {
  left_cols <- mydf[c(TRUE, FALSE)]
  right_cols <- mydf[c(FALSE, TRUE)]
  pasted <- Map(paste, left_cols, right_cols, sep = ".")
  data.frame(pasted)
}

# data.table variant that overwrites the first column of each pair in a copy
# of the input and then returns only those overwritten columns.
#
# mydf: data frame (or data.table) whose columns come in adjacent pairs.
# Returns a data.table holding one pasted column per pair.
f.akrun2 <- function(mydf) {
  indx <- as.integer(seq(1, ncol(mydf), 2))
  # Convert a private copy rather than calling setDT(mydf): setDT() works by
  # reference and would turn the caller's data frame into a data.table.
  mydf2 <- setDT(copy(mydf))
  for (j in indx) {
    set(mydf2, i = NULL, j = j,
        value = paste(mydf2[[j]], mydf2[[j + 1]], sep = "."))
  }
  mydf2[, indx, with = FALSE]
}

Benchmark:

library(rbenchmark)

# Time every candidate, including the two added ones, on the same input.
benchmark(
  f1(mydf),
  f.thelatemail(mydf),
  f.thelatemail2(mydf),
  f.on_the_shores_of_linux_sea(mydf),
  f.jazurro(mydf),
  f.akrun(mydf),
  f.akrun2(mydf)
)
#                                 test replications elapsed relative user.self sys.self user.child sys.child
# 6                      f.akrun(mydf)          100  13.247   69.356    12.897    0.340          0         0
# 7                     f.akrun2(mydf)          100  12.746   66.733    12.405    0.339          0         0
# 5                    f.jazurro(mydf)          100   0.327    1.712     0.254    0.073          0         0
# 4 f.on_the_shores_of_linux_sea(mydf)          100  16.347   85.586    15.838    0.445          0         0
# 2                f.thelatemail(mydf)          100  26.307  137.733    25.536    0.708          0         0
# 3               f.thelatemail2(mydf)          100  15.938   83.445    15.136    0.750          0         0
# 1                           f1(mydf)          100   0.191    1.000     0.156    0.036          0         0

这篇关于快速/优雅的方式来统一多列的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆