Data Frame Subset Performance


Problem Description

I have a couple of large data frames (1 million+ rows x 6-10 columns) that I need to subset repeatedly. Subsetting is the slowest part of my code, and I am curious whether there is a way to do it faster.

load("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda")
start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in <- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])

> system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,])
   user  system elapsed 
  16.59    0.00   16.60 

dput(head(DF_IOSTAT_ALL))
structure(list(date_stamp = structure(list(sec = c(14, 24, 34, 
44, 54, 4), min = c(0L, 0L, 0L, 0L, 0L, 1L), hour = c(0L, 0L, 
0L, 0L, 0L, 0L), mday = c(20L, 20L, 20L, 20L, 20L, 20L), mon = c(7L, 
7L, 7L, 7L, 7L, 7L), year = c(112L, 112L, 112L, 112L, 112L, 112L
), wday = c(1L, 1L, 1L, 1L, 1L, 1L), yday = c(232L, 232L, 232L, 
232L, 232L, 232L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec", 
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt")), cpu = c(0.9, 0.2, 0.2, 0.1, 
0.2, 0.1), rsec_s = c(0, 0, 0, 0, 0, 0), wsec_s = c(0, 3.8, 0, 
0.4, 0.2, 0.2), util_pct = c(0, 0.1, 0, 0, 0, 0), node = c("bda101", 
"bda101", "bda101", "bda101", "bda101", "bda101")), .Names = c("date_stamp", 
"cpu", "rsec_s", "wsec_s", "util_pct", "node"), row.names = c(NA, 
6L), class = "data.frame")
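
Note that the dput output shows date_stamp stored as POSIXlt, i.e. a list of broken-down components (sec, min, hour, ...). Every comparison against start_in/end_in has to convert that whole column, which accounts for much of the cost; the solution below converts it to POSIXct, one numeric value per row. A quick check of the column classes (a small aside, not part of the original question):

class(DF_IOSTAT_ALL$date_stamp)               # "POSIXlt" "POSIXt" - list of components
class(as.POSIXct(DF_IOSTAT_ALL$date_stamp))   # "POSIXct" "POSIXt" - seconds since the epoch, cheap to compare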

Solution

Here are my experiments with data.table. Interestingly, just the conversion to data.table makes your lookups faster, possibly through more efficient handling of the logical vectors. I compared four things: the original data frame lookup; a lookup after converting from POSIXlt to POSIXct (thanks to Matthew Dowle); the data.table lookup; and the data.table lookup including the setup cost of copying and converting. Even with that additional setup, the data.table lookup wins. With multiple lookups, you'll save even more time.

library(data.table)
library(rbenchmark)
load("DF_IOSTAT_ALL.rda")
DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL

start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
end_in <- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")
#function to test: original
fun <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.original[DF_IOSTAT_ALL.original$date_stamp >= start_in & DF_IOSTAT_ALL.original$date_stamp <= end_in,]
#function to test: changing to POSIXct
DF_IOSTAT_ALL.ct <- within(DF_IOSTAT_ALL.original,date_stamp <- as.POSIXct(date_stamp))
fun.ct <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.ct[with(DF_IOSTAT_ALL.ct,date_stamp >= start_in & date_stamp <= end_in),]
#function to test: with data.table and POSIXct
DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct);
fun.dt <- function() DF_IOSTAT_INT <<- DF_IOSTAT_ALL.dt[date_stamp >= start_in & date_stamp <= end_in,]
#function to test: with data table and POSIXct, with setup steps
newfun <- function() {
    DF_IOSTAT_ALL <- DF_IOSTAT_ALL.original;
    #data.table doesn't play well with POSIXlt, so convert to POSIXct
    DF_IOSTAT_ALL$date_stamp <- as.POSIXct(DF_IOSTAT_ALL$date_stamp);
    DF_IOSTAT_ALL <- data.table(DF_IOSTAT_ALL);
    DF_IOSTAT_INT <<- DF_IOSTAT_ALL[date_stamp >= start_in & date_stamp <= end_in,];
}
benchmark(fun(), fun.ct(), fun.dt(), newfun(), replications=3,order="relative")

#      test replications elapsed   relative user.self sys.self user.child sys.child
#3 fun.dt()            3    0.18   1.000000      0.11     0.08         NA        NA
#2 fun.ct()            3    0.52   2.888889      0.44     0.08         NA        NA
#4 newfun()            3   35.49 197.166667     34.88     0.58         NA        NA
#1    fun()            3   66.68 370.444444     66.42     0.15         NA        NA
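
Since each of the benchmarked functions assigns its result to DF_IOSTAT_INT with <<-, it is easy to confirm that the faster routes return the same rows before trusting the timings. A small sanity check (not part of the original benchmark):

fun.ct(); res.ct <- DF_IOSTAT_INT   # data.frame subset on the POSIXct copy
fun.dt(); res.dt <- DF_IOSTAT_INT   # data.table subset
all.equal(res.ct$date_stamp, res.dt$date_stamp)  # expect TRUE
nrow(res.ct) == nrow(res.dt)                     # expect TRUE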

If you know what your time intervals are beforehand, you can probably make it even faster by splitting with findInterval or cut and keying/indexing the table.

DF_IOSTAT_ALL <- copy(DF_IOSTAT_ALL.new)  # DF_IOSTAT_ALL.new: a converted copy of the data, not shown above
time.breaks <- strptime.d("2012-08-19 19:00:00") + 0:178 * 60 * 60 # hourly breaks; strptime.d is a strptime wrapper, not shown
DF_IOSTAT_ALL[,interval := findInterval(date_stamp,time.breaks)]
setkey(DF_IOSTAT_ALL,interval)

start_in <- time.breaks[60]
end_in <- time.breaks[61]
benchmark(a <- DF_IOSTAT_ALL[J(60)],b <- fun2(DF_IOSTAT_ALL))
#                  test replications elapsed relative user.self sys.self user.child sys.child
#1 DF_IOSTAT_ALL[J(60)]          100    0.78 1.000000      0.64     0.14         NA        NA
#2  fun2(DF_IOSTAT_ALL)          100    6.69 8.576923      5.76     0.91         NA        NA
all.equal(a,b[,.SD,.SDcols=c(12,1:11,13)]) #test for equality (rearranging columns to match)
#TRUE
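
For readers who want to reproduce the keyed-interval idea, note that strptime.d, DF_IOSTAT_ALL.new and fun2 above are helpers from the answerer's session that are not shown. Here is a self-contained sketch of the same approach, starting from the question's original DF_IOSTAT_ALL and using only base R and data.table calls (the hourly grid and bucket number below are illustrative):

library(data.table)

# data.table does not support POSIXlt columns, so convert date_stamp to POSIXct first
DT <- as.data.table(within(DF_IOSTAT_ALL, date_stamp <- as.POSIXct(date_stamp)))

# hourly breakpoints spanning the data
time.breaks <- seq(as.POSIXct("2012-08-19 19:00:00"), by = "hour", length.out = 179)

# tag each row with its hour bucket and key on it; fetching a bucket is then
# a binary search on the key rather than a full vector scan
DT[, interval := findInterval(date_stamp, time.breaks)]
setkey(DT, interval)

one_hour <- DT[J(60)]   # all rows in the 60th hourly bucket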
