快速频率和百分比表与dplyr [英] fast frequency and percentage table with dplyr

查看:248
本文介绍了快速频率和百分比表与dplyr的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我一直使用一个小的标签函数,显示一个向量的频率,百分比和累积百分比。输出如下

 频率百分比
ARSON 462 0.01988893 0.01988893
BURGLARY 22767 0.98011107 1.00000000
23229 1.00000000 NA

优秀 dplyr 包激励我更新函数。现在我想知道如何使更新的版本更快。这是旧函数

  tab = function(x,useNA = FALSE){
k = length [* is.na(x)]))+ 1
if(useNA)k = k + 1
tab = array(NA,c(k,3))
colnames )= c(freq。,prob。,cum。)
useNA = ifelse(useNA,always,no)
rownames table(x,useNA = useNA),))

tab [-nrow(tab),1] = table(x,useNA = useNA)
tab [-nrow ),b]]>
if(k> 2)tab [nrow(a,b) $ n $($)
if(k == 2)tab [nrow(tab), - 3] = tab [-nrow(tab), - 3] ,-3]

tab
}

基于 dplyr

  tab2 = function(x,useNA = FALSE) {
if(!useNA)if(any(is.na(x)))x = na.omit(x)
n = length(x)
out = data.frame ,1)%。%
group_by(x)%。%
dplyr :: summarize(
Freq = length(X1),
Percent = Freq / n
)%。%
dplyr :: arrange(x)
ids = as.character(out $ x)
ids [is.na(ids)] ='< NA>'
out = select(out,Freq,Percent)
out $ cum = cumsum(out $ Percent)
class(out)=data.frame
out = rbind ,c(n,1,NA))
rownames(out)= c(ids,'')
out
}

最后,一些性能基准:

  x1 = c ('ARSON',462),rep('BURGLARY',22767))
x2 = c(rep('ARSON',462),rep('BURGLARY',22767),rep
x3 = c(c(1:10),c(1:10),1,4)
x4 = c(rep(c(1:100),500) 1:50),20),1,4)

库('rbenchmark')

基准(tab(x1),tab2(x1),replications = 100) [,c('test','elapsed','relative')]
#测试已过相对
#1选项卡(x1)1.412 2.307
#2 tab2(x1)0.612 1.000

基准(tab(x2),tab2(x2),replications = 100)[,c('test','elapsed','relative')]
# $ b#1 tab(x2)1.351 1.475
#2 tab2(x2)0.916 1.000

基准(tab(x2,useNA = TRUE),tab2(x2,useNA = TRUE)复制次数= 100)[,c('test','elapsed','relative')]
#测试已过相对
#1选项卡(x2,useNA = TRUE)1.883 2.282
# 2 tab2(x2,useNA = TRUE)0.825 1.000

基准(tab(x3),tab2(x3),replications = 1000)[,c('test','elapsed','relative' )]
#test elapsed relative
#1选项卡(x3)0.997 1.000
#2 tab2(x3)2.194 2.201

基准(x4),表(x4),复制次数= 100)[,c('test','elapsed','relative')]
#测试已过相对
#1制表符(x4)19.481 18.714
#2 tab2(x4)1.041 1.000
#3表(x4)6.515 6.258



除了非常短的向量外,

tab2 更快。性能增益在较大的向量中变得明显(参见 x4 with 51002 obs)。它也比 table 更快,甚至认为函数做的更多。



现在我的问题:如何进一步提高性能?创建具有频率和百分比的表格是一个非常标准的应用程序,当您使用大型数据集时,快速实现非常不错。



EDIT :是具有2e6向量(包括下面提出的 data.table 解决方案)的附加测试用例

  x5 = sample(c(1:100),2e6,replace = TRUE)
benchmark(tab(x5),tab2(x5),tabdt = 100)[,c('test','elapsed','relative')]
#测试已过相对
选项卡(x5)350.878 19.444
#2 tab2 52.917 2.932
#4 tabdt(x5)18.046 1.000
#3表格(x5)98.429 5.454


库(data.table)的大粉丝我写了类似的函数:

  tabdt<  -  function(x){
n< - length(which(!is.na(x)))
dt< - data.table(x)
out< - dt [,list(Freq = .N,Percent = .N / n),by = x]
out [ !is.na(x),CumSum:= cumsum(Percent)]
out
}

>基准(tabdt(x1),tab2(x1),replications = 1000)[,c('test','elapsed','relative')]
测试已过相对
2 tab2 1.879
1 tabdt(x1)2.98 1.000
>基准(tabdt(x2),tab2(x2),replications = 1000)[,c('test','elapsed','relative')]
测试已过相对
2 tab2 1.686
1 tabdt(x2)3.76 1.000
>基准(tabdt(x3),tab2(x3),replications = 1000)[,c('test','elapsed','relative')]
测试已过相对
2 tab2 1.000
1 tabdt(x3)2.34 1.418
>基准(tabdt(x4),tab2(x4),replications = 1000)[,c('test','elapsed','relative')]
测试已过相对
2 tab2(x4)14.35 1.000
1 tabdt(x4)22.04 1.536

因此 .table 方法对于 x1 x2 更快,而 dplyr对于 x3 x4 更快。实际上,我没有看到任何改善使用这些方法的余地。



p.s。您可以为此问题添加 data.table 关键字吗?我相信人们会喜欢看到 dplyr data.table 性能比较(见 data.table vs dplyr:can one do好的,其他人不能或不好?例如)。


I have been using a small tab function for some time, which shows the frequency, percent, and cumulative percent for a vector. The output looks like this

          Freq    Percent        cum
ARSON      462 0.01988893 0.01988893
BURGLARY 22767 0.98011107 1.00000000
         23229 1.00000000         NA

The excellent dplyr package motivated me to update the function. Now I am wondering how I can make the updated version even faster. Here is the old function

tab = function(x,useNA =FALSE) {
  k=length(unique(x[!is.na(x)]))+1
  if (useNA) k=k+1
  tab=array(NA,c(k,3))
  colnames(tab)=c("freq.","prob.","cum.")
  useNA=ifelse(useNA,"always","no")
  rownames(tab)=names(c(table(x,useNA=useNA),""))

  tab[-nrow(tab),1]=table(x,useNA=useNA)
  tab[-nrow(tab),2]=prop.table(table(x,useNA=useNA))
  tab[,3] = cumsum(tab[,2])
  if(k>2)  tab[nrow(tab),-3]=colSums(tab[-nrow(tab),-3])
  if(k==2) tab[nrow(tab),-3]=tab[-nrow(tab),-3]

  tab
}

and the new based on dplyr

tab2 = function(x, useNA =FALSE) {
    if(!useNA) if(any(is.na(x))) x = na.omit(x)
    n = length(x)
    out = data.frame(x,1) %.%
        group_by(x) %.%
        dplyr::summarise(
            Freq    = length(X1),
            Percent = Freq/n
        ) %.%
        dplyr::arrange(x)
    ids = as.character(out$x)
    ids[is.na(ids)] = '<NA>'
    out = select(out, Freq, Percent)
    out$cum = cumsum(out$Percent)
    class(out)="data.frame"
    out = rbind(out,c(n,1,NA))
    rownames(out) = c(ids,'')
    out
}

Finally, some performance benchmarks:

x1 = c(rep('ARSON',462),rep('BURGLARY',22767))
x2 = c(rep('ARSON',462),rep('BURGLARY',22767),rep(NA,100))
x3 = c(c(1:10),c(1:10),1,4)
x4 = c(rep(c(1:100),500),rep(c(1:50),20),1,4)

library('rbenchmark')

benchmark(tab(x1), tab2(x1), replications=100)[,c('test','elapsed','relative')]
#       test elapsed relative
# 1  tab(x1)   1.412    2.307
# 2 tab2(x1)   0.612    1.000

benchmark(tab(x2),tab2(x2), replications=100)[,c('test','elapsed','relative')]
#       test elapsed relative
# 1  tab(x2)   1.351    1.475
# 2 tab2(x2)   0.916    1.000

benchmark(tab(x2,useNA=TRUE), tab2(x2,useNA=TRUE), replications=100)[,c('test','elapsed','relative')]
#                     test elapsed relative
# 1  tab(x2, useNA = TRUE)   1.883    2.282
# 2 tab2(x2, useNA = TRUE)   0.825    1.000

benchmark(tab(x3), tab2(x3), replications=1000)[,c('test','elapsed','relative')]
#       test elapsed relative
# 1  tab(x3)   0.997    1.000
# 2 tab2(x3)   2.194    2.201

benchmark(tab(x4), tab2(x4), table(x4), replications=100)[,c('test','elapsed','relative')]
#        test elapsed relative
# 1   tab(x4)  19.481   18.714
# 2  tab2(x4)   1.041    1.000
# 3 table(x4)   6.515    6.258

tab2 is faster except for the very short vector. The performance gain becomes evident in the larger vector (see x4 with 51002 obs). It's also faster than table even thought the function is doing much more.

Now to my question: How can I further improve performance? Creating tables with frequencies and percent is a pretty standard application and a fast implementation is very nice when you work with large datasets.

EDIT: Here is an additional test case with a 2e6 vector (including the data.table solution proposed below)

x5 = sample(c(1:100),2e6, replace=TRUE)
benchmark(tab(x5), tab2(x5), table(x5), tabdt(x5), replications=100)[,c('test','elapsed','relative')]
#        test elapsed relative
# 1   tab(x5) 350.878   19.444
# 2  tab2(x5)  52.917    2.932
# 4 tabdt(x5)  18.046    1.000
# 3 table(x5)  98.429    5.454

解决方案

As I'm a big fan of library(data.table) I wrote similar function:

tabdt <- function(x){
    n <- length(which(!is.na(x)))
    dt <- data.table(x)
    out <- dt[, list(Freq = .N, Percent = .N / n), by = x]
    out[!is.na(x), CumSum := cumsum(Percent)]
    out
}

> benchmark(tabdt(x1), tab2(x1), replications=1000)[,c('test','elapsed','relative')]
       test elapsed relative
2  tab2(x1)    5.60    1.879
1 tabdt(x1)    2.98    1.000
> benchmark(tabdt(x2), tab2(x2), replications=1000)[,c('test','elapsed','relative')]
       test elapsed relative
2  tab2(x2)    6.34    1.686
1 tabdt(x2)    3.76    1.000
> benchmark(tabdt(x3), tab2(x3), replications=1000)[,c('test','elapsed','relative')]
       test elapsed relative
2  tab2(x3)    1.65    1.000
1 tabdt(x3)    2.34    1.418
> benchmark(tabdt(x4), tab2(x4), replications=1000)[,c('test','elapsed','relative')]
       test elapsed relative
2  tab2(x4)   14.35    1.000
1 tabdt(x4)   22.04    1.536

And so data.table approach was faster for x1 and x2 while dplyr was faster for x3 and x4. Actually I don't see any room for improvement using these approaches.

p.s. Would you add data.table keyword to this question? I believe people would love to see dplyr vs. data.table performance comparison (see data.table vs dplyr: can one do something well the other can't or does poorly? for example).

这篇关于快速频率和百分比表与dplyr的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆