子集内的子集 [英] Subsetting within a subset

查看：156 发布时间：2017/3/12 13:12:54 r data.table

本文介绍了子集内的子集的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我想知道如果我能有效地使用 data.table 。我有一个数据集，包括不同的样本，不同的时期（日期）和不同的组（id）。

  #the data 
 require（data.table）
 dt<  -  data.table = c（rep（1,50），rep（2,50），rep（1,50），rep（2,50）），date = c（rep（2004-01-01，100） （2004-02-01，100）），A = c（rnorm（50,1,3），rnorm（50,2,3），rnorm（50,1,4），rnorm（50,1.5， 3）），
 B = c（rnorm（50,1.3,2.9），rnorm（50,1.8,3.1），rnorm（50,1.6,4），rnorm（50,1.7,2.4）））

我要应用以下函数。

 ＃应该应用的函数
 function（a，ie1，b，a1，ie2，b2，...）{
 ipf < a，b，...）{
m < -  length（a）
n < -  length（b）
 if（m r& rank（c（a，b），...）[1：m] -1：m 
} else {
r < （m + 1）:( m + n）]  -  1：n 
} 
s  ）），sum（r））/（as.double（m）* n）
 return（ifelse（m< n，s，1  -  s））
} 
 
 expand.grid.alt<  -  function（seq1，seq2）{
 cbind（rep.int（seq1，length（seq2）），c（t ）），nrow = length（seq2））））
} 
 
 if（missing（a1）|缺少（b2）|缺省（ie2））{
 if（ie1 ==>）{
 return（ipf（a，b））
} else {
 return（ipf ，a））
} 
} else {
 if（ie1 ==>）{
 if（ie2 ==>）{
 return（ipf（a，apply（expand.grid.alt（b，b2），1，max））/ ipf（a1，b2））
} else {
 return（1  -  ipf （b），（b），（b），（b），（b），（b） ==>）{
 return（1  -  ipf（a，apply（expand.grid.alt（b，b2），1，max））/ ipf（a1，b2））
 } else {
 return（ipf（apply（expand.grid.alt（b，b2），1，min），a）/（1  -  ipf（a1，b2）））
} 
} 
}

}

此功能比较不同的样品;给定我们有三个样本A，B，C它允许例如。以计算样本A的绘制大于样本B的绘制的概率，因为来自样本A的绘制大于来自样本C的绘制。我想以某种方式使用data.tables应用此函数。以下示例应该说明我想要做什么：

  #example  - 我想做什么
 dt1< ;  -  dt [date ==2004-01-01] 
 ow < -  dt1 [id == 1，A] 
 ot < -  dt1 [id！= 1，A] 
 cs < -  dt1 [，B] 
 ex<  -  expand.grid（unique（ow），unique（ot），unique（cs））
 names（ex） c（ow，ot，cs）
 sum（ex $ ow> ex $ ot& ex $ ow> ex $ cs）/ sum（ex $ ow> ex $ ot ）
 
 #check如果结果正确
 all.equal（prob（ow，>，cs，ow，>，ot），sum > ex $ ot& ex $ ow> ex $ cs）/ sum（ex $ ow> ex $ ot））
 [1] TRUE 
  pre> 
 
 我想对所有id和所有日期使用data.table自动化上述过程。换句话说：我想计算从id = 1的变量A的绘制大于从变量B的绘制的概率，假设从id = 1的变量A的绘制大于从id！= 1的变量的绘制（使用expand.grid意味着强力方法，它考虑所有可能的组合，上面的prob（）函数使用更优雅的rank-sum方法）。
 
 
 这意味着我需要一个子集中的某种子集。直观地我已经玩过这样的东西：
  dt [，。SD [，prob（A，& B，A，SD[！。BY，A]），key = id]，key = date] 
  
但是这种方法会导致错误消息。谁能帮我解决这个问题？非常感谢任何评论。
解决方案
  重要的：在上面的示例中，您将循环使用 A 值以匹配 B 值的长度。不清楚这是否是你实际想要的，如果答案是错误的，或者如果答案是正确的，但是由于对称性而不是实际的方法。你可能想要仔细检查你的例子。 
 
 
 
 
 
 
  ##使用CJ 
 setkey（dt，id）
 dt [，{
 .SD1<  -  .SD; 
 .SD1 [，{.B < -  unlist（.BY）; 
 CJ（ow = .SD1 [。（。B）] [[A]]，
 ot = .SD1 [！。（。B）] [[A]] b $ b cs = .SD1 [[B]] 
）[
，sum（ow> ot& ow> cs）/ sum（ow> ot）] 
} 
，by = id] 
} 
，by = date 
] 
 
 ##使用PROB 
 setkey（dt，id）
 dt [，{
 .SD1<  -  .SD; 
 .SD1 [，{.B < -  unlist（.BY）; 
 ow<  -  .SD1 [。（。B）] [[A]] 
 ot<  -  .SD1 [！。（。B）] [[A]] 
 cs < -  .SD1 [[B]] 
 prob（ow，>，cs，ow，>，ot）
} by = id] 
} 
，由= date 
] 
  
 
 
 
基准：
 
 
 你说得对，prob函数更快（顺便说一句，不是太多）。 
  usingProb<  -  quote（dt [，{.SD1<  -  .SD; .SD1 [，{。 ;  -  unlist（.BY）; ow < -  .SD1 [。（。B）] [[A]]; ot <.SD1 [！。（。B）] [[A]] ; cs < -  .SD1 [[B]]; prob（ow，>，cs，ow，>，ot）}，by = id]}，by = date] $ b usingCJ < -  quote（dt [，{.SD1 < -  .SD; .SD1 [，{.B < -  unlist（.BY）; CJ（ow = .SD1 [。（。B）] [ [A]，ot = .SD1 [！。（.B）] [[A]]，cs = .SD1 [[B]]）[，sum（ow> ot& ow> cs）/ sum（ow> ot）]}，by = id]}，by = date]）
 
 eval（usingProb）
 eval（usingCJ）
 all。等于（eval（usingProb），eval（usingCJ））
 
库（microbenchmark）
 microbenchmark（PROB = eval（usingProb），CJ = eval（usingCJ），times = 20L）
 
单位：毫秒
 expr min lq median uq max neval 
 PROB 50.59504 53.62986 62.78143 80.64911 106.2133 20 
 CJ 67.63520 69.59654 74.56110 79.45636 136.6357 20 
  
 
I am wondering if I could do this efficiently with a data.table. I have got a data set which consists of different samples, for different periods (date) and different groups (id). 
    #the data
    require(data.table)
    dt <- data.table(id=c(rep(1,50),rep(2,50),rep(1,50),rep(2,50)),date=c(rep("2004-01-01",100),rep("2004-02-01",100)),A=c(rnorm(50,1,3),rnorm(50,2,3),rnorm(50,1,4),rnorm(50,1.5,3)),
             B=c(rnorm(50,1.3,2.9),rnorm(50,1.8,3.1),rnorm(50,1.6,4),rnorm(50,1.7,2.4)))
I want to apply the following function.
    #the function which should be applied
    function(a, ie1, b, a1, ie2, b2, ...) {
    ipf <- function(a, b, ...) {
    m <- length(a)
    n <- length(b)
    if (m < n) {
        r <- rank(c(a, b), ...)[1:m] - 1:m
    } else {
        r <- rank(c(a, b), ...)[(m + 1):(m + n)] - 1:n
    }
    s <- ifelse((n + m)^2 > 2^31, sum(as.double(r)), sum(r))/(as.double(m) * n)
    return(ifelse(m < n, s, 1 - s))
}

expand.grid.alt <- function(seq1, seq2) {
    cbind(rep.int(seq1, length(seq2)), c(t(matrix(rep.int(seq2, length(seq1)), nrow = length(seq2)))))
}

if (missing(a1) | missing(b2) | missing(ie2)) {
    if (ie1 == ">") {
        return(ipf(a, b))
    } else {
        return(ipf(b, a))
    }
} else {
    if (ie1 == ">") {
        if (ie2 == ">") {
            return(ipf(a, apply(expand.grid.alt(b, b2), 1, max))/ipf(a1, b2))
        } else {
            return(1 - ipf(apply(expand.grid.alt(b, b2), 1, min), a)/(1 - ipf(a1, b2)))
        }
    } else {
        if (ie2 == ">") {
            return(1 - ipf(a, apply(expand.grid.alt(b, b2), 1, max))/ipf(a1, b2))
        } else {
            return(ipf(apply(expand.grid.alt(b, b2), 1, min), a)/(1 - ipf(a1, b2)))
        }
    }
}
}

This function compares different samples; Given we have three samples A, B, C it allows e.g. to compute the probability that a draw from sample A is greater than a draw from sample B given that the draw from sample A is greater than a draw from sample C. I want to apply this function in a certain manner using data.tables. The following example should illustrate you what I want to do:
    #example - what I want to do
    dt1 <-  dt[date=="2004-01-01"]
    ow <-   dt1[id==1,A]
    ot <-   dt1[id!=1,A]
    cs  <-  dt1[,B]
    ex <- expand.grid(unique(ow),unique(ot),unique(cs))
    names(ex) <- c("ow","ot","cs")
    sum(ex$ow > ex$ot & ex$ow > ex$cs)/sum(ex$ow > ex$ot)

    #check if the result is correct
    all.equal(prob(ow,">",cs,ow,">",ot),sum(ex$ow > ex$ot & ex$ow > ex$cs)/sum(ex$ow > ex$ot))
    [1] TRUE
I want to automatize the procedure above with the use of data.table for all ids and all dates. In words: I want to compute the probability that a draw from variable A of id=1 is greater than a draw from variable B given that a draw from variable A of id=1 is greater than a draw from variable of id!=1 (the use of expand.grid implies the brute force method which looks at all possible combinations, the prob() function above use a more elegant rank-sum approach).

This means I need some kind of subset within a subset. Intuitively I have played around with something like that:
    dt[,.SD[,prob(A,">",B,A,">",.SD[!.BY,A]),key=id],key=date]
This approach however leads to an error messages. Who can help me with this problem? Any comment is highly appreciated!
 解决方案 
Importantly: In your example above, note that you are recycling your A values to match the length of the B values.  It's not clear if this is what you actually intend, if the answer is wrong, or if the answer is correct, but moreso due to a symmetry than to the actual method. You might want to double check your example. 
Meanwhile, this does what you have above, in an efficient manner



## USING CJ
setkey(dt, id)
dt[, {
      .SD1 <- .SD;
      .SD1[, {.B <- unlist(.BY);
              CJ( ow=.SD1[.(.B)][["A"]], 
                  ot=.SD1[!.(.B)][["A"]], 
                  cs=.SD1[["B"]]
                )[
                  , sum(ow>ot & ow>cs) / sum(ow > ot)] 
             }
    , by=id ]
    }
  , by=date
  ]

## USING PROB
setkey(dt, id)
dt[, {
      .SD1 <- .SD;
      .SD1[, {.B <- unlist(.BY);
              ow <- .SD1[.(.B)][["A"]] 
              ot <- .SD1[!.(.B)][["A"]]
              cs <- .SD1[["B"]]
              prob(ow,">",cs,ow,">",ot)
             }
    , by=id ]
    }
  , by=date
  ]


Benchmarks:

You are right, the prob function is faster (incidentally, not by much). 
usingProb <- quote(dt[, {.SD1 <- .SD;.SD1[, {.B <- unlist(.BY);ow <- .SD1[.(.B)][["A"]] ;ot <- .SD1[!.(.B)][["A"]];cs <- .SD1[["B"]];prob(ow,">",cs,ow,">",ot)}, by=id ]}, by=date  ])
usingCJ <- quote(dt[, {.SD1 <- .SD;.SD1[, {.B <- unlist(.BY);CJ( ow=.SD1[.(.B)][["A"]], ot=.SD1[!.(.B)][["A"]], cs=.SD1[["B"]])[, sum(ow>ot & ow>cs) / sum(ow > ot)] }, by=id ]}, by=date])

eval(usingProb)
eval(usingCJ)
all.equal(eval(usingProb), eval(usingCJ))

library(microbenchmark)
microbenchmark(PROB=eval(usingProb), CJ=eval(usingCJ), times=20L)

Unit: milliseconds
 expr      min       lq   median       uq      max neval
 PROB 50.59504 53.62986 62.78143 80.64911 106.2133    20
   CJ 67.63520 69.59654 74.56110 79.45636 136.6357    20


                        
这篇关于子集内的子集的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

子集内的子集 [英] Subsetting within a subset

问题描述

基准：

Benchmarks:

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

子集内的子集 [英] Subsetting within a subset

问题描述

基准：

Benchmarks:

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭