数据表选项用于检查和批量线性模型 [英] data table option for check and batch of linear models

查看:112
本文介绍了数据表选项用于检查和批量线性模型的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想知道是否有一个 data.table 选项,用于从首先检查的数据集中批处理线性模型。



我需要在每个唯一标识符上运行一堆线性模型,但首先我需要做一个检查。对于每个唯一的ID和年份,我需要检查,至少有24个月以前的每月数据,但不得超过60个月。所以当我运行回归时,应该包括每个人每年的上一个月(年)数据的24-60次观察结果。如果该年度的资料少于24个月,则该年度的年份将会下降,但如果超过60岁,则仅使用60个月。



<感谢这个(感谢@akrun)的帖子,我能够为每个人设置线性模型,运行它们,然后输出beta作为两者的总和。问题在于,这只能在当年(12个obs)而不是以前的24-60回归。



上一篇文章: dplyr和以前的观察



我希望有一个 dplyr 选项,但它似乎并没有起作用,而邮件中的 ddply 方法需要几个小时才能运行。但是,我需要在多个数据集中运行多次,这些数据范围在110万个obs中。



dput示例:



(code $ c)tdata< - structure(cusip = c(101L,101L,101L,101L,101L,101L,
101L,101L,101L,101L,101L ,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L, ,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L,101L, ),日期= c(19901130L,19901031L,19900928L,
19900831L,19900731L,19900629L,19900531L,19900430L,19900330L,
19900228L,19900131L,19891229L,19891130L,19891031L,19890929L,
19890831L, 19890731L,19890630L,19890531L,19890430L,19890331L,19890428L,19890331L,19890430L,19890331L,19890430L,19890331L,19890430L,19890331L,19890430L,19890331L,19890430L,19890331L, 19871231L,19871130L, 19871030L,19870930L,
19870831L,19870731L,19870630L,19870529L,19870430L,19870331L,
19870227L,19870130L,19861231L,19861128L,19861031L,19860930L,
19860829L,19860731L),fyear = c 1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年,1990年。 ,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,1989年,一九八八年,1988年,1988年,1988年,1988年,1988年,1988年,1988年,1988年,1988年,1988年。 ,1987,1987,1987,1987,1987,1987年,1987年,1987年,1987年,1987年, 1987,1987,1987,1986,1986,1986,1986,1986,
1986),month = c(11 ,09,08,07,06,05,
04,03,02,01,12 10,09,08,07,06,
05,04,03,02,01,12 10,09,08,07,
06,05,04,03,02,01 11,10,09,08,
07,06,05,04,03,02,01 ,11, 10,09,
08,07),ret = c(0.117647,0.030303,-0.161017,-0.186207,
-0.131737 ,0.128378,0.027778,-0.162791,0.131579,
0.178295,-0.091549,0.163934,-0.089552,0.007519,
0.117647,0.155340,0.211765,0.024096,0.338710,0.377778,
0.071429,-0.176471,0.378378,-0.026316,-0.050000
-0.047619,-0.086957,-0.061224,0.088889,-0.062500,
-0.040000,-0.056604,0.081633,0.042553 -0.096154,
0.238095,-0.263158,-0.393617,-0.160714,0.400000,
-0.090909,-0.200000,-0.098361 ,-0.152778,0.000000,
0.107692,0.460674,-0.101010,-0.019802,0.246914,
-0.052632,0.179310 -0.064516),ewretd = c(0.035468,-0.057155,
-0.080468,-0.108911,-0.025732,0.005359,0.045675,0.028117,
0.021315,0.015434,0.046408,0.012375,0.0058 ,-0.049934,
0.005532,0.018626,0.03133,0.007744,0.025054,0.029089,
0.01806,0.002988,0.062124,0.018872,0.036484 -0.011485
0.016951,0.025001,0.00289,0.047677,0.017671,0.014016
0.03569,0.060265,0.077392,0.026065,0.05085 ,-0.272248,-0.015876,
0.014544,0.035123,0.021487,0.000573,0.017709,0.036283,
0.074612,0.117565,0.034609,0.006263,0.023777,0.059071,
0.023269, - 0.073128),lagewretd = c(-0.004526,0.035468,0.057155,
-0.080468,-0.108911,-0.025732,0.005359,0.045675,0.028117,
0.021315,0.015434,0.046408,-0.012375, - 0.0058,0.049934,
0.005532,0.018626,0 0.031017,0.007744,0.025054,0.029089,
0.01806,0.002988,0.062124,0.018872,-0.036484,-0.011485,
0.016951,-0.025001,0.000289 ,0.047677,0.017671,0.014016,
0.03569,0.060265,0.077392,0.026065,0.05085,0.272248,0.015876
0.014544,0.035123,0.021487,0.000373,0.017709,0.036283,
0.074612,0.117565,0.034609,0.006263,0.023777,0.059071
0.023269)),类= c(tbl_df,tbl,data.frame),row.names = c(NA,
-53L),.Names = c(cusip,date,fyear ,月,ret,ewretd,
lagewretd))

ddply方法:

  library(dplyr)

##将fyear转换成正确的数字,然后exploit for sorting
tdata< - tdata%>%
mutate(fyear = fyear%>%as.integer)%>%
arrange(fyear,month)

##计算每年可用的累积月数(每个cusip)
yearstuff< - tdata%>%
group_by(cusip,fyear)%>%
总结(n = n())%>%
mutate(n_cum = cumsum(n))

##遍历yearstuff行(每个cusip)
模型< - plyr :: ddply(yearstuff,〜cusip + fyear,function(y){
if(y $ n_cum< 24){
c('(Intercept)'= NA_real_,ewretd = NA_real_,lagewretd = NA_real_)
} else {
my_dat< - tdata%>%
filter cusip == y $ cusip,fyear< = y $ fyear)%>%
mutate(rn = row_number(desc(date)))
lm(ret〜ewretd + lagewretd,my_dat, = rn <61)%>%coef
}
})


解决方案

我将为您所做的所有计算编写一个单独的函数来获取系数。然后,您可以使用 plyr dplyr data.table 。您应该可以使用更大的数据集重新运行下面的基准测试。

 #函数以获取系数
#(进一步的优化应该可能侧重于改进此功能)
get_coefs < - function(.cusip,.fyear,.n_cum){
if(.n_cum< 24){
data_frame(`(Intercept)`= NA_real_,ewretd = NA_real_,lagewretd = NA_real_)
} else {
my_dat < - tdata%>%
filter(cusip == .cusip,fyear< = .fyear)%>%
mutate = row_number(desc(date)))
lm(ret〜ewretd + lagewretd,my_dat,subset = rn <61)%>%
coef%>%
as.list %>%
as_data_frame
}
}
require(microbenchmark)
microbenchmark(
models_plyr< - plyr :: ddply(yearstuff,〜cusip + fyear,function(y)
get_coefs(y $ cusip,y $ fyear,y $ n_cum))

models_dplyr< - yearstuff%>%
group_by cusip,fyear)%>%
do(get_coefs(。$ cusip,。$ fyear,$ n_cum))

models_dt< - as.data.table .data.frame(yearstuff))[,get_coefs(cusip,fyear,n_cum),by = list(cusip,fyear)]

## min lq mean median uq max neval cld
## 12.69178 13.29136 13.62600 13.45849 13.67471 16.73910 100 c
## 12.45302 12.94036 13.33589 13.14721 13.59907 14.73485 100 b
## 10.66120 11.09856 11.43126 11.21593 11.45625 13.69591 100 a
all.equal(models_plyr%> ;%data.frame,
models_dplyr%>%data.frame)
## [1] TRUE
all.equal(models_plyr%>%data.frame,
models_dt%>%data.frame)
## [1] TRUE


I'm wondering if there is a data.table option for batch processing linear models from a data set with a check first.

I need to run a bunch of linear models on each unique identifier, but first I need to do a check. For each unique id and year, I need to check that there are at least 24 months of previous monthly data, but not more than 60 months. So when I run the regression it should include between 24 - 60 observations of previous month (years) data for each year of each individual. If there are less than 24 months of data for that year, the year is dropped for that individual, but if there are more than 60, then only the 60 months are used.

Thanks to this (thanks @akrun) post, I was able to setup the linear models for each individual, run them, and then output the beta as the sum of both betas. The problem is that this only runs the regression on the current year (12 obs) and not the previous 24-60.

Previous Post : dplyr and previous observations

I was hoping for a dplyr option, but it doesn't seem that it will work, and the ddply method in the post and below takes hours to run. However, I need to run this multiple times on various data sets that range in the 1.1 million obs.

dput example :

   tdata <- structure(list(cusip = c(101L, 101L, 101L, 101L, 101L, 101L, 
101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 
101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 
101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 
101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 101L, 
101L, 101L, 101L), date = c(19901130L, 19901031L, 19900928L, 
19900831L, 19900731L, 19900629L, 19900531L, 19900430L, 19900330L, 
19900228L, 19900131L, 19891229L, 19891130L, 19891031L, 19890929L, 
19890831L, 19890731L, 19890630L, 19890531L, 19890428L, 19890331L, 
19890228L, 19890131L, 19881230L, 19881130L, 19881031L, 19880930L, 
19880831L, 19880729L, 19880630L, 19880531L, 19880429L, 19880331L, 
19880229L, 19880129L, 19871231L, 19871130L, 19871030L, 19870930L, 
19870831L, 19870731L, 19870630L, 19870529L, 19870430L, 19870331L, 
19870227L, 19870130L, 19861231L, 19861128L, 19861031L, 19860930L, 
19860829L, 19860731L), fyear = c("1990", "1990", "1990", "1990", 
"1990", "1990", "1990", "1990", "1990", "1990", "1990", "1989", 
"1989", "1989", "1989", "1989", "1989", "1989", "1989", "1989", 
"1989", "1989", "1989", "1988", "1988", "1988", "1988", "1988", 
"1988", "1988", "1988", "1988", "1988", "1988", "1988", "1987", 
"1987", "1987", "1987", "1987", "1987", "1987", "1987", "1987", 
"1987", "1987", "1987", "1986", "1986", "1986", "1986", "1986", 
"1986"), month = c("11", "10", "09", "08", "07", "06", "05", 
"04", "03", "02", "01", "12", "11", "10", "09", "08", "07", "06", 
"05", "04", "03", "02", "01", "12", "11", "10", "09", "08", "07", 
"06", "05", "04", "03", "02", "01", "12", "11", "10", "09", "08", 
"07", "06", "05", "04", "03", "02", "01", "12", "11", "10", "09", 
"08", "07"), ret = c("0.117647", "0.030303", "-0.161017", "-0.186207", 
"-0.131737", "0.128378", "0.027778", "-0.162791", "0.131579", 
"0.178295", "-0.091549", "0.163934", "-0.089552", "0.007519", 
"0.117647", "0.155340", "0.211765", "0.024096", "0.338710", "0.377778", 
"0.071429", "-0.176471", "0.378378", "-0.026316", "-0.050000", 
"-0.047619", "-0.086957", "-0.061224", "0.088889", "-0.062500", 
"-0.040000", "-0.056604", "0.081633", "0.042553", "-0.096154", 
"0.238095", "-0.263158", "-0.393617", "-0.160714", "0.400000", 
"-0.090909", "-0.200000", "-0.098361", "-0.152778", "0.000000", 
"0.107692", "0.460674", "-0.101010", "-0.019802", "0.246914", 
"-0.052632", "0.179310", "-0.064516"), ewretd = c(0.035468, -0.057155, 
-0.080468, -0.108911, -0.025732, 0.005359, 0.045675, -0.028117, 
0.021315, 0.015434, -0.046408, -0.012375, -0.0058, -0.049934, 
0.005532, 0.018626, 0.031017, -0.007744, 0.025054, 0.029089, 
0.01806, 0.002988, 0.062124, 0.018872, -0.036484, -0.011485, 
0.016951, -0.025001, 0.000289, 0.047677, -0.017671, 0.014016, 
0.03569, 0.060265, 0.077392, 0.026065, -0.05085, -0.272248, -0.015876, 
0.014544, 0.035123, 0.021487, 0.000573, -0.017709, 0.036283, 
0.074612, 0.117565, -0.034609, -0.006263, 0.023777, -0.059071, 
0.023269, -0.073128), lagewretd = c(-0.004526, 0.035468, -0.057155, 
-0.080468, -0.108911, -0.025732, 0.005359, 0.045675, -0.028117, 
0.021315, 0.015434, -0.046408, -0.012375, -0.0058, -0.049934, 
0.005532, 0.018626, 0.031017, -0.007744, 0.025054, 0.029089, 
0.01806, 0.002988, 0.062124, 0.018872, -0.036484, -0.011485, 
0.016951, -0.025001, 0.000289, 0.047677, -0.017671, 0.014016, 
0.03569, 0.060265, 0.077392, 0.026065, -0.05085, -0.272248, -0.015876, 
0.014544, 0.035123, 0.021487, 0.000573, -0.017709, 0.036283, 
0.074612, 0.117565, -0.034609, -0.006263, 0.023777, -0.059071, 
0.023269)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-53L), .Names = c("cusip", "date", "fyear", "month", "ret", "ewretd", 
"lagewretd"))

ddply method :

library(dplyr)

## convert fyear to a proper number and then exploit for sorting
tdata <- tdata %>%
  mutate(fyear = fyear %>% as.integer) %>%
  arrange(fyear, month)

## figure out cumulative months available for each year (for each cusip)
yearstuff <- tdata %>%  
  group_by(cusip, fyear) %>% 
  summarize(n = n()) %>% 
  mutate(n_cum = cumsum(n))

## iterate over rows of yearstuff (for each cusip)
models <- plyr::ddply(yearstuff, ~ cusip + fyear, function(y) {
  if(y$n_cum < 24) {
    c('(Intercept)' = NA_real_, ewretd = NA_real_, lagewretd = NA_real_)
  } else {
    my_dat <- tdata %>%
      filter(cusip == y$cusip, fyear <= y$fyear) %>%
      mutate(rn = row_number(desc(date)))
    lm(ret ~ ewretd + lagewretd, my_dat, subset = rn < 61) %>% coef
  }
})

解决方案

I would write a seperate function for all the calculations you do to get the coefficients. Then you can use either plyr, dplyr or data.table. You should probably rerun the benchmarktests below with larger datasets.

# function to get coefficients 
# (further optimization should probably focus on improving this function)
get_coefs <- function(.cusip, .fyear, .n_cum){
  if(.n_cum < 24) {
    data_frame(`(Intercept)` = NA_real_, ewretd = NA_real_, lagewretd = NA_real_)
  } else {
    my_dat <- tdata %>%
      filter(cusip == .cusip, fyear <= .fyear) %>%
      mutate(rn = row_number(desc(date)))
    lm(ret ~ ewretd + lagewretd, my_dat, subset = rn < 61) %>% 
      coef %>% 
      as.list %>% 
      as_data_frame
  }
}
require(microbenchmark)
microbenchmark(
  models_plyr <- plyr::ddply(yearstuff, ~ cusip + fyear, function(y)
    get_coefs(y$cusip, y$fyear, y$n_cum))
  ,
  models_dplyr <- yearstuff %>% 
    group_by(cusip, fyear) %>%
    do(get_coefs(.$cusip, .$fyear, .$n_cum))
  ,
  models_dt <- as.data.table(as.data.frame(yearstuff))[, get_coefs(cusip, fyear, n_cum), by = list(cusip, fyear)]
)
##      min       lq     mean   median       uq      max neval cld
## 12.69178 13.29136 13.62600 13.45849 13.67471 16.73910   100   c
## 12.45302 12.94036 13.33589 13.14721 13.59907 14.73485   100  b 
## 10.66120 11.09856 11.43126 11.21593 11.45625 13.69591   100 a  
all.equal(models_plyr %>% data.frame, 
          models_dplyr %>% data.frame)
## [1] TRUE
all.equal(models_plyr %>% data.frame, 
          models_dt %>% data.frame) 
## [1] TRUE

这篇关于数据表选项用于检查和批量线性模型的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆