如何在 tibble 中取得列表的子集 [英] How to take subsets of lists in a tibble

查看:172
本文介绍了如何在列表中取得列表子集的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有几只股票的年度财务数据。我需要把它展开成月度数据;多亏了我之前提过的这个问题的答案,我已经有了一个解决方案,即把日期列转换(mutate)成由日期序列组成的列表列:

library(tidyverse)
library(lubridate)

因素。对于子集(子集)(list)(list(
sec_id = c(1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L, 1572L,1572L,1572L,1572L,1572L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1676L,1572L,1572L,1572L, 1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1572L,1676L,1676L,1676L,1676L,1676L,1676L, 1676L,1676L,1676L,1676L,1676L,1676L,167 6,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH EPS_GROWTHEPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH ,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,EPS_GROWTH,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY ,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITYND_EQUITY,ND_EQUITY,ND_EQUITY ,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY,ND_EQUITY),
date = structure(c(9464,99829,10193 ,10560,10925,11291,1165,12020,12384,12752,13117,13482,13847,14211,14578,14943,15308,15674,16038,16402,16769,17135,9342,9708,10073,103838,10802,11200 ,11565,13756,14120,14487,14852,15217,15583,15947,1663,11678,17044,9464,99829,10193,10560,10925,11291,1165,12020,12384,12752,13117,13482,13847,14211 ,14578,144943,15308,15674,16038,16402,16769,17135,9342,9708,10073,1043,3802,11200,11565,13756,14120,14487,14852,15217,15583,15947,1663,11678,17044 ),
class =Date),value = c(0.250468,0.091548,0.100863,0.058375,0.24784,0.178765,0.099276,0.25472,0.033291,0.124165,0.050947,0.243008,0.1205,0.239625,0.231221 ,0.365649,0.163779,0.024976,0.08388,0.154777,0.016473,-0.272928,-0.018711,-0.162076,-0.599241,-4.071504,-0.37761,1.6694085,0.0 45113,0.329818,0.199564,-0.616418,1.164773,0.8877078,-0.325099,-0.294199,0.272016,-0.706077,-2.57027,4.500261,4.734375,4.090376,3.322846,3.640895,4.645253,4.783054,3.946184,3847828,4.077601,4.778736,5.4583883 ,5.14355,5.084551,3.370378,3.076065,2.812879,2.87688,2.430692,3.029766,3.062665,3349906,0.396299,0.60174,0.527478,1.048755,1.136417,0.668333,0.523115,0.259175,0.164024,0.1318469,0.041141,0.096251,0346829,041832,0.300988 ,0.344943,0.432505)),
row.names = c(NA,-78L),class = c(tbl_df,tbl,data.frame),.Names = c(sec_id ,metric,date,value))

factors.subset.monthly = factors.subset.raw %>%
    group_by(sec_id, metric) %>%
    mutate(date = ceiling_date(date, 'month')) %>%
    mutate(date = map2(date, lead(date - 1, default = today()), seq, by = 'month'))

现在只需在上面的代码后面加上 %>% unnest() %>% mutate(date = date - 1),就能把我的年度数据转换为月度数据,并且所有日期都是月末。



当数据中存在较大的空档时,问题就出现了。遇到这种情况时,我只想向前填充最多 18 个月。



我尝试过添加管道步骤来截断日期列,但到目前为止还没能弄明白。例如,下面这段代码会报“大小不兼容”(incompatible size)的错误:

factors.subset.monthly %>%
    mutate(count.date = as.numeric(lapply(date, length))) %>%
    mutate(count.cutoff = ifelse(count.date <= 18, count.date, 18)) %>%
    mutate(date = date[1:count.cutoff])


解决方案

您需要使用 map/lapply 来遍历列表列,然后只需用 head 就可以把每个元素限制为 18 个观测值:

  library(tidyverse)
library(lubridate)

df <- factors.subset.monthly %>% mutate(date = map(date, head, 18))

any(lengths(factors.subset.monthly$date) > 18)
#> [1] TRUE
any(lengths(df$date) > 18)
#> [1] FALSE

您也可以在创建 factors.subset.monthly 时就直接用上 head:

factors.subset.raw %>%
    group_by(sec_id, metric) %>%
    mutate(date = ceiling_date(date, 'month'),
           date = map2(date, lead(date - 1, default = today()), 
                       ~head(seq(.x, .y, by = 'month'), 18)))

您还可以把“目标日期”与“起始日期之后 18 个月”二者中较早的一个作为 seq 的 to 参数,不过由于各月长度不一,加上 18 个月会有些麻烦。


I have annual financial data for several stocks. I needed to blow it out to become monthly data and, thanks to an answer to this question I'd asked earlier, I have a solution which involves mutating the date column into lists of dates:

library(tidyverse)
library(lubridate)

factors.subset.raw = structure(list(
    sec_id = c(1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L), 
    metric = c("EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY"), 
    date = structure(c(9464, 9829, 10193, 10560, 10925, 11291, 11656, 12020, 12384, 12752, 13117, 13482, 13847, 14211, 14578, 14943, 15308, 15674, 16038, 16402, 16769, 17135, 9342, 9708, 10073, 10438, 10802, 11200, 11565, 13756, 14120, 14487, 14852, 15217, 15583, 15947, 16311, 16678, 17044, 9464, 9829, 10193, 10560, 10925, 11291, 11656, 12020, 12384, 12752, 13117, 13482, 13847, 14211, 14578, 14943, 15308, 15674, 16038, 16402, 16769, 17135, 9342, 9708, 10073, 10438, 10802, 11200, 11565, 13756, 14120, 14487, 14852, 15217, 15583, 15947, 16311, 16678, 17044), 
    class = "Date"), value = c(0.250468, 0.091548, -0.100863, 0.058375, 0.24784, 0.178765, 0.099276, 0.25472, -0.033291, 0.124165, 0.050947, 0.243008, 0.1205, -0.239625, -0.231221, 0.365649, 0.163779, 0.024976, 0.08388, 0.154777, 0.016473, -0.272928, -0.018711, -0.162076, -0.599241, -4.071504, -0.37761, 1.694085, 0.045113, 0.329818, 0.199564, -0.616418, 1.164773, 0.877078, -0.325099, -0.294199, 0.272016, -0.706077, -2.57027, 4.500261, 4.734375, 4.090376, 3.322846, 3.640895, 4.645253, 4.783054, 3.946184, 3.847828, 4.077601, 4.778736, 5.453883, 5.14355, 5.084551, 3.370378, 3.076065, 2.812879, 2.87688, 2.430692, 3.029766, 3.062665, 3.349906, 0.396299, 0.60174, 0.527478, 1.048755, 1.136417, 0.668333, 0.523115, 0.259175, 0.164024, 0.118469, 0.061141, 0.096251, 0.346829, 0.401832, 0.300988, 0.344943, 0.432505)), 
    row.names = c(NA, -78L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("sec_id", "metric", "date", "value"))

factors.subset.monthly = factors.subset.raw %>%
    group_by(sec_id, metric) %>%
    mutate(date = ceiling_date(date, 'month')) %>%
    mutate(date = map2(date, lead(date - 1, default = today()), seq, by = 'month'))

Now it suffices to add %>% unnest() %>% mutate(date = date - 1) to the above to transform my annual data to monthly, with all dates being end of month.

My problem occurs when there is a big gap in the data. When this happens, I only want to fill forward at most 18 months.

I've tried adding pipes which cutoff the date column, but so far I can't seem to figure it out. This little gem gives me incompatible size errors, for example:

factors.subset.monthly %>%
    mutate(count.date = as.numeric(lapply(date, length))) %>%
    mutate(count.cutoff = ifelse(count.date <= 18, count.date, 18)) %>%
    mutate(date = date[1:count.cutoff])

解决方案

You need to use map/lapply to iterate over the list column, but then you can simply use head to limit it to 18 observations:

library(tidyverse)
library(lubridate)

df <- factors.subset.monthly %>% mutate(date = map(date, head, 18))

any(lengths(factors.subset.monthly$date) > 18)
#> [1] TRUE
any(lengths(df$date) > 18)
#> [1] FALSE

You could also just include head when you make factors.subset.monthly:

factors.subset.raw %>%
    group_by(sec_id, metric) %>%
    mutate(date = ceiling_date(date, 'month'),
           date = map2(date, lead(date - 1, default = today()), 
                       ~head(seq(.x, .y, by = 'month'), 18)))

You could also use the minimum of the target date or 18 months past the start date for the to parameter of seq, but adding 18 months is somewhat difficult due to their irregular length.

这篇关于如何在列表中取得列表子集的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆