在R中的汇总预测表中按组加入残差 [英] Join residual by group in summary Forecast table in R

查看:97
本文介绍了在R中的汇总预测表中按组加入残差的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

可重现的示例

df=structure(list(group = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
                            2L, 2L, 2L), year = c(1973L, 1974L, 1975L, 1976L, 1977L, 1978L, 
                                                  1973L, 1974L, 1975L, 1976L, 1977L, 1978L), Jan = c(9007L, 7750L, 
                                                                                                     8162L, 7717L, 7792L, 7836L, 9007L, 7750L, 8162L, 7717L, 7792L, 
                                                                                                     7836L), Feb = c(8106L, 6981L, 7306L, 7461L, 6957L, 6892L, 8106L, 
                                                                                                                     6981L, 7306L, 7461L, 6957L, 6892L), Mar = c(8928L, 8038L, 8124L, 
                                                                                                                                                                 7767L, 7726L, 7791L, 8928L, 8038L, 8124L, 7767L, 7726L, 7791L
                                                                                                                     ), Apr = c(9137L, 8422L, 7870L, 7925L, 8106L, 8192L, 9137L, 8422L, 
                                                                                                                                7870L, 7925L, 8106L, 8192L), May = c(10017L, 8714L, 9387L, 8623L, 
                                                                                                                                                                     8890L, 9115L, 10017L, 8714L, 9387L, 8623L, 8890L, 9115L), Jun = c(10826L, 
                                                                                                                                                                                                                                       9512L, 9556L, 8945L, 9299L, 9434L, 10826L, 9512L, 9556L, 8945L, 
                                                                                                                                                                                                                                       9299L, 9434L), Jul = c(11317L, 10120L, 10093L, 10078L, 10625L, 
                                                                                                                                                                                                                                                              10484L, 11317L, 10120L, 10093L, 10078L, 10625L, 10484L), Aug = c(10744L, 
                                                                                                                                                                                                                                                                                                                               9823L, 9620L, 9179L, 9302L, 9827L, 10744L, 9823L, 9620L, 9179L, 
                                                                                                                                                                                                                                                                                                                               9302L, 9827L), Sep = c(9713L, 8743L, 8285L, 8037L, 8314L, 9110L, 
                                                                                                                                                                                                                                                                                                                                                      9713L, 8743L, 8285L, 8037L, 8314L, 9110L), Oct = c(9938L, 9129L, 
                                                                                                                                                                                                                                                                                                                                                                                                         8466L, 8488L, 8850L, 9070L, 9938L, 9129L, 8466L, 8488L, 8850L, 
                                                                                                                                                                                                                                                                                                                                                                                                         9070L), Nov = c(9161L, 8710L, 8160L, 7874L, 8265L, 8633L, 9161L, 
                                                                                                                                                                                                                                                                                                                                                                                                                         8710L, 8160L, 7874L, 8265L, 8633L), Dec = c(8927L, 8680L, 8034L, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                     8647L, 8796L, 9240L, 8927L, 8680L, 8034L, 8647L, 8796L, 9240L
                                                                                                                                                                                                                                                                                                                                                                                                                         )), .Names = c("group", "year", "Jan", "Feb", "Mar", "Apr", "May", 
                                                                                                                                                                                                                                                                                                                                                                                                                                        "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              -12L))

Perfor前驱按组

Perfrom Forecat by group

library(forecast)
ld <- split(df[, -1], df$group)
ld <- lapply(ld, function(x) {ts(c(t(x[,-1])), start = min(x[,1]), frequency = 12)})

lts <- lapply(ld, ets, model = "ZZZ")

结果为

$`1`
         Point Forecast     Lo 80     Hi 80    Lo 95     Hi 95
Jan 1979       8397.497  8022.399  8772.595 7823.834  8971.160
Feb 1979       7599.221  7162.825  8035.616 6931.812  8266.630
Mar 1979       8396.595  7906.510  8886.679 7647.075  9146.115
Apr 1979       8646.510  8108.063  9184.957 7823.026  9469.994

从1979年开始,这是预测值,我想获取
的残差结果1973-1978年。(初始值)

From 1979 year, it is forecasted values, i want get result of residuals for 1973-1978.(initiall values)

res <- lapply(lts, residuals)

和结果

$`1`
            Jan        Feb        Mar        Apr        May        Jun        Jul        Aug        Sep        Oct        Nov
1973  497.69233   99.50607   64.44947  -15.20925   77.85009  390.89045 -277.67369   26.92614   72.42590  -85.69894 -338.10035

以此类推

问题
1.如何将残差的结果加入汇总表。例如,像这样的

Questions 1. How result of residual to join in summary table. For example something like this


  1. 问题:
    对于1979年及以后,我们看到了预测值,但是对于1973-1978年,在列点预测中,我们看到了残差。
    当然,理想情况下,不会得到太多残差,而是原始值和预测值。
    所以我不知道如何将1973-1978年的初始数据加入汇总表中的原始值,例如
    df [df $ year == 1973,] 但是全年如何...
    然后从原始值中减去残差并得到预测值(也许我使任务复杂了很多,但否则我不知道如何获得所需的输出)

  1. Question: For 1979 and more we see forecasted value, but for 1973-1978 in the column point forecast we see the residuals. Ideally, of course, get not so much residual, but the original values and forecasted values. So i don't know how for initiall data 1973-1978 join in summary table original values something like this df[df$year == 1973,]but how for all year... Then from original values subtract residial and got forecasted value (Maybe I complicate the task a lot, but otherwise I don’t know how to get the desired output)

商品名点预测 lo80 hi80 不需要更改,我会记住,对于初始值,它们表示残差,原始的和预测的。

colnames point forecast,lo80 and hi80 is not need for changing, i will be remember that for initial values they mean residual, original and forecasted.

是否可以使用dplyr或data.table解决方案?

Is it possible to do it using dplyr or data.table solution?

# Tidy-up the splits
ld <- lapply(ld, function(x) {
  x %>%
    gather(key, value, -year) %>%
    unite(date, year, key, sep = "-") %>%
    mutate(date = paste0(date, "-01")) %>%
    mutate(date = as.Date(date, format = "%Y-%b-%d"))    
})

结果

$`1`
   date value
1  <NA>  9007
2  <NA>  7750
3  <NA>  8162
4  <NA>  7717
5  <NA>  7792
6  <NA>  7836
7  <NA>  8106
8  <NA>  6981
9  <NA>  7306
10 <NA>  7461
11 <NA>  6957
12 <NA>  6892


ld=dput()
ld <- lapply(ld, function(x) {
  yr <- lubridate::year(min(x$date))
  mth <- lubridate::month(min(x$date))
  timetk::tk_ts(data = x, select = value, frequency = 12,
                start = c(yr, mth))
})

错误

 Error in x$date : $ operator is invalid for atomic vectors 



edit3



edit3

> lts_all <- lapply(names(lts), function(x, lts) {
+   output_fit <- lts[[x]][["res_fit_tbl"]] %>%
+     mutate(group = x)
+   output_fcst <- lts[[x]][["res_fcst_tbl"]] %>%
+     mutate(group = x)
+   
+   return(list(output_fit=output_fit, output_fcst=output_fcst))
+ }, lts)
> lts_all
[[1]]
[[1]]$output_fit
# A tibble: 72 x 6
   date       value residuals CI95_upper CI95_lower group
   <date>     <dbl>     <dbl>      <dbl>      <dbl> <chr>
 1 1973-01-01  8509     498         9083       7936 value
 2 1973-02-01  8006      99.5       8580       7433 value
 3 1973-03-01  8864      64.4       9437       8290 value
 4 1973-04-01  9152    - 15.2       9726       8579 value
 5 1973-05-01  9939      77.9      10513       9365 value
 6 1973-06-01 10435     391        11009       9861 value
 7 1973-07-01 11595    -278        12168      11021 value
 8 1973-08-01 10717      26.9      11291      10143 value
 9 1973-09-01  9641      72.4      10214       9067 value
10 1973-10-01 10024    - 85.7      10597       9450 value
# ... with 62 more rows


推荐答案

这是一个完整的解决方案,从 df 开始,可重现的示例:

Here it is a complete solution starting from df, the reproducible example:

# load libraries
load_pkgs <- c("forecast", "zoo", "timetk", "tidyverse") 
sapply(load_pkgs, function(x) suppressPackageStartupMessages(library(x, character.only = T)))

第1步:预处理

# perform split by group
ld <- split(df[, -1], df$group)

# Tidy-up the splits
ld <- lapply(ld, function(x) {
  x %>%
    gather(key, value, -year) %>%
    unite(date, year, key, sep = "-") %>%
    mutate(date = paste0(date, "-01")) %>%
    mutate(date = as.Date(date, format = "%Y-%b-%d"))    
})

dput 首个ts:

structure(list(date = structure(c(1096, 1461, 1826, 2191, 2557, 
                                  2922, 1127, 1492, 1857, 2222, 2588, 2953, 1155, 1520, 1885, 2251, 
                                  2616, 2981, 1186, 1551, 1916, 2282, 2647, 3012, 1216, 1581, 1946, 
                                  2312, 2677, 3042, 1247, 1612, 1977, 2343, 2708, 3073, 1277, 1642, 
                                  2007, 2373, 2738, 3103, 1308, 1673, 2038, 2404, 2769, 3134, 1339, 
                                  1704, 2069, 2435, 2800, 3165, 1369, 1734, 2099, 2465, 2830, 3195, 
                                  1400, 1765, 2130, 2496, 2861, 3226, 1430, 1795, 2160, 2526, 2891, 
                                  3256), class = "Date"), value = c(9007L, 7750L, 8162L, 7717L, 
                                                                    7792L, 7836L, 8106L, 6981L, 7306L, 7461L, 6957L, 6892L, 8928L, 
                                                                    8038L, 8124L, 7767L, 7726L, 7791L, 9137L, 8422L, 7870L, 7925L, 
                                                                    8106L, 8192L, 10017L, 8714L, 9387L, 8623L, 8890L, 9115L, 10826L, 
                                                                    9512L, 9556L, 8945L, 9299L, 9434L, 11317L, 10120L, 10093L, 10078L, 
                                                                    10625L, 10484L, 10744L, 9823L, 9620L, 9179L, 9302L, 9827L, 9713L, 
                                                                    8743L, 8285L, 8037L, 8314L, 9110L, 9938L, 9129L, 8466L, 8488L, 
                                                                    8850L, 9070L, 9161L, 8710L, 8160L, 7874L, 8265L, 8633L, 8927L, 
                                                                    8680L, 8034L, 8647L, 8796L, 9240L)), class = "data.frame", row.names = c(NA, 
                                                                                                                                             -72L))

然后

# Transform time series to ts objects
ld <- lapply(ld, function(x) {
  yr <- lubridate::year(min(x$date))
  mth <- lubridate::month(min(x$date))
  timetk::tk_ts(data = x, select = value, frequency = 12,
                start = c(yr, mth))
})

dput 第一个ts:

structure(c(9007L, 8106L, 8928L, 9137L, 10017L, 10826L, 11317L, 
            10744L, 9713L, 9938L, 9161L, 8927L, 7750L, 6981L, 8038L, 8422L, 
            8714L, 9512L, 10120L, 9823L, 8743L, 9129L, 8710L, 8680L, 8162L, 
            7306L, 8124L, 7870L, 9387L, 9556L, 10093L, 9620L, 8285L, 8466L, 
            8160L, 8034L, 7717L, 7461L, 7767L, 7925L, 8623L, 8945L, 10078L, 
            9179L, 8037L, 8488L, 7874L, 8647L, 7792L, 6957L, 7726L, 8106L, 
            8890L, 9299L, 10625L, 9302L, 8314L, 8850L, 8265L, 8796L, 7836L, 
            6892L, 7791L, 8192L, 9115L, 9434L, 10484L, 9827L, 9110L, 9070L, 
            8633L, 9240L), .Dim = c(72L, 1L), .Dimnames = list(NULL, "value"), index = structure(c(94694400, 
                                                                                                   97372800, 99792000, 102470400, 105062400, 107740800, 110332800, 
                                                                                                   113011200, 115689600, 118281600, 120960000, 123552000, 126230400, 
                                                                                                   128908800, 131328000, 134006400, 136598400, 139276800, 141868800, 
                                                                                                   144547200, 147225600, 149817600, 152496000, 155088000, 157766400, 
                                                                                                   160444800, 162864000, 165542400, 168134400, 170812800, 173404800, 
                                                                                                   176083200, 178761600, 181353600, 184032000, 186624000, 189302400, 
                                                                                                   191980800, 194486400, 197164800, 199756800, 202435200, 205027200, 
                                                                                                   207705600, 210384000, 212976000, 215654400, 218246400, 220924800, 
                                                                                                   223603200, 226022400, 228700800, 231292800, 233971200, 236563200, 
                                                                                                   239241600, 241920000, 244512000, 247190400, 249782400, 252460800, 
                                                                                                   255139200, 257558400, 260236800, 262828800, 265507200, 268099200, 
                                                                                                   270777600, 273456000, 276048000, 278726400, 281318400), tzone = "UTC", tclass = "Date"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", class = "ts", .Tsp = c(1973, 
                                                                                                                                                                                                                                                                                           1978.91666666667, 12))

第2步:使用 et进行训练和预测

您需要一个帮助功能将输出转换为数据框:

You need a helping function to transform your output to data frame:

# helping function
make_df <- function(ts_obj) {

  ts_df <- timetk::tk_tbl(preserve_index = TRUE, ts_obj) %>%
    mutate(index = zoo::as.Date(x = .$index, frac = 0)) %>% 
    dplyr::rename(date = index)

  return(ts_df)
}

以下函数训练 ets 并预测未来12个月;然后,它准备具有拟合值和预测值的表:

The following function trains ets and forecasts the next 12 months; then, it prepares tables with the fitted and forecasting values:

lts <- lapply(ld, function(ts_obj) {
# train ets model and get fitted results
res_model <- ets(ts_obj, model = "ZZZ")
res_fit <- ts(as.numeric(res_model$fitted), start = start(ts_obj), frequency = 12)

# add extra metrics you may be interested in
model <- res_model[["method"]]
mse <- res_model[["mse"]]

# get forecasts for the next 12 months
res_fct <- forecast(res_model, h = 12)
res_fcst <- ts(res_fct$mean, start = end(ts_obj) + 1/12, frequency = 12)

# transform results to tbl
# for fitted output we keep the residuals and the 95% CI
res_fit_tbl <- make_df(res_fit) %>%
  mutate(residuals = as.numeric(res_model[["residuals"]])) %>%
  mutate(CI95_upper = value + 1.96*sqrt(res_model$sigma2), 
         CI95_lower = value - 1.96*sqrt(res_model$sigma2))
# the forecast output does not have residuals
res_fcst_tbl <- make_df(res_fcst)

return(list(res_fit_tbl = res_fit_tbl, res_fcst_tbl = res_fcst_tbl, model = model, mse = mse)) # don't forget to pass the extra metrics as output
})

步骤3:将拟合的和预测不同组之间的输出

Step 3: Bring together the fitted and forecasting outputs across different groups

# add groups back + other metrics of interest
lts_all <- lapply(names(lts), function(x, lts) {
  output_fit <- lts[[x]][["res_fit_tbl"]] %>%
    mutate(group = x,
           model = lts[[x]][["model"]],
           mse = lts[[x]][["mse"]])
  output_fcst <- lts[[x]][["res_fcst_tbl"]] %>%
    mutate(group = x)

  return(list(output_fit=output_fit, output_fcst=output_fcst))
  }, lts)

样本输出:

> lts_all[[1]][["output_fit"]]
# A tibble: 72 x 6
   date        value residuals CI95_upper CI95_lower group
   <date>      <dbl>     <dbl>      <dbl>      <dbl> <chr>
 1 1973-01-01  8509.     498.       9083.      7936. 1    
 2 1973-02-01  8006.      99.5      8580.      7433. 1    
 3 1973-03-01  8864.      64.4      9437.      8290. 1    
 4 1973-04-01  9152.     -15.2      9726.      8579. 1    
 5 1973-05-01  9939.      77.9     10513.      9365. 1    
 6 1973-06-01 10435.     391.      11009.      9861. 1    
 7 1973-07-01 11595.    -278.      12168.     11021. 1    
 8 1973-08-01 10717.      26.9     11291.     10143. 1    
 9 1973-09-01  9641.      72.4     10214.      9067. 1    
10 1973-10-01 10024.     -85.7     10597.      9450. 1    
# ... with 62 more rows

> lts_all[[2]][["output_fit"]]
# A tibble: 72 x 6
   date        value residuals CI95_upper CI95_lower group
   <date>      <dbl>     <dbl>      <dbl>      <dbl> <chr>
 1 1973-01-01  8509.     498.       9083.      7936. 2    
 2 1973-02-01  8006.      99.5      8580.      7433. 2    
 3 1973-03-01  8864.      64.4      9437.      8290. 2    
 4 1973-04-01  9152.     -15.2      9726.      8579. 2    
 5 1973-05-01  9939.      77.9     10513.      9365. 2    
 6 1973-06-01 10435.     391.      11009.      9861. 2    
 7 1973-07-01 11595.    -278.      12168.     11021. 2    
 8 1973-08-01 10717.      26.9     11291.     10143. 2    
 9 1973-09-01  9641.      72.4     10214.      9067. 2    
10 1973-10-01 10024.     -85.7     10597.      9450. 2    
# ... with 62 more rows

然后

# bring together the fitted respectively forecasting results
output_fit_all <- lapply(lts_all, function(x) x[[1]])
output_fit_all <- bind_rows(output_fit_all)

output_fcst_all <- lapply(lts_all, function(x) x[[2]])
output_fcst_all <- bind_rows(output_fcst_all)

这篇关于在R中的汇总预测表中按组加入残差的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆