自适应滚动窗口功能 - R的顶级性能 [英] Adaptive rolling window function - top performance in R
问题描述
重要注意是,我专注于案例
align =right
和滚动窗口 width
作为向量(与我们的观察向量相同的长度)。如果我们有 width
作为标量,在 zoo
和中已经有非常完善的函数TTR
包,这将是非常难以击败,因为他们中的一些甚至使用Fortran(但仍然用户定义的FUN可以更快使用下面 wapply
)。 RcppRoll
包是值得提及的,因为它的伟大的性能,但到目前为止没有功能,这个问题的答案。如果有人可以扩展它来回答这个问题,将是伟大的。 考虑我们有以下数据:
x = c(120,105,118,140,142,141,135,152,154,138,125,132,131,120)
plot(x,type =l)
应用滚动函数 x
向量与变量滚动窗口 width
。
set.seed(1)
width = sample (x),TRUE)
在这种特殊情况下,我们将滚动函数自适应为
。 c(2,3,4)
的示例
我们将应用
函数,预期结果:
r = f ,FUN = mean)
print(r)
## [1] NA NA 114.3333 120.7500 141.0000 135.2500 139.5000
## [8] 142.6667 147.0000 146.0000 131.5000 128.5000 131.5000 127.6667
(x,type =l)
lines(r,col =red)
任何指标用于产生 width
参数作为自适应移动平均值的不同变体,或任何其他函数。
寻找最佳效果。
#这是一个不需要做C ++的解决方案, 1. rollapply
library(zoo)
?rollapplyr
#2. mapply
base_mapply< - function(x,width,FUN,...){
如果(i <宽度)返回(NA_real_)
返回(FUN(数据),则FUN < - match.fun(FUN)
f < [(i-(width-1)):i],...))
}
mapply(FUN = f,
seq_along(x),width,
MoreArgs = list(data = x))
}
#3. wmapply - 修改后的wapply版本:https://rmazing.wordpress.com/2013/04/23/wapply-a-faster- rollback-for-vector-setups /
wmapply< - function(x,width,FUN = NULL,...){
FUN< - match.fun )
SEQ1 <-1:length(x)
SEQ1 [SEQ1<宽度] < - NA_integer_
SEQ2 < - lapply(SEQ1,function(i)if(!is.na(i))(i-(width [i] -1)):i)
OUT <-lapply(SEQ2,function(i)if(!is.null(i))FUN(x [i],...)else NA_real_)
return(base ::: simplify2array ,more = TRUE))
}
#4. forloopply - 简单循环解决方案
forloopply< - function(x,width,FUN = NULL,...){
FUN < - match.fun(FUN)
OUT< - numeric()
for(i in 1:length(x)){
if(i< width [i] )next
OUT [i] < - FUN(x [(i-(width [i] -1)):i],...)
}
}
以下是 prod
函数。 意味着
函数可能已在 rollapplyr
中优化。所有结果相等。
library(microbenchmark)
#1a。 length(x)= 1000,window = 5-20
x width <-rep(seq(from = 5,to = 20,by = 5 ),
base_mapply(x = x,width = width,width = width,width =宽度,FUN = prod,na.rm = T),
wmapply(x = x,width = width,FUN = prod,na.rm = T),
forloopply(x = x,width = width,FUN = prod,na.rm = T),
times = 100L
)
单位:毫秒
expr min lq median uq max neval
rollapplyr = x,width = width,FUN = prod,fill = NA)59.690217 60.694364 61.979876 68.55698 153.60445 100
base_mapply(x = x,width = width,FUN = prod,na.rm = T)14.372537 14.694266 14.953234 16.00777 99.82199 100
wmapply(x = x,width = width,FUN = prod,na.rm = T)9.384938 9.755893 9.872079 10.09932 84.82886 100
forloopply(x = x,width = width,FUN = prod,na .rm = T)14.730428 15.062188 15.305059 15.76560 342.44173 100
#1b。 length(x)= 1000,window = 50-200
x width <-rep(seq(from = 50,to = 200,by = 50 ),
base_mapply(x = x,width = width,width = width,width =宽度,FUN = prod,na.rm = T),
wmapply(x = x,width = width,FUN = prod,na.rm = T),
forloopply width,FUN = prod,na.rm = T),
times = 100L
)
单位:毫秒
expr min lq median uq max neval
rollapplyr = x,width = width,FUN = prod,fill = NA)71.99894 74.19434 75.44112 86.44893 281.6237 100
base_mapply(x = x,width = width,FUN = prod,na.rm = T)15.67158 16.10320 16.39249 17.20346 103.6211 100
wmapply(x = x,width = width,FUN = prod,na.rm = T)10.88882 11.54721 11.75229 12.19790 106.1170 100
forloopply(x = x,width = width,FUN = prod,na .rm = T)15.70704 16.06983 16.40393 17.14210 108.5005 100
#2a。 length(x)= 10000,window = 5-20
x < - runif(10000,0.5,1.5)
width <-rep(seq(from = 5,to = 20,by = 5 ),
base_mapply(x = x,width = width,width = width,width =宽度,FUN = prod,na.rm = T),
wmapply(x = x,width = width,FUN = prod,na.rm = T),
forloopply width,FUN = prod,na.rm = T),
times = 100L
)
单位:毫秒
expr min lq median uq max neval
rollapplyr = x,width = width,FUN = prod,fill = NA)753.87882 781.8789 809.7680 872.8405 1116.7021 100
base_mapply(x = x,width = width,FUN = prod,na.rm = T)148.54919 159.9986 231.5387 239.9183 339.7270 100
wmapply(x = x,width = width,FUN = prod,na.rm = T)98.42682 105.2641 117.4923 183.4472 245.4577 100
forloopply(x = x,width = width,FUN = prod,na .rm = T)533.95641 602.0652 646.7420 672.7483 922.3317 100
#2b。 length(x)= 10000,window = 50-200
x < - runif(10000,0.5,1.5)
width <-rep(seq(from = 50,to = 200,by = 50 ),
base_mapply(x = x,width = width,width = width,width =宽度,FUN = prod,na.rm = T),
wmapply(x = x,width = width,FUN = prod,na.rm = T),
forloopply width,FUN = prod,na.rm = T),
times = 100L
)
单位:毫秒
expr min lq median uq max neval
rollapplyr = x,width = width,FUN = prod,fill = NA)912.5829 946.2971 1024.7245 1071.5599 1431.5289 100
base_mapply(x = x,width = width,FUN = prod,na.rm = T)171.3189 180.6014 260.8817 269.5672 344.4500 100
wmapply(x = x,width = width,FUN = prod,na.rm = T)123.1964 131.1663 204.6064 221.1004 484.3636 100
forloopply(x = x,width = width,FUN = prod,na .rm = T)561.2993 696.5583 800.9197 959.6298 1273.5350 100
I am looking for some performance gains in terms of rolling/sliding window functions in R. It is quite common task which can be used in any ordered observations data set. I would like to share some of my findings, maybe somebody would be able to provide feedback to make it even faster.
Important note is that I focus on the case align="right"
and rolling window width
as vector (same length as our observation vector). In case if we have width
as scalar there are already very well developed functions in zoo
and TTR
packages which would be very hard to beat as some of them are even using Fortran (but still user-defined FUNs can be faster using mentioned below wapply
).
RcppRoll
package is worth to mention due to its great performance, but so far there is no function which answers to that question. Would be great if someone could extend it to answer the question.
Consider we have a following data:
x = c(120,105,118,140,142,141,135,152,154,138,125,132,131,120)
plot(x, type="l")
And we want to apply rolling function over x
vector with variable rolling window width
.
set.seed(1)
width = sample(2:4,length(x),TRUE)
In this particular case we would have rolling function adaptive to sample
of c(2,3,4)
.
We will apply mean
function, expected results:
r = f(x, width, FUN = mean)
print(r)
## [1] NA NA 114.3333 120.7500 141.0000 135.2500 139.5000
## [8] 142.6667 147.0000 146.0000 131.5000 128.5000 131.5000 127.6667
plot(x, type="l")
lines(r, col="red")
Any indicator can be employed to produce width
argument as different variants of adaptive moving averages, or any other function.
Looking for a top performance.
I chose 4 available solutions which doesn't need to do to C++, quite easy to find or google.
# 1. rollapply
library(zoo)
?rollapplyr
# 2. mapply
base_mapply <- function(x, width, FUN, ...){
FUN <- match.fun(FUN)
f <- function(i, width, data){
if(i < width) return(NA_real_)
return(FUN(data[(i-(width-1)):i], ...))
}
mapply(FUN = f,
seq_along(x), width,
MoreArgs = list(data = x))
}
# 3. wmapply - modified version of wapply found: https://rmazing.wordpress.com/2013/04/23/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
wmapply <- function(x, width, FUN = NULL, ...){
FUN <- match.fun(FUN)
SEQ1 <- 1:length(x)
SEQ1[SEQ1 < width] <- NA_integer_
SEQ2 <- lapply(SEQ1, function(i) if(!is.na(i)) (i - (width[i]-1)):i)
OUT <- lapply(SEQ2, function(i) if(!is.null(i)) FUN(x[i], ...) else NA_real_)
return(base:::simplify2array(OUT, higher = TRUE))
}
# 4. forloopply - simple loop solution
forloopply <- function(x, width, FUN = NULL, ...){
FUN <- match.fun(FUN)
OUT <- numeric()
for(i in 1:length(x)) {
if(i < width[i]) next
OUT[i] <- FUN(x[(i-(width[i]-1)):i], ...)
}
return(OUT)
}
Below are the timings for prod
function. mean
function might be already optimized inside rollapplyr
. All results equal.
library(microbenchmark)
# 1a. length(x) = 1000, window = 5-20
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 59.690217 60.694364 61.979876 68.55698 153.60445 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 14.372537 14.694266 14.953234 16.00777 99.82199 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 9.384938 9.755893 9.872079 10.09932 84.82886 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 14.730428 15.062188 15.305059 15.76560 342.44173 100
# 1b. length(x) = 1000, window = 50-200
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 71.99894 74.19434 75.44112 86.44893 281.6237 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 15.67158 16.10320 16.39249 17.20346 103.6211 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 10.88882 11.54721 11.75229 12.19790 106.1170 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 15.70704 16.06983 16.40393 17.14210 108.5005 100
# 2a. length(x) = 10000, window = 5-20
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 753.87882 781.8789 809.7680 872.8405 1116.7021 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 148.54919 159.9986 231.5387 239.9183 339.7270 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 98.42682 105.2641 117.4923 183.4472 245.4577 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 533.95641 602.0652 646.7420 672.7483 922.3317 100
# 2b. length(x) = 10000, window = 50-200
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 912.5829 946.2971 1024.7245 1071.5599 1431.5289 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 171.3189 180.6014 260.8817 269.5672 344.4500 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 123.1964 131.1663 204.6064 221.1004 484.3636 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 561.2993 696.5583 800.9197 959.6298 1273.5350 100
这篇关于自适应滚动窗口功能 - R的顶级性能的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!