R:使用网格的馈送函数 [英] R: Feeding Functions Using Grids
问题描述
我正在使用 R 编程语言.
I am working with the R programming language.
我编写了这个循环来评估以下函数":(它实际上是一个循环")100 次,在random_1、random_2、random_3、random_4、split_1、split_2、split_3"的随机选择输入处):
I wrote this loop that evaluates the following "function" (it's actually a "loop") 100 times, at randomly selected inputs for "random_1, random_2, random_3, random_4, split_1, split_2, split_3"):
#load library
library(dplyr)
library(data.table)
set.seed(123)
# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
####
results_table <- data.frame()
for (i in 1:100 ) {
#generate random numbers
random_1 = runif(1, 80, 120)
random_2 = runif(1, random_1, 120)
random_3 = runif(1, 85, 120)
random_4 = runif(1, random_3, 120)
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
split_1 = runif(1,0, 1)
split_2 = runif(1, 0, 1)
split_3 = runif(1, 0, 1)
#calculate 60th quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
以上代码的结果如下:
head(final_results)
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a b c total
1: 1 95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
2: 2 92.31360 110.0762 106.46871 109.5343 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
3: 3 81.02645 110.4645 116.42006 119.6172 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
4: 4 90.35986 116.7089 114.15588 116.7231 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702
5: 5 89.28374 114.7103 119.70448 119.7725 0.08881443 0.6351936 0.8565509 0.09027778 0.6349614 0.8461538 0.573
6: 6 87.35767 103.8575 97.44462 116.0414 0.48372890 0.2319129 0.2701634 0.47368421 0.2326333 0.2711370 0.255
这是我的问题:我不想在随机选择的点"处评估上述函数,而是想在 a 网格内定义的点处评估此函数.
Here is my question: Instead of evaluating the above function at "randomly selected points", I want to evaluate this function at points defined within the a grid.
首先,我定义了网格:
#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 = seq(0.4,1,0.2)
split_2 = seq(0.4,1,0.2)
split_3 = seq(0.4,1,0.2)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)
> head(DF_1)
Var1 Var2 Var3 Var4 Var5 Var6 Var7
1 80 85 85 90 0.4 0.4 0.4
2 85 85 85 90 0.4 0.4 0.4
3 90 85 85 90 0.4 0.4 0.4
4 95 85 85 90 0.4 0.4 0.4
5 100 85 85 90 0.4 0.4 0.4
6 80 90 85 90 0.4 0.4 0.4
接下来,我将那个循环"转换为变成一个函数"
Next, I converted that "loop" into a "function"
results_table <- data.frame()
grid_function <- function(random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
#calculate random quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
问题:不是在随机选择的点评估函数,例如"95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7730455 中的函数,要在网格内评估 0.7730455565638 点 0.7730755 代码04<代码>"80 85 85 90 0.4 0.4 0.4"
Problem: Instead of evaluating the function at randomly selected points, such as " 95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730"
, I want to evaluate the function at points defined within the grid (DF_1), e.g. " 80 85 85 90 0.4 0.4 0.4"
到目前为止我尝试过的:
#Reduce size of the gird for this example
DF_1 = DF_1[1:100,]
#rename variables within the grid:
colnames(DF_1) <- c("random_1" , "random_2", "random_3",
"random_4", "split_1", "split_2", "split_3")
#evauate function at points from grid:
resultdf1 <- apply(DF_1,1, # 1 means rows
FUN=function(x){
do.call(
# Call Function grid_function2 with the arguments in
# a list
grid_function,
# force list type for the arguments
as.list(
# make the row to a named vector
unlist(x)
)
)
}
)
#reformat results, it seems like every "block" in this output is identical
a = resultdf1$`1`
a = data.frame(a)
head(a)
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a b c total
1 1 95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
2 2 92.31360 110.0762 106.46871 109.5343 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
3 3 81.02645 110.4645 116.42006 119.6172 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
4 4 90.35986 116.7089 114.15588 116.7231 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702
5 5 89.28374 114.7103 119.70448 119.7725 0.08881443 0.6351936 0.8565509 0.09027778 0.6349614 0.8461538 0.573
6 6 87.35767 103.8575 97.44462 116.0414 0.48372890 0.2319129 0.2701634 0.47368421 0.2326333 0.2711370 0.255
这似乎奏效了,但是:
根据
resultdf1$
的结果,random_1、random_2、random_3、random_4、split_1、split_2、split_3"的值分别为与DF_1"中的值不匹配.
Based on the results of
resultdf1$
, the values of "random_1, random_2, random_3, random_4, split_1, split_2, split_3" do not match the values from "DF_1".
以上代码生成同一个表 100 次(例如 resultdf1$1
、resultdf1$2
、... resultdf1$100代码>).有没有办法只产生这些块"中的一个?(例如 resultdf1$
1
),以便使用更少的计算机内存?
The above code is producing the same table 100 times (e.g. resultdf1$1
, resultdf1$2
, ... resultdf1$100
). Is there a way to only produce one of these "blocks" (e.g. resultdf1$1
), so that less computer memory will be used?
有人可以告诉我如何解决这些问题吗?
Can someone please show me how to fix these problems?
谢谢.
推荐答案
由于 grid_function
正在更新全局 env 中的原始对象 'train_data' 而不是作为参数传递给函数,对象被修改.我们可能需要添加一个额外的参数
As the grid_function
is updating the original object 'train_data' in the global env and not passing as an argument to the function, the object gets modified. We may need to add an additional argument
results_table <- data.frame()
grid_function <- function(train_data, random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
#calculate random quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
然后将'train_data'的copy
作为输入
and then pass the copy
of 'train_data' as input
train_data_new <- copy(train_data)
DF_1 <- DF_1[1:5,]
调用函数
resultdf1 <- apply(DF_1,1, # 1 means rows
FUN=function(x){
do.call(
# Call Function grid_function2 with the arguments in
# a list
grid_function,
# force list type for the arguments
c(list(train_data_new), as.list(
# make the row to a named vector
unlist(x)
)
))
}
)
-输出
resultdf1
$`1`
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 b c total
1: 100 80 85 85 90 0.4 0.4 0.4 0.5 0.3997996 0.4
$`2`
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 b c total
1: 100 85 85 85 90 0.4 0.4 0.4 0.5 0.3997996 0.4
$`3`
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 b c total
1: 100 90 85 85 90 0.4 0.4 0.4 0.5 0.3997996 0.4
$`4`
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a b c total
1: 100 95 85 85 90 0.4 0.4 0.4 0 0.5 0.4002006 0.4
$`5`
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a b c total
1: 100 100 85 85 90 0.4 0.4 0.4 0 0.5 0.4002006 0.4
这篇关于R:使用网格的馈送函数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!