R:使用网格的馈送函数 [英] R: Feeding Functions Using Grids

查看:25
本文介绍了R:使用网格的馈送函数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在使用 R 编程语言.

I am working with the R programming language.

我编写了这个循环来评估以下函数":(它实际上是一个循环")100 次,在random_1、random_2、random_3、random_4、split_1、split_2、split_3"的随机选择输入处):

I wrote this loop that evaluates the following "function" (it's actually a "loop") 100 times, at randomly selected inputs for "random_1, random_2, random_3, random_4, split_1, split_2, split_3"):

#load library
library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:100 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

以上代码的结果如下:

head(final_results)
   iteration_number random_1 random_2  random_3 random_4    split_1   split_2   split_3          a         b         c total
1:                1 95.67371 111.8133  94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
2:                2 92.31360 110.0762 106.46871 109.5343 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
3:                3 81.02645 110.4645 116.42006 119.6172 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
4:                4 90.35986 116.7089 114.15588 116.7231 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702
5:                5 89.28374 114.7103 119.70448 119.7725 0.08881443 0.6351936 0.8565509 0.09027778 0.6349614 0.8461538 0.573
6:                6 87.35767 103.8575  97.44462 116.0414 0.48372890 0.2319129 0.2701634 0.47368421 0.2326333 0.2711370 0.255

这是我的问题:我不想在随机选择的点"处评估上述函数,而是想在 a 网格内定义的点处评估此函数.

Here is my question: Instead of evaluating the above function at "randomly selected points", I want to evaluate this function at points defined within the a grid.

首先,我定义了网格:

#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 =  seq(0.4,1,0.2)
split_2 =  seq(0.4,1,0.2)
split_3 =  seq(0.4,1,0.2)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)

> head(DF_1)
  Var1 Var2 Var3 Var4 Var5 Var6 Var7
1   80   85   85   90  0.4  0.4  0.4
2   85   85   85   90  0.4  0.4  0.4
3   90   85   85   90  0.4  0.4  0.4
4   95   85   85   90  0.4  0.4  0.4
5  100   85   85   90  0.4  0.4  0.4
6   80   90   85   90  0.4  0.4  0.4

接下来,我将那个循环"转换为变成一个函数"

Next, I converted that "loop" into a "function"

results_table <- data.frame()

grid_function <- function(random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
    

    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    

    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

问题:不是在随机选择的点评估函数,例如"95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7730455 中的函数,要在网格内评估 0.7730455565638 点 0.7730755 代码"80 85 85 90 0.4 0.4 0.4"

Problem: Instead of evaluating the function at randomly selected points, such as " 95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730" , I want to evaluate the function at points defined within the grid (DF_1), e.g. " 80 85 85 90 0.4 0.4 0.4"

到目前为止我尝试过的:

#Reduce size of the gird for this example

DF_1 = DF_1[1:100,]

#rename variables within the grid:

colnames(DF_1) <- c("random_1" , "random_2", "random_3",
                    "random_4", "split_1", "split_2", "split_3")

#evauate function at points from grid:

resultdf1 <- apply(DF_1,1, # 1 means rows
                   FUN=function(x){
                     do.call(
                       # Call Function grid_function2 with the arguments in
                       # a list
                       grid_function,
                       # force list type for the arguments
                       as.list(
                         # make the row to a named vector
                         unlist(x)
                         )
                       )
                     }
                   )


#reformat results, it seems like every "block" in this output is identical

a = resultdf1$`1`
a = data.frame(a)

head(a)
  iteration_number random_1 random_2  random_3 random_4    split_1   split_2   split_3          a         b         c total
1                1 95.67371 111.8133  94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
2                2 92.31360 110.0762 106.46871 109.5343 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
3                3 81.02645 110.4645 116.42006 119.6172 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
4                4 90.35986 116.7089 114.15588 116.7231 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702
5                5 89.28374 114.7103 119.70448 119.7725 0.08881443 0.6351936 0.8565509 0.09027778 0.6349614 0.8461538 0.573
6                6 87.35767 103.8575  97.44462 116.0414 0.48372890 0.2319129 0.2701634 0.47368421 0.2326333 0.2711370 0.255

这似乎奏效了,但是:

  • 根据resultdf1$的结果,random_1、random_2、random_3、random_4、split_1、split_2、split_3"的值分别为与DF_1"中的值不匹配.

  • Based on the results of resultdf1$, the values of "random_1, random_2, random_3, random_4, split_1, split_2, split_3" do not match the values from "DF_1".

以上代码生成同一个表 100 次(例如 resultdf1$1、resultdf1$2、... resultdf1$100).有没有办法只产生这些块"中的一个?(例如 resultdf1$1),以便使用更少的计算机内存?

The above code is producing the same table 100 times (e.g. resultdf1$1, resultdf1$2, ... resultdf1$100). Is there a way to only produce one of these "blocks" (e.g. resultdf1$1), so that less computer memory will be used?

有人可以告诉我如何解决这些问题吗?

Can someone please show me how to fix these problems?

谢谢.

推荐答案

由于 grid_function 正在更新全局 env 中的原始对象 'train_data' 而不是作为参数传递给函数,对象被修改.我们可能需要添加一个额外的参数

As the grid_function is updating the original object 'train_data' in the global env and not passing as an argument to the function, the object gets modified. We may need to add an additional argument

results_table <- data.frame()

grid_function <- function(train_data, random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
    

    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    

    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

然后将'train_data'的copy作为输入

and then pass the copy of 'train_data' as input

train_data_new <- copy(train_data)
DF_1 <- DF_1[1:5,]

调用函数

resultdf1 <- apply(DF_1,1, # 1 means rows
                    FUN=function(x){
                      do.call(
                        # Call Function grid_function2 with the arguments in
                        # a list
                        grid_function,
                        # force list type for the arguments
                        c(list(train_data_new), as.list(
                          # make the row to a named vector
                          unlist(x)
                          )
                        ))
                      }
                    )

-输出

resultdf1
$`1`
   iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3   b         c total
1:              100       80       85       85       90     0.4     0.4     0.4 0.5 0.3997996   0.4

$`2`
   iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3   b         c total
1:              100       85       85       85       90     0.4     0.4     0.4 0.5 0.3997996   0.4

$`3`
   iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3   b         c total
1:              100       90       85       85       90     0.4     0.4     0.4 0.5 0.3997996   0.4

$`4`
   iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a   b         c total
1:              100       95       85       85       90     0.4     0.4     0.4 0 0.5 0.4002006   0.4

$`5`
   iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a   b         c total
1:              100      100       85       85       90     0.4     0.4     0.4 0 0.5 0.4002006   0.4

这篇关于R:使用网格的馈送函数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆