R:在表中保留最大的5行 [英] R: Keeping the 5 Biggest Rows in a Table

查看:15
本文介绍了R:在表中保留最大的5行的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在使用R编程语言。我创建了一些随机数据,然后编写了以下程序,该程序循环执行一系列数据操作步骤:

#load library
    library(dplyr)

library(data.table)

# Fix the RNG seed so the simulated data are reproducible
set.seed(123)

# Simulate 1000 observations: two normally distributed variables and one
# integer variable sampled with replacement from 1..1000
a1 <- rnorm(n = 1000, mean = 100, sd = 10)
b1 <- rnorm(n = 1000, mean = 100, sd = 5)
c1 <- sample.int(n = 1000, size = 1000, replace = TRUE)
train_data <- data.frame(a1 = a1, b1 = b1, c1 = c1)


####
# Accumulator: the per-bin summary of every iteration is appended to it
results_table <- data.frame()

for (i in 1:10) {
  # Draw the random thresholds; random_2 is drawn at or above random_1 and
  # random_4 at or above random_3
  random_1 <- runif(1, min = 80, max = 120)
  random_2 <- runif(1, min = random_1, max = 120)
  random_3 <- runif(1, min = 85, max = 120)
  random_4 <- runif(1, min = random_3, max = 120)

  # Assign every row to bin "a", "b" or "c" according to the thresholds
  train_data <- train_data %>%
    mutate(
      cat = ifelse(
        a1 <= random_1 & b1 <= random_3,
        "a",
        ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")
      )
    )
  train_data$cat <- as.factor(train_data$cat)

  # Split the data into one table per bin
  a_table <- train_data %>%
    filter(cat == "a") %>%
    select(a1, b1, c1, cat)

  b_table <- train_data %>%
    filter(cat == "b") %>%
    select(a1, b1, c1, cat)

  c_table <- train_data %>%
    filter(cat == "c") %>%
    select(a1, b1, c1, cat)

  # One random quantile probability per bin
  split_1 <- runif(1, min = 0, max = 1)
  split_2 <- runif(1, min = 0, max = 1)
  split_3 <- runif(1, min = 0, max = 1)

  # For each bin, store the quantile of c1 at that bin's random probability
  # in a new column "quant"
  table_a <- data.frame(
    a_table %>%
      group_by(cat) %>%
      mutate(quant = quantile(c1, probs = split_1))
  )

  table_b <- data.frame(
    b_table %>%
      group_by(cat) %>%
      mutate(quant = quantile(c1, probs = split_2))
  )

  table_c <- data.frame(
    c_table %>%
      group_by(cat) %>%
      mutate(quant = quantile(c1, probs = split_3))
  )

  # "diff" is 1 when the bin quantile is bigger than the row's c1, else 0
  table_a$diff <- ifelse(table_a$quant > table_a$c1, 1, 0)
  table_b$diff <- ifelse(table_b$quant > table_b$c1, 1, 0)
  table_c$diff <- ifelse(table_c$quant > table_c$c1, 1, 0)

  # Stack the three bins back together
  final_table <- rbind(table_a, table_b, table_c)

  # Average of "diff" per bin
  final_table_2 <- data.frame(
    final_table %>%
      group_by(cat) %>%
      summarize(mean = mean(diff))
  )

  # Add the overall average as an extra "total" row
  final_table_2 <- data.frame(
    final_table_2 %>%
      add_row(cat = "total", mean = mean(final_table$diff))
  )

  # Record the random criteria and the iteration number for reference
  final_table_2$random_1 <- random_1
  final_table_2$random_2 <- random_2
  final_table_2$random_3 <- random_3
  final_table_2$random_4 <- random_4
  final_table_2$split_1 <- split_1
  final_table_2$split_2 <- split_2
  final_table_2$split_3 <- split_3
  final_table_2$iteration_number <- i

  # Append this iteration's summary to the accumulated results
  results_table <- rbind(results_table, final_table_2)

  # Reshape all accumulated results to one row per iteration, with one column
  # per bin ("a", "b", "c", "total")
  final_results <- dcast(
    setDT(results_table),
    iteration_number + random_1 + random_2 + random_3 + random_4 +
      split_1 + split_2 + split_3 ~ cat,
    value.var = "mean"
  )
}

在运行此循环10次后,结果("final_results")如下所示:

final_results

    iteration_number  random_1  random_2  random_3  random_4    split_1   split_2   split_3          a         b         c total
 1:                1  95.67371 111.81329  94.00313 102.05692 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
 2:                2  92.31360 110.07617 106.46871 109.53428 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
 3:                3  81.02645 110.46446 116.42006 119.61718 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
 4:                4  90.35986 116.70888 114.15588 116.72312 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702
 5:                5  89.28374 114.71034 119.70448 119.77249 0.08881443 0.6351936 0.8565509 0.09027778 0.6349614 0.8461538 0.573
 6:                6  87.35767 103.85755  97.44462 116.04144 0.48372890 0.2319129 0.2701634 0.47368421 0.2326333 0.2711370 0.255
 7:                7 112.91974 113.10267  99.20739 111.60051 0.52873965 0.6825709 0.5078129 0.52849741 0.6830709 0.5094340 0.605
 8:                8 102.17487 117.17008  95.93786  96.80284 0.81599406 0.7785768 0.8593795 0.81300813 0.7795276 0.8586667 0.843
 9:                9  82.62877  82.95787 105.70883 118.13665 0.44629189 0.0375750 0.4102906 0.44117647 0.1666667 0.4083333 0.408
10:               10  94.60865 106.70978  89.67872 104.21645 0.26431269 0.4899329 0.9060612 0.40000000 0.4897959 0.8992629 0.656

我正在尝试修改循环,以便在迭代期间,表在任何时候都将只保留5个最大的结果(基于"final_results$total"的值)。这是为了防止最终表("final_results")变得太大。

循环完成后,我知道如何裁剪"final_results"表,以便它只保留最大的5行(根据"final_results$total"):

# Sort the final table by "total", largest first
sorted_table <- final_results[order(final_results$total, decreasing = TRUE), ]

# Keep the 5 biggest rows. head() simply returns fewer rows when the table has
# fewer than 5, whereas indexing with 1:5 would pad the result with NA rows.
sorted_table <- head(sorted_table, 5)

# View the results
head(sorted_table)
   iteration_number  random_1 random_2  random_3  random_4    split_1   split_2   split_3          a         b         c total
1:                3  81.02645 110.4645 116.42006 119.61718 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
2:                8 102.17487 117.1701  95.93786  96.80284 0.81599406 0.7785768 0.8593795 0.81300813 0.7795276 0.8586667 0.843
3:                2  92.31360 110.0762 106.46871 109.53428 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
4:                1  95.67371 111.8133  94.00313 102.05692 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
5:                4  90.35986 116.7089 114.15588 116.72312 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702

我的问题:但是否可以重写循环,使表在任何时候只包含5行?如果我运行此循环1,000,000次,表将变得非常大,我希望提前修剪它。

例如

  1. 循环5次
  2. 对于第6次迭代,查看"total"的值是否小于已保留的5个值中的每一个(即小于其中的最小值)
  • 如果是,则丢弃此迭代的结果并转到第7个迭代。
  • 如果不是,则保留本次迭代的结果,丢弃属于最小迭代的行并转到第7次迭代
  3. 重复步骤2,直到循环迭代了1,000,000次。
是否可以将此步骤添加到循环中,并在创建表时修剪它?或者,是否只能在整个循环完成后修剪表格?

谢谢

谢谢

推荐答案

我们可以添加该行

# Sort by "total" in descending order and keep at most the first 5 rows
final_results <- head(final_results[order(total, decreasing = TRUE)], n = 5)

在循环末尾只返回前5行(注意:这一行只裁剪 final_results;如果 results_table 仍在循环中通过 rbind 不断累积,它依然会随迭代次数增长)

# Running "top 5" table. It starts empty and is trimmed at the end of every
# iteration, so it never holds more than 5 rows. No table of all past
# iterations is kept, so memory use does not grow with the number of
# iterations.
final_results <- data.table()

for (i in 1:10) {
  # Draw the random thresholds; random_2 is drawn at or above random_1 and
  # random_4 at or above random_3
  random_1 <- runif(1, 80, 120)
  random_2 <- runif(1, random_1, 120)
  random_3 <- runif(1, 85, 120)
  random_4 <- runif(1, random_3, 120)

  # Assign every row to bin "a", "b" or "c" according to the thresholds
  train_data <- train_data %>%
    mutate(
      cat = ifelse(
        a1 <= random_1 & b1 <= random_3,
        "a",
        ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")
      )
    )
  train_data$cat <- as.factor(train_data$cat)

  # Split the data into one table per bin
  a_table <- train_data %>%
    filter(cat == "a") %>%
    select(a1, b1, c1, cat)

  b_table <- train_data %>%
    filter(cat == "b") %>%
    select(a1, b1, c1, cat)

  c_table <- train_data %>%
    filter(cat == "c") %>%
    select(a1, b1, c1, cat)

  # One random quantile probability per bin
  split_1 <- runif(1, 0, 1)
  split_2 <- runif(1, 0, 1)
  split_3 <- runif(1, 0, 1)

  # For each bin, store the quantile of c1 at that bin's random probability
  # in a new column "quant" (`probs` is spelled out instead of relying on
  # partial argument matching of `prob`)
  table_a <- data.frame(
    a_table %>%
      group_by(cat) %>%
      mutate(quant = quantile(c1, probs = split_1))
  )

  table_b <- data.frame(
    b_table %>%
      group_by(cat) %>%
      mutate(quant = quantile(c1, probs = split_2))
  )

  table_c <- data.frame(
    c_table %>%
      group_by(cat) %>%
      mutate(quant = quantile(c1, probs = split_3))
  )

  # "diff" is 1 when the bin quantile is bigger than the row's c1, else 0
  table_a$diff <- ifelse(table_a$quant > table_a$c1, 1, 0)
  table_b$diff <- ifelse(table_b$quant > table_b$c1, 1, 0)
  table_c$diff <- ifelse(table_c$quant > table_c$c1, 1, 0)

  # Stack the three bins back together
  final_table <- rbind(table_a, table_b, table_c)

  # Average of "diff" per bin
  final_table_2 <- data.frame(
    final_table %>%
      group_by(cat) %>%
      summarize(mean = mean(diff))
  )

  # Add the overall average as an extra "total" row
  final_table_2 <- data.frame(
    final_table_2 %>%
      add_row(cat = "total", mean = mean(final_table$diff))
  )

  # Record the random criteria and the iteration number for reference
  final_table_2$random_1 <- random_1
  final_table_2$random_2 <- random_2
  final_table_2$random_3 <- random_3
  final_table_2$random_4 <- random_4
  final_table_2$split_1 <- split_1
  final_table_2$split_2 <- split_2
  final_table_2$split_3 <- split_3
  final_table_2$iteration_number <- i

  # Reshape ONLY this iteration's summary into a single wide row with one
  # column per bin ("a", "b", "c", "total"); earlier iterations are not
  # reshaped again
  iteration_row <- dcast(
    setDT(final_table_2),
    iteration_number + random_1 + random_2 + random_3 + random_4 +
      split_1 + split_2 + split_3 ~ cat,
    value.var = "mean"
  )

  # Append the new row to the rows kept so far. fill = TRUE keeps the bind
  # valid (missing cells become NA) if a bin column is absent from one of
  # the tables.
  final_results <- rbindlist(
    list(final_results, iteration_row),
    use.names = TRUE,
    fill = TRUE
  )

  # Keep only the 5 rows with the largest "total"
  final_results <- head(final_results[order(-total)], 5)
}

这篇关于R:在表中保留最大的5行的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆