为每个唯一组创建新变量 [英] Creating a New Variable for Each Unique Group

查看:17
本文介绍了为每个唯一组创建新变量的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在使用R。

我有以下5个数据集(data_1、data_2、data_3、data_4、data_5):

# Label pools: `v1` holds the period labels, `v2` the type labels.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Build one simulated data frame with `n` rows: two normally distributed
# numeric columns plus a period label (`dates`) and a type label (`types`),
# both drawn with replacement and stored as factors.
# The draw order (var_1, var_2, dates, types) is kept fixed so that the random
# stream is consumed in the same sequence for every dataset.
# Note: the weights used for `types` sum to 0.8; they are passed to sample()
# exactly as written.
simulate_data <- function(n) {
  out <- data.frame(var_1 = rnorm(n, 10, 10), var_2 = rnorm(n, 5, 5))
  out$dates <- as.factor(
    sample(v1, n, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  out$types <- as.factor(
    sample(v2, n, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  out
}

# Five datasets of different sizes, generated one after another.
data_1 <- simulate_data(871)
data_2 <- simulate_data(412)
data_3 <- simulate_data(332)
data_4 <- simulate_data(611)
data_5 <- simulate_data(789)

# Preview the first rows of one of the datasets (data_1)

head(data_1)
      var_1     var_2     dates types
1  8.523382  4.945344 2010-2011     E
2 14.137515  3.223525 2012-2013     A
3 19.610770  7.762698 2011-2012     D
4 11.334196 10.879946 2012-2013     E
5 -1.406475  2.498347 2011-2012     E
6 11.116458  9.988073 2011-2012     E

根据上面的数据,我制作了一个表格,提供了总结:

# One summary row per dataset: its name, its row count, and the mean of each
# numeric variable.
summary_table <- data.frame(
  names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
  counts = c(
    nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5)
  ),
  mean_var_1 = c(
    mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1),
    mean(data_4$var_1), mean(data_5$var_1)
  ),
  # The first entry must be the mean of data_1$var_2. It was previously
  # mean(data_2$var_1) by mistake, which is why row 1 of the sample output
  # printed below shows the same value as mean_var_1 of data_2.
  mean_var_2 = c(
    mean(data_1$var_2), mean(data_2$var_2), mean(data_3$var_2),
    mean(data_4$var_2), mean(data_5$var_2)
  )
)


   names counts mean_var_1 mean_var_2
1 data_1    871   9.426475   9.853399
2 data_2    412   9.853399   4.680188
3 data_3    332  10.275049   5.256084
4 data_4    611  10.094421   5.323108
5 data_5    789   9.960050   4.946458

我想在上面的表中添加5个新列,分别存放每个年度的计数。如下所示(这是一个空模板):

# Column names of the target table: dataset name, total row count, and one
# count column per period.
x <- c(
  "names", "counts",
  "counts 2010-2011", "counts 2011-2012", "counts 2012-2013",
  "counts 2013-2014", "counts 2014-2015"
)
# Empty template: zero rows and one column for each entry of `x`.
df <- setNames(data.frame(matrix(nrow = 0, ncol = 7)), x)

我知道如何手动完成此操作,但需要很长时间:

    library(dplyr)
    
     # Rows per `dates` level for each dataset, kept as a plain data frame
     # with the columns `dates` and `my_counts`.
     summary_1 <- data_1 %>%
       count(dates, name = "my_counts") %>%
       as.data.frame()
     summary_2 <- data_2 %>%
       count(dates, name = "my_counts") %>%
       as.data.frame()
     summary_3 <- data_3 %>%
       count(dates, name = "my_counts") %>%
       as.data.frame()
     summary_4 <- data_4 %>%
       count(dates, name = "my_counts") %>%
       as.data.frame()
     summary_5 <- data_5 %>%
       count(dates, name = "my_counts") %>%
       as.data.frame()

# Print the per-period counts computed for data_1 as a sample of the output
summary_1

      dates my_counts
1 2010-2011       407
2 2011-2012       189
3 2012-2013        79
4 2013-2014       101
5 2014-2015        95

但我必须手动创建5个新列,并手动复制这25个计数(5x5=25)。

有人能告诉我一种更快的方法吗?

谢谢!

所需输出示例

推荐答案

也许您可以考虑在下次使用随机值时调用set.seed(),这样回答者就能使用与您完全相同的抽样数据。此外,您可以考虑在定义v1时使用_而不是-符号,以避免生成的数据框列名需要用反引号括起来,因为R不允许在常规列名中直接使用减号。

您可以使用命名列表在data_nr列中包含类似data_1、data_2等的值,而不是1、2等。

library(tidyverse)

# Stack the five datasets into one long table tagged with the dataset name,
# count rows per dataset and period, then spread the periods into columns.
list(data_1, data_2, data_3, data_4, data_5) %>%
  # seq_along() instead of 1:length(): also correct for an empty list
  set_names(paste0("data_", seq_along(.))) %>%
  # the list names become the values of the `data_nr` column
  bind_rows(.id = "data_nr") %>%
  count(data_nr, dates, name = "my_counts") %>%
  pivot_wider(
    names_from = dates,
    values_from = my_counts,
    names_prefix = "counts_",
    # a period that never occurs in a dataset is a count of zero, not NA
    values_fill = 0L
  )

       data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
  <chr>                <int>              <int>              <int>              <int>              <int>
1 data_1                 437                161                 93                 88                 92
2 data_2                 218                 68                 40                 36                 50
3 data_3                 170                 58                 35                 34                 35
4 data_4                 331                114                 65                 54                 47
5 data_5                 398                146                 89                 78                 78

这篇关于为每个唯一组创建新变量的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆