将不同位置的行添加到R中的列联表 [英] Adding Rows at Different Positions to a Contingency Table in R

查看:28
本文介绍了将不同位置的行添加到R中的列联表的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我使用的是R。对于我生成的这个随机数据集,我创建了以下联想表:

library(memisc)
library(dplyr)

set.seed(123)

v1 <- c("2010-2011","2011-2012", "2012-2013", "2013-2014", "2014-2015") 
v2 <- c("A", "B", "C", "D", "E")
v3 <- c("Z", "Y", "X", "W" )
v4 <- c("data_1", "data_2", "data_3", "data_4" )


dates <- as.factor(sample(v1, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

types <- as.factor(sample(v2,1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))

types2 <- as.factor(sample(v3, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))

names <- as.factor(sample(v3, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))

var = rnorm(1000,10,10)

problem_data = data.frame(var,dates, types, types2, names)


summary <- xtabs(~dates+names+types+types2, problem_data)
t = ftable(summary, row.vars=1, col.vars=2:4)

show_html(t)

如果要在此表的底部添加包含&Quot;Total Total&Quot;的行,可以执行以下操作:

totals <- problem_data %>% group_by(names,  types, types2) %>% summarise(totals = n())
memisc::show_html(rbind(t, totals = totals$totals), varinfront = FALSE)

是否可以在此列联表中的任意位置添加&Quot;Totals&Quot;?

例如,假设我想要找到前两行(2010-2011、2011-2012)的合计,然后将该合计插入该表的第三行。我可以计算前两行的总和:

first_two_rows = subset(problem_data, dates %in% c("2010-2011","2011-2012"))

totals_first_two_rows <- first_two_rows %>% group_by(names,  types, types2) %>% summarise(totals = n())

但是如何将这个";talals_first_Two_row";添加到列联表的第三个位置?通过这个堆栈溢出帖子(Add new row to dataframe, at specific row-index, not appended?),我尝试使用答案中提供的函数:

insertRow <- function(existingDF, newrow, r) {
    existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),]
    existingDF[r,] <- newrow
    existingDF
}

insertRow(t, totals_first_two_rows, 3)

但这将返回以下错误:

Error in `[<-`(`*tmp*`, seq(r + 1, nrow(existingDF) + 1), , value = existingDF[seq(r,  : 
  subscript out of bounds

有人能向我演示如何解决此问题吗?

谢谢!

推荐答案

insertRow不起作用,因为t不是data.frame(我很惊讶rbind(t, totals = totals$totals)起作用了)。
如果你想要不常见的表格格式,我想你不可避免地要半手工制作。
这需要时间,但您可以完全自定义。

我介绍包flextable(下面是一个例子):

注意: !!sym(str)!!!syms(strs)是在dplyr函数中使用字符串名的技术。
例如,iris %>% mutate(!!sym("colname") := !!sym("Sepal.Length") * 10)表示iris %>% mutate(colname = Sepal.Length * 10)

加载包和数据准备

library(memisc)
library(dplyr)
library(tidyr)
library(flextable)
library(officer)

set.seed(123)
v1 <- c("2010-2011","2011-2012", "2012-2013", "2013-2014", "2014-2015") 
v2 <- c("A", "B", "C", "D", "E")
v3 <- c("Z", "Y", "X", "W" )
v4 <- c("data_1", "data_2", "data_3", "data_4" )

dates <- as.factor(sample(v1, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
types <- as.factor(sample(v2,1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
types2 <- as.factor(sample(v3, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))
names <- as.factor(sample(v4, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))  # modified
var <- rnorm(1000,10,10)

problem_data <- data.frame(var,dates, types, types2, names)
contingency_table <- problem_data %>%    # this code comes from your previous question (a little changed).
  group_by(dates, names, types, types2) %>% 
  summarise(value = n(), .groups = "drop")  # I summarize_function (from mean() to n() and colname).

first_two_rows = subset(problem_data, dates %in% c("2010-2011","2011-2012"))
totals_first_two_rows <- first_two_rows %>% group_by(names,  types, types2) %>% summarise(totals = n(), .groups = "drop")

制作要输出的DF(但已粘贴标题)

# variable preparation
ind_cols <- c("names", "types", "types2")
ind_rows <- c("dates")
ind_sep = "__"    # use to make temp_header (pasted to single string; e.g., header1__header2__ ...)

# convet long to wide 
base_data <- contingency_table %>% 
  tidyr::pivot_wider(names_from = ind_cols, values_from = value, 
                     names_sep = ind_sep, names_sort = TRUE) %>% 
  arrange_at(ind_rows) %>% 
  mutate_if(is.factor, as.character)

#   dates     data_1__A__W data_1__A__X data_1__A__Y ...
#   <chr>            <int>        <int>        <int>
# 1 2010-2011            4            2           21
# 2 2011-2012            2            2            7
# 3 2012-2013           NA           NA            2
# 4 2013-2014            2            2            1
# 5 2014-2015            1           NA            6

totals_f2r_wide <- totals_first_two_rows %>% 
  mutate(dates = "totals") %>%    # this colname is important. it must be same as the colname what you want to put the value.
  tidyr::pivot_wider(names_from = ind_cols, values_from = totals, 
                     names_sep = ind_sep, names_sort = TRUE)

#   dates  data_1__A__W data_1__A__X data_1__A__Y
#   <chr>         <int>        <int>        <int>
# 1 totals            6            4           28


base_data2 <- bind_rows(base_data[1:2,], totals_f2r_wide, base_data[3:nrow(base_data),]) %>% 
  mutate_if(is.numeric, ~ replace_na(.x, 0))

## add header description col
# if you don't want it, please skip this part.
base_data3 <- base_data2 %>% 
  mutate(!!sym(paste0(ind_cols, sep = ":", collapse = ind_sep)) := NA) %>% 
  select(one_of(ind_rows), paste0(ind_cols, sep = ":", collapse = ind_sep), names(.))  # column order change

#   dates     `names:__types:__types2:` data_1__A__W data_1__A__X
#   <chr>     <lgl>                            <dbl>        <dbl>
# 1 2010-2011 NA                                   4            2
# 2 2011-2012 NA                                   2            2
# 3 totals    NA                                   6            4
# 4 2012-2013 NA                                   0            0
# 5 2013-2014 NA                                   2            2
# 6 2014-2015 NA                                   1            0

表头信息准备

计算标题的每个元素有多少行。

header_info_maker <- function(base_data) {

  pasted_ind <- base_data %>% colnames()
  ind_num <- length(ind_cols)

  ind_d <- tibble(a = pasted_ind) %>% 
    separate(a, into = letters[1:ind_num], sep = ind_sep, fill = "left") %>%   # warning occurred, but no problem
    mutate_all(~ replace_na(.x, ""))
#    a        b        c      
#    <chr>    <chr>    <chr>  
#  1 ""       ""       dates  
#  2 "names:" "types:" types2:
#  3 "data_1" "A"      W      
#  4 "data_1" "A"      X      
#  5 "data_1" "A"      Y      
#  6 "data_1" "A"      Z      
#  7 "data_1" "B"      W      
  
  group_ind <- ind_d %>% 
    mutate_all(~ cumsum(.x != lag(.x, default = "xxxxx")))
#        a     b     c
#    <int> <int> <int>
#  1     1     1     1
#  2     2     2     2
#  3     3     3     3
#  4     3     3     4
#  5     3     3     5
#  6     3     3     6
#  7     3     4     7
  
  sapply(1:ncol(ind_d), function(x){
    tibble(ind = ind_d[[x]], g_ind = group_ind[[x]]) %>% 
      count(g_ind, ind) %>% 
      select(-g_ind) %>% 
      as.list()
  }, simplify = FALSE)
}

header_info <- header_info_maker(base_data3)

制表

# convert df to flextable and delete origin pasted header.
ft <- flextable(base_data3) %>% delete_part(part = "header")
# add header
for(i in length(header_info):1){
  ft <- add_header_row(ft, colwidths = header_info[[i]]$n, values = header_info[[i]]$ind)
}
# change design
ft <- ft %>% 
  theme_vanilla() %>% 
  vline(j = 1, border = fp_border(width = 2)) %>% 
  align(align = "center", part = "all") %>% 
  hline(i = c(2,3), border = fp_border(width = 2))
ft

这篇关于将不同位置的行添加到R中的列联表的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆