根据组条件创建重复数据展开data.frame(3) [英] Expand data.frame by creating duplicates based on group condition (3)
问题描述
从此SO问题。
示例 data.frame
:
df = read.table(text ='ID Day Count Count_group
18 1933 6 15
33 1933 6 15
37 1933 6 15
18 1933 6 15
16 1933 6 15
11 1933 6 15
111 1932 5 9
34 1932 5 9
60 1932 5 9
88 1932 5 9
18 1932 5 9
33 1931 3 4
13 1931 3 4
56 1931 3 4
23 1930 1 1
6 1800 6 12
37 1800 6 12
98 1800 6 12
52 1800 6 12
18 1800 6 12
76 1800 6 12
55 1799 4 6
6 1799 4 6
52 1799 4 6
133 1799 4 6
112 1798 2 2
677 1798 2 2
778 888 4 8
111 888 4 8
88 888 4 8
10 888 4 8
37 887 2 4
26 887 2 4
8 886 1 2
56 885 1 1
22 120 2 6
34 120 2 6
88 119 1 6
99 118 2 5
12 118 2 5
90 117 1 3
22 115 2 2
99 115 2 2',header = TRUE)
计数
col显示总数 ID
每个
的日期
和 Count_group
c ol显示每个 Day
, Day 1之前的
, ID
值的总和 Day -2
,第3天
和 Day -4
。
例如1933 = Count_group
15,因为计数
6(1933)+ 计数
5(1932)+ 计数
3(1931)+ 计数
1(1930)+ 计数
0(1929)。
我需要做的是每个 Count_group
并将它们添加到它,以便每个 Count_group
显示其日
,第1天
,第-2天
,第3天
和 Day -4
。
eg Count_group
= 15由计数值 Day
1933,1932,1931,1930(和1929不存在于 df
)。所以五天需要包含在 Count_group
= 15中。下一个将是 Count_group
= 9,组成到1932年,1931年,1930年,1929年和1928年;等等...
期望的输出:
ID日数Count_group
18 1933 6 15
33 1933 6 15
37 1933 6 15
18 1933 6 15
16 1933 6 15
11 1933 6 15
111 1932 5 15
34 1932 5 15
60 1932 5 15
88 1932 5 15
18 1932 5 15
33 1931 3 15
13 1931 3 15
56 1931 3 15
23 1930 1 15
111 1932 5 9
34 1932 5 9
60 1932 5 9
88 1932 5 9
18 1932 5 9
33 1931 3 9
13 1931 3 9
56 1931 3 9
23 1930 1 9
33 1931 3 4
13 1931 3 4
56 1931 3 4
23 1930 1 4
23 1930 1 1
6 1800 6 12
37 1800 6 12
98 1800 6 12
52 1800 6 12
18 1800 6 12
76 1800 6 12
55 1799 4 12
6 1799 4 12
5 2 1799 4 12
133 1799 4 12
112 1798 2 12
677 1798 2 12
55 1799 4 6
6 1799 4 6
52 1799 4 6
133 1799 4 6
112 1798 2 6
677 1798 2 6
112 1798 2 2
677 1798 2 2
778 888 4 8
111 888 4 8
88 888 4 8
10 888 4 8
37 887 2 8
26 887 2 8
8 886 1 8
56 885 1 8
37 887 2 4
26 887 2 4
8 886 1 4
56 885 1 4
8 886 1 2
56 885 1 2
56 885 1 1
22 120 2 6
34 120 2 6
88 119 1 6
99 118 2 6
12 118 2 6
90 117 1 6
88 119 1 6
99 118 2 6
12 118 2 6
90 117 1 6
22 115 2 6
99 115 2 6
99 118 2 5
12 118 2 5
90 117 1 5
22 115 2 5
99 115 2 5
90 117 1 3
22 115 2 3
99 115 2 3
22 115 2 2
99 115 2 2
(注意不同的组5天,每一个都被一个空白行分隔,以使它们更清晰)
我有不同的数据框架,分为n天和因此,我想要适应每个人的代码(通过改变它)。
谢谢
#first add grouping变量
天< - 5#分组天数
df $ smalldaygroup< - c(0,cumsum(sapply(2:nrow(df),function(i)df $ Day [i] != df $ Day [i-1])))#individual days
df $ bigdaygroup< - c(0,cumsum(sapply(2:nrow(df)),function(i)df $ Day [i ]< df $ Day [i-1] -days + 1)))链接的日期的#block
#duplicate da每个大组中的ys
df2< - lapply(split(df,df $ bigdaygroup),function(x){
n < - max(x $ Day)-min(x $ Day)+ 1#大组连续天数
dayvec< - (max(x $ Day):min(x $ Day))可能的天数范围
daylog< - dayvec [dayvec%in %x $ Day]#范围内的实际天数
模式< - data.frame(base = rep(dayvec,each = days))
模式$ rep< - sapply(1:nrow ),function(i)pattern $ base [i] + 1-sum(pattern $ base [1:i] == pattern $ base [i]))#indices to repeat
pattern $ offset< - match (pattern $ rep,daylog)-match(pattern $ base,daylog)#offsets(稍后使用)
pattern< - pattern [(pattern $ base%in%x $ Day)& (pattern $ rep%in%x $ Day),] #remove无效元素
#store模式在列表中作为下一循环所需的偏移
return(list(df = split(x,x $ smalldaygroup) [match(pattern $ rep,daylog)],pat = pattern))
})
将Count_group更改为添加条目中的上一个值
df2< - lapply df2,function(L)lapply(1:length(L $ df),function(i){
x < - L $ df [[i]]
offset< - L $ pat $ offset #pointer to day to copy Count_group from
x $ Count_group< - L $ df [[i-offset [i]]] $ Count_group [1]
return(x)
}))
df2< - do.call(rbind,unlist(df2,recursive = FALSE))#bind返回
df2 [,5:6]< - NULL #remove分组变量
头(df2,30)#ignore rownames!
ID日数Count_group
01.1 18 1933 6 15
01.2 33 1933 6 15
01.3 37 1933 6 15
01.4 18 1933 6 15
01.5 16 1933 6 15
01.6 11 1933 6 15
02.7 111 1932 5 15
02.8 34 1932 5 15
02.9 60 1932 5 15
02.10 88 1932 5 15
02.11 18 1932 5 15
03.12 33 1931 3 15
03.13 13 1931 3 15
03.14 56 1931 3 15
04 23 1930 1 15
05.7 111 1932 5 9
05.8 34 1932 5 9
05.9 60 1932 5 9
05.10 88 1932 5 9
05.11 18 1932 5 9
06.12 33 1931 3 9
06.13 13 1931 3 9
06.14 56 1931 3 9
07 23 1930 1 9
08.12 33 1931 3 4
0 8.13 13 1931 3 4
08.14 56 1931 3 4
09 23 1930 1 4
010 23 1930 1 1
11.16 6 1800 6 12
Starting from this SO question.
Example data.frame
:
df = read.table(text = 'ID Day Count Count_group
18 1933 6 15
33 1933 6 15
37 1933 6 15
18 1933 6 15
16 1933 6 15
11 1933 6 15
111 1932 5 9
34 1932 5 9
60 1932 5 9
88 1932 5 9
18 1932 5 9
33 1931 3 4
13 1931 3 4
56 1931 3 4
23 1930 1 1
6 1800 6 12
37 1800 6 12
98 1800 6 12
52 1800 6 12
18 1800 6 12
76 1800 6 12
55 1799 4 6
6 1799 4 6
52 1799 4 6
133 1799 4 6
112 1798 2 2
677 1798 2 2
778 888 4 8
111 888 4 8
88 888 4 8
10 888 4 8
37 887 2 4
26 887 2 4
8 886 1 2
56 885 1 1
22 120 2 6
34 120 2 6
88 119 1 6
99 118 2 5
12 118 2 5
90 117 1 3
22 115 2 2
99 115 2 2', header = TRUE)
The Count
col shows the total number of ID
values per each Day
and the Count_group
col shows the sum of the ID
values per each Day
, Day - 1
, Day -2
, Day -3
and Day -4
.
e.g. 1933 = Count_group
15 because Count
6 (1933) + Count
5 (1932) + Count
3 (1931) + Count
1 (1930) + Count
0 (1929).
What I need to do is to create duplicated observations per each Count_group
and add them to it in order to show per each Count_group
its Day
, Day - 1
, Day -2
, Day -3
and Day -4
.
e.g. Count_group
= 15 is composed by the Count values of Day
1933, 1932, 1931, 1930 (and 1929 not present in the df
). So the five days needs to be included in the Count_group
= 15. The next one will be Count_group
= 9, composed by 1932, 1931, 1930, 1929 and 1928; etc...
Desired output:
ID Day Count Count_group
18 1933 6 15
33 1933 6 15
37 1933 6 15
18 1933 6 15
16 1933 6 15
11 1933 6 15
111 1932 5 15
34 1932 5 15
60 1932 5 15
88 1932 5 15
18 1932 5 15
33 1931 3 15
13 1931 3 15
56 1931 3 15
23 1930 1 15
111 1932 5 9
34 1932 5 9
60 1932 5 9
88 1932 5 9
18 1932 5 9
33 1931 3 9
13 1931 3 9
56 1931 3 9
23 1930 1 9
33 1931 3 4
13 1931 3 4
56 1931 3 4
23 1930 1 4
23 1930 1 1
6 1800 6 12
37 1800 6 12
98 1800 6 12
52 1800 6 12
18 1800 6 12
76 1800 6 12
55 1799 4 12
6 1799 4 12
52 1799 4 12
133 1799 4 12
112 1798 2 12
677 1798 2 12
55 1799 4 6
6 1799 4 6
52 1799 4 6
133 1799 4 6
112 1798 2 6
677 1798 2 6
112 1798 2 2
677 1798 2 2
778 888 4 8
111 888 4 8
88 888 4 8
10 888 4 8
37 887 2 8
26 887 2 8
8 886 1 8
56 885 1 8
37 887 2 4
26 887 2 4
8 886 1 4
56 885 1 4
8 886 1 2
56 885 1 2
56 885 1 1
22 120 2 6
34 120 2 6
88 119 1 6
99 118 2 6
12 118 2 6
90 117 1 6
88 119 1 6
99 118 2 6
12 118 2 6
90 117 1 6
22 115 2 6
99 115 2 6
99 118 2 5
12 118 2 5
90 117 1 5
22 115 2 5
99 115 2 5
90 117 1 3
22 115 2 3
99 115 2 3
22 115 2 2
99 115 2 2
(note that different group of 5 days each one have been separated by a blank line in order to make them clearer)
I have got different data.frames which are grouped by n days and therefore I would like to adapt the code (by changing it a little) specifically for each of them.
Thanks
A generalised version of my previous answer...
#first add grouping variables
days <- 5 #grouping no of days
df$smalldaygroup <- c(0,cumsum(sapply(2:nrow(df),function(i) df$Day[i]!=df$Day[i-1]))) #individual days
df$bigdaygroup <- c(0,cumsum(sapply(2:nrow(df),function(i) df$Day[i]<df$Day[i-1]-days+1))) #blocks of linked days
#duplicate days in each big group
df2 <- lapply(split(df,df$bigdaygroup),function(x) {
n <- max(x$Day)-min(x$Day)+1 #number of consecutive days in big group
dayvec <- (max(x$Day):min(x$Day)) #possible days in range
daylog <- dayvec[dayvec %in% x$Day] #actual days in range
pattern <- data.frame(base=rep(dayvec,each=days))
pattern$rep <- sapply(1:nrow(pattern),function(i) pattern$base[i]+1-sum(pattern$base[1:i]==pattern$base[i])) #indices to repeat
pattern$offset <- match(pattern$rep,daylog)-match(pattern$base,daylog) #offsets (used later)
pattern <- pattern[(pattern$base %in% x$Day) & (pattern$rep %in% x$Day),] #remove invalid elements
#store pattern in list as offsets needed in next loop
return(list(df=split(x,x$smalldaygroup)[match(pattern$rep,daylog)],pat=pattern))
})
#change the Count_group to previous value in added entries
df2 <- lapply(df2,function(L) lapply(1:length(L$df),function(i) {
x <- L$df[[i]]
offset <- L$pat$offset #pointer to day to copy Count_group from
x$Count_group <- L$df[[i-offset[i]]]$Count_group[1]
return(x)
}))
df2 <- do.call(rbind,unlist(df2,recursive=FALSE)) #bind back together
df2[,5:6] <- NULL #remove grouping variables
head(df2,30) #ignore rownames!
ID Day Count Count_group
01.1 18 1933 6 15
01.2 33 1933 6 15
01.3 37 1933 6 15
01.4 18 1933 6 15
01.5 16 1933 6 15
01.6 11 1933 6 15
02.7 111 1932 5 15
02.8 34 1932 5 15
02.9 60 1932 5 15
02.10 88 1932 5 15
02.11 18 1932 5 15
03.12 33 1931 3 15
03.13 13 1931 3 15
03.14 56 1931 3 15
04 23 1930 1 15
05.7 111 1932 5 9
05.8 34 1932 5 9
05.9 60 1932 5 9
05.10 88 1932 5 9
05.11 18 1932 5 9
06.12 33 1931 3 9
06.13 13 1931 3 9
06.14 56 1931 3 9
07 23 1930 1 9
08.12 33 1931 3 4
08.13 13 1931 3 4
08.14 56 1931 3 4
09 23 1930 1 4
010 23 1930 1 1
11.16 6 1800 6 12
这篇关于根据组条件创建重复数据展开data.frame(3)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!