在不重叠的间隔上合并两个数据帧 [英] merge two data frames on non overlapping intervals
问题描述
它们都有开始日期和结束日期。
如果给定的间隔重叠,我想将结果行拆分成不重叠的间隔。
请看这个例子:
a
id beg_a end_a prop_a
1 2000-01-01 2002-12-31 A
2 2000-01-01 2000-02-15 B
2 2000-04-01 2000-04-15 A
2 2002-01-01 2002-12-31 B
3 2000-01-01 2000-06-15 A
b
id beg_b end_b prop_b
1 1999-06-01 2000-05-15 D
1 2003-01-15 2003-01-31 D
2 1999-01-01 2003-01-15 D
3 2000-07-01 2001-08-01 E
合并
id beg_a end_a prop_a beg_b end_b prop_b overallBeg overallEnd
1< NA> < NA> < NA> 1999-06-01 2000-05-15 D 1999-06-01 1999-12-31
1 2000-01-01 2002-12-31 A 1999-06-01 2000-05-15 D 2000- 01-01 2000-05-15
1 2000-01-01 2002-12-31 A< NA> < NA> < NA> 2000-05-16 2002-12-31
1< NA> < NA> < NA> 2003-01-15 2003-01-31 D 2003-01-15 2003-01-31
2< NA> < NA> < NA> 1999-01-01 2003-01-15 D 1999-01-01 1999-12-31
2 2000-01-01 2000-02-15 B 1999-01-01 2003-01-15 D 2000- 01-01 2000-02-15
2< NA> < NA> < NA> 1999-01-01 2003-01-15 D 2000-02-16 2000-03-31
2 2000-04-01 2000-04-15 A 1999-01-01 2003-01-15 D 2000- 04-01 2000-04-15
2< NA> < NA> < NA> 1999-01-01 2003-01-15 D 2000-04-16 2001-12-31
2 2002-01-01 2002-12-31 B 1999-01-01 2003-01-15 D 2002- 01-01 2002-12-31
2< NA> < NA> < NA> 1999-01-01 2003-01-15 D 2003-01-01 2003-01-15
3 2000-01-01 2000-06-15 A< NA> < NA> 2000-01-01 2000-06-15
3< NA> < NA> < NA> 2000-07-01 2001-08-01 E 2000-07-01 2001-08-01
(或者简单地在R中使用这些命令)
a< - structure(list(id = c(1,2,2 ,2,3),beg_a = structure(c(10957,
10957,11048,11688,10957),class =Date),end_a = structure(c(12052,
11002,11062, 1202,11123),class =Date),prop_a = structure(c(1L,
2L,1L,2L,1L),.Label = c(A,B),class =因子)),.Names = c(id,
beg_a,end_a,prop_a),row.names = c(NA,-5L),class =data.frame )
b< - structure(list(id = c(1,1,2,3))beg_b = structure(c(10743,
12067,10592,11139),class = date),end_b = structure(c(11092,
12083,12067,11535),class =Date),prop_b = structure(c(1L,
1L,1L,2L) .Label = c(D,E),class =factor)),.Names = c(id,
beg_b,end_b,prop_b),行。 name = c(NA,-4L),class =data.frame)
merged< - structure(list(id = c(1,1,1,1,2,2 ,2,2,2,2,3,3),
beg_a = structure(c(NA,10957,10957,NA,NA,10957,NA,
11048,NA,11688, NA,10957,NA),class =Date),end_a = structure(c(NA,
12052,12052,NA,NA,11002,NA,11062,NA,12052,NA,11123,
NA),class =Date),prop_a = structure(c(NA,1L,1L,NA,
NA,2L,NA,1L,NA,2L,NA,1L,NA) Label = c(A,B),class =factor),
beg_b = structure(c(10743,10743,NA,12067,10592,10592,
10592,10592, (c)(c,11092,11092,NA,12083,12067,12067,
12067,12067,12067, (1L,1L,NA,1L,1L,1L,1L,1L,1L,
1L, 1L,NA,2L),.Label = c(D,E),class =factor),
overallBeg = structure(c(10743,10957,11093,12067,10592,
10957,11003,11048,11063,11688,12053,10957,11139),class =Da te),
overallEnd = structure(c(10956,11092,12052,12083,10956,
11002,11047,11062,11687,12052,12067,1111,11535),class =Date )),.Names = c(id,
beg_a,end_a,prop_a,beg_b,end_b,prop_b,overallBeg,
),row.names = c(NA,-13L),class =data.frame)
<我认为与我的另一个问题有一些相似之处:
平滑时间数据 - 可以提高效率吗?
但也略有不同。
谢谢您提前为您的帮助!
sqldf将工作,但我尝试了一个纯R解决方案。它的作品,但它有点马虎。我没有想出如何矢量化解决方案(删除split.interval中的两个for循环,并删除需要重新排列id.split)。
首先,我创建两个可以使用一个id的函数,并将'a'和'b'合并在一起:
split。 interval = function(sub.a,sub.b){
begs = c(sub.a $ beg_a,sub.b $ beg_b)
ends = c(sub.a $ end_a,sub.b $ end_b)
dates = c(begs,ends)
dates = dates [order(dates)]
d = data.frame(overallBeg = dates [-length(dates)],overallEnd =日期[-1])$ b $ b date.match = function(x,y){
s = match(x,d $ overallBeg)
e = match(y,d $ overallEnd)
join = as.Date(rep(NA,length(d $ overallBeg)))
for(i in 1:length(x))join [s [i]:e [i]] = x [i ]
加入
}
d $ a_join = date.match(sub.a $ beg_a,sub.a $ end_a)
d $ b_join = date.match(sub .b $ beg_b,sub.b $ end_b)
d = merge(sub .a,d,by.x ='beg_a',by.y ='a_join',all.y = T)
d = merge(sub.b,d,by.x ='beg_b',by。 y ='b_join',all.y = T)
d $ id = pmax(d $ id.x,d $ id.y,na.rm = T)
d = d [ order(d $ overallBeg),c('id','beg_a','end_a','prop_a','beg_b','end_b','prop_b','overallBeg','overallEnd')]
#下一行将导致一个错误,如果overallBeg == overallEnd
d $ overallEnd [d $ overallEnd == c(d $ overallBeg [-1],F)] = d $ overallEnd [d $ overallEnd == c (d $ overallBeg [-1],F)] - 1
d
}
id.split = function(ids){
sub。 a = a [a $ id == ids,]
sub.b = b [b $ id == ids,]
split.interval(sub.a,sub.b)
}
然后我为每个ID运行该函数,并将它们绑定在一起。 / p>
l = lapply(unique(c(a $ id,b $ id)),id.split)
res = do.call(rbind,l)
row.names(res)= NULL
res
I want to merge two data frames. Both of them have a begin date and an end date.
If the given intervals are overlapping, I want to split the resulting rows in non overlapping intevals.
Please see this example:
a
id beg_a end_a prop_a
1 2000-01-01 2002-12-31 A
2 2000-01-01 2000-02-15 B
2 2000-04-01 2000-04-15 A
2 2002-01-01 2002-12-31 B
3 2000-01-01 2000-06-15 A
b
id beg_b end_b prop_b
1 1999-06-01 2000-05-15 D
1 2003-01-15 2003-01-31 D
2 1999-01-01 2003-01-15 D
3 2000-07-01 2001-08-01 E
merged
id beg_a end_a prop_a beg_b end_b prop_b overallBeg overallEnd
1 <NA> <NA> <NA> 1999-06-01 2000-05-15 D 1999-06-01 1999-12-31
1 2000-01-01 2002-12-31 A 1999-06-01 2000-05-15 D 2000-01-01 2000-05-15
1 2000-01-01 2002-12-31 A <NA> <NA> <NA> 2000-05-16 2002-12-31
1 <NA> <NA> <NA> 2003-01-15 2003-01-31 D 2003-01-15 2003-01-31
2 <NA> <NA> <NA> 1999-01-01 2003-01-15 D 1999-01-01 1999-12-31
2 2000-01-01 2000-02-15 B 1999-01-01 2003-01-15 D 2000-01-01 2000-02-15
2 <NA> <NA> <NA> 1999-01-01 2003-01-15 D 2000-02-16 2000-03-31
2 2000-04-01 2000-04-15 A 1999-01-01 2003-01-15 D 2000-04-01 2000-04-15
2 <NA> <NA> <NA> 1999-01-01 2003-01-15 D 2000-04-16 2001-12-31
2 2002-01-01 2002-12-31 B 1999-01-01 2003-01-15 D 2002-01-01 2002-12-31
2 <NA> <NA> <NA> 1999-01-01 2003-01-15 D 2003-01-01 2003-01-15
3 2000-01-01 2000-06-15 A <NA> <NA> <NA> 2000-01-01 2000-06-15
3 <NA> <NA> <NA> 2000-07-01 2001-08-01 E 2000-07-01 2001-08-01
(or simply use these commands in R)
a <- structure(list(id = c(1, 2, 2, 2, 3), beg_a = structure(c(10957,
10957, 11048, 11688, 10957), class = "Date"), end_a = structure(c(12052,
11002, 11062, 12052, 11123), class = "Date"), prop_a = structure(c(1L,
2L, 1L, 2L, 1L), .Label = c("A", "B"), class = "factor")), .Names = c("id",
"beg_a", "end_a", "prop_a"), row.names = c(NA, -5L), class = "data.frame")
b <- structure(list(id = c(1, 1, 2, 3), beg_b = structure(c(10743,
12067, 10592, 11139), class = "Date"), end_b = structure(c(11092,
12083, 12067, 11535), class = "Date"), prop_b = structure(c(1L,
1L, 1L, 2L), .Label = c("D", "E"), class = "factor")), .Names = c("id",
"beg_b", "end_b", "prop_b"), row.names = c(NA, -4L), class = "data.frame")
merged <- structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3),
beg_a = structure(c(NA, 10957, 10957, NA, NA, 10957, NA,
11048, NA, 11688, NA, 10957, NA), class = "Date"), end_a = structure(c(NA,
12052, 12052, NA, NA, 11002, NA, 11062, NA, 12052, NA, 11123,
NA), class = "Date"), prop_a = structure(c(NA, 1L, 1L, NA,
NA, 2L, NA, 1L, NA, 2L, NA, 1L, NA), .Label = c("A", "B"), class = "factor"),
beg_b = structure(c(10743, 10743, NA, 12067, 10592, 10592,
10592, 10592, 10592, 10592, 10592, NA, 11139), class = "Date"),
end_b = structure(c(11092, 11092, NA, 12083, 12067, 12067,
12067, 12067, 12067, 12067, 12067, NA, 11535), class = "Date"),
prop_b = structure(c(1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, NA, 2L), .Label = c("D", "E"), class = "factor"),
overallBeg = structure(c(10743, 10957, 11093, 12067, 10592,
10957, 11003, 11048, 11063, 11688, 12053, 10957, 11139), class = "Date"),
overallEnd = structure(c(10956, 11092, 12052, 12083, 10956,
11002, 11047, 11062, 11687, 12052, 12067, 11123, 11535), class = "Date")), .Names = c("id",
"beg_a", "end_a", "prop_a", "beg_b", "end_b", "prop_b", "overallBeg",
"overallEnd"), row.names = c(NA, -13L), class = "data.frame")
I think there are some similarities with another question of mine: "smoothing" time data - can it be done more efficient?
But also slightly different.
Thank you in advance for your help!
sqldf will work, but I tried a 'pure' R solution. It works, but it is a little sloppy. I haven't figured out how to 'vectorize' the solution (remove the two for loops in the split.interval, and remove the need to lapply over id.split).
First I create two functions that can take one id, and merge 'a' and 'b' together:
split.interval = function(sub.a, sub.b) {
begs = c(sub.a$beg_a,sub.b$beg_b)
ends = c(sub.a$end_a,sub.b$end_b)
dates=c(begs,ends)
dates = dates[order(dates)]
d = data.frame(overallBeg = dates[-length(dates)], overallEnd = dates[-1])
date.match = function(x,y) {
s = match(x, d$overallBeg )
e = match(y, d$overallEnd )
join=as.Date(rep(NA,length(d$overallBeg)))
for (i in 1:length(x)) join [s[i]:e[i]]= x[i]
join
}
d$a_join = date.match(sub.a$beg_a,sub.a$end_a)
d$b_join = date.match(sub.b$beg_b,sub.b$end_b)
d = merge(sub.a,d,by.x='beg_a',by.y='a_join',all.y=T)
d = merge(sub.b,d,by.x='beg_b',by.y='b_join',all.y=T)
d$id=pmax(d$id.x,d$id.y,na.rm=T)
d = d [order(d$overallBeg),c('id','beg_a','end_a','prop_a','beg_b','end_b','prop_b','overallBeg','overallEnd')]
# This next line will lead to a bug if overallBeg == overallEnd
d$overallEnd [d$overallEnd == c(d$overallBeg[-1],F)] = d$overallEnd [d$overallEnd == c(d$overallBeg[-1],F)] - 1
d
}
id.split = function (ids) {
sub.a=a[a$id==ids,]
sub.b=b[b$id==ids,]
split.interval ( sub.a , sub.b )
}
Then I run the function for every ID, and bind them all together.
l=lapply(unique(c(a$id,b$id)), id.split)
res = do.call(rbind,l)
row.names(res) = NULL
res
这篇关于在不重叠的间隔上合并两个数据帧的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!