在R中的日期范围上合并两个数据帧 [英] Merging two dataframes on a date range in R
问题描述
df1< - structure(list(ID = 1:2,PtID = c(1L,1L),标号= c(T031,T040,T045,T064,T074,T081,T092,T094,T096 T107,T108,T108,T108,T108,T108,T115,T118,T120,T124 T142,T142_R1,T146,T158,T159,T160,T164,T166,T169,T171,T173,T197 ,T222,T221),class =factor),StartDateTime = structure(list(sec = c(0,0))min = c(11L,35L),小时= c(17L,17L) ,mday = c(23L,23L),mon = c(9L,9L),year = c(112L,112L),wday = c(2L,2L),yday = c(296L,296L),isdst = c 1L,1L)),.Names = c(sec,min,hour,mday,mon,year,wday,yday,isdst c(POSIXlt,POSIXt)),EndDateTime = structure(list(sec = c(0,0),min = c(16L,37L),hour = c(17L,17L),mday = ,23L),mon = c(9L,9L),year = c(112L,112L),wday = c(2L,2L),yday = c(296L,296L),isdst = c(1L,1L) .Names = c(sec,min ,小时,mday,mon,年,wday,yday,isdst),class = c(POSIXlt,POSIXt))),.Names = c ID,PtID,StartDateTime,EndDateTime),row.names = 1:2,class =data.frame)
df1
ID PtID StartDateTime EndDateTime
1 1 T031 2012- 10-23 17:11:00 2012-10-23 17:16:00
2 2 T031 2012-10-23 17:35:00 2012-10-23 17:37:00
第二个数据框具有几个ID(与第一个数据帧匹配)和分钟级别的时间戳。
df2
df2< - structure(list(ID = c(1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,3,3,3,3,3,3,3, 3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2),dateTime = structure(list(sec = c(0,0,0,0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0),min = 2:44,小时= c(17L,17L,17L,17L,17L, 17L,17L,17L,17L,17L ,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L,17L, ,17L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L, 23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L,23L, 23L,23L,23L,23L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L, ,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L,9L)年份= c(112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L, 112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L,112L),wday = c(2L,2L, 2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L, ,2L,2L,2L,2L,2L,2L,2L,2L,2L,2 L,2L,2L,2L,2L,2L),yday = c(296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L ,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L,296L, ,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L)), (sec,min,hour,mday,mon,year,wday,yday,isdst),class = c(POSIXlt,POSIXt ),LAT = C(33.06621406,33.06616621,33.06617305,33.06617624,33.06617932,33.06618161,33.06618326,33.06618604,33.06615089,33.06628004,33.06618461,33.06615113,33.0661362,33.06620301,33.0662218,33.06624283,33.06622268,33.06622425,33.06622787,33.06623042,33.06623318,33.06623654, 33.06623826,33.06623919,33.06623907,33.06624009,33.06623804,33.06624255,33.06624377,33.06624446,33.06624242, 33.06624254,33.06624513,33.06624582,33.06615573,33.06625534,33.06618541,33.06613825,33.06613624,33.06614027,33.06614551,33.06614844,33.06615393),lon = c(-116.6105531,116.6105651,116.6105613,116.6105553,116.610551,116.610549,116.6105484, -116.6105512,116.6105712,116.6104996,116.6104711,116.6104854,116.6105596,116.6104509,116.610524,116.6105535,116.6105461,116.6105461,116.6105477,116.6105498,116.6105478,116.6105473,116.6105473,116.6105488 ,-116.6105497,-116.6105479,116.610545,116.6105461,116.6105448,116.610543,116.6105409,116.6105395,116.6105367,116.6105337,116.6105344,116.6104779,116.6104953,116.6105222,116.610526,116.6105255, 116.6105282,-116.6105265,-116.6105282)),.Names = c(ID,dateTime,lat,lon),row.names = 1023:1065,class =data.frame $ b
所以希望的输出如下所示:
ID PtID DateTime lat lon
1 T031 2012-10-23 17:11:00 33.06628 -116.6105
1 T031 2012-10-23 17:12:00 33.06618 -116.6105
1 T031 2012-10-23 17:13: 00 33.06615 -116.6105
1 T031 2012-10-23 17:14:00 33.06614 -116.6106
1 T031 2012-10-23 17:15:00 33.06620 -116.6105
1 T031 2012- 10-23 17:16:00 33.06622 -116.6105
2 T031 2012-10-23 17:35:00 33.06625 -116.6105
2 T031 2012-10-23 17:36:00 33.06616 -116.6105
2 T031 2012-10-23 17:37:00 33.06626 -116.6105
所以用sqldf可能这样吗?
sqldf(SELECT df2.ID,df2.lon,df2.lat,FROM df1
INNER JOIN df2 ON df1.ID = df2.ID
WHERE df2.DateTime BETWEEN df1.StartDateTime AND df1.EndDateTime)
一般来说,它不是在数据帧中使用POSIXlt的好主意。改用POSIXct。另外,您的SQL语句也可以,除了需要删除之前的逗号之外:
df1a< - transform(df1,
StartDateTime = as.POSIXct(StartDateTime),
EndDateTime = as.POSIXct(EndDateTime))
df2a< - transform(df2,dateTime = as.POSIXct(dateTime))
问题中的SQL语句在 FROM
之前有一个无关的逗号。
这是一个稍微简化的语句。这一个使用左连接来确保包含 df1a
中的所有ID,即使它们在 df2a
中没有匹配。
sqldf(SELECT df1a.ID,PtID,dateTime,lat,lon
FROM df1a LEFT JOIN df2a
ON df1a.ID = df2a.ID AND dateTime BETWEEN StartDateTime AND EndDateTime)
In R I want to merge two dataframes on a range of dates, taking all rows from the second dataframe which fall on and between two columns of dates from the first dataframe. I couldn't find a strictly R function or version of the merge function that could do this, but I know there's a 'between' function in sql and I was thinking of trying the sqldf package (although I'm not well versed in sql). If there's a more R-ish way to do this, that would be preferable. Thank you in advance for your help!
df1 <- structure(list(ID = 1:2, PtID = structure(c(1L, 1L), .Label = c("T031", "T040", "T045", "T064", "T074", "T081", "T092", "T094", "T096", "T105", "T107", "T108", "T115", "T118", "T120", "T124", "T125", "T128", "T130", "T132", "T138", "T140", "T142", "T142_R1", "T146", "T158", "T159", "T160", "T164", "T166", "T169", "T171", "T173", "T197", "T208", "T214", "T221"), class = "factor"), StartDateTime = structure(list(sec = c(0, 0), min = c(11L, 35L), hour = c(17L, 17L), mday = c(23L, 23L), mon = c(9L, 9L), year = c(112L, 112L), wday = c(2L, 2L), yday = c(296L, 296L), isdst = c(1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt")), EndDateTime = structure(list(sec = c(0, 0), min = c(16L, 37L), hour = c(17L, 17L), mday = c(23L, 23L), mon = c(9L, 9L), year = c(112L, 112L), wday = c(2L, 2L), yday = c(296L, 296L), isdst = c(1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"))), .Names = c("ID", "PtID", "StartDateTime", "EndDateTime"), row.names = 1:2, class = "data.frame")
df1
ID PtID StartDateTime EndDateTime
1 1 T031 2012-10-23 17:11:00 2012-10-23 17:16:00
2 2 T031 2012-10-23 17:35:00 2012-10-23 17:37:00
The second dataframe has several IDs (which match the first dataframe) and timestamps on the minute level.
df2
df2 <- structure(list(ID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), dateTime = structure(list(sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = 2:44, hour = c(17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L), mday = c(23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L), mon = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), year = c(112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L), wday = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), yday = c(296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt")), lat = c(33.06621406, 33.06616621, 33.06617305, 33.06617624, 33.06617932, 33.06618161, 33.06618326, 33.06618604, 33.06615089, 33.06628004, 33.06618461, 33.06615113, 33.0661362, 33.06620301, 33.0662218, 33.06624283, 33.06622268, 33.06622425, 33.06622787, 33.06623042, 33.06623318, 33.06623654, 33.06623826, 33.06623919, 33.06623907, 33.06624009, 33.06623804, 33.06624255, 33.06624377, 33.06624446, 33.06624242, 33.06624254, 33.06624513, 33.06624582, 33.06615573, 33.06625534, 33.06618541, 33.06613825, 33.06613624, 33.06614027, 33.06614551, 33.06614844, 33.06615393), lon = c(-116.6105531, -116.6105651,-116.6105613, -116.6105553, -116.610551, -116.610549, -116.6105484, -116.6105512, -116.6105712, -116.6104996, -116.6104711, -116.6104854, -116.6105596, -116.6104509, -116.610524, -116.6105535, -116.6105461, -116.6105461, -116.6105477, -116.6105498, -116.6105478, -116.6105473, -116.6105473, -116.6105488, -116.6105497, -116.6105479, -116.610545, -116.6105461, -116.6105448, -116.610543, -116.6105409, -116.6105395, -116.6105367, -116.6105337, -116.6105344, -116.6104779, -116.6104953,-116.6105222, -116.610526, -116.6105255, -116.6105282, -116.6105265,-116.6105282)), .Names = c("ID", "dateTime", "lat", "lon"), row.names = 1023:1065, class = "data.frame")
So the desired output would look like this:
ID PtID DateTime lat lon
1 T031 2012-10-23 17:11:00 33.06628 -116.6105
1 T031 2012-10-23 17:12:00 33.06618 -116.6105
1 T031 2012-10-23 17:13:00 33.06615 -116.6105
1 T031 2012-10-23 17:14:00 33.06614 -116.6106
1 T031 2012-10-23 17:15:00 33.06620 -116.6105
1 T031 2012-10-23 17:16:00 33.06622 -116.6105
2 T031 2012-10-23 17:35:00 33.06625 -116.6105
2 T031 2012-10-23 17:36:00 33.06616 -116.6105
2 T031 2012-10-23 17:37:00 33.06626 -116.6105
So with sqldf maybe something like this?
sqldf("SELECT df2.ID, df2.lon, df2.lat, FROM df1
INNER JOIN df2 ON df1.ID = df2.ID
WHERE df2.DateTime BETWEEN df1.StartDateTime AND df1.EndDateTime")
In general, its not a good idea to use POSIXlt in data frames. Use POSIXct instead. Also your SQL statement is ok except the comma before FROM needs to be removed:
df1a <- transform(df1,
StartDateTime = as.POSIXct(StartDateTime),
EndDateTime = as.POSIXct(EndDateTime))
df2a <- transform(df2, dateTime = as.POSIXct(dateTime))
The SQL statement in the question has an extraneous commma before FROM
.
Here is a slightly simplified statement. This one uses a left join instead to ensure that all ID's from df1a
are included even if they have no matches in df2a
.
sqldf("SELECT df1a.ID, PtID, dateTime, lat, lon
FROM df1a LEFT JOIN df2a
ON df1a.ID = df2a.ID AND dateTime BETWEEN StartDateTime AND EndDateTime")
这篇关于在R中的日期范围上合并两个数据帧的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!