确定重叠范围 - R [英] Determine overlapping ranges - R
问题描述
我有两个数据帧。一个月内有购买,一个月内有广告的广告(广告)。要了解某次购买是否可以可靠地与广告相关联 - 我想了解有多少购买日期是在广告后的4天内。我创建了一些(繁琐的)代码来做这个 - 这是基于扩展的广告数据库的每一行覆盖相关的4天期间 - 然后使用合并结构,看看在哪里是(缺乏)重叠。这感觉就像一个非常繁琐的方法来做事情。理想的 - 我本来希望在dplyr以优雅的方式做到这一点。让我知道是否有任何建议
库(dplyr)
$ <$>
库(lubridate)
require data.table)
#设置开始和结束日期,以在
之间取样day.start< - 2007/01/01
day.end< - 2007 / 01/30
set.seed(1)#定义一个随机日期/时间选择函数
rand.day.time< - function(day.start,day.end, size){
dayseq< - seq.Date(as.Date(day.start),as.Date(day.end),by =day)
dayselect& ,size,replace = TRUE)
as.POSIXlt(paste(dayselect))
}
dateval = rand.day.time(day.start,day.end,size = 20)
### create initial dataframes
action = rep(c(ad,purchase),10)
id = rep(c 1,2,2),5)
df = data.frame(customer = id,date = dateval,action = action)
df_pur = filter(df,action ==purchase); df_pur = df_pur [order(df_pur $ date),])
df_ad = filter(df,action ==ad);(df_ad = df_ad [order(df_ad $ date),])
#expand数据框包含广告可能触发购买的所有范围
df_ad_exp = df_ad%>%
group_by(customer,date)%>%
汇总(df_ad_exp)
df_ad_exp2 = setDT(df_ad_exp)[,list(customer = 4) customer,range = seq(start,end,by =day)),by = 1:nrow(df_ad_exp)]
###合并数据框,使用NA值来标识那些日期已进行购买但没有广告活动
df_ad_exp2 = as.data.frame(df_ad_exp2)
(df_ad_exp2 = df_ad_exp2 [,c(customer,range)])
df_ad_exp2 $ helpercol = 0
(df_pur_m = merge(df_pur,df_ad_exp2,by.x = c(date,customer),by.y = c(range,customer),all .x = TRUE))
df_pur_m $ ad_in_range = df_pur_m $ helpercol; df_pur_m $ helpercol = NULL
df_pur_m $ ad_in_range [!is.na(df_pur_m $ ad_in_range)] = 1; df_pur_m $ ad_in_range [is.na(df_pur_m $ ad_in_range)] = 0
#outcomes
df_pur
df_ad
df_pur_m
> df_ad
客户日期操作
3 1 2007-01-07 ad
6 2 2007-01-07 ad
1 1 2007-01-08 ad
10 2 2007-01-12 ad
2 2 2007-01-18 ad
5 1 2007-01-19 ad
7 1 2007-01-21 ad
9 1 2007- 01-22 ad
8 2 2007-01-24 ad
4 2 2007-01-29 ad
> df_pur_m
date客户操作ad_in_range
1 2007-01-02 1购买0
2 2007-01-06 2购买0
3 2007-01-12 1购买1
4 2007-01-12 1购买1
5 2007-01-15 2购买1
6 2007-01-20 2购买1
7 2007-01-24 2购买1
8 2007-01-27 1购买0
9 2007-01-28 2购买1
10 2007-01-30 1购买0
解决方案尝试
,这是为 设计的(我不能想到优雅的
foverlaps
data.tabledplyr
方式,对不起)。您需要在两个表格中都有开始/结束日期列,因此广告的开始/结束日期是开始日期到4天后;#df_ad必须键入
setDT(df_ad)[ ,ad_date_end:= date + days(4)]
setnames(df_ad,'date','ad_date')#为了以后的可读性
setkey(df_ad,customer,ad_date,ad_date_end)
setDT(df_pur)[,purch_end:= date]
setnames(df_pur,'date','purch_date')#为了可读性
#type ='within' x间隔(购买)在y间隔(ad)内
#我们合并客户ID,开始&结束日期
ovl < - foverlaps(df_pur,df_ad,
by.x = c('customer','purch_date','purch_end'),type ='within')
#customer ad_date action ad_date_end purch_date i.action purch_end
#1:1< NA> NA NA< 2007-01-02 purchase 2007-01-02
#2:2< NA> NA NA< 2007-01-06购买2007-01-06
#3:1 2007-01-08 ad 2007-01-12 2007-01-12购买2007-01-12
#4:1 2007 -01-08 ad 2007-01-12 2007-01-12购买2007-01-12
#5:2 2007-01-12 ad 2007-01-16 2007-01-15购买2007-01- 15
#6:2 2007-01-18 ad 2007-01-22 2007-01-20 purchase 2007-01-20
#7:2 2007-01-24 ad 2007-01-28 2007-01-24 purchase 2007-01-24
#8:1< NA> NA NA< 2007-01-27购买2007-01-27
#9:2 2007-01-24 ad 2007-01-28 2007-01-28购买2007-01-28
#10:1< ; NA> NA NA< 2007-01-30购买2007-01-30
#tidyup
ovl [,action:= i.action] [,c('ad_date_end','purch_end','i.action'): = NULL]
customer ad_date action purch_date
#1:1< NA> purchase 2007-01-02
#2:2< NA>购买2007-01-06
#3:1 2007-01-08购买2007-01-12
#4:1 2007-01-08购买2007-01-12
#5 :2 2007-01-12购买2007-01-15
#6:2 2007-01-18购买2007-01-20
#7:2 2007-01-24购买2007-01- 24
#8:1< NA>购买2007-01-27
#9:2 2007-01-24购买2007-01-28
#10:1< NA&购买2007-01-30
NA
ad_date
是与广告无关的购买。I have two dataframes. One with purchases during a month, one with the advertisements (ads) that were broadcassted in that month. To understand whether a purchase can be credibly linked to an ad – I want to understand how many of the purchase dates come within 4 days after an advertisement. I created some (cumbersome) code to do this – which is based on expansion of each row of the advertisement database to cover the relevant 4 day period – and then use a merge construct to see where is a (lack of) overlap. This feels like a very cumbersome ways to do things. Ideally – I would have liked to do this in dplyr in an elegant manner. let me know if anyone has any suggestions
library(dplyr) library(lubridate) require(data.table) # set start and end dates to sample between day.start <- "2007/01/01" day.end <- "2007/01/30" set.seed(1) # define a random date/time selection function rand.day.time <- function(day.start,day.end,size) { dayseq <- seq.Date(as.Date(day.start),as.Date(day.end),by="day") dayselect <- sample(dayseq,size,replace=TRUE) as.POSIXlt(paste(dayselect) ) } dateval=rand.day.time(day.start,day.end,size=20) ###create initial dataframes action=rep(c("ad","purchase"),10) id=rep(c(1,1,2,2),5) df=data.frame(customer=id,date=dateval,action=action) df_pur=filter(df,action=="purchase");(df_pur=df_pur[order(df_pur$date),]) df_ad=filter(df,action=="ad");(df_ad=df_ad[order(df_ad$date),]) #expand data-frame to include all the ranges for which the ad might trigger purchases df_ad_exp = df_ad %>% group_by(customer,date) %>% summarize(start=min(date),end=min(date+days(4))) df_ad_exp=as.data.frame(df_ad_exp) df_ad_exp2=setDT(df_ad_exp)[, list(customer=customer, range=seq(start,end,by="day")), by=1:nrow(df_ad_exp)] ###merge the dataframe, use NA values to identify those dates in which purchase was made but no ad was "active" df_ad_exp2=as.data.frame(df_ad_exp2) (df_ad_exp2=df_ad_exp2[,c("customer","range")]) df_ad_exp2$helpercol=0 (df_pur_m=merge(df_pur,df_ad_exp2,by.x=c("date","customer"),by.y=c("range","customer"),all.x=TRUE)) df_pur_m$ad_in_range=df_pur_m$helpercol;df_pur_m$helpercol=NULL df_pur_m$ad_in_range[!is.na(df_pur_m$ad_in_range)]=1;df_pur_m$ad_in_range[is.na(df_pur_m$ad_in_range)]=0 #outcomes df_pur df_ad df_pur_m > df_ad customer date action 3 1 2007-01-07 ad 6 2 2007-01-07 ad 1 1 2007-01-08 ad 10 2 2007-01-12 ad 2 2 2007-01-18 ad 5 1 2007-01-19 ad 7 1 2007-01-21 ad 9 1 2007-01-22 ad 8 2 2007-01-24 ad 4 2 2007-01-29 ad > df_pur_m date customer action ad_in_range 1 2007-01-02 1 purchase 0 2 2007-01-06 2 purchase 0 3 2007-01-12 1 purchase 1 4 2007-01-12 1 purchase 1 5 2007-01-15 2 purchase 1 6 2007-01-20 2 purchase 1 7 2007-01-24 2 purchase 1 8 2007-01-27 1 purchase 0 9 2007-01-28 2 purchase 1 10 2007-01-30 1 purchase 0
解决方案Try
foverlaps
indata.table
, it's designed for this (I can't think of an elegantdplyr
way, sorry). You need to have a start/end date column in both tables, so the start/end date for the ad is the start date to 4 days later; the start/end date for the purchase are the same.# df_ad must be keyed setDT(df_ad)[, ad_date_end:=date + days(4)] setnames(df_ad, 'date', 'ad_date') # just for readability later setkey(df_ad, customer, ad_date, ad_date_end) setDT(df_pur)[, purch_end:=date] setnames(df_pur, 'date', 'purch_date') # for readability # type='within': the x interval (purchase) is within the y interval (ad) # we merge on customer ID, start & end date ovl <- foverlaps(df_pur, df_ad, by.x=c('customer', 'purch_date', 'purch_end'), type='within') # customer ad_date action ad_date_end purch_date i.action purch_end # 1: 1 <NA> NA <NA> 2007-01-02 purchase 2007-01-02 # 2: 2 <NA> NA <NA> 2007-01-06 purchase 2007-01-06 # 3: 1 2007-01-08 ad 2007-01-12 2007-01-12 purchase 2007-01-12 # 4: 1 2007-01-08 ad 2007-01-12 2007-01-12 purchase 2007-01-12 # 5: 2 2007-01-12 ad 2007-01-16 2007-01-15 purchase 2007-01-15 # 6: 2 2007-01-18 ad 2007-01-22 2007-01-20 purchase 2007-01-20 # 7: 2 2007-01-24 ad 2007-01-28 2007-01-24 purchase 2007-01-24 # 8: 1 <NA> NA <NA> 2007-01-27 purchase 2007-01-27 # 9: 2 2007-01-24 ad 2007-01-28 2007-01-28 purchase 2007-01-28 # 10: 1 <NA> NA <NA> 2007-01-30 purchase 2007-01-30 # tidyup ovl[, action:=i.action][, c('ad_date_end', 'purch_end', 'i.action'):=NULL] customer ad_date action purch_date # 1: 1 <NA> purchase 2007-01-02 # 2: 2 <NA> purchase 2007-01-06 # 3: 1 2007-01-08 purchase 2007-01-12 # 4: 1 2007-01-08 purchase 2007-01-12 # 5: 2 2007-01-12 purchase 2007-01-15 # 6: 2 2007-01-18 purchase 2007-01-20 # 7: 2 2007-01-24 purchase 2007-01-24 # 8: 1 <NA> purchase 2007-01-27 # 9: 2 2007-01-24 purchase 2007-01-28 # 10: 1 <NA> purchase 2007-01-30
The rows with
NA
ad_date
were purchases not related to an ad.这篇关于确定重叠范围 - R的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!