确定重叠范围 - R [英] Determine overlapping ranges - R

查看：116 发布时间：2017/3/12 11:59:01 r date merge data.table dplyr

本文介绍了确定重叠范围 - R的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我有两个数据帧。一个月内有购买，一个月内有广告的广告（广告）。要了解某次购买是否可以可靠地与广告相关联 - 我想了解有多少购买日期是在广告后的4天内。我创建了一些（繁琐的）代码来做这个 - 这是基于扩展的广告数据库的每一行覆盖相关的4天期间 - 然后使用合并结构，看看在哪里是（缺乏）重叠。这感觉就像一个非常繁琐的方法来做事情。理想的 - 我本来希望在dplyr以优雅的方式做到这一点。让我知道是否有任何建议

 库（dplyr）
库（lubridate）
 require data.table）
 
＃设置开始和结束日期，以在
之间取样day.start<  - 2007/01/01
 day.end<  - 2007 / 01/30
 
 set.seed（1）＃定义一个随机日期/时间选择函数
 rand.day.time<  -  function（day.start，day.end， size）{
 dayseq<  -  seq.Date（as.Date（day.start），as.Date（day.end），by =day）
 dayselect& ，size，replace = TRUE）
 as.POSIXlt（paste（dayselect））
} 
 
 dateval = rand.day.time（day.start，day.end，size = 20）
 
 ### create initial dataframes 
 action = rep（c（ad，purchase），10）
 id = rep（c 1,2,2），5）
 df = data.frame（customer = id，date = dateval，action = action）
 df_pur = filter（df，action ==purchase）; df_pur = df_pur [order（df_pur $ date），]）
 df_ad = filter（df，action ==ad）;（df_ad = df_ad [order（df_ad $ date），]）
 
 #expand数据框包含广告可能触发购买的所有范围
 df_ad_exp = df_ad％>％
 group_by（customer，date）％>％
汇总（df_ad_exp）
 df_ad_exp2 = setDT（df_ad_exp）[，list（customer = 4） customer，range = seq（start，end，by =day）），by = 1：nrow（df_ad_exp）] 
 
 ###合并数据框，使用NA值来标识那些日期已进行购买但没有广告活动
 df_ad_exp2 = as.data.frame（df_ad_exp2）
（df_ad_exp2 = df_ad_exp2 [，c（customer，range）]）
 df_ad_exp2 $ helpercol = 0 
（df_pur_m = merge（df_pur，df_ad_exp2，by.x = c（date，customer），by.y = c（range，customer），all .x = TRUE））
 
 df_pur_m $ ad_in_range = df_pur_m $ helpercol; df_pur_m $ helpercol = NULL 
 df_pur_m $ ad_in_range [！is.na（df_pur_m $ ad_in_range）] = 1; df_pur_m $ ad_in_range [is.na（df_pur_m $ ad_in_range）] = 0 
 
 #outcomes 
 df_pur 
 df_ad 
 df_pur_m 
 
> df_ad 
客户日期操作
 3 1 2007-01-07 ad 
 6 2 2007-01-07 ad 
 1 1 2007-01-08 ad 
 10 2 2007-01-12 ad 
 2 2 2007-01-18 ad 
 5 1 2007-01-19 ad 
 7 1 2007-01-21 ad 
 9 1 2007- 01-22 ad 
 8 2 2007-01-24 ad 
 4 2 2007-01-29 ad 
> df_pur_m 
 date客户操作ad_in_range 
 1 2007-01-02 1购买0 
 2 2007-01-06 2购买0 
 3 2007-01-12 1购买1 
 4 2007-01-12 1购买1 
 5 2007-01-15 2购买1 
 6 2007-01-20 2购买1 
 7 2007-01-24 2购买1 
 8 2007-01-27 1购买0 
 9 2007-01-28 2购买1 
 10 2007-01-30 1购买0 
   $ <$> 
 
解决方案
尝试  foverlaps  data.table ，这是为 设计的（我不能想到优雅的 dplyr 方式，对不起）。您需要在两个表格中都有开始/结束日期列，因此广告的开始/结束日期是开始日期到4天后; 
 ＃df_ad必须键入
 setDT（df_ad）[ ，ad_date_end：= date + days（4）] 
 setnames（df_ad，'date'，'ad_date'）＃为了以后的可读性
 setkey（df_ad，customer，ad_date，ad_date_end）
 
 setDT（df_pur）[，purch_end：= date] 
 setnames（df_pur，'date'，'purch_date'）＃为了可读性
 
＃type ='within' x间隔（购买）在y间隔（ad）内
＃我们合并客户ID，开始&结束日期
 ovl < -  foverlaps（df_pur，df_ad，
 by.x = c（'customer'，'purch_date'，'purch_end'），type ='within'）
 
＃customer ad_date action ad_date_end purch_date i.action purch_end 
＃1：1< NA> NA NA< 2007-01-02 purchase 2007-01-02 
＃2：2< NA> NA NA< 2007-01-06购买2007-01-06 
＃3：1 2007-01-08 ad 2007-01-12 2007-01-12购买2007-01-12 
＃4：1 2007 -01-08 ad 2007-01-12 2007-01-12购买2007-01-12 
＃5：2 2007-01-12 ad 2007-01-16 2007-01-15购买2007-01- 15 
＃6：2 2007-01-18 ad 2007-01-22 2007-01-20 purchase 2007-01-20 
＃7：2 2007-01-24 ad 2007-01-28 2007-01-24 purchase 2007-01-24 
＃8：1< NA> NA NA< 2007-01-27购买2007-01-27 
＃9：2 2007-01-24 ad 2007-01-28 2007-01-28购买2007-01-28 
＃10：1< ; NA> NA NA< 2007-01-30购买2007-01-30 
＃tidyup 
 ovl [，action：= i.action] [，c（'ad_date_end'，'purch_end'，'i.action'）： = NULL] 
 customer ad_date action purch_date 
＃1：1< NA> purchase 2007-01-02 
＃2：2< NA>购买2007-01-06 
＃3：1 2007-01-08购买2007-01-12 
＃4：1 2007-01-08购买2007-01-12 
＃5 ：2 2007-01-12购买2007-01-15 
＃6：2 2007-01-18购买2007-01-20 
＃7：2 2007-01-24购买2007-01- 24 
＃8：1< NA>购买2007-01-27 
＃9：2 2007-01-24购买2007-01-28 
＃10：1< NA&购买2007-01-30 
  
  NA   ad_date 是与广告无关的购买。
 
I have two dataframes. One with purchases during a month, one with the advertisements (ads) that were broadcassted in that month. To understand whether a purchase can be credibly linked to an ad – I want to understand how many of the purchase dates come within 4 days after an advertisement. I created some (cumbersome) code to do this – which is based on expansion of each row of the advertisement database to cover the relevant 4 day period – and then use a merge construct to see where is a (lack of) overlap. This feels like a very cumbersome ways to do things. Ideally – I would have liked to do this in dplyr in an elegant manner. let me know if anyone has any suggestions 
library(dplyr)
library(lubridate)
require(data.table)

# set start and end dates to sample between
day.start <- "2007/01/01"
day.end <- "2007/01/30"

set.seed(1) # define a random date/time selection function
rand.day.time <- function(day.start,day.end,size) {
  dayseq <- seq.Date(as.Date(day.start),as.Date(day.end),by="day")
  dayselect <- sample(dayseq,size,replace=TRUE)
  as.POSIXlt(paste(dayselect) )
}

dateval=rand.day.time(day.start,day.end,size=20)

###create initial dataframes
action=rep(c("ad","purchase"),10)
id=rep(c(1,1,2,2),5)
df=data.frame(customer=id,date=dateval,action=action)
df_pur=filter(df,action=="purchase");(df_pur=df_pur[order(df_pur$date),])
df_ad=filter(df,action=="ad");(df_ad=df_ad[order(df_ad$date),])

#expand data-frame to include all the ranges for which the ad might trigger purchases
df_ad_exp = df_ad %>%
  group_by(customer,date) %>%
  summarize(start=min(date),end=min(date+days(4))) 
df_ad_exp=as.data.frame(df_ad_exp)
df_ad_exp2=setDT(df_ad_exp)[, list(customer=customer, range=seq(start,end,by="day")), by=1:nrow(df_ad_exp)]

###merge the dataframe, use NA values to identify those dates in which purchase was made but no ad was "active"
df_ad_exp2=as.data.frame(df_ad_exp2)
(df_ad_exp2=df_ad_exp2[,c("customer","range")])
df_ad_exp2$helpercol=0
(df_pur_m=merge(df_pur,df_ad_exp2,by.x=c("date","customer"),by.y=c("range","customer"),all.x=TRUE))

df_pur_m$ad_in_range=df_pur_m$helpercol;df_pur_m$helpercol=NULL
df_pur_m$ad_in_range[!is.na(df_pur_m$ad_in_range)]=1;df_pur_m$ad_in_range[is.na(df_pur_m$ad_in_range)]=0

#outcomes
df_pur
df_ad
df_pur_m

> df_ad
   customer       date action
3         1 2007-01-07     ad
6         2 2007-01-07     ad
1         1 2007-01-08     ad
10        2 2007-01-12     ad
2         2 2007-01-18     ad
5         1 2007-01-19     ad
7         1 2007-01-21     ad
9         1 2007-01-22     ad
8         2 2007-01-24     ad
4         2 2007-01-29     ad
> df_pur_m
         date customer   action ad_in_range
1  2007-01-02        1 purchase           0
2  2007-01-06        2 purchase           0
3  2007-01-12        1 purchase           1
4  2007-01-12        1 purchase           1
5  2007-01-15        2 purchase           1
6  2007-01-20        2 purchase           1
7  2007-01-24        2 purchase           1
8  2007-01-27        1 purchase           0
9  2007-01-28        2 purchase           1
10 2007-01-30        1 purchase           0

 解决方案 
Try foverlaps in data.table, it's designed for this (I can't think of an elegant dplyr way, sorry). You need to have a start/end date column in both tables, so the start/end date for the ad is the start date to 4 days later; the start/end date for the purchase are the same.
# df_ad must be keyed
setDT(df_ad)[, ad_date_end:=date + days(4)]
setnames(df_ad, 'date', 'ad_date') # just for readability later
setkey(df_ad, customer, ad_date, ad_date_end)

setDT(df_pur)[, purch_end:=date]
setnames(df_pur, 'date', 'purch_date') # for readability

# type='within': the x interval (purchase) is within the y interval (ad)
# we merge on customer ID, start & end date
ovl <- foverlaps(df_pur, df_ad,
                 by.x=c('customer', 'purch_date', 'purch_end'), type='within') 

#     customer    ad_date action ad_date_end purch_date i.action  purch_end
#  1:        1       <NA>     NA        <NA> 2007-01-02 purchase 2007-01-02
#  2:        2       <NA>     NA        <NA> 2007-01-06 purchase 2007-01-06
#  3:        1 2007-01-08     ad  2007-01-12 2007-01-12 purchase 2007-01-12
#  4:        1 2007-01-08     ad  2007-01-12 2007-01-12 purchase 2007-01-12
#  5:        2 2007-01-12     ad  2007-01-16 2007-01-15 purchase 2007-01-15
#  6:        2 2007-01-18     ad  2007-01-22 2007-01-20 purchase 2007-01-20
#  7:        2 2007-01-24     ad  2007-01-28 2007-01-24 purchase 2007-01-24
#  8:        1       <NA>     NA        <NA> 2007-01-27 purchase 2007-01-27
#  9:        2 2007-01-24     ad  2007-01-28 2007-01-28 purchase 2007-01-28
# 10:        1       <NA>     NA        <NA> 2007-01-30 purchase 2007-01-30
# tidyup
ovl[, action:=i.action][, c('ad_date_end', 'purch_end', 'i.action'):=NULL]
    customer    ad_date   action purch_date
#  1:        1       <NA> purchase 2007-01-02
#  2:        2       <NA> purchase 2007-01-06
#  3:        1 2007-01-08 purchase 2007-01-12
#  4:        1 2007-01-08 purchase 2007-01-12
#  5:        2 2007-01-12 purchase 2007-01-15
#  6:        2 2007-01-18 purchase 2007-01-20
#  7:        2 2007-01-24 purchase 2007-01-24
#  8:        1       <NA> purchase 2007-01-27
#  9:        2 2007-01-24 purchase 2007-01-28
# 10:        1       <NA> purchase 2007-01-30
The rows with NA ad_date were purchases not related to an ad.

                        这篇关于确定重叠范围 -  R的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

确定重叠范围 - R [英] Determine overlapping ranges - R

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

确定重叠范围 - R [英] Determine overlapping ranges - R

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭