R function for a function to be repeated based on column values


Problem Description


    I have a large data set for which I need to do string matching. I have got some very useful posts from this site and, referring to them, I have created a function to do the string matching for my dataset. I am pasting my sample data and code.

    SAMPLE DATA

    Address1 <- c("786, GALI NO 5, XYZ","rambo, 45, strret 4, atlast, pqr","23/4, 23RD FLOOR, STREET 2, ABC-E, PQR","45-B, GALI NO5, XYZ","HECTIC, 99 STREET, PQR")
    AREACODE <- c('10','10','14','20','30')
    Year1 <- c(2001:2005)
    
    Address2 <- c("abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR","abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR")
    Year2 <- c(2001:2010)
    AREA_CODE <- c('10','10','10','20','30','40','50','61','64', '99')
    
    data1 <- data.table(Address1, Year1, AREACODE)
    data2 <- data.table(Address2, Year2, AREA_CODE)
    data2[, unique_id := sprintf("%06d", 1:nrow(data2))]
    

    CODE

    library(stringdist)  # provides stringdistmatrix()
    
    fn.fuzzymatch <- function(dat1, dat2, string1, string2, meth){
    
      # matrix of string distances between every address in dat1 and every address in dat2
      dist.name <- stringdistmatrix(dat1[[string1]], dat2[[string2]], method = meth)
    
      # smallest distance for each row of dat1
      min.name <- apply(dist.name, 1, min)
    
      match.s1.s2 <- NULL
      for(i in 1:nrow(dist.name))
      {
        # column (i.e. row of dat2) where the minimum distance occurs
        s2.i <- match(min.name[i], dist.name[i,])
        s1.i <- i
        match.s1.s2 <- rbind(data.frame(s1_row = s1.i, s2_row = s2.i,
                                        s1name = dat1[s1.i,][[string1]],
                                        s2name = dat2[s2.i,][[string2]],
                                        dist = min.name[i]),
                             match.s1.s2)
      }
      output <- match.s1.s2[order(match.s1.s2$s1_row),]
      return(output)
    }
    
    
    match_50 <- fn.fuzzymatch(data1,data2,"Address1","Address2","dl")
    

    This is working fine for the data at the country level, but I also have multiple data files at the region level, and each region has multiple areas. The area code for each region is given by the AREACODE variable in data1 and the AREA_CODE variable in data2. I want to update my function so that

    1. string matching is done for each area and the output has that area code
    2. output is returned for each region consolidated for all area codes in that region.

    I was trying to use split to convert the data files into lists and then rbindlist to combine the results, but I have not been able to make it work and keep getting different kinds of errors. I am sure there is a way to do this but I have not been able to get it. I hope I can have some suggestions.
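
    For reference, here is a minimal sketch of the split/rbindlist idea described above, assuming the fn.fuzzymatch function from the CODE section; the wrapper name fn.fuzzymatch_by_area and the area_code output column are only illustrative:
    
    fn.fuzzymatch_by_area <- function(dat1, dat2, string1, string2, meth){
      # only area codes present in both tables can produce matches
      common_areas <- intersect(unique(dat1$AREACODE), unique(dat2$AREA_CODE))
    
      out <- lapply(common_areas, function(ac){
        res <- fn.fuzzymatch(dat1[AREACODE == ac],
                             dat2[AREA_CODE == ac],
                             string1, string2, meth)
        res$area_code <- ac   # carry the area code through to the output
        res
      })
    
      rbindlist(out)          # one consolidated table for the region
    }
    
    # match_by_area <- fn.fuzzymatch_by_area(data1, data2, "Address1", "Address2", "dl")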

    Solution

    While you can probably use an apply function to repeat over separate data files of different regions, here is a fuzzyjoin solution based on my answer to your previous question.

    It looks for the best stringdist match for Address and the AreaCode must match exactly (==). I also specified year2 had to be >= year1, just for demonstration.

    Finally, I used dplyr::group_by and dplyr::top_n to get the minimum distance matches and I had to assume what to do in matching ties (picked matches with largest year2).

    Data:

    Address1 <- c("786, GALI NO 5, XYZ","rambo, 45, strret 4, atlast, pqr","23/4, 23RD FLOOR, STREET 2, ABC-E, PQR","45-B, GALI NO5, XYZ","HECTIC, 99 STREET, PQR")
    AREACODE <- c('10','10','14','20','30')
    Year1 <- c(2001:2005)
    
    Address2 <- c("abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR","abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR")
    Year2 <- c(2001:2010)
    AREA_CODE <- c('10','10','10','20','30','40','50','61','64', '99')
    
    data1 <- data.table(Address1, Year1, AREACODE)
    data2 <- data.table(Address2, Year2, AREA_CODE)
    data2[, unique_id := sprintf("%06d", 1:nrow(data2))]
    

    Solution:

    library(fuzzyjoin, quietly = TRUE); library(dplyr, quietly = TRUE)
    
    # First, need to define match_fun_stringdist 
    # Code from stringdist_join from https://github.com/dgrtwo/fuzzyjoin
    match_fun_stringdist <- function(v1, v2) {
    
      # for some reason, I couldn't pass these parameters in from fuzzy_join, 
      # so I set them here.
      ignore_case = FALSE
      method = "dl"
      max_dist = 99
      distance_col = "dist"
    
      if (ignore_case) {
        v1 <- stringr::str_to_lower(v1)
        v2 <- stringr::str_to_lower(v2)
      }
    
      # shortcut for Levenshtein-like methods: if the difference in
      # string length is greater than the maximum string distance, the
      # edit distance must be at least that large
    
      # length is much faster to compute than string distance
      if (method %in% c("osa", "lv", "dl")) {
        length_diff <- abs(stringr::str_length(v1) - stringr::str_length(v2))
        include <- length_diff <= max_dist
    
        dists <- rep(NA, length(v1))
    
        dists[include] <- stringdist::stringdist(v1[include], v2[include], method = method)
      } else {
        # have to compute them all
        dists <- stringdist::stringdist(v1, v2, method = method)
      }
      ret <- dplyr::data_frame(include = (dists <= max_dist))
      if (!is.null(distance_col)) {
        ret[[distance_col]] <- dists
      }
      ret
    }
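
    # A quick standalone sanity check (a sketch, not from the original answer):
    # calling the match function directly returns a data frame with `include`
    # and `dist` columns for element-wise pairs of strings, e.g.
    # match_fun_stringdist("786, GALI NO 5, XYZ", "45B, GALI NO 5, XYZ")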
    
    # Finally, call fuzzy_join
    fuzzy_join(data1, data2, 
               by = list(x = c("Address1", "AREACODE", "Year1"), y = c("Address2", "AREA_CODE", "Year2")), 
               match_fun = list(match_fun_stringdist, `==`, `<=`),
               mode = "left"
               ) %>%
      group_by(Address1, Year1, AREACODE) %>%
      top_n(1, -Address1.dist) %>%
      top_n(1, Year2) %>%
      select(unique_id, Address1.dist, everything())
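
    For the multiple regional data files mentioned in the question, one possible extension (only a sketch; the names fuzzy_match_region, region_data1, region_data2 and results_by_region are illustrative, not from the original answer) is to wrap the pipeline in a function, repeat it per region, and bind the results:
    
    fuzzy_match_region <- function(d1, d2) {
      fuzzy_join(d1, d2,
                 by = list(x = c("Address1", "AREACODE", "Year1"),
                           y = c("Address2", "AREA_CODE", "Year2")),
                 match_fun = list(match_fun_stringdist, `==`, `<=`),
                 mode = "left") %>%
        group_by(Address1, Year1, AREACODE) %>%
        top_n(1, -Address1.dist) %>%
        top_n(1, Year2) %>%
        select(unique_id, Address1.dist, everything())
    }
    
    # region_data1 / region_data2 would be named lists of data.tables, one element per region:
    # results_by_region <- Map(fuzzy_match_region, region_data1, region_data2)
    # all_regions       <- bind_rows(results_by_region, .id = "region")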
    
