在R中写入数据帧时出错 [英] Error in writing data frame in R

查看:134
本文介绍了在R中写入数据帧时出错的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试在从 OCR 格式的 PDF 文件中提取出的文本里搜索某个单词。该 PDF 文件有多页,因此我逐页搜索这个单词;如果找到了,就把文件名、状态(Present 或 Not Present)、找到该单词的页面以及找到了哪些单词写入一个数据框。但是数据框中所有文件的状态都显示为 "Present",而我想要的结果是这样的:

file_name    Status           Page               words
test1.pdf    "Present"        test1_2,test1_4    gym,school
test2.pdf    "Not Present"    -                  -
test3.pdf    "Present"        test3_1            gym

这段代码中我遗漏了什么?



这里是代码



All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"

for (i in seq_along(All_files)){

  file_name <- All_files[i]

  cnt <- pdf_info(All_files[i])$pages
  print(cnt)

  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"
    else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}

    for(k in chk_words){
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }

  }

  print(tc)
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name),
                        Status, pages = pages, words = words,tc))

}


如有任何建议,不胜感激。

谢谢
解决方案

下面是针对单个单词的一种做法

v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"

for (i in seq_along(All_files)){

  file_name <- All_files[i]

    cnt <- pdf_info(All_files[i])$pages
    print(cnt)

    for(j in seq_len(cnt)){
      img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
      text <- ocr(img_file)
      ocr_text <- capture.output(cat(text))
      check <- sapply(ocr_text, paste, collapse="")
      junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
      file.remove(junk)
      br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"
              else "Present"
      print(br)
      if(br=="Present") {
         v1[i] <- j
         break}

    }

    Status <- if(v1[i] == 0) "Not Present" else "Present"
    pages <- if(v1[i] == 0) "-" else
     paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
    words <- if(v1[i] == 0) "-" else word
    df <- rbind(df, cbind(file_name = basename(file_name),
              Status, pages = pages, words = words))

}


输出


df
#     file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school


I'm trying to search for a word in the text that I extract from a PDF file that is in OCR'd format. The PDF file has multiple pages, so I search for the word on each page; if the word is found, I want to write the file name, the status (Present or Not Present), the page on which it was found, and which words were found to a data frame. But the data frame gives the status "Present" for all files. I want output like this:

file_name       Status        Page              words
test1.pdf    "Present"       test1_2,test1_4    gym,school
test2.pdf    "Not Present"     -                 -
test3.pdf    "Present"       test3_1            gym

What am I missing in this code?

here is the code

    # Script from the question: scan every PDF matched by "*.pdf", OCR each
    # page, look for `word` (and for each entry of `chk_words`), and append one
    # result per file to `df`.
    All_files=Sys.glob("*.pdf")
# v1[i] holds the page number on which `word` was found in file i; it stays 0
# when no page matched (see the `v1[i] == 0` checks below).
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
# tc, ps and x are created once here and are never reset inside the file loop,
# so whatever they collect carries over from one file to the next.
tc=c()
ps=c()
x=list()
df <- data.frame()
# This initial value is overwritten for every file further down.
Status="Present"

for (i in seq_along(All_files)){


  file_name <- All_files[i]

  # Number of pages in the current PDF.
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)

  for(j in seq_len(cnt)){
    # Render page j to a TIFF image and run OCR on it.
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    # capture.output(cat(text)) yields the printed text as a character vector,
    # one element per output line.
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    # dir() is called without full.names = TRUE, so `junk` holds bare file
    # names; file.remove() then resolves them against the working directory.
    # NOTE(review): this only deletes the images if the working directory is
    # that same folder - confirm.
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    # Case-insensitive fixed-string search for `word` in the page's lines.
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"  
    else "Present" 
    print(br)       
    # On the first page containing `word`, record the page and leave the page
    # loop: later pages are not scanned, and the chk_words loop below does not
    # run for this page.
    if(br=="Present") {
      v1[i] <- j
      break}

    # Reached only for pages where `word` was NOT found.
    for(k in chk_words){ 
      # print() returns its argument, so br receives the printed string.
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
      # Only `ps=k` is conditional; the two assignments after it run on every
      # iteration, so `ps` may still hold a word matched earlier.
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }




  }

  print(tc)
  # Status, pages and words depend only on v1[i], i.e. on the single `word`.
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else 
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  # NOTE(review): `tc` may be NULL or contain several words, so the shape of
  # what cbind() builds here can differ between files - verify that this is
  # the intended row layout.
  df <- rbind(df, cbind(file_name = basename(file_name),
                        Status, pages = pages, words = words,tc))


}

Any suggestions are appreciated.

Thanks

解决方案

Here is an option for a single word

# Search every PDF in `All_files` for a single word, using OCR page by page.
#
# Inputs (expected to exist already): `All_files`, a character vector of PDF
# paths, and the functions pdf_info(), ocr() and stri_detect_fixed().
# Result: `df`, one row per file with columns file_name, Status, pages, words.
# `v1[i]` is the first page of file i that contains `word`, or 0 if none does.
v1 <- numeric(length(All_files))
word <- "school"

# Collect one row per file and bind once at the end instead of growing `df`
# with rbind() on every iteration.
rows <- vector("list", length(All_files))

for (i in seq_along(All_files)) {
  file_name <- All_files[i]

  cnt <- pdf_info(file_name)$pages
  print(cnt)

  for (j in seq_len(cnt)) {
    img_file <- pdftools::pdf_convert(
      file_name,
      format = "tiff", pages = j, dpi = 400
    )
    text <- ocr(img_file)
    # Remove exactly the image(s) just rendered. The earlier version listed a
    # directory through an undefined `path` and passed bare file names to
    # file.remove(), which only works from inside that directory.
    unlink(img_file)

    # Case-insensitive fixed-string match on the OCR text of this page.
    found <- any(
      stri_detect_fixed(tolower(text), tolower(word)),
      na.rm = TRUE
    )
    br <- if (found) "Present" else "Not Present"
    print(br)

    if (found) {
      # Keep the first matching page and stop scanning this file.
      v1[i] <- j
      break
    }
  }

  Status <- if (v1[i] == 0) "Not Present" else "Present"
  pages <- if (v1[i] == 0) {
    "-"
  } else {
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  }
  words <- if (v1[i] == 0) "-" else word

  rows[[i]] <- data.frame(
    file_name = basename(file_name),
    Status = Status,
    pages = pages,
    words = words,
    stringsAsFactors = FALSE
  )
}

# With no input files there is nothing to bind; keep `df` an empty data frame.
df <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()

-output

df
#     file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school

这篇关于在R中写入数据帧时出错的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆