在R中写入数据帧时出错 [英] Error in writing data frame in R
问题描述
我正在尝试从一个经过 OCR 处理的 PDF 文件所提取的文本中搜索某个单词。该 PDF 文件有多页，因此我对每一页都搜索该单词；如果找到了，就把文件名、状态（Present 或 Not Present）、找到该单词的页码以及找到了哪些单词写入一个数据框。但是数据框对所有文件给出的状态都是 "Present"，而我想要的结果是这样的：
file_name   Status          Page               words
test1.pdf   "Present"       test1_2,test1_4    gym,school
test2.pdf   "Not Present"   -                  -
test3.pdf   "Present"       test3_1            gym
这段代码中缺少的是什么。
这里是代码
```r
All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
  file_name <- All_files[i]
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)
  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present" else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}
    for(k in chk_words){
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){
        print("Not Present") } else {print("Present")}
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }
  }
  print(tc)
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name), Status, pages = pages, words = words,tc))
}
```
谢谢
解决方案
这是一个单词的选项
```r
v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
  file_name <- All_files[i]
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)
  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present" else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}
  }
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name), Status, pages = pages, words = words))
}
```
输出
df
#       file_name      Status  pages  words
#1  Amenities.pdf Not Present      -      -
#2       test.pdf     Present test_2 school
I'm trying to search for a word in the text that I extract from a PDF file that has been OCR'd. The PDF has multiple pages, so I search each page for the word; if the word is found, I write the file name, the status (Present or Not Present), the page on which it was found, and the words that were found to a data frame. But the data frame gives the status "Present" for all files. I want output like this:
file_name   Status          Page               words
test1.pdf   "Present"       test1_2,test1_4    gym,school
test2.pdf   "Not Present"   -                  -
test3.pdf   "Present"       test3_1            gym
What am I missing in this code?
Here is the code:
# Scan every PDF in the working directory, page by page, for a set of
# keywords and build one summary row per file:
#   file_name - base name of the PDF
#   Status    - "Present" if any keyword was found on any page, else
#               "Not Present"
#   pages     - comma-separated "<file stem>_<page>" for every page with a
#               hit, or "-" when nothing was found
#   words     - comma-separated keywords that were found, or "-"
#
# pdf_info(), ocr() and stri_detect_fixed() are called unqualified, so the
# packages providing them must already be attached
# (presumably pdftools, tesseract and stringi - confirm against the setup).

All_files <- Sys.glob("*.pdf")
chk_words <- c("Swimming pool", "Gym", "west", "para")

# One result row per file; preallocated so nothing is grown with rbind()
# inside the loop.
rows <- vector("list", length(All_files))

for (i in seq_along(All_files)) {
  file_name <- All_files[i]
  stem <- tools::file_path_sans_ext(basename(file_name))
  cnt <- pdf_info(file_name)$pages

  # Reset the per-file accumulators so hits from a previous file cannot
  # leak into this one.
  hit_pages <- integer(0)
  hit_words <- character(0)

  for (j in seq_len(cnt)) {
    img_file <- pdftools::pdf_convert(
      file_name,
      format = "tiff", pages = j, dpi = 400
    )
    text <- ocr(img_file)
    # Remove exactly the image that was just rendered instead of listing a
    # hard-coded directory and deleting everything whose name contains
    # "tiff".
    file.remove(img_file)

    # Collapse all whitespace (including line breaks) so that a multi-word
    # keyword such as "Swimming pool" still matches when OCR wraps it.
    page_text <- tolower(gsub("\\s+", " ", paste(text, collapse = " ")))

    # stri_detect_fixed() is vectorised over the pattern: one logical per
    # keyword. which() drops NA results.
    found <- chk_words[which(stri_detect_fixed(page_text, tolower(chk_words)))]

    # Do NOT break on the first hit: every page has to be inspected so that
    # all matching pages and all matching keywords are reported.
    if (length(found) > 0) {
      hit_pages <- c(hit_pages, j)
      hit_words <- union(hit_words, found)
    }
  }

  present <- length(hit_pages) > 0
  rows[[i]] <- data.frame(
    file_name = basename(file_name),
    Status = if (present) "Present" else "Not Present",
    pages = if (present) paste0(stem, "_", hit_pages, collapse = ",") else "-",
    words = if (present) paste(hit_words, collapse = ",") else "-",
    stringsAsFactors = FALSE
  )
}

# do.call(rbind, list()) is NULL, so fall back to an empty, correctly-typed
# data frame when no PDF was found.
df <- if (length(rows) > 0) {
  do.call(rbind, rows)
} else {
  data.frame(
    file_name = character(0),
    Status = character(0),
    pages = character(0),
    words = character(0),
    stringsAsFactors = FALSE
  )
}
Any suggestions are appreciated.
Thanks
解决方案
Here is an option for a single word:
# Single-keyword variant: for each PDF in `All_files`, OCR the pages in order
# and record the first page on which `word` occurs.
#
# Expects `All_files` (character vector of PDF paths) to exist already, and
# pdf_info(), ocr() and stri_detect_fixed() to be available on the search
# path (presumably from pdftools, tesseract and stringi - confirm).
#
# Result: `df`, one row per file with columns file_name, Status, pages, words.

word <- "school"

# v1[i] holds the first matching page of file i; 0 means "not found".
v1 <- numeric(length(All_files))

# One result row per file; preallocated instead of growing `df` with
# rbind(cbind(...)) on every iteration.
rows <- vector("list", length(All_files))

for (i in seq_along(All_files)) {
  file_name <- All_files[i]
  cnt <- pdf_info(file_name)$pages
  print(cnt)

  for (j in seq_len(cnt)) {
    img_file <- pdftools::pdf_convert(
      file_name,
      format = "tiff", pages = j, dpi = 400
    )
    text <- ocr(img_file)
    # Delete exactly the image that was just rendered. The previous cleanup
    # listed a directory built from an undefined `path` and passed bare file
    # names to file.remove(), which only resolve against the working
    # directory.
    file.remove(img_file)

    # Normalise whitespace so a keyword containing spaces still matches
    # across OCR line breaks.
    page_text <- tolower(gsub("\\s+", " ", paste(text, collapse = " ")))
    br <- if (isTRUE(stri_detect_fixed(page_text, tolower(word)))) {
      "Present"
    } else {
      "Not Present"
    }
    print(br)

    # Only the first matching page is reported, so stop scanning this file.
    if (br == "Present") {
      v1[i] <- j
      break
    }
  }

  found <- v1[i] != 0
  rows[[i]] <- data.frame(
    file_name = basename(file_name),
    Status = if (found) "Present" else "Not Present",
    pages = if (found) {
      paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
    } else {
      "-"
    },
    words = if (found) word else "-",
    stringsAsFactors = FALSE
  )
}

# do.call(rbind, list()) is NULL, so fall back to an empty data frame with
# the same columns when `All_files` is empty.
df <- if (length(rows) > 0) {
  do.call(rbind, rows)
} else {
  data.frame(
    file_name = character(0),
    Status = character(0),
    pages = character(0),
    words = character(0),
    stringsAsFactors = FALSE
  )
}
Output:
df
#       file_name      Status  pages  words
#1  Amenities.pdf Not Present      -      -
#2       test.pdf     Present test_2 school
这篇关于在R中写入数据帧时出错的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!