Python PDFMIner - PDF到CSV [英] Python PDFMIner - PDF to CSV

查看:571
本文介绍了Python PDFMIner - PDF到CSV的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我希望能够将PDF转换为CSV文件,并且找到了几个有用的脚本,但作为Python新手,我有一个问题:



应该在哪里指定PDF的文件路径,以及要输出到的CSV的文件路径?



我使用的是Python 2.7.11和PDFMiner 20140328。



import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

def pdfparser(data):

    fp = file(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)


    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    data =  retstr.getvalue()

    print data

if __name__ == '__main__':
    pdfparser(sys.argv[1])


解决方案

以下是根据 tgray 所写的这个 SO 回答修改的一些代码:

def pdf_to_csv(filename, separator, threshold):
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- 已修改
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec)  # <-- 已修改
            for y in sorted(lines.keys()):
                line = lines[y]
                self.line_creator(line)
                self.outfp.write(self.line_creator(line))
                self.outfp.write("\n")

        def line_creator(self, line):
            keys = sorted(line.keys())
            # 计算此行上每个字符之间的平均距离
            average_distance = sum([keys[i] - keys[i - 1] for i in range(1, len(keys))]) / len(keys)
            # 将第一个字符追加到结果
            result = [line[keys[0]]]
            for i in range(1, len(keys)):
                # 如果此字符与上一个字符之间的距离大于平均值*阈值
                if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
                    # 将分隔符附加到该位置
                    result.append(self.separator)
                # 追加字符
                result.append(line[keys[i]])
            printable_line = ''.join(result)
            return printable_line

    # ... 以下部分代码改编自
    # pdfminer/tools/pdf2text 模块中的 convert() 函数
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # 因为我的测试文档是utf-8(注意:utf-8是默认的编解码器)

    fp = open(filename, 'rb')

    interpreter = PDFPageInterpreter(rsrc, device)
    for i, page in enumerate(PDFPage.get_pages(fp)):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            print 'none'
            interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()


if __name__ == '__main__':
    # 用于CSV的分隔符
    separator = ';'
    # 距离倍数:超过该倍数时,字符被视为属于新的单词/列/块。通常 1.5 效果不错
    threshold = 1.5
    print pdf_to_csv('myLovelyFile.pdf', separator, threshold)

链接中的答案和这个答案之间的主要区别是line_creator方法,它试图从PDF中提取一些结构。



应该适用于 PDFMiner 20140328。


I want to be able to convert PDFs to CSV files and have found several useful scripts but, being new to Python, I have a question:

Where do you specify the filepath of the PDF and the CSV you want to print to?

I'm using Python 2.7.11 and PDFMiner 20140328.

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Args:
        data: Filesystem path of the PDF to read; the file is opened in
            binary mode.

    Returns:
        None. The extracted text is printed, not returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    # The converter writes everything it extracts into the in-memory
    # ``retstr`` buffer.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # ``open`` in a ``with`` block replaces the Python-2-only ``file()``
    # builtin and closes the handle even when a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()

    # Use a separate name instead of rebinding the ``data`` parameter.
    text = retstr.getvalue()

    # Parenthesised single-argument form behaves the same on Python 2 and 3.
    print(text)

if __name__ == '__main__':
    # The PDF path is the first command-line argument. pdfparser prints the
    # extracted text to stdout, so redirect stdout to save it to a file.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <path-to-pdf>' % sys.argv[0])
    pdfparser(sys.argv[1])

解决方案

Here is some modified code from this SO answer written by tgray:

def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF into separator-delimited lines.

    Characters are grouped into rows by their truncated vertical position.
    Within a row, a horizontal gap wider than ``threshold`` times the row's
    average character spacing is treated as a column boundary, and
    ``separator`` is inserted there.

    Args:
        filename: Path of the PDF file to read (opened in binary mode).
        separator: String inserted between detected columns.
        threshold: Multiplier applied to a row's average character spacing;
            gaps wider than the product start a new column.

    Returns:
        A string containing, for every page, a ``START PAGE n`` line, one
        line per text row, and an ``END PAGE n`` line (``n`` starts at 0).
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    # LTChar is imported from pdfminer.layout rather than relying on its
    # re-export from pdfminer.converter.
    from pdfminer.layout import LAParams, LTChar
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as separator-delimited rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            # Captured from the enclosing pdf_to_csv() call.
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the rows of the current page to ``self.outfp``.

            NOTE(review): presumably invoked by pdfminer once per finished
            page; the base-class implementation is not called here.
            """
            # Row key is the negated, truncated y value, so rows with a
            # larger y are written first.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the last two bbox components are used.
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)
            for y in sorted(lines):
                # Build each row once (the original built it twice and
                # discarded the first result).
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join one row's characters, adding separators at wide gaps.

            Args:
                line: Mapping of x position to encoded character text.

            Returns:
                The row as a single string; empty for an empty mapping.
            """
            keys = sorted(line)
            if not keys:
                return ''
            # Average distance between neighbouring characters on this row.
            # The divisor is the character count, not the gap count; it is
            # kept as-is because ``threshold`` is tuned against it.
            average_distance = sum(
                [keys[i] - keys[i - 1] for i in range(1, len(keys))]
            ) / len(keys)
            # Start with the first character, then walk neighbouring pairs.
            result = [line[keys[0]]]
            for previous, current in zip(keys, keys[1:]):
                # A gap wider than average * threshold starts a new column.
                if (current - previous) > average_distance * self.threshold:
                    result.append(self.separator)
                result.append(line[current])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # The author's test documents are utf-8; per the author, utf-8 is also
    # the default codec.
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    # The with/finally pair releases the file and the device even if a page
    # fails to parse.
    with open(filename, 'rb') as fp:
        try:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    # The stray debug print that was here wrote 'none' to
                    # stdout for every valid page.
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()


if __name__ == '__main__':
    import sys

    # the separator to use with the CSV
    separator = ';'
    # the distance multiplier after which a character is considered part of
    # a new word/column/block. Usually 1.5 works quite well
    threshold = 1.5
    # Input PDF: first command-line argument when given, otherwise the
    # sample file name used by the original script.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'myLovelyFile.pdf'
    # The CSV text goes to stdout; redirect it to a file to save it.
    # Parenthesised single-argument form works on Python 2 and 3.
    print(pdf_to_csv(pdf_path, separator, threshold))

The main difference between the answer in the link and this one is the line_creator method, which tries to extract some structure out of the PDF.

Should work with PDFminer 20140328.

这篇关于Python PDFMIner - PDF到CSV的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆