如何使用 Beautiful Soup 列出 HTML 文件中所有包含 PA/ 的字符串 [英] How to list all strings that have a PA/ inside of an HTML file using Beautiful Soup

查看：103 发布时间：2018/6/25 18:23:43 python html pdf beautifulsoup converter

本文介绍了如何使用 Beautiful Soup 列出 HTML 文件中所有包含 PA/ 的字符串的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我有一个将 PDF 转换为 HTML 的程序，我需要对该程序进行补充，使它在转换之后搜索标签 PA/ 以及紧跟其后的字符，并将这些标签和字符保存到 CSV 文件中。我正在尝试实现这一点，但没有成功，有人可以帮帮我吗？

以下是目前的代码：

 导入shlex 
导入子流程
导入os 
导入平台
从bs4导入BeautifulSoup 
导入re 
 import csv 
 import pickle 
 def rename_files（）：
 file_list = os.listdir（rC：\\PROJECT\\pdfs）
 print（ file_list）
 saved_path = os.getcwd（）
 print（'当前工作目录是'+ saved_path）
 os.chdir（r'C：\\PROJECT\\pdfs '）
 for file_name in file_list：
 os.rename（file_name，file_name.translate（None，））
 os.chdir（saved_path）
 rename_files（）
 
 def run（command）：
如果platform.system（）！='Windows'：
 args = shlex.split（command）
 else：
 args = command 
s = subprocess.Popen（args，
 stdout = subprocess.PIPE，
 stderr = subprocess.PIPE）
输出，错误= s.communicate（）
返回s.returncode == 0，输出错误
 
＃将其更改为PDF文件的基本目录
 base_directory ='C：\\PROJECT\\pdfs'
如果不是os.path.isdir（base_directory）：
打印％s不是目录％base_directory 
 exit（1）
＃将此更改为您的pdf2htmlEX可执行位置
 bin_path ='C：\\Python27\\\ \\ pdfminer-20140328 \\tools\\pdf2txt.py'
如果不是os.path.isfile（bin_path）：
 print找不到％s％bin_path 
 exit（1）
用于dir_path，dir_name_list，os.walk（base_directory）中的file_name_list：
用于file_name_list中的file_name：
 ＃如果这不是PDF文件
 if file_name.endswith（'。pdf'）：
＃跳过它
 continue 
 file_path = os.path.join（dir_path， file_name）
＃在此将PDF转换为HTML 
 args =（bin_path，file_name，file_path）
成功，输出，错误= run（python％s -o％s.html％s ％args）
如果不成功：
 print无法将％s转换为HTML％file_path 
 print％s％errors 
 htmls_path ='C：\ \PROJECT'
用于dir_path，dir_name_list，os.walk（htmls_path）中的file_name_list：
用于file_name_list中的file_name：
如果不是file_name.endswith（'。html'）：
以open（file_name）作为标记继续
：
 soup = BeautifulSoup（markup.read（））
 text = soup.get_text（）
 match = re.findall（ PA /（\S *）\s *（\S *），text）
 print（match）
 with ope n（'score.csv'，'w'）as f：
 writer = csv.writer（f）
 writer.writerows（'％s'％match）

html太大了，我会在这里写下它的一部分，包括PA和我不想要的文本：

 < html> 
< title>测试< / title> 
< body> 
< div style =position：absolute; border：textbox 1px solid; writing-mode：lr-tb; left：59px; top：34023px; width：84px; height：32px;>< span style =font-family：YFEHEP + Times-Bold; font-size：17px>只是一些文本，我不想在CSV文件中使用
< br>< / span><< ; < / div>< div style =position：absolute; border：textbox 1px solid; writing-mode：lr-tb; left：59px; top：34066px; width：84px; height：16px;>< span style =font-family： YFEHEP + Times-Roman; font-size：16px> PA / 01008/17 GTD 
< br>< / span>< / div>< div style =position：absolute; border： textbox 1px solid; writing-mode：lr-tb; left：59px; top：34105px; width：84px; height：16px;>< span style =font-family：YFEHEP + Times-Roman; font-size ：16px> PA / 01095/17 GTD 
< / body> 
< / html>

解决方案

检查在线演示

  import re 
 from bs4 import BeautifulSoup 
 html_doc =
< html> 
< title>测试< / title> 
< body> 
< div style =位置：绝对; border：textbox 1px solid;写入模式：LR-TB;左：59px;顶部：34023px;宽度：84px; height：32px;>< span style =font-family：YFEHEP + Times-Bold;字体大小：17px>只是一些文本，我不希望在CSV文件中有
< br>< / span>< span style =font-family：YFEHEP + Times-罗马; font-size：16px> PA / 00986/17 GTD 
< br>< / span>< / div>< div style =position：absolute; border：textbox 1px solid;写入模式：LR-TB;左：59px;顶部：34066px;宽度：84px; height：16px;>< span style =font-family：YFEHEP + Times-Roman; font-size：16px> PA / 01008/17 GTD 
< br>< / span>< / div>< div style =position：absolute; border：textbox 1px solid;写入模式：LR-TB;左：59px;顶部：34105px;宽度：84px; height：16px;>< span style =font-family：YFEHEP + Times-Roman; font-size：16px> PA / 01095/17 GTD 
< / body> 
< / html> 

 
汤= BeautifulSoup （html_doc，'html.parser'）
 text = soup.get_text（）
 
 match = re.findall（PA /（\S *）\s *（\ S *），text）
 print（match）

$ b $ pre $ 将csv 打开（'ur file.csv'，'wb'）如下： csv_out = csv.writer（out） csv_out.writerow（['fist_col'，'second_col']）用于匹配行： csv_out.writerow（row）

I have a program that converts PDFs into HTML, and I need to extend it so that, after converting, it searches for the tags that start with PA/ and the characters that follow them, and saves these tags and characters to a CSV file. I'm trying to do it but I can't; could someone help me out, please?



Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
def rename_files(directory=r"C:\PROJECT\pdfs"):
    """Remove every space character from the names of the entries in *directory*.

    Entries whose name contains no space are left untouched. The current
    working directory is changed to *directory* while renaming and is always
    restored afterwards.

    Args:
        directory: Folder whose entries are renamed. Defaults to the
            project's PDF folder.

    Raises:
        OSError: If the folder cannot be listed or an entry cannot be renamed.
    """
    file_list = os.listdir(directory)
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(directory)
    try:
        for file_name in file_list:
            # str.replace behaves the same on Python 2 and 3, whereas the
            # two-argument str.translate(None, " ") exists only on Python 2.
            new_name = file_name.replace(" ", "")
            if new_name != file_name:
                os.rename(file_name, new_name)
    finally:
        # Restore the caller's working directory even if a rename fails.
        os.chdir(saved_path)
# Runs immediately when the script is loaded, before the PDF conversion below.
rename_files()

def run(command):
    """Run an external command and capture what it prints.

    Args:
        command: Either a command-line string or an already-split list/tuple
            of arguments. A string is tokenised with ``shlex.split`` except
            on Windows, where it is handed to ``subprocess.Popen`` unchanged.
            A list/tuple is passed through as-is, which avoids any quoting
            problems with arguments that contain spaces.

    Returns:
        A ``(success, output, errors)`` tuple. ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are whatever the
        process wrote to stdout and stderr.
    """
    if isinstance(command, (list, tuple)):
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, errors = process.communicate()
    return process.returncode == 0, output, errors

# Base directory that is walked recursively for PDF files; change as needed.
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print("%s is not a directory" % base_directory)
    exit(1)
# Location of the pdf2txt.py script; change this to match your installation.
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Only PDF files are converted; everything else is skipped.
        if not file_name.endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # The output name passed to -o is the bare file name plus ".html",
        # i.e. a relative path, while the input is the full path.
        args = (bin_path, file_name, file_path)
        # Each path is double-quoted so that a space inside it does not split
        # the command line into extra arguments.
        success, output, errors = run('python "%s" -o "%s.html" "%s"' % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
# Root folder that is walked recursively for HTML files.
htmls_path = 'C:\\PROJECT'
# Matches accumulated across every HTML file (one tuple per "PA/..." hit).
# Starting from an empty list also keeps the CSV step below from failing
# with a NameError when no HTML file is found.
match = []
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
    for file_name in file_name_list:
        if not file_name.endswith('.html'):
            continue
        # os.walk yields bare names, so join with dir_path to open the file
        # that was actually found rather than a same-named file in the cwd.
        with open(os.path.join(dir_path, file_name)) as markup:
            # Name the parser explicitly instead of relying on bs4's default.
            soup = BeautifulSoup(markup.read(), 'html.parser')
        text = soup.get_text()
        # Raw string so the regex escapes are not treated as string escapes.
        # Group 1 is the rest of the PA/ token, group 2 the token after it.
        found = re.findall(r"PA/(\S*)\s*(\S*)", text)
        print(found)
        match.extend(found)
# Python 2's csv module expects the file to be opened in binary mode.
with open('score.csv', 'wb') as f:
    writer = csv.writer(f)
    # One CSV row per match tuple; formatting the list with '%s' first would
    # make writerows iterate over the characters of its string form.
    writer.writerows(match)
The HTML is too big, so here is a part of it that includes the PA entries and the text that I don't want:
<html>
    <title>Testing</title>
    <body>
        <div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;"><span style="font-family: YFEHEP+Times-Bold; font-size:17px">JUST SOME TEXT THAT I DON'T WANT TO HAVE ON THE CSV FILE
        <br></span><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/00986/17 GTD
        <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01008/17 GTD
        <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01095/17 GTD
    </body>
</html>

 解决方案 
Check Online Demo
    import re
    from bs4 import BeautifulSoup
    # Sample markup: three "PA/..." entries plus one span of unwanted text.
    html_doc = """
    <html>
        <title>Testing</title>
        <body>
            <div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34023px; width:84px; height:32px;"><span style="font-family: YFEHEP+Times-Bold; font-size:17px">JUST SOME TEXT THAT I DON'T WANT TO HAVE ON THE CSV FILE
            <br></span><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/00986/17 GTD
            <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34066px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01008/17 GTD
            <br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:59px; top:34105px; width:84px; height:16px;"><span style="font-family: YFEHEP+Times-Roman; font-size:16px">PA/01095/17 GTD
        </body>
    </html>
    """

    soup = BeautifulSoup(html_doc, 'html.parser')
    # get_text() returns the document's text content without the tags.
    text = soup.get_text()

    # Raw string so the regex escapes are not treated as string escapes.
    # Each match is a 2-tuple: the rest of the PA/ token and the token that
    # follows it, e.g. ('00986/17', 'GTD').
    match = re.findall(r"PA/(\S*)\s*(\S*)", text)
    print(match)
For writing to CSV:
import csv
# Write a header row, then one CSV row per tuple in `match`.
with open('ur file.csv','wb') as csv_file:
    writer = csv.writer(csv_file)
    header = ['fist_col','second_col']
    writer.writerow(header)
    writer.writerows(match)


                        
这篇关于如何使用 Beautiful Soup 列出 HTML 文件中所有包含 PA/ 的字符串的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！


                    
                        查看全文

如何使用 Beautiful Soup 列出 HTML 文件中所有包含 PA/ 的字符串 [英] How to list all strings that have a PA/ inside of an HTML file using Beautiful Soup

问题描述

相关文章

前端开发最新文章

热门教程

热门工具

登录关闭

如何使用 Beautiful Soup 列出 HTML 文件中所有包含 PA/ 的字符串 [英] How to list all strings that have a PA/ inside of an HTML file using Beautiful Soup

问题描述

相关文章

前端开发最新文章

热门教程

热门工具

登录 关闭

登录关闭