lxml不使用django，scraperwiki [英] lxml not working with django, scraperwiki

查看：160 发布时间：2017/5/31 22:25:11 django lxml scraperwiki

本文介绍了lxml不使用django，scraperwiki的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在通过伊利诺伊州大会网站上的一个django应用程序来删除一些pdf。在部署在我的桌面上，直到urllib2超时才能正常工作。当我尝试部署在我的Bluehost服务器上时，lxml部分的代码会抛出一个错误。任何帮助将不胜感激。

I'm working on a django app that goes through Illinois' General Assembly website to scrape some pdfs. While deployed on my desktop it works fine until urllib2 times out. When I try to deploy on my Bluehost server, the lxml part of the code throws up an error. Any help would be appreciated.

import scraperwiki
from bs4 import BeautifulSoup
import urllib2
import lxml.etree
import re
from django.core.management.base import BaseCommand
from legi.models import Votes

class Command(BaseCommand):
    def handle(self, *args, **options):
        chmbrs =['http://www.ilga.gov/house/', 'http://www.ilga.gov/senate/']
        for chmbr in chmbrs:
            site = chmbr    
            url = urllib2.urlopen(site)
            content = url.read()
            soup = BeautifulSoup(content)
            links = []
            linkStats = []
            x=0
            y=0
            table = soup.find('table', cellpadding=3)
            for a in soup.findAll('a',href=True):
                if re.findall('Bills', a['href']):
                    l = (site + a['href']+'&Primary=True')
                    links.append(str(l))
                    x+=1
                    print x
            for link in links:
                url = urllib2.urlopen(link)
                content = url.read()
                soup = BeautifulSoup(content)
                table = soup.find('table', cellpadding=3)
                for a in table.findAll('a',href=True):
                    if re.findall('BillStatus', a['href']):
                        linkStats.append(str('http://ilga.gov'+a['href']))
            for linkStat in linkStats:
                url = urllib2.urlopen(linkStat)
                content = url.read()
                soup = BeautifulSoup(content)
                for a in soup.findAll('a',href=True):
                    if re.findall('votehistory', a['href']):
                        vl = 'http://ilga.gov/legislation/'+a['href']
                        url = urllib2.urlopen(vl)
                        content = url.read()
                        soup = BeautifulSoup(content)
                        for b in soup.findAll('a',href=True):
                            if re.findall('votehistory', b['href']):
                                llink = 'http://ilga.gov'+b['href']
                                try:
                                    u = urllib2.urlopen(llink)
                                    x = scraperwiki.pdftoxml(u.read())
                                    root = lxml.etree.fromstring(x)
                                    pages = list(root)
                                    chamber = str()
                                    for page in pages:
                                        print "working_1"
                                        for el in page:
                                            print "working_2"
                                            if el.tag == 'text':
                                                if int(el.attrib['top']) == 168:
                                                    chamber = el.text
                                                if re.findall("Senate Vote", chamber):
                                                    if int(el.attrib['top']) >= 203 and int(el.attrib['top']) < 231:
                                                        title = el.text
                                                        if (re.findall('House', title)):
                                                            title = (re.findall('[0-9]+', title))
                                                            title = "HB"+title[0]
                                                        elif (re.findall('Senate', title)):
                                                            title = (re.findall('[0-9]+', title))
                                                            title = "SB"+title[0]
                                                    if int(el.attrib['top']) >350 and int(el.attrib['top']) <650:
                                                        r = el.text
                                                        names = re.findall(r'[A-z-\u00F1]{3,}',r)
                                                        vs = re.findall(r'[A-Z]{1,2}\s',r)
                                                        for name in names:
                                                            legi = name
                                                            for vote in vs:
                                                                v = vote
                                                            if Votes.objects.filter(legislation=title).exists() == False:
                                                                c = Votes(legislation=title, legislator=legi, vote=v)
                                                                c.save()    
                                                                print 'saved'
                                                            else:
                                                                print 'not saved'                                                       
                                                elif int(el.attrib['top']) == 189:
                                                    chamber = el.text
                                                if re.findall("HOUSE ROLL CALL", chamber):
                                                    if int(el.attrib['top']) > 200 and int(el.attrib['top']) <215:
                                                        title = el.text
                                                        if (re.findall('HOUSE', title)):
                                                            title = (re.findall('[0-9]+', title))
                                                            title = "HB"+title[0]
                                                        elif (re.findall('SENATE', title)):
                                                            title = (re.findall('[0-9]+', title))
                                                            title = "SB"+title[0]
                                                    if int(el.attrib['top']) >385 and int(el.attrib['top']) <1000:
                                                        r = el.text
                                                        names = re.findall(r'[A-z-\u00F1]{3,}',r)
                                                        votes = re.findall(r'[A-Z]{1,2}\s',r)
                                                        for name in names:
                                                            legi = name
                                                            for vote in votes:
                                                                v = vote
                                                            if Votes.objects.filter(legislation=title).exists() == False:
                                                                c = Votes(legislation=title, legislator=legi, vote=v)
                                                                c.save()
                                                                print 'saved'
                                                            else:
                                                                print 'not saved'

                                except:
                                    pass

编辑1
以下是错误跟踪

EDIT 1 Here's the error trace

    Traceback (most recent call last):
  File "manage.py", line 10, in <module>
    execute_from_command_line(sys.argv)
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 399, in execute_from_command_line
    utility.execute()
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 392, in execute
    self.fetch_command(subcommand).run_from_argv(self.argv)
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 242, in run_from_argv
    self.execute(*args, **options.__dict__)
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 285, in execute
    output = self.handle(*args, **options)
  File "/home7/maythirt/GAB/legi/management/commands/vote.py", line 51, in handle
    root = lxml.etree.fromstring(x)
  File "lxml.etree.pyx", line 3032, in lxml.etree.fromstring (src/lxml/lxml.etree.c:68121)
  File "parser.pxi", line 1786, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:102470)
  File "parser.pxi", line 1674, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:101299)
  File "parser.pxi", line 1074, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:96481)
  File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:91290)
  File "parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:92476)
  File "parser.pxi", line 633, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:91939)
lxml.etree.XMLSyntaxError: None

lxml不使用django，scraperwiki [英] lxml not working with django, scraperwiki

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

lxml不使用django，scraperwiki [英] lxml not working with django, scraperwiki

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭