scrapy unhandled exception

This article describes how to deal with an unhandled exception in Scrapy; it is offered as a reference for anyone running into the same problem.

Problem description



I am using Scrapy 0.16.2 on Linux. I'm running:

scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider

I'm getting this error, which blocks Scrapy (it hangs and doesn't finish automatically; only ^C stops it):

2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False) # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'

BTW this worked in version 0.14

Here is the code:

class MySpider(CrawlSpider):
    name = 'alrroya'

    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
    NEW_IGNORED_EXTENSIONS.remove('pdf')

    download_delay = 0.05
    # Stay within these domains when crawling
    allowed_domains = []

    all_domains = {}

    start_urls = []

    # Add our callback which will be called for every found link
    rules = [
        Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
    ]

    # How many pages crawled
    crawl_count = 0

    # How many PDFs we have found
    pdf_count = 0

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._spider_opened, signals.spider_opened) 
        self.load_allowed_domains_and_start_urls()

    def allowed_to_start(self):
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = self.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')

            except Exception as ex:
                reason_date = None

            if reason_date and 'shutdown' in reason:
                reason = True

            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    reason = True

                else:
                    reason = False
        else:
            reason = True

        return reason

    def _spider_opened(self, spider):
        if spider is not self:
            return

        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')

            except Exception as ex:
                reason_date = None

            if reason_date and 'shutdown' in reason:
                f = open(checkfile, 'w')
                f.write('started\n')
                f.write(str(date.today()))
                f.close()

            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    f = open(checkfile, 'w')
                    f.write('started\n')
                    f.write(str(date.today()))
                    f.close()

                else:
                    crawler.engine.close_spider(self, 'finished')
                    if jobdir and os.path.exists(jobdir):
                        shutil.rmtree(jobdir)
                        f = open(checkfile, 'w')
                        f.write('finished\n')
                        f.write(str(date.today()))
                        f.close()
                    os._exit(1)
        else:
            f = open(checkfile, 'w')
            f.write('started\n')
            f.write(str(date.today()))
            f.close()

    def _spider_closed(self, spider, reason):
        if spider is not self:
            return

        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name) 
        if 'shutdown' in reason:
            f = open(checkfile, 'w')
            f.write('shutdown\n')
            f.write(str(date.today()))
            f.close()
        else:
            if jobdir and os.path.exists(jobdir):
                shutil.rmtree(jobdir)
                f = open(checkfile, 'w')
                f.write('finished\n')
                f.write(str(date.today()))
                f.close()

    def _requests_to_follow(self, response):
        if getattr(response, 'encoding', None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []

    def make_requests_from_url(self, url):
        http_client = httplib2.Http()
        try:
            headers = {
                'content-type': 'text/html',
                'user-agent': random.choice(USER_AGENT_LIST)
            }
            response, content = http_client.request(url, method='HEAD', headers=headers)
            #~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
            if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower(): 
                if self.allowed_to_start():
                    self.get_pdf_link(url)

            else:
                return CrawlSpider.make_requests_from_url(self, url)

        except Exception as ex:
            return CrawlSpider.make_requests_from_url(self, url)

    def get_pdf_link(self, url):
        source = self.__class__.name
        parsed_url = urlparse(url)
        url_domain = parsed_url.netloc
        url_path = parsed_url.path
        if url_domain:
            for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
                if url_domain.endswith(domain):
                    pre_and = False
                    pre_or = False
                    and_cond = True
                    or_cond = False
                    for path in paths:
                        if path[0:1] == '!':
                            pre_and = True
                            if path[1:] not in url_path:
                                and_cond = and_cond and True
                            else:
                                and_cond = and_cond and False

                        else:
                            pre_or = True
                            if path in url_path:
                                or_cond = or_cond or True
                            else:
                                or_cond = or_cond or False

                    if pre_and and pre_or:
                        if and_cond and or_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_and:
                        if and_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_or:
                        if or_cond:
                            self.pdf_process(source, url)
                            return
                    else:
                        self.pdf_process(source, url)
                        return

    def parse_crawled_page(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        if crawl_count % 100 == 0:
            print 'Crawled %d pages' % crawl_count

        if 'pdf' in response.headers.get('content-type', '').lower():
            self.get_pdf_link(response.url)

        return Item()

    def load_allowed_domains_and_start_urls(self):
        day = timedelta(days=1)
        currdate = date.today()

        alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)

        self.__class__.all_domains = {
            'alrroya': {
                'start_urls': alrroya,
                'allow_domains': {
                    'epaper.alrroya.com': frozenset(()),
                }
            }
        }

        for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
            self.__class__.allowed_domains.append(domain)

        self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])

    def pdf_process(self, source, url):
        print '!!! ' + source + ' ' + url

Solution

This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.

In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py

Change:

def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)

To:

def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests
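
If patching the installed Scrapy source is undesirable, the same flattening can be done by overriding start_requests() directly in the spider, since it is just a regular method on the spider class. The following is only a sketch, assuming the 0.16-era import path scrapy.contrib.spiders and that the rest of MySpider stays exactly as posted in the question; the None check is there because the PDF branch of make_requests_from_url() above returns nothing.

from scrapy.contrib.spiders import CrawlSpider

class MySpider(CrawlSpider):
    # ... keep the attributes and methods from the spider posted above ...

    def start_requests(self):
        for url in self.start_urls:
            # make_requests_from_url() may return a single Request, a list
            # of Requests, or None (the PDF branch above returns nothing),
            # so normalize all three cases before the engine sees them.
            requests = self.make_requests_from_url(url)
            if requests is None:
                continue
            if isinstance(requests, list):
                for request in requests:
                    yield request
            else:
                yield requests

The effect should be the same as the patched start_requests() above, without touching anything in site-packages.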

I expect that the official Scrapy people will fix this eventually.

This concludes the article on the Scrapy unhandled exception; hopefully the answer above helps.
