Scrapy 未处理的异常 [英] scrapy unhandled exception
问题描述
我在 Linux 上使用 Scrapy 0.16.2 版本,运行的命令是:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
我遇到了下面这个错误,它会让 scrapy 卡住(进程挂起且不会自动结束,只能用 ^C 停止)
2012-11-20 15:04:51+0000 [-] Unhandled Error Traceback (most recent call last): File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
self.crawler.start() File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
reactor.run(installSignalHandlers=False) # blocking call File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
self.mainLoop() File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
self.runUntilCurrent() --- <exception caught here> --- File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw) File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in
_next_request
self.crawl(request, spider) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
self.schedule(request, spider) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
return self.slots[spider].scheduler.enqueue_request(request) File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
if not request.dont_filter and self.df.request_seen(request): exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
顺便说一下,同样的代码在 0.14 版本中可以正常工作
这是代码:
class MySpider(CrawlSpider):
name ='alrroya'
NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
NEW_IGNORED_EXTENSIONS.remove('pdf')
download_delay = 0.05
#抓取时在这些域中保留
allowed_domains = []
all_domains = {}
start_urls = []
#添加我们的回调,将为每个找到的链接调用
rules = [
Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
]
#抓取了几个页面
crawl_count = 0
#我们发现了多少PDF
pdf_count = 0
def __init__(self, *args, **kwargs):
CrawlSpider.__init__(self, *args, **kwargs)
dispatcher.connect(self._spider_closed, signals.spider_closed)
dispatcher.connect(self._spider_opened, signals.spider_opened)
self.load_allowed_domains_and_start_urls()
def allowed_to_start(self):
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = self.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
reason = True
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
reason = True
else:
reason = False
else:
reason = True
return reason
def _spider_opened(self, spider):
if spider is not self:
return
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
crawler.engine.close_spider(self, 'finished')
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
os._exit(1)
else:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
def _spider_closed(self, spider, reason):
if spider is not self:
return
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
if 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('shutdown\n')
f.write(str(date.today()))
f.close()
else:
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
def _requests_to_follow(self,response):
if getattr(response,'encoding',None)!= None:
return CrawlSpider._requests_to_follow(self,response)
else:
return []
def make_requests_from_url(self, url):
http_client = httplib2.Http()
try:
headers = {
'content-type': 'text/html',
'user-agent': random.choice(USER_AGENT_LIST)
}
response, content = http_client.request(url, method='HEAD', headers=headers)
#~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
if self.allowed_to_start():
self.get_pdf_link(url)
else:
return CrawlSpider.make_requests_from_url(self, url)
except Exception as ex:
return CrawlSpider.make_requests_from_url(self, url)
def get_pdf_link(self, url):
source = self.__class__.name
parsed_url = urlparse(url)
url_domain = parsed_url.netloc
url_path = parsed_url.path
if url_domain:
for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
if url_domain.endswith(domain):
pre_and = False
pre_or = False
and_cond = True
or_cond = False
for path in paths:
if path[0:1] == '!':
pre_and = True
if path[1:] not in url_path:
and_cond = and_cond and True
else:
and_cond = and_cond and False
else:
pre_or = True
if path in url_path:
or_cond = or_cond or True
else:
or_cond = or_cond or False
if pre_and and pre_or:
if and_cond and or_cond:
self.pdf_process(source, url)
return
elif pre_and:
if and_cond:
self.pdf_process(source, url)
return
elif pre_or:
if or_cond:
self.pdf_process(source, url)
return
else:
self.pdf_process(source, url)
return
def parse_crawled_page(self, response):
self.__class__.crawl_count += 1
crawl_count = self.__class__.crawl_count
if crawl_count % 100 == 0:
print 'Crawled %d pages' % crawl_count
if 'pdf' in response.headers.get('content-type', '').lower():
self.get_pdf_link(response.url)
return Item()
def load_allowed_domains_and_start_urls(self):
day = timedelta(days=1)
currdate = date.today()
alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
self.__class__.all_domains = {
'alrroya': {
'start_urls': alrroya,
'allow_domains': {
'epaper.alrroya.com': frozenset(()),
}
}
}
for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
self.__class__.allowed_domains.append(domain)
self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])
def pdf_process(self, source, url):
print '!!! ' + source + ' ' + url
这似乎是 Scrapy 中的一个 bug。当前版本似乎不接受从 make_requests_from_url() 返回的列表。我通过以下方式修改 Scrapy 的代码,绕过了这个问题。
在文件 Scrapy-0.16.5-py2.7.egg/scrapy/spider.py 中
将:
def start_requests(self):
for url in self.start_urls:
yield self.make_requests_from_url(url)
改为:
def start_requests(self):
for url in self.start_urls:
requests = self.make_requests_from_url(url)
if type(requests) is list:
for request in requests:
yield request
else:
yield requests
我预计 Scrapy 官方最终会修复这个问题。
I am using Scrapy version 0.16.2 on Linux. I'm running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting this error which blocks scrapy (hangs and doesn't finish automatically, only ^C stops it)
2012-11-20 15:04:51+0000 [-] Unhandled Error Traceback (most recent call last): File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
self.crawler.start() File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
reactor.run(installSignalHandlers=False) # blocking call File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
self.mainLoop() File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
self.runUntilCurrent() --- <exception caught here> --- File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw) File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in
_next_request
self.crawl(request, spider) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
self.schedule(request, spider) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
return self.slots[spider].scheduler.enqueue_request(request) File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
if not request.dont_filter and self.df.request_seen(request): exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
BTW this worked in version 0.14
Here is the code:
class MySpider(CrawlSpider):
    """Crawl the configured site and report every PDF link that is found.

    A "check file" named ``<spider name>.crawlercheck`` is kept in the parent
    directory of the ``JOBDIR`` setting (or in the current directory when
    ``JOBDIR`` is empty). It holds two lines: a state (``started``,
    ``shutdown`` or ``finished``) followed by the date on which that state
    was written. The spider uses it to decide whether a new run may start.

    NOTE(review): this class was reconstructed from a listing that had lost
    all of its indentation. The nesting of the ``if``/``else`` blocks was
    inferred from the syntax and should be confirmed against the real file.
    """

    name = 'alrroya'

    # Same list as the imported IGNORED_EXTENSIONS, except that links ending
    # in 'pdf' are no longer ignored by the link extractor.
    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
    NEW_IGNORED_EXTENSIONS.remove('pdf')

    download_delay = 0.05

    # Stay within these domains when crawling
    allowed_domains = []
    all_domains = {}
    start_urls = []

    # Add our callback which will be called for every found link
    rules = [
        Rule(
            SgmlLinkExtractor(
                deny_extensions=NEW_IGNORED_EXTENSIONS,
                tags=('a', 'area', 'frame', 'iframe'),
                attrs=('href', 'src'),
            ),
            follow=True,
            callback='parse_crawled_page',
        ),
    ]

    # How many pages crawled
    crawl_count = 0
    # How many PDFs we have found
    pdf_count = 0

    def __init__(self, *args, **kwargs):
        """Set up the spider, hook the open/close signals and load the URLs."""
        CrawlSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        self.load_allowed_domains_and_start_urls()

    def _checkfile_path(self, jobdir):
        """Return the check-file path that belongs to the given ``jobdir``."""
        mnt = os.path.dirname(os.path.normpath(jobdir)) if jobdir else ''
        return os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)

    def _read_checkfile(self, checkfile):
        """Return ``(state, stamp)`` read from ``checkfile``.

        ``stamp`` is ``None`` when the second line is missing or is not a
        ``%Y-%m-%d`` date.
        """
        with open(checkfile, 'r') as handle:
            lines = handle.read().split('\n')
        try:
            stamp = datetime.strptime(lines[1], '%Y-%m-%d')
        except (IndexError, ValueError):
            stamp = None
        return lines[0], stamp

    def _write_checkfile(self, checkfile, state):
        """Overwrite ``checkfile`` with ``state`` and today's date."""
        with open(checkfile, 'w') as handle:
            handle.write('%s\n' % state)
            handle.write(str(date.today()))

    def _may_start(self, checkfile):
        """Return True when the recorded state permits a new crawl.

        A crawl may start when there is no check file, when the last run was
        shut down, or when the last run finished at least one day ago. A
        check file without a valid date never permits a start.
        """
        if not os.path.exists(checkfile):
            return True
        state, stamp = self._read_checkfile(checkfile)
        if not stamp:
            return False
        if 'shutdown' in state:
            return True
        now = datetime.today()
        today = datetime(now.year, now.month, now.day)
        return stamp + timedelta(days=1) <= today and 'finished' in state

    def allowed_to_start(self):
        """Return True when the check file permits a crawl to start now."""
        return self._may_start(self._checkfile_path(self.settings['JOBDIR']))

    def _spider_opened(self, spider):
        """Signal handler: mark the run as started, or abort the process.

        When a start is not permitted the spider is closed, the job directory
        is removed, ``finished`` is recorded and the process exits with
        status 1 through ``os._exit``.
        """
        if spider is not self:
            return
        jobdir = spider.settings['JOBDIR']
        checkfile = self._checkfile_path(jobdir)
        if self._may_start(checkfile):
            self._write_checkfile(checkfile, 'started')
            return
        crawler.engine.close_spider(self, 'finished')
        if jobdir and os.path.exists(jobdir):
            shutil.rmtree(jobdir)
        self._write_checkfile(checkfile, 'finished')
        os._exit(1)

    def _spider_closed(self, spider, reason):
        """Signal handler: record how the run ended.

        A ``reason`` containing ``shutdown`` keeps the job directory; any
        other reason removes it and records ``finished``.
        """
        if spider is not self:
            return
        jobdir = spider.settings['JOBDIR']
        checkfile = self._checkfile_path(jobdir)
        if 'shutdown' in reason:
            self._write_checkfile(checkfile, 'shutdown')
            return
        if jobdir and os.path.exists(jobdir):
            shutil.rmtree(jobdir)
        self._write_checkfile(checkfile, 'finished')

    def _requests_to_follow(self, response):
        """Follow links only for responses that expose an ``encoding``."""
        if getattr(response, 'encoding', None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        return []

    def start_requests(self):
        """Yield one request per start URL, skipping URLs that produce none.

        ``make_requests_from_url()`` returns ``None`` for start URLs that are
        handled directly as PDFs. Yielding that ``None`` makes the scheduler
        fail with "'NoneType' object has no attribute 'dont_filter'", so it
        is filtered out here.
        """
        for url in self.start_urls:
            request = self.make_requests_from_url(url)
            if request is not None:
                yield request

    def make_requests_from_url(self, url):
        """Return a request for ``url``, or ``None`` when it is a document.

        A HEAD request is sent first. When the content type contains ``pdf``
        or ``octet-stream`` the URL is passed to ``get_pdf_link()`` (if a
        start is allowed) and ``None`` is returned. Any failure while probing
        falls back to the ordinary request.
        """
        http_client = httplib2.Http()
        try:
            headers = {
                'content-type': 'text/html',
                'user-agent': random.choice(USER_AGENT_LIST),
            }
            # NOTE(review): this is a synchronous HTTP call - confirm that
            # blocking here is acceptable for the crawl.
            response, _content = http_client.request(url, method='HEAD', headers=headers)
            content_type = response['content-type'].lower()
            if 'pdf' in content_type or 'octet-stream' in content_type:
                if self.allowed_to_start():
                    self.get_pdf_link(url)
                return None
            return CrawlSpider.make_requests_from_url(self, url)
        except Exception:
            # Deliberate best effort: a failed probe must not drop the URL.
            return CrawlSpider.make_requests_from_url(self, url)

    def get_pdf_link(self, url):
        """Pass ``url`` to ``pdf_process()`` when its domain and path qualify.

        Each allowed domain maps to a collection of path fragments. A
        fragment starting with ``!`` must be absent from the URL path; of the
        remaining fragments at least one must be present. A domain without
        fragments accepts every path.
        """
        source = self.__class__.name
        parsed_url = urlparse(url)
        url_domain = parsed_url.netloc
        url_path = parsed_url.path
        if not url_domain:
            return
        allow_domains = self.__class__.all_domains[source]['allow_domains']
        for domain, paths in allow_domains.items():
            if not url_domain.endswith(domain):
                continue
            excluded = [path[1:] for path in paths if path[0:1] == '!']
            included = [path for path in paths if path[0:1] != '!']
            none_excluded = all(path not in url_path for path in excluded)
            some_included = not included or any(path in url_path for path in included)
            if none_excluded and some_included:
                self.pdf_process(source, url)
                return

    def parse_crawled_page(self, response):
        """Count the crawled page and report it when it is a PDF."""
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        if crawl_count % 100 == 0:
            print('Crawled %d pages' % crawl_count)
        if 'pdf' in response.headers.get('content-type', '').lower():
            self.get_pdf_link(response.url)
        return Item()

    def load_allowed_domains_and_start_urls(self):
        """Fill ``all_domains``, ``allowed_domains`` and ``start_urls``.

        Entries are added only when missing, so creating the spider more than
        once does not duplicate domains or start URLs on the class.
        """
        currdate = date.today()
        alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
        cls = self.__class__
        cls.all_domains = {
            'alrroya': {
                'start_urls': alrroya,
                'allow_domains': {
                    'epaper.alrroya.com': frozenset(()),
                },
            },
        }
        site = cls.all_domains[cls.name]
        for domain in site['allow_domains']:
            if domain not in cls.allowed_domains:
                cls.allowed_domains.append(domain)
        for start_url in site['start_urls']:
            if start_url not in cls.start_urls:
                cls.start_urls.append(start_url)

    def pdf_process(self, source, url):
        """Print the spider name and the URL of a PDF that was found."""
        print('!!! ' + source + ' ' + url)
This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py
Change:
# Implementation quoted from scrapy/spider.py (Scrapy 0.16.5): whatever
# make_requests_from_url() returns is yielded unchanged.
def start_requests(self):
for url in self.start_urls:
yield self.make_requests_from_url(url)
To:
def start_requests(self):
    """Yield the start requests built from ``self.start_urls``.

    ``make_requests_from_url()`` may return a single request, a list or tuple
    of requests, or ``None``. Sequences are flattened and ``None`` is
    skipped, so the scheduler never receives a ``None`` (which fails with
    "'NoneType' object has no attribute 'dont_filter'").
    """
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if requests is None:
            continue
        if isinstance(requests, (list, tuple)):
            for request in requests:
                yield request
        else:
            yield requests
I expect that the official Scrapy people will fix this eventually.
这篇关于 Scrapy 未处理异常的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!