Scrapy:在异地链接上爬行 1 级深度 [英] Scrapy: crawl 1 level deep on offsite links
问题描述
在 Scrapy 中,如何让爬虫对允许域之外的所有链接只抓取 1 级深度?在爬取过程中,我希望能够确保站点内的所有出站链接都能正常访问,而不是返回 404.我不希望它抓取非允许域的整个站点.我目前正在处理允许域内的 404.我知道可以将 DEPTH_LIMIT 设置为 1,但这也会影响允许的域.
In Scrapy, how would I go about having the spider crawl only 1 level deep for all links outside the allowed domains? Within the crawl, I want to be able to make sure all outbound links within the site are working and not returning 404. I do not want it to crawl the whole site of a non-allowed domain. I am currently processing 404s for the allowed domains. I know that I can set a DEPTH_LIMIT of 1, but that will affect the allowed domains as well.
我的代码:
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from smcrawl.items import Website
import smcrawl.util
def iterate(lists):
    """Return the first element of ``lists``, or ``None`` if it is empty.

    Despite its name, this helper never walks the whole iterable: it stops
    at the very first element. The spider uses it to pick the first match
    out of an XPath ``extract()`` result.

    Args:
        lists: Any iterable (list, string, generator, ...).

    Returns:
        The first element produced by ``lists``, or ``None`` when the
        iterable yields nothing.

    Raises:
        TypeError: If ``lists`` is not iterable.
    """
    # next() with a default expresses "first element or None" directly,
    # instead of a for-loop that returns unconditionally on iteration one.
    return next(iter(lists), None)
class WalmartSpider(CrawlSpider):
    """Crawl the SurveyMonkey sites and emit one ``Website`` item per page.

    Every link found by the single crawl rule is followed and handed to
    :meth:`parse_items`, which records the URL, referer, canonical link,
    robots/description meta tags, the original (pre-redirect) URL and the
    HTTP status of the response.
    """

    # Responses with these statuses are passed to the spider callbacks.
    # Per the Scrapy docs, listing 302 here also makes the redirect
    # middleware hand the 302 response to the spider instead of following it.
    handle_httpstatus_list = [200, 302, 404, 500, 502]
    name = "surveymonkeycouk"
    allowed_domains = ["surveymonkey.co.uk", "surveymonkey.com"]
    start_urls = ['https://www.surveymonkey.co.uk/']

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=smcrawl.util.trim),
            callback="parse_items",
            follow=True,),
    )
    # To drop rel="nofollow" links, add this argument to the Rule above:
    # process_links=lambda links: [link for link in links if not link.nofollow]

    @staticmethod
    def _first_stripped(values):
        """Return the first string in ``values`` stripped, or ``None`` if empty."""
        return values[0].strip() if values else None

    def parse_start_url(self, response):
        """Parse the start URLs with the same logic as every other page.

        The result must be returned: the previous version built the list and
        discarded it, so items for the start pages were never emitted.
        """
        return self.parse_items(response)

    def parse_items(self, response):
        """Build a ``Website`` item for each ``<html>`` root in ``response``.

        Args:
            response: The response delivered by Scrapy for a crawled URL.

        Returns:
            A list of ``Website`` items. All items carry ``url``,
            ``referer``, ``canonical``, ``robots``, ``original_url``,
            ``description`` and ``redirect`` (the HTTP status). Items for
            non-404 responses additionally carry ``title`` and ``h1``.
        """
        items = []
        for site in response.selector.xpath('//html'):
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            item['robots'] = site.xpath('//meta[@name="robots"]/@content').extract()
            # 'redirect_urls' is only present in meta after a redirect;
            # otherwise the response URL itself is the original URL.
            item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
            item['description'] = site.xpath('//meta[@name="description"]/@content').extract()
            item['redirect'] = response.status

            # NOTE(review): 404 items deliberately get no title/h1. The
            # original if/elif chain had two 404 branches; only the first
            # (without title/h1) was reachable, so that behavior is kept.
            # Confirm whether 404 pages should record title/h1 as well.
            if response.status != 404:
                item['title'] = self._first_stripped(
                    site.xpath('/html/head/title/text()').extract())
                item['h1'] = self._first_stripped(
                    site.xpath('//h1/text()').extract())

            items.append(item)
        return items
推荐答案
我参考了"Scrapy 设置每个 allowed_domains 的深度限制"这个问题作为答案.这与我正在寻找的解决方案略有不同,但是如果有我愿意抓取的 URL 白名单,那么最终结果是相同的.谢谢!
I've referred to "Scrapy set depth limit per allowed_domains" as the answer. It's a little different from the solution I was looking for, but with a whitelist of URLs that I am willing to crawl, the end result is the same. Thank you!
这篇关于Scrapy:在异地链接上爬行 1 级深度的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!