StormCrawler发现并获取网站,但文档中没有保存任何内容 [英] StormCrawler DISCOVER and FETCH a website but nothing gets saved in docs
问题描述
有一个我想要抓取的网站,爬虫能够发现(DISCOVER)并抓取(FETCH)这些URL,但是 docs 中没有保存任何内容.这是该网站: https://cactussara.ir .问题出在哪儿?
这是该网站的 robots.txt:
There is a website that I'm trying to crawl. The crawler DISCOVERs and FETCHes the URLs, but nothing gets saved in docs. This is the website: https://cactussara.ir. Where is the problem?
And this is the robots.txt of this website:
User-agent: *
Disallow: /
这是我的 urlfilters.json :
{
"com.digitalpebble.stormcrawler.filtering.URLFilters": [
{
"class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLFilter",
"name": "BasicURLFilter",
"params": {
"maxPathRepetition": 8,
"maxLength": 8192
}
},
{
"class": "com.digitalpebble.stormcrawler.filtering.depth.MaxDepthFilter",
"name": "MaxDepthFilter",
"params": {
"maxDepth": -1
}
},
{
"class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLNormalizer",
"name": "BasicURLNormalizer",
"params": {
"removeAnchorPart": true,
"unmangleQueryString": true,
"checkValidURI": true,
"removeHashes": false
}
},
{
"class": "com.digitalpebble.stormcrawler.filtering.host.HostURLFilter",
"name": "HostURLFilter",
"params": {
"ignoreOutsideHost": true,
"ignoreOutsideDomain": false
}
},
{
"class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLNormalizer",
"name": "RegexURLNormalizer",
"params": {
"regexNormalizerFile": "default-regex-normalizers.xml"
}
},
{
"class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter",
"name": "RegexURLFilter",
"params": {
"regexFilterFile": "default-regex-filters.txt"
}
}
]
}
这是 crawler-conf.yaml :
# Default configuration for StormCrawler
# This is used to make the default values explicit and list the most common configurations.
# Do not modify this file but instead provide a custom one with the parameter -conf
# when launching your extension of ConfigurableTopology.
config:
fetcher.server.delay: 1.0
# min. delay for multi-threaded queues
fetcher.server.min.delay: 0.0
fetcher.queue.mode: "byHost"
fetcher.threads.per.queue: 1
fetcher.threads.number: 10
fetcher.max.urls.in.queues: -1
fetcher.max.queue.size: -1
# max. crawl-delay accepted in robots.txt (in seconds)
fetcher.max.crawl.delay: 30
# behavior of fetcher when the crawl-delay in the robots.txt
# is larger than fetcher.max.crawl.delay:
# (if false)
# skip URLs from this queue to avoid that any overlong
# crawl-delay throttles the crawler
# (if true)
# set the delay to fetcher.max.crawl.delay,
# making fetcher more aggressive than requested
fetcher.max.crawl.delay.force: false
# behavior of fetcher when the crawl-delay in the robots.txt
# is smaller (possibly less than one second) than the default delay:
# (if true)
# use the larger default delay (fetcher.server.delay)
# and ignore the shorter crawl-delay in the robots.txt
# (if false)
# use the delay specified in the robots.txt
fetcher.server.delay.force: false
# time bucket to use for the metrics sent by the Fetcher
fetcher.metrics.time.bucket.secs: 10
# SimpleFetcherBolt: if the delay required by the politeness
# is above this value, the tuple is sent back to the Storm queue
# for the bolt on the _throttle_ stream.
fetcher.max.throttle.sleep: -1
# alternative values are "byIP" and "byDomain"
partition.url.mode: "byHost"
# metadata to transfer to the outlinks
# used by Fetcher for redirections, sitemapparser, etc...
# these are also persisted for the parent document (see below)
# metadata.transfer:
# - customMetadataName
# lists the metadata to persist to storage
# these are not transferred to the outlinks
metadata.persist:
- _redirTo
- error.cause
- error.source
- isSitemap
- isFeed
metadata.track.path: true
metadata.track.depth: true
http.agent.name: "Anonymous Coward"
http.agent.version: "1.0"
http.agent.description: "built with StormCrawler ${version}"
http.agent.url: "http://someorganization.com/"
http.agent.email: "someone@someorganization.com"
http.accept.language: "fa-IR,fa_IR,en-us,en-gb,en;q=0.7,*;q=0.3"
http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
http.content.limit: -1
http.store.headers: false
http.timeout: 10000
http.skip.robots: true
# store partial fetches as trimmed content (some content has been fetched,
# but reading more data from socket failed, eg. because of a network timeout)
http.content.partial.as.trimmed: false
# for crawling through a proxy:
# http.proxy.host:
# http.proxy.port:
# okhttp only, defaults to "HTTP"
# http.proxy.type: "SOCKS"
# for crawling through a proxy with Basic authentication:
# http.proxy.user:
# http.proxy.pass:
http.robots.403.allow: true
# should the URLs be removed when a page is marked as noFollow
robots.noFollow.strict: false
# Guava caches used for the robots.txt directives
robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"
protocols: "http,https,file"
http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol"
# navigationfilters.config.file: "navigationfilters.json"
# selenium.addresses: "http://localhost:9515"
selenium.implicitlyWait: 0
selenium.pageLoadTimeout: -1
selenium.setScriptTimeout: 0
selenium.instances.num: 1
selenium.capabilities:
takesScreenshot: false
loadImages: false
javascriptEnabled: true
# illustrates the use of the variable for user agent
# phantomjs.page.settings.userAgent: "$userAgent"
# ChromeDriver config
# goog:chromeOptions:
# args:
# - "--headless"
# - "--disable-gpu"
# - "--mute-audio"
# DelegatorRemoteDriverProtocol
selenium.delegated.protocol: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
# no url or parsefilters by default
parsefilters.config.file: "parsefilters.json"
urlfilters.config.file: "urlfilters.json"
# JSoupParserBolt
jsoup.treat.non.html.as.error: false
parser.emitOutlinks: true
parser.emitOutlinks.max.per.page: -1
track.anchors: true
detect.mimetype: true
detect.charset.maxlength: 10000
# filters URLs in sitemaps based on their modified Date (if any)
sitemap.filter.hours.since.modified: -1
# staggered scheduling of sitemaps
sitemap.schedule.delay: -1
# whether to add any sitemaps found in the robots.txt to the status stream
# used by fetcher bolts
sitemap.discovery: false
# Default implementation of Scheduler
scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"
# revisit a page daily (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.default: 1440
# revisit a page with a fetch error after 2 hours (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.fetch.error: 120
# never revisit a page with an error (or set a value in minutes)
fetchInterval.error: -1
# custom fetch interval to be used when a document has the key/value in its metadata
# and has been fetched successfully (value in minutes)
# fetchInterval.FETCH_ERROR.isFeed=true
# fetchInterval.isFeed=true: 10
# max number of successive fetch errors before changing status to ERROR
max.fetch.errors: 3
# Guava cache used by AbstractStatusUpdaterBolt for DISCOVERED URLs
status.updater.use.cache: true
status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"
# Can also take "MINUTE" or "HOUR"
status.updater.unit.round.date: "SECOND"
# configuration for the classes extending AbstractIndexerBolt
# indexer.md.filter: "someKey=aValue"
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.text.maxlength: -1
indexer.canonical.name: "canonical"
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
- parse.description=description
谢谢.
推荐答案
页面包含
<meta name="robots" content="noindex,follow"/>
该标签会被解析器发现,并导致索引器 bolt(indexer bolt)跳过该页面.
The pages contain the robots meta tag shown above, which is found by the parser and causes the indexer bolt to skip the page.
可以在指标(metrics)中确认这一点:其中 Filtered 的数量应与已抓取的页面数相同.
You can confirm this in the metrics: the Filtered count should be the same as the number of pages fetched.
http.skip.robots 不适用于在页面本身中设置的指令.
http.skip.robots does not apply to the directives set in the page itself.
这篇关于StormCrawler发现并获取网站,但文档中没有保存任何内容的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!