StormCrawler发现并获取网站，但文档中没有保存任何内容 [英] StormCrawler DISCOVER and FETCH a website but nothing gets saved in docs

查看：98 发布时间：2020/7/10 1:39:45 stormcrawler

本文介绍了StormCrawler发现并获取网站，但文档中没有保存任何内容的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

有一个我要搜寻的网站，搜寻器发现并捕获URL，但是文档中没有任何内容.这是网站https://cactussara.ir.问题出在哪儿?！这是该网站的robots.txt:

There is a website that I'm trying to crawl, the crawler DISCOVER and FETCH the URLs but there is nothing in docs. this is the website https://cactussara.ir. where is the problem?! And this is the robots.txt of this website:

User-agent: *
Disallow: /

这是我的 urlfilters.json :

{
    "com.digitalpebble.stormcrawler.filtering.URLFilters": [
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLFilter",
            "name": "BasicURLFilter",
            "params": {
                "maxPathRepetition": 8,
                "maxLength": 8192
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.depth.MaxDepthFilter",
            "name": "MaxDepthFilter",
            "params": {
                "maxDepth": -1
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLNormalizer",
            "name": "BasicURLNormalizer",
            "params": {
                "removeAnchorPart": true,
                "unmangleQueryString": true,
                "checkValidURI": true,
                "removeHashes": false
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.host.HostURLFilter",
            "name": "HostURLFilter",
            "params": {
                "ignoreOutsideHost": true,
                "ignoreOutsideDomain": false
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLNormalizer",
            "name": "RegexURLNormalizer",
            "params": {
                "regexNormalizerFile": "default-regex-normalizers.xml"
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter",
            "name": "RegexURLFilter",
            "params": {
                "regexFilterFile": "default-regex-filters.txt"
            }
        }
    ]
}

这是 crawler-conf.yaml :

# Default configuration for StormCrawler
# This is used to make the default values explicit and list the most common configurations.
# Do not modify this file but instead provide a custom one with the parameter -conf
# when launching your extension of ConfigurableTopology.  

config: 
  fetcher.server.delay: 1.0
  # min. delay for multi-threaded queues
  fetcher.server.min.delay: 0.0
  fetcher.queue.mode: "byHost"
  fetcher.threads.per.queue: 1
  fetcher.threads.number: 10
  fetcher.max.urls.in.queues: -1
  fetcher.max.queue.size: -1
  # max. crawl-delay accepted in robots.txt (in seconds)
  fetcher.max.crawl.delay: 30
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is larger than fetcher.max.crawl.delay:
  #  (if false)
  #    skip URLs from this queue to avoid that any overlong
  #    crawl-delay throttles the crawler
  #  (if true)
  #    set the delay to fetcher.max.crawl.delay,
  #    making fetcher more aggressive than requested
  fetcher.max.crawl.delay.force: false
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is smaller (ev. less than one second) than the default delay:
  #  (if true)
  #    use the larger default delay (fetcher.server.delay)
  #    and ignore the shorter crawl-delay in the robots.txt
  #  (if false)
  #    use the delay specified in the robots.txt
  fetcher.server.delay.force: false

  # time bucket to use for the metrics sent by the Fetcher
  fetcher.metrics.time.bucket.secs: 10

  # SimpleFetcherBolt: if the delay required by the politeness
  # is above this value, the tuple is sent back to the Storm queue 
  # for the bolt on the _throttle_ stream.
  fetcher.max.throttle.sleep: -1

  # alternative values are "byIP" and "byDomain"
  partition.url.mode: "byHost"

  # metadata to transfer to the outlinks
  # used by Fetcher for redirections, sitemapparser, etc...
  # these are also persisted for the parent document (see below)
  # metadata.transfer:
  # - customMetadataName

  # lists the metadata to persist to storage
  # these are not transfered to the outlinks
  metadata.persist:
   - _redirTo
   - error.cause
   - error.source
   - isSitemap
   - isFeed

  metadata.track.path: true
  metadata.track.depth: true

  http.agent.name: "Anonymous Coward"
  http.agent.version: "1.0"
  http.agent.description: "built with StormCrawler ${version}"
  http.agent.url: "http://someorganization.com/"
  http.agent.email: "someone@someorganization.com"

  http.accept.language: "fa-IR,fa_IR,en-us,en-gb,en;q=0.7,*;q=0.3"
  http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  http.content.limit: -1
  http.store.headers: false
  http.timeout: 10000
  http.skip.robots: true

  # store partial fetches as trimmed content (some content has been fetched,
  # but reading more data from socket failed, eg. because of a network timeout)
  http.content.partial.as.trimmed: false

  # for crawling through a proxy:
  # http.proxy.host:
  # http.proxy.port:
  # okhttp only, defaults to "HTTP"
  # http.proxy.type: "SOCKS"
  # for crawling through a proxy with Basic authentication:
  # http.proxy.user:
  # http.proxy.pass:

  http.robots.403.allow: true

  # should the URLs be removed when a page is marked as noFollow
  robots.noFollow.strict: false

  # Guava caches used for the robots.txt directives 
  robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
  robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"

  protocols: "http,https,file"
  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol"

  # navigationfilters.config.file: "navigationfilters.json"
  # selenium.addresses: "http://localhost:9515"
  selenium.implicitlyWait: 0
  selenium.pageLoadTimeout: -1
  selenium.setScriptTimeout: 0
  selenium.instances.num: 1
  selenium.capabilities:
    takesScreenshot: false
    loadImages: false
    javascriptEnabled: true
    # illustrates the use of the variable for user agent
    # phantomjs.page.settings.userAgent: "$userAgent"
    # ChromeDriver config
    # goog:chromeOptions:
    #   args: 
    #      - "--headless"
    #      - "--disable-gpu"
    #      - "--mute-audio"

  # DelegatorRemoteDriverProtocol
  selenium.delegated.protocol: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"

  # no url or parsefilters by default
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"

  # JSoupParserBolt
  jsoup.treat.non.html.as.error: false
  parser.emitOutlinks: true
  parser.emitOutlinks.max.per.page: -1
  track.anchors: true
  detect.mimetype: true
  detect.charset.maxlength: 10000

  # filters URLs in sitemaps based on their modified Date (if any)
  sitemap.filter.hours.since.modified: -1

  # staggered scheduling of sitemaps
  sitemap.schedule.delay: -1

  # whether to add any sitemaps found in the robots.txt to the status stream
  # used by fetcher bolts
  sitemap.discovery: false

  # Default implementation of Scheduler
  scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"

  # revisit a page daily (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.default: 1440

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched succesfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true
  # fetchInterval.isFeed=true: 10

  # max number of successive fetch errors before changing status to ERROR
  max.fetch.errors: 3

  # Guava cache use by AbstractStatusUpdaterBolt for DISCOVERED URLs
  status.updater.use.cache: true
  status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"

  # Can also take "MINUTE" or "HOUR"
  status.updater.unit.round.date: "SECOND"

  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.text.maxlength: -1
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
  - parse.title=title
  - parse.keywords=keywords
  - parse.description=description

谢谢.

StormCrawler发现并获取网站，但文档中没有保存任何内容 [英] StormCrawler DISCOVER and FETCH a website but nothing gets saved in docs

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

StormCrawler发现并获取网站，但文档中没有保存任何内容 [英] StormCrawler DISCOVER and FETCH a website but nothing gets saved in docs

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭