使用 Web-Harvest 抓取网页内容 [英] Scraping content of webpage using Web-harvest
问题描述
我想从网页上抓取特定内容,为此我使用了 Web-Harvest。对其他网站它运行良好,但对这个网址却无法抓取内容。我的 Java 代码在这里: 我的 XML 在这里: 我想抓取此网址的第一个区块,例如候选人姓名、当前职位、公司等,但我无法通过在 XML 文件中使用它的 class 来抓取(第一次尝试时我只试了抓取候选人姓名),但它不起作用。谁能告诉我哪里做错了? 回答:……无法抓取此网址的内容。Naukri.com 的服务条款写明:Naukri.com 使用技术手段阻止机器人等爬取网站和抓取内容,用户承诺不规避这些方法。 I want to scrape particular contents from webpages, for this I am using web harvest. It is working well for other website when I tried to scrape contents but it is not scraping contents for this URL. My Java code is here: And My XML is here: I want to scrape first block of this URL e.g candidate name, current designation, company etc., but I am unable to scrape by using its class in XML file e.g. (I tried only one for first attempt to scrape candidate name only) But it's not working. Can anyone please tell me what I am doing wrong? ..it is not scraping contents for this URL. From the Terms & Conditions of Naukri.com: Naukri.com uses technological means to exclude Robots etc from crawling the website and scraping content. The user undertakes not to circumvent these methods.
这篇关于使用 Web-Harvest 抓取网页内容的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
import java.io.FileNotFoundException;
public class App
{
public static void main(String [] args)
{
try
{
ScraperConfiguration config = new ScraperConfiguration ( twit88.xml);
Scraper scraper = new Scraper(config,c:/ temp /);
//scraper.getHttpClientManager(().setHttpProxy(\"proxy-server,8001);
scraper.addVariableToContext(url,http://freesearch.naukri.com/preview/preview?uname=63017692f2b266780bfd20476cd67466001a4a17005b4a5355041f121b502e18514b4e4e43121c4151005&sid=73682841<=1339495252);
scraper.setDebug(true);
scraper.execute();
//在执行期间创建变量
变量article =(变量)scraper.getContext()。getVar(article);
//对文章做些什么...
System.out.println(article.toString());
//System.out.println(\"1234=====rtyu);
}
catch(FileNotFoundException e)
{
System.out.println(e.getMessage());
}
}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Web-Harvest configuration (reconstructed: the extracted text had lost
     all attribute quotes, the <html-to-xml>/<http> opening tags, and had
     <article> mistranslated as a CJK word). Fetches ${url}, converts the
     HTML to XML, and extracts the page title into the "article" variable. -->
<config charset="UTF-8">
    <!--
    <var-def name="url">http://twit88.com/blog/2008/01/02/java-encrypt-and-send-a-large-file-securely/</var-def>
    -->
    <!-- <file action="write" path="twit88/twit88${sys.date()}.xml" charset="UTF-8"> -->
    <!--
    <template>
        <![CDATA[ <twit88 date="${sys.datetime("dd.MM.yyyy")}"> ]]>
    </template>
    -->
    <var-def name="article">
        <xquery>
            <xq-param name="doc">
                <html-to-xml outputtype="browser-compact" prunetags="yes">
                    <http url="${url}"/>
                </html-to-xml>
            </xq-param>
            <xq-expression><![CDATA[
                declare variable $doc as node() external;
                let $title := data($doc//div[@class="bdrGry"]/div[@class="boxHD1"]/h1)
                return
                    <article>
                        <title>{data($title)}</title>
                    </article>
            ]]>
            </xq-expression>
        </xquery>
    </var-def>
    <!--
    <![CDATA[ </twit88> ]]> -->
    <!-- </file> -->
</config>
$ b $ pre $ 声明变量$ doc as node()external;
pre>
let $ title:= data($ doc // div [@ class =bdrGry] / div [@ class =boxHD1] / h1)
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;
import java.io.FileNotFoundException;
public class App
{
    /**
     * Runs the Web-Harvest configuration "twit88.xml" against a fixed
     * Naukri.com preview URL and prints whatever the configuration stored
     * in its "article" variable.
     */
    public static void main(String[] args)
    {
        try
        {
            // Build the scraper from its XML definition; "c:/temp/" is the
            // working directory Web-Harvest uses for intermediate files.
            ScraperConfiguration scraperConfig = new ScraperConfiguration("twit88.xml");
            Scraper pageScraper = new Scraper(scraperConfig, "c:/temp/");

            //scraper.getHttpClientManager().setHttpProxy("proxy-server", 8001);

            // Expose the target page to the config as the ${url} variable,
            // then run the pipeline with debug output enabled.
            pageScraper.addVariableToContext("url", "http://freesearch.naukri.com/preview/preview?uname=63017692f2b266780bfd20476cd67466001a4a17005b4a5355041f121b502e18514b4e4e43121c4151005&sid=73682841<=1339495252");
            pageScraper.setDebug(true);
            pageScraper.execute();

            // Read back the variable the configuration produced and print it.
            Variable scrapedArticle = (Variable) pageScraper.getContext().getVar("article");
            System.out.println(scrapedArticle.toString());
            //System.out.println("1234=====rtyu");
        }
        catch (FileNotFoundException e)
        {
            // Configuration file could not be found; report and exit.
            System.out.println(e.getMessage());
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Web-Harvest pipeline: fetch the page given by the ${url} context
     variable (set from Java via addVariableToContext), convert the HTML
     to XML, and use XQuery to pull the <h1> under div.bdrGry > div.boxHD1
     into an <article><title> element stored in the "article" variable. -->
<config charset="UTF-8">
<!--
<var-def name="url">http://twit88.com/blog/2008/01/02/java-encrypt-and-send-a- large-file-securely/</var-def>
-->
<!-- <file action="write" path="twit88/twit88${sys.date()}.xml" charset="UTF-8"> -->
<!--
<template>
<![CDATA[ <twit88 date="${sys.datetime("dd.MM.yyyy")}"> ]]>
</template>
-->
<var-def name="article">
<xquery>
<xq-param name="doc">
<!-- Download ${url} and clean the HTML into well-formed XML. -->
<html-to-xml outputtype="browser-compact" prunetags="yes">
<http url="${url}"/>
</html-to-xml>
</xq-param>
<xq-expression><![CDATA[
declare variable $doc as node() external;
let $title := data($doc//div[@class="bdrGry"]/div[@class="boxHD1"]/h1)
return
<article>
<title>{data($title)}</title>
</article>
]]>
</xq-expression>
</xquery>
</var-def>
<!--
<![CDATA[ </twit88> ]]> -->
<!-- </file> -->
</config>
declare variable $doc as node() external;
let $title := data($doc//div[@class="bdrGry"]/div[@class="boxHD1"]/h1)