如何使用Scrapy在一个POST请求上捕获多个响应? [英] How to capture multiple responses on one single POST request using Scrapy?

查看:92
本文介绍了如何使用Scrapy在一个POST请求上捕获多个响应?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

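我正在尝试抓取这个网站,并在走完该网站的整个流程后下载其中提供的PDF文件。我使用的是Scrapy,但在合适的时机获取验证码时遇到了一些问题。

该站点是一个ASPX网页,使用"Viewstate"来跟踪每一个POST请求。如果您浏览这个站点就会发现,每当您选择任何一个下拉字段时,它都会带着"Viewstate"值向某个URL路径发送POST请求,这可以在浏览器控制台中看到。但与此同时,它还会向另一个URL发送一个GET请求来获取"CAPTCHA"(验证码)图片,而我无法拿到这个响应。我不清楚使用Scrapy能否同时捕获多个请求的多个响应。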





现在,我试图为这个问题找到一种变通方法。我几乎照做了这篇 StackOverflow 帖子中提到的所有内容,但得到的响应是一段HTML代码,其中带有一个javascript警告(alert),提示"Wrong text inserted, Please enter new characters shown in image textbox"(输入的文本有误,请输入图片文本框中显示的新字符)。因此,这个解决方案对我也不起作用。



这是我的小蜘蛛代码:

 #-*-编码:utf-8-*-
进口scrapy
进口cv2
进口pytesseract
从PIL进口图像
来自io进口字节IO
从element_data.items import ElectionDataItem

class ElectionSpider(scrapy.Spider):
名称='election'
allowed_domains = ['ceo.maharashtra.gov.in']
start_urls = ['https://ceo.maharashtra.gov.in/searchlist/SearchRollPDF.aspx']
dist_dict = []

def解析(自己,响应):
地区= response.css('select#Content_DistrictList> option :: attr(value)')[1] .extract()
data = {
'__EVENTTARGET':response.css('select#Content_DistrictList :: attr(name)')。extract_first(),
'__EVENTARGUMENT':'',
'__LASTFOCUS':'',
'__VIEWSTATE':response.css('input# __VIEWSTATE :: attr(value)')。extract_first(),
'__EVENTVALIDATION' :response.css('input #__ EVENTVALIDATION :: attr(value)')。extract_first(),
'ctl00 $ Content $ DistrictList':区,
'ctl00 $ Content $ txtcaptcha':''
}
meta = {'handle_httpstatus_all':True}
request = scrapy.FormRequest(url = self.start_urls [0],method ='POST',formdata = data,meta = meta ,callback = self.parse_assembly)
request.meta ['district'] =地区
收益请求

def parse_assembly(self,response):
print(' parse_assembly')
组件= response.css('select#Content_AssemblyList> option :: attr(value)')[1] .extract()
data = {
'__EVENTTARGET':response.css('select#Content_AssemblyList :: attr(name)')。extract_first( ),
'__EVENTARGUMENT':'',
'__LASTFOCUS':'',
'__VIEWSTATE':response.css('input #__ VIEWSTATE :: attr(value)')。extract_first (),
'__EVENTVALIDATION':response.css('input #__ EVENTVALIDATION :: attr(value)')。extract_first(),
'ctl00 $ Content $ DistrictList':response.meta ['district '],
'ctl00 $ Content $ AssemblyList':程序集,
'ctl00 $ Content $ txtcaptcha':''
}
meta = {'handle_httpstatus_all':True}
request = scrapy.FormRequest(url = self.start_urls [0],method ='POST',formdata = data,meta = meta,callback = self.parse_part)
request.meta ['district'] = response.meta ['district']
request.meta ['assembly'] =组件
收益请求

def parse_p art(self,response):
print(’parse_part’)
part = response.css(’select#Content_PartList> option :: attr(value)')[1] .extract()
data = {
'__EVENTTARGET':response.css('select#Content_PartList :: attr(name)')。extract_first( ),
'__EVENTARGUMENT':'',
'__LASTFOCUS':'',
'__VIEWSTATE':response.css('input #__ VIEWSTATE :: attr(value)')。extract_first (),
'__EVENTVALIDATION':response.css('input #__ EVENTVALIDATION :: attr(value)')。extract_first(),
'ctl00 $ Content $ DistrictList':response.meta ['district '],
'ctl00 $ Content $ AssemblyList':response.meta ['assembly'],
'ctl00 $ Content $ PartList':part,
'ctl00 $ Content $ txtcaptcha': ''
}
meta = {'handle_httpstatus_all':True}
request = scrapy.FormRequest(url = self.start_urls [0],method ='POST',formdata = data,meta = meta,callback = self.parse_captcha)
request.meta ['__ VIEWSTATE'] = response.css('input #__ VIEWSTATE :: attr(value)')。extract_firs t()
request.meta ['__ EVENTVALIDATION'] = response.css('input #__ EVENTVALIDATION :: attr(value)')。extract_first()
request.meta ['district'] =响应.meta ['district']
request.meta ['assembly'] = response.meta ['assembly']
request.meta ['part'] =零件
收益请求

def parse_captcha(自己,响应):
data_for_later =响应
request = scrapy.Request(url ='https://ceo.maharashtra.gov.in/searchlist/Captcha。 aspx',callback = self.store_image)
request.meta ['__ VIEWSTATE'] = response.meta ['__ VIEWSTATE']
request.meta ['__ EVENTVALIDATION'] = response.meta ['__ EVENTVALIDATION' ]
request.meta ['district'] = response.meta ['district']
request.meta ['assembly'] = response.meta ['assembly']
request.meta ['part'] = response.meta ['part']
request.meta ['data_for_later'] = data_for_later
收益请求

def store_ima ge(self,response):
captcha_target_filename ='filename.png'
#保存要处理的图像
i = Image.open(BytesIO(response.body))
i。 save(captcha_target_filename)
captcha_text = self.solve_captcha(captcha_target_filename)
print(captcha_text)
data = {
'__EVENTTARGET':'',
'__EVENTARGUMENT': '',
'__LASTFOCUS':'',
'__VIEWSTATE':response.meta ['__ VIEWSTATE'],
'__EVENTVALIDATION':response.meta ['__ EVENTVALIDATION'],
'ctl00 $ Content $ DistrictList':response.meta ['district'],
'ctl00 $ Content $ AssemblyList':response.meta ['assembly'],
'ctl00 $ Content $ PartList ':response.meta ['part'],
'ctl00 $ Content $ txtcaptcha':captcha_text,
'ctl00 $ Content $ OpenButton':'Open PDF'
}
captcha_form = response.meta ['data_for_l ater']
meta = {'handle_httpstatus_all':True}
request = scrapy.FormRequest.from_response(captcha_form,method ='POST',formdata = data,meta = meta,callback = self.get_pdfs)
收益请求

def get_pdfs(自己,响应):
#这是捕获最终响应的位置
打印(response.text)

defsolve_captcha(self,image):
image = cv2.imread(image,0)
thresh = cv2.threshold(image,220,255,cv2.THRESH_BINARY)[1]

内核= cv2.getStructuringElement(cv2.MORPH_RECT,(3,3))
close = cv2.morphologyEx(阈值,cv2.MORPH_CLOSE,内核)

结果= 255 -close
cv2.imshow('thresh',thresh)
cv2.imshow('close',close)
cv2.imshow('result',result)

return pytesseract.image_to_string(result)

如果您访问上述网站并填写所有表单项,同时观察浏览器控制台的网络(Network)标签页,您就会对这个问题有所了解。



请指导我如何解决此问题。谢谢。

解决方案

这就是我讨厌ASP.NET应用程序的原因,抓取这类站点时它只会让您抓狂。无论如何,您的代码几乎完美,只差一处

  def parse_captcha(self,response): 
data_for_later =响应
请求= scrapy.Request(URL ='https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx',callback = self.store_image)
请求.meta ['__ VIEWSTATE'] = response.meta ['__ VIEWSTATE']
request.meta ['__ EVENTVALIDATION'] = response.meta ['__ EVENTVALIDATION']
request.meta ['district'] = response.meta ['district']
request.meta ['assembly'] = response.meta ['assembly']
request.meta ['part'] = response.meta ['part']
request.meta ['data_for_later'] = data_for_later
收益请求

这个回调收到的是您设置 part 之后的响应,但您复制的却是设置 part 之前的 __VIEWSTATE 和 __EVENTVALIDATION。因此,您需要确保捕获的是正确的状态

  def parse_captcha(自身,响应): 
data_for_later =响应
请求= scrapy.Request(URL ='https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx',callback = self.store_image)
请求.meta ['__ VIEWSTATE'] = response.css('input #__ VIEWSTATE :: attr(value)')。extract_first()
request.meta ['__ EVENTVALIDATION'] = response.css('input #__ EVENTVALIDATION: :attr(value)')。extract_first()
request.meta ['district'] = response.meta ['district']
request.meta ['assembly'] = response.meta [' assembly']
request.meta ['part'] = response.meta ['part']
request.meta ['data_for_later'] = data_for_later
收益请求


I am trying to web scrape this website and download the pdf files available when you complete the whole lifecycle of this website. I am using Scrapy for this. I am having some trouble with capturing the captcha at the right time.

This site is an ASPX webpage and uses 'Viewstates' to keep track of each POST request. If you go through this site, you'll see that whenever you fill in any dropdown field, it sends a POST request with the 'Viewstate' value to a certain URL path, which you can see in the browser console. But at the same time, it sends another GET request to a different URL to fetch the "CAPTCHA" image, and I am not able to get this response. I don't know whether Scrapy can capture multiple requests and their responses at the same time.

Now, I tried to find a workaround for this issue. And I have followed almost everything mentioned in this StackOverflow post, but in response I am getting HTML code with javascript alert code mentioning "Wrong text inserted, Please enter new characters shown in image textbox". So, this solution is also not working for me.

This is my scrapy spider code:

# -*- coding: utf-8 -*-
import scrapy
import cv2
import pytesseract
from PIL import Image
from io import BytesIO
from election_data.items  import ElectionDataItem

class ElectionSpider(scrapy.Spider):
    """Spider that walks the SearchRollPDF.aspx form one postback at a time.

    The callbacks are chained in this order:
    ``parse`` -> ``parse_assembly`` -> ``parse_part`` -> ``parse_captcha``
    -> ``store_image`` -> ``get_pdfs``.

    Each step re-posts the form to ``start_urls[0]`` with the hidden
    ``__VIEWSTATE`` / ``__EVENTVALIDATION`` fields and the dropdown values
    chosen so far, carrying those values forward in ``request.meta``.
    """

    name = 'election'
    allowed_domains = ['ceo.maharashtra.gov.in']
    start_urls = ['https://ceo.maharashtra.gov.in/searchlist/SearchRollPDF.aspx']
    # Not referenced by any method of this class.
    dist_dict = []

    def parse(self, response):
        """Select a district and post the form; continues in ``parse_assembly``."""
        # Index [1] takes the second <option> of the district dropdown
        # (presumably the first one is a placeholder entry -- TODO confirm).
        district = response.css('select#Content_DistrictList > option::attr(value)')[1].extract()
        data = {
            # The postback target is the ``name`` attribute of the dropdown being changed.
            '__EVENTTARGET' : response.css('select#Content_DistrictList::attr(name)').extract_first(),
            '__EVENTARGUMENT' : '',
            '__LASTFOCUS' : '', 
            # Hidden state fields are read from the page that was just received.
            '__VIEWSTATE' : response.css('input#__VIEWSTATE::attr(value)').extract_first(),
            '__EVENTVALIDATION' : response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
            'ctl00$Content$DistrictList' : district,
            'ctl00$Content$txtcaptcha' : ''
        }
        # Scrapy meta key: let the callback receive the response whatever its HTTP status.
        meta = {'handle_httpstatus_all': True}
        request = scrapy.FormRequest(url=self.start_urls[0], method='POST', formdata=data, meta=meta, callback=self.parse_assembly)
        # Carry the chosen district forward to the next callback.
        request.meta['district'] = district
        yield request

    def parse_assembly(self, response):
        """Select an assembly and post the form; continues in ``parse_part``."""
        print('parse_assembly')
        # Second <option> of the assembly dropdown, same convention as in ``parse``.
        assembly = response.css('select#Content_AssemblyList > option::attr(value)')[1].extract()
        data = {
            '__EVENTTARGET' : response.css('select#Content_AssemblyList::attr(name)').extract_first(),
            '__EVENTARGUMENT' : '',
            '__LASTFOCUS' : '', 
            '__VIEWSTATE' : response.css('input#__VIEWSTATE::attr(value)').extract_first(),
            '__EVENTVALIDATION' : response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
            # District chosen in ``parse``, read back from the request meta.
            'ctl00$Content$DistrictList' : response.meta['district'],
            'ctl00$Content$AssemblyList' : assembly,
            'ctl00$Content$txtcaptcha' : ''
        }
        meta = {'handle_httpstatus_all': True}
        request = scrapy.FormRequest(url=self.start_urls[0], method='POST', formdata=data, meta=meta, callback=self.parse_part)
        # Carry both selections forward.
        request.meta['district'] = response.meta['district']
        request.meta['assembly'] = assembly
        yield request

    def parse_part(self, response):
        """Select a part and post the form; continues in ``parse_captcha``."""
        print('parse_part')
        # Second <option> of the part dropdown, same convention as in ``parse``.
        part = response.css('select#Content_PartList > option::attr(value)')[1].extract()
        data = {
            '__EVENTTARGET' : response.css('select#Content_PartList::attr(name)').extract_first(),
            '__EVENTARGUMENT' : '',
            '__LASTFOCUS' : '', 
            '__VIEWSTATE' : response.css('input#__VIEWSTATE::attr(value)').extract_first(),
            '__EVENTVALIDATION' : response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
            'ctl00$Content$DistrictList' : response.meta['district'],
            'ctl00$Content$AssemblyList' : response.meta['assembly'],
            'ctl00$Content$PartList' : part,
            'ctl00$Content$txtcaptcha' : ''
        }
        meta = {'handle_httpstatus_all': True}
        request = scrapy.FormRequest(url=self.start_urls[0], method='POST', formdata=data, meta=meta, callback=self.parse_captcha)
        # These two values come from the page received BEFORE the part postback
        # is sent; they are stored in meta and reused later by ``store_image``.
        request.meta['__VIEWSTATE'] = response.css('input#__VIEWSTATE::attr(value)').extract_first()
        request.meta['__EVENTVALIDATION'] = response.css('input#__EVENTVALIDATION::attr(value)').extract_first()
        request.meta['district'] = response.meta['district']
        request.meta['assembly'] = response.meta['assembly']
        request.meta['part'] = part
        yield request

    def parse_captcha(self, response):
        """Request the CAPTCHA image; continues in ``store_image``.

        ``response`` here is the page returned by the part postback issued in
        ``parse_part``. It is kept in meta so the final form can be built from it.
        """
        data_for_later = response
        request = scrapy.Request(url='https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx', callback=self.store_image)
        # NOTE(review): these are the values ``parse_part`` extracted from the
        # page before the part postback, not the hidden fields of ``response``
        # itself. Presumably the server expects the state from the latest page;
        # confirm whether they should be re-read from ``response`` here.
        request.meta['__VIEWSTATE'] = response.meta['__VIEWSTATE']
        request.meta['__EVENTVALIDATION'] = response.meta['__EVENTVALIDATION']
        request.meta['district'] = response.meta['district']
        request.meta['assembly'] = response.meta['assembly']
        request.meta['part'] = response.meta['part']
        request.meta['data_for_later'] = data_for_later
        yield request

    def store_image(self, response):
        """Save and solve the CAPTCHA image, then submit the final form.

        The response body is written to ``filename.png``, passed through
        ``solve_captcha``, and the resulting text is posted together with the
        state and selections carried in meta. Continues in ``get_pdfs``.
        """
        captcha_target_filename = 'filename.png'
        # Save the image to disk so that ``solve_captcha`` can read it by filename.
        i = Image.open(BytesIO(response.body))
        i.save(captcha_target_filename)
        captcha_text = self.solve_captcha(captcha_target_filename)
        print(captcha_text)
        data = {
            '__EVENTTARGET' : '',
            '__EVENTARGUMENT' : '',
            '__LASTFOCUS' : '', 
            # State values exactly as forwarded through meta by ``parse_captcha``.
            '__VIEWSTATE' : response.meta['__VIEWSTATE'],
            '__EVENTVALIDATION' : response.meta['__EVENTVALIDATION'],
            'ctl00$Content$DistrictList' : response.meta['district'],
            'ctl00$Content$AssemblyList' : response.meta['assembly'],
            'ctl00$Content$PartList' : response.meta['part'],
            'ctl00$Content$txtcaptcha' : captcha_text,
            'ctl00$Content$OpenButton': 'Open PDF'
        }
        # The form is rebuilt from the page saved in ``parse_captcha``, not from
        # this image response.
        captcha_form = response.meta['data_for_later']
        meta = {'handle_httpstatus_all': True}
        request = scrapy.FormRequest.from_response(captcha_form, method='POST', formdata=data, meta=meta, callback=self.get_pdfs)
        yield request

    def get_pdfs(self, response):
        """Final callback: print the body of the response to the form submission."""
        # This is where the final response is captured.
        print(response.text)

    def solve_captcha(self, image):
        """Run OCR on the image file at path ``image`` and return the text.

        The image is thresholded, morphologically closed and inverted before
        being handed to ``pytesseract.image_to_string``.
        """
        # Flag 0 makes OpenCV load the file as a single-channel grayscale image.
        image = cv2.imread(image,0)
        # Binary threshold at 220 with max value 255; [1] is the thresholded image.
        thresh = cv2.threshold(image, 220, 255, cv2.THRESH_BINARY)[1]

        # Morphological closing with a 3x3 rectangular kernel.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
        close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

        # Invert the closed image before OCR.
        result = 255 - close
        # NOTE(review): no cv2.waitKey call follows in this method; these look
        # like debugging aids and need a GUI-capable OpenCV build -- confirm
        # whether they should stay.
        cv2.imshow('thresh', thresh)
        cv2.imshow('close', close)
        cv2.imshow('result', result)

        return pytesseract.image_to_string(result)

If you go through the above-mentioned site, fill in all the form details, and monitor the Network tab of the browser's console, you'll get an idea of this problem.

Kindly guide me in how to solve this issue. Thank you.

解决方案

That is the reason I hate ASP.NET applications: they just make you go nuts while scraping. Anyway, you had everything almost perfect, except one thing:

def parse_captcha(self, response):
    """Request the CAPTCHA image (version quoted from the question).

    ``response`` is the page returned after the part was set. The hidden
    state forwarded below is not read from that page; it is copied from
    ``response.meta``, i.e. the values stored before the part was set.
    """
    data_for_later = response
    request = scrapy.Request(url='https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx', callback=self.store_image)
    # Problem lines: state copied from meta rather than extracted from ``response``.
    request.meta['__VIEWSTATE'] = response.meta['__VIEWSTATE']
    request.meta['__EVENTVALIDATION'] = response.meta['__EVENTVALIDATION']
    request.meta['district'] = response.meta['district']
    request.meta['assembly'] = response.meta['assembly']
    request.meta['part'] = response.meta['part']
    # Keep the whole page so the final form can be built from it later.
    request.meta['data_for_later'] = data_for_later
    yield request

This callback receives the response produced after you set the part, but what you do is copy the __VIEWSTATE and __EVENTVALIDATION captured before setting the part. So you need to make sure you capture the correct state from the current response:

def parse_captcha(self, response):
    """Request the CAPTCHA image, forwarding the current form state.

    ``__VIEWSTATE`` and ``__EVENTVALIDATION`` are extracted from the hidden
    inputs of ``response`` itself, so the values handed to ``store_image``
    belong to the page received here. The dropdown selections are copied
    from ``response.meta`` and the response object is passed along under
    ``data_for_later``.
    """
    # Everything the next callback needs, assembled up front.
    carried_meta = {
        '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
        '__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
        'district': response.meta['district'],
        'assembly': response.meta['assembly'],
        'part': response.meta['part'],
        'data_for_later': response,
    }
    yield scrapy.Request(
        url='https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx',
        callback=self.store_image,
        meta=carried_meta,
    )

这篇关于如何使用Scrapy在一个POST请求上捕获多个响应?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆