在运行scrapy之前插入多个输入字段 [英] insert multiple input fields before running scrapy

查看:47
本文介绍了在运行scrapy之前插入多个输入字段的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在引用一个 stackoverflow

我试图将这个逻辑放在 GUI 上,但我不确定如何去做.

这是目前 gui 的样子.

我希望能够在处理scrapy脚本之前输入用户可以输入所需信息的字段.

这里有一些scrapy脚本

my_spider.py

导入scrapy导入系统随机导入导入 csv从scrape.items 导入项目从 var_dump 导入 var_dumpsearch_item = input("输入搜索项:")location = input("位置:")second_location = input("第二个位置:")第三个位置 = input("第三个位置:")第四个位置 = input("第四个位置:")Fifth_location = input("第五个位置:")Six_location = input("第六个位置:")# 城市 = [#洛杉矶"、芝加哥"、休斯顿"、凤凰城"、费城"、圣安东尼奥"、沃思堡"、#圣地亚哥"、达拉斯"、圣何塞"、奥斯汀"、哥伦布"、印第安纳波利斯"、西雅图"、圣保罗"、纳什维尔"、#路易斯维尔"、普莱诺"#]# rancity = random.choice(city)类 YellowSpider(scrapy.Spider):名称 = "黄色"# start_urls = [# "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location# # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" +third_location,# # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" +fourth_location#]def start_requests(self):yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location, self.parse)yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location, self.parse2)yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" +third_location, self.parse3)yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" +fourth_location, self.parse4)yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + Fifth_location, self.parse5)yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + Six_location, self.parse6)# yield scrapy.Request('http://www.example.com/3.html', self.parse)def __init__(self):self.seen_business_names = []self.seen_phonenumbers = []self.seen_websites = []self.seen_emails = []定义解析(自我,响应):对于 href in response.css('div.v-card a.business-name::attr(href)'):产量 response.follow(href,self.businessprofile)对于 href 在 response.css('div.pagination 
a::attr(href)'):产生 response.follow(href, self.parse)def parse2(self, response):对于 href 在 response.css('div.v-card a.business-name::attr(href)'):产量 response.follow(href, self.businessprofile2)对于 href 在 response.css('div.pagination a::attr(href)'):产生 response.follow(href, self.parse2)def parse3(self, response):对于 href 在 response.css('div.v-card a.business-name::attr(href)'):产量 response.follow(href,self.businessprofile3)对于 href 在 response.css('div.pagination a::attr(href)'):产生 response.follow(href, self.parse3)…………

这是图形界面

main.py

from functools import partial从 PyQt5 导入 QtCore、QtGui、QtWidgets类 ScrapyWorker(QtCore.QObject):logChanged = QtCore.pyqtSignal(str)开始 = QtCore.pyqtSignal()完成 = QtCore.pyqtSignal()def __init__(self, parent=None):super(ScrapyWorker, self).__init__(parent)self._process = QtCore.QProcess(self)self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)self._process.setProgram('scrapy')self._process.started.connect(self.started)self._process.finished.connect(self.finished)定义运行(自我,项目,蜘蛛):self._process.setWorkingDirectory(项目)self._process.setArguments(['crawl', spider])self._process.start()@QtCore.pyqtSlot()def on_readyReadStandardOutput(self):数据 = self._process.readAllStandardOutput().data().decode()self.logChanged.emit(数据)@QtCore.pyqtSlot()定义停止(自我):self._process.kill()def蜘蛛(自我,项目):进程 = QtCore.QProcess()process.setProcessChannelMode(QtCore.QProcess.MergedChannels)process.setWorkingDirectory(项目)循环 = QtCore.QEventLoop()process.finished.connect(loop.quit)process.start('scrapy', ['list'])loop.exec_()返回 process.readAllStandardOutput().data().decode().split()类 MainWindow(QtWidgets.QMainWindow):def __init__(self, parent=None):super(MainWindow, self).__init__(parent)self.project_le = QtWidgets.QLineEdit()self.project_button = QtWidgets.QPushButton('选择项目')self.spider_combobox = QtWidgets.QComboBox()self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)self.text_edit = QtWidgets.QTextBrowser()self.input = QtWidgets.QLineEdit()self.input1 = QtWidgets.QLineEdit()self.input2 = QtWidgets.QLineEdit()self.input3 = QtWidgets.QLineEdit()self.input4 = QtWidgets.QLineEdit()self.input5 = QtWidgets.QLineEdit()self.input6 = QtWidgets.QLineEdit()central_widget = QtWidgets.QWidget()self.setCentralWidget(central_widget)躺 = QtWidgets.QVBoxLayout(central_widget)hlay = QtWidgets.QHBoxLayout()hlay.addWidget(self.project_le)hlay.addWidget(self.project_button)lay.addLayout(hlay)hlay2 = 
QtWidgets.QHBoxLayout()hlay2.addWidget(QtWidgets.QLabel("输入搜索项:"))hlay2.addWidget(self.input, 1)hlay3 = QtWidgets.QHBoxLayout()hlay4 = QtWidgets.QHBoxLayout()hlay5 = QtWidgets.QHBoxLayout()hlay6 = QtWidgets.QHBoxLayout()hlay7 = QtWidgets.QHBoxLayout()hlay8 = QtWidgets.QHBoxLayout()hlay3.addWidget(QtWidgets.QLabel("位置:"))hlay3.addWidget(self.input1, 1)hlay4.addWidget(QtWidgets.QLabel("位置 2 :"))hlay4.addWidget(self.input2, 1)hlay5.addWidget(QtWidgets.QLabel("位置 3 :"))hlay5.addWidget(self.input3, 1)hlay6.addWidget(QtWidgets.QLabel("位置 4 :"))hlay6.addWidget(self.input4, 1)hlay7.addWidget(QtWidgets.QLabel("位置 5 :"))hlay7.addWidget(self.input5, 1)hlay8.addWidget(QtWidgets.QLabel("位置 6 :"))hlay8.addWidget(self.input6, 1)lay.addLayout(hlay2)lay.addLayout(hlay3)lay.addLayout(hlay4)lay.addLayout(hlay5)lay.addLayout(hlay6)lay.addLayout(hlay7)lay.addLayout(hlay8)lay.addWidget(self.start_stop_button)Lay.addWidget(self.text_edit)self.start_stop_button.setEnabled(False)self.scrapy_worker = ScrapyWorker(self)self.scrapy_worker.logChanged.connect(self.insert_log)self.scrapy_worker.started.connect(self.text_edit.clear)self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))self.start_stop_button.toggled.connect(self.on_checked)self.project_button.clicked.connect(self.select_project)self.resize(640, 480)@QtCore.pyqtSlot(bool)def on_checked(self, state):如果状态:文件名 = self.project_le.text()finfo = QtCore.QFileInfo(文件名)目录 = finfo.dir().absolutePath()self.scrapy_worker.run(目录,self.spider_combobox.currentText())self.start_stop_button.setText('停止')别的:self.start_stop_button.setText('开始')self.scrapy_worker.stop()@QtCore.pyqtSlot()def select_project(self):文件名, _ = QtWidgets.QFileDialog.getOpenFileName(自己,"选择 .cfg 文件",QtCore.QDir.currentPath(),配置文件 (*.cfg)")如果文件名:self.project_le.setText(文件名)finfo = QtCore.QFileInfo(文件名)目录 = finfo.dir().absolutePath()蜘蛛 = 
self.scrapy_worker.spiders(目录)self.spider_combobox.clear()self.spider_combobox.addItems(蜘蛛)self.start_stop_button.setEnabled(True if spiders else False)@QtCore.pyqtSlot(str)def insert_log(self, text):prev_cursor = self.text_edit.textCursor()self.text_edit.moveCursor(QtGui.QTextCursor.End)self.text_edit.insertPlainText(文本)self.text_edit.setTextCursor(prev_cursor)如果 __name__ == '__main__':导入系统app = QtWidgets.QApplication(sys.argv)app.setStyle('融合')w = 主窗口()w.show()sys.exit(app.exec_())

解决方案

首先,您必须修改您的蜘蛛程序以直接通过控制台接受参数,避免使用 input() 方法:

yellowpage_spider.py

导入json导入scrapy从scrape.items 导入项目类 YellowSpider(scrapy.Spider):名称 = "黄色"def __init__(self, *args, **kwargs):super(YellowSpider, self).__init__(*args, **kwargs)self.seen_business_names = []self.seen_phonenumbers = []self.seen_websites = []self.seen_emails = []def start_requests(self):如果不是 hasattr(self, 'parameters'):返回参数 = json.loads(self.parameters)search_item = 参数['search_item']位置 = 参数['位置']对于位置中的位置:url = "https://www.yellowpages.com/search?search_terms={}&geo_location_terms={}".format(search_item, location)yield scrapy.Request(url=url, callback=self.parse, meta={'location': location})定义解析(自我,响应):location = response.meta['location']对于 href 在 response.css('div.v-card a.business-name::attr(href)'):yield response.follow(href, self.businessprofile, meta={'location': location})对于 href 在 response.css('div.pagination a::attr(href)'):yield response.follow(href, self.parse, meta={'location': location})def businessprofile(self, response):location = response.meta['location']用于 response.css('header#main-header') 中的业务:项目 = 项目()item['business_name'] = business.css('div.sales-info h1::text').extract()w = business.css('a.secondary-btn.website-link::attr(href)').extract()item['website'] = str(w).strip('[]')项目['位置'] = 位置s = business.css('a.email-business::attr(href)').extract()item['email'] = [item[7:] for item in s]item['phonenumber'] = business.css('p.phone::text').extract_first()对于项目 ['email'] 中的 x:#新代码在这里,调用self.seen_business_names如果 x 不在 self.seen_emails 中:如果项目['电子邮件']:如果项目['电话号码']:如果项目['网站']:self.seen_emails.append(x)产量项目

那么前面的代码需要一个名为parameters的参数:

scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'

所以在 GUI 中,我们现在必须使用 GUI 输入来形成条目:

gui.py

导入操作系统导入json从 functools 导入部分从 PyQt5 导入 QtCore、QtGui、QtWidgets导入实用程序dir_path = os.path.dirname(os.path.abspath(__file__))icon_dir = os.path.join(dir_path, 'assets', 'icons')类 ScrapyWorker(QtCore.QObject):logChanged = QtCore.pyqtSignal(str)开始 = QtCore.pyqtSignal()完成 = QtCore.pyqtSignal()def __init__(self, parent=None):super(ScrapyWorker, self).__init__(parent)self._process = QtCore.QProcess(self)self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)self._process.started.connect(self.started)self._process.finished.connect(self.finished)def run(self, project, program, arguments):self._process.setWorkingDirectory(项目)self._process.setProgram('scrapy')self._process.setArguments(参数)self._process.start()@QtCore.pyqtSlot()def on_readyReadStandardOutput(self):数据 = self._process.readAllStandardOutput().data().decode()self.logChanged.emit(数据)@QtCore.pyqtSlot()定义停止(自我):self._process.kill()类 LocationWidget(QtWidgets.QWidget):def __init__(self, parent=None):super(LocationWidget, self).__init__(parent)self.lay = QtWidgets.QVBoxLayout(self)self.lay.setContentsMargins(0, 0, 0, 0)self.lay.addStretch()self.setContentsMargins(0, 0, 0, 0)self.widgets = []self.create_row()def create_row(self):小部件 = QtWidgets.QWidget()widget.setContentsMargins(0, 0, 0, 0)hlay = QtWidgets.QHBoxLayout(widget)hlay.setContentsMargins(0, 0, 0, 0)lineedit = QtWidgets.QLineEdit()button = QtWidgets.QToolButton(clicked=self.on_cliled)button.setFocusPolicy(QtCore.Qt.NoFocus)hlay.addWidget(lineedit)hlay.addWidget(按钮)button.setIconSize(QtCore.QSize(24, 24))button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))self.widgets.append(widget)self.lay.insertWidget(-1, 小部件)@QtCore.pyqtSlot()def on_cliled(self):按钮 = self.sender()小部件 = button.parentWidget()如果 self.lay.indexOf(widget) == (self.lay.count()-1):self.create_row()别的:self.lay.removeWidget(小部件)widget.deleteLater()self.widgets.remove(widget)对于 self.widgets 中的小部件:button = 
widget.findChild(QtWidgets.QToolButton)button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'remove.png')))self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))def get_locations(self):位置 = []对于 self.widgets 中的小部件:le = widget.findChild(QtWidgets.QLineEdit)如果 le.text():location.append(le.text())返回地点类 YellowWidget(QtWidgets.QMainWindow):def __init__(self, parent=None):super(YellowWidget, self).__init__(parent)self.setWindowTitle('黄页刮板')self.scrapy_worker = ScrapyWorker(self)self.search_item_le = QtWidgets.QLineEdit()self.location_widget = LocationWidget()self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)self.text_edit = QtWidgets.QTextBrowser()central_widget = QtWidgets.QWidget()self.setCentralWidget(central_widget)躺 = QtWidgets.QGridLayout(central_widget)lay.addWidget(QtWidgets.QLabel("<b>Search:</b>"), 0, 0)Lay.addWidget(self.search_item_le, 0, 1)lay.addWidget(QtWidgets.QLabel("<b>Locations:</b>"), 1, 0, 对齐=QtCore.Qt.AlignTop|QtCore.Qt.AlignLeft)Lay.addWidget(self.location_widget, 1, 1, 对齐=QtCore.Qt.AlignTop)lay.addWidget(self.start_stop_button, 2, 0, 1, 2)Lay.addWidget(self.text_edit, 3, 0, 1, 2)self.start_stop_button.toggled.connect(self.on_checked)self.scrapy_worker.logChanged.connect(self.insert_log)self.scrapy_worker.started.connect(self.text_edit.clear)self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))@QtCore.pyqtSlot(bool)def on_checked(self, state):如果状态:# crapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'search_item = self.search_item_le.text()位置 = self.location_widget.get_locations()目录,程序,args = utils.create_arguments(search_item,locations)self.scrapy_worker.run(目录,程序,参数)self.start_stop_button.setText('停止')别的:self.start_stop_button.setText('开始')self.scrapy_worker.stop()@QtCore.pyqtSlot(str)def insert_log(self, text):prev_cursor = 
self.text_edit.textCursor()self.text_edit.moveCursor(QtGui.QTextCursor.End)self.text_edit.insertPlainText(文本)self.text_edit.setTextCursor(prev_cursor)如果 __name__ == '__main__':导入系统app = QtWidgets.QApplication(sys.argv)app.setStyle('融合')w = YellowWidget()w.resize(640, 480)w.show()sys.exit(app.exec_())

我使用了 utils.py 文件中的一个函数:

导入操作系统导入jsondef create_arguments(search_item, location):程序 = 'scrapy'dir_path = os.path.dirname(os.path.abspath(__file__))directory = os.path.join(dir_path, 'scrape')d = {search_item":search_item,位置":位置}参数 = '参数 ={}'.format(json.dumps(d))返回目录、程序、['crawl'、'yellow'、-a"、参数]

获得以下内容:

完整的项目在这里.

I'm referencing a stackoverflow answer that is similar to my GUI app. My scrapy application is a bit different. When executing the app, the user is prompted to enter keywords for scrapy to search for

looks like this

I'm trying to put this logic in the GUI, but I'm unsure how to do it.

here is what the gui looks like as of now.

I want to be able to add input fields where a user can enter the information needed before running the scrapy script.

here is a bit of the scrapy script

my_spider.py

import scrapy
import sys
import random
import csv
from scrape.items import Item
from var_dump import var_dump


# User-supplied search term and up to six locations.
# NOTE(review): these input() calls run at import time, so any tool that
# merely imports this module (e.g. `scrapy list` or `scrapy crawl`) blocks
# waiting on stdin — this is the behaviour the question wants to replace
# with GUI fields.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second Location:")
third_location = input("Third Location:")
fourth_location = input("Fourth Location:")
fifth_location = input("Fifth Location:")
sixth_location = input("Sixth Location:")

# city = [
#     "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth", 
#     "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis",  "Seattle", "St. Paul", "Nashville", 
#     "Louisville", "Plano"
# ]

# rancity = random.choice(city)


class YellowSpider(scrapy.Spider):
    """Yellow Pages spider whose search URLs come from module-level input() answers."""

    name = "yellow"

    # start_urls = [
    #     "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    # ]

    def start_requests(self):
        """Yield one search request per user-supplied location.

        Each location gets its own dedicated parse callback (parse .. parse6),
        mirroring the six interactive prompts at module level.
        """
        targets = [
            (location, self.parse),
            (second_location, self.parse2),
            (third_location, self.parse3),
            (fourth_location, self.parse4),
            (fifth_location, self.parse5),
            (sixth_location, self.parse6),
        ]
        for place, callback in targets:
            url = "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + place
            yield scrapy.Request(url, callback)

    def __init__(self, *args, **kwargs):
        """Initialize per-run de-duplication state.

        Fix: forward construction arguments to scrapy.Spider.__init__.
        The original override accepted no arguments and never called
        super().__init__(), so passing spider arguments on the command
        line (`scrapy crawl yellow -a key=value`) would raise a
        TypeError. Accepting *args/**kwargs keeps the old zero-argument
        call working while restoring the base-class behaviour.
        """
        super(YellowSpider, self).__init__(*args, **kwargs)
        # Values already emitted, used to avoid duplicate items.
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def parse(self, response):
        """Follow every business-profile link, then paginate with this callback."""
        link_targets = (
            ('div.v-card a.business-name::attr(href)', self.businessprofile),
            ('div.pagination a::attr(href)', self.parse),
        )
        for selector, callback in link_targets:
            for link in response.css(selector):
                yield response.follow(link, callback)

    def parse2(self, response):
        """Follow every business-profile link, then paginate with this callback."""
        link_targets = (
            ('div.v-card a.business-name::attr(href)', self.businessprofile2),
            ('div.pagination a::attr(href)', self.parse2),
        )
        for selector, callback in link_targets:
            for link in response.css(selector):
                yield response.follow(link, callback)

    def parse3(self, response):
        """Follow every business-profile link, then paginate with this callback."""
        link_targets = (
            ('div.v-card a.business-name::attr(href)', self.businessprofile3),
            ('div.pagination a::attr(href)', self.parse3),
        )
        for selector, callback in link_targets:
            for link in response.css(selector):
                yield response.follow(link, callback)
        ........

here is the GUI

main.py

from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets

class ScrapyWorker(QtCore.QObject):
    """Run `scrapy crawl <spider>` in a QProcess and relay its output via signals."""

    logChanged = QtCore.pyqtSignal(str)  # decoded chunk of merged stdout/stderr
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so a single stream carries the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start `scrapy crawl <spider>` using `project` as the working directory."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        # Forward whatever the process printed since the previous read.
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the crawl process immediately."""
        self._process.kill()

    def spiders(self, project):
        """Return the spider names reported by `scrapy list` for the project.

        Blocks with a local event loop until the subprocess finishes.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        process.start('scrapy', ['list'])
        loop.exec_()
        return process.readAllStandardOutput().data().decode().split()

class MainWindow(QtWidgets.QMainWindow):
    """Scrapy project runner window.

    NOTE(review): the search/location QLineEdits below are displayed but
    their text is never read or passed to the spider — wiring them up is
    exactly what this question asks about.
    """

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)

        # Project picker, spider selector, crawl toggle and log view.
        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        # Free-text inputs (search item plus six locations); currently unused.
        self.input = QtWidgets.QLineEdit()
        self.input1 = QtWidgets.QLineEdit()
        self.input2 = QtWidgets.QLineEdit()
        self.input3 = QtWidgets.QLineEdit()
        self.input4 = QtWidgets.QLineEdit()
        self.input5 = QtWidgets.QLineEdit()
        self.input6 = QtWidgets.QLineEdit()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)

        # Vertical layout: project row, labelled input rows, toggle, log view.
        lay = QtWidgets.QVBoxLayout(central_widget)
        hlay = QtWidgets.QHBoxLayout()
        hlay.addWidget(self.project_le)
        hlay.addWidget(self.project_button)
        lay.addLayout(hlay)
        hlay2 = QtWidgets.QHBoxLayout()
        hlay2.addWidget(QtWidgets.QLabel("Input The Search Item :"))
        hlay2.addWidget(self.input, 1)
        hlay3 = QtWidgets.QHBoxLayout()
        hlay4 = QtWidgets.QHBoxLayout()
        hlay5 = QtWidgets.QHBoxLayout()
        hlay6 = QtWidgets.QHBoxLayout()
        hlay7 = QtWidgets.QHBoxLayout()
        hlay8 = QtWidgets.QHBoxLayout()
        hlay3.addWidget(QtWidgets.QLabel("Location :"))
        hlay3.addWidget(self.input1, 1 )
        hlay4.addWidget(QtWidgets.QLabel("Location 2 :"))
        hlay4.addWidget(self.input2, 1 )
        hlay5.addWidget(QtWidgets.QLabel("Location 3 :"))
        hlay5.addWidget(self.input3, 1 )
        hlay6.addWidget(QtWidgets.QLabel("Location 4 :"))
        hlay6.addWidget(self.input4, 1 )
        hlay7.addWidget(QtWidgets.QLabel("Location 5 :"))
        hlay7.addWidget(self.input5, 1 )
        hlay8.addWidget(QtWidgets.QLabel("Location 6 :"))
        hlay8.addWidget(self.input6, 1 )
        lay.addLayout(hlay2)
        lay.addLayout(hlay3)
        lay.addLayout(hlay4)
        lay.addLayout(hlay5)
        lay.addLayout(hlay6)
        lay.addLayout(hlay7)
        lay.addLayout(hlay8)
        lay.addWidget(self.start_stop_button)
        lay.addWidget(self.text_edit)

        # Disabled until a project with at least one spider has been selected.
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Un-check (and so re-label) the toggle when the crawl ends by itself.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start the selected spider when checked; kill the crawl when unchecked."""
        if state:
            filename = self.project_le.text()
            finfo = QtCore.QFileInfo(filename)
            # The project directory is wherever the selected scrapy.cfg lives.
            directory = finfo.dir().absolutePath()
            self.scrapy_worker.run(directory, self.spider_combobox.currentText())
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot()
    def select_project(self):
        """Pick a scrapy.cfg file and populate the spider combo box from it."""
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if filename:
            self.project_le.setText(filename)
            finfo = QtCore.QFileInfo(filename)
            directory = finfo.dir().absolutePath()
            spiders = self.scrapy_worker.spiders(directory)
            self.spider_combobox.clear()
            self.spider_combobox.addItems(spiders)
            self.start_stop_button.setEnabled(True if spiders else False)

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append text at the end without disturbing the user's cursor position."""
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)

if __name__ == '__main__':
    # Launch the Qt application with the Fusion look-and-feel.
    import sys

    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = MainWindow()
    window.show()
    sys.exit(application.exec_())

解决方案

First, you have to modify your spider to accept arguments directly from the command line, avoiding the input() method:

yellowpage_spider.py

import json
import scrapy
from scrape.items import Item

class YellowSpider(scrapy.Spider):
    """Yellow Pages spider driven by a JSON `parameters` crawl argument.

    Expected usage:
        scrapy crawl yellow -a parameters='{"search_item": ..., "locations": [...]}'
    """

    name = "yellow"

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        # De-duplication bookkeeping, filled in while scraping.
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        # `parameters` is only set when passed via `-a`; nothing to do otherwise.
        if not hasattr(self, 'parameters'):
            return
        config = json.loads(self.parameters)
        term = config['search_item']
        for place in config['locations']:
            url = "https://www.yellowpages.com/search?search_terms={}&geo_location_terms={}".format(term, place)
            yield scrapy.Request(url=url, callback=self.parse, meta={'location': place})

    def parse(self, response):
        """Follow business-profile links, then paginate, carrying the location along."""
        where = response.meta['location']
        link_targets = (
            ('div.v-card a.business-name::attr(href)', self.businessprofile),
            ('div.pagination a::attr(href)', self.parse),
        )
        for selector, callback in link_targets:
            for href in response.css(selector):
                yield response.follow(href, callback, meta={'location': where})

    def businessprofile(self, response):
        """Extract one Item per profile page; yield only complete, unseen records."""
        where = response.meta['location']
        for header in response.css('header#main-header'):
            record = Item()
            record['business_name'] = header.css('div.sales-info h1::text').extract()
            site = header.css('a.secondary-btn.website-link::attr(href)').extract()

            record['website'] = str(site).strip('[]')

            record['location'] = where

            mailto_links = header.css('a.email-business::attr(href)').extract()
            # Strip the leading "mailto:" (7 characters) from each link.
            record['email'] = [link[7:] for link in mailto_links]

            record['phonenumber'] = header.css('p.phone::text').extract_first()
            for address in record['email']:
                # Same short-circuit order as the original nested ifs.
                if (address not in self.seen_emails and record['email']
                        and record['phonenumber'] and record['website']):
                    self.seen_emails.append(address)
                    yield record

Then the previous code expects a parameter called parameters:

scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'

So in the GUI we must now form the entry using the GUI inputs:

gui.py

import os
import json
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
import utils


# Locations of this script and its icon assets, resolved relative to the file.
dir_path = os.path.dirname(os.path.abspath(__file__))
icons_dir = os.path.join(dir_path, 'assets', 'icons')


class ScrapyWorker(QtCore.QObject):
    """Drive an external crawl process and relay its merged output via signals."""

    logChanged = QtCore.pyqtSignal(str)  # decoded chunk of merged stdout/stderr
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so a single stream carries the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, program, arguments):
        """Start `program arguments...` with `project` as the working directory.

        Fix: the original ignored the `program` parameter and hard-coded
        'scrapy'; the caller-supplied program is now actually used
        (existing callers pass 'scrapy', so behaviour is unchanged for them).
        """
        self._process.setWorkingDirectory(project)
        self._process.setProgram(program)
        self._process.setArguments(arguments)
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        # Forward whatever the process printed since the previous read.
        data = self._process.readAllStandardOutput().data().decode()
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the crawl process immediately."""
        self._process.kill()

class LocationWidget(QtWidgets.QWidget):
    """Editable column of location rows.

    Each row is a QLineEdit plus a tool button: the last row's button adds
    a new row ('add' icon), every other row's button removes its own row
    ('remove' icon).
    """

    def __init__(self, parent=None):
        super(LocationWidget, self).__init__(parent)
        self.lay = QtWidgets.QVBoxLayout(self)
        self.lay.setContentsMargins(0, 0, 0, 0)
        self.lay.addStretch()
        self.setContentsMargins(0, 0, 0, 0)
        self.widgets = []  # row container widgets, in display order
        self.create_row()

    def create_row(self):
        """Append one (line edit + button) row; the new row gets the 'add' icon."""
        widget = QtWidgets.QWidget()
        widget.setContentsMargins(0, 0, 0, 0)
        hlay = QtWidgets.QHBoxLayout(widget)
        hlay.setContentsMargins(0, 0, 0, 0)
        lineedit = QtWidgets.QLineEdit()
        # Fix: slot renamed from the original misspelling `on_clicled`.
        button = QtWidgets.QToolButton(clicked=self.on_clicked)
        button.setFocusPolicy(QtCore.Qt.NoFocus)
        hlay.addWidget(lineedit)
        hlay.addWidget(button)
        button.setIconSize(QtCore.QSize(24, 24))
        button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))
        self.widgets.append(widget)
        self.lay.insertWidget(-1, widget)

    @QtCore.pyqtSlot()
    def on_clicked(self):
        """Add a row when the last row's button is pressed; otherwise remove that row."""
        button = self.sender()
        widget = button.parentWidget()
        if self.lay.indexOf(widget) == (self.lay.count()-1):
            self.create_row()
        else:
            self.lay.removeWidget(widget)
            widget.deleteLater()
            self.widgets.remove(widget)
        # Re-assign icons: every row shows 'remove', then the last row shows 'add'.
        for row in self.widgets:
            row_button = row.findChild(QtWidgets.QToolButton)
            row_button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'remove.png')))
        self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

    def get_locations(self):
        """Return the non-empty texts of all rows, in display order."""
        locations = []
        for widget in self.widgets:
            le = widget.findChild(QtWidgets.QLineEdit)
            if le.text():
                locations.append(le.text())
        return locations

class YellowWidget(QtWidgets.QMainWindow):
    """Main window: search field, dynamic location rows, start/stop toggle and log."""

    def __init__(self, parent=None):
        super(YellowWidget, self).__init__(parent)
        self.setWindowTitle('Yellow Pages Scrapper')
        self.scrapy_worker = ScrapyWorker(self)
        self.search_item_le = QtWidgets.QLineEdit()
        self.location_widget = LocationWidget()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()

        # Grid layout: search row, locations row, toggle, log view.
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QGridLayout(central_widget)
        lay.addWidget(QtWidgets.QLabel("<b>Search:</b>"), 0, 0)
        lay.addWidget(self.search_item_le, 0, 1)
        lay.addWidget(QtWidgets.QLabel("<b>Locations:</b>"), 1, 0, alignment=QtCore.Qt.AlignTop|QtCore.Qt.AlignLeft)
        lay.addWidget(self.location_widget, 1, 1, alignment=QtCore.Qt.AlignTop)
        lay.addWidget(self.start_stop_button, 2, 0, 1, 2)
        lay.addWidget(self.text_edit, 3, 0, 1, 2)

        self.start_stop_button.toggled.connect(self.on_checked)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        # Reset the toggle when the crawl process finishes on its own.
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Launch the crawl on check; kill it on uncheck."""
        if state:
            # Equivalent command line:
            # scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'
            search_item = self.search_item_le.text()
            locations = self.location_widget.get_locations()
            directory, program, args = utils.create_arguments(search_item, locations)
            self.scrapy_worker.run(directory, program, args)
            self.start_stop_button.setText('Stop')
        else:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append log output while preserving the user's cursor position."""
        prev_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(prev_cursor)

if __name__ == '__main__':
    # Launch the Qt application with the Fusion look-and-feel.
    import sys

    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = YellowWidget()
    window.resize(640, 480)
    window.show()
    sys.exit(application.exec_())

I used a function that is in the utils.py file:

import os
import json

def create_arguments(search_item, locations):
    """Build (working_dir, program, argv) for launching the yellow spider.

    The search term and location list are packed into a JSON string that is
    handed to scrapy as a spider argument via `-a parameters=...`.
    """
    program = 'scrapy'
    here = os.path.dirname(os.path.abspath(__file__))
    directory = os.path.join(here, 'scrape')
    payload = json.dumps({"search_item": search_item, "locations": locations})
    argument = 'parameters={}'.format(payload)
    return directory, program, ['crawl', 'yellow', "-a", argument]

Obtaining the following:

The complete project is here.

这篇关于在运行scrapy之前插入多个输入字段的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆