如何将抓取的项目放入 Pyqt5 小部件中? [英] How to get scraped items into Pyqt5 widget?

查看:61
本文介绍了如何将抓取的项目放入 Pyqt5 小部件中?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试为 Scrapy 爬虫制作一个简单的 GUI,用户可以在其中按下开始按钮来运行抓取并在 textBrowser(或其他 qt 小部件,请告知)中查看抓取的结果.

我的蜘蛛:

import scrapy, json


class CarSpider(scrapy.Spider):
    name = 'car'
    start_urls = ["https://www.target-website.com/"]

    def parse(self, response):
        """some code """
        yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        items = json.loads(response.body_as_unicode())['items']
        for i in items:
            ...
            scraped_item = {
                'Make': make,
                'Model': model,
                'Year': year,
            }
            yield scraped_item

应用设计是在 Qt Designer 中完成的:

界面:

from PyQt5 import QtCore, QtGui, QtWidgets类 Ui_MainWindow(对象):def setupUi(self, MainWindow):MainWindow.setObjectName("MainWindow")MainWindow.resize(801, 612)sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)sizePolicy.setHorizo​​ntalStretch(0)sizePolicy.setVerticalStretch(0)sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())MainWindow.setSizePolicy(sizePolicy)self.centralwidget = QtWidgets.QWidget(MainWindow)sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)sizePolicy.setHorizo​​ntalStretch(0)sizePolicy.setVerticalStretch(0)sizePolicy.setHeightForWidth(self.centralwidget.sizePolicy().hasHeightForWidth())self.centralwidget.setSizePolicy(sizePolicy)self.centralwidget.setObjectName("centralwidget")self.pushButton = QtWidgets.QPushButton(self.centralwidget)self.pushButton.setGeometry(QtCore.QRect(10, 10, 211, 41))self.pushButton.setObjectName("pushButton")self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)self.pushButton_2.setEnabled(False)self.pushButton_2.setGeometry(QtCore.QRect(10, 60, 211, 41))self.pushButton_2.setObjectName(pushButton_2")self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)self.textBrowser.setGeometry(QtCore.QRect(240, 10, 551, 571))self.textBrowser.setObjectName("textBrowser")MainWindow.setCentralWidget(self.centralwidget)self.statusbar = QtWidgets.QStatusBar(MainWindow)self.statusbar.setObjectName("statusbar")MainWindow.setStatusBar(self.statusbar)self.retranslateUi(MainWindow)QtCore.QMetaObject.connectSlotsByName(MainWindow)def retranslateUi(self, MainWindow):_translate = QtCore.QCoreApplication.translateMainWindow.setWindowTitle(_translate(MainWindow", MainWindow"))self.pushButton.setText(_translate("MainWindow", "Run Scraper"))self.pushButton_2.setText(_translate(MainWindow", Stop"))

这是我尝试编写的用于处理数据的代码:

data_handler.py:

from PyQt5 import QtWidgets从 PyQt5.QtCore 导入 pyqtSignal, QThread从 my_gui 导入 Ui_MainWindow导入系统,操作系统导入子流程类蜘蛛线程(QThread):信号 = pyqtSignal()output_signal = pyqtSignal('PyQt_PyObject')def __init__(self):QThread.__init__(self)def __del__(self):self.wait()定义运行(自我):如果 os.path.exists('result.csv'):os.remove('result.csv')cmd =爬行车"proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)self.proc_id = proc.pid打印(self.proc_id)out = proc.communicate()对于线路输入:self.output_signal.emit(line)self.signal.emit()类 mywindow(QtWidgets.QMainWindow):def __init__(self):super(mywindow, self).__init__()self.ui = Ui_MainWindow()self.ui.setupUi(self)self.ui.pushButton.clicked.connect(self.slot_method)self.crawling_thread = SpiderThread()self.crawling_thread.signal.connect(self.finished)self.crawling_thread.output_signal.connect(self.update_text)self.ui.pushButton_2.clicked.connect(self.stop)def slot_method(self):self.ui.pushButton.setEnabled(False)self.ui.pushButton_2.setEnabled(True)self.ui.textBrowser.setText('')self.ui.textBrowser.append('开始抓取...')self.crawling_thread.start()定义完成(自我):self.ui.textBrowser.append('finished scraping') # 向用户显示输出self.ui.pushButton.setEnabled(True) # 启用按钮self.ui.pushButton_2.setEnabled(False)def update_text(self, 信号):self.ui.textBrowser.append(signal.decode(utf-8"))定义停止(自我):打印(self.crawling_thread.proc_id)os.kill(self.crawling_thread.proc_id)self.ui.textBrowser.append('抓取停止...')self.ui.pushButton.setEnabled(True) # 启用按钮self.ui.pushButton_2.setEnabled(False)定义主():app = QtWidgets.QApplication([])应用程序 = mywindow()应用程序.show()sys.exit(app.exec())如果 __name__ == '__main__':主要的()

使用此代码,我只能以文本形式获取 stdout,并且只有在抓取完成之后才能把它放入 textBrowser 中.如果抓取需要 20-30 分钟,这期间我在 textBrowser 中看不到任何变化.有没有办法获取抓取到的项目并实时显示它们?另外,是否有办法用第二个按钮停止/暂停抓取过程?

解决方案

你应该使用 QProcess,而不是 subprocess.Popen() + QThread,因为 QProcess 会通过信号通知你,这样任务更容易实现.

我创建了一个应用程序,它会扫描项目中的所有蜘蛛并将它们显示在 QComboBox 中,你可以在其中选择要运行的蜘蛛;另外还有一个按钮用于启动或停止抓取,抓取日志会显示在 QTextBrowser 中.

假设 scrapy 项目具有如下结构(该项目是 scrapy 的一个示例,可以在这里找到):

I'm trying to make a simple GUI for Scrapy crawler, where user can push the Start button to run scraping and see the scraped results in textBrowser (or other qt widget, please advise).

My spider:

import scrapy, json


class CarSpider(scrapy.Spider):
    """Spider named ``car`` that yields one dict per scraped car.

    ``parse`` schedules a follow-up request whose JSON response is handled
    by ``parse_page``.
    """

    name = 'car'
    start_urls = ["https://www.target-website.com/"]

    def parse(self, response):
        """Schedule the request whose response is handled by ``parse_page``."""
        # NOTE(review): the code that derives ``url`` from ``response`` is
        # elided in the original snippet ("some code") and must be filled in.
        yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        """Yield a Make/Model/Year dict for each entry of the JSON ``items`` list."""
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        items = json.loads(response.text)['items']
        for i in items:
            # NOTE(review): extraction of make/model/year from ``i`` is
            # elided in the original snippet.
            ...
            scraped_item = {
                'Make': make,
                'Model': model,
                'Year': year,
            }
            yield scraped_item

The app design was made in Qt Designer:

GUI:

from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """UI definition: two push buttons, a text browser and a status bar."""

    def setupUi(self, MainWindow):
        """Build the child widgets on ``MainWindow`` and apply their texts."""
        fixed = QtWidgets.QSizePolicy.Fixed

        # Main window: fixed size policy in both directions.
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(801, 612)
        window_policy = QtWidgets.QSizePolicy(fixed, fixed)
        window_policy.setHorizontalStretch(0)
        window_policy.setVerticalStretch(0)
        window_policy.setHeightForWidth(
            MainWindow.sizePolicy().hasHeightForWidth()
        )
        MainWindow.setSizePolicy(window_policy)

        # Central widget: same fixed policy, parent of all controls below.
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        central_policy = QtWidgets.QSizePolicy(fixed, fixed)
        central_policy.setHorizontalStretch(0)
        central_policy.setVerticalStretch(0)
        central_policy.setHeightForWidth(
            self.centralwidget.sizePolicy().hasHeightForWidth()
        )
        self.centralwidget.setSizePolicy(central_policy)
        self.centralwidget.setObjectName("centralwidget")

        # First button (labelled "Run Scraper" in retranslateUi).
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setObjectName("pushButton")
        self.pushButton.setGeometry(QtCore.QRect(10, 10, 211, 41))

        # Second button (labelled "Stop"); starts out disabled.
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_2.setGeometry(QtCore.QRect(10, 60, 211, 41))
        self.pushButton_2.setEnabled(False)

        # Text area to the right of the buttons.
        self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
        self.textBrowser.setObjectName("textBrowser")
        self.textBrowser.setGeometry(QtCore.QRect(240, 10, 551, 571))

        MainWindow.setCentralWidget(self.centralwidget)

        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the button captions through Qt's translate."""
        tr = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(tr("MainWindow", "MainWindow"))
        self.pushButton.setText(tr("MainWindow", "Run Scraper"))
        self.pushButton_2.setText(tr("MainWindow", "Stop"))

And here is the code that I tried to make to handle data:

data_handler.py:

from PyQt5 import QtWidgets
from PyQt5.QtCore import pyqtSignal, QThread
from my_gui import Ui_MainWindow 
import sys, os 
import subprocess


class SpiderThread(QThread):
    """Worker thread that runs ``scrapy crawl car`` and streams its output.

    Signals:
        signal: emitted once after the crawler process has exited.
        output_signal: emitted with one ``bytes`` line of crawler output
            (line ending stripped) as soon as the line is produced.

    Attributes:
        proc_id: PID of the running crawler process, or ``None`` while no
            crawl is in progress.
    """

    signal = pyqtSignal()
    output_signal = pyqtSignal('PyQt_PyObject')

    def __init__(self):
        QThread.__init__(self)
        # Defined up front so readers never hit AttributeError before run().
        self.proc_id = None

    def __del__(self):
        self.wait()

    def run(self):
        """Start the crawler and forward its output line by line."""
        # Remove the previous result file; a missing file is fine.
        try:
            os.remove('result.csv')
        except FileNotFoundError:
            pass

        cmd = "scrapy crawl car"
        proc = subprocess.Popen(
            cmd.split(),
            stdout=subprocess.PIPE,
            # Scrapy writes its log to stderr; merge it into stdout so a
            # single pipe can be read without risking a deadlock.
            stderr=subprocess.STDOUT,
            # Nothing is ever written to the child, so no stdin pipe is kept.
            stdin=subprocess.DEVNULL,
        )
        self.proc_id = proc.pid
        print(self.proc_id)
        try:
            # readline() returns as soon as a line is available, unlike
            # communicate(), which only returns after the process exits.
            for line in iter(proc.stdout.readline, b''):
                self.output_signal.emit(line.rstrip(b'\r\n'))
        finally:
            proc.stdout.close()
            proc.wait()
            # The PID is no longer ours once the process has been reaped.
            self.proc_id = None
        self.signal.emit()


class mywindow(QtWidgets.QMainWindow):
    """Main window: starts/stops the crawl thread and shows its output."""

    def __init__(self):
        super(mywindow, self).__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.ui.pushButton.clicked.connect(self.slot_method)

        self.crawling_thread = SpiderThread()
        self.crawling_thread.signal.connect(self.finished)
        self.crawling_thread.output_signal.connect(self.update_text)
        self.ui.pushButton_2.clicked.connect(self.stop)

    def slot_method(self):
        """Handle a click on the run button: reset the view and start the thread."""
        self.ui.pushButton.setEnabled(False)
        self.ui.pushButton_2.setEnabled(True)
        self.ui.textBrowser.setText('')
        self.ui.textBrowser.append('started scraping...')
        self.crawling_thread.start()

    def finished(self):
        """Handle the thread's completion signal: report it and reset the buttons."""
        self.ui.textBrowser.append('finished scraping')  # Show the output to the user
        self.ui.pushButton.setEnabled(True)  # Enable the pushButton
        self.ui.pushButton_2.setEnabled(False)

    def update_text(self, signal):
        """Append crawler output to the text browser.

        Args:
            signal: raw ``bytes`` emitted by the thread's ``output_signal``.
        """
        # errors="replace" keeps one undecodable byte from raising in the slot.
        self.ui.textBrowser.append(signal.decode("utf-8", errors="replace"))

    def stop(self):
        """Handle a click on the stop button: terminate the crawler process."""
        # Imported locally because the file has no top-level ``signal``
        # import (and ``signal`` is also used as a parameter name above).
        import signal

        # The thread may not have started a process yet.
        pid = getattr(self.crawling_thread, 'proc_id', None)
        print(pid)
        if pid is not None:
            try:
                # os.kill() requires the signal number as second argument.
                os.kill(pid, signal.SIGTERM)
            except ProcessLookupError:
                # The process already exited; nothing left to stop.
                pass
        self.ui.textBrowser.append('Scraping stopped...')
        self.ui.pushButton.setEnabled(True)  # Enable the pushButton
        self.ui.pushButton_2.setEnabled(False)


def main():
    """Create the Qt application, show the main window and run the event loop."""
    qt_app = QtWidgets.QApplication([])
    window = mywindow()
    window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)


# Run the GUI only when executed as a script.
if __name__ == '__main__':
    main()

With this code I can only get the stdout as text and place it in the textBrowser after the scraping has finished. If scraping takes 20-30 minutes, I cannot see any changes in the textBrowser in the meantime. Is there any way to get the scraped items and display them in real time? And is there perhaps a way to stop/pause the scraping process with a second button?

解决方案

Instead of using subprocess.Popen() + QThread you should use QProcess, since it makes the task easier by notifying you through signals.

I have created an application that scans all the spiders within a project and shows them in a QComboBox, where you can select which spider you want to run. There is also a button that lets you start or stop the crawl, and the log is displayed in a QTextBrowser.

Assuming that the scrapy project has the following structure (the project is an example of scrapy, you can find it here):

tutorial
├── scrapy.cfg
└── tutorial
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── toscrape-css.py
        └── toscrape-xpath.py

The user has to select the .cfg file, which makes the available spiders appear in the combo box, and then press the start-stop button as desired.

from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets

class ScrapyWorker(QtCore.QObject):
    """Runs ``scrapy`` commands through QProcess and reports via signals.

    Signals:
        logChanged(str): a chunk of merged stdout/stderr text from the crawl.
        started: forwarded from the crawl process's ``started`` signal.
        finished: forwarded from the crawl process's ``finished`` signal.
    """

    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        # Merge stderr into stdout so a single read returns the whole log.
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start ``scrapy crawl <spider>`` with ``project`` as working directory."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Read the pending process output and emit it through ``logChanged``."""
        raw = self._process.readAllStandardOutput().data()
        # A chunk can end in the middle of a multi-byte character (or hold
        # non-UTF-8 bytes); replace instead of raising inside the slot.
        self.logChanged.emit(raw.decode(errors='replace'))

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the crawl process."""
        self._process.kill()

    def spiders(self, project):
        """Return the spider names printed by ``scrapy list`` in ``project``.

        Waits in a local event loop until the command ends. Returns an
        empty list when the command cannot be started or exits with a
        failure, so that error text is never mistaken for spider names.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        # finished() is not emitted when the program fails to start;
        # without this connection the loop below would never return.
        process.errorOccurred.connect(loop.quit)
        process.start('scrapy', ['list'])
        # If start() already failed, quit() was called before the loop was
        # running and had no effect, so only wait while the process is alive.
        if process.state() != QtCore.QProcess.NotRunning:
            loop.exec_()
        if (process.exitStatus() != QtCore.QProcess.NormalExit
                or process.exitCode() != 0):
            return []
        return process.readAllStandardOutput().data().decode(errors='replace').split()

class MainWindow(QtWidgets.QMainWindow):
    """Window to pick a Scrapy project and spider, run it and view its log."""

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)

        # Controls (created in the same order they appear in the window).
        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        container = QtWidgets.QWidget()
        self.setCentralWidget(container)

        # Layout: project row, spider row, start/stop button, log view.
        root_layout = QtWidgets.QVBoxLayout(container)
        project_row = QtWidgets.QHBoxLayout()
        project_row.addWidget(self.project_le)
        project_row.addWidget(self.project_button)
        root_layout.addLayout(project_row)
        spider_row = QtWidgets.QHBoxLayout()
        spider_row.addWidget(QtWidgets.QLabel("spiders:"))
        spider_row.addWidget(self.spider_combobox, 1)
        root_layout.addLayout(spider_row)
        root_layout.addWidget(self.start_stop_button)
        root_layout.addWidget(self.text_edit)

        # Nothing can be started until select_project() finds spiders.
        self.start_stop_button.setEnabled(False)
        self.resize(640, 480)

        # Worker wiring: log chunks go to the view, a new run clears it,
        # and the end of a run un-checks the toggle button.
        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        self.scrapy_worker.finished.connect(
            partial(self.start_stop_button.setChecked, False)
        )

        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start the selected spider when checked, stop the worker when unchecked."""
        if not state:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()
            return

        cfg_path = self.project_le.text()
        # The worker runs in the directory that contains the selected file.
        project_dir = QtCore.QFileInfo(cfg_path).dir().absolutePath()
        spider_name = self.spider_combobox.currentText()
        self.scrapy_worker.run(project_dir, spider_name)
        self.start_stop_button.setText('Stop')

    @QtCore.pyqtSlot()
    def select_project(self):
        """Ask for a .cfg file and fill the combo box with that project's spiders."""
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if not filename:
            # Empty filename: no file was chosen, leave everything as is.
            return

        self.project_le.setText(filename)
        project_dir = QtCore.QFileInfo(filename).dir().absolutePath()
        found = self.scrapy_worker.spiders(project_dir)
        self.spider_combobox.clear()
        self.spider_combobox.addItems(found)
        self.start_stop_button.setEnabled(bool(found))

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Insert ``text`` at the end of the log, then restore the previous cursor."""
        saved_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(saved_cursor)

# Script entry point: build the application, show the window, run the loop.
if __name__ == '__main__':
    import sys

    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    main_window = MainWindow()
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)

Output:

这篇关于如何将抓取的项目放入 Pyqt5 小部件中?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆