如何将抓取的项目放入 Pyqt5 小部件中? [英] How to get scraped items into Pyqt5 widget?
问题描述
我正在尝试为 Scrapy 爬虫制作一个简单的 GUI,用户可以在其中按下开始按钮来运行抓取并在 textBrowser(或其他 qt 小部件,请告知)中查看抓取的结果.
我的蜘蛛:
导入scrapy,json类 CarSpider(scrapy.Spider):名称 = '汽车'start_urls = [https://www.target-website.com/"]定义解析(自我,响应):"一些代码""yield scrapy.Request(url=url, callback=self.parse_page)def parse_page(self, response):items = json.loads(response.body_as_unicode())['items']对于 i 在项目中:...scraped_item = {'制作':制作,'模型':模型,'年':年,}产出scraped_item
应用设计是在 Qt Designer 中完成的:
界面:
from PyQt5 import QtCore, QtGui, QtWidgets类 Ui_MainWindow(对象):def setupUi(self, MainWindow):MainWindow.setObjectName("MainWindow")MainWindow.resize(801, 612)sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)sizePolicy.setHorizontalStretch(0)sizePolicy.setVerticalStretch(0)sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())MainWindow.setSizePolicy(sizePolicy)self.centralwidget = QtWidgets.QWidget(MainWindow)sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)sizePolicy.setHorizontalStretch(0)sizePolicy.setVerticalStretch(0)sizePolicy.setHeightForWidth(self.centralwidget.sizePolicy().hasHeightForWidth())self.centralwidget.setSizePolicy(sizePolicy)self.centralwidget.setObjectName("centralwidget")self.pushButton = QtWidgets.QPushButton(self.centralwidget)self.pushButton.setGeometry(QtCore.QRect(10, 10, 211, 41))self.pushButton.setObjectName("pushButton")self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)self.pushButton_2.setEnabled(False)self.pushButton_2.setGeometry(QtCore.QRect(10, 60, 211, 41))self.pushButton_2.setObjectName(pushButton_2")self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)self.textBrowser.setGeometry(QtCore.QRect(240, 10, 551, 571))self.textBrowser.setObjectName("textBrowser")MainWindow.setCentralWidget(self.centralwidget)self.statusbar = QtWidgets.QStatusBar(MainWindow)self.statusbar.setObjectName("statusbar")MainWindow.setStatusBar(self.statusbar)self.retranslateUi(MainWindow)QtCore.QMetaObject.connectSlotsByName(MainWindow)def retranslateUi(self, MainWindow):_translate = QtCore.QCoreApplication.translateMainWindow.setWindowTitle(_translate(MainWindow", MainWindow"))self.pushButton.setText(_translate("MainWindow", "Run Scraper"))self.pushButton_2.setText(_translate(MainWindow", Stop"))
这是我尝试编写的用于处理数据的代码:
data_handler.py:
from PyQt5 import QtWidgets从 PyQt5.QtCore 导入 pyqtSignal, QThread从 my_gui 导入 Ui_MainWindow导入系统,操作系统导入子流程类蜘蛛线程(QThread):信号 = pyqtSignal()output_signal = pyqtSignal('PyQt_PyObject')def __init__(self):QThread.__init__(self)def __del__(self):self.wait()定义运行(自我):如果 os.path.exists('result.csv'):os.remove('result.csv')cmd =爬行车"proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)self.proc_id = proc.pid打印(self.proc_id)out = proc.communicate()对于线路输入:self.output_signal.emit(line)self.signal.emit()类 mywindow(QtWidgets.QMainWindow):def __init__(self):super(mywindow, self).__init__()self.ui = Ui_MainWindow()self.ui.setupUi(self)self.ui.pushButton.clicked.connect(self.slot_method)self.crawling_thread = SpiderThread()self.crawling_thread.signal.connect(self.finished)self.crawling_thread.output_signal.connect(self.update_text)self.ui.pushButton_2.clicked.connect(self.stop)def slot_method(self):self.ui.pushButton.setEnabled(False)self.ui.pushButton_2.setEnabled(True)self.ui.textBrowser.setText('')self.ui.textBrowser.append('开始抓取...')self.crawling_thread.start()定义完成(自我):self.ui.textBrowser.append('finished scraping') # 向用户显示输出self.ui.pushButton.setEnabled(True) # 启用按钮self.ui.pushButton_2.setEnabled(False)def update_text(self, 信号):self.ui.textBrowser.append(signal.decode(utf-8"))定义停止(自我):打印(self.crawling_thread.proc_id)os.kill(self.crawling_thread.proc_id)self.ui.textBrowser.append('抓取停止...')self.ui.pushButton.setEnabled(True) # 启用按钮self.ui.pushButton_2.setEnabled(False)定义主():app = QtWidgets.QApplication([])应用程序 = mywindow()应用程序.show()sys.exit(app.exec())如果 __name__ == '__main__':主要的()
使用此代码,我只能将 stdout
作为文本获取,并在完成抓取后将其放置在 textBrowser
中.如果抓取需要 20-30 分钟 - 我看不到 textBrowser
中的任何更改.是否有机会获得刮擦的物品并实时显示它们?也许有一个解决方案可以用第二个按钮停止/暂停抓取过程?
你应该使用 QProcess
而不是使用 subproces.Popen() + QThread 因为任务是通过信号通知您更容易.
我创建了一个应用程序,它扫描项目中的所有蜘蛛,在 QComboBox 中显示它们,您可以在其中选择要运行的蜘蛛,然后有一个按钮允许您通过显示日志来启动或停止应用程序在 QTextBrowser 中.
假设scrapy项目有如下结构(该项目是scrapy的一个例子,可以在
I'm trying to make a simple GUI for Scrapy crawler, where user can push the Start button to run scraping and see the scraped results in textBrowser (or other qt widget, please advise).
My spider:
import scrapy, json
class CarSpider(scrapy.Spider):
name = 'car'
start_urls = ["https://www.target-website.com/"]
def parse(self, response):
"""some code """
yield scrapy.Request(url=url, callback=self.parse_page)
def parse_page(self, response):
items = json.loads(response.body_as_unicode())['items']
for i in items:
...
scraped_item = {
'Make': make,
'Model': model,
'Year': year,
}
yield scraped_item
The app design was made in Qt Designer:
GUI:
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_MainWindow(object):
def setupUi(self, MainWindow):
MainWindow.setObjectName("MainWindow")
MainWindow.resize(801, 612)
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
MainWindow.setSizePolicy(sizePolicy)
self.centralwidget = QtWidgets.QWidget(MainWindow)
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed, QtWidgets.QSizePolicy.Fixed)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.centralwidget.sizePolicy().hasHeightForWidth())
self.centralwidget.setSizePolicy(sizePolicy)
self.centralwidget.setObjectName("centralwidget")
self.pushButton = QtWidgets.QPushButton(self.centralwidget)
self.pushButton.setGeometry(QtCore.QRect(10, 10, 211, 41))
self.pushButton.setObjectName("pushButton")
self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
self.pushButton_2.setEnabled(False)
self.pushButton_2.setGeometry(QtCore.QRect(10, 60, 211, 41))
self.pushButton_2.setObjectName("pushButton_2")
self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
self.textBrowser.setGeometry(QtCore.QRect(240, 10, 551, 571))
self.textBrowser.setObjectName("textBrowser")
MainWindow.setCentralWidget(self.centralwidget)
self.statusbar = QtWidgets.QStatusBar(MainWindow)
self.statusbar.setObjectName("statusbar")
MainWindow.setStatusBar(self.statusbar)
self.retranslateUi(MainWindow)
QtCore.QMetaObject.connectSlotsByName(MainWindow)
def retranslateUi(self, MainWindow):
_translate = QtCore.QCoreApplication.translate
MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
self.pushButton.setText(_translate("MainWindow", "Run Scraper"))
self.pushButton_2.setText(_translate("MainWindow", "Stop"))
And here is the code that I tried to make to handle data:
data_handler.py:
from PyQt5 import QtWidgets
from PyQt5.QtCore import pyqtSignal, QThread
from my_gui import Ui_MainWindow
import sys, os
import subprocess
class SpiderThread(QThread):
signal = pyqtSignal()
output_signal = pyqtSignal('PyQt_PyObject')
def __init__(self):
QThread.__init__(self)
def __del__(self):
self.wait()
def run(self):
if os.path.exists('result.csv'):
os.remove('result.csv')
cmd = "scrapy crawl car"
proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
self.proc_id = proc.pid
print(self.proc_id)
out = proc.communicate()
for line in out:
self.output_signal.emit(line)
self.signal.emit()
class mywindow(QtWidgets.QMainWindow):
def __init__(self):
super(mywindow, self).__init__()
self.ui = Ui_MainWindow()
self.ui.setupUi(self)
self.ui.pushButton.clicked.connect(self.slot_method)
self.crawling_thread = SpiderThread()
self.crawling_thread.signal.connect(self.finished)
self.crawling_thread.output_signal.connect(self.update_text)
self.ui.pushButton_2.clicked.connect(self.stop)
def slot_method(self):
self.ui.pushButton.setEnabled(False)
self.ui.pushButton_2.setEnabled(True)
self.ui.textBrowser.setText('')
self.ui.textBrowser.append('started scraping...')
self.crawling_thread.start()
def finished(self):
self.ui.textBrowser.append('finished scraping') # Show the output to the user
self.ui.pushButton.setEnabled(True) # Enable the pushButton
self.ui.pushButton_2.setEnabled(False)
def update_text(self, signal):
self.ui.textBrowser.append(signal.decode("utf-8"))
def stop(self):
print(self.crawling_thread.proc_id)
os.kill(self.crawling_thread.proc_id)
self.ui.textBrowser.append('Scraping stopped...')
self.ui.pushButton.setEnabled(True) # Enable the pushButton
self.ui.pushButton_2.setEnabled(False)
def main():
app = QtWidgets.QApplication([])
application = mywindow()
application.show()
sys.exit(app.exec())
if __name__ == '__main__':
main()
With this code I could get only the stdout
as text and place it in textBrowser
only after finishing the scraping. And if scraping takes 20-30 minutes - I cannot see any changes in textBrowser
. Is it any chance to get scraped items and display them in real time? And maybe there's a solution to stop/pause the scraping process with a second button?
Instead of using subproces.Popen() + QThread you should use QProcess
since the task is easier by informing you through signals.
I have created an application that scans all the spiders within a project showing them in a QComboBox where you can select which spider you want to run, then there is a button that allows you to start or stop the application by displaying the log in a QTextBrowser.
Assuming that the scrapy project has the following structure (the project is an example of scrapy, you can find it here):
tutorial
├── scrapy.cfg
└── tutorial
├── __init__.py
├── items.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
├── __init__.py
├── toscrape-css.py
└── toscrape-xpath.py
The user has to select the .cfg file, this will show the available spiders, then press the start-stop button as desired.
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
class ScrapyWorker(QtCore.QObject):
logChanged = QtCore.pyqtSignal(str)
started = QtCore.pyqtSignal()
finished = QtCore.pyqtSignal()
def __init__(self, parent=None):
super(ScrapyWorker, self).__init__(parent)
self._process = QtCore.QProcess(self)
self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
self._process.setProgram('scrapy')
self._process.started.connect(self.started)
self._process.finished.connect(self.finished)
def run(self, project, spider):
self._process.setWorkingDirectory(project)
self._process.setArguments(['crawl', spider])
self._process.start()
@QtCore.pyqtSlot()
def on_readyReadStandardOutput(self):
data = self._process.readAllStandardOutput().data().decode()
self.logChanged.emit(data)
@QtCore.pyqtSlot()
def stop(self):
self._process.kill()
def spiders(self, project):
process = QtCore.QProcess()
process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
process.setWorkingDirectory(project)
loop = QtCore.QEventLoop()
process.finished.connect(loop.quit)
process.start('scrapy', ['list'])
loop.exec_()
return process.readAllStandardOutput().data().decode().split()
class MainWindow(QtWidgets.QMainWindow):
def __init__(self, parent=None):
super(MainWindow, self).__init__(parent)
self.project_le = QtWidgets.QLineEdit()
self.project_button = QtWidgets.QPushButton('Select Project')
self.spider_combobox = QtWidgets.QComboBox()
self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
self.text_edit = QtWidgets.QTextBrowser()
central_widget = QtWidgets.QWidget()
self.setCentralWidget(central_widget)
lay = QtWidgets.QVBoxLayout(central_widget)
hlay = QtWidgets.QHBoxLayout()
hlay.addWidget(self.project_le)
hlay.addWidget(self.project_button)
lay.addLayout(hlay)
hlay2 = QtWidgets.QHBoxLayout()
hlay2.addWidget(QtWidgets.QLabel("spiders:"))
hlay2.addWidget(self.spider_combobox, 1)
lay.addLayout(hlay2)
lay.addWidget(self.start_stop_button)
lay.addWidget(self.text_edit)
self.start_stop_button.setEnabled(False)
self.scrapy_worker = ScrapyWorker(self)
self.scrapy_worker.logChanged.connect(self.insert_log)
self.scrapy_worker.started.connect(self.text_edit.clear)
self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))
self.start_stop_button.toggled.connect(self.on_checked)
self.project_button.clicked.connect(self.select_project)
self.resize(640, 480)
@QtCore.pyqtSlot(bool)
def on_checked(self, state):
if state:
filename = self.project_le.text()
finfo = QtCore.QFileInfo(filename)
directory = finfo.dir().absolutePath()
self.scrapy_worker.run(directory, self.spider_combobox.currentText())
self.start_stop_button.setText('Stop')
else:
self.start_stop_button.setText('Start')
self.scrapy_worker.stop()
@QtCore.pyqtSlot()
def select_project(self):
filename, _ = QtWidgets.QFileDialog.getOpenFileName(
self,
"Select .cfg file",
QtCore.QDir.currentPath(),
"Configure File (*.cfg)"
)
if filename:
self.project_le.setText(filename)
finfo = QtCore.QFileInfo(filename)
directory = finfo.dir().absolutePath()
spiders = self.scrapy_worker.spiders(directory)
self.spider_combobox.clear()
self.spider_combobox.addItems(spiders)
self.start_stop_button.setEnabled(True if spiders else False)
@QtCore.pyqtSlot(str)
def insert_log(self, text):
prev_cursor = self.text_edit.textCursor()
self.text_edit.moveCursor(QtGui.QTextCursor.End)
self.text_edit.insertPlainText(text)
self.text_edit.setTextCursor(prev_cursor)
if __name__ == '__main__':
import sys
app = QtWidgets.QApplication(sys.argv)
app.setStyle('fusion')
w = MainWindow()
w.show()
sys.exit(app.exec_())
Output:
这篇关于如何将抓取的项目放入 Pyqt5 小部件中?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!