How to get all listing URLs from the main page with Python web scraping

Question

I wrote some web-scraping code, and it works except for two issues. On the detail page everything is extracted correctly except the ISBN, and on the main page I need all of the listing URLs so that my code can scrape data from every listing. Please guide me on how to fix this. Both URLs (main page and detail page) are in the code. Thank you!

Here is my code:

import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)

    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
    return soup

def get_detail_data(soup):

    try:
        title = soup.find('span',class_="title product-field",id=False).text
    except:
        title = 'empty'  
    print(title)
    try:
        writer = soup.find('a',class_="contributor-name",id=False).text
    except:
        writer = 'empty'  
    print(writer)   
    try:
        original_price = soup.find('div',class_="original-price",id=False).find('span').text
    except:
        original_price = 'empty'  
    print(original_price)  
    try:
        active_price = soup.find('div',class_="active-price",id=False).find('span').text
    except:
        active_price = 'empty'  
    print(active_price)     
    try:
        img = soup.find('div',class_="image-actions image-container product-type-icon-container book",id=False).find('img').attrs['src']
    except:
        img = 'empty'  
    print(img)   
    try:
        isbn = soup.find('div',class_="bookitem-secondary-metadata",id=False).find('li').attrs['ISBN: ']
    except:
        isbn = 'empty'  
    print(isbn) 
    data = {
        'title'             :   title,
        'writer'            :   writer,
        'original_price'    :   original_price,
        'active_price'      :   active_price,
        'image'             :   img,
        'isbn'              :   isbn
    }
    return data

def get_index_data(soup):
    titles_link = soup.find_all('a',class_="body_link_11")
    try:
        inks = soup.find('div', class_="item-info",id=False).find('p').find('a').get('href')
    except:
        inks = "empty"
    print(inks)

def main():
    #detail_page_url = "https://www.kobo.com/ww/en/ebook/mum-dad-1"
    mainurl = "https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q"
    #get_page(url)
    #get_detail_data(get_page(detail_page_url))
    get_index_data(get_page(mainurl))
if __name__ == '__main__':
    main()

Answer

The listing page embeds each book's metadata in <script type="application/ld+json"> blocks, so instead of traversing the HTML you can parse that JSON directly and pull the title, author, ISBN, image and URL for every listing in one pass (the displayed prices are collected separately from the page), writing one row per book to data.csv:

import requests
import re
import json
from bs4 import BeautifulSoup
import csv


def Soup(content):
    return BeautifulSoup(content, 'html.parser')


def Main(url):
    r = requests.get(url)
    soup = Soup(r.content)
    # Each listing carries its metadata in a JSON-LD <script> block; keep the
    # ones whose payload is wrapped in a "data" key.
    scripts = soup.find_all("script", type="application/ld+json",
                            text=re.compile("data"))
    # The displayed prices live in the HTML rather than in the JSON; skip the
    # separate "USD" currency spans.
    prices = [span.text for span in soup.select(
        "p.product-field.price span span") if span.text != "USD"]
    with open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Writer", "Price", "ISBN", "IMG", "URL"])
        # Pair each JSON-LD block with its price and write one row per listing.
        for script, price in zip(scripts, prices):
            data = json.loads(script.text)["data"]
            title = data["name"]
            author = data["author"][0]["name"]
            img = f'https:{data["thumbnailUrl"]}'
            isbn = data["isbn"]
            book_url = data["url"]
            writer.writerow([title, author, price, isbn, img, book_url])


Main("https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q")

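If you would rather keep the original HTML-traversal approach, note that get_index_data only ever printed one link because soup.find returns just the first match; find_all visits every listing block. A minimal sketch of that fix, assuming the div.item-info > p > a markup used in the question's code still holds:

def get_index_data(soup):
    # soup.find stops at the first 'item-info' block; find_all walks them all.
    links = []
    for item in soup.find_all('div', class_="item-info"):
        a = item.select_one('p a')  # first anchor inside the item's <p>
        if a and a.get('href'):
            links.append(a['href'])
    return links

Each URL it returns can then be passed through get_page and get_detail_data from the question.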
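As for the ISBN half of the question: attrs['ISBN: '] fails because 'ISBN: ' is not an HTML attribute of the <li> tag; the ISBN lives in the tag's text. On the listing page the accepted answer reads it from the JSON-LD instead, and the same idea may carry over to an individual detail page. A rough, untested sketch, assuming the detail page (for example the mum-dad-1 URL commented out in the question) also serves an application/ld+json block with an isbn field (get_isbn is a hypothetical helper, not part of either snippet above):

import json
import requests
from bs4 import BeautifulSoup

def get_isbn(detail_url):
    # Hypothetical helper: scan the detail page's JSON-LD blocks for an ISBN.
    soup = BeautifulSoup(requests.get(detail_url).content, 'html.parser')
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.text)
        except ValueError:
            continue
        if not isinstance(data, dict):
            continue
        payload = data.get("data", data)  # listing pages nest fields under "data"
        if isinstance(payload, dict) and payload.get("isbn"):
            return payload["isbn"]
    return None

print(get_isbn("https://www.kobo.com/ww/en/ebook/mum-dad-1"))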