如何抓取具有相同标签和类的多个结果 [英] How to scrape multiple result having same tags and class

查看:87
本文介绍了如何抓取具有相同标签和类的多个结果的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我的代码对于单个页面是准确的,但当我使用 for 循环对多个记录运行此代码时,如果某些数据缺失(例如缺少 person——因为我对 person、位置、电话号码和手机号码变量使用了索引 [1] 和 [2]),那么下一条记录的内容就会被错误地提取到 person 变量中。您能解决这个问题吗?这是我的代码:

My code is accurate for single page but when I run this code for multiple records using for loop and if there are some data missing like person then (as I used index no[1] and [2] for person variable ,location, phone no and cell no but if there are something missing like person name is missing) next record will be extracted at person variable. Could you please fix this issue? here is my code:

import requests
from bs4 import BeautifulSoup
import re


def get_page(url):
    """Fetch *url* and return it parsed as a BeautifulSoup, or None on failure.

    Bug fix: the original only assigned ``soup`` on the success branch, so a
    non-OK response fell through to ``return soup`` and raised
    UnboundLocalError.  Returning None makes the failure explicit (callers
    should check for it).
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        return None
    return BeautifulSoup(response.text, 'lxml')  # 1. html, 2. parser

def get_detail_data(soup):
    """Extract one business listing's details from a parsed detail page.

    Returns a dict with keys: 'title', 'peron name', 'address', 'phone no',
    'cell no', 'abn no', 'website'.  Missing fields are filled with
    'Empty ...' placeholders instead of raising, so one sparse listing cannot
    shift data into the next record.

    Improvements over the original:
    - bare ``except:`` narrowed to the exceptions the lookups actually raise
      (AttributeError when ``find`` returns None, IndexError when the
      contact-span list is too short);
    - the four copy-pasted regex extractions share one helper;
    - the ``ratting`` value was scraped but never returned — dropped as dead
      code.
    """
    contact_span_cls = 'Contact__Item-sc-1giw2l4-2 kBpGee'

    try:
        title = soup.find("h1", {'class': 'sc-AykKI'}).text
    except AttributeError:  # tag missing -> find() returned None
        title = 'Empty Title'

    try:
        person = soup.find("span", {'class': contact_span_cls}).text.strip()
    except AttributeError:
        person = 'Empty Person'

    try:
        addr = soup.findAll("span", {'class': contact_span_cls})[1].text
    except (AttributeError, IndexError):  # no spans at all, or only one
        addr = 'Empty Address'

    def _embedded_field(field, default):
        # The page source embeds escaped JSON, so values appear as
        # e.g. abn\":\"...\" — the pattern matches that escaped form.
        match = re.search(field + r'\\":\\"(.*?)\\"', soup.text)
        return match.group(1) if match else default

    abn = _embedded_field('abn', 'Empty ABN')
    website = _embedded_field('website', 'Empty Website')
    phone = _embedded_field('phone', 'Empty Phone No')
    cell = _embedded_field('mobile', 'Empty Cell No')

    # NOTE: the 'peron name' key typo is kept deliberately so existing
    # consumers of this dict keep working.
    return {
        'title': title,
        'peron name': person,
        'address': addr,
        'phone no': phone,
        'cell no': cell,
        'abn no': abn,
        'website': website,
    }
def get_index_data(soup):
    """Collect the absolute detail-page URL for every listing heading on an index page."""
    headings = soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'})
    # previous_element of each h3 is expected to be the element carrying the
    # listing's href (as in the original implementation).
    return [
        f"https://hipages.com.au{heading.previous_element.get('href')}"
        for heading in headings
    ]

def Main():
    """Crawl the index page, then fetch and print detail data for each listing.

    Improvements: removed the unused ``url`` local and guarded the module-level
    call with ``__main__`` so importing this script no longer triggers the
    crawl.
    """
    mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
    for detail_url in get_index_data(get_page(mainurl)):
        print(get_detail_data(get_page(detail_url)))


if __name__ == "__main__":
    Main()

推荐答案

您需要从script标签而不是span和divs解析数据.

You need to parse your data from the script tag rather than the spans and divs.

尝试一下:

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from pandas import json_normalize
import json

def get_page(url):
    """Fetch *url* and return it parsed as a BeautifulSoup, or None on failure.

    Bug fix: in the original, ``soup`` was assigned only when the response was
    OK, so ``return soup`` raised UnboundLocalError on any error status.
    Returning None makes the failure explicit for callers.
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        return None
    return BeautifulSoup(response.text, 'lxml')

def get_detail_data(url):
    """Fetch one listing page and return its site records as a pandas DataFrame.

    Instead of scraping spans/divs, this parses the JSON payload the page
    embeds in a ``window.__INITIAL_STATE__`` script tag.
    """
    res = requests.get(url)
    # NOTE(review): `soup` is built but never used below — the data comes
    # straight from the raw response text.
    soup = BeautifulSoup(res.content, "lxml")
    # Slice out the JSON payload between the script-tag markers.  Raises
    # IndexError if the page layout changes and the marker is absent.
    raw = res.text.split("<script> window.__INITIAL_STATE__=")[1]
    raw = raw.split("</script>")[0]
    data = json.loads(raw)
    # The payload is double-encoded: the first loads() yields a JSON string,
    # the second yields the actual dict.
    data = json.loads(data)

    # Columns of interest from each site record.
    cols = ['abn', 'address', 'name', 'primary_location', 'service_area', 'state', 'suburb', 'website']

    # data["sites"]["list"] maps site id -> record dict; transpose so that
    # each record becomes a row.
    df = pd.DataFrame(data["sites"]["list"]).T
    df = df[cols].reset_index(drop=True)

    # Flatten the nested primary_location dict into top-level columns.
    # NOTE(review): only row 0's primary_location is normalized — presumably
    # each page holds a single site; confirm if multiple rows can occur.
    primary_location = json_normalize(df.primary_location[0])
    df = pd.concat([df, primary_location], axis=1)
    # Drop the raw nested column and location fields we don't need.
    to_drop = ["primary_location", "is_primary", "suburb_seo_key", "capital_city_seo_key"]
    df.drop(to_drop, axis=1, inplace=True)

    return df


def get_index_data(soup):
    """Return the absolute detail-page URL for each listing heading on the index page."""
    base = "https://hipages.com.au"

    def _to_url(heading):
        # previous_element of the h3 is expected to carry the listing href
        # (same assumption as the original code).
        return base + heading.previous_element.get('href')

    headings = soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'})
    return [_to_url(h) for h in headings]

def Main():
    """Scrape every listing linked from the index page.

    Returns the list of per-listing DataFrames produced by get_detail_data.
    """
    mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
    detail_urls = get_index_data(get_page(mainurl))
    return [get_detail_data(detail_url) for detail_url in detail_urls]

data = Main()

# Stack all per-listing frames into one table.
df = pd.concat(data).reset_index(drop=True)
# Bug fix: `display` only exists inside IPython/Jupyter and raises NameError
# in a plain script; print() works in both environments.
print(df)

这将为您提供更详细的数据.

This gives you much more detailed data by the way.

这篇关于如何抓取具有相同标签和类的多个结果的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆