如何使用python从网站中提取表 [英] How to extract table from website using python

查看:42
本文介绍了如何使用python从网站中提取表的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我一直在尝试从网站中提取表格，但没有头绪，谁能帮帮我？我的目标是提取该机构 scope 页面中的表格：

import requests
import json
import pandas as pd
import re

def get_organisationId(url):
    # url = 'https://training.gov.au/Organisation/Details/31102'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    id_list = re.findall(r'OrganisationId=(.*?)&', resp.text)
    organisationId = id_list[0] if id_list else None
    return organisationId

# 首先获取 OrganisationId
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)


def get_AjaxScopeQualification(organisationId):
    if organisationId:
        url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
        headers = {
         'origin': 'https://training.gov.au',
         'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
         'x-requested-with': 'XMLHttpRequest'
        }
        data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
        r = requests.post(url, json=data, headers=headers)
        response = json.loads(re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\2"', r.text))
        return response

response = get_AjaxScopeQualification(organisationId)
dfn = pd.json_normalize(response, 'data', meta=['total'])
print(dfn.columns)
print(dfn[['Code', 'Title', 'Extent']])

结果:

response['data'][0]

{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
 'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
 'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
 'TrainingComponentType': 2,
 'Code': 'BSB20115',
 'Title': 'Certificate II in Business',
 'IsImplicit': False,
 'ExtentId': '01',
 'Extent': 'Deliver and assess',
 'StartDate': '2015-3-3',
 'EndDate': '2022-3-3',
 'DeliveryNsw': True,
 'DeliveryVic': True,
 'DeliveryQld': True,
 'DeliverySa': True,
 'DeliveryWa': True,
 'DeliveryTas': True,
 'DeliveryNt': True,
 'DeliveryAct': True,
 'ScopeDecisionType': 0,
 'ScopeDecision': 'Deliver and assess',
 'OverseasCodeAlpha': None,
 'OverseasCodeAlhpaList': [],
 'OverseasCodeAlphaOutput': ''}

i have been trying to extract the table from website but i am lost. can anyone help me ? my goal is to extract the table of scope page : https://training.gov.au/Organisation/Details/31102

import requests
from bs4 import BeautifulSoup

# Fetch the organisation detail page and try to parse the scope table.
# NOTE(review): the "ScopeQualification" table is populated by an AJAX
# request after the page loads, so it is typically absent from the static
# HTML fetched here -- see the accepted answer below for the XHR approach.
url = "https://training.gov.au/Organisation/Details/31102"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

table = soup.find(id="ScopeQualification")
# BUG FIX: soup.find() returns None when the id is missing, and calling
# .find_all on None raises AttributeError; guard before iterating. Also
# bind the result instead of discarding the bare-expression comprehension.
rows = [row.text.split() for row in table.find_all("tr")] if table is not None else []

解决方案

  1. find OrganisationId from 'https://training.gov.au/Organisation/Details/31102'.
  2. find XHR url, https://training.gov.au/Organisation/AjaxScopeQualification/3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9?tabIndex=4, POST Method.

import requests
import json
import pandas as pd
import re

def get_organisationId(url):
    """Fetch *url* and scrape the first ``OrganisationId`` query parameter.

    Returns the id string (a GUID embedded in links on the page), or
    ``None`` when the page contains no such parameter.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    # The id appears in hrefs of the form "...?OrganisationId=<guid>&...";
    # take the first occurrence, if any.
    match = re.search(r'OrganisationId=(.*?)&', resp.text)
    return match.group(1) if match else None

# Step 1: resolve the internal OrganisationId (a GUID) from the public
# detail-page URL; the AJAX endpoint used below is keyed on that GUID,
# not on the numeric "31102" in the page URL.
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)


def get_AjaxScopeQualification(organisationId):
    """POST to the AjaxScopeQualification endpoint and return the decoded
    JSON payload for *organisationId*.

    Returns ``None`` (implicitly) when *organisationId* is falsy. The
    endpoint embeds JavaScript ``new Date(y,m,d,0,0,0)`` literals in its
    response, which are not valid JSON; they are rewritten to quoted
    ``"y-m-d"`` strings before parsing.
    """
    if organisationId:
        url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
        headers = {
         'origin': 'https://training.gov.au',
         'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
         'x-requested-with': 'XMLHttpRequest'
        }
        # Server-side paging parameters: first page, 100 rows, sorted by Code.
        data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
        r = requests.post(url, json=data, headers=headers)
        # BUG FIX: the replacement previously was r'"\1-\2-\2"', repeating
        # the month group where the day (\3) was intended -- visible in the
        # sample output where StartDate/EndDate always had month == day.
        # NOTE(review): JavaScript Date months are 0-based, so \2 may still
        # be off by one relative to the calendar month -- confirm on site.
        response = json.loads(re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\3"', r.text))
        return response
# Step 2: fetch the scope table via the XHR endpoint and flatten the
# records under the payload's 'data' key into a DataFrame; the top-level
# 'total' count is carried along as per-row metadata by json_normalize.
response = get_AjaxScopeQualification(organisationId)
dfn = pd.json_normalize(response, 'data', meta=['total'])
print(dfn.columns)
print(dfn[[ 'Code', 'Title', 'Extent']])

result:

response['data'][0]

{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
 'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
 'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
 'TrainingComponentType': 2,
 'Code': 'BSB20115',
 'Title': 'Certificate II in Business',
 'IsImplicit': False,
 'ExtentId': '01',
 'Extent': 'Deliver and assess',
 'StartDate': '2015-3-3',
 'EndDate': '2022-3-3',
 'DeliveryNsw': True,
 'DeliveryVic': True,
 'DeliveryQld': True,
 'DeliverySa': True,
 'DeliveryWa': True,
 'DeliveryTas': True,
 'DeliveryNt': True,
 'DeliveryAct': True,
 'ScopeDecisionType': 0,
 'ScopeDecision': 'Deliver and assess',
 'OverseasCodeAlpha': None,
 'OverseasCodeAlhpaList': [],
 'OverseasCodeAlphaOutput': ''}

这篇关于如何使用python从网站中提取表的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆