仅爬取到第一页,并将详细内容保存为 Python 中的数据框 [英] Crawler only scrapes the first page; saving detailed contents as a DataFrame in Python
本文介绍了仅搜寻第一页并将详细内容另存为Python中的数据框的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
我正在尝试循环遍历各个页面,爬取并保存此链接中的详细内容:
I'm trying to loop through the pages, crawl them, and save the detailed contents from this link:
Based on the code from here, I've modified the code to:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Root URL of the listing section; index pages and detail-page links are built from it.
BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"
def get_main_urls() -> list:
    """Build the list of index-page URLs to scan.

    Returns:
        ``index.html`` followed by ``index_1.html`` through ``index_5.html``,
        each prefixed with ``BASE_URL``.
    """
    page_names = ["index.html"]
    page_names.extend(f"index_{number}.html" for number in range(1, 6))
    return [f"{BASE_URL}/{page_name}" for page_name in page_names]
def get_follow_urls(urls: list, session: requests.Session) -> iter:
    """Yield the detail-page URL linked from each listing cell.

    Args:
        urls: Index-page URLs to scan.
        session: Session used to download each index page.

    Yields:
        ``BASE_URL`` followed by each link's ``href`` minus its first
        character, in page order.
    """
    for url in urls[:1]:  # remove [:1] to scrape all the pages
        body = session.get(url).content
        cells = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        for cell in cells:
            link = cell.find("a")
            # find() returns None for a cell without a link; skip those (and
            # anchors without an href) instead of raising mid-crawl.
            if link is None or not link.get("href"):
                continue
            # The first character of the href is dropped before joining --
            # presumably the links are relative ("./..."); verify on the site.
            yield f"{BASE_URL}{link['href'][1:]}"
from io import StringIO

cols = ['项目编号', '转让/出租标的名称', '转让方/出租方名称', '转让标的评估价/年租金评估价(元)',
        '转让底价/年租金底价(元)', '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期']

# Collect one single-row frame per detail page and concatenate once at the
# end: DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call.
page_frames = []
with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        html = connection_session.get(follow_url).content.decode("utf-8")
        # read_html returns a list of DataFrames, one per <table>:
        # https://stackoverflow.com/questions/39710903/pd-read-html-imports-a-list-rather-than-a-dataframe
        # The markup is wrapped in StringIO because passing a literal HTML
        # string is deprecated in recent pandas.
        dfs = pd.read_html(StringIO(html), flavor="bs4")
        # Only the first table is used. Transpose it and drop the first
        # (label) row so the remaining row holds the values.
        page_frames.append(dfs[0].T.iloc[1:, :].copy())

if page_frames:
    updated_df = pd.concat(page_frames)
    updated_df.columns = cols
else:
    # Nothing was scraped: still write a header-only sheet instead of failing.
    updated_df = pd.DataFrame(columns=cols)

print(updated_df)
updated_df.to_excel('./data.xlsx', index=False)
But it only crawls the first page successfully. How can I crawl all the pages and also add a URL
column? Thanks.
解决方案
Is this what you're looking for? This processes all the URLs and dumps a list of DataFrames to a single Excel file.
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Root URL of the listing section; index pages and detail-page links are built from it.
BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"
# Column labels assigned to the combined DataFrame before export. In order:
# project number, name of the transferred/leased asset, transferor/lessor,
# appraised value / appraised annual rent (yuan), reserve price / reserve
# annual rent (yuan), transferee/lessee, final price / final annual rent
# (yuan), transaction date. The trailing 'URL' entry labels the source-URL
# value that post_process adds to each record.
COLUMNS = [
'项目编号', '转让/出租标的名称', '转让方/出租方名称',
'转让标的评估价/年租金评估价(元)', '转让底价/年租金底价(元)',
'受让方/承租方名称', '成交价/成交年租金(元)', '成交日期', 'URL'
]
def get_main_urls(last_page: int = 5) -> list:
    """Build the list of index-page URLs to crawl.

    Args:
        last_page: Highest ``index_<n>.html`` page to include. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        ``index.html`` first, then ``index_1.html`` through
        ``index_<last_page>.html``, each under ``BASE_URL``. Only the first
        page is returned when ``last_page`` is less than 1.
    """
    start_url = f"{BASE_URL}/index.html"
    numbered = [f"{BASE_URL}/index_{i}.html" for i in range(1, last_page + 1)]
    return [start_url] + numbered
def get_follow_urls(urls: list, session: requests.Session) -> iter:
    """Yield the detail-page URL linked from each listing cell of every index page.

    Args:
        urls: Index-page URLs to scan, visited in order.
        session: Session used to download each index page.

    Yields:
        ``BASE_URL`` followed by each link's ``href`` minus its first
        character.
    """
    for url in urls:
        body = session.get(url).content
        soup = BeautifulSoup(body, "lxml")
        for cell in soup.find_all("td", {"width": "60%"}):
            anchor = cell.find("a")
            href = anchor.get("href") if anchor is not None else None
            # A cell without a link (or a link without an href) is skipped
            # rather than aborting the whole crawl with a TypeError/KeyError.
            if not href:
                continue
            # The first character of the href is dropped before joining --
            # presumably the links are relative ("./..."); verify on the site.
            yield f"{BASE_URL}{href[1:]}"
def post_process(list_of_dataframes: list, source_url: str) -> pd.DataFrame:
    """Turn the first parsed table into a single-row record that ends with its URL.

    Args:
        list_of_dataframes: DataFrames parsed from one page; only the first
            is used and it is not modified.
        source_url: URL of the page, stored as the record's last value.

    Returns:
        The transposed table with its first (label) row dropped, so the
        remaining row holds the table's second-column values followed by
        ``source_url``.

    Raises:
        IndexError: If ``list_of_dataframes`` is empty.
        ValueError: If the first table does not have exactly two columns
            (the added ``["URL", source_url]`` row has two cells).
    """
    table = list_of_dataframes[0]
    # DataFrame.append was removed in pandas 2.0; build the extra row as a
    # one-row frame and concatenate instead.
    url_row = pd.DataFrame([["URL", source_url]], columns=table.columns)
    combined = pd.concat([table, url_row], ignore_index=True)
    return combined.T.iloc[1:, :].copy()
def dump_to_excel(post_processed_dfs: list):
    """Stack the per-page frames, label the columns and write one Excel file.

    Args:
        post_processed_dfs: DataFrames to concatenate; the combined frame
            must have as many columns as ``COLUMNS`` has labels.

    The result is written to ``scraped_data.xlsx`` without the index column.
    """
    combined = pd.concat(post_processed_dfs)
    combined.columns = COLUMNS
    combined.to_excel("scraped_data.xlsx", index=False)
from io import StringIO

# One post-processed, single-row frame per detail page.
processed_dfs = []
with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        # Last path segment without ".html", used only for the progress message.
        key = follow_url.rsplit("/", 1)[-1].replace(".html", "")
        print(f"Fetching data for {key}...")
        html = connection_session.get(follow_url).content.decode("utf-8")
        # Passing a literal HTML string to read_html is deprecated since
        # pandas 2.1; a file-like wrapper works on old and new versions.
        df_list = pd.read_html(StringIO(html), flavor="bs4")
        processed_dfs.append(post_process(df_list, follow_url))

dump_to_excel(processed_dfs)
Output:
这篇关于仅搜寻第一页并将详细内容另存为Python中的数据框的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文