BeautifulSoup,请求,数据框保存到Excel数组错误 [英] BeautifulSoup, Requests, Dataframe Saving to Excel arrays error
问题描述
我是Python新手，正在完成一个学校项目，恳请各位帮助，非常感谢。当抓取到2004年和2003年的数据时程序报错，错误由 result_list 列表引起："ValueError: arrays must all be same length"（数组必须全部具有相同的长度）。请问应如何修改代码来解决这个问题？这关系到项目评分……
导入请求将熊猫作为pd导入从熊猫导入ExcelWriter从bs4导入BeautifulSoup#from openpyxl.writer.excel导入ExcelWriter导入openpyxl#from openpyxl import load_workbook导入csvyear_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']i = 0而我< = len(year_id)-1url ='https://lehighsports.com/sports/mens-soccer/schedule/'+ str(year_id [i])lehigh = request.get(URL).text汤= BeautifulSoup(lehigh,'lxml')date_list = []在soup.find_all('div',class _ ="sidearm-schedule-game-opponent-date")中的日期:date_list.append(date.get_text(strip = True,分隔符=''))name_list = []在汤中的名称.find_all('div',class _ ="sidearm-schedule-game-opponent-name"):name_list.append(name.get_text(strip = True,分隔符=''))result_list = []对于soup.find_all('div',class _ ="sidearm-schedule-game-result")的结果:result_list.append(result.get_text(strip = True,分隔符=''))opp_list = []对于汤中的opp.find_all('div',class _ ="sidearm-schedule-game-opponent-text"):opp_list.append(opp.get_text(strip = True,分隔符=''))conf_list = []对于汤中的conf.find_all('div',class _ ="sidearm-schedule-game-conference-conference"):conf_list.append(conf.get_text(strip = True))dict = {'date':date_list,'opponent':name_list,'result':result_list,'list':opp_list,'conference':conf_list}df = pd.DataFrame(dict)workbook1 = openpyxl.load_workbook('lehigh.xlsx')writer = pd.ExcelWriter('lehigh.xlsx',engine ='openpyxl')writer.book = workbook1df.to_excel(writer,sheet_name = str(year_id [i]),index = False,startrow = 0,startcol = 0)writer.save()writer.close()我=我+1
代码已更新:
导入请求从bs4导入BeautifulSoup将熊猫作为pd导入从itertools导入zip_longestd = []n = []res = []op = []yr = []与request.Session()作为要求:范围内的年份(2003,2020):打印(f提取年份#{year}")r = req.get(f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")如果r.status_code == 200:汤= BeautifulSoup(r.text,'html.parser')在汤中的日期.findAll("div",{'class':'sidearm-schedule-game-opponent-date flex-item-1'}):d.append(date.get_text(strip = True,分隔符="))用于汤中的名称.findAll("div",{'class':'sidearm-schedule-game-opponent-name'}):n.append(name.get_text(strip = True))对于汤.findAll("div",{'class':'sidearm-schedule-game-result'})的结果:结果= result.get_text(strip = True)res.append(结果)如果len(d)!= len(res):res.append("None")对于汤中的opp.findAll("div",{'class':'sidearm-schedule-game-opponent-text'}):op.append(opp.get_text(strip = True,分隔符=''))yr.append(年)数据= []对于zip_longest(yr,d,n,res,op)中的项目:data.append(项目)df = pd.DataFrame(data,columns = ['Year','Date','Name','Result','Opponent']).to_excel('lehigh.xlsx',index = False)
输出:
Code is updated:
输出：（见运行结果截图）这篇关于"BeautifulSoup、Requests、DataFrame 保存到 Excel 时的数组长度错误"的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持 IT 屋！

import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
# Scrape the Lehigh men's soccer schedule for each season and write one
# Excel sheet per season into lehigh.xlsx.
#
# Fix for the reported "ValueError: arrays must all be same length":
# on the 2003/2004 pages some games have no result (and/or conference)
# div, so the five column lists end up with different lengths and
# pd.DataFrame(dict_of_lists) refuses them.  itertools.zip_longest pads
# the shorter columns with None so every row has five cells.
from itertools import zip_longest

year_id = ['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
           '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004',
           '2003']

# Output column name -> CSS class of the div whose text fills that column.
# (Insertion order defines the column order of the final DataFrame.)
schedule_columns = {
    'date': 'sidearm-schedule-game-opponent-date',
    'opponent': 'sidearm-schedule-game-opponent-name',
    'result': 'sidearm-schedule-game-result',
    'list': 'sidearm-schedule-game-opponent-text',
    'conference': 'sidearm-schedule-game-conference-conference',
}

# A single ExcelWriter for the whole run: it creates lehigh.xlsx when the
# file does not exist (openpyxl.load_workbook on a missing file raised
# FileNotFoundError) and is saved/closed once by the context manager.
with pd.ExcelWriter('lehigh.xlsx', engine='openpyxl') as writer:
    for year in year_id:
        url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
        soup = BeautifulSoup(requests.get(url).text, 'lxml')

        # One list of texts per column; lengths may legitimately differ.
        column_texts = [
            [div.get_text(strip=True, separator=' ')
             for div in soup.find_all('div', class_=css_class)]
            for css_class in schedule_columns.values()
        ]

        # zip_longest pads missing cells with None -- the actual bug fix.
        rows = list(zip_longest(*column_texts))
        df = pd.DataFrame(rows, columns=list(schedule_columns))
        df.to_excel(writer, sheet_name=year, index=False,
                    startrow=0, startcol=0)
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import zip_longest
# Scrape every Lehigh men's soccer season page (2003-2019) and write all
# games into lehigh.xlsx as a single flat sheet with columns
# Year / Date / Name / Result / Opponent.
dates, names, results, opponents, years = [], [], [], [], []

with requests.Session() as session:
    for season in range(2003, 2020):
        print(f"Extracting Year# {season}")
        resp = session.get(
            f"https://lehighsports.com/sports/mens-soccer/schedule/{season}")
        if resp.status_code != 200:
            continue  # skip seasons whose page is unavailable
        page = BeautifulSoup(resp.text, 'html.parser')

        dates.extend(
            tag.get_text(strip=True, separator=" ")
            for tag in page.findAll(
                "div",
                {'class': 'sidearm-schedule-game-opponent-date flex-item-1'})
        )
        names.extend(
            tag.get_text(strip=True)
            for tag in page.findAll(
                "div", {'class': 'sidearm-schedule-game-opponent-name'})
        )
        for tag in page.findAll(
                "div", {'class': 'sidearm-schedule-game-result'}):
            results.append(tag.get_text(strip=True))
            # When results lag behind dates (a game without a recorded
            # score), insert a "None" placeholder to keep columns aligned.
            if len(dates) != len(results):
                results.append("None")
        for tag in page.findAll(
                "div", {'class': 'sidearm-schedule-game-opponent-text'}):
            opponents.append(tag.get_text(strip=True, separator=' '))
            years.append(season)

# zip_longest pads any remaining length mismatch with None before the
# rows become the final spreadsheet.
rows = list(zip_longest(years, dates, names, results, opponents))
pd.DataFrame(
    rows, columns=['Year', 'Date', 'Name', 'Result', 'Opponent']
).to_excel('lehigh.xlsx', index=False)