使用 BS4 抓取雅虎财经统计数据 [英] Web scraping of Yahoo Finance statistics using BS4

查看:24
本文介绍了使用 BS4 抓取雅虎财经统计数据的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我是 Python 编程的新手,但我发现了一些不同的代码片段并将它们编译到下面的代码中.Python 脚本从汇总数组中返回所有正确的 HTML 值,但没有从统计数组中返回值,因为这些值没有得到匹配.

I am new to Python programming, but I have found several code snippets and combined them into the code below. The Python script returns all the right HTML values for the summary array, but no values for the statistics array, because the labels on that page are never matched.

我不知道如何提取雅虎财经统计面板上的值.它被称为 url2 和 key_stats_on_stat.

I don't know how to extract the values on the statistics pane on Yahoo Finance. In the code it is referred to as url2 and key_stats_on_stat.

我希望你愿意帮助我.

import os, sys
import csv
from bs4 import BeautifulSoup
import xlsxwriter
import urllib3
from selenium import webdriver
import pdb
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 


# Row labels looked up on the page fetched from `url` (the "Summary" page).
key_stats_on_main = ['Market Cap', 'PE Ratio (TTM)', 'EPS (TTM)']
# Row labels looked up on the page fetched from `url2` (the "Statistics" page).
key_stats_on_stat = ['Enterprise Value', 'Trailing P/E', 'Forward P/E',
                     'PEG Ratio (5 yr expected)', 'Return on Assets', 'Quarterly Revenue Growth',
                     'EBITDA', 'Diluted EPS', 'Total Debt/Equity', 'Current Ratio']

# Load the portfolio from stocks.csv: one comma-separated record per line,
# whose first field is later used as the ticker symbol.
# The `with` block closes the file even if reading fails, and blank lines
# are skipped so they do not turn into records with an empty ticker.
stocks_arr = []
with open("stocks.csv", "r") as pfolio_file:
    for line in pfolio_file:
        record = line.strip()
        if not record:
            continue
        stocks_arr.append(record.split(','))

print(stocks_arr)

from selenium.webdriver.chrome.options import Options

# Chrome command-line switches, applied in the order listed.
chrome_flags = (
    "--headless",            # Runs Chrome in headless mode.
    "--no-sandbox",          # Bypass OS security model
    "--disable-gpu",         # applicable to windows os only
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
)
options = Options()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# NOTE(review): this path contains no backslashes and looks like it was
# mangled when the snippet was published -- confirm the real location of
# chromedriver.exe before running.
chromedriver_path = r'C:Users""DocumentsPython Scriptschromedriver_win32chromedriver.exe'
driver = webdriver.Chrome(options=options, executable_path=chromedriver_path)

# Open one quote page up front and click the second button of the form it
# shows, waiting up to 10 seconds for that button to become clickable.
# (Presumably this dismisses a consent dialog -- verify against the live page.)
driver.get("https://finance.yahoo.com/quote/AMZN/")
form_button_locator = (By.XPATH, '//html/body/div/div/div/form/div/button[2]')
form_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(form_button_locator)
)
form_button.click()

def _read_stat(soup, label):
    """Return the value displayed next to *label* in a table row, or 'N/A'.

    A text node matches when, after stripping whitespace, it equals *label*
    or starts with *label* followed by a space.  An exact-match lookup misses
    labels that the page decorates with a suffix (e.g. 'Diluted EPS (ttm)'),
    while requiring the following space keeps 'Enterprise Value' from
    matching 'Enterprise Value/EBITDA'.

    The value is the text of the second <td> of the first matching node's
    enclosing <tr>.  Matches that are not inside a table row with at least
    two cells are skipped.
    """
    def _is_label(text):
        if text is None:
            return False
        text = text.strip()
        return text == label or text.startswith(label + ' ')

    for label_node in soup.find_all(string=_is_label):
        row = label_node.find_parent('tr')
        if row is None:
            continue
        cells = row.find_all('td')
        if len(cells) >= 2:
            return cells[1].get_text(strip=True)

    print('Invalid parent for this element')
    return 'N/A'


# One entry per stock: [ticker, <summary stats...>, <statistics stats...>].
stock_info_arr = []

try:
    for stock in stocks_arr:
        ticker = stock[0]
        stock_info = [ticker]

        url = "https://finance.yahoo.com/quote/{0}?p={0}".format(ticker)  # Summary
        url2 = "https://finance.yahoo.com/quote/{0}/key-statistics?p={0}".format(ticker)  # Statistics

        # Both pages are scraped the same way; only the URL and the list of
        # labels differ, so they share a single loop.
        for page_url, labels in ((url, key_stats_on_main), (url2, key_stats_on_stat)):
            driver.get(page_url)
            # Take the DOM as rendered by the browser rather than the raw
            # HTTP response.
            page_html = driver.execute_script("return document.body.innerHTML")
            soup = BeautifulSoup(page_html, 'html.parser')
            for stat in labels:
                value = _read_stat(soup, stat)
                print(value)
                stock_info.append(value)

        stock_info_arr.append(stock_info)
finally:
    # Always shut the browser down, even if a page load raised; otherwise
    # the headless Chrome process is left running.
    driver.quit()

print(stock_info_arr)
########## WRITING OUR RESULTS INTO EXCEL

# Append the statistics-page labels to the summary-page labels (in place) so
# the header order matches the order in which values were collected.
key_stats_on_main.extend(key_stats_on_stat)
workbook = xlsxwriter.Workbook('Stocks01.xlsx')
worksheet = workbook.add_worksheet()

# Header row: labels start in the second column, leaving the first column
# free for the ticker written in each data row.
for header_col, header in enumerate(key_stats_on_main, start=1):
    worksheet.write(0, header_col, header)

# Data rows start on the second row, one row per stock, filled left to right.
for sheet_row, our_stock in enumerate(stock_info_arr, start=1):
    for sheet_col, info_bit in enumerate(our_stock):
        worksheet.write(sheet_row, sheet_col, info_bit)
workbook.close()

print('Script completed')

推荐答案

您可以避免 selenium 的开销,改用正则表达式从 script 标签中提取信息并解析为 JSON.我不确定您为什么要请求第一个 url,因为第二个 url 中似乎已经包含相同的信息,即:

You could avoid the overhead of selenium by using a regex to pull the data out of a script tag and parsing it as JSON. I am unsure why you are making a request to the first URL, as the same information seems to be present in the second URL, i.e.:

Market Cap = Market Cap (intraday)
PE Ratio (TTM) = Trailing P/E
EPS (TTM)  = Diluted EPS (ttm)

也许他们在开市时会有所不同?但是,相同的方法可以用于第一个 url.

Perhaps they differ while the market is open? The same approach can, however, be used with the first URL.

Py

import requests, re, json, pprint

# The page embeds its state as a JSON object assigned to `root.App.main`
# inside a <script> tag; capture that object.  The dots are escaped so they
# match literal dots only.
p = re.compile(r'root\.App\.main = (.*);')
tickers = ['NKE','AAPL','SPG']
results = {}

# Output label -> (section, field) inside the QuoteSummaryStore object.
STAT_FIELDS = {
    'Enterprise Value': ('defaultKeyStatistics', 'enterpriseValue'),
    'Trailing P/E': ('summaryDetail', 'trailingPE'),
    'Forward P/E': ('summaryDetail', 'forwardPE'),
    'PEG Ratio (5 yr expected)': ('defaultKeyStatistics', 'pegRatio'),
    'Return on Assets': ('financialData', 'returnOnAssets'),
    'Quarterly Revenue Growth': ('financialData', 'revenueGrowth'),
    'EBITDA': ('financialData', 'ebitda'),
    'Diluted EPS': ('defaultKeyStatistics', 'trailingEps'),
    'Total Debt/Equity': ('financialData', 'debtToEquity'),
    'Current Ratio': ('financialData', 'currentRatio'),
}


def _fmt(store, section, field):
    """Return store[section][field]['fmt'], or 'N/A' if any level is missing.

    Keeps one absent or empty statistic from aborting the whole run with a
    KeyError/TypeError.
    """
    node = store
    for key in (section, field, 'fmt'):
        if not isinstance(node, dict) or key not in node:
            return 'N/A'
        node = node[key]
    return node


with requests.Session() as s:

    for ticker in tickers:
        # A timeout stops a stalled connection from hanging the script forever.
        r = s.get(
            'https://finance.yahoo.com/quote/{0}/key-statistics?p={0}'.format(ticker),
            timeout=10,
        )
        match = p.search(r.text)
        if match is None:
            # No embedded state in the response: record placeholders instead
            # of failing with an IndexError.
            print('No embedded page data found for {}'.format(ticker))
            results[ticker] = dict.fromkeys(STAT_FIELDS, 'N/A')
            continue

        data = json.loads(match.group(1))
        key_stats = data['context']['dispatcher']['stores']['QuoteSummaryStore']
        results[ticker] = {
            label: _fmt(key_stats, section, field)
            for label, (section, field) in STAT_FIELDS.items()
        }

pprint.pprint(results)

<hr>

示例输出:

这篇关于使用 BS4 抓取雅虎财经统计数据的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆