Parse the HTML code for a whole webpage scrolled down
Problem description
from bs4 import BeautifulSoup
import urllib,sys
reload(sys)
sys.setdefaultencoding("utf-8")
r = urllib.urlopen('https://twitter.com/ndtv').read()
soup = BeautifulSoup(r)
This gives me only part of the page, not the whole page scrolled down to the end, which is what I want. The remaining tweets are loaded dynamically by JavaScript as you scroll, which urllib cannot trigger.
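For illustration, here is a rough check of that limitation (this snippet is not part of the original question; the li[data-item-id] selector for tweet containers is taken from the Selenium code below):
# Rough check (illustrative only): count the tweet containers present in the
# static HTML that urllib returns, without any scrolling.
from bs4 import BeautifulSoup
import urllib

html = urllib.urlopen('https://twitter.com/ndtv').read()
soup = BeautifulSoup(html)
print len(soup.select('li[data-item-id]'))  # only the first batch of tweets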
EDIT:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,sys,requests
reload(sys)
sys.setdefaultencoding("utf-8")
class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False
def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there are no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    html_full_source = driver.page_source
    driver.close()
    return html_full_source
url='https://twitter.com/thecoolstacks'
#using selenium browser
html_source=return_html_code(url)
soup_selenium = BeautifulSoup(html_source)
print soup_selenium
text_tweet=[]
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'})
for tweet in alltweets_selenium:
    # Text of tweet
    html_tweet = tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    text_tweet.append(''.join(html_tweet[0].findAll(text=True)))
print text_tweet
Intended Output:
import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'
req = requests.get(url)
soup = BeautifulSoup(req.content)
alltweets = soup.find_all(attrs={'data-item-type' : 'tweet'})
print alltweets[0]
Solution
I would still insist on using the Twitter API.
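For reference, a minimal sketch of the API route using the third-party tweepy library (the library choice and the credential placeholders are mine, not part of the original answer):
# Minimal tweepy sketch (assumes you have registered a Twitter app and
# obtained the four credentials below).
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)

# Page through the user's timeline instead of scraping the rendered page
for status in tweepy.Cursor(api.user_timeline, screen_name="ndtv").items(200):
    print status.text
This avoids rendering the page at all, so there is no scrolling problem to work around.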
Alternatively, here is how you can approach the problem with selenium:
- use Explicit Waits and define a custom Expected Condition to wait for tweets to load on scroll
- perform the scroll to the last loaded tweet via scrollIntoView()
Implementation:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False
url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)
# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
# scroll down to the last tweet until there are no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)
    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
    try:
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break
This scrolls down as far as needed to load all of the existing tweets in the channel.
Here is the HTML-parsing snippet, extracting tweets:
from bs4 import BeautifulSoup

page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source)
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text
It prints:
Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://goo.gl/aPqlxf pic.twitter.com/JUqmdWNQ3c
#HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK pic.twitter.com/gohX21Gibi
...