Need help to scrape "Show more" button


Question

I have the following code:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time


url_list = [
        'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
#       'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::3-300',

   ]

df_list = [] 

for url in url_list:

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'es-ES, es;q=0.5',
    }
    print(url)
    r = requests.get(url, headers=headers)
    print(r.status_code)
    soup = BeautifulSoup(r.content,'html.parser')
    items = soup.find_all('div',class_='col-xs-12 col-sm-6 col-sm-6 col-md-6 col-lg-3 col-product col-custom-width')
    # print(items)
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)
    for item in items:
        product_name = item.find('div',class_ = 'product-name').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        # item.find() returns None when a class is missing, and None.text then
        # raises AttributeError. A second identical `except AttributeError`
        # clause on the same try block would be unreachable, hence the nesting.
        try:
            price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            try:
                price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
            except AttributeError:
                price = "No price"  # .replace('€','').replace('\t','').replace('\n', '').replace('\r', '')

        # old_price = item.find(class_ = 'old-price product-price').text[:-2] if item.find(class_ = 'old-price product-price') != None else None
        try:
            availability = item.find('div', class_ = 'product-availability cat-product-availability').text.replace('\t','').replace('\n', '').replace('\r', '')
        # except AttributeError:
        #     availability = item.find('span', class_ = 'btn-addtocart btn-icon disabled').text.replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            availability = "No info"
        # stock = [item.find(class_ = 'item-availability').get_text() if item.find(class_ = 'item-availability') != None else None for item in items]
        product_info = {
                'product_name' : product_name,
                'price' : price,
             #  'old_price' : old_price,
                'availability' : availability,
                'store' : store,
                'date_extraction' : extraction_date,
            }
        df_list.append(product_info)
    time.sleep(3)

df = pd.DataFrame(df_list)
print(df)

It works fine and returns a dataframe with the expected results. The problem is that it only retrieves the first twenty records; after that there is a "Show more" button to load the next twenty products, and so on.

I have looked at the page source and inspected it, but I can't find a way to interact with the button.

Any idea or suggestion would be much appreciated.

Regards.
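One thing worth checking before automating a browser: often a "Show more" button only fires a paginated background request that requests can replicate directly. A minimal sketch of that check, assuming a hypothetical page query parameter (the real endpoint and parameter names have to be read from the browser's DevTools network tab):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'es-ES, es;q=0.5'}
url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'

# Hypothetical: 'page' is a guess; copy the real request the button fires
# from the network tab and reproduce its URL and parameters here.
for page in range(1, 4):
    r = requests.get(url, params={'page': page}, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    print(page, r.status_code, len(soup.find_all('div', class_='product-name')))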

Answer

Finally I got it working:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import random
from time import sleep

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless')

driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe", options=options)


url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'

driver.get(url)

sleep(random.uniform(5.0, 7.5))

# Dismiss the confirmation pop-up if it appears.
try:
    driver.find_element_by_class_name('confirm').click()
except NoSuchElementException:
    pass


# Keep clicking the "Show more" button until it no longer exists.
while True:
    sleep(random.uniform(3.5, 6.5))
    try:
        ver_mas = driver.find_element_by_class_name('button-load-more')
        actions = ActionChains(driver)
        actions.move_to_element(ver_mas).perform()
        # Click via JavaScript to avoid "element not clickable" errors
        # when another element overlays the button.
        driver.execute_script("arguments[0].click();", ver_mas)
    except NoSuchElementException:
        break

page_source = driver.page_source

soup = BeautifulSoup(page_source, 'lxml')
# print(soup)

items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')
# print(len(items))

df_list = []
store = 'Coolmod'
extraction_date = datetime.datetime.today().replace(microsecond=0)
for item in items:
    product_name = item.find('div',class_ = 'product-name').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
    # Same fallback pattern as before: nest the try blocks, since a second
    # identical `except AttributeError` clause would never run.
    try:
        price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
    except AttributeError:
        try:
            price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            price = "No price"
    try:
        availability = item.find('div', class_ = 'product-availability cat-product-availability').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
    except AttributeError:
        try:
            availability = item.find('div', class_ = 'product-availability cat-product-availability local-available').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            availability = "No info"
    
    product_info = {
            'product_name' : product_name,
            'price' : price,
            'availability' : availability,
            'store' : store,
            'date_extraction' : extraction_date,
        }
    df_list.append(product_info)

df = pd.DataFrame(df_list)
print(df)
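As a possible refinement (not part of the original answer), Selenium's explicit waits can replace the fixed random sleeps, so the loop clicks as soon as the button is actually clickable instead of pausing a fixed time. A sketch of just the click loop, assuming the same button-load-more class:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

wait = WebDriverWait(driver, 10)  # allow up to 10 s for new content
while True:
    try:
        # Wait until the "Show more" button is present and clickable.
        ver_mas = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'button-load-more')))
        driver.execute_script("arguments[0].click();", ver_mas)
    except TimeoutException:
        break  # no button within the timeout: everything is loaded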

Thanks @Alin Stelian for the help.
