如何使用请求抓取单个URL-多页 [英] How to Scrape single URL- Multi pages using Requests

查看:35
本文介绍了如何使用请求抓取单个URL-多页的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试创建一个电子商务网站,但是1个类别具有不同的页面,并且所有页面共享一个URL

I am trying to Scrape an e-commerce Website but 1 category have different pages and all the pages share a single URL

prodlink=[]
onepage='https://www.storeandmore.co.za/subcategory/crates_and_food_bins'
res = requests.get(onepage)
soup = BeautifulSoup(res.text, 'html.parser') 
for i in soup.find_all('a', class_='product_link'):
    prodlinks.append(i['href'])

如何转到下一页并获取所有产品网址?请帮助

How can I go to the next page and get all the product urls? plz help

推荐答案

需要发布请求以提取2页数据.您将获得36个链接,因为我在其中添加了页面网址.感谢您提出的好问题.

need a post requests to extract the 2 page data. You will get 36 link because i added page url with it. Thanks for the nice question.

import requests
from bs4 import BeautifulSoup
import requests
prodlink=[]

cookies = {
    'ESOLVE': '537507d0d2226637df7896bac091d648',
    '_ga': 'GA1.3.677450556.1613230295',
    '_gid': 'GA1.3.1046474275.1613230295',
}

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'https://www.storeandmore.co.za',
    'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundary57WatmxVwMPvJzr4',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://www.storeandmore.co.za/subcategory/crates_and_food_bins',
    'Accept-Language': 'en-US,en;q=0.9',
}

data = {
  '$------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name': '"MAX_FILE_SIZE"\\r\\n\\r\\n5000000\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="dowhat"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="menu_id"\\r\\n\\r\\n107\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="id"\\r\\n\\r\\n487\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="host"\\r\\n\\r\\nhttps://www.storeandmore.co.za/\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="module"\\r\\n\\r\\nbin/storeandmore.geckonet.co.za/\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="process_step"\\r\\n\\r\\n0\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="user_conversion_rate"\\r\\n\\r\\n1\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="user_currency_symbol"\\r\\n\\r\\nR\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="itemindex"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="discount_system"\\r\\n\\r\\nqty\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="store_location_id"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="temp_latitude"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="temp_longitude"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="temp_fulltext_user_address"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="searchphrase"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="dowhat_search"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="basket_total_items"\\r\\n\\r\\n0\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="Submit_Inc"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="Email_Inc"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="Password_Inc"\\r\\n\\r\\n\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="sorting_order"\\r\\n\\r\\n0\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_0"\\r\\n\\r\\n101-2-00526\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_1"\\r\\n\\r\\n101-2-00527\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_2"\\r\\n\\r\\n101-2-00528\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_3"\\r\\n\\r\\n101-2-00534\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_4"\\r\\n\\r\\n101-2-00535\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_5"\\r\\n\\r\\n101-2-00536\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_6"\\r\\n\\r\\n101-2-00532\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_7"\\r\\n\\r\\n101-2-00531\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_8"\\r\\n\\r\\n101-2-00533\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_9"\\r\\n\\r\\n103-4-00040\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_10"\\r\\n\\r\\n107-2-00065\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_11"\\r\\n\\r\\n107-2-00036\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_12"\\r\\n\\r\\n107-2-00054\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_13"\\r\\n\\r\\n107-2-00024\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_14"\\r\\n\\r\\n107-2-00025\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_15"\\r\\n\\r\\n107-2-00066\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_16"\\r\\n\\r\\n107-2-00067\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_17"\\r\\n\\r\\n107-2-00047\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_18"\\r\\n\\r\\n107-2-00046\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_19"\\r\\n\\r\\n107-2-00006\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_20"\\r\\n\\r\\n107-2-00026\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_21"\\r\\n\\r\\n107-2-00027\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_22"\\r\\n\\r\\n107-2-00028\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="code_23"\\r\\n\\r\\n107-2-00029\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="23c2cf7139e4775f5f4b9fe5deb183c70cae0e3c"\\r\\n\\r\\n2\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4\\r\\nContent-Disposition: form-data; name="sort"\\r\\n\\r\\nt1.`name`\\r\\n------WebKitFormBoundary57WatmxVwMPvJzr4--\\r\\n'
}

url='https://www.storeandmore.co.za/subcategory/crates_and_food_bins'
response1 = requests.get(url)

response2 = requests.post('https://www.storeandmore.co.za/sef/form_handler.php', headers=headers, cookies=cookies, data=data)
responses = [response1, response2]
for response in responses:

    soup = BeautifulSoup(response.text, 'html.parser')
    prodlink.append(response.url)
    for i in soup.find_all('a', class_='product_link'):
        prodlink.append(i['href'])

print(prodlink)
print(len(prodlink))

这篇关于如何使用请求抓取单个URL-多页的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆