2021-10-22 Crawling E-Commerce Site Data (Without Async)

The emphasis here is on mastering BeautifulSoup: locating elements with select and select_one via class or id attributes, and trying out regular expressions along the way.
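As a warm-up, here is a tiny standalone snippet (separate from the full script below, using made-up markup) showing the two selector calls and the '#'/'.' syntax this post practices:

from bs4 import BeautifulSoup
demo = BeautifulSoup('<ul id="list"><li class="item">A</li><li class="item">B</li></ul>', 'lxml')
print(demo.select('#list > li.item'))   # select: every match; '#' targets an id, '.' a class
print(demo.select_one('li.item').text)  # select_one: first match only -> A

The full crawler: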

from bs4 import BeautifulSoup
import requests
import time
import json
import re
# from selenium import webdriver # only needed for the PhantomJS fallback sketched in get_html
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','referer': 'https://www.jd.com/'}
cookiestr='shshshfpa=509a4836-a95f-a00b-da6f-a2cee3bdc012-1573951043; shshshfpb=xei61TmhyHUJmvGIu%2FBoS3w%3D%3D; __jdu=787291882; user-key=27687cb1-4237-49c1-be50-1389469ccb2a; cn=0; ipLoc-djd=1-72-4137-0; areaId=1; PCSYCityID=CN_330000_0_0; __jdc=122270672; 3AB9D23F7A4B3C9B=DCJUYZT25TVN4JGXFQIH5WNSLDSVHW4ZJE4YXXJEHTQW7CSAAWIXEJA5SY6KYZWKQQNRQEW5GIBRUEYWYHZTRPD5IU; _gcl_au=1.1.1358316623.1582685147; shshshfp=14c88871408acf96dfa7675a8c41baa8; __jda=122270672.787291882.1573690002.1582682640.1582851083.29; __jdv=122270672|direct|-|none|-|1582851083348; __jdb=122270672.3.787291882|29.1582851083'
cookies={} # build a cookies dict to cope with anti-crawling checks
for i in cookiestr.split(';'):
    k, v = i.strip().split('=', 1) # strip spaces; split on the first '=' only
    cookies[k] = v
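# An equivalent parse using only the standard library (a sketch; SimpleCookie
# also copes with values that contain '='):
# from http.cookies import SimpleCookie
# jar = SimpleCookie()
# jar.load(cookiestr)
# cookies = {k: morsel.value for k, morsel in jar.items()}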

def get_price(skuid): # get a product's original price and promotional price
    url='https://p.3.cn/prices/mgets?callback=jQuery7409665&ext=11101100&pin=&type=1&area=1_72_4137_0&skuIds=J_%s'%skuid
    html = requests.get(url,headers=headers,cookies=cookies) # cookies don't seem to be needed here
    pattern = re.compile(r'{.+}')
    data = json.loads(re.findall(pattern, html.text)[0]) # parse the JSON once instead of twice
    originalprice = data['op']
    promotionprice = data['p']
    return originalprice,promotionprice # return a tuple of (original price, promotional price)
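# For reference, the price endpoint replies with JSONP, roughly of this shape
# (the values here are illustrative, not real):
#   jQuery7409665([{"op":"6999.00","p":"6499.00","id":"J_100012043978"}])
# so the r'{.+}' regex strips the jQuery7409665([ ... ]) wrapper before json.loads.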

def get_comments(skuid): # get a product's review summary
    url='https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=%s'%skuid
    html = requests.get(url,headers=headers,cookies=cookies)
    dic=json.loads(html.text)
    return dic # returned as a dict
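# Illustrative shape of the summary dict (only the keys used later are shown;
# the numbers are invented):
#   {'CommentsCount': [{'CommentCount': 120000, 'GoodRate': 0.98}]}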


def get_html(url):
    # b=webdriver.PhantomJS() # fallback when blocked: crawl with selenium + PhantomJS
    # b.get(url)
    # if b.page_source:
    #     print('Page fetched successfully...')
    #     parse_html(b.page_source)
    # else:
    #     print('Error: empty page source')
    # return b.page_source
    html=requests.get(url,headers=headers)
    if html.status_code==200:
        # print('Page fetched successfully...')
        # print(html.text)
        parse_html(html.text)
    else:
        print('Error:',html.text)
    return html.text
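# Note: PhantomJS is no longer maintained; the same fallback today could use
# headless Chrome instead (a sketch, assuming chromedriver is installed):
# from selenium import webdriver
# opts = webdriver.ChromeOptions()
# opts.add_argument('--headless')
# b = webdriver.Chrome(options=opts)
# b.get(url)
# page_source = b.page_source
# b.quit()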

def parse_html(html):
    soup=BeautifulSoup(html,'lxml')
    products=soup.select('#J_goodsList > ul > li') # the > combinators cannot be omitted here, otherwise duplicate items are matched; use # for id, . for class
    n=0
    for i in products:
        shopname = i.select_one('div > div.p-shop').get('data-shop_name')
        # shopname = i.find('div',class_='p-shop').get('data-shop_name') # BeautifulSoup's find works too
        sku=i.get('data-sku')
        comments = get_comments(sku)
        price = get_price(sku)[1] # promotional price
        productname =i.select_one('div > div.p-name > a > em').text.strip()
        productlink='http:'+i.select_one('div > div.p-img > a')['href']
        try:
            img='http:'+i.select_one('div > div.p-img > a > img')['src']
        except KeyError: # lazy-loaded images carry the URL in data-lazy-img instead of src
            img='http:'+i.select_one('div > div.p-img > a > img')['data-lazy-img']
        # r.hmset(sku, {'name': productname, 'shop': shopname, 'link': productlink, 'image': img}) # r would be a Redis client
        n+=1
        print(shopname, sku, productname, productlink, img, 'price:', price, 'comments:',
              comments['CommentsCount'][0]['CommentCount'], 'good rate:', comments['CommentsCount'][0]['GoodRate'])
    print(len(products)) # items found on the page
    print(n) # items actually parsed
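# Why the > combinator matters in '#J_goodsList > ul > li' -- a standalone
# sketch with made-up nested markup:
# s = BeautifulSoup('<div id="g"><ul><li>item<ul><li>tag</li></ul></li></ul></div>', 'lxml')
# len(s.select('#g ul li'))      # 2 -- the descendant selector also hits the nested <li>
# len(s.select('#g > ul > li'))  # 1 -- only the direct child <li>, i.e. the product itself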

if __name__=='__main__':
    time1=time.time()
    urls = ['https://list.jd.com/list.html?cat=9847,9850&page=%s'%str(i) for i in range(1,2)]
    pageNum=0
    for i in urls:
        time.sleep(1) # throttle a little between pages
        pageNum+=1
        print('Crawling Page No.:',pageNum)
        get_html(i)
    time2=time.time()
    print('Time used:',time2-time1) # Time used: 4.054231882095337 -- the time to crawl 5 pages when not being blocked
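A small follow-up idea (not from the original script): a requests.Session would reuse one TCP connection across pages and carry the headers and cookies automatically, which usually shaves a bit off the total time:

s = requests.Session()
s.headers.update(headers)
s.cookies.update(cookies)
html = s.get('https://list.jd.com/list.html?cat=9847,9850&page=1')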


Reposted from: blog.csdn.net/weixin_45387160/article/details/120912921