Python爬取京东店铺商品价格数据

# -*- coding:utf-8 -*-
import requests
import re
import random
import time
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
# Silence urllib3's InsecureRequestWarning; the scraper below sends its
# requests with verify=False, which would otherwise warn on every call.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class jd(object):
    """Scrape product ids, names and prices of one JD.com shop into a CSV file.

    The instance owns a ``requests`` session preconfigured with
    mobile-browser headers; :meth:`getpid` pages through the shop's search
    endpoint on ``shop.m.jd.com`` and stores what it finds.
    """

    # Shop id inside a shop URL such as ".../index-1000000182.html".
    _SHOP_ID_RE = re.compile(r'index-(.+?)\.html')
    # Field extractors applied to the raw response text of one result page.
    _WARE_ID_RE = re.compile(r'"wareId":(.*?),')
    _WNAME_RE = re.compile(r'"wname":"(.*?)",')
    _JD_PRICE_RE = re.compile(r'"jdPrice":"(.*?)",')
    # Highest page number that will ever be requested.
    _MAX_PAGES = 9999
    # Seconds to wait for the server before a request is abandoned.
    _REQUEST_TIMEOUT = 30
    # File name of the CSV written into the caller-supplied directory.
    _CSV_NAME = 'jdmall.csv'

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.s = requests.session()
        self.s.headers.update({
            'Accept': '*/*',
            'Accept-Language': 'zh-CN',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8',
        })

    def getpid(self, url, name, path):
        """Collect every product of the shop at *url* and save them as CSV.

        Result pages are requested one after another until a page yields no
        product id (or ``_MAX_PAGES`` is reached). The rows are then written
        to ``jdmall.csv`` inside *path*, encoded as GB18030. If scraping is
        interrupted by an exception, the rows gathered so far are still
        written before the exception propagates. No file is written when no
        product was found.

        Args:
            url: Shop URL containing ``index-<shopId>.html``.
            name: Free-form label copied into the ``name`` column of each row.
            path: Directory receiving the CSV file; created if missing.

        Raises:
            ValueError: If *url* contains no shop id, or if a result page
                yields differing numbers of ids, names and prices.
        """
        import os  # local import: keeps this block independent of the file header

        match = self._SHOP_ID_RE.search(url)
        if match is None:
            raise ValueError(
                'no shop id found in %r; expected ".../index-<shopId>.html"' % (url,)
            )
        self.shopid = match.group(1)

        # Fail early on an unusable output directory instead of after scraping.
        os.makedirs(path, exist_ok=True)
        csv_path = os.path.join(path, self._CSV_NAME)

        # The endpoint takes a 13-digit millisecond timestamp as query value.
        searchurl = ('https://shop.m.jd.com/search/searchWareAjax.json?r='
                     + str(int(time.time() * 1000)))
        self.s.headers.update({
            'origin': 'https://shop.m.jd.com',
            'referer': 'https://shop.m.jd.com/search/search?shopId=' + str(self.shopid),
        })

        ware_ids = []
        ware_names = []
        prices = []
        try:
            for page in range(1, self._MAX_PAGES + 1):
                # Two random 0-1 s pauses per page, merged into one call; the
                # original author notes requests may fail without the delay.
                time.sleep(random.random() + random.random())

                text = self._fetch_page(searchurl, page)
                print(text)  # raw response dump, kept for troubleshooting

                page_ids = self._WARE_ID_RE.findall(text)
                if not page_ids:
                    # A page without product ids marks the end of the listing.
                    break
                page_names = self._WNAME_RE.findall(text)
                page_prices = self._JD_PRICE_RE.findall(text)
                if not (len(page_ids) == len(page_names) == len(page_prices)):
                    # Unequal counts would misalign the CSV columns.
                    raise ValueError(
                        'page %d: field counts differ (wareId=%d, wname=%d, jdPrice=%d)'
                        % (page, len(page_ids), len(page_names), len(page_prices))
                    )

                ware_ids.extend(page_ids)
                ware_names.extend(page_names)
                prices.extend(page_prices)
        finally:
            # Write once at the end rather than rewriting the file per page;
            # the finally clause preserves partial results on failure.
            if ware_ids:
                self._save(csv_path, name, ware_ids, ware_names, prices)

    def _fetch_page(self, searchurl, page):
        """Return the raw response text of result page number *page*."""
        form = {
            'shopId': str(self.shopid),
            'searchPage': str(page),
            'keyword': '',
            'searchSort': '0',
            'shopCategoryId': '',
            'clickSku': '',
            'skus': '',
            'jdDeliver': '0',
            'pageFrom': '',
        }
        # verify=False skips TLS certificate verification (unchanged from the
        # original); the timeout keeps a stalled connection from hanging.
        response = self.s.post(url=searchurl, data=form, verify=False,
                               timeout=self._REQUEST_TIMEOUT)
        return response.text

    def _save(self, csv_path, name, ware_ids, ware_names, prices):
        """Write the collected columns to *csv_path* as a GB18030 CSV file."""
        frame = pd.DataFrame(data={
            'name': [name] * len(ware_ids),
            'wareId': ware_ids,
            'wname': ware_names,
            'jdPrice': prices,
        })
        frame.to_csv(csv_path, index=False, encoding="GB18030")



if __name__ == '__main__':
    # Shop home page; the shop id is taken from the "index-<id>.html" part.
    url = 'https://mall.jd.com/index-1000000182.html'
    # Directory that receives the generated jdmall.csv.
    path = r'E:\JD\test'
    # Give the instance its own name so the class `jd` is not overwritten.
    spider = jd()
    # '华硕' is a free-form label stored in the `name` column of every row.
    spider.getpid(url, '华硕', path)

猜你喜欢

转载自blog.csdn.net/weixin_39416561/article/details/82684753