#!coding=utf-8
# Suning.com (苏宁易购) shop product data scraper
import requests
import re
import math
import random
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ###禁止提醒SSL警告


class suning(object):
    """Scrape the full product catalogue of a Suning.com (苏宁易购) shop.

    Workflow: resolve the shop id from its front page, page through the
    shop search API to collect every SPU (expanding multi-color products
    into one row per color variant), then batch-query the price API.
    Results are checkpointed to CSV files under ``path``.
    """

    def __init__(self, path):
        # path: directory where the CSV output files are written
        self.path = path
        self.s = requests.session()
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
                   }
        self.s.headers.update(headers)

    def closes(self):
        """Close the underlying HTTP session."""
        self.s.close()

    def get_shopid(self, url):
        """Fetch the shop front page and return ``(shop_id, shop_name)``.

        The shop id is left-padded with zeros to 10 characters, which is
        the form the search API expects.
        Raises AttributeError if the page does not match the expected markup.
        """
        page = self.s.get(url=url, verify=False).text
        shopid = re.search(r'@id": "(.*?)://shop.suning.com/(.*?)/index.html"', page).group(2)
        shopnm = re.search(r'<title>(.*?)</title>', page).group(1)
        return shopid.zfill(10), shopnm

    def goodid(self, url):
        """Crawl every product (SPU) of the shop at ``url``.

        Pages through the shop search API 48 items at a time. Products with
        a non-empty ``subColors`` field are expanded into one row per color
        variant (``spu`` and ``subColors`` taken from the variant, all other
        fields shared). The accumulated rows are checkpointed to
        ``<path>\\sngoodid.csv`` after each page, and the final DataFrame is
        returned.
        """
        shopid, shopnm = self.get_shopid(url)
        # One URL template for both the size probe (start=0) and the page loop.
        api = ('https://csearch.suning.com/emall/cshop/queryByKeyword.do?'
               'vendor_Id={}&keyword=&start={}&rows=48&sortField=&cf=price:'
               '&pcode=&callback=jsonpQueryByKeyword')
        page = self.s.get(url=api.format(shopid, 0), verify=False).text
        time.sleep(random.random())  # polite random delay between requests
        total = re.search('"totalSize":(.*?),', page).group(1)
        pages = math.ceil(int(total) / 48)

        columns = ["nm", "url", "spu", "subColors", "price1",
                   "countOfarticle", "praiseRate", "firstShelfTime", "inventory"]
        df = pd.DataFrame(columns=columns)
        df.to_csv(self.path + r'\sngoodid.csv', index=False, encoding="GB18030")
        y = 0  # next DataFrame row index

        for i in range(pages):
            html = self.s.get(url=api.format(shopid, i * 48), verify=False).text
            time.sleep(random.random())

            nm = re.findall('"catentdesc":"(.*?)"}', html)
            urls = re.findall('"commidityUrl":"(.*?)",', html)
            price = re.findall('"price":"(.*?)",', html)
            countOfarticle = re.findall('"countOfarticle":"(.*?)",', html)
            praiseRate = re.findall('"praiseRate":(.*?),', html)
            firstShelfTime = re.findall('"firstShelfTime":"(.*?)",', html)
            inventory = re.findall('"inventory":"(.*?)",', html)
            subColors = re.findall('"subColors":(.*?),"', html)

            for j in range(len(nm)):
                # Fields shared by the product and all of its color variants.
                common = {"nm": nm[j], "url": urls[j], "price1": price[j],
                          "countOfarticle": countOfarticle[j],
                          "praiseRate": praiseRate[j],
                          "firstShelfTime": firstShelfTime[j],
                          "inventory": inventory[j]}
                if subColors[j] == '""':
                    # Simple product: SPU comes from the commodity URL.
                    for col, val in common.items():
                        df.at[y, col] = val
                    df.at[y, "spu"] = re.search(r'\d/(.*?)\.html', urls[j]).group(1).strip()
                    df.at[y, "subColors"] = ''
                    y += 1
                else:
                    # Multi-color product: one row per "spu|...|color" variant.
                    for variant in subColors[j].split(','):
                        parts = variant.split('|')
                        df.at[y, "spu"] = parts[0].replace('"', '').strip()
                        df.at[y, "subColors"] = parts[2]
                        for col, val in common.items():
                            df.at[y, col] = val
                        y += 1
            # Stamp the shop name on every row and checkpoint once per page
            # (the original wrote the whole CSV after every single row).
            df.loc[:, "shopnm"] = shopnm
            df.to_csv(self.path + r'\sngoodid.csv', index=False, encoding="GB18030")
        return df

    def spudata(self, df):
        """Fetch prices for every SPU in ``df`` (20 per API call).

        Appends a ``price`` column, writes ``<path>\\spudata.csv`` and
        returns the DataFrame. Assumes the price API returns exactly one
        price per requested SPU, in request order.
        """
        n = len(df['nm'])
        price_list = []
        for i in range(math.ceil(n / 20)):
            spus = []
            shopids = []
            for y in range(i * 20, min((i + 1) * 20, n)):
                # SPU ids are left-padded with zeros to 18 characters.
                spus.append(str(df['spu'][y]).zfill(18))
                shopids.append(re.search('com/(.*?)/', df['url'][y]).group(1))
            url = (r'https://icps.suning.com/icps-web/getVarnishAllPrice014/'
                   r'{}_020_0200101_{}_1_getClusterPrice.vhtm?callback=getClusterPrice'
                   ).format(','.join(spus), ','.join(shopids))
            html = self.s.get(url=url, verify=False).text
            time.sleep(random.random())
            price_list.extend(re.findall('"price":"(.*?)",', html))
        df.loc[:, "price"] = price_list
        df.to_csv(self.path + r'\spudata.csv', index=False, encoding="GB18030")
        return df


if __name__ == '__main__':
    # Crawl the shop's product list, then fetch prices for each SPU.
    save_dir = r'E:\suning'
    crawler = suning(path=save_dir)
    shop_url = 'https://phoenix.suning.com'
    goods = crawler.goodid(shop_url)
    # To resume from a previous run instead of re-crawling:
    # goods = pd.read_csv(save_dir + '\sngoodid.csv', encoding="GB18030")
    crawler.spudata(goods)
    crawler.closes()

# GitHub: https://github.com/linyhuan/Crawler
# Originally published at: blog.csdn.net/weixin_39416561/article/details/82018864