python3: scraping JD.com refrigerator listings (study notes)

Program description: this script scrapes refrigerator listings from JD.com. It logs in by sending a Cookie header (without logging in you may not get any data back). Because selenium is not used to drive a browser, only the first 26 items pre-rendered on the first results page are collected; to get everything, selenium could be used to render and crawl all of the data (a minimal sketch is included after the full script below). Finally, a note: I am still a beginner and the code is not written very well; the most important thing is to understand the overall approach to extracting the information.

'''
    Time: 2020-06-14
    What it does: scrape JD.com refrigerator data; since selenium is not used, only the 26 items pre-loaded in the page are captured
    Login is handled by sending a Cookie header
'''
import re
import requests
from lxml import html
import pymysql
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Cookie':'areaId=5; ipLoc-djd=5-164-47713-0; _c_id=qie3ha7b1gfvaxys3c01592101567877gfoz; _s_id=pu1yswlh2ajk4wrkdeo1592117385919nkup; DeviceSeq=30d2ab2cf9f240c5a09a212d34a28907; pu1yswlh2ajk4wrkdeo1592117385919nkup=-39; unpl=V2_ZzNtbRZXEEVzC09SL05dVWIARggRUxAdIV8SUX5KDwQ3VBEJclRCFnQUR1NnGVkUZwcZXERcQhxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsdXQNvChpbQVVzJXI4dmRyGF8HYAYiXHJWc1chVEVTfB1VByoDFlxEX0odcwtEZHopXw%3d%3d; shshshfp=bdfbd30676db05baec1153d41b7f159a; shshshfpa=a4850859-3d83-914b-036c-e7889540c9f7-1583322638; shshshsID=37018e3c59156f7fa746619a807faf5c_10_1592118448475; shshshfpb=iHdcqfg%3D%3D%3B%20__jdu%3D1507989723; alc=jsOC235AjYbV9f620pTxfA==; _t=wux8PpWEocpdwcexwIhpnahtXcWUy58W9egqZyoKdDA=; wlfstk_smdl=343q1yydfzs7xrczpq4exfidwr91aabd; 3AB9D23F7A4B3C9B=X3JFCZIMGPDHGJIXAU3TPV74CT2ZIHYJUKRIEAMRBPYQWMEOUKMRSDKEER27LH35HFLF2NQA2CGDOWWGOBYG3GHHBU; TrackID=1ON2lV0ZKX-RuwIHxaO8_kbljs2dxmi60_My1bPgiznwCbZzJ9x8n14j6NEOEC7ry4FZy9Tet64ED28efg7fGEBQHHlok6Qecv-p40tU5xuw; thor=4EB2BECB2C05A9A0E1516FC3255F7EF7DF34F553A4E3D79CE79B83418481A723FC3C9B3B2E49F32C8D3801D1CFA400F5A9D107557ADCF0077792AFF459A8A42BA1EEFFB74C8AAADDA6BCE4C91616EA10BC03C1AFA236ED7A96C84B17607BA4BE609D6CF84CA505B8B6B2419FAA7E7495FC915E093104E90E0F66B69DB012532B28DF597B44419C9EC36E5EC966493CAB6411A6E85FFB6E3FC9DFF81DF36836A9; pinId=j6dNqBQwFcHW_FjgalQC9bV9-x-f3wj7; pin=jd_629df3df257ff; unick=%E6%A0%91%E4%B8%8B%E9%82%A3%E7%89%87%E9%81%AE%E9%98%B3%E5%A4%84; ceshi3.com=000; _tp=QqRLCDtS8YYvQgCnvfVUcuP7spHaPG6PPhYMaHGm8DQ%3D; logining=1; _pst=jd_629df3df257ff; __jda=76161171.1507989723.1592040090.1592101570.1592117379.7; __jdb=76161171.17.1507989723|7.1592117379; __jdc=76161171; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_96d39f8f1cd44ca1943e800e7bac1b63|1592118474453; __jdu=1507989723'
    }
# Run a single SQL statement against the local MySQL database
def Mysql(SQL):
    conn = pymysql.connect(host="localhost", user="root", password="lulianghao", db="studentmysql", port=3306)
    cursor = conn.cursor()
    try:
        cursor.execute(SQL)
        conn.commit()
    except Exception:
        # Roll back the transaction if the statement fails
        conn.rollback()
    finally:
        conn.close()
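# Note: the insert in get_data_name() below assumes a table named jingdongbingxiangget
# with seven string columns already exists. The post does not show the schema, so the
# column names here are only an illustrative guess:
#   CREATE TABLE jingdongbingxiangget (
#       price VARCHAR(50), name VARCHAR(255), shop VARCHAR(255),
#       purchase_index VARCHAR(50), image_url VARCHAR(500),
#       detail_url VARCHAR(500), sales VARCHAR(50)
#   );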
def url_get(url):
    response = requests.get(url, headers=headers)
    htmls=response.text
    return htmls
# Extract the fields with XPath; pyquery or regular expressions would work just as well, use whichever is convenient
def get_data_name(url):
    docs = html.fromstring(url_get(url))
    name = docs.xpath("//div/div/a/em/text()")        # product titles
    price = docs.xpath("//div/div/strong/i/text()")   # prices
    Author = docs.xpath("//div/div/span/a/text()")    # shop names
    Xuangou = docs.xpath("//div/span/em/text()")      # "purchase index" labels
    get_url = []
    get_json_image = []
    # I could not find a neater way at the time, so the image src XPath is built in a
    # loop, one li element at a time, to collect the jpg addresses
    for i in range(1, 27):
        Url = docs.xpath("//*[@id='J_goodsList']/ul/li[" + str(i) + "]/div/div[1]/a/img/@src")
        get_url.append(Url)
    for geturl in get_url:
        for i in geturl:
            # The src values are protocol-relative ("//img..."), so only the scheme is prefixed
            get_json_image.append('https:' + str(i))
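    # A simpler alternative (untested sketch) would be a single XPath query without the loop:
    #   get_json_image = ['https:' + src for src in
    #                     docs.xpath("//*[@id='J_goodsList']/ul/li/div/div[1]/a/img/@src")]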
    # The XPath results are already lists; copy them into the names used below
    Name = list(name)
    Price = list(price)
    Autjo = list(Author)    # shop names
    zhishu = list(Xuangou)  # purchase-index labels
    # Pull the detail-page URL fragments out of the raw HTML with a regex
    patten = re.compile('href="//item(.*?)"', re.S)
    hrefs = re.findall(patten, url_get(url))
    parts = []
    for i in hrefs:
        pattens = re.compile('(.*?)html', re.S)
        Url = re.findall(pattens, i)
        for i in Url:
            parts.append(i)
    # Remove duplicates while keeping the original order
    list2 = []
    for x in parts:
        if x not in list2:
            list2.append(x)
    # Comment counts come from JD's comment-summary interface; the referenceIds below
    # were hard-coded for this particular result page
    urlss = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=6371310,100008527158,6559653,6790737,100005404692,3482385,7300282,990705,100012312692,6044639,2788652,100012791656,11587360522,100003174244,100004328824,100003759928,1787111,6679030,100013116380,3459483,831075,10155316848,818271,100002472431,52260539551,1560207,3801116,4512422,3934375,100002152187&callback=jQuery5979221&_=1592140221399'
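    # In principle the referenceIds could be rebuilt from the scraped product links
    # instead of being hard-coded. An illustrative, untested sketch:
    #   ids = [re.search(r'/(\d+)\.', x).group(1) for x in list2]
    #   urlss = ('https://club.jd.com/comment/productCommentSummaries.action'
    #            '?referenceIds=' + ','.join(ids))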
    response = requests.get(urlss)
    htmlss = response.text
    patten = re.compile('CommentCountStr":"(.*?)","', re.S)
    getdata = re.findall(patten, htmlss)
    # Keep the comment-count strings; only the first 26 are actually used below
    ShopS = []
    for j, i in enumerate(getdata):
        if j <= 27:
            ShopS.append(i)
    for i, ios in enumerate(get_json_image):
        # Rebuild the full detail-page URL for this product
        get_list2 = 'https://item' + list2[i] + 'html'
        print("Item", i + 1, "- price:", Price[i], "title:", Name[i], "shop:", Autjo[i], "purchase index:", zhishu[i],
              "\nimage URL:", get_json_image[i], "\ndetail URL:", get_list2,
              "\ncomment count:", ShopS[i])
        Mysql("insert into jingdongbingxiangget values('%s','%s','%s','%s','%s','%s','%s');" % (Price[i], Name[i], Autjo[i],
                                                       zhishu[i], get_json_image[i], get_list2, ShopS[i]))
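# Note: building the insert with "%" string formatting breaks if a field contains a
# single quote. A parameterized query, e.g.
#   cursor.execute("insert into jingdongbingxiangget values(%s,%s,%s,%s,%s,%s,%s)", row)
# would be safer, but the Mysql() helper above would need to accept parameters as well.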
def main():
    print("Mysql数据库已连接.....")
    url = 'https://search.jd.com/Search?keyword=%E5%86%B0%E7%AE%B1&enc=utf-8&suggest=1.his.0.0&wq=&pvid=d4e2610641164f8a833c32a67e38650d'
    get_data_name(url)
    print("程序运行结束.....")
if __name__ == '__main__':
    main()
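
As noted in the description, the script above only sees the items that are pre-rendered in the search page's HTML. Below is a minimal sketch of how selenium could be used in place of requests to render the rest of the first page before parsing; it assumes Chrome with a chromedriver on PATH, and url_get_selenium is a name invented here for illustration, not part of the original script.

from selenium import webdriver
import time

def url_get_selenium(url):
    driver = webdriver.Chrome()  # assumes chromedriver is installed and on PATH
    driver.get(url)
    # JD lazy-loads the remaining items once the page is scrolled to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # crude wait for the lazily loaded items to render
    page = driver.page_source
    driver.quit()
    return page

get_data_name() could then parse html.fromstring(url_get_selenium(url)) instead of calling url_get(url), and the range(1, 27) loop would need to cover however many li elements actually end up in the page. If the data really does require login, the cookie from headers would also have to be injected into the selenium session (driver.add_cookie) after first opening a jd.com page.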

Scraping log:
(screenshot of the scraped output)

These are just study notes.
(QQ: 九七二四三九三二九; feel free to ask me if anything is unclear, and we can learn together)



Reposted from blog.csdn.net/weixin_45005209/article/details/106762171