# Program note: this script scrapes JD.com refrigerator data, using a cookie for
# login (without logging in the data may not be returned). Selenium is not used,
# so only the first 26 pre-loaded items of page one are fetched; to get all data,
# crawl with selenium instead. The main point is the scraping approach itself.
'''
Time:2020-06-14
实现功能:爬取京东冰箱的数据,因为没有使用selenium,所以之抓取了网页提前加载好的26项冰箱数据
采用cookie的方式进行登录
'''
import re,urllib.request
import requests
from lxml import html
from bs4 import BeautifulSoup
import pymysql
# Request headers for every JD page fetch: a desktop-browser User-Agent plus a
# logged-in session cookie (without the cookie the search page may return no
# product data — see the module docstring).
# NOTE(review): this cookie is account-specific and will expire; replace it with
# a fresh session cookie before running.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
'Cookie':'areaId=5; ipLoc-djd=5-164-47713-0; _c_id=qie3ha7b1gfvaxys3c01592101567877gfoz; _s_id=pu1yswlh2ajk4wrkdeo1592117385919nkup; DeviceSeq=30d2ab2cf9f240c5a09a212d34a28907; pu1yswlh2ajk4wrkdeo1592117385919nkup=-39; unpl=V2_ZzNtbRZXEEVzC09SL05dVWIARggRUxAdIV8SUX5KDwQ3VBEJclRCFnQUR1NnGVkUZwcZXERcQhxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsdXQNvChpbQVVzJXI4dmRyGF8HYAYiXHJWc1chVEVTfB1VByoDFlxEX0odcwtEZHopXw%3d%3d; shshshfp=bdfbd30676db05baec1153d41b7f159a; shshshfpa=a4850859-3d83-914b-036c-e7889540c9f7-1583322638; shshshsID=37018e3c59156f7fa746619a807faf5c_10_1592118448475; shshshfpb=iHdcqfg%3D%3D%3B%20__jdu%3D1507989723; alc=jsOC235AjYbV9f620pTxfA==; _t=wux8PpWEocpdwcexwIhpnahtXcWUy58W9egqZyoKdDA=; wlfstk_smdl=343q1yydfzs7xrczpq4exfidwr91aabd; 3AB9D23F7A4B3C9B=X3JFCZIMGPDHGJIXAU3TPV74CT2ZIHYJUKRIEAMRBPYQWMEOUKMRSDKEER27LH35HFLF2NQA2CGDOWWGOBYG3GHHBU; TrackID=1ON2lV0ZKX-RuwIHxaO8_kbljs2dxmi60_My1bPgiznwCbZzJ9x8n14j6NEOEC7ry4FZy9Tet64ED28efg7fGEBQHHlok6Qecv-p40tU5xuw; thor=4EB2BECB2C05A9A0E1516FC3255F7EF7DF34F553A4E3D79CE79B83418481A723FC3C9B3B2E49F32C8D3801D1CFA400F5A9D107557ADCF0077792AFF459A8A42BA1EEFFB74C8AAADDA6BCE4C91616EA10BC03C1AFA236ED7A96C84B17607BA4BE609D6CF84CA505B8B6B2419FAA7E7495FC915E093104E90E0F66B69DB012532B28DF597B44419C9EC36E5EC966493CAB6411A6E85FFB6E3FC9DFF81DF36836A9; pinId=j6dNqBQwFcHW_FjgalQC9bV9-x-f3wj7; pin=jd_629df3df257ff; unick=%E6%A0%91%E4%B8%8B%E9%82%A3%E7%89%87%E9%81%AE%E9%98%B3%E5%A4%84; ceshi3.com=000; _tp=QqRLCDtS8YYvQgCnvfVUcuP7spHaPG6PPhYMaHGm8DQ%3D; logining=1; _pst=jd_629df3df257ff; __jda=76161171.1507989723.1592040090.1592101570.1592117379.7; __jdb=76161171.17.1507989723|7.1592117379; __jdc=76161171; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_96d39f8f1cd44ca1943e800e7bac1b63|1592118474453; __jdu=1507989723'
}
# MySQL database helper
def Mysql(SQL):
    """Execute a single SQL statement against the local MySQL database.

    Commits on success, rolls back on any error.  The statement is executed
    as-is — callers are responsible for its content (see NOTE below).

    Args:
        SQL: a complete SQL statement string.
    """
    # NOTE(review): callers build this SQL via %-formatting of scraped text,
    # which is vulnerable to SQL injection / broken quoting; prefer passing
    # parameters and using cursor.execute(sql, params).
    conn = pymysql.connect(host="localhost", user="root", password="lulianghao",
                           db="studentmysql", port=3306)
    cursor = conn.cursor()
    try:
        cursor.execute(SQL)
        conn.commit()
    except Exception as e:
        # Original swallowed the error silently; report it before rolling back
        # so failed inserts are at least visible.
        print("SQL execution failed:", e)
        conn.rollback()
    finally:
        # Fix: the cursor was never closed in the original.
        cursor.close()
        conn.close()
def url_get(url, timeout=10):
    """Fetch *url* with the module-level browser headers/cookie and return the body text.

    Args:
        url: page URL to fetch.
        timeout: seconds before the request is aborted (new, backward-compatible
            parameter — without it requests.get can block forever).

    Returns:
        The response body decoded as text.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
# Data is extracted with XPath; jQuery-style selectors or regexes would work
# too — use whichever is most convenient.
def get_data_name(url):
    """Scrape the first ~26 refrigerator listings from a JD search page.

    For each product: name, price, shop name, rating text, image URL, detail-page
    URL and sales-comment count are printed and inserted into the
    ``jingdongbingxiangget`` MySQL table.

    Args:
        url: a JD search-results page URL (see main()).
    """
    # Fetch the page once and reuse it for both the XPath and regex passes
    # (the original fetched it twice).
    page = url_get(url)
    docs = html.fromstring(page)
    names = docs.xpath("//div/div/a/em/text()")
    prices = docs.xpath("//div/div/strong/i/text()")
    shop_names = docs.xpath("//div/div/span/a/text()")
    ratings = docs.xpath("//div/span/em/text()")

    # Image src attributes: the li index must be baked into the XPath, so loop
    # over the 26 pre-loaded items explicitly.
    image_urls = []
    for item_idx in range(1, 27):
        xpath_expr = ("//*[@id='J_goodsList']/ul/li[%d]/div/div[1]/a/img/@src"
                      % item_idx)
        for src in docs.xpath(xpath_expr):
            # src is protocol-relative ("//img..."), so prepend a scheme.
            image_urls.append('http://' + str(src))

    # Detail-page href fragments: the part of each 'href="//item..."' link
    # between "item" and "html", de-duplicated with order preserved.
    href_pattern = re.compile('href="//item(.*?)"', re.S)
    fragment_pattern = re.compile('(.*?)html', re.S)
    detail_fragments = []
    for href in re.findall(href_pattern, page):
        for fragment in re.findall(fragment_pattern, href):
            if fragment not in detail_fragments:
                detail_fragments.append(fragment)

    # Sales/comment counts come from JD's comment-summary API.
    # NOTE(review): the referenceIds below are hard-coded product ids, so they
    # can drift out of sync with the products scraped above — confirm.
    comment_api = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=6371310,100008527158,6559653,6790737,100005404692,3482385,7300282,990705,100012312692,6044639,2788652,100012791656,11587360522,100003174244,100004328824,100003759928,1787111,6679030,100013116380,3459483,831075,10155316848,818271,100002472431,52260539551,1560207,3801116,4512422,3934375,100002152187&callback=jQuery5979221&_=1592140221399'
    comment_body = requests.get(comment_api).text
    count_pattern = re.compile('CommentCountStr":"(.*?)","', re.S)
    sales_counts = re.findall(count_pattern, comment_body)[:28]

    for i, image_url in enumerate(image_urls):
        detail_url = 'https://item' + detail_fragments[i] + 'html'
        print("第", i + 1, "条数据信息:价格:", prices[i], "名称:", names[i],
              "商家店名:", shop_names[i], "选购指数:", ratings[i],
              "\n图片地址:", image_url, "\n详细信息地址:", detail_url,
              "\n""销量:", sales_counts[i], "条销量数据")
        # Fix: the original inserted detail_url[i] — a single CHARACTER of the
        # URL — into the table; store the whole detail URL instead.
        Mysql("insert into jingdongbingxiangget values('%s','%s','%s','%s','%s','%s','%s');"
              % (prices[i], names[i], shop_names[i], ratings[i], image_url,
                 detail_url, sales_counts[i]))
def main():
    """Entry point: scrape the JD refrigerator search page and store the results."""
    print("Mysql数据库已连接.....")
    search_url = ('https://search.jd.com/Search?keyword=%E5%86%B0%E7%AE%B1&enc=utf-8'
                  '&suggest=1.his.0.0&wq=&pvid=d4e2610641164f8a833c32a67e38650d')
    get_data_name(search_url)
    print("程序运行结束.....")


if __name__ == '__main__':
    main()
# Scraping notes:
# Recorded for learning purposes only.
# (QQ: 972439329 — questions welcome; let's learn together.)