Python rastrea el nombre y el precio de los productos seckill de Jingdong

Python es muy divertido

# -*- coding: utf-8 -*-
# ! 2019/3/6 13:51
# !@Author:Cy 2019 03
# !@File:jdsk.py
import os
import time
import json
import datetime
import requests
import re

url="https://item.jd.com/7293066.html#askAnswer"

class Crawl(object):
    """Scrape names and flash-sale ("seckill") prices of JD.com products.

    Results are appended to a JSON file (named after today's date) in the
    current working directory. Typical usage: ``Crawl().main()``.
    """

    def __init__(self):
        """Prepare HTTP headers and open today's output JSON file."""
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0(Windows NT 10.0;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/71.0.3578.80Safari/537.36',
            'Referer': 'https://miaosha.jd.com/category.html'
        }
        f_dir = os.getcwd()
        f_name = '京东秒杀%s.json' % str(datetime.datetime.now().date())
        print("爬虫_QQ小招 自检完成,初始化成功" + f_dir + f_name)
        self.path = os.path.join(f_dir, f_name)
        self.fp = open(self.path, 'w', encoding='utf-8')
        self.fp.write('[\n')
        # Tracks whether anything has been written yet, so entries can be
        # comma-separated WITHOUT a trailing comma (the original wrote a
        # trailing comma before ']', producing invalid JSON).
        self._first_item = True
        print("准备进入主函数")

    def crawl(self, url, cate):
        """Fetch and store the goods list for one seckill category.

        url:  unused — the JSONP API endpoint is hard-coded below; the
              parameter is kept for backward compatibility with callers.
        cate: category display name; must be a key of ``self.category_id``
              (populated by ``main``).
        """
        print('进入爬虫函数,现在采集', cate, '这类')
        api = 'https://ai.jd.com/index_new'
        data = {
            'app': 'Seckill',
            'action': 'pcSeckillCategoryGoods',
            'callback': 'pcSeckillCategoryGoods',
            'id': self.category_id[cate],
            '_': int(time.time()),
        }
        res = self.response_handler(api, data).text
        # The endpoint answers with JSONP: pcSeckillCategoryGoods({...});
        # strip the callback wrapper to recover the JSON payload.
        matches = re.findall(r'pcSeckillCategoryGoods\((.*?)\);', res, re.S | re.M)
        if not matches:
            # Defensive: the original indexed aaa[0] and crashed with
            # IndexError on an empty/unexpected response.
            print('未找到数据', cate)
            return
        datas = json.loads(matches[0]).get('goodsList', [])
        for item in self.parse(datas):
            self.witer(item)

    def response_handler(self, url, data):
        """POST ``data`` to ``url`` with the crawler headers; return the response."""
        print('开始构造响应')
        res = requests.post(url=url, data=data, headers=self.headers)
        print('构造响应成功')
        return res

    def parse(self, res):
        """Turn the raw goods list into ``[{name: seckill_price}, ...]``.

        res: list of dicts, each expected to carry 'wname' (product name)
             and 'miaoShaPrice' (flash-sale price) keys.
        """
        items = [{goods['wname']: goods['miaoShaPrice']} for goods in res]
        print('解析数据完成')
        return items

    def witer(self, item):
        """Serialize one item and append it to the output JSON array.

        Method name kept as-is ("witer", sic) for backward compatibility.
        """
        date = json.dumps(item, ensure_ascii=False)
        print('存入数据', date)
        # Comma-separate entries; no separator before the first one so the
        # finished file is valid JSON.
        if self._first_item:
            self._first_item = False
        else:
            self.fp.write(',\n')
        self.fp.write(date)

    def close(self):
        """Terminate the JSON array and close the output file."""
        self.fp.write('\n]')
        self.fp.close()
        # Bug fix: report the file PATH — the original formatted self.fp,
        # which prints a (closed) file-object repr, not the location.
        print('采集完成!数据存储在%s' % self.path)
        print('完成爬虫')

    def main(self):
        """Crawl every known seckill category, then finalize the output file."""
        url = "https://miaosha.jd.com/category.html"
        print(url)
        # Category display name -> JD seckill category id.
        self.category_id = {
            '电脑办公': 29,
            '生活电器': 19,
            '手机通讯': 30,
            '大家电': 25,
            '智能数码': 31,
            '饮料酒水': 45,
            '家居家装': 37,
            '母婴童装': 43,
            '食品生鲜': 44
        }
        print("进入主函数了")
        for cate in self.category_id:
            print(cate, self.category_id[cate])
            self.crawl(url, cate)
        self.close()

if __name__ == '__main__':
    # Script entry point: build the crawler and run the full scrape.
    crawler = Crawl()
    crawler.main()

 

Supongo que te gusta

Origin blog.csdn.net/paycho/article/details/88366509
Recomendado
Clasificación