spider代码:
# -*- coding: utf-8 -*-
import re
import urllib.request  # was `import urllib`: urllib.request is a submodule and
                       # is NOT guaranteed to be bound by `import urllib` alone

import scrapy
from scrapy.http import Request

from jingdong.items import JingdongItem


class JdSpider(scrapy.Spider):
    """Crawl JD search results for the keyword "笔记本" (laptop).

    Flow: parse() fans out over 100 search pages -> next() extracts product
    SKU ids and visits each product page -> next2() builds the item, fetching
    price and comment count from JD's JSON endpoints (the "packet capture"
    URLs), since neither appears in the product page HTML.
    """

    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        """Yield requests for the first 100 search-result pages."""
        key = "笔记本"
        search_url = ("https://search.jd.com/Search?keyword=" + key
                      + "&enc=utf-8&wq=" + key)
        # JD numbers its search pages 1, 3, 5, ... (two half-pages per
        # screen), hence page = 2*i - 1.
        for i in range(1, 101):
            page_url = search_url + "&page=" + str(i * 2 - 1)
            yield Request(url=page_url, callback=self.next)

    def next(self, response):
        """Extract product SKU ids from one search page, visit each item."""
        # Renamed from `id` to avoid shadowing the builtin.
        sku_ids = response.xpath(
            '//ul[@class="gl-warp clearfix"]/li/@data-sku').extract()
        for sku in sku_ids:
            true_url = "https://item.jd.com/" + str(sku) + ".html"
            yield Request(url=true_url, callback=self.next2)

    def next2(self, response):
        """Build a JingdongItem: title/link from the page, price and
        comment count from JD's JSON endpoints.

        Guards every lookup that previously did a bare ``[0]`` index, so a
        captcha page or malformed JSON skips the item instead of raising
        IndexError (JD starts serving captchas under heavy crawling).
        """
        item = JingdongItem()
        titles = response.xpath('//head/title/text()').extract()
        if not titles:
            return  # blocked / captcha page: no title, skip this item
        item['title'] = (titles[0]
                         .replace('【图片 价格 品牌 报价】-京东', '')
                         .replace('【行情 报价 价格 评测】-京东', ''))
        item['link'] = response.url
        # Dots escaped: the old pattern r'https://item.jd.com/(.*?).html'
        # let '.' match any character, so e.g. '...Xhtml' would also match.
        m = re.search(r'https://item\.jd\.com/(.*?)\.html', item['link'])
        if m is None:
            return  # not a product URL we can extract an id from
        true_id = m.group(1)
        # Price JSON endpoint. NOTE(review): blocking urllib calls inside a
        # Scrapy callback stall the reactor; consider yielding Requests for
        # these URLs instead.
        price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + str(true_id)
        price_txt = urllib.request.urlopen(price_url).read().decode(
            'utf-8', 'ignore')
        prices = re.findall(r'"p":"(.*?)"', price_txt)
        item['price'] = prices[0] if prices else ''
        # Comment-count JSON endpoint.
        comment_url = ("https://club.jd.com/comment/"
                       "productCommentSummaries.action?referenceIds="
                       + str(true_id))
        comment_txt = urllib.request.urlopen(comment_url).read().decode(
            'utf-8', 'ignore')
        comments = re.findall(r'"CommentCount":(.*?),"', comment_txt)
        item['comment'] = comments[0] if comments else ''
        return item
pipeline代码:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors


class JingdongPipeline(object):
    """Persist scraped JD items into the MySQL table `jd`.`computer`."""

    def __init__(self):
        # db="jd" already selects the database, so the old "USE jd" was
        # redundant and has been dropped.
        self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                    passwd="", db="jd", charset='utf8')
        cur = self.conn.cursor()
        # IF NOT EXISTS: the old unconditional CREATE TABLE raised an
        # OperationalError on every run after the first.
        cur.execute(
            "CREATE TABLE IF NOT EXISTS computer("
            "title VARCHAR(100),link VARCHAR(50),"
            "price VARCHAR(50),comment VARCHAR(50))")
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one item; failures are logged instead of silently dropped.

        The old ``except Exception: pass`` swallowed every insert error —
        including the charset problems the write-up mentions — making them
        impossible to diagnose.
        """
        try:
            cur = self.conn.cursor()
            cur.execute(
                "INSERT INTO computer(title,link,price,comment) "
                "VALUES (%s,%s,%s,%s)",
                (item['title'], item['link'],
                 item['price'], item['comment']))
            self.conn.commit()
        except Exception as err:
            spider.logger.error("MySQL insert failed for %s: %s",
                                item.get('link'), err)
        return item

    def close_spider(self, spider):
        """Scrapy hook: release the MySQL connection when the crawl ends."""
        self.conn.close()
使用的是navicat作为mysql的交互
最后结果:
遇到的一些难题:
1、mysql的安装,参考我的另一篇博文:
当python遇到mysql时,如何顺利安装mysql
2、抓包:我所取的数据里面,有两个字段是需要抓包的,一个是price,另一个是comment,抓包的时候注意包的地址,里面一般会包括关键字,例如price的包的链接名里面也会有price
3、我的navicat插数据进去的时候中文会显示'???'的乱码（字符集问题），这里我参考了另一篇讲 MySQL 连接与表字符集设置为 utf8 的文章解决（原文链接已失效）
4、经过多次调试之后,发现访问数据量太多了,京东开始问我要验证码了,验证码解码方面还在学习当中,掌握了之后再回头做修改