A Scrapy CrawlSpider-based crawler for JD (jd.com) product information

items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JingdongItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # fields act as containers for the scraped data
    shop = scrapy.Field()      # shop name
    shoplink = scrapy.Field()  # shop URL
    price = scrapy.Field()     # price, fetched from JD's price JSON interface
    comment = scrapy.Field()   # good-rating percentage, fetched from the comment JSON interface

jd.py (the spider)

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jingdong.items import JingdongItem
from scrapy.http import Request
import urllib.request
import re
class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://www.jd.com/']
    '''
    def start_requests(self):
        ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0'}
        yield Request('https://search.jd.com/Search?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99%E5%86%AC%E5%A5%B3&enc=utf-8', headers=ua)
    '''
    rules = (
        # follow every link on the site; product pages are filtered out inside parse_item
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        try:
            i = JingdongItem()
            thisurl = response.url  # URL of the page currently being parsed
            pat = r'item\.jd\.com/(.*?)\.html'  # product pages carry the SKU id in the URL
            x = re.search(pat, thisurl)
            if x:
                thisid = x.group(1)  # SKU id, used by the price/comment JSON interfaces below
                title = response.xpath('//html/head/title/text()').extract()
                # alternative selectors noted by the author: //div[@class="name"]/a/@title, //div[@class="brand-logo"]/a/img
                shop = response.xpath('//div[@class="name"]/a/@title').extract()
                shoplink = response.xpath('//div[@class="name"]/a/@href').extract()
                # price and good-rating percentage come from JD's JSON interfaces, not from the page itself
                priceurl = 'https://p.3.cn/prices/mgets?callback=jQuery9030294&type=1&area=1_72_4137_0&pdtk=&pduid=378203029&pdpin=&pin=null&pdbp=0&skuIds=J_' + str(thisid)
                commenturl = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv191&productId=' + str(thisid) + '&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1'
                pricedata = urllib.request.urlopen(priceurl).read().decode('utf-8', 'ignore')
                commentdata = urllib.request.urlopen(commenturl).read().decode('utf-8', 'ignore')
                pricepat = '"p":"(.*?)"'
                commentpat = '"goodRateShow":(.*?),'
                price = re.compile(pricepat).findall(pricedata)
                comment = re.compile(commentpat).findall(commentdata)
                if title and shop and shoplink and price and comment:
                    i['title'] = title
                    i['shop'] = shop
                    i['shoplink'] = shoplink
                    i['price'] = price
                    i['comment'] = comment
                    '''print(title[0])
                    print(shop[0])
                    print(shoplink[0])
                    print(price[0])
                    print(comment[0])
                    print('-----------')'''
                else:
                    pass
            else:
                pass
            return i
        except Exception as e:
            print(e)
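
The commented-out start_requests above hints that JD rejects Scrapy's default User-Agent, and the pipeline shown next must be registered before any item reaches MySQL. A minimal settings.py sketch for the jingdong project (everything except the pipeline path and the User-Agent string, which are taken from the code above, is an assumption):

# settings.py -- minimal sketch, not from the original post
BOT_NAME = 'jingdong'
SPIDER_MODULES = ['jingdong.spiders']
NEWSPIDER_MODULE = 'jingdong.spiders'

# send a browser-like User-Agent instead of Scrapy's default
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) '
              'Gecko/20100101 Firefox/63.0')

# the crawl follows every link on the site, so robots.txt must not be enforced
ROBOTSTXT_OBEY = False

# enable the MySQL pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,
}

With the settings in place, the crawl is started as usual with scrapy crawl jd.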

pipelines.py

# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class JingdongPipeline(object):
    def process_item(self, item, spider):
        # connection parameters match the local test database used in the original post
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               passwd='123456', db='dd', charset='utf8mb4')
        try:
            with conn.cursor() as cursor:
                for i in range(len(item['title'])):
                    # parameterized query avoids broken SQL when a title contains quotes
                    cursor.execute(
                        "insert into jd(title,shop,shoplink,price,comment) values(%s,%s,%s,%s,%s)",
                        (item['title'][i], item['shop'][i], item['shoplink'][i],
                         item['price'][i], item['comment'][i]))
            conn.commit()
        finally:
            conn.close()
        return item
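
The pipeline assumes a local MySQL database named dd that already contains a table jd; neither is created anywhere in the post. A one-off setup sketch (the column names follow the insert statement above, but the column types are assumptions):

import pymysql

# one-off schema setup; credentials match the pipeline above
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='123456', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS dd DEFAULT CHARACTER SET utf8mb4")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS dd.jd (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            shop VARCHAR(255),
            shoplink VARCHAR(255),
            price VARCHAR(50),
            comment VARCHAR(50)
        ) DEFAULT CHARACTER SET utf8mb4
    """)
conn.commit()
conn.close()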


Reposted from blog.csdn.net/xx20cw/article/details/84453999