1. scrapy startproject Qrcode # 创建项目名称为:Qrcode
2. cd Qrcode # 进入项目目录
3. scrapy genspider qrcode 'www.cnyifeng.net' # 创建爬虫:qrcode 爬取的域名为:www.cnyifeng.net
4. 编写需要采集的数据库字段 在:items.py文件中
5. 在 spiders 文件夹下的qrcode.py文件中编写爬虫文件
6. 在管道文件pipelines.py中保存数据到文件或数据库中
7. 运行项目前在settings.py配置文件中启用管道文件
ITEM_PIPELINES = { 'Qrcode.pipelines.QrcodePipeline': 300, }
8. 运行项目:scrapy crawl qrcode
======================
案例代码:爬取翼蜂网络新闻中心
1. items.py文件
# -*- coding: utf-8 -*-
# Models for the scraped items.
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class QrcodeItem(scrapy.Item):
    """One news entry scraped from the cnyifeng.net news center."""

    title = scrapy.Field()    # headline text of the article
    zhaiyao = scrapy.Field()  # summary / abstract text fragments
    times = scrapy.Field()    # publication date text fragments
2. qrcode.py爬虫文件
# -*- coding: utf-8 -*-
import scrapy
from Qrcode.items import QrcodeItem


class QrcodeSpider(scrapy.Spider):
    """Crawl the news center of www.cnyifeng.net, following pagination."""

    name = 'qrcode'
    allowed_domains = ['www.cnyifeng.net']
    # start_urls = ['http://cnyifeng.net/']
    # Pagination URL template; the page number is substituted via format().
    baseURL = "http://www.cnyifeng.net/news/1/{}.html"
    offset = 1
    start_urls = [baseURL.format(offset)]

    def parse(self, response):
        """Yield one QrcodeItem per news entry, then follow the next page.

        :param response: the downloaded listing page.
        :yields: QrcodeItem instances and follow-up scrapy.Request objects.
        """
        node_list = response.xpath("//div[@class='news_con']/dl[@class='news_dl']")
        for node in node_list:
            item = QrcodeItem()
            # The title link may be missing on some entries; fall back to ''
            # instead of raising IndexError.
            title_texts = node.xpath(".//a[@class='dt_1']//text()").extract()
            item['title'] = title_texts[0] if title_texts else ''
            item['zhaiyao'] = node.xpath("./dd//text()").extract()
            item['times'] = node.xpath(".//span//text()").extract()
            # print(item)
            yield item

        disabled = response.xpath("//div[@class='flickr']//span[@class='disabled']")
        if len(disabled) == 0:
            # No disabled pager button at all: the last <a> is the next page.
            url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
            yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse)
        else:
            # BUG FIX: the original encoded the text to utf-8 bytes and compared
            # them to a str (always False on Python 3), then wrapped the result
            # in str(...), which is always truthy -- so the spider never stopped
            # paginating. Compare the decoded text directly: when the disabled
            # button reads '下一页»' ("next page"), we are on the last page.
            to_next = disabled.xpath(".//text()").extract()[0]
            if to_next != '下一页»':
                url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
                yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse)
3.pipelines.py管道文件持久化数据
# -*- coding: utf-8 -*-
# Item pipeline: persist scraped items to a JSON-lines style file.
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from Qrcode.myEncoding import myEncoder


class QrcodePipeline(object):
    """Append each scraped item as one JSON object per line to yf.json."""

    def __init__(self):
        # BUG FIX: open with an explicit utf-8 encoding. The original relied on
        # the platform default encoding, which corrupts or rejects the Chinese
        # text written below (e.g. gbk on Windows), despite ensure_ascii=False.
        self.f = open("yf.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* to JSON and write it; return it for later pipelines."""
        # ensure_ascii=False keeps Chinese characters readable in the output;
        # myEncoder converts any bytes values the spider may have produced.
        content = json.dumps(dict(item), ensure_ascii=False, cls=myEncoder) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        # Flush and release the output file once the crawl finishes.
        self.f.close()
4. settings.py配置文件中启用管道
ITEM_PIPELINES = { 'Qrcode.pipelines.QrcodePipeline': 300, }
5.自定义类myEncoding.py 3.0以后的版本可忽略
import json


class myEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as utf-8 decoded text.

    ``json.dumps`` cannot handle ``bytes`` on Python 3; pass this class via
    the ``cls=`` argument to transparently decode them.
    """

    def default(self, obj):
        # Only bytes get special treatment; anything else is delegated to the
        # base class, which raises TypeError for unsupported types.
        if isinstance(obj, bytes):
            return obj.decode('utf-8')
        return json.JSONEncoder.default(self, obj)