tutorial/pipelines.py: the project's pipelines file; once it is registered in settings.py, its process_item method is called automatically for every scraped item
tutorial/settings.py: the project's settings file
tutorial/spiders/: the directory that holds the spiders; a spider file placed here takes effect automatically
Goal: scrape the titles and bodies of news articles from cnblogs
1. Create a new project
Run:
scrapy startproject cnblog
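This generates the project skeleton; depending on the Scrapy version the layout looks roughly like this:

cnblog/
    scrapy.cfg
    cnblog/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py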
2. Edit items.py, adding the title, url, and content fields
import scrapy

class CnblogItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
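A CnblogItem behaves like a dict, which is what the pipeline later relies on; a quick illustration, run from the project root (the field values here are placeholders):

from cnblog.items import CnblogItem

item = CnblogItem(url='https://news.cnblogs.com/n/1/', title='demo')
item['content'] = '<div id="news_body">...</div>'
print(dict(item))  # {'url': '...', 'title': 'demo', 'content': '...'}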
3. Create a new spider to scrape the content
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from cnblog.items import CnblogItem

class CnSpider(Spider):
    # name of the spider
    name = "cnblog"
    # allowed domains, to keep the crawl from wandering off-site
    allowed_domains = ["cnblogs.com", "news.cnblogs.com"]
    # build the start URLs, one per listing page (raise the upper bound to crawl more pages)
    start_urls = []
    for pn in range(1, 2):
        url = 'https://news.cnblogs.com/n/page/%s/' % pn
        start_urls.append(url)

    # collect the links to the article detail pages from a listing page
    def parse(self, response):
        sel = Selector(response)
        news_list = sel.xpath('//h2[@class="news_entry"]')
        for new_i in news_list:
            new_link = new_i.xpath('a/@href').extract()
            link_0 = str("https://news.cnblogs.com" + new_link[0])
            yield Request(link_0, callback=self.parse_item)

    # scrape the content of a news detail page
    def parse_item(self, response):
        item = CnblogItem()
        item['url'] = response.request.url
        item['title'] = response.xpath('//div[@id="news_title"]/a/text()').extract()[0]
        item['content'] = response.xpath('//div[@id="news_body"]').extract()[0]
        yield item
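With the spider saved under cnblog/spiders/, it can already be run from the project root; the scraped items are only printed to the log until the pipeline below is enabled:

scrapy crawl cnblog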
4. Define the pipeline to save the scraped items to items.jl
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

from scrapy.exceptions import DropItem

from FileUtil import FileUtil   # FileUtil.py from step 6

class CnblogPipeline(object):
    def __init__(self):
        self.file = open('items.jl', 'w')
        self.url_seen = set()

    def process_item(self, item, spider):
        # drop items whose url has been seen before
        if item['url'] in self.url_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.url_seen.add(item['url'])
            line = json.dumps(dict(item)) + "\n"
            self.file.write(line)
            FileUtil.saveNews(item['url'], item['title'], item['content'])
            return item
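A quick way to sanity-check the duplicate filter is to push the same item through the pipeline twice; a minimal sketch, run from the project root (note that it writes items.jl and news.db as a side effect):

from scrapy.exceptions import DropItem
from cnblog.items import CnblogItem
from cnblog.pipelines import CnblogPipeline

pipeline = CnblogPipeline()
item = CnblogItem(url='https://news.cnblogs.com/n/1/', title='t', content='c')
pipeline.process_item(item, spider=None)       # first pass: written out
try:
    pipeline.process_item(item, spider=None)   # second pass: raises DropItem
except DropItem as e:
    print(e)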
5. Activate the pipeline
Edit settings.py:
ITEM_PIPELINES = {
    'cnblog.pipelines.CnblogPipeline': 300,
}
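The number assigned to the pipeline (300 here) is its order, an integer from 0 to 1000; when several pipelines are enabled, items pass through them from the lowest number to the highest.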
6. The code for FileUtil.py
from urllib.request import quote
import urllib.request
import sqlite3

class FileUtil:
    # Download the image at imgUrl; if no save path is given,
    # save it to D:\download\<image name>.
    @staticmethod
    def downImg(imgUrl, savePath=None):
        imgName = imgUrl.split('/')[-1]
        preUrl = imgUrl.replace(imgName, "")
        if savePath is None:
            savePath = "D:\\download\\" + imgName
        conn = urllib.request.urlopen(preUrl + quote(imgName))
        f = open(savePath, 'wb')
        f.write(conn.read())
        f.close()
        print('Saved:' + savePath)

    @staticmethod
    def saveNews(url, title=None, content=None):
        if title is None:
            title = ""
        if content is None:
            content = ""
        conn = sqlite3.connect('news.db')
        cursor = conn.cursor()
        # create the news table on first use
        cursor.execute('create table IF NOT EXISTS news '
                       '(id INTEGER PRIMARY KEY, url varchar(100), title varchar(100), content text)')
        # parameterized queries keep quotes in the data from breaking the SQL
        cursor.execute('select * from news where url=?', (url,))
        values = cursor.fetchall()
        if len(values) > 0:
            # the link has already been saved
            print('Link already exists: ' + url)
        else:
            cursor.execute('insert into news (url, title, content) values (?, ?, ?)',
                           (url, title, content))
            print("save success." + url)
        # close the cursor, commit the transaction, close the connection
        cursor.close()
        conn.commit()
        conn.close()
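To confirm that articles actually landed in the database, the rows can be read back with the sqlite3 module; a small sketch, run from the directory where the crawl was started, since that is where news.db is created:

import sqlite3

conn = sqlite3.connect('news.db')
cursor = conn.cursor()
cursor.execute('select id, url, title from news')
for row in cursor.fetchall():
    print(row)
conn.close()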