1:建立虚拟环境 mkvirtualenv --python=(python路径) 虚拟环境名
2:进入虚拟环境 workon 虚拟环境名
3:安装scrapy 使用豆瓣源安装 pip install -i https://pypi.douban.com/simple/ scrapy
4:进入工程目录,创建工程 scrapy startproject ArticleSpider(项目名称)
5:进入pycharm,导入工程,选择环境
6:进入spiders,创建爬虫 scrapy genspider jobbole(名称) blog.jobbole.com(域名)
7:创建main.py进行调试
from scrapy.cmdline import execute
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy","crawl","jobbole"])
8:在settings.py中设置ROBOTSTXT_OBEY = False
9:编写parse()函数
功能:
1.获取文章列表中的文章url并交给scrapy下载后并进行解析
2.获取下一页的url并交给scrapy进行下载, 下载完成后交给parse
0:调用
import scrapy import re from scrapy.http import Request from urllib import parse
1:代码
def parse(self, response):
    """Crawl one article-list page.

    1. Collect the article URLs on the list page and hand each one to
       Scrapy for download; the response is parsed by ``parse_detail``.
    2. Find the next list page and hand it to Scrapy; its response comes
       back to ``parse``.

    :param response: the downloaded list page
    :return: generator of ``Request`` objects
    """
    # Each thumbnail anchor carries the article link and its cover image.
    post_nodes = response.css("div#archive div.floated-thumb div.post-thumb a")
    for post_node in post_nodes:
        post_url = post_node.css("::attr(href)").extract_first("")
        if not post_url:
            # urljoin(base, "") returns the base URL, so an anchor without
            # href would send the list page itself to parse_detail.
            continue
        img_url = post_node.css("img::attr(src)").extract_first("")
        yield Request(
            url=parse.urljoin(response.url, post_url),
            # meta carries the cover image URL over to parse_detail.
            meta={"front_image_url": img_url},
            callback=self.parse_detail,
            dont_filter=True,
        )

    # Follow the pagination link, if there is one.
    next_url = response.css("a.next.page-numbers::attr(href)").extract_first()
    if next_url:
        yield Request(
            url=parse.urljoin(response.url, next_url),
            callback=self.parse,
            dont_filter=True,
        )
yield用于交给scrapy进行下载
parse.urljoin将域名和网址合并成最终网址
callback传递回调函数
设置dont_filter=True防止被过滤掉而不去执行callback
meta参数很重要,用来将列表页面爬取的内容(如封面图)传递给parse_detail中的response,在parse_detail中可以用front_image_url = response.meta.get("front_image_url","")接收
10:在jobbole.py中定义方法parse_detail(self,response),使用xpath或css选择器对网页字段进行解析。 start_urls设置为爬虫初始列表页网址。
在cmd中使用 scrapy shell 网址 进行调试
response.xpath("xpath语法/text()").extract_first()
response.css("css语法::text").extract_first()
xpath:
css:
例:
def parse_detail(self, response):
    """Extract the individual fields of one article page.

    :param response: the downloaded article page
    """
    # NOTE(review): the id 'post-113568' looks specific to one article, so
    # this XPath presumably matches only that page — confirm a generic selector.
    title = response.xpath("//*[@id='post-113568']/div[1]/h1/text()").extract_first("")
    praise_nums = response.xpath("//div[@class='post-adds']/span[1]/h10/text()").extract_first("")
    fav_nums = response.xpath("//div[@class='post-adds']/span[2]/text()").extract_first("")
    # Search for the first run of digits. A greedy ".*(\d+).*" would let the
    # leading ".*" eat all but the last digit of the number.
    match_re = re.search(r"\d+", fav_nums)
    fav_nums = int(match_re.group(0)) if match_re else 0
response.css("div#archive div.floated-thumb div.post-thumb a::attr(href)").extract_first()
11:编写items.py
将爬取过来的每一个item实例路由到pipelines,在pipelines中集中处理数据的保存、去重等。Item类似于字典,但比字典功能要多。在Item中只有一个Field类型,可以保存任意数据类型,例如:title = scrapy.Field()
在items.py中新建一个类,并定义好item
class JobBoleArticleItem(scrapy.Item):
    """Container for the fields scraped from one jobbole article."""

    # Basic article data
    title = scrapy.Field()
    create_data = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()

    # Cover image
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field()

    # Counters
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()

    # Body and classification
    tags = scrapy.Field()
    content = scrapy.Field()
在jobbole.py中引用定义好的JobBoleArticleItem
from ArticleSpider.items import JobBoleArticleItem
在函数parse_detail中将爬取的项保存在item中
article_item = JobBoleArticleItem() article_item["title"] = title article_item["url"] = response.url article_item["create_data"] = creat_data article_item["front_image_url"] = front_image_url article_item["fav_nums"] = fav_nums article_item["comment_nums"] = comment_nums article_item["praise_nums"] = praise_nums article_item["tags"] = tags article_item["content"] = content
yield article_item #传递到pipelines中
12:配置settings.py和pipelines.py
在settings.py中将item的pipeline打开
import os

# Item pipelines; the number is the order, lower values run earlier.
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,  # lower than 300, so it would run first
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}
# Item field that holds the image URLs. The pipeline treats it as a list,
# so the item must store a list in this field.
IMAGES_URLS_FIELD = "front_image_url"
# Store downloaded images in an "images" directory next to this settings file.
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")
scrapy.pipelines.images.ImagesPipeline 是将爬取的图片进行下载
IMAGES_URLS_FIELD 是将item中的front_image_url传递过来 才能对图片进行下载
project_dir 为获取当前目录
IMAGES_STORE 设置图片保存路径
ArticleSpider.pipelines.ArticleImagePipeline为在pipelines.py中的自定义类
from scrapy.pipelines.images import ImagesPipeline
class ArticleImagePipeline(ImagesPipeline):
    """Image pipeline that records the stored cover image path on the item."""

    def item_completed(self, results, item, info):
        """Copy the path of the downloaded image into ``front_image_path``.

        :param results: iterable of ``(ok, value)`` pairs, one per image request
        :param item: the item whose images were requested
        :param info: pipeline bookkeeping object (unused here)
        :return: the item, so that later pipelines keep receiving it
        """
        for ok, value in results:
            # Only successful downloads carry a "path"; skipping failures
            # avoids indexing into an error value.
            if ok:
                item["front_image_path"] = value["path"]
        return item
该类继承ImagesPipeline 并重写了item_completed方法,目的是获取图片路径并添加到item
url_object_id = scrapy.Field() #使用md5函数把url变成等长的唯一序列
获取url的id就是将url通过md5方法变成唯一等长的序列
md5方法需要自己编写,新建utils包用于放自定义常用函数,创建common.py
import hashlib


def get_md5(url):
    """Return the 32-character hexadecimal MD5 digest of *url*.

    :param url: text (encoded as UTF-8 before hashing) or bytes
    :return: hex digest string, e.g. ``0efdf49af511fd88681529ef8c2e5fbf``
    """
    # hashlib only accepts bytes, so str input is encoded first.
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()
由于python3中的str是unicode字符串(判断是否为str等同于判断是否为unicode),而hashlib的md5方法只接受bytes,所以需要先将传过来的url用utf-8编码成bytes。
生成id为0efdf49af511fd88681529ef8c2e5fbf的形式
然后在parse_detail方法中加入item项
article_item["url_object_id"] = get_md5(response.url)
这时,所有的item项赋值完毕。
13.将爬取的item保存到数据库或本地
将item保存为json文件,建立保存json的pipeline
自定义方式:
import codecs import json
class JsonWithEncodingPipeline(object):
    """Hand-written pipeline that writes every item as one JSON line to article.json."""

    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line and return it unchanged."""
        # The item is converted to a plain dict first. ensure_ascii=False
        # writes non-ASCII text (e.g. Chinese) as-is instead of \uXXXX escapes.
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy calls the pipeline hook named ``close_spider``; under the
        previous name ``spider_closed`` the file was never closed.
        """
        self.file.close()

    # Backward-compatible alias for code that calls the old method name.
    spider_closed = close_spider
# Using the JsonItemExporter provided by Scrapy:
from scrapy.exporters import JsonItemExporter
class JsonExporterPipleline(object):
    """Export items to articleexport.json through Scrapy's JsonItemExporter."""

    def __init__(self):
        # The exporter writes bytes, hence the binary mode.
        output = open('articleexport.json', 'wb')
        self.file = output
        self.exporter = JsonItemExporter(output, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Pass *item* to the exporter and return it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the file."""
        self.exporter.finish_exporting()
        self.file.close()
同时需要在settings.py中设置
import os

# Item pipelines; the number is the order, lower values run earlier.
ITEM_PIPELINES = {
    # 'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,  # lower than 300, so it would run first
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 3,
    'ArticleSpider.pipelines.JsonExporterPipleline': 2,
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}
# Item field that holds the image URLs. The pipeline treats it as a list,
# so the item must store a list in this field.
IMAGES_URLS_FIELD = "front_image_url"
# Store downloaded images in an "images" directory next to this settings file.
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")
将item保存到mysql数据库
pipeline中设置
import MySQLdb import MySQLdb.cursors from twisted.enterprise import adbapi #twisted提供异步操作容器
class MysqlTwistedPipeline(object):
    """Pipeline that inserts items into MySQL through a twisted adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* values in the Scrapy settings."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert through the pool and pass the item on."""
        # runInteraction hands do_insert to the connection pool, so the
        # insert does not run inline in this method.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        # Return the item so that later pipelines receive it instead of None.
        return item

    def handle_error(self, failure):
        """Report a failure raised by the insert."""
        print(failure)

    def do_insert(self, cursor, item):
        """Run the parameterized INSERT for one item."""
        insert_sql = "insert into jobbole_article(title, create_data, url,fav_nums,url_object_id) VALUES (%s,%s,%s,%s,%s)"
        cursor.execute(
            insert_sql,
            (item["title"], item["create_data"], item["url"], item["fav_nums"], item["url_object_id"]),
        )
# Add the matching MYSQL_* entries to settings.py.