The current directory structure (project tree figure):
In main.py:
from scrapy.cmdline import execute
import sys
import os
# os.path.abspath(__file__) is the path of this file: F:\Program Files\爬虫项目\new\Spider\main.py
# os.path.dirname(os.path.abspath(__file__)) is the directory containing this file: F:\Program Files\爬虫项目\new\Spider
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Calling execute() runs the Scrapy command line, equivalent to running `scrapy crawl cnblogs` from a shell
execute(["scrapy", "crawl", "cnblogs"])
In settings.py:
import os
# Path of the directory containing this file (the project package directory)
PROJECT_DIR = os.path.abspath(os.path.dirname(__file__))
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'Spider.pipelines.SpiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 200,
    'Spider.pipelines.ArticleImagesPipeline': 200,
    'Spider.pipelines.SetDefalutForTagsPipeline': 210,
    # 'Spider.pipelines.JsonSavePipeline': 210,
    'Spider.pipelines.JsonExporterPipleline': 220,
    # 'Spider.pipelines.MysqlPipeline': 230,
    'Spider.pipelines.MysqlTwistedPipline': 230,
}
# Item field that holds the image URLs for ImagesPipeline to download
IMAGES_URLS_FIELD = 'front_image_url'
# Directory where downloaded images are stored
IMAGES_STORE = os.path.join(PROJECT_DIR, 'images')
# Default image URL to use when an article has no cover image (a custom setting read by the spider)
DEFAULT_IMAGE_URL = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1566626671633&di=ba4e0040482738d23a6e045e9a8b7844&imgtype=0&src=http%3A%2F%2Fossimg.xinli001.com%2Fvisioncn%2F600x400%2FVCG41126333227.jpg%3Fx-oss-process%3Dimage%2Fquality%2CQ_80'
# Minimum height and width of images to download
# IMAGES_MIN_HEIGHT = 100
# IMAGES_MIN_WIDTH = 100
# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DBNAME = 'Spider'
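Note that DEFAULT_IMAGE_URL and the MYSQL_* entries are custom settings, not built-in Scrapy options; the spider and pipelines below read them themselves. A spider can also pick up such custom settings through self.settings instead of importing settings.py directly, which keeps per-spider and command-line overrides working. A minimal sketch (ExampleSpider is a hypothetical name, not part of this project):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://news.cnblogs.com"]

    def parse(self, response):
        # self.settings is bound by the crawler, so custom keys from settings.py
        # (and any overrides) are available without importing the module
        default_image_url = self.settings.get("DEFAULT_IMAGE_URL", "")
        self.logger.info("fallback cover image: %s", default_image_url)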
In items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import datetime
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader
class SpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
def add_author(value):
    # Append an author signature to the extracted value
    return value + " by dmxjhg"

def date_convert(value):
    # Pull a YYYY-MM-DD date out of the raw text; fall back to today if it cannot be parsed
    try:
        create_date = re.findall(r'\d{4}-\d{2}-\d{2}', value)[0]
        create_date = datetime.datetime.strptime(create_date, "%Y-%m-%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    return create_date

def return_value(value):
    # Identity output processor: keep the value list as-is (bypasses the default TakeFirst)
    return value

class CnblogsArticleItemLoader(ItemLoader):
    """
    Customized ItemLoader: by default, keep only the first extracted value
    """
    default_output_processor = TakeFirst()
class CnblogsArticleItem(scrapy.Item):
    # Title
    title = scrapy.Field(input_processor=MapCompose(lambda x: x + "——Cnblogs", add_author))
    # Creation date
    create_date = scrapy.Field(input_processor=MapCompose(date_convert))
    # Article URL
    url = scrapy.Field()
    # Fixed-length MD5 digest of the URL
    url_id = scrapy.Field()
    # Cover image URL
    front_image_url = scrapy.Field(output_processor=return_value)
    # Local path of the downloaded cover image
    front_image_path = scrapy.Field()
    # Article content
    content = scrapy.Field()
    # Tags
    tags = scrapy.Field(output_processor=Join(","))
    # Source
    source = scrapy.Field()
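To make the processor chain above concrete, here is a small stand-alone sketch of how MapCompose and TakeFirst behave when applied to a list of extracted values; the title string is made up for illustration.

from scrapy.loader.processors import MapCompose, TakeFirst

def add_author(value):
    return value + " by dmxjhg"

title_in = MapCompose(lambda x: x + "——Cnblogs", add_author)
title_out = TakeFirst()

values = ["Some article title"]   # what an XPath extraction might return
processed = title_in(values)      # each value goes through every function in order
print(processed)                  # ['Some article title——Cnblogs by dmxjhg']
print(title_out(processed))       # 'Some article title——Cnblogs by dmxjhg'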
In pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
class SpiderPipeline(object):
    def process_item(self, item, spider):
        return item
class ArticleImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # Keep the parent class behaviour: store the raw download results on the item
        if isinstance(item, dict) or self.images_result_field in item.fields:
            item[self.images_result_field] = [x for ok, x in results if ok]
        # Collect the local paths of the successfully downloaded cover images
        if "front_image_url" in item:
            image_file_path = []
            for is_success, value_dict in results:
                if is_success:
                    image_file_path.append(value_dict.get("path", ""))
            item["front_image_path"] = image_file_path
        return item
class JsonSavePipeline(object):
    """
    Export items to a JSON file by hand
    """
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")
        self.first_item = True
        self.file.write("[")

    def process_item(self, item, spider):
        # Write a comma before every item except the first, so the output stays a valid JSON array
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(', ')
        # default=str so non-JSON types such as datetime.date are serialized as strings
        lines = json.dumps(dict(item), ensure_ascii=False, default=str)
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        self.file.write("]\n")
        self.file.close()
class JsonExporterPipleline(object):
    """
    Export items to a JSON file using Scrapy's built-in JsonItemExporter
    """
    def __init__(self):
        self.file = open('article-exporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MysqlPipeline(object):
    """
    Write items to MySQL synchronously (every insert blocks the spider)
    """
    def __init__(self):
        self.conn = MySQLdb.connect(
            '127.0.0.1',  # HOST
            'root',       # USER
            '',           # PASSWORD
            'Spider',     # DB_NAME
            charset="utf8",
            use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into cnblogs_article(url_id, title, create_date, url, front_image_url, front_image_path, content, tags, source)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["url_id"],
                                         item["title"],
                                         item["create_date"],
                                         item["url"],
                                         item["front_image_url"][0],
                                         item["front_image_path"][0],
                                         item["content"],
                                         item["tags"],
                                         item["source"]))
        self.conn.commit()
        return item
class MysqlTwistedPipline(object):
    """
    Write items to MySQL asynchronously through Twisted's adbapi connection pool
    """
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to run the MySQL insert asynchronously in the connection pool
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Handle any exception raised by the asynchronous insert
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # Log errors from the asynchronous insert
        print(failure)
        print(item['url'])

    def do_insert(self, cursor, item):
        # Perform the actual insert
        insert_sql = """
            insert into cnblogs_article(url_id, title, create_date, url, front_image_url, front_image_path, content, tags, source)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["url_id"],
                                    item["title"],
                                    item["create_date"],
                                    item["url"],
                                    item["front_image_url"][0],
                                    item["front_image_path"][0],
                                    item["content"],
                                    item["tags"],
                                    item["source"]))
class SetDefalutForTagsPipeline(object):
    def process_item(self, item, spider):
        # Fill in a default value when the article has no tags
        if "tags" not in item:
            item["tags"] = "无"
        return item
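Both MySQL pipelines insert into a cnblogs_article table that has to exist beforehand. The original table definition is not shown, so the snippet below is only a sketch that creates a table inferred from the insert statements and item fields; the column types and lengths are assumptions, not the project's actual schema.

# create_table.py - hypothetical schema inferred from the insert statements above
import MySQLdb

ddl = """
CREATE TABLE IF NOT EXISTS cnblogs_article (
    url_id           VARCHAR(32)  NOT NULL,   -- fixed-length MD5 of the article URL
    title            VARCHAR(255) NOT NULL,
    create_date      DATE,
    url              VARCHAR(500),
    front_image_url  VARCHAR(500),
    front_image_path VARCHAR(255),
    content          LONGTEXT,
    tags             VARCHAR(255),
    source           VARCHAR(100),
    PRIMARY KEY (url_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
"""

# Connection parameters mirror the settings used by the pipelines
conn = MySQLdb.connect('127.0.0.1', 'root', '', 'Spider', charset="utf8")
conn.cursor().execute(ddl)
conn.commit()
conn.close()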
In common.py:
import hashlib
def get_md5(url):
    # Return the 32-character MD5 hex digest of the URL (used as the fixed-length url_id)
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()
if __name__ == "__main__":
    print(get_md5("http://jobbole.com".encode("utf-8")))
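A quick usage note: because get_md5() encodes str input itself, the spider can pass response.url directly; the explicit .encode() in the __main__ test above is optional. A small sketch:

from Spider.utils.common import get_md5

# Both calls yield the same 32-character hex digest, so it is safe to use as a
# fixed-length primary key for the article URL.
assert get_md5("http://jobbole.com") == get_md5("http://jobbole.com".encode("utf-8"))
print(len(get_md5("http://jobbole.com")))  # 32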
In cnblogs.py:
# -*- coding: utf-8 -*-
import re
from urllib import parse
import datetime
import scrapy
from scrapy.http import Request
from scrapy.loader import ItemLoader
from Spider.items import CnblogsArticleItem, CnblogsArticleItemLoader
from Spider.settings import DEFAULT_IMAGE_URL
from Spider.utils.common import get_md5
class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['news.cnblogs.com']  # Allowed domains
    start_urls = ['https://news.cnblogs.com']  # Start URLs
    def parse(self, response):
        """
        1. Extract the article URLs from the list page and hand them to Scrapy for download, then parse the detail fields
        2. Extract the URL of the next page and hand it to Scrapy for download; the response comes back to parse
        """
        # Extract the article URLs from the list page; after download, parse the detail fields
        article_nodes_selector = response.xpath('//div[@id="news_list"]/div[@class="news_block"]/div[@class="content"]')
        for article_node_selector in article_nodes_selector:
            article_url = article_node_selector.xpath('h2/a/@href').extract_first()
            front_image_url = parse.urljoin(response.url, article_node_selector.xpath('div[@class="entry_summary"]/a/img/@src').extract_first(DEFAULT_IMAGE_URL))
            yield Request(url=parse.urljoin(response.url, article_url),
                          meta={"front_image_url": front_image_url},
                          callback=self.parse_detail)
        # Extract the URL of the next page and hand it to Scrapy for download; when done, parse is called again
        next_url = response.xpath('//div[@class="pager"]/a[not(@class)]/@href').extract_first('end')
        if next_url != 'end':
            next_url = parse.urljoin(response.url, next_url)
            yield Request(url=next_url, callback=self.parse)
    def parse_detail(self, response):
        """
        Parse the individual fields of an article page
        """
        # Ignore responses whose URL contains 'account' (e.g. login redirects)
        if 'account' not in response.url:
            # Populate the item through an item loader
            # 1. Instantiate the item loader
            item_loader = CnblogsArticleItemLoader(item=CnblogsArticleItem(), response=response)
            # 2. Register the extraction rule for each field
            item_loader.add_xpath("title", '//*[@id="news_title"]/a/text()')
            item_loader.add_value("url", response.url)
            item_loader.add_value("url_id", get_md5(response.url))
            item_loader.add_xpath("create_date", '//*[@id="news_info"]/span[2]/text()')
            item_loader.add_value("front_image_url", response.meta.get("front_image_url", ""))
            item_loader.add_xpath("source", '//*[@id="link_source2"]/text()')
            item_loader.add_xpath("tags", '//*[@id="news_more_info"]/div/a/text()')
            item_loader.add_xpath("content", '//*[@id="news_body"]')
            # 3. Load the item
            article_item = item_loader.load_item()
            # 4. Yield the CnblogsArticleItem instance to the pipelines
            yield article_item
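When tuning XPath expressions like the ones above, scrapy shell is a convenient way to try them interactively against a live page before wiring them into the spider. A minimal session might look like this (output omitted):

# Run from a terminal: scrapy shell "https://news.cnblogs.com"
# Then, inside the shell, the selectors from parse() can be tried directly:
response.xpath('//div[@id="news_list"]/div[@class="news_block"]/div[@class="content"]/h2/a/@href').extract_first()
response.xpath('//div[@class="pager"]/a[not(@class)]/@href').extract()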
There are currently two problems: first, without logging in we can only access articles from the last three days; second, each crawl only fetches 60 articles. These issues will be addressed later.