Automatically scraping 51job with Scrapy and saving the results to Redis, MongoDB and MySQL

Project Overview

Use Scrapy to crawl Python job postings on 51job, with the keyword "python" and the search scope set to the whole country.

Use a Redis set to store the URLs that have already been crawled, so duplicate crawling is avoided.

Use a script that re-runs the spider at regular intervals, so new postings are picked up automatically after the site updates.

Save the crawled results to both MongoDB and MySQL.

Main Content

Site Analysis

Open 51job, enter the keyword "python", and set the search scope to the whole country. Inspecting the result page shows that it is a static page, i.e. the listings are present directly in the HTML response.

The URL generated by the search is the start URL for the crawl: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html
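
As an aside, the start URL can be parameterized. The helper below is a purely illustrative sketch, not part of the project code; it assumes, based on the URL above, that the keyword and the trailing number (taken here to be the result-page index) are the only parts that change.

# Hypothetical helper: build a 51job search URL for a given keyword and page.
# The positions of the keyword and the page index in the path are assumptions
# inferred from the start URL above.
def build_search_url(keyword, page=1):
    return ("https://search.51job.com/list/"
            "000000,000000,0000,00,9,99,{},2,{}.html").format(keyword, page)

# build_search_url("python", 1) reproduces the start URL above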

Defining the Fields to Scrape

Write items.py and define the fields to be scraped:

import scrapy

class QcItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # data source
    source = scrapy.Field()
    # crawl time
    utc_time = scrapy.Field()

    # job title
    work_position = scrapy.Field()
    # company name
    name_company = scrapy.Field()
    # work location
    work_place = scrapy.Field()
    # salary range
    salary = scrapy.Field()
    # publish date
    publish_time = scrapy.Field()

    # job description
    content = scrapy.Field()
    # contact information
    contact = scrapy.Field()

Writing the Spider

Before writing the spider itself, every request should carry a User-Agent header, so add a request-header middleware to the downloader middlewares:

import random

# `ua` is assumed to be a list of User-Agent strings defined elsewhere in
# middlewares.py. The class keeps its generated name, but it acts as a
# downloader middleware because it implements process_request.
class QcSpiderMiddleware(object):

    def process_request(self, request, spider):
        """
        Assign a random User-Agent header to every request
        :param request:
        :param spider:
        :return:
        """
        user_agent = random.choice(ua)
        request.headers['User-Agent'] = user_agent
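
For the middleware to take effect it has to be enabled in settings.py. A minimal sketch, assuming the Scrapy project package is called Qc (adjust the dotted path to your own project name):

# settings.py (sketch) -- lower numbers run earlier
DOWNLOADER_MIDDLEWARES = {
    'Qc.middlewares.QcSpiderMiddleware': 543,
}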

With the User-Agent middleware in place, go to the spider file and write the parse function to extract the listing data:

import re

import scrapy

from Qc.items import QcItem   # adjust 'Qc' to your own project package name


class QcSpider(scrapy.Spider):
    name = 'qc'
    # allowed_domains = ['51job.com']

    # start URL
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html']

    def parse(self, response):
        # the downloader middleware adds a User-Agent to every request
        # parse the listing page
        node_list = response.xpath('//div[@class="el"]')
        for node in node_list:

            # extract the detail-page link; on 51job the first four nodes are not
            # job postings and therefore have no detail page, so nodes without a
            # link can simply be skipped and not stored
            detail_link = node.xpath('./p/span/a/@href')
            if detail_link:
                item = QcItem()
                item['work_position'] = node.xpath('./p/span/a/@title').extract_first()
                item['name_company'] = node.xpath('./span[@class="t2"]/a/text()').extract_first()
                item['work_place'] = node.xpath('./span[@class="t3"]/text()').extract_first()
                item['salary'] = node.xpath('./span[@class="t4"]/text()').extract_first()
                item['publish_time'] = node.xpath('./span[@class="t5"]/text()').extract_first()

                # follow the detail page to parse the full description
                yield scrapy.Request(detail_link.extract_first(), callback=self.parse_detail, meta={"item": item})

Before parsing the detail pages, set up Redis in another downloader middleware and add every detail-page link to a Redis set.

This prevents duplicate crawling: if a detail-page URL is already in Redis, the request is ignored.

import hashlib

import redis
from scrapy.exceptions import IgnoreRequest


class QcRedisMiddleware(object):
    """
    Record every detail-page URL in a Redis set so pages are not crawled twice
    """
    # connect to Redis
    def __init__(self):
        self.redis = redis.StrictRedis(host='localhost', port=6379, db=1)

    def process_request(self, request, spider):

        # only detail-page requests are recorded in Redis
        if request.url.startswith("https://jobs.51job.com/"):
            # hash the detail-page URL with MD5
            url_md5 = hashlib.md5(request.url.encode()).hexdigest()

            # sadd returns 1 if the value was newly added, 0 if it was already present
            result = self.redis.sadd('qc_url', url_md5)

            # already present: the page has been crawled before, so drop the request
            if not result:
                raise IgnoreRequest
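
Outside the crawl, the deduplication set can be inspected or reset with redis-py. A small sketch using the same key ('qc_url') and database (db=1) as the middleware above:

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=1)
print(r.scard('qc_url'))   # how many detail-page URLs have been recorded so far
# r.delete('qc_url')       # uncomment to clear the set and re-crawl everything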

Back in the spider file, write the code that parses the detail page.

    def parse_detail(self, response):
        item = response.meta['item']
        # the Redis downloader middleware stores each detail-page link, so
        # duplicate pages have already been filtered out before this point

        # extract the full text of the job description
        content = response.xpath('//div[@class="bmsg job_msg inbox"]').xpath('string(.)').extract()

        # content = response.xpath('//div[@class="bmsg job_msg inbox"]/*/text()').extract()
        # extract the contact information
        contact = response.xpath('//div[@class="bmsg inbox"]/p/text()').extract()

        # the description contains spaces and newlines; strip all whitespace with a regex
        item['content'] = re.sub(r'\s', '', ''.join(content))
        item['contact'] = ''.join(contact).strip()

        yield item

At this point all the required data is being parsed; the next step is to save it.

Saving the Data

Write pipelines.py to save the items, storing the data in MongoDB and MySQL respectively (a JSON-file pipeline is included as well).

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymongo
import pymysql
from datetime import datetime

class QcPipeline(object):
    def process_item(self, item, spider):

        # add the data source
        item['source'] = spider.name

        # add the crawl time (UTC)
        item['utc_time'] = str(datetime.utcnow())

        return item

class QcJsonPipeline(object):
    """
    Save items to a JSON-lines file
    """
    def open_spider(self, spider):

        # open the output file in append mode
        self.file = open('qc.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):

        content = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(content)

        return item

    def close_spider(self, spider):
        self.file.close()

class QcMongoPipeline(object):
    """
    Store items in MongoDB
    """
    def open_spider(self, spider):

        # create a MongoDB client and connect
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # select the 'qc' database and 'qc' collection (created automatically on first insert)
        self.collection = self.client['qc']['qc']

    def process_item(self, item, spider):
        # insert the item (insert_one is the current pymongo API; the old insert() is deprecated)
        self.collection.insert_one(dict(item))

        return item

    def close_spider(self, spider):
        # close the MongoDB connection
        self.client.close()


class QcMysqlPipeline(object):
    """
    Store items in MySQL
    """
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            database='qc',
            user='z',
            password='136833',
            charset='utf8'
        )
        # create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):

        sql = ("insert into qc(source, utcTime, workName, "
               "company, workPosition, salary, publishTime, "
               "content, contact)"
               "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")

        list_item = [item['source'], item['utc_time'], item['work_position'],
                  item['name_company'], item['work_place'], item['salary'], item['publish_time'],
                  item['content'], item['contact']]

        self.cursor.execute(sql, list_item)
        # commit the transaction
        self.conn.commit()

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    # create table qc
    # (
    #     id INT unsigned PRIMARY KEY auto_increment NOT NULL,
    #     source VARCHAR(20) DEFAULT "",
    #     utcTime DATETIME DEFAULT "1111-11-11 11:11:11",
    #     workName VARCHAR(40) DEFAULT "",
    #     company VARCHAR(40) DEFAULT "",
    #     workPosition VARCHAR(40) DEFAULT "",
    #     salary VARCHAR(40) DEFAULT "",
    #     publishTime VARCHAR(20) DEFAULT "",
    #     content TEXT(1024),
    #     contact VARCHAR(40) DEFAULT ""
    # );
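
As the template comment at the top of pipelines.py says, the pipelines only run once they are registered in ITEM_PIPELINES. A minimal sketch, again assuming the project package is named Qc; QcPipeline gets the lowest number so that source and utc_time are added before the other pipelines write the item out:

# settings.py (sketch) -- lower numbers run first
ITEM_PIPELINES = {
    'Qc.pipelines.QcPipeline': 300,
    'Qc.pipelines.QcJsonPipeline': 400,
    'Qc.pipelines.QcMongoPipeline': 500,
    'Qc.pipelines.QcMysqlPipeline': 600,
}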

Automatic Crawling

Finally, automate the crawl: write a separate script that re-runs the spider at a fixed interval.

import os
import time

# An alternative would be scrapy.cmdline.execute("scrapy crawl qc".split()),
# but that call exits the process after a single crawl, so this script simply
# shells out to the scrapy CLI in a loop.

while True:
    # re-crawl every 20 seconds so that newly posted jobs are picked up
    os.system("scrapy crawl qc")
    time.sleep(20)

Full Code

See: https://github.com/zInPython/qiancheng

 


Reposted from www.cnblogs.com/pythoner6833/p/9078100.html