Prepare the data fields to scrape and the XPath expressions for extracting the links.
Create the project:
scrapy startproject pythonjob
Create the spider:
scrapy genspider job 51job.com
Code in job.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from pythonjob.items import PythonjobItem


class JobSpider(RedisSpider):
    name = 'job'
    # Start URLs are read from this Redis key
    redis_key = 'jobspider:start_urls'
    allowed_domains = ['51job.com']
    page = 1
    url = "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&keyword=python&curr_page="
    # start_urls = [url + str(page)]

    def parse(self, response):
        # Links to the individual job detail pages on the search-result page
        href_list = response.xpath('//div[@class="el"]/p/span/a/@href').extract()
        for href in href_list:
            yield scrapy.Request(href, callback=self.detail_page_parse)

    def detail_page_parse(self, response):
        item = PythonjobItem()
        # Job posting URL
        url = response.url
        # Posting title
        title = response.xpath('//div[@class="cn"]/h1/text()').extract()[0]
        # Salary
        salary = response.xpath('//div[@class="cn"]/strong/text()').extract()
        salary = ''.join(salary)
        # Job description
        content = response.xpath('//div[@class="bmsg job_msg inbox"]/p/text()'
                                 '|//div[@class="bmsg job_msg inbox"]//li/text()'
                                 '|//div[@class="bmsg job_msg inbox"]//text()').extract()
        content = ''.join(content).replace('\t', '').replace('\r\n', '').replace(' ', '').replace('\xa0', '')
        # Position name
        pos = response.xpath('//span[@class="lname"]/text()').extract()[0]
        # Number of openings (printed only, not stored in the item)
        count = response.xpath('//div[@class="t1"]/span[3]/text()').extract()[0]
        print('count=====', count)
        if len(content) < 1:
            content = title
        item['url'] = url
        item['title'] = title
        item['salary'] = salary
        item['content'] = content
        item['pos'] = pos
        return item
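Before launching the full crawl, the XPath expressions above can be checked interactively in a scrapy shell session; a quick sketch, using the spider's url attribute with curr_page=1 appended:

# Run: scrapy shell "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&keyword=python&curr_page=1"
# Inside the shell, verify that the detail-page links are found:
response.xpath('//div[@class="el"]/p/span/a/@href').extract()
# After fetch()-ing one of those links, the detail-page fields can be checked the same way:
# response.xpath('//div[@class="cn"]/h1/text()').extract()
# response.xpath('//div[@class="cn"]/strong/text()').extract()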
Add a test_redis.py file for running the distributed spider:
from scrapy import cmdline

cmdline.execute('scrapy runspider job.py'.split())
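A RedisSpider idles after starting until a start URL appears under its redis_key, so once test_redis.py is running on each node, push the first search-result page into the jobspider:start_urls key. A minimal sketch using redis-py, with the connection parameters from settings.py and the URL built from the spider's url and page attributes:

import redis

# Connection parameters match REDIS_HOST / REDIS_PORT in settings.py
r = redis.StrictRedis(host='127.0.0.1', port=6379)

# Same key as JobSpider.redis_key; the URL is spider.url + str(spider.page)
r.lpush('jobspider:start_urls',
        'http://search.51job.com/jobsearch/search_result.php'
        '?fromJs=1&jobarea=010000%2C00&keyword=python&curr_page=1')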
pipelines.py (the pipeline file, used for processing and storing the data):
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime  # newly added

import pymongo
import pymysql

from pythonjob.settings import MONGO_HOST, MONGO_PORT, MONGO_DBNAME, SHEET_NAME


class ExamplePipeline(object):
    # Add two extra fields to the item before it is stored in Redis
    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()
        item["spider"] = 'PC'
        return item


class PythonjobPipeline(object):
    # Store items in a local file
    def __init__(self):
        self.file = open('job.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # json_str = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # self.file.write(json_str)
        return item

    def close_spider(self, spider):
        self.file.close()


class job_mongo(object):
    # Store items in MongoDB
    def __init__(self):
        mongo_host = MONGO_HOST
        mongo_port = MONGO_PORT
        sheet_name = SHEET_NAME
        db_name = MONGO_DBNAME
        print("MongoDB connection info ===", mongo_host, mongo_port, db_name, sheet_name)
        client = pymongo.MongoClient(host=mongo_host, port=mongo_port)
        db = client[db_name]
        self.sheet = db[sheet_name]

    def process_item(self, item, spider):
        python_dict = dict(item)
        self.sheet.insert_one(python_dict)
        return item


class job_mysql(object):
    # Store items in MySQL
    def __init__(self):
        mysql_port = 3306
        mysql_host = '127.0.0.1'
        dbname = 'python_jobdb'
        user = 'afu'
        password = '123456'
        sheetname = 'job_items'
        print("MySQL connection info ===", mysql_host, mysql_port, dbname, sheetname)
        self.con = pymysql.connect(host=mysql_host, user=user, password=password,
                                   database=dbname, port=mysql_port, charset='utf8')
        # Cursor used for the INSERT statements (the job_items table must already exist)
        self.cursor = self.con.cursor()

    def process_item(self, item, spider):
        args = [item["url"], item["title"], item["salary"], item["content"],
                item["pos"], item["crawled"], item["spider"]]
        sql = "INSERT INTO job_items(url,title,salary,content,pos,crawled,spider) VALUES (%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, args)
        # Commit the transaction
        self.con.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.con.close()
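The job_mysql pipeline only inserts rows; it assumes the python_jobdb database and the job_items table already exist. A one-off creation script might look like the following sketch (the column types are assumptions chosen to fit the scraped fields):

import pymysql

# Connection parameters match those hard-coded in the job_mysql pipeline
con = pymysql.connect(host='127.0.0.1', user='afu', password='123456',
                      database='python_jobdb', port=3306, charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS job_items (
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(500),
        title VARCHAR(255),
        salary VARCHAR(100),
        content TEXT,
        pos VARCHAR(255),
        crawled DATETIME,
        spider VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()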
In items.py:
import scrapy


class PythonjobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Job posting URL
    url = scrapy.Field()
    # Posting title
    title = scrapy.Field()
    # Salary
    salary = scrapy.Field()
    # Job description
    content = scrapy.Field()
    # Position name
    pos = scrapy.Field()
    # Two extra fields added before storing in Redis (crawl time and crawler node name)
    crawled = scrapy.Field()
    spider = scrapy.Field()

In settings.py:
BOT_NAME = 'pythonjob'

SPIDER_MODULES = ['pythonjob.spiders']
NEWSPIDER_MODULE = 'pythonjob.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'

# scrapy-redis related settings
# Use the scrapy-redis duplicate filter instead of Scrapy's default one
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler instead of Scrapy's default one
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Persist scheduler state: do not clear the Redis queues, so the crawl can be paused and resumed
SCHEDULER_PERSIST = True
# Use a sorted (priority) queue; optional, but it makes the queue visible in Redis as a sorted set
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"

COOKIES_ENABLED = False

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Logging level
LOG_LEVEL = 'DEBUG'

# Add a download delay to reduce the load on the target server
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    'pythonjob.pipelines.ExamplePipeline': 290,
    'pythonjob.pipelines.PythonjobPipeline': 291,
    # Storing to MySQL, MongoDB and Redis is independent; the order of these pipelines does not matter
    'pythonjob.pipelines.job_mysql': 292,
    'pythonjob.pipelines.job_mongo': 293,
    # The following pipeline must be enabled -- it stores the items in the Redis database
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# REDIS_HOST and REDIS_PORT are setting names read by scrapy-redis;
# the Mongo settings below are custom names imported in pipelines.py
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DBNAME = 'python_job'
SHEET_NAME = 'job'
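In a distributed setup, only the master machine needs to run Redis; each worker node deploys the same project and points REDIS_HOST at the master instead of 127.0.0.1. A sketch of the worker-side settings (the IP address below is a placeholder for your master's address):

# settings.py on a worker node -- point scrapy-redis at the master's Redis server
REDIS_HOST = '192.168.1.100'   # placeholder: the master machine's IP
REDIS_PORT = 6379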
Data stored in Redis:
Data stored in MongoDB:
Data stored in MySQL:
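To confirm that all three stores received the items, the counts can be checked with a short script. A minimal sketch, assuming the scrapy-redis default items key '%(spider)s:items' (i.e. job:items here) and the database names from the settings above:

import redis
import pymongo
import pymysql

# Items pushed by scrapy_redis.pipelines.RedisPipeline
r = redis.StrictRedis(host='127.0.0.1', port=6379)
print('redis items:', r.llen('job:items'))

# Documents written by the job_mongo pipeline
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
print('mongo docs:', client['python_job']['job'].count_documents({}))

# Rows written by the job_mysql pipeline
con = pymysql.connect(host='127.0.0.1', user='afu', password='123456',
                      database='python_jobdb', port=3306, charset='utf8')
cur = con.cursor()
cur.execute("SELECT COUNT(*) FROM job_items")
print('mysql rows:', cur.fetchone()[0])
cur.close()
con.close()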