Crawling 51job and Zhilian Zhaopin (智联招聘) with Scrapy at the same time

How to run two or more Scrapy spiders at the same time
Create a run file for each spider, then run each file. The spiders must share the same item (data) model.
run1.py
# -*- coding:utf-8 -*-
from scrapy import cmdline

# Equivalent list form: cmdline.execute(['scrapy', 'crawl', 'job'])
cmdline.execute('scrapy crawl job'.split(' '))
run2.py
# -*- coding:utf-8 -*-
from scrapy import cmdline

# Equivalent list form: cmdline.execute(['scrapy', 'crawl', 'zl'])
cmdline.execute('scrapy crawl zl'.split(' '))
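
The two run files above are independent scripts, one per spider. As an alternative not used in the original post, Scrapy's CrawlerProcess API can launch both spiders from a single script; a minimal sketch (run_all.py is a hypothetical file name), assuming it is saved in the project root next to scrapy.cfg:
run_all.py
# -*- coding:utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings so the pipelines and middlewares stay enabled
process = CrawlerProcess(get_project_settings())
process.crawl('job')  # spider names as defined in job.py and zl.py below
process.crawl('zl')
process.start()  # blocks until both spiders finish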
Spider files
Data cleaning: strip whitespace, blank lines, and special characters from both ends of each field (usually with strip), and drop invalid data such as records with an incomplete format, as well as duplicates.
Prefer locating elements with XPath rather than list indexing, because indexing can raise an index-out-of-range error.
Use exception handling only when the error cannot be pinned down.
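
As an illustration of that cleaning step, a standalone sketch (the field values below are made up, not taken from either site) that strips whitespace, drops incomplete records, and removes duplicates:
# Hypothetical raw rows, for illustration only
rows = [
    {'job_name': ' Python开发 ', 'job_salary': ' 8千-1.5万 '},
    {'job_name': 'Python开发', 'job_salary': '8千-1.5万'},   # duplicate after stripping
    {'job_name': '', 'job_salary': '面议'},                  # incomplete record
]

cleaned, seen = [], set()
for row in rows:
    row = {k: v.strip() for k, v in row.items()}  # strip both ends of every field
    if not all(row.values()):                     # drop records with missing fields
        continue
    key = tuple(sorted(row.items()))
    if key in seen:                               # drop duplicates
        continue
    seen.add(key)
    cleaned.append(row)
print(cleaned)  # only one cleaned, unique, complete record remains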
job.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    # Three start URLs: python, php, and html job listings
    start_urls = [
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,php,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,html,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True
        )

    def parse_next_page(self, response):
        """
        解析下一页
        :param response:
        :return:
        """
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        if next_page:
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True
            )
        """
        递归:如果一个函数内部自己调用自己
             这种形式就叫做递归
        """

    def parse_job_info(self, response):
        """
        解析工作信息
        :param response:
        :return:
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            job_name = job_div.xpath("p/span/a/@title").extract_first('无工作名称').strip().replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first('无公司名称').strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first('无地点名称').strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first('面议').strip()
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first('无时间信息').strip()
            job_type = '51job' if '51job.com' in response.url else '其它'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            """
            数据清洗:负责清除数据两端的空格,空行,特殊符号等
            常用操作一般是strip
            包括清除无效数据,例如数据格式不完整的数据
            以及重复的数据
            """
            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = "没有反馈率"
            yield item
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            dont_filter=True,
        )

zl.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem

class ZlSpider(scrapy.Spider):
    name = 'zl'
    allowed_domains = ['zhaopin.com']
    start_urls = [
    ]

    def parse(self, response):
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True,
        )

    def parse_job_info(self, response):
        """
        解析工作信息
        :param response:
        :return:
        """
        zl_table_list = response.xpath("//div[@id='newlist_list_content_table']/table[@class='newlist']")
        for zl_table in zl_table_list[1:]:
            # The page markup is generated automatically; run the spider to see
            # the result, or right-click and view the page source.
            # zl_td_list = zl_table.xpath("tr[1]/td")
            # Problem: a row does not always contain 5 td cells, so indexing
            # raises an index-out-of-range error:
            # td1 = zl_td_list[0]
            # td2 = zl_td_list[1]
            # td3 = zl_td_list[2]
            # td4 = zl_td_list[3]
            # td5 = zl_td_list[4]

            # Prefer locating elements with XPath rather than indexing, because
            # indexing can raise an index-out-of-range error.
            # Use exception handling only when the error cannot be pinned down.
            # //text() gets all text nodes inside the tag.
            # extract() converts the selectors in the list to text; the result is still a list.
            # extract_first("default") converts and returns the first element, or the default if there is none.
            td1 = zl_table.xpath("tr/td[@class='zwmc']/div/a//text()").extract()
            # new_iterable = map(function, iterable): applies the function to every element
            td1 = map(str.strip, td1)
            job_name = "".join(td1).replace(",", "/")
            fan_kui_lv = zl_table.xpath("tr/td[@class='fk_lv']/span/text()").extract_first("没有反馈率").strip()
            job_company_name = zl_table.xpath("tr/td[@class='gsmc']/a[1]/text()").extract_first("没有公司名称").strip()
            job_salary = zl_table.xpath("tr/td[@class='zwyx']/text()").extract_first("面议").strip()
            job_place = zl_table.xpath("tr/td[@class='gzdd']/text()").extract_first("没有地址").strip()
            print(job_name,fan_kui_lv,job_company_name,job_salary,job_place)
            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = "没有时间"
            item['job_type'] = "智联招聘"
            item['fan_kui_lv'] = fan_kui_lv
            yield item
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True,
        )


    def parse_next_page(self, response):
        """
        解析下一页
        :param response:
        :return:
        """
        next_page = response.xpath("//a[text()='下一页']/@href").extract_first("")
        if next_page:
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True,
            )
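
A quick standalone check of the extract() / extract_first() behaviour described in the comments above, using a made-up HTML snippet rather than the real 智联招聘 markup:
from scrapy.selector import Selector

sel = Selector(text="<td class='zwmc'><div><a>Python <b>开发工程师</b></a></div></td>")
print(sel.xpath("//a//text()").extract())          # ['Python ', '开发工程师'] -- every text node, still a list
print(sel.xpath("//a//text()").extract_first(''))  # 'Python ' -- the first element, or the default '' if there is none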

items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:

import scrapy


class JobspiderItem(scrapy.Item):
    # define the fields for your item here like:
    job_name = scrapy.Field()
    job_company_name = scrapy.Field()
    job_place = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()
    job_type = scrapy.Field()
    fan_kui_lv = scrapy.Field()
middlewares.py
Write a JobUserMiddleware class yourself and add it to this file. Its job is to generate a User-Agent automatically. To use it, register it under DOWNLOADER_MIDDLEWARES in settings.py and, at the same time, disable the built-in user-agent middleware. The class below is modeled on the code in site-packages/scrapy/downloadermiddlewares; if the built-in middleware is not disabled, the system default User-Agent is used instead.
from fake_useragent import UserAgent  # assumed third-party package: pip install fake-useragent


class JobUserMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy',name=''):
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # o = cls(crawler.settings['USER_AGENT'], '张三')
        # The arguments after cls are passed straight to the corresponding
        # parameters of the constructor.

        o = cls()
        # crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        # The line below would read the user_agent attribute from the spider,
        # falling back to self.user_agent if the spider does not define one.
        # self.user_agent = getattr(spider, 'user_agent', self.user_agent)
        pass

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent.random)
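
The UserAgent class used above is assumed to come from the third-party fake_useragent package (pip install fake-useragent); its .random property returns a different browser User-Agent string on each access, which is what the middleware sets on every request. A quick standalone check:
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)  # e.g. a Chrome or Firefox User-Agent string, different on each access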
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# pipeline: receives the item data returned by the spiders.

class JobspiderPipeline(object):
    def process_item(self, item, spider):
        return item


class ToCsvPipeline(object):
    def process_item(self, item, spider):
        with open("job.csv", "a", encoding="gb18030") as f:
            job_name = item['job_name']
            job_company_name = item['job_company_name']
            job_place = item['job_place']
            job_salary = item['job_salary']
            job_time = item['job_time']
            job_type = item['job_type']
            fan_kui_lv = item['fan_kui_lv']
            job_info = [job_name, job_company_name, job_place, job_salary, job_time, job_type, fan_kui_lv, "\n"]
            f.write(",".join(job_info))
        # Pass the item on to the next pipeline for further processing
        return item
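
Because the fields are joined with bare commas, a comma inside any value would shift the columns (which is why the spiders replace "," with "/" in job_name). A hedged alternative sketch of the same pipeline using Python's csv module, which quotes such values automatically:
import csv


class ToCsvPipeline(object):
    def process_item(self, item, spider):
        # newline="" lets the csv module control line endings itself
        with open("job.csv", "a", encoding="gb18030", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([
                item['job_name'], item['job_company_name'], item['job_place'],
                item['job_salary'], item['job_time'], item['job_type'],
                item['fan_kui_lv'],
            ])
        # Pass the item on to the next pipeline
        return item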

settings.py
The settings that need to be changed are the following.
# Whether to obey the robots.txt protocol; the default is True
ROBOTSTXT_OBEY = False
# Download delay, so the server does not detect us by crawling too fast
DOWNLOAD_DELAY = 0.5
# Disable cookie tracking to lower the chance of being detected
COOKIES_ENABLED = False
# Enable the custom downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'JobSpider.middlewares.JobUserMiddleware': 543,
    # Disable the built-in user-agent middleware; None means disabled.
    # The smaller the number, the higher the priority.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Enable the item pipeline
ITEM_PIPELINES = {
    'JobSpider.pipelines.ToCsvPipeline': 300,
}

Reprinted from blog.csdn.net/lollipop_sun/article/details/79489317