Python crawler: scraping Tencent recruitment job listings with Scrapy

Target page for the Tencent recruitment job listings:
https://hr.tencent.com/position.php
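
The ITEM_PIPELINES entry below registers the pipeline through the dotted path "myspider.tencent_position_spider.tencent_position_pipeline.TencentPositionPipeline", which suggests a project layout roughly like the following (the package names are inferred from that path; only the three files themselves are given in the post):

myspider/
└── tencent_position_spider/
    ├── __init__.py
    ├── spider.py                       # the spider
    ├── tencent_position_item.py        # the Item definition
    └── tencent_position_pipeline.py    # the JSON-lines pipeline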

The code for the three files is as follows:
spider.py

# -*- coding: utf-8 -*-

# author : pengshiyu
# date : 2018-04-19

import scrapy
from scrapy.selector import Selector
from tencent_position_item import TencentPositionItem

class TencentPositionSpider(scrapy.Spider):
    name = "tencent_position"
    allowed_domains = ["tencent.com"]
    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.tencent_position_spider.tencent_position_pipeline.TencentPositionPipeline": 100,
        }
    }

    start_urls = [
        "https://hr.tencent.com/position.php"
    ]

    def parse(self, response):

        base_url = "https://hr.tencent.com/"

        rows = response.css(".even, .odd")
        # Alternatively, use an XPath selector: | means "or", and means "and"
        # (see the standalone selector check after this file)
        # rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for row in rows:
            position_name = row.xpath("./td[1]/a/text()").get()
            position_link = row.xpath("./td[1]/a/@href").get()
            position_type = row.xpath("./td[2]/text()").get()
            position_number = row.xpath("./td[3]/text()").get()
            work_location = row.xpath("./td[4]/text()").get()
            publish_time = row.xpath("./td[5]/text()").get()

            # Print the extracted fields
            print("*" * 30)
            print(position_name)
            print(position_link)
            print(position_type)
            print(position_number)
            print(work_location)
            print(publish_time)

            # Store the fields in an item
            item = TencentPositionItem()

            item["position_name"] = position_name
            item["position_link"] = base_url + position_link
            item["position_type"] = position_type
            item["position_number"] = position_number
            item["work_location"] = work_location
            item["publish_time"] = publish_time

            yield item

        # Pagination: follow the "next page" link
        # Method 1: regex matching ("下一页" is the literal "next page" anchor
        # text on the site); some Scrapy versions lack re_first, use re instead
        regex = '<a href="([^<]*)" id="next">下一页</a>'
        ret = Selector(response).re_first(regex, replace_entities=False)

        # Method 2: CSS selector lookup
        next_url = response.css("#next::attr(href)").extract_first()

        if next_url != "javascript:;":
            next_url = base_url + next_url
            print("Next page:", next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
        else:
            print("Reached the last page:", next_url)

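The two row-extraction approaches in parse() are interchangeable, and .get() returns the first match or None rather than raising on a missing node. A minimal standalone sketch to verify this, using a fabricated HTML sample rather than the real page:

# Selector sanity check; the HTML snippet is a made-up sample, not the real page
from scrapy.selector import Selector

html = """
<table>
  <tr class="even"><td><a href="position_detail.php?id=1">Engineer</a></td><td>Tech</td></tr>
  <tr class="odd"><td><a href="position_detail.php?id=2">Designer</a></td><td>Design</td></tr>
</table>
"""

sel = Selector(text=html)
css_rows = sel.css(".even, .odd")
xpath_rows = sel.xpath("//tr[@class='even'] | //tr[@class='odd']")
assert len(css_rows) == len(xpath_rows) == 2

print(css_rows[0].xpath("./td[1]/a/text()").get())  # Engineer
print(css_rows[0].xpath("./td[9]/text()").get())    # None, no exception
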
tencent_position_item.py

# -*- coding:utf-8 -*-

import scrapy

class TencentPositionItem(scrapy.Item):
    position_name = scrapy.Field()    # job title
    position_link = scrapy.Field()    # link to the job detail page
    position_type = scrapy.Field()    # job category
    position_number = scrapy.Field()  # number of openings
    work_location = scrapy.Field()    # work location
    publish_time = scrapy.Field()     # publish date

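TencentPositionItem behaves like a dict with a fixed schema: the declared fields support subscript access, while assigning to an undeclared key raises KeyError. A quick sketch:

# Items allow dict-style access, but only for declared fields
item = TencentPositionItem()
item["position_name"] = "Backend Engineer"
print(item["position_name"])   # Backend Engineer
print(dict(item))              # {'position_name': 'Backend Engineer'}

try:
    item["salary"] = "20k"     # "salary" is not declared above
except KeyError as e:
    print("undeclared field:", e)
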
tencent_position_pipeline.py

# -*- coding: utf-8 -*-

import json
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this file (currently unused)

class TencentPositionPipeline(object):
    def __init__(self):
        self.f = open("tencent_position.txt", "w", encoding="utf-8")
        self.count = 0

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False)+"\n"
        self.f.write(content)
        self.count += 1
        return item


    def close_spider(self, spider):
        print("Items scraped: {count}".format(count=self.count))
        self.f.close()

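With the three files in place, the crawl is started from the project root with the scrapy command; the pipeline above then writes one JSON object per line to tencent_position.txt. As a side note (not part of the original post), Scrapy's built-in feed exporter can produce the same JSON-lines output without a custom pipeline:

scrapy crawl tencent_position

# Feed-export alternative; FEED_EXPORT_ENCODING keeps the Chinese text readable
scrapy crawl tencent_position -o tencent_position.jl -s FEED_EXPORT_ENCODING=utf-8
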

Reposted from https://blog.csdn.net/mouday/article/details/80041111