scrapy 爬虫流程示例

爬取腾讯hr招聘信息

工程目录：

1. 创建工程： scrapy startproject tencent

2. 进入tencent目录，创建爬虫文件：scrapy genspider tencent_hr tencent.com

tencent/tencent/spiders/tencent_hr.py #爬虫（spider）

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem#导入Item
class TencentHrSpider(scrapy.Spider):
    """Crawl the paginated job list on hr.tencent.com.

    Each listing page is requested with a ``start`` offset; ``parse`` yields
    one ``TencentItem`` per table row and then schedules the next page until
    ``max_offset`` is reached.
    """

    name = 'tencent_hr'                  # name used by ``scrapy crawl``
    allowed_domains = ['tencent.com']    # requests outside this domain are filtered
    url = 'https://hr.tencent.com/position.php?&start='  # base URL; the offset is appended
    page_size = 10                       # the ``start`` parameter advances by 10 per page
    max_offset = 1680                    # last offset to request
    offset = 0                           # offset of the page currently being crawled
    start_urls = [url + str(offset)]     # first page to request

    def parse(self, response):
        """Extract every job row of one listing page, then request the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            TencentItem: one item per ``<tr class="even">`` / ``<tr class="odd">`` row.
            scrapy.Request: a request for the following page, while
                ``offset`` is still below ``max_offset``.
        """
        # ``|`` is the XPath union operator: select both even and odd rows.
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            item = TencentItem()
            # extract_first() returns None for an empty cell, whereas
            # extract()[0] would raise IndexError and abort the whole page
            # (including the request for the next one).
            item['positiontname'] = each.xpath('.//a/text()').extract_first()   # position name
            item['positiontlink'] = each.xpath('.//a/@href').extract_first()    # position URL
            item['positiontype'] = each.xpath('./td[2]/text()').extract_first() # position category
            item['peopleNum'] = each.xpath('./td[3]/text()').extract_first()    # number of openings
            item['workLocation'] = each.xpath('./td[4]/text()').extract_first() # work location
            item['publishtime'] = each.xpath('./td[5]/text()').extract_first()  # publish date
            # Floor division keeps the page number an int on Python 3 as well.
            item['pagenum'] = self.offset // self.page_size
            yield item

        # Only schedule another page while there is one left; previously the
        # final page was re-requested and dropped by the duplicate filter.
        if self.offset < self.max_offset:
            self.offset += self.page_size
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

tencent/tencent/items.py #要爬取的数据字段

# -*- coding: utf-8 -*-

import scrapy
class TencentItem(scrapy.Item):
    """Fields collected for a single job posting."""

    # Position name and the link to its detail page.
    positiontname = scrapy.Field()
    positiontlink = scrapy.Field()
    # Position category.
    positiontype = scrapy.Field()
    # Number of openings.
    peopleNum = scrapy.Field()
    # Work location.
    workLocation = scrapy.Field()
    # Publish date.
    publishtime = scrapy.Field()
    # Index of the listing page the row was scraped from.
    pagenum = scrapy.Field()

tencent/tencent/settings.py 配置文件

# Item pipelines enabled for this project, mapped to their order value
# (pipelines with lower values run first).
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {"tencent.pipelines.TencentPipeline": 300}

tencent/tencent/pipelines.py 要使用pipeline，记得在上面的settings.py里把管道名字写对

# -*- coding: utf-8 -*-

import json
class TencentPipeline(object):
    """Write every scraped item to ``tencent.json``, one JSON object per line."""

    def __init__(self):
        """Open the output file before any item is processed."""
        # Binary mode: process_item writes UTF-8 encoded bytes, which a
        # text-mode handle rejects on Python 3.
        self.file = open('tencent.json', 'wb')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and append it to the output file.

        Args:
            item: the scraped item (anything ``dict()`` accepts).
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so that later pipelines still receive it.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider=None):
        """Close the output file once the spider has finished.

        Scrapy calls this hook with the spider as an argument; the default
        keeps a bare ``close_spider()`` call working too.
        """
        self.file.close()

0.0.1 scrapy 爬虫示例

scrapy 爬虫流程示例

猜你喜欢