Crawling the Tencent recruitment site with Python and Scrapy

After studying Python web crawling I was itching to put it into practice, so after reading posts from more experienced bloggers I ran a few experiments of my own. This post walks through using the Scrapy framework to crawl Tencent's recruitment site and save the data to MongoDB.

With Scrapy installed, open a cmd window in the directory where the project should live and run: scrapy startproject <project name>. The project for this demo is named tecent.

Now for the code itself, starting with the spider class:

import scrapy
import scrapy.http
from bs4 import BeautifulSoup
from tecent import items

class tecentSpider(scrapy.Spider):

    # spider name
    name = 'tecentSpider'
    # allowed crawl domain
    allowed_domains = ["hr.tencent.com"]
    # start URL
    start_url = 'https://hr.tencent.com/position.php'
    # prefix used to build absolute URLs
    get_url = 'https://hr.tencent.com/'
    # base URL for paginated listing pages
    base_url = 'https://hr.tencent.com/position.php?&start='

    def start_requests(self):
        yield scrapy.Request(self.start_url, self.parse)

    def parse(self, response):
        html = BeautifulSoup(response.text, 'lxml')
        # city links
        add_list = html.select('#additems > a')
        # job-type links
        type_list = html.select('#searchrow3 > .pl9 > a')
        for add in add_list:
            # skip the "全部" (all) entry
            if add.get_text() != '全部':
                itemAdd = items.cityItem()
                itemAdd['addressUrl'] = add.attrs.get('href')
                itemAdd['addressName'] = add.get_text()
                yield itemAdd
                yield scrapy.Request(self.get_url+add.attrs.get('href'), self.parse_city_page)

        for type in type_list:
            if type.get_text() != '全部':
                itemType = items.typeItem()
                itemType['typeUrl'] = type.attrs.get('href')
                itemType['typeName'] = type.get_text()
                yield itemType

    def parse_city_page(self,response):
        html = BeautifulSoup(response.text, 'lxml')
        page_list = html.select('.pagenav > a')
        max_page = int(page_list[len(page_list)-2].get_text())+1
        city_url = self.get_url+html.select('#searchrow2 > #additems > .active')[0].attrs.get('href')
        for i in range(1,max_page):
            # URL of each listing page; the start parameter advances by 10 per page
            url = city_url+'&start='+str(int(i-1)*10)
            yield scrapy.Request(url, self.parse_page_data)

    def parse_page_data(self,response):
        html = BeautifulSoup(response.text, 'lxml')
        tr_list = html.select('.tablelist > tr')
        for tr in tr_list:
            item = items.TecentItem()
            # skip the header row (class 'h') and the footer row (class 'f')
            if tr.attrs.get('class')[0] not in ('h', 'f'):
                item['name'] = tr.select('.square > a')[0].get_text()
                item['type'] = tr.select('td')[1].get_text()
                item['personNum'] = tr.select('td')[2].get_text()
                item['address'] = tr.select('td')[3].get_text()
                item['time'] = tr.select('td')[4].get_text()
                item['pageUrl'] = self.get_url+tr.select('.square > a')[0].attrs.get('href')
                yield item
                yield scrapy.Request(item['pageUrl'], self.parse_item_detail)

    def parse_item_detail(self,response):
        html = BeautifulSoup(response.text, 'lxml')
        tr_list = html.select('.tablelist > tr')
        item = items.TecentItemDetail()
        item['title'] = tr_list[0].select('td')[0].get_text()
        # each cell reads like "工作地点：深圳", so split on the full-width colon and keep the value
        item['address'] = tr_list[1].select('td')[0].get_text().split('：')[-1]
        item['type'] = tr_list[1].select('td')[1].get_text().split('：')[-1]
        item['num'] = tr_list[1].select('td')[2].get_text().split('：')[-1]
        item['task'] = tr_list[2].select('.squareli')[0].get_text()
        item['require'] = tr_list[3].select('.squareli')[0].get_text()
        yield item

The spider above contains five methods: start_requests plus four parse callbacks. parse yields the city and job-type items, parse_page_data and parse_item_detail yield the listing and detail items, and parse_city_page only generates the per-page requests. The HTML is parsed with BeautifulSoup, which I find quite pleasant to use, but that is purely a matter of personal preference.
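
As a side note, the same extraction could also be done with Scrapy's built-in CSS selectors instead of BeautifulSoup. A minimal sketch of the city-link loop from parse, using the same CSS paths as above:

    def parse(self, response):
        # the same #additems > a links, extracted with Scrapy's own selectors
        for add in response.css('#additems > a'):
            name = add.css('::text').extract_first()
            href = add.css('::attr(href)').extract_first()
            if name != '全部':
                itemAdd = items.cityItem()
                itemAdd['addressUrl'] = href
                itemAdd['addressName'] = name
                yield itemAdd
                yield scrapy.Request(self.get_url + href, self.parse_city_page)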

Next come the item definitions. Scrapy already generates an items.py when it creates the project, so we only need to fill it in:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

# a job posting entry
class TecentItem(scrapy.Item):
    # position name
    name = scrapy.Field()
    # position category
    type = scrapy.Field()
    # number of openings
    personNum = scrapy.Field()
    # work location
    address = scrapy.Field()
    # publish date
    time = scrapy.Field()
    # link to the posting's detail page
    pageUrl = scrapy.Field()

# a recruiting city
class cityItem(scrapy.Item):
    # link to the city's postings
    addressUrl = scrapy.Field()
    # city name
    addressName = scrapy.Field()

# a job category
class typeItem(scrapy.Item):
    # link to the category's postings
    typeUrl = scrapy.Field()
    # category name
    typeName = scrapy.Field()

# posting details
class TecentItemDetail(scrapy.Item):
    # posting title
    title = scrapy.Field()
    # work location
    address = scrapy.Field()
    # position category
    type = scrapy.Field()
    # number of openings
    num = scrapy.Field()
    # job responsibilities
    task = scrapy.Field()
    # job requirements
    require = scrapy.Field()

The items in items.py are defined by whatever you need to scrape; here I defined four of them, and the comments explain each field.
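
A scrapy.Item behaves like a dict that only accepts its declared fields, which is why the pipeline below can simply call dict(item). A quick illustration with made-up values:

from tecent.items import TecentItem

item = TecentItem(name='后台开发工程师', type='技术类')
item['personNum'] = '2'
print(dict(item))         # {'name': '后台开发工程师', 'type': '技术类', 'personNum': '2'}
# item['salary'] = '20k'  # would raise KeyError because no such Field was declared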

Scrapy then hands every yielded item to pipelines.py for further processing. What happens there depends on your own needs; in this demo the pipeline looks like this:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.utils.project import get_project_settings
# save the scraped items to MongoDB
class TecentPipeline(object):

    def __init__(self):
        self.settings = get_project_settings()
        # connect to MongoDB
        self.client = pymongo.MongoClient(host=self.settings['MONGO_HOST'], port=self.settings['MONGO_PORT'])
        # if the database requires authentication, uncomment and fill in:
        # self.client.admin.authenticate(self.settings['MONGO_USER'], self.settings['MONGO_PSW'])
        self.db = self.client[self.settings['MONGO_DB']]

    def process_item(self, item, spider):
        data = dict(item)
        if item.__class__.__name__ == 'TecentItem':
            self.coll = self.db[self.settings['MONGO_COLL_ITEM']]
        if item.__class__.__name__ == 'TecentItemDetail':
            self.coll = self.db[self.settings['MONGO_COLL_ITEM_DETAIL']]
        if item.__class__.__name__ == 'cityItem':
            self.coll = self.db[self.settings['MONGO_COLL_ADDRESS']]
        if item.__class__.__name__ == 'typeItem':
            self.coll = self.db[self.settings['MONGO_COLL_TYPE']]

        self.coll.insert_one(data)
        return item
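
As a small variant, the settings could also be injected through Scrapy's from_crawler hook instead of calling get_project_settings() in __init__. A minimal sketch with a hypothetical class name TecentMongoPipeline, reusing the same setting keys:

import pymongo

class TecentMongoPipeline(object):

    def __init__(self, host, port, db_name):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.db = self.client[db_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, whose settings are already loaded
        s = crawler.settings
        return cls(s.get('MONGO_HOST'), s.getint('MONGO_PORT'), s.get('MONGO_DB'))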

Note: after filling in an item, every parse method in the spider hands it over with the yield keyword. This is crucial: an item that is never yielded never reaches pipelines.py, so process_item would have nothing to work with.
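
Scrapy treats each callback as a generator and inspects every object it yields: an Item is routed to the item pipelines, while a Request goes back to the scheduler to be downloaded. A tiny sketch of that rule, with made-up values:

    def parse(self, response):
        yield items.cityItem(addressName='深圳')          # an Item is handed to the item pipelines
        yield scrapy.Request(self.start_url, self.parse)  # a Request is scheduled for download
        # anything that is never yielded is simply lost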

Finally, some configuration is still needed, such as the MongoDB connection details and enabling the pipeline. All of this lives in settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for tecent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tecent'

SPIDER_MODULES = ['tecent.spiders']
NEWSPIDER_MODULE = 'tecent.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tecent (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

MONGO_HOST = "127.0.0.1"  # 主机IP
MONGO_PORT = 27017  # 端口号
MONGO_DB = "tecent"  # 库名
MONGO_COLL_TYPE = "tecent_type"
MONGO_COLL_ADDRESS = "tecent_address"
MONGO_COLL_ITEM = "tecent_item"
MONGO_COLL_ITEM_DETAIL = "tecent_item_detail"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tecent.middlewares.TecentSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tecent.middlewares.TecentDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tecent.pipelines.TecentPipeline': 1,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

That is essentially all the code. The last thing to write is a small launcher, begin.py; if it does not exist yet, create it in the project directory:

from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'tecentSpider'])
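
Running begin.py does the same thing as typing scrapy crawl tecentSpider in a terminal inside the project directory; the script just makes it easy to start and debug the spider directly from an IDE.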

Finally, the project structure:
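
The layout produced by scrapy startproject tecent, with the spider and begin.py added as above, typically looks roughly like this (the spider file name depends on what you saved the spider class as):

tecent/
├── scrapy.cfg
├── begin.py
└── tecent/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── tecentSpider.py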

The code is for reference only. I still have a lot to learn, so corrections and suggestions are very welcome!

Reposted from www.cnblogs.com/advanceBlog/p/9117687.html