What to do when you hit Ajax? Crawling Tencent Careers (腾讯招聘) with a Scrapy spider

The listings are loaded via Ajax, so the idea is simple: find the JSON API the page calls behind the scenes and request it directly. Below is the spider's .py file.
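
Before writing the spider, you can sanity-check the XHR endpoint found in the Network panel from a plain Python session. This is just a quick sketch: it assumes requests is installed and that the query parameters copied below match what the page actually sends.

import requests
from urllib.parse import urlencode

# query parameters copied from the XHR request seen in the browser's Network panel
params = {'pageIndex': '1', 'pageSize': '10', 'language': 'zh-cn', 'area': 'cn'}
url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' + urlencode(params)

resp = requests.get(url)
print(resp.status_code)
print(resp.json())     # if this prints a JSON document containing LocationName / RecruitPostName, it is the right endpoint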

# -*- coding: utf-8 -*-
import scrapy, re                        # the Ajax endpoint returns JSON text, so regex plus URL building is enough
from urllib.parse import urlencode, urlparse, parse_qs    # urlencode builds the query string; urlparse/parse_qs read it back for pagination

class HrSpider(scrapy.Spider):           # in the browser's Network panel, filter by XHR to find the URL that actually serves the data, then copy its query parameters into data
    data = {
        'pageIndex': '1',                # paging through the site shows that pageIndex is simply the page number
        'pageSize': '10',
        'language': 'zh-cn',
        'area': 'cn'
    }
    name = 'hr'                          # spider name
    allowed_domains = ['tencent.com']    # restrict the crawl to this domain
    start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' + urlencode(data)    # urlencode builds the complete URL for page 1
    start_urls = [start_url]             # where the crawl starts

    def parse(self, response):           # parse the JSON response
        temp = {}                        # temporary dict holding the lists extracted with regex
        html = response.text             # decoded response body (the JSON as a string)
        #print(html)
        pattern_LocationName = re.compile('"LocationName":"(.*?)"', re.S)        # regex patterns for the fields we want
        pattern_RecruitPostName = re.compile('"RecruitPostName":"(.*?)"', re.S)
        temp["LocationName"] = re.findall(pattern_LocationName, html)            # extract all matches
        temp["RecruitPostName"] = re.findall(pattern_RecruitPostName, html)
        for LocationName, RecruitPostName in zip(temp["LocationName"], temp["RecruitPostName"]):    # the two lists are parallel, so zip() pairs each location with its job title
            item = {}
            item["地点"] = LocationName              # location
            item["工作名称"] = RecruitPostName        # job title
            yield item                   # yield the item so Scrapy passes it to the pipeline

        #next_url — this is where pagination happens: read the current page number back out of the URL we just fetched
        query = parse_qs(urlparse(response.url).query)
        data = {
            'pageIndex': query['pageIndex'][0],
            'pageSize': '10',
            'language': 'zh-cn',
            'area': 'cn'
        }

        if int(data['pageIndex']) < 3:          # crawl the first three pages only
            data['pageIndex'] = str(int(data['pageIndex']) + 1)
            next_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' + urlencode(data)
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
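
Since the endpoint already returns JSON, regex is not the only option: the same parse method could load the body with the json module and read the fields directly. A minimal sketch, assuming the payload is shaped like {"Data": {"Posts": [...]}} (check the exact key names against the real response):

import json

# alternative parse method (goes inside HrSpider); assumed JSON layout: {"Data": {"Posts": [...]}}
def parse(self, response):
    posts = json.loads(response.text).get('Data', {}).get('Posts', [])
    for post in posts:
        yield {
            "地点": post.get("LocationName"),         # location
            "工作名称": post.get("RecruitPostName"),   # job title
        }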

Here is the pipeline:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient          # MongoDB client

client = MongoClient()                   # connect to the local MongoDB server
collection = client["hr"]["info"]        # "hr" database, "info" collection (created on first insert)

class TencentPipeline(object):
    def process_item(self, item, spider):
        collection.insert_one(dict(item))    # insert the item; insert_one() replaces the deprecated insert()
        return item
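
For the pipeline to actually run, it has to be enabled in settings.py, as the template comment above says. The module path below assumes the Scrapy project is named tencent (inferred from the TencentPipeline class name); adjust it to your own project name.

# settings.py
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,   # lower number = runs earlier; any value works when there is only one pipeline
}

After that, scrapy crawl hr runs the spider and the items end up in the hr.info collection in MongoDB.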

Over~

Reposted from blog.csdn.net/dh0805dh/article/details/89962196