Scraping Tencent job postings with Python and the Scrapy framework

Target URL: https://careers.tencent.com/search.html?pcid=40001

The job list on this page is loaded through the site's JSON API rather than static HTML, so the spider below requests that API directly: one URL for the paged job list and one for each post's detail.

Create the project: scrapy startproject tencent

Create the spider: scrapy genspider tc careers.tencent.com

tc.py

# -*- coding: utf-8 -*-
import scrapy
import json


class TcSpider(scrapy.Spider):
    name = 'tc'
    allowed_domains = ['careers.tencent.com']
    # job-list API (paged via pageIndex) and job-detail API (keyed by postId)
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1583292137338&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1583296082393&postId={}&language=zh-cn'
    # the initial URL
    start_urls = [one_url.format(1)]

    def parse(self, response):
        # request the first 10 pages of the job-list API
        for page in range(1, 11):
            url = self.one_url.format(page)
            yield scrapy.Request(
                url=url,
                callback=self.get_data,
                # page 1 shares its URL with start_urls, so dont_filter=True
                # keeps it from being dropped by the duplicate filter
                dont_filter=True
            )

    def get_data(self, response):
        # the list API returns JSON; the postings live under Data -> Posts
        data = json.loads(response.text)['Data']['Posts']
        for content in data:
            item = {}
            post_id = content['PostId']
            item['title'] = content['RecruitPostName']    # job post title
            item['City'] = content['LocationName']        # city
            item['art'] = content['CategoryName']         # job category
            # fetch the detail API for this post and pass the partial item along
            yield scrapy.Request(
                url=self.two_url.format(post_id),
                callback=self.spider_data,
                meta={'item': item}
            )

    def spider_data(self, response):
        item = response.meta['item']
        data = json.loads(response.text)['Data']
        item['Duty'] = data['Responsibility']    # responsibilities
        item['ask'] = data['Requirement']        # requirements
        yield item
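
If you want to check what the list API returns before wiring it into Scrapy, a quick standalone preview is possible with requests. This is only a sketch, assuming the endpoint still responds to the exact query string used in one_url and needs no special headers.

import requests

# same URL as one_url above, with pageIndex set to 1
url = ('https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1583292137338'
       '&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId='
       '&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
posts = requests.get(url).json()['Data']['Posts']
for post in posts:
    # same fields the spider's get_data() reads
    print(post['PostId'], post['RecruitPostName'], post['LocationName'], post['CategoryName'])

If this prints ten postings, the field names used in the spider (RecruitPostName, LocationName, CategoryName) are still correct.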

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()      # job post title
    City = scrapy.Field()       # city
    art = scrapy.Field()        # job category
    Duty = scrapy.Field()       # responsibilities
    ask = scrapy.Field()        # requirements
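
Note that tc.py above yields plain dicts, so TencentItem is not actually used. If you prefer the typed item, get_data() can fill it in instead; a minimal sketch (same fields, only the container changes):

from tencent.items import TencentItem

    def get_data(self, response):
        for content in json.loads(response.text)['Data']['Posts']:
            item = TencentItem()                           # typed item instead of a plain dict
            item['title'] = content['RecruitPostName']     # job post title
            item['City'] = content['LocationName']         # city
            item['art'] = content['CategoryName']          # job category
            yield scrapy.Request(
                url=self.two_url.format(content['PostId']),
                callback=self.spider_data,
                meta={'item': item},
            )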

Be sure to enable the item pipeline in settings.py; a minimal snippet is shown below.
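
With the project created above, registering the pipeline in settings.py looks like this. The ROBOTSTXT_OBEY and USER_AGENT lines are common extra tweaks for API scraping, not something shown in the original post.

# settings.py
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,   # lower number = runs earlier in the pipeline chain
}
# Often also needed (assumed, not from the original post):
# ROBOTSTXT_OBEY = False
# USER_AGENT = 'Mozilla/5.0 ...'
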
pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class TencentPipeline(object):
    def __init__(self):
        # connect to the local MySQL database "spider" (user/password: root/root)
        self.connect = pymysql.connect(host='127.0.0.1', user='root',
                                       password='root', database='spider',
                                       charset='utf8mb4')
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        print(item)
        sql = "insert into 腾讯(title, City, art, Duty, ask) values (%s, %s, %s, %s, %s)"
        args = (item['title'], item['City'], item['art'], item['Duty'], item['ask'])
        self.cursor.execute(sql, args)
        return item

    def close_spider(self, spider):
        # commit all pending inserts and close the connection when the spider finishes
        self.connect.commit()
        self.connect.close()
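
The pipeline assumes a MySQL database named spider that already contains a table called 腾讯 with columns matching the item fields. The original post does not show the table definition, so the column types below are assumptions; this sketch creates a compatible table with pymysql:

import pymysql

connect = pymysql.connect(host='127.0.0.1', user='root', password='root',
                          database='spider', charset='utf8mb4')
with connect.cursor() as cursor:
    # Column types are guesses; only the column names come from the pipeline's INSERT statement.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS 腾讯 (
            id    INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),   -- job post title
            City  VARCHAR(64),    -- city
            art   VARCHAR(128),   -- job category
            Duty  TEXT,           -- responsibilities
            ask   TEXT            -- requirements
        ) DEFAULT CHARSET = utf8mb4
    """)
connect.commit()
connect.close()

Once the table exists, run the spider with scrapy crawl tc and the rows will appear in MySQL.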

Screenshot of part of the scraped data:

Reposted from blog.csdn.net/qq_37662827/article/details/104659053