目标网址:https://careers.tencent.com/search.html?pcid=40001
创建项目 : scrapy startproject tencent
创建爬虫:scrapy genspider tc careers.tencent.com
tc.py
# -*- coding: utf-8 -*-
import scrapy
import json
class TcSpider(scrapy.Spider):
    """Crawl Tencent job postings: fetch list pages, then each post's detail."""

    name = 'tc'
    allowed_domains = ['careers.tencent.com']
    # List-page API; the placeholder is the page index.
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1583292137338&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    # Detail API; the placeholder is the post id.
    two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1583296082393&postId={}&language=zh-cn'
    # Seed the crawl with the first list page.
    start_urls = [one_url.format(1)]

    def parse(self, response):
        """Fan out one request per list page (pages 1 through 10)."""
        yield from (
            scrapy.Request(url=self.one_url.format(page), callback=self.get_data)
            for page in range(1, 11)
        )

    def get_data(self, response):
        """Parse a JSON list page and request the detail page of each post."""
        posts = json.loads(response.text)['Data']['Posts']
        for post in posts:
            partial = {
                'title': post['RecruitPostName'],  # job title
                'City': post['LocationName'],      # city
                'art': post['CategoryName'],       # job category
            }
            # Carry the partially-filled item to the detail callback via meta.
            yield scrapy.Request(
                url=self.two_url.format(post['PostId']),
                callback=self.spider_data,
                meta={'item': partial},
            )

    def spider_data(self, response):
        """Merge detail-page fields into the carried item and emit it."""
        item = response.meta['item']
        detail = json.loads(response.text)['Data']
        item['Duty'] = detail['Responsibility']  # job responsibilities
        item['ask'] = detail['Requirement']      # job requirements
        yield item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class TencentItem(scrapy.Item):
    """Container for one scraped Tencent job posting."""

    title = scrapy.Field()  # job title
    City = scrapy.Field()   # city
    art = scrapy.Field()    # job category
    Duty = scrapy.Field()   # responsibilities
    ask = scrapy.Field()    # requirements
settings.py 中务必要开启管道（取消注释 ITEM_PIPELINES 配置项）
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class TencentPipeline(object):
    """Persist scraped job items into the `spider` MySQL database."""

    def __init__(self):
        # PyMySQL 1.0+ removed positional connect() arguments, so they
        # must be passed by keyword.  charset='utf8mb4' ensures the
        # Chinese text in the items (and table name) round-trips safely.
        self.connect = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='root',
            database='spider',
            charset='utf8mb4',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item, committing per row so a crash loses nothing.

        The original committed only in close_spider, which discarded
        every row inserted so far if the spider died mid-crawl.
        """
        print(item)
        sql = "insert into 腾讯(title, City, art, Duty, ask) value (%s, %s, %s, %s, %s)"
        args = (item['title'], item['City'], item['art'], item['Duty'], item['ask'])
        try:
            self.cursor.execute(sql, args)
            self.connect.commit()
        except pymysql.MySQLError:
            # Undo the failed statement but let Scrapy see the error.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        # Close the cursor too — the original leaked it.
        self.cursor.close()
        self.connect.close()
爬取的部分内容截图: