一.编写Item
import scrapy


class LagouItem(scrapy.Item):
    """Item holding one job posting scraped from Lagou."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    positionId = scrapy.Field()         # position ID; used as the unique key when writing to the DB
    city = scrapy.Field()               # city
    positionName = scrapy.Field()       # job title
    salary = scrapy.Field()             # salary
    workYear = scrapy.Field()           # required work experience
    education = scrapy.Field()          # education level
    businessZones = scrapy.Field()      # work location (e.g. tech park); a single-element list
    companyShortName = scrapy.Field()   # company short name
    companyFullName = scrapy.Field()    # company full name
    companySize = scrapy.Field()        # company headcount
    industryField = scrapy.Field()      # company business domain
    positionAdvantage = scrapy.Field()  # one-sentence perk description
    createTime = scrapy.Field()         # time the posting was published
二.编写Spiders
此处注意拉勾网的反爬策略:请求头 headers 中需加入不用登录即可获得的 cookie(不加 cookie 会导致只能爬四五页就因请求过于频繁而被反爬),另外需要在 settings 里关闭 Scrapy 自带的 cookie 机制
import json
import random
import time

import scrapy

from Lagou.items import LagouItem


class LagouspiderSpider(scrapy.Spider):
    """Crawl Lagou's positionAjax JSON endpoint (keyword 'python', city 深圳).

    Lagou's anti-crawler blocks cookieless clients after a few pages, so every
    request carries a hand-built headers dict including a cookie obtained
    without logging in; Scrapy's own cookie middleware must be disabled in
    settings (COOKIES_ENABLED = False).
    """
    name = "lagouspider"
    allowed_domains = ["www.lagou.com"]
    url = 'https://www.lagou.com/jobs/positionAjax.json?'  # city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
    page = 1     # current result page (1-based)
    allpage = 0  # total page count, computed from the first response
    # Paste a logged-out session cookie string here.  The original had
    # `cookie = ???`, which is a placeholder and a SyntaxError if left as-is.
    cookie = ''
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'cookie': cookie,
    }

    def start_requests(self):
        """Kick off the crawl with the first POST to the Ajax endpoint."""
        yield scrapy.FormRequest(
            self.url,
            headers=self.headers,
            formdata={'first': 'true', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'},
            callback=self.parse)

    def parse(self, response):
        """Parse one JSON page: yield one item per posting, then request the next page."""
        data = json.loads(response.text)
        totalCount = data['content']['positionResult']['totalCount']  # total number of postings
        resultSize = data['content']['positionResult']['resultSize']  # postings per page
        result = data['content']['positionResult']['result']          # list of posting dicts (~15)
        for each in result:
            # A fresh item per posting: the original reused one LagouItem for
            # the whole page, so fields missing from a later posting silently
            # kept the previous posting's values.
            item = LagouItem()
            for field in item.fields:
                if field in each:
                    item[field] = each.get(field)
            yield item
        # Randomized delay to stay under the anti-crawler rate limit.
        time.sleep(random.randint(5, 10))
        if int(resultSize):
            # Ceiling division; the original `total // size + 1` requested one
            # extra empty page whenever total was an exact multiple of size.
            self.allpage = -(-int(totalCount) // int(resultSize))
            if self.page < self.allpage:
                self.page += 1
                yield scrapy.FormRequest(
                    self.url,
                    headers=self.headers,
                    formdata={'first': 'false', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'},
                    callback=self.parse)
三.编写Pipelines
import json

import pymongo


class JsonPipeline(object):
    """Append each scraped item as one JSON line to job.json."""

    def __init__(self):
        self.file = open('job.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()


class MongoPipeline(object):
    """Upsert items into MongoDB, de-duplicated on the unique positionId field."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # `from scrapy.conf import settings` is deprecated and removed in
        # modern Scrapy; reading crawler.settings here is the supported way.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),  # e.g. localhost:27017
            mongo_db=crawler.settings.get('MONGO_DB'),    # database name
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Upsert keyed on positionId so re-crawls update in place instead of
        # inserting duplicates.  Collection.update() is deprecated in
        # PyMongo 3+; update_one() with $set and upsert=True is the
        # equivalent of the old update(spec, doc, True).
        self.db['拉钩关键词招聘信息表'].update_one(
            {'positionId': item['positionId']},
            {'$set': dict(item)},
            upsert=True,
        )
        return item

    def close_spider(self, spider):
        self.client.close()
四.编写settings(加入)
# Disable Scrapy's own cookie middleware; every request carries its cookie in
# the hand-built headers instead.
COOKIES_ENABLED = False

# Lower number = earlier in the pipeline chain.
ITEM_PIPELINES = {
    'Lagou.pipelines.JsonPipeline': 100,
    'Lagou.pipelines.MongoPipeline': 300,
}

MONGO_URI = 'localhost:27017'  # MongoDB host:port
MONGO_DB = 'lagou'             # database name
注意:1、headers
2、headers里的cookie
3、formdata
4、settings设置scrapy默认的cookie加载机制关闭,而是用每次发请求时自带的包含cookie的headers
5、MongoDB 数据库存储可用 insert 和 update 两种方法,此处 update(upsert 写法)很关键:用于去重比较的字段应当唯一,这样即可实现去重