注:爬取后的信息将以 JSON 格式存储,文件命名为“recruit.json”,可用 Notepad++ 打开查看。
代码实现:
items.py
# -*- coding: utf-8 -*- import scrapy class TxhrItem(scrapy.Item): #职位名称 positionName = scrapy.Field() #职位类别 positionType = scrapy.Field() #需求人数 needNum = scrapy.Field() #工作地点 workingSpace = scrapy.Field() #发布时间 publishTime = scrapy.Field()
爬虫文件(spiders/txHRSpider.py)
# -*- coding: utf-8 -*-
import scrapy

from txHR.items import TxhrItem


class TxhrspiderSpider(scrapy.Spider):
    """Crawl Tencent HR job listings, paging through the first 1000 records.

    Each result row becomes one :class:`TxhrItem`; pagination follows the
    ``start=`` offset parameter in steps of 10.
    """
    name = 'txHR'
    allowed_domains = ['tencent.com']
    initialURL = 'https://hr.tencent.com/position.php?@start=&start='
    bias = 0
    url = initialURL + str(bias)
    start_urls = [url]

    # Records per result page and the total number of records to fetch.
    PAGE_STEP = 10
    MAX_RECORDS = 1000

    @staticmethod
    def _first_or_null(selected):
        """Return the first extracted string, or "Null" for an empty cell.

        The original code guarded only td[2] (job category) against empty
        cells; any other missing field crashed with IndexError. This applies
        the same fallback uniformly to every column.
        """
        return selected[0] if selected else "Null"

    def parse(self, response):
        """Parse one result page: yield items, then the next page request."""
        # Result rows alternate between <tr class="even"> and <tr class="odd">.
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TxhrItem()
            item['positionName'] = self._first_or_null(
                each.xpath("td[1]/a/text()").extract())
            # The job-category cell is empty for some records.
            item['positionType'] = self._first_or_null(
                each.xpath("td[2]/text()").extract())
            item['needNum'] = self._first_or_null(
                each.xpath("td[3]/text()").extract())
            item['workingSpace'] = self._first_or_null(
                each.xpath("td[4]/text()").extract())
            item['publishTime'] = self._first_or_null(
                each.xpath("td[5]/text()").extract())
            yield item

        # Carry the pagination offset in request.meta instead of mutating
        # shared spider state (the original's `self.bias += 10`): per-response
        # mutation miscounts pages when requests are retried or processed out
        # of order.
        offset = response.meta.get('offset', 0) + self.PAGE_STEP
        if offset < self.MAX_RECORDS:
            yield scrapy.Request(self.initialURL + str(offset),
                                 callback=self.parse,
                                 meta={'offset': offset})
pipelines.py
# -*- coding: utf-8 -*-
import json


class TxhrPipeline(object):
    """Write each scraped item to ``recruit.json``, one JSON object per line."""

    def __init__(self):
        # encoding='utf-8' is required: ensure_ascii=False below emits raw
        # Chinese characters, which the platform default codec (e.g. GBK or
        # cp1252 on Windows) may not be able to encode.
        self.output = open("recruit.json", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line, append it, and pass it on."""
        jsonText = json.dumps(obj=dict(item), ensure_ascii=False) + '\n'
        self.output.write(jsonText)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy invokes this hook as ``close_spider(spider)``; the original
        signature omitted the ``spider`` parameter, so the call raised
        TypeError and the file handle was never closed.
        """
        self.output.close()
settings.py
# Scrapy project settings for the txHR crawler.
BOT_NAME = 'txHR'

SPIDER_MODULES = ['txHR.spiders']
NEWSPIDER_MODULE = 'txHR.spiders'

# Honor the target site's robots.txt rules before fetching pages.
ROBOTSTXT_OBEY = True

# Headers attached to every request; the browser-like user-agent string
# reduces the chance of the crawler being served a blocked/empty response.
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Route every scraped item through the JSON-writing pipeline (priority 300).
ITEM_PIPELINES = {
    'txHR.pipelines.TxhrPipeline': 300,
}