-
创建一个新的爬虫:
scrapy genspider tencent "tencent.com"
-
编写items.py
获取职位名称、详细信息等字段。
class TencentItem(scrapy.Item):
    """Container for one job posting scraped from hr.tencent.com."""
    # define the fields for your item here like:
    jobTitle = scrapy.Field()       # job title (link text of the first cell)
    # jobLink was missing: the spider assigns item["jobLink"], which raises
    # KeyError on a scrapy.Item whose field is not declared.
    jobLink = scrapy.Field()        # URL of the job detail page
    jobCategories = scrapy.Field()  # job category
    number = scrapy.Field()         # number of openings
    location = scrapy.Field()       # work location
    releasetime = scrapy.Field()    # posting date
编写tencent.py
# -*- coding: utf-8 -*-
import re
import scrapy
from Tencent import items
class MytencentSpider(scrapy.Spider):
    """Crawl Tencent HR job listings and yield one TencentItem per table row."""
    name = 'myTencent'
    allowed_domains = ['hr.tencent.com']
    # Generate every listing page up front (10 jobs per page, pages 0..199).
    # The original re-yielded the same 199 URLs from every parse() call,
    # relying entirely on the scheduler's duplicate filter to discard them.
    start_urls = [
        "https://hr.tencent.com/position.php?lid=2218&start=%d#a" % (page * 10)
        for page in range(200)
    ]

    def parse(self, response):
        """Extract job fields from each row (rows alternate 'even'/'odd')."""
        for row in response.xpath("//tr[@class=\"even\"] | //tr[@class=\"odd\"]"):
            item = items.TencentItem()
            # extract_first() yields None for a missing cell instead of
            # raising IndexError like [0].extract() did.
            item["jobTitle"] = row.xpath("./td[1]/a/text()").extract_first()
            item["jobLink"] = row.xpath("./td[1]/a/@href").extract_first()
            # NOTE(review): identical XPath to jobTitle — the category
            # presumably lives in a different column; confirm against the
            # live table layout before relying on this field.
            item["jobCategories"] = row.xpath("./td[1]/a/text()").extract_first()
            item["number"] = row.xpath("./td[2]/text()").extract_first()
            item["location"] = row.xpath("./td[3]/text()").extract_first()
            item["releasetime"] = row.xpath("./td[4]/text()").extract_first()
            yield item
编写pipeline.py文件
class TencentPipeline(object):
    """Append every scraped item to tencent.txt, one str(item) per line."""

    def __init__(self):
        # Opened once per spider run; closed deterministically in close_spider.
        self.file = open("tencent.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Write the item immediately and pass it on unchanged.

        Flushing after each write means a crash loses at most one line.
        """
        self.file.write(str(item) + "\r\n")
        self.file.flush()
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook at shutdown. The original used __del__,
        # whose invocation time is interpreter-dependent and not guaranteed.
        self.file.close()
在 setting.py 里设置ITEM_PIPELINES
# The dotted path must match the actual project module and class: the project
# is "Tencent" (see `from Tencent import items`) and the pipeline class is
# TencentPipeline — "mySpider.pipelines.TencentJsonPipeline" would never load.
ITEM_PIPELINES = {
    "Tencent.pipelines.TencentPipeline": 300,
}
执行爬虫:
scrapy crawl myTencent