This crawl uses the Scrapy framework. The target is the first page of job listings under each category in the site's left-hand category list.
The fields scraped are the position, salary, education requirement, number of days, and location.
The approach:
In the spider we create, define a method that handles the start page and grabs all the category links, use the callback argument to hand each category page's response to a page-parsing method, and then pass the extracted info to the pipeline to be saved.
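Before the project files, here is a minimal, self-contained sketch of that parse → callback → pipeline chain. FlowSketchSpider and example.com are stand-ins for illustration, not part of the project:

import scrapy

# Minimal sketch of the flow described above (illustrative only)
class FlowSketchSpider(scrapy.Spider):
    name = 'flow_sketch'
    start_urls = ['https://example.com/']

    def parse(self, response):
        # step 1: collect links off the start page
        for url in response.xpath('//a/@href').extract():
            # step 2: follow each link; Scrapy feeds the response to the callback
            yield scrapy.Request(response.urljoin(url), callback=self.parse_page)

    def parse_page(self, response):
        # step 3: anything yielded here flows through the item pipelines
        yield {'url': response.url}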
On to the actual code.
First, the items part:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class JobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()                  # position
    salary = scrapy.Field()                 # salary
    education_background = scrapy.Field()   # education requirement
    day = scrapy.Field()                    # number of days
    location = scrapy.Field()               # location
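As a quick aside (the sample values below are made up), a scrapy.Item is filled and read exactly like a dict, which is how the spider further down uses it:

# Quick demo: Items behave like dicts (sample values are made up)
from job.items import JobItem

item = JobItem()
item['title'] = 'Python 实习生'
item['salary'] = '150-200/天'
print(item['title'])   # Python 实习生
print(dict(item))      # {'title': 'Python 实习生', 'salary': '150-200/天'}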
settings (only a few entries need to be changed):
BOT_NAME = 'job'

SPIDER_MODULES = ['job.spiders']
NEWSPIDER_MODULE = 'job.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
    'Referer': 'https://blog.csdn.net/qq_43391383/article/details/87033451',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

LOG_LEVEL = 'WARNING'

ITEM_PIPELINES = {
    'job.pipelines.JobPipeline': 300,
}
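One setting worth adding here (my suggestion, not part of the original project): throttle requests in the settings rather than sleeping inside the spider, since time.sleep blocks Scrapy's async reactor.

# Suggested addition: wait ~1 s between requests to the site.
# This is the idiomatic replacement for time.sleep() inside a spider.
DOWNLOAD_DELAY = 1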
Next is the most important part, the spider itself (named ciwei here):
# -*- coding: utf-8 -*-
import scrapy
from job.items import JobItem


class CiweiSpider(scrapy.Spider):
    name = 'ciwei'
    allowed_domains = ['ciweishixi.com']          # allowed domain
    start_urls = ['https://www.ciweishixi.com/']  # start URL

    # Grab the URL of each category on the start page, request it,
    # and pass the response on to get_Html
    def parse(self, response):
        infos = response.xpath('/html/body/div[2]/div[1]/div/div')
        for info in infos:
            urls = info.xpath('div[2]/a/@href').extract()
            for url in urls:
                yield scrapy.Request(url, callback=self.get_Html)

    # Receive the response and extract the info we want
    def get_Html(self, response):
        infos = response.xpath('/html/body/div[2]/section[2]/div/section/article')
        for info in infos:
            item = JobItem()
            # internship position
            item['title'] = info.xpath('div[2]/div[1]/a/text()').extract_first()
            main = info.xpath('div[2]/div[2]/span/text()').extract()
            # salary
            item['salary'] = main[0]
            # education requirement
            item['education_background'] = main[1]
            # number of days
            item['day'] = main[-1]
            # location
            item['location'] = info.xpath('div[2]/div[1]/div/i[2]/text()').extract_first()
            yield item
        # Pacing is handled by DOWNLOAD_DELAY in settings rather than
        # time.sleep(), which would block Scrapy's reactor.
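The absolute XPaths above are brittle; if the site's markup changes they silently stop matching. scrapy shell is the quickest way to re-test them interactively:

scrapy shell "https://www.ciweishixi.com/"
>>> response.xpath('/html/body/div[2]/div[1]/div/div')
>>> response.xpath('/html/body/div[2]/div[1]/div/div/div[2]/a/@href').extract()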
pipelines:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv


class JobPipeline(object):
    # Save as a plain TXT file
    # def __init__(self):
    #     self.fp = open("F://ciwei.txt", "a+", encoding="utf-8")
    #
    # def process_item(self, item, spider):
    #     self.fp.write(
    #         str((item['title'].strip(), item['salary'], item['education_background'], item['day'], item['location'])) + '\n'
    #     )
    #     return item
    #
    # def close_spider(self, spider):
    #     self.fp.close()

    # Save as CSV (opens cleanly in Excel)
    def __init__(self):
        # utf-8-sig writes the BOM automatically, so the CSV is not garbled
        # when opened directly in Excel on Windows; 'w' starts a fresh file
        # each run, since the header row is rewritten every run anyway
        self.fp = open("F://qiuzhi.csv", 'w', newline='', encoding='utf-8-sig')
        self.write = csv.writer(self.fp)
        self.write.writerow(['职位', '薪水', '学历', '天数', '地区'])

    def process_item(self, item, spider):
        self.write.writerow([item['title'].strip(), item['salary'],
                             item['education_background'], item['day'], item['location']])
        return item

    # close_spider (not close) is the hook Scrapy actually calls on shutdown
    def close_spider(self, spider):
        self.fp.close()
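As an aside, for a flat dump like this you can skip the custom pipeline entirely and use Scrapy's built-in feed exports; the item fields become the CSV columns:

scrapy crawl ciwei -o qiuzhi.csv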
Finally, open cmd, cd into the project directory, and run scrapy crawl ciwei.
You can see that the information we wanted has been saved to the file on the F drive.