注:若运行以下代码时报出有关“PIL”的错误,安装 Pillow 即可(pip install pillow)。
实现思路:
(1)使用Fiddler抓包工具,找出为斗鱼APP颜值区提供数据的URL(返回JSON格式的数据)
(2)在items中定义需要记录的相关信息
(3)在spider中实现迭代爬取各主播的信息
(4)在Pipeline中通过继承内置的ImagesPipeline类,重写其功能,实现图片的下载
(5)在settings.py文件中进行相关配置
代码实现:
items.py
# -*- coding: utf-8 -*-
"""Item definitions for the Douyu crawler."""
import scrapy


class DouyuItem(scrapy.Item):
    """Record describing one streamer and the image downloaded for them."""

    # Image link (the spider stores it wrapped in a list).
    vertical_src = scrapy.Field()

    # Streamer nickname.
    nickname = scrapy.Field()

    # Path the image was saved to.
    imagePath = scrapy.Field()
爬虫文件(spiders/douyuMM.py)
# -*- coding: utf-8 -*-
"""Spider that pages through the Douyu vertical-room JSON API."""
import json

import scrapy

from Douyu.items import DouyuItem


class DouyummSpider(scrapy.Spider):
    """Collect nickname and image link for the first streamers of the listing.

    The API is paged with ``limit``/``offset`` query parameters; the spider
    walks the pages sequentially, advancing ``offset`` by ``page_size`` until
    it reaches ``max_offset``.
    """

    name = 'douyuMM'
    allowed_domains = ['capi.douyucdn.cn']

    # Number of records requested per page (the ``limit`` query parameter).
    page_size = 20
    # Pagination stops once ``offset`` reaches this value (first 100 streamers).
    max_offset = 100

    initial_URL = (
        'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit={}&offset='
        .format(page_size)
    )
    offset = 0
    URL = initial_URL + str(offset)
    start_urls = [URL]

    def parse(self, response):
        """Yield one ``DouyuItem`` per record, then request the next page.

        Args:
            response: Response whose body is a JSON object with a ``data``
                entry holding a list of records.

        Yields:
            ``DouyuItem`` objects, followed by at most one ``scrapy.Request``
            for the next page.
        """
        data = json.loads(response.text).get('data')

        # Stop paging when the payload is not a non-empty list of records:
        # iterating anything else would either raise or request pages that
        # cannot contain results.
        if not isinstance(data, list) or not data:
            self.logger.warning(
                "Unexpected or empty 'data' in %s; stopping pagination",
                response.url,
            )
            return

        for each in data:
            item = DouyuItem()
            item['nickname'] = each['nickname']
            # Store the image link as a list so the pipeline can iterate it.
            item['vertical_src'] = [each['vertical_src']]
            yield item

        self.offset += self.page_size
        if self.offset < self.max_offset:
            self.URL = self.initial_URL + str(self.offset)
            yield scrapy.Request(url=self.URL, callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
"""Image pipeline for the Douyu crawler.

Scrapy's ``ImagesPipeline`` already implements image downloading, so this
module only subclasses it and overrides ``get_media_requests`` and
``item_completed``.
"""
import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
# get_project_settings gives access to the variables set in settings.py.
from scrapy.utils.project import get_project_settings


class DouyuImagesPipeline(ImagesPipeline):
    """Download each item's image and rename it after the streamer."""

    # Storage root, read from the project's IMAGES_STORE setting.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per link in ``item['vertical_src']``.

        Args:
            item: Item whose ``vertical_src`` field is an iterable of URLs.
            info: Passed in by the media pipeline; unused here.
        """
        for image_url in item['vertical_src']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Rename the downloaded image to ``<nickname>.jpg`` and record its path.

        ``results`` is a list of ``(success, detail)`` pairs, e.g.::

            [(True, {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
                     'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
                     'url': 'http://www.example.com/files/product1.pdf'}),
             (False, Failure(...))]

        Args:
            results: Download outcomes for the requests of this item.
            item: The item being processed; needs ``nickname``.
            info: Passed in by the media pipeline; unused here.

        Returns:
            The item, with ``imagePath`` set to the renamed file's path.

        Raises:
            DropItem: If no image was downloaded successfully.
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            # Previously this case crashed with IndexError on image_paths[0].
            raise DropItem(
                "No image downloaded for %r" % item['nickname']
            )

        # A path separator in the nickname would point the rename target at a
        # directory that does not exist, so replace it.
        safe_name = item['nickname'].replace('/', '_').replace('\\', '_')

        # Files live under IMAGES_STORE, in the "full" directory.
        source = os.path.join(self.IMAGES_STORE, image_paths[0])
        target = os.path.join(self.IMAGES_STORE, 'full', safe_name + '.jpg')

        # os.replace overwrites an existing target on every platform, so a
        # repeated nickname or a re-run does not fail on Windows.
        os.replace(source, target)

        # Record the real file path, including the extension.
        item['imagePath'] = target
        return item
settings.py
# Scrapy settings for the Douyu project.

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'

ROBOTSTXT_OBEY = True

# Headers sent with every request.
# NOTE(review): presumably copied from the Douyu iOS app traffic captured
# with the packet sniffer -- confirm before changing.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'DYZB/4.100 (iPhone; iOS 11.3.1; Scale/3.00)',
    'Accept': 'application/vnd.mapi-yuba.douyu.com.4.0+json',
    'Accept-Language': 'zh-Hans-CN;q=1',
}

# Directory where downloaded images are stored (read by the image pipeline).
IMAGES_STORE = 'data/斗鱼主播图片/'

ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuImagesPipeline': 300,
}