Crawling King of Glory skin images with Scrapy

First, the project requirements

Crawl all hero skin images from the King of Glory official website (pvp.qq.com).

Requirements:

Use each hero's name as a folder name

Use each skin's name as the image file name

Save each skin image into its hero's folder
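With these requirements met, the download directory should end up looking roughly like this (hero and skin names are placeholders for whatever the site returns):

imgs/
    <hero name>/
        <skin name 1>.jpg
        <skin name 2>.jpg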

Second, the project analysis

Define the item fields in items.py: hero_name, pf_names, image_urls, and images

Analyze the pages to work out the crawling approach

Write the spider to extract every hero's name, skin names, and skin image URLs

Because many requests are sent, set up a downloader middleware that disguises the crawler as a browser

Write the item pipeline: subclass ImagesPipeline and override it so the download paths and image names follow the requirements

Finish off the settings file

Fix remaining bugs
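Before any of that, the project skeleton comes from Scrapy's standard commands (the project and spider names below match the code in the next section):

scrapy startproject pvpqq

cd pvpqq

scrapy genspider pf pvp.qq.com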

Third, the project code

items.py:

import scrapy

 

class PvpqqItem(scrapy.Item):

    # define the fields for your item here like:

    hero_name = scrapy.Field()  # hero name

    pf_names = scrapy.Field()  # skin names

    image_urls = scrapy.Field()  # URLs of the skin images

    images = scrapy.Field()  # information about the downloaded images (filled in by ImagesPipeline)
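Note that image_urls and images are the field names ImagesPipeline looks for by default (they can be changed with the IMAGES_URLS_FIELD and IMAGES_RESULT_FIELD settings), so keeping these exact names avoids extra configuration.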

pf.py (the spider):

import scrapy

from ..items import PvpqqItem

 

class PfSpider(scrapy.Spider):

    name = 'pf'

    # allowed_domains = ['https://pvp.qq.com']

    start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']

 

    def parse(self, response):

        pf_urls = response.xpath('//ul[@class="herolist clearfix"]/li/a/@href').extract()

        for pf_url in pf_urls:

            yield scrapy.Request(url='https://pvp.qq.com/web201605/%s' % pf_url, callback=self.pf_parse)

 

    def pf_parse(self, response):

        item = PvpqqItem()

        item['hero_name'] = response.xpath('//h2[@class="cover-name"]/text()').extract_first()

        # data-imgname looks like 'SkinA&0|SkinB&1|SkinC&13|...'; the regex strips

        # the '&<number>' suffixes, yielding ['SkinA', 'SkinB', 'SkinC', ...]

        item['pf_names'] = response.xpath('//ul[@class="pic-pf-list pic-pf-list3"]/@data-imgname').re(r'(.*?)&\d+\|?')

        item['image_urls'] = []

        # e.g. //game.gtimg.cn/imgs/yxzj/img201606/heroimg/166/166-mobileskin-1.jpg

        # Drop the trailing '1.jpg' (the last five characters), then re-append each skin's number

        image_url_head = response.xpath('//a[@class="hero-video"]/img/@src').extract_first()[:-5]

        for num in range(1, len(item['pf_names']) + 1):

            image_url = "https:{}{}.jpg".format(image_url_head, num)

            item['image_urls'].append(image_url)

        yield item
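The two XPath expressions are easiest to verify in Scrapy's interactive shell before running the full crawl (assuming the page markup has not changed since this was written):

scrapy shell https://pvp.qq.com/web201605/herolist.shtml

>>> response.xpath('//ul[@class="herolist clearfix"]/li/a/@href').extract()[:3]

If the selector still matches, the first few relative hero-detail URLs come back.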

middlewares.py:

import random

 

class RandomUserAgentMiddleware(object):

 

    def __init__(self, user_agents):

        self.user_agents = user_agents

 

    @classmethod

    def from_crawler(cls, crawler):

        # Load the MY_USER_AGENTS list from settings.py

        s = cls(user_agents=crawler.settings.get('MY_USER_AGENTS'))

        return s

 

    def process_request(self, request, spider):

        # Pick a random User-Agent from the pool

        agent = random.choice(self.user_agents)

        # and assign it to the outgoing request

        request.headers['User-Agent'] = agent

        # proxy = random.choice(self.proxy)

        # request.meta['proxy'] = proxy

        return None
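A quick way to sanity-check the middleware outside a full crawl is to feed it a bare Request by hand; a minimal sketch (assuming the project is importable as pvpqq):

from scrapy.http import Request

from pvpqq.middlewares import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware(user_agents=['UA-one', 'UA-two'])

req = Request('https://pvp.qq.com')

mw.process_request(req, spider=None)

print(req.headers['User-Agent'])  # b'UA-one' or b'UA-two'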

pipelines.py:

import os

from scrapy.pipelines.images import ImagesPipeline

from . import settings

 

 

# Subclass ImagesPipeline

class PvpqqPipeline(ImagesPipeline):

 

    # Called before the download requests are sent; in fact, this method itself is what issues them

    def get_media_requests(self, item, info):

        # Call the parent implementation to build the download requests (a list of Request objects)

        request_objs = super().get_media_requests(item, info)

        # Attach hero_name and pf_name to each request via its meta attribute, then return the requests

        for num, request_obj in enumerate(request_objs):

            request_obj.meta['hero_name'] = item['hero_name']

            request_obj.meta['pf_name'] = item['pf_names'][num]

        return request_objs

 

    # Called when an image is about to be stored; returns the full path where it will be saved

    def file_path(self, request, response=None, info=None):

        # Take hero_name from request.meta as the folder name

        hero_name = request.meta.get('hero_name')

        # Take pf_name from request.meta and append '.jpg' to form the file name

        image_name = request.meta.get('pf_name') + '.jpg'

        # Get the image root directory IMAGES_STORE and join it with the folder name

        image_store = settings.IMAGES_STORE

        hero_name_path = os.path.join(image_store, hero_name)

        # Create the hero's folder if it does not exist yet

        if not os.path.exists(hero_name_path):

            os.makedirs(hero_name_path)

        # Join the folder path with the image name to get the full storage path, and return it

        image_path = os.path.join(hero_name_path, image_name)

        return image_path
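A design note: file_path is normally expected to return a path relative to IMAGES_STORE, and the pipeline's file store creates any missing folders on its own. The absolute path above happens to work because os.path.join discards the store root when its second argument is already absolute, but a leaner sketch of the same method would be:

    def file_path(self, request, response=None, info=None):

        # Path relative to IMAGES_STORE; folders are created automatically

        return os.path.join(request.meta.get('hero_name'), request.meta.get('pf_name') + '.jpg')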

settings.py:

import os

 

BOT_NAME = 'pvpqq'

SPIDER_MODULES = ['pvpqq.spiders']

NEWSPIDER_MODULE = 'pvpqq.spiders'


ROBOTSTXT_OBEY = False

 

DOWNLOADER_MIDDLEWARES = {

   'pvpqq.middlewares.RandomUserAgentMiddleware': 543,

}

# The custom image pipeline defined above

ITEM_PIPELINES = {

   'pvpqq.pipelines.PvpqqPipeline': 300,

}

# Root directory for all downloaded images; ImagesPipeline requires this setting

# A portable, project-relative alternative:

# IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'imgs')

IMAGES_STORE = 'C:\\Users\\lenovo\\Desktop\\爬虫\\spider_demo\\pvpqq\\imgs'


# How many days a downloaded image stays fresh before it is re-downloaded

IMAGES_EXPIRES = 90

# Allow redirects for media downloads; otherwise some images may not be found

MEDIA_ALLOW_REDIRECTS = True

# Pool of User-Agents for the rotation middleware

MY_USER_AGENTS = [

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "

        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",

        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "

        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "

        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "

        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",

        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "

        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",

        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "

        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",

        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "

        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",

        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",

        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "

        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",

        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "

        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"

]

The run helper (e.g. start.py):

from scrapy import cmdline

 

# Run this file instead of starting the crawl from the command line

cmdline.execute("scrapy crawl pf".split(" "))
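For this helper to work, it has to be run from inside the project (where scrapy.cfg can be found), so that Scrapy can locate the project settings; running scrapy crawl pf from the project root does exactly the same thing.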

 

 

 

