1. Project requirements
Crawl all hero skin images from the official Honor of Kings (王者荣耀) website.
Requirements:
Use each hero's name as a folder name
Use each skin's name as the image file name
Save every skin image into the folder of its hero
2. Project analysis
Define the item fields in items.py: hero_name, pf_names, image_urls, and images
Analyze the pages to work out the crawling approach (see the parsing sketch after this list)
Write the spider to grab every hero name, skin name, and image URL
Since many requests are sent, add a downloader middleware that disguises the crawler as a browser
Write an item pipeline: subclass ImagesPipeline and override the download path and file name as required
Finish off the settings.py file
Handle remaining bugs
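The trickiest extraction is the skin-name list, which each hero page stores in a single data-imgname attribute. A minimal parsing sketch (the sample string is illustrative; real pages use Chinese skin names):

import re

# Sample data-imgname value: 'skin name&number' entries separated by '|'
data_imgname = 'Paladin Power&0|Death Knight&1|Lionheart&13|Mind Warden&12'

# Capture only the names, dropping the '&number' suffix of each entry
pf_names = re.findall(r'(.*?)&\d+\|?', data_imgname)
print(pf_names)  # ['Paladin Power', 'Death Knight', 'Lionheart', 'Mind Warden']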
3. Project code
items.py:

import scrapy

class PvpqqItem(scrapy.Item):
    # define the fields for your item here:
    hero_name = scrapy.Field()   # hero name, used as the folder name
    pf_names = scrapy.Field()    # skin names, used as the image file names
    image_urls = scrapy.Field()  # image URLs to download (read by ImagesPipeline)
    images = scrapy.Field()      # filled in by ImagesPipeline with download results
spiders/pf.py:

import scrapy
from ..items import PvpqqItem

class PfSpider(scrapy.Spider):
    name = 'pf'
    # allowed_domains = ['pvp.qq.com']
    start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']

    def parse(self, response):
        # Collect the relative detail-page URL of every hero on the list page
        pf_urls = response.xpath('//ul[@class="herolist clearfix"]/li/a/@href').extract()
        for pf_url in pf_urls:
            yield scrapy.Request(url='https://pvp.qq.com/web201605/%s' % pf_url, callback=self.pf_parse)

    def pf_parse(self, response):
        item = PvpqqItem()
        item['hero_name'] = response.xpath('//h2[@class="cover-name"]/text()').extract_first()
        # data-imgname holds all skin names in one string, e.g. 'Skin One&0|Skin Two&1|...';
        # the regex keeps only the names and drops each '&number' suffix
        item['pf_names'] = response.xpath('//ul[@class="pic-pf-list pic-pf-list3"]/@data-imgname').re(r'(.*?)&\d+\|?')
        item['image_urls'] = []
        # The cover image src looks like
        # //game.gtimg.cn/imgs/yxzj/img201606/heroimg/166/166-mobileskin-1.jpg;
        # strip the trailing '1.jpg' (5 characters) and re-append each skin number
        image_url_head = response.xpath('//a[@class="hero-video"]/img/@src').extract_first()[:-5]
        for num in range(1, len(item['pf_names']) + 1):
            image_url = 'https:{}{}.jpg'.format(image_url_head, num)
            item['image_urls'].append(image_url)
        yield item
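For reference, a yielded item ends up roughly like this (hero and skin names are placeholders; the URL pattern is taken from the comment above):

{
    'hero_name': 'SomeHero',
    'pf_names': ['Skin One', 'Skin Two'],
    'image_urls': [
        'https://game.gtimg.cn/imgs/yxzj/img201606/heroimg/166/166-mobileskin-1.jpg',
        'https://game.gtimg.cn/imgs/yxzj/img201606/heroimg/166/166-mobileskin-2.jpg',
    ],
}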
middlewares.py:

import random

class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Load the MY_USER_AGENTS list from settings.py
        return cls(user_agents=crawler.settings.get('MY_USER_AGENTS'))

    def process_request(self, request, spider):
        # Pick a random User-Agent and attach it to the outgoing request
        agent = random.choice(self.user_agents)
        request.headers['User-Agent'] = agent
        # proxy = random.choice(self.proxy)
        # request.meta['proxy'] = proxy
        return None
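The commented-out lines above hint at proxy rotation. The same pattern works for proxies; a minimal sketch, assuming a hypothetical MY_PROXIES list in settings.py:

import random

class RandomProxyMiddleware(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # MY_PROXIES is an assumed setting, e.g. ['http://1.2.3.4:8080', ...]
        return cls(proxies=crawler.settings.get('MY_PROXIES'))

    def process_request(self, request, spider):
        # Scrapy routes the request through the proxy given in request.meta['proxy']
        request.meta['proxy'] = random.choice(self.proxies)
        return None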
pipelines.py:

import os
from scrapy.pipelines.images import ImagesPipeline
from . import settings

# Subclass ImagesPipeline to control where each image is stored
class PvpqqPipeline(ImagesPipeline):
    # Called before downloading starts; it actually issues the download requests
    def get_media_requests(self, item, info):
        # Let the parent class build the download requests from item['image_urls']
        request_objs = super().get_media_requests(item, info)
        # Attach hero_name and pf_name to each request via its meta attribute
        for request_obj, num in zip(request_objs, range(len(item['pf_names']))):
            request_obj.meta['hero_name'] = item['hero_name']
            request_obj.meta['pf_name'] = item['pf_names'][num]
        return request_objs

    # Called when an image is about to be stored; returns its storage path
    def file_path(self, request, response=None, info=None):
        # Use hero_name from the request meta as the folder name
        hero_name = request.meta.get('hero_name')
        # Use pf_name from the request meta as the file name
        image_name = request.meta.get('pf_name') + '.jpg'
        # Join with the IMAGES_STORE base directory from settings.py
        image_store = settings.IMAGES_STORE
        hero_name_path = os.path.join(image_store, hero_name)
        # Create the hero folder if it does not exist yet
        if not os.path.exists(hero_name_path):
            os.makedirs(hero_name_path)
        # Return the full path: <IMAGES_STORE>/<hero_name>/<pf_name>.jpg
        image_path = os.path.join(hero_name_path, image_name)
        return image_path
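Optionally, the pipeline can also check the download results by overriding item_completed; a sketch, assuming we want to drop items whose images failed (the original project does not do this):

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class PvpqqPipelineWithCheck(ImagesPipeline):
    # Called after all image requests for an item have finished
    def item_completed(self, results, item, info):
        # results holds one (success, info_or_failure) tuple per image request
        if not all(ok for ok, _ in results):
            raise DropItem('Some skins of %s failed to download' % item['hero_name'])
        return item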
settings.py:

import os
BOT_NAME = 'pvpqq'
SPIDER_MODULES = ['pvpqq.spiders']
NEWSPIDER_MODULE = 'pvpqq.spiders'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
'pvpqq.middlewares.RandomUserAgentMiddleware': 543,
}
# Custom image-processing pipeline
ITEM_PIPELINES = {
'pvpqq.pipelines.PvpqqPipeline': 300,
}
# Base directory for all downloaded images; must be set
# IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'imgs')
IMAGES_STORE = 'C:\\Users\\lenovo\\Desktop\\爬虫\\spider_demo\\pvpqq\\imgs'
# Expire cached images after 90 days (skip re-downloading newer files)
IMAGES_EXPIRES = 90
# Allow redirects for media downloads, otherwise some images may not be found
MEDIA_ALLOW_REDIRECTS = True
# User-Agent pool used by RandomUserAgentMiddleware
MY_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
Launcher (e.g. run.py at the project root):

from scrapy import cmdline

# Run this file instead of invoking the scrapy command manually
cmdline.execute("scrapy crawl pf".split(" "))
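Alternatively, run scrapy crawl pf from the project root. After a successful crawl, IMAGES_STORE should contain one folder per hero with one image per skin, roughly like this (names are placeholders):

imgs/
    SomeHero/
        Skin One.jpg
        Skin Two.jpg
    AnotherHero/
        Skin One.jpg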