在Scrapy中文网找到一个小例子,虽然不是自己写的,但是把它温习一遍也是学习。
Item部分:主要爬取的内容为图片的名字和链接,链接用来下载图片。
# -*- coding: utf-8 -*-
import scrapy
class AoisolasItem(scrapy.Item):
    """Item holding one gallery's title and the list of its image URLs."""
    # Gallery title; the pipeline uses it to build a per-gallery folder name.
    name = scrapy.Field()
    # List of image URLs to be downloaded by the images pipeline.
    ImgUrl = scrapy.Field()
    # (the dead trailing `pass` from the original has been removed)
Middleware:该部分主要用来破解网站的防盗链技术:通过设置请求头中的 Referer,伪装成从该网站内部页面访问图片,而不是其他网站盗用链接。
# -*- coding: utf-8 -*-
from scrapy import signals
class AoisolasSpiderMiddleware(object):
    """Anti-hotlinking middleware.

    NOTE(review): despite the "SpiderMiddleware" name, this is registered
    under DOWNLOADER_MIDDLEWARES in settings and implements the downloader
    middleware hook `process_request`. The name is kept so the settings
    reference ('AoiSolas.middlewares.AoisolasSpiderMiddleware') still works.
    """

    def process_request(self, request, spider):
        """Set the request's Referer to its own URL so the image host
        believes the image is being viewed from one of its own pages.

        `request.url` is always a non-empty string, so the original
        `if referer:` guard was redundant and has been dropped.
        """
        request.headers['referer'] = request.url
pipelines:该部分主要负责存储图片。
get_media_requests:根据 url 发起下载请求,获取图片;
file_path:通过正则表达式清理 name,结合清理后的 name 和 url 生成不同的文件夹路径,达到为图片分类的目的。
# -*- coding: utf-8 -*-
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
import re
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that stores each picture under a per-gallery folder."""

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, carrying the gallery
        name in meta so that file_path() can build the folder name."""
        for image_url in item['ImgUrl']:
            yield Request(image_url, meta={'item': item['name']})

    def file_path(self, request, response=None, info=None):
        """Return 'full2/<cleaned gallery name>/<original file name>'.

        The gallery name is cleaned of characters that are illegal in
        Windows paths, plus parentheses and digits, so that all pages of
        one gallery map to the same folder.
        """
        name = request.meta['item']
        # Bug fix: the original character class stripped the full-width
        # left quote “ but not the right quote ” — both are stripped now.
        name = re.sub(r'[?\\*|“”<>:/()0123456789]', '', name)
        # Keep the server-side file name as the stored file name.
        image_guid = request.url.split('/')[-1]
        return u'full2/{0}/{1}'.format(name, image_guid)

    def item_completed(self, results, item, info):
        """Attach the stored paths to the item; drop items for which no
        image was successfully downloaded."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_paths
        return item
settings.py:基本设置
BOT_NAME = 'AoiSolas'

SPIDER_MODULES = ['AoiSolas.spiders']
NEWSPIDER_MODULE = 'AoiSolas.spiders'

# Where downloaded images are stored.
# Raw string: the original 'E:\meizi2' only worked because '\m' is an
# unrecognized escape; modern Python raises a SyntaxWarning for it.
IMAGES_STORE = r'E:\meizi2'

# Enable the custom image-storage pipeline.
ITEM_PIPELINES = {
    'AoiSolas.pipelines.MyImagesPipeline': 300,
}

# Anti-hotlink Referer middleware (priority 1 — runs before the others).
DOWNLOADER_MIDDLEWARES = {
    'AoiSolas.middlewares.AoisolasSpiderMiddleware': 1,
}

# NOTE(review): with ROBOTSTXT_OBEY = True the target site's robots.txt may
# block the crawl entirely — confirm this setting is intended.
ROBOTSTXT_OBEY = True
爬虫文件:该网站的首页下面有很多页链接,每页有很多美女链接,每个美女链接点开又是很多页,每页有很多美女照片,所以在下面进行了两次页面url循环,我只添加了一个链接,因为内容太多了。
# -*- coding: utf-8 -*-
import scrapy
from ..items import AoisolasItem
class AoisolaspiderSpider(scrapy.Spider):
    """Spider for www.mm131.com: walks the listing pages of one section,
    opens every gallery, and paginates inside each gallery collecting the
    gallery title and the image URLs."""
    name = "AoiSola"
    allowed_domains = ["www.mm131.com"]
    start_urls = ['http://www.mm131.com/qingchun/']
    # Other sections of the site that can be crawled the same way:
    #   http://www.mm131.com/xinggan/
    #   http://www.mm131.com/xiaohua/
    #   http://www.mm131.com/chemo/
    #   http://www.mm131.com/qipao/
    #   http://www.mm131.com/mingxing/

    def parse(self, response):
        """Parse one listing page: request every gallery link it contains,
        then follow the listing's own "next page" link."""
        # Renamed from `list`, which shadowed the builtin.
        entries = response.css(".list-left dd:not(.page)")
        for entry in entries:
            gallery_url = entry.css("a::attr(href)").extract_first()
            # extract_first() may return None; without this guard the
            # original would have requested the literal URL 'None'.
            if gallery_url:
                yield scrapy.Request(str(gallery_url), callback=self.content)
        # The second-to-last .page-en anchor is the listing's "next" link.
        next_url = response.css(".page-en:nth-last-child(2)::attr(href)").extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.parse)

    def content(self, response):
        """Parse one gallery page: emit an item with the gallery title and
        the image URLs, then follow the gallery's "next page" link."""
        item = AoisolasItem()
        item['name'] = response.css(".content h5::text").extract_first()
        item['ImgUrl'] = response.css(".content-pic img::attr(src)").extract()
        yield item
        next_url = response.css(".page-ch:last-child::attr(href)").extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.content)
最后进入项目根目录(scrapy.cfg 所在目录)下执行爬虫:
scrapy crawl AoiSola
祝爬虫顺利!