Scraping the One Piece manga with Scrapy

1. Create the project: scrapy startproject onepiecesScrapy
2. Create the spider:
cd onepiecesScrapy
scrapy genspider onepieces http://manhua.fzdm.com/02/
3. Modify the project files: add the following to settings.py

ROBOTSTXT_OBEY = False  # must be disabled; the site's robots.txt disallows crawling by default
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 0.25  # 250 ms of delay
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
4. Add a random User-Agent in the middleware
I use a proxy pool that I maintain myself, so I won't write a random proxy middleware here (a minimal sketch follows the middleware settings below); only the random User-Agent is shown.
Write the random-header middleware in middlewares.py.


import random


class RandomUserAgentMiddleware():
    def __init__(self):
        self.user_agents =  ["Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
    "Mozilla/4.0 (compatible; MSIE 60; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)"]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)

Then enable it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'onepiece.middlewares.RandomUserAgentMiddleware': 543,
}
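
The random proxy middleware mentioned above is not included in the post; as a reference, here is a minimal sketch of one, assuming you keep a simple PROXIES list in settings.py (the setting name and proxy addresses below are placeholders, not part of the original project):

import random


class RandomProxyMiddleware():
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # PROXIES is an assumed setting, e.g. ['http://127.0.0.1:8888', ...]
        return cls(crawler.settings.getlist('PROXIES'))

    def process_request(self, request, spider):
        # Scrapy routes the request through whatever proxy is set in request.meta['proxy']
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)

It would be registered in DOWNLOADER_MIDDLEWARES the same way as the User-Agent middleware above.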

Code analysis

To keep up with how the manga is progressing, reading the corresponding chapters is a good option, and KuKu动漫 offers free previews, which meets my needs. The URL: http://manhua.fzdm.com/02/.
Before going any further, one thing has to be said: do not use this program for any commercial purpose; it is for study and exchange only. Respect copyright and buy the official manga.
1. Spider analysis

import re

import scrapy

from ..items import ComicItem  # relative import; adjust to your project module name


class OnepiecesSpider(scrapy.Spider):
    name = 'onepieces'
    allowed_domains = ['manhua.fzdm.com']
    start_urls = ['http://manhua.fzdm.com/02/']
    server_img = 'http://p17.xiaoshidi.net/'
    pattern_img = re.compile(r'var mhurl1="(.*?)"')

    def parse(self, response):
        # unused; requests are sent from start_requests with explicit callbacks
        pass

    # Send the initial request from start_requests
    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse1)

    # Parse the response and collect the chapter links and names
    def parse1(self, response):

        items = []
        # Chapter link addresses
        urls = response.xpath('//*[@id="content"]/li/a[1]/@href').extract()
        # Chapter names
        dir_names = response.xpath('//*[@id="content"]/li/a[1]/text()').extract()
        # Save each chapter link and chapter name
        for index in range(len(urls)):
            item = ComicItem()
            item['link_url'] = response.urljoin(urls[index])
            item['dir_name'] = dir_names[index]
            item["img_url"] = []
            items.append(item)
        # For every chapter link, send a Request and pass the item along
        for item in items:
            yield scrapy.Request(url=item['link_url'], meta={'item': item}, callback=self.parse2)


    # Parse the first page of a chapter: page count and image link
    def parse2(self, response):
        # Receive the item passed along with the request
        item = response.meta['item']
        # Image link for the first page, taken from an inline script (var mhurl)
        img_url = response.xpath('/html/body/script[7]').re(r'var mhurl="(.*?)"')
        item['img_url'].append(self.server_img + img_url[0])
        # Image link embedded by the pjax container script (var mhurl1)
        pre_img_url = response.xpath('//*[@id="pjax-container"]/script[2]/text()').extract_first()
        img_url = [self.server_img + re.findall(self.pattern_img, pre_img_url)[0]]
        # Save the first-page image links into img_url
        item['img_url'].extend(img_url)
        # Follow the "next page" button, passing the item along
        next_url = response.xpath('//*[@id="pjax-container"]//a[@class="pure-button pure-button-primary"]/@href').extract_first()
        url = response.urljoin(next_url)
        yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse3)


    # Parse the remaining pages of the chapter for image links
    def parse3(self, response):
        # Receive the item passed along with the request
        item = response.meta['item']
        # Image link embedded in the inline script of this page
        pre_img_url = response.xpath('//*[@id="pjax-container"]/script[2]/text()').extract_first()
        matches = re.findall(self.pattern_img, pre_img_url or '')
        if matches:  # no match means we have gone past the last page
            img_url = self.server_img + matches[0]
            # Save the image link into img_url
            item['img_url'].append(img_url)
            print(item['dir_name'] + img_url)
            next_url = response.xpath('//*[@id="pjax-container"]//a[@class="pure-button pure-button-primary"]/@href').extract()[1]
            print("next: " + next_url)
            url = response.urljoin(next_url)

            # Request the next page, then hand the item to the item pipeline to download the images

            yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse3)
            yield item
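
To make the script-parsing step above concrete, here is a small standalone sketch of what pattern_img extracts; the sample script text is made up for illustration, assuming the chapter page embeds the image path as var mhurl1="...":

import re

pattern_img = re.compile(r'var mhurl1="(.*?)"')
server_img = 'http://p17.xiaoshidi.net/'

# Made-up example of the inline script text selected by the XPath in parse2/parse3
sample_script = 'var mhurl1="2019/05/onepiece_001_003.jpg";var totalpage=18;'

match = re.findall(pattern_img, sample_script)
if match:
    # -> http://p17.xiaoshidi.net/2019/05/onepiece_001_003.jpg
    print(server_img + match[0])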

2. Writing the items file
An Item is the container that holds the scraped data, and it is used just like a dict.
To create an Item, subclass scrapy.Item and define each field as scrapy.Field.

import scrapy


class ComicItem(scrapy.Item):
    dir_name = scrapy.Field()
    link_url = scrapy.Field()
    img_url = scrapy.Field()
    image_paths = scrapy.Field()
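
Since an Item behaves like a dict, here is a quick sketch of how the spider above fills one in (the values are illustrative):

item = ComicItem()
item['dir_name'] = '第001话'                         # chapter name
item['link_url'] = 'http://manhua.fzdm.com/02/001/'  # chapter link
item['img_url'] = []                                 # image links are appended page by page
print(dict(item))                                    # reads and converts just like a dict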

3. Writing the pipelines file


import os

import requests

from onepiece import settings  # assumed project settings module; adjust to your project name


class OnepiecePipeline(object):
    def process_item(self, item, spider):
        return item


class ComicImgDownloadPipeline(object):

    def process_item(self, item, spider):
        # Only act if the item carries image links
        if 'img_url' in item:
            images = []
            # Directory for this chapter
            dir_path = '%s/%s' % (settings.IMAGES_STORE, item['dir_name'])
            # Create the directory if it does not exist
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Walk every image link
            for image_url in item['img_url']:
                # Split the link to name the image file (page number and extension)
                houzhui = image_url.split('/')[-1].split('.')[-1]
                qianzhui = image_url.split('/')[-1].split('.')[0]
                # Image file name, e.g. "第3页.jpg"
                image_file_name = '第' + qianzhui + '页.' + houzhui
                # Full save path for the image
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue
                # Download and save the image
                with open(file_path, 'wb') as handle:
                    response = requests.get(url=image_url)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            # Record the saved paths on the item
            item['image_paths'] = images
        return item
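
For the download pipeline to actually run, it still has to be enabled in settings.py, and IMAGES_STORE (the save root used above) has to point somewhere. A minimal sketch, assuming the project module is named onepiece as in the DOWNLOADER_MIDDLEWARES setting above:

# settings.py
IMAGES_STORE = './comics'  # root directory for the downloaded chapters; any writable path works

ITEM_PIPELINES = {
    'onepiece.pipelines.ComicImgDownloadPipeline': 300,
}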

Finally, run the crawl: scrapy crawl onepieces

Code download: https://download.csdn.net/download/huangwencai123/11142906

Reposted from blog.csdn.net/huangwencai123/article/details/89510848