Crawling the One Piece manga with Scrapy
1. Create a project: `scrapy startproject onepieces`
2. Create the spider:
cd onepieces
scrapy genspider onepieces http://manhua.fzdm.com/02/
3. Add/override the following in the project's settings.py:
ROBOTSTXT_OBEY = False  # must be disabled: with the default (True) the crawler obeys robots.txt and the site will not let you crawl
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 0.25  # 250 ms delay between requests
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
4. Add a random User-Agent downloader middleware.
If you need rotating proxies, maintain your own proxy pool; here we skip proxies and only randomize the User-Agent.
Add the random-header middleware to middlewares.py:
class RandomUserAgentMiddleware():
    """Downloader middleware that stamps a random User-Agent header onto every
    outgoing request, so the crawl looks like traffic from many different
    browsers/devices.

    Enable it in settings.py via DOWNLOADER_MIDDLEWARES.
    """

    def __init__(self):
        # Pool of desktop and mobile browser User-Agent strings to pick from.
        # NOTE(review): "MSIE 60" below is probably a typo for "MSIE 6.0",
        # but the string is kept as-is since any plausible UA works here.
        self.user_agents = ["Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
            "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
            "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
            "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
            "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
            "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/4.0 (compatible; MSIE 60; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)"]

    def process_request(self, request, spider):
        """Scrapy downloader-middleware hook: overwrite the request's
        User-Agent header with one chosen at random from the pool."""
        # Bug fix: the original snippet used `random` without importing it
        # anywhere. Imported locally so the snippet is self-contained; in the
        # real middlewares.py, put `import random` at the top of the file.
        import random
        request.headers['User-Agent'] = random.choice(self.user_agents)
Then enable the middleware in settings.py:
# Register the random User-Agent middleware (543 = priority in the chain).
# NOTE(review): the module path must match the project package name — the
# project was created as "onepieces" earlier, but this path says "onepiece";
# confirm which name the package directory actually uses.
DOWNLOADER_MIDDLEWARES = {
    'onepiece.middlewares.RandomUserAgentMiddleware': 543,
}
Code Analysis
To follow the anime's plot, reading the corresponding manga is a good choice. This free manga site meets my needs; its URL is: http://manhua.fzdm.com/02/.
Before going further: do not use this program for any commercial purpose — it is for learning and exchange only. Respect copyright and please buy the genuine manga.
1. Spider analysis
class OnepiecesSpider(scrapy.Spider):
    """Spider that walks the One Piece chapter index on manhua.fzdm.com and
    collects every page's image URL for each chapter.

    Flow: start_requests -> parse1 (chapter index) -> parse2 (first page of a
    chapter) -> parse3 (remaining pages, following the "next page" button).
    """
    name = 'onepieces'
    allowed_domains = ['manhua.fzdm.com']
    start_urls = ['http://manhua.fzdm.com/02/']
    # Image host prefix; the page scripts only contain the path part of the URL.
    server_img = 'http://p17.xiaoshidi.net/'
    # Extracts the image path from `var mhurl1="..."` in an inline <script>.
    pattern_img = re.compile(r'var mhurl1="(.*?)"')

    def parse(self, response):
        # Unused: start_requests routes the first response to parse1 instead.
        pass

    def start_requests(self):
        # Send the initial request for the chapter index page.
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse1)

    def parse1(self, response):
        """Parse the chapter index: build one ComicItem per chapter and
        request each chapter's first page."""
        items = []
        # Chapter link URLs (relative) and chapter titles, in parallel order.
        urls = response.xpath('//*[@id="content"]/li/a[1]/@href').extract()
        dir_names = response.xpath('//*[@id="content"]/li/a[1]/text()').extract()
        # Pair each chapter link with its title in an item.
        for index in range(len(urls)):
            item = ComicItem()
            item['link_url'] = response.urljoin(urls[index])
            item['dir_name'] = dir_names[index]
            item["img_url"] = []
            items.append(item)
        # Request each chapter page, carrying the item along via meta.
        for item in items:
            yield scrapy.Request(url=item['link_url'], meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        """Parse a chapter's first page: record its image URL(s) and follow
        the link to the next page."""
        item = response.meta['item']
        # First-page image path embedded as `var mhurl="..."` in a script tag.
        # Bug fix: the original indexed img_url[0] unconditionally, which
        # raised IndexError when the script/pattern was missing.
        img_url = response.xpath('/html/body/script[7]').re(r'var mhurl="(.*?)"')
        if img_url:
            item['img_url'].append(self.server_img + img_url[0])
        # Second extraction via `var mhurl1="..."` from the pjax container.
        # NOTE(review): this may yield the same image as the mhurl extraction
        # above, double-recording the first page — verify against the site.
        pre_img_url = response.xpath('//*[@id="pjax-container"]/script[2]/text()').extract_first()
        matches = re.findall(self.pattern_img, pre_img_url) if pre_img_url else []
        if matches:
            item['img_url'].extend([self.server_img + matches[0]])
        # Follow the "next page" button to the rest of the chapter.
        next = response.xpath('//*[@id="pjax-container"]//a[@class="pure-button pure-button-primary"]/@href').extract_first()
        url = response.urljoin(next)
        yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse3)

    def parse3(self, response):
        """Parse a subsequent chapter page: record its image URL and keep
        following the "next page" button until there is none."""
        item = response.meta['item']
        pre_img_url = response.xpath('//*[@id="pjax-container"]/script[2]/text()').extract_first()
        # Bug fix: the original did re.findall(...)[0] BEFORE testing for
        # emptiness, so the "last page" check could never fire — it raised
        # IndexError (or TypeError on a missing script) first.
        matches = re.findall(self.pattern_img, pre_img_url) if pre_img_url else []
        if matches:
            item['img_url'].extend([self.server_img + matches[0]])
            print(item["dir_name"] + self.server_img + matches[0])
        # Bug fix: the original did extract()[1] unconditionally, which raised
        # IndexError on the final page (fewer buttons) and silently lost the
        # item before it could be yielded.
        nexts = response.xpath('//*[@id="pjax-container"]//a[@class="pure-button pure-button-primary"]/@href').extract()
        if len(nexts) > 1:
            next = nexts[1]
            print("next :" + next)
            url = response.urljoin(next)
            yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse3)
        # Hand the (still growing) item to the pipeline; the pipeline skips
        # files that already exist, so repeated delivery is redundant but safe.
        yield item
2. Writing the items file
An Item is the container that holds the scraped data; it is used much like a dictionary.
Define an Item by subclassing scrapy.Item, with each field declared as scrapy.Field().
class ComicItem(scrapy.Item):
    """Container for one chapter's scraped data (used like a dict)."""
    dir_name = scrapy.Field()     # chapter title; also used as the folder name
    link_url = scrapy.Field()     # absolute URL of the chapter's first page
    img_url = scrapy.Field()      # list of image URLs collected page by page
    image_paths = scrapy.Field()  # local file paths filled in by the pipeline
3. Writing the pipelines file
class OnepiecePipeline(object):
    """Default no-op pipeline: every item passes through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do — the actual image downloading happens elsewhere.
        return item
class ComicImgDownloadPipeline(object):
    """Pipeline that downloads every image URL carried by an item and saves it
    under IMAGES_STORE/<chapter name>/第<page>页.<ext>.

    Relies on the project's `settings.IMAGES_STORE`, `os` and `requests`
    being imported at the top of pipelines.py.
    """

    def process_item(self, item, spider):
        # Only act on items that actually carry image links.
        if 'img_url' in item:
            images = []
            # Per-chapter folder, e.g. IMAGES_STORE/<dir_name>/.
            dir_path = '%s/%s' % (settings.IMAGES_STORE, item['dir_name'])
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['img_url']:
                # Derive the file name from the URL's last path component.
                houzhui = image_url.split('/')[-1].split('.')[-1]   # extension
                qianzhui = image_url.split('/')[-1].split('.')[0]   # page number
                image_file_name = '第' + qianzhui + '页.' + houzhui
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                # Skip pages already downloaded on a previous run.
                if os.path.exists(file_path):
                    continue
                # Bug fix: fetch BEFORE opening the file. The original opened
                # the file first, so a failed request left an empty file
                # behind that the exists() check above would then treat as
                # already downloaded. Also: stream=True avoids buffering the
                # whole image before iter_content, and timeout prevents a
                # dead connection from hanging the pipeline forever.
                response = requests.get(url=image_url, stream=True, timeout=30)
                with open(file_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            # Record where the images were saved.
            item['image_paths'] = images
        return item
Finally, run the crawl: `scrapy crawl onepieces`
Code Address: https://download.csdn.net/download/huangwencai123/11142906