# Crawl hero skin images from the King of Glory (王者荣耀) official site with requests and regex, saving them into structured folders.
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
Created on Mon Aug 20 10:07:46 2018
@author: Python
"""
import requests
import re
import os
# Fetch one page and return its HTML text.
def get_one_page(url):
    """Fetch *url* and return its HTML text, re-decoded with the charset
    declared inside the page itself.

    Returns None when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url=url, timeout=10)
    except requests.RequestException:
        # Network error / timeout: treat like a failed fetch.
        return None
    if response.status_code != 200:
        return None
    req = response.text
    # 解决网页中文乱码 / fix mojibake: requests defaults to ISO-8859-1 when the
    # server sends no charset header; re-decode using the charset declared in
    # the HTML content itself (e.g. <meta charset="gbk">).
    declared = requests.utils.get_encodings_from_content(req)
    if not declared:
        # No charset declared in the content: return the text as decoded.
        return req
    return req.encode('ISO-8859-1').decode(declared[0], errors='replace')
# Parse the fetched list page, extract hero links, and download each skin image.
def parse_one_page(html):
    """Parse the hero-list page, visit each hero's detail page, and save the
    skin cover image into a per-hero folder.

    html: HTML text of the hero list page (may be None, in which case
          nothing is done).
    """
    if not html:
        return
    # Base directory for all hero folders. Raw string: '\U' in a normal
    # string literal is a SyntaxError in Python 3.
    base_dir = r'C:\Users\Python\Desktop\python\10.Crawler\Demo\heroPic'
    # Hero ids from the list page, e.g. herodetail/152.shtml -> "152".
    hero_pat = re.compile(
        r'<li><a href="herodetail/([\w]*)\.shtml" target="_blank">', re.S)
    # Skin banner background-image URL on the detail page; capture just the
    # URL between the quotes of url('...').
    pic_pat = re.compile(
        r'<div class="zk-con1 zk-con" style="background:url\(\'(.*?)\'\) center 0">')
    # Regex extraction is brittle here; a parser such as pyquery would be
    # more robust.
    title_pat = re.compile(r'<h3 class="cover-title">([\s\S].*?)</h3>')
    name_pat = re.compile(r'<h2 class="cover-name">([\s\S].*?)</h2>')
    for hero_id in re.findall(hero_pat, html):
        hero_url = "https://pvp.qq.com/web201605/herodetail/" + \
            str(hero_id) + ".shtml"
        hero_html = get_one_page(hero_url)
        if not hero_html:
            continue
        pic = re.findall(pic_pat, hero_html)
        if not pic:
            # Detail page layout differs; skip rather than crash on pic[0].
            continue
        # Skin title (image file name) and hero name (folder name).
        str_title = ''.join(re.findall(title_pat, hero_html))
        str_name = ''.join(re.findall(name_pat, hero_html))
        pic_url = "https:" + pic[0]
        try:
            pic_resp = requests.get(pic_url, timeout=10)
        except requests.RequestException:
            continue
        # One folder per hero; makedirs creates parents and avoids the
        # original missing-separator / chdir-relative-path bugs.
        hero_dir = os.path.join(base_dir, str_name)
        if not os.path.isdir(hero_dir):
            print('创建文件夹:{}'.format(str_name))
            os.makedirs(hero_dir)
        with open(os.path.join(hero_dir, str_title + '.jpg'), 'wb') as f:
            for chunk in pic_resp.iter_content(chunk_size=1024):
                f.write(chunk)
if __name__ == "__main__":
    # Seed URL: the official hero-list page that links to every hero.
    seed_url = "https://pvp.qq.com/web201605/herolist.shtml"
    listing_html = get_one_page(seed_url)
    parse_one_page(listing_html)