import re
import requests
import os
# Working directory at import time; all download folders are created under it.
hehehe = os.getcwd()
# Browser-like request headers. NOTE(review): the Referer appears to be needed
# by the image host to serve images — confirm before removing.
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
'Referer': 'http://i.meizitu.net'}
# Fetch the HTML text of the "all" index page.
def get_root_html(root_url):
    """Download the index page at *root_url*.

    Returns the page text on HTTP 200; otherwise returns the literal
    failure-message string "页面获取失败" (callers treat it as empty HTML).
    """
    response = requests.get(root_url, headers=headers)
    if response.status_code != 200:
        return "页面获取失败"
    return response.text
# Parse the index page and collect the URL of every project on it.
def parse_root_html(root_html):
    """Return all project URLs (http://www.mzitu.com/<id>) found in *root_html*."""
    link_pattern = re.compile(
        r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"')
    return link_pattern.findall(root_html)
# Fetch a single project's first page.
def get_one_page_html(one_page_url):
    """Download *one_page_url* and return its HTML text (no status check)."""
    return requests.get(one_page_url, headers=headers).text
# Parse a project's first page: find how many image pages it has, then build
# the URL of every image page.
def parse_one_page_html(one_page_html, one_page_url):
    """Return the list of image-page URLs for one project.

    one_page_html: HTML of the project's first page.
    one_page_url:  the project's base URL (e.g. http://www.mzitu.com/123).
    Returns [<base>/1, <base>/2, ..., <base>/<max_num>].
    Raises IndexError if the pagination block is missing from the HTML.
    """
    # The <div class="pagenavi"> block holds the page numbers; the maximum
    # page number sits in the second-to-last <span> (the last span is the
    # "next page" link).
    pagenavi = re.findall(
        re.compile(r'<div\sclass="pagenavi">.*?</div>', re.S), one_page_html)
    spans = re.findall(re.compile(r'<span.*?</span>', re.S), pagenavi[0])
    max_num = int(re.findall(r'\d+', spans[-2])[0])
    # BUG FIX: pages are numbered 1..max_num. The old range(max_num) produced
    # a nonexistent "/0" page and skipped the final page.
    return [str(one_page_url) + '/' + str(i) for i in range(1, max_num + 1)]
# Create (if needed) a download folder named after the project title, under
# the start-up working directory, and chdir into it.
def mkdir_folder(one_page_html):
    """Create/enter the download folder for one project.

    one_page_html: HTML of the project page; the title is taken from the
                   <h2 class="main-title"> tag.
    Returns the raw (unsanitized) title string.
    Side effects: may create a directory under `hehehe`, and always changes
    the process working directory into it (downloads rely on the cwd).
    """
    folder_name = re.findall(
        re.compile(r'<h2\sclass="main-title">(.*?)</h2>', re.S), one_page_html)[0]
    # Windows forbids \ / : * ? " < > | in folder names; the old code only
    # replaced '?'. Replace every forbidden character with '_'.
    path = re.sub(r'[\\/:*?"<>|]', '_', str(folder_name)).strip()
    target = os.path.join(str(hehehe), path)
    if not os.path.exists(target):
        os.makedirs(target)
    os.chdir(target)  # switch into the folder; download_one_img writes to cwd
    print(folder_name + "文件创建成功")
    return folder_name
# Resolve one image page to the direct URL of the image on it.
def get_img_url(one_img_page_url):
    """Fetch *one_img_page_url* and return the src of its first <img> tag.

    Raises IndexError if no matching <img> tag is found.
    """
    page_text = requests.get(one_img_page_url, headers=headers).text
    img_pattern = re.compile(r'<img\ssrc="(.*?)"\salt=".*?"', re.S)
    return img_pattern.findall(page_text)[0]
# Download a single image into the current working directory.
def download_one_img(one_img_url, folder_name):
    """Download the image at *one_img_url*.

    one_img_url: direct image URL; its last 6 characters (e.g. "01.jpg")
                 become the local file name.
    folder_name: unused here, kept for interface compatibility — the target
                 folder is the cwd set by mkdir_folder.
    """
    img = requests.get(one_img_url, headers=headers)
    file_name = one_img_url[-6:]
    print("正在下载图片:" + str(file_name))
    # 'with' guarantees the file handle is closed even if the write raises.
    with open(str(file_name), 'wb') as f:
        f.write(img.content)
# Overall control flow of the crawler.
def main(root_url, max_projects=5):
    """Crawl the index at *root_url* and download the first few projects.

    root_url:     URL of the "all" index page.
    max_projects: number of projects to process (default 5, the old
                  hard-coded limit).
    """
    root_html = get_root_html(root_url)
    all_url_list = parse_root_html(root_html)
    # Slice instead of range(5): handles fewer than max_projects results
    # without an IndexError, and avoids shadowing a loop index.
    for project_url in all_url_list[:max_projects]:
        one_page_html = get_one_page_html(project_url)
        folder_name = mkdir_folder(one_page_html)
        for img_page_url in parse_one_page_html(one_page_html, project_url):
            download_one_img(get_img_url(img_page_url), folder_name)
# Script entry point: crawl the "all" index page and download images.
if __name__ == "__main__":
    main('http://www.mzitu.com/all')