Scraping Meizitu images: a regex implementation

Page link: http://www.mzitu.com/all

The regex implementation:

import re
import requests
import os
hehehe = os.getcwd()  # download root: the current working directory

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Referer': 'http://i.meizitu.net'}


# Fetch the HTML of the "all" page
def get_root_html(root_url):
    response = requests.get(root_url, headers=headers)
    if response.status_code == 200:
        return response.text
    return ""  # fetch failed; the regex below will simply match nothing


# Parse the "all" page and extract the URL of every album listed on it
def parse_root_html(root_html):
    all_url_re = r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"'
    pattern = re.compile(all_url_re)
    all_url_list = re.findall(pattern, root_html)
    return all_url_list
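# A quick hypothetical check of the regex above (the sample anchor is made up):
#   >>> parse_root_html('<a href="http://www.mzitu.com/12345" target="_blank">')
#   ['http://www.mzitu.com/12345']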


# Fetch the page of a single album
def get_one_page_html(one_page_url):
    one_page_html = requests.get(one_page_url, headers=headers)
    return one_page_html.text


# Parse a single album page: find how many images the album has,
# then build the URL of every image page
def parse_one_page_html(one_page_html, one_page_url):
    # Grab the div holding the page navigation; it contains the max page number
    pattern = re.compile(r'<div\sclass="pagenavi">.*?</div>', re.S)
    div = re.findall(pattern, one_page_html)
    # Collect all span tags inside that div
    pattern1 = re.compile(r'<span.*?</span>', re.S)
    one_page_url_list = re.findall(pattern1, div[0])
    # The max page number sits in the second-to-last span; the digits inside
    # that span are the album's page count
    max_num_re = r'\d+'
    pattern2 = re.compile(max_num_re)
    max_num = re.findall(pattern2, one_page_url_list[-2])

    max_num = max_num[0]
    every_img_page_list = []
    # Image pages are numbered from 1, so iterate 1..max_num inclusive
    for i in range(1, int(max_num) + 1):
        one_img_url = str(one_page_url) + '/' + str(i)
        every_img_page_list.append(one_img_url)
    return every_img_page_list
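# A hypothetical walk-through of the parsing above: if the album page contains
#   <div class="pagenavi"><span>1</span>...<span>45</span><span>next</span></div>
# then one_page_url_list[-2] is '<span>45</span>', max_num becomes '45', and
# the function returns [one_page_url + '/1', ..., one_page_url + '/45'].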


# Create a folder for the album under the current path
def mkdir_folder(one_page_html):
    folder_name_re = r'<h2\sclass="main-title">(.*?)</h2>'
    pattern = re.compile(folder_name_re, re.S)
    folder_name = re.findall(pattern, one_page_html)[0]
    # Some titles contain '?', which Windows does not allow in folder names,
    # so replace it
    path = str(folder_name).replace("?", '_')
    path = path.strip()

    target = os.path.join(str(hehehe), path)
    if not os.path.exists(target):
        os.makedirs(target)
    os.chdir(target)  # switch into the album folder even if it already existed
    print(folder_name + " folder is ready")
    return folder_name
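# Hypothetical example of the title regex above: a page containing
#   <h2 class="main-title">Some Album Title?</h2>
# produces the folder name 'Some Album Title_' after the '?' replacement.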


# Extract the actual image URL from an image page
def get_img_url(one_img_page_url):
    one_img_page_html = requests.get(one_img_page_url, headers=headers)
    one_img_re = r'<img\ssrc="(.*?)"\salt=".*?"'
    pattern = re.compile(one_img_re, re.S)
    one_img_url = re.findall(pattern, one_img_page_html.text)[0]
    return one_img_url
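# Hypothetical example of the img regex above: given a tag like
#   <img src="http://i.meizitu.net/2018/06/example01.jpg" alt="...">
# the first capture group yields the src URL (the sample path is made up).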


# Download a single image into the current folder
def download_one_img(one_img_url, folder_name):
    img = requests.get(one_img_url, headers=headers)
    file_name = one_img_url[-6:]  # last 6 characters of the URL, e.g. '01.jpg'
    print("Downloading image: " + str(file_name))
    with open(str(file_name), 'wb') as f:
        f.write(img.content)


# Main control flow of the crawler
def main(root_url):
    root_html = get_root_html(root_url)
    all_url_list = parse_root_html(root_html)
    for i in range(5):  # only crawl the first 5 albums
        one_page_html = get_one_page_html(all_url_list[i])
        folder_name = mkdir_folder(one_page_html)
        every_img_page_list = parse_one_page_html(
            one_page_html, all_url_list[i])
        # a distinct loop variable here avoids shadowing the album index above
        for one_img_page_url in every_img_page_list:
            one_img_url = get_img_url(one_img_page_url)
            download_one_img(one_img_url, folder_name)


if __name__ == "__main__":
    main('http://www.mzitu.com/all')
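
Both core regexes can be sanity-checked offline before hitting the live site. The following is a minimal, self-contained sketch; the HTML samples are hypothetical stand-ins for the list page and an album's pagenavi block:

import re

list_html = '<a href="http://www.mzitu.com/12345" target="_blank">'
print(re.findall(r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"', list_html))
# -> ['http://www.mzitu.com/12345']

page_html = '<div class="pagenavi"><span>1</span><span>45</span><span>next</span></div>'
div = re.findall(r'<div\sclass="pagenavi">.*?</div>', page_html, re.S)[0]
spans = re.findall(r'<span.*?</span>', div, re.S)
print(re.findall(r'\d+', spans[-2])[0])  # -> '45'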


Reposted from www.cnblogs.com/amou/p/9206528.html