Scraping mzitu images with Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse HTML
import lxml                      # alternative HTML parser (avoids garbled Chinese text)
import os                        # create directories


path = 'C:/Users/Administrator/Desktop/tmp'  # base directory for downloaded files
folder = '/mzitu/'                           # sub-folder for this site

request_url = 'https://www.mzitu.com/all/'
headers = {
    'referer':request_url,
    'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400'
}
headers2 = headers.copy()  # separate copy for image requests; its referer is updated per page below
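
The referer header matters here: the image host typically rejects hot-linked requests, which is why the script keeps referer pointing at one of the site's own pages and updates it before every download. As an optional sanity check (a sketch, not part of the original script), a request like the one below should return status 200 before any parsing starts:

# Optional sanity check (sketch): the archive page should answer 200 with these headers
check = requests.get(request_url, headers=headers)
print('archive page status:', check.status_code)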

# Download one image into the given directory
def down_img(img_url, path, headers):
    ret_html = requests.get(url=img_url, headers=headers)
    # name the file after the last segment of the image URL
    name = path + "/" + img_url.split('/')[-1]
    with open(name, "wb") as f:   # "wb" so re-running overwrites instead of appending
        f.write(ret_html.content)
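
As a usage sketch, down_img can also be called on its own as below; the album URL and image URL are made-up placeholders, and the actual call is left commented out so running the file does not trigger an extra download.

# Usage sketch for down_img; the URLs below are made-up placeholders
example_headers = {
    'referer': 'https://www.mzitu.com/12345',   # album page the image belongs to
    'user-agent': headers['user-agent'],
}
# down_img('https://i.example.com/2019/01/12345a01.jpg', path, example_headers)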

# Fetch the archive page that lists every album
ret_html = requests.get(url=request_url, headers=headers)
# print(ret_html.content)
soup = BeautifulSoup(ret_html.content, 'html.parser')
# print(soup)
# Album links live inside a <p class="url"> block on the /all/ page
a_list = soup.find('p', attrs={"class":"url"}).find_all('a')
# print(a_list)
# Collect every album URL
url_list = []
for i in a_list:
    url_list.append(i['href'])
print(url_list)
for i in url_list:
    # Fetch the album's first page
    ret_html = requests.get(url=i, headers=headers)
    soup = BeautifulSoup(ret_html.content, 'html.parser')

    title = soup.find("h2", attrs={"class":"main-title"}).text                                # album title
    img_url = soup.find("div", attrs={"class":"main-image"}).find("img")['src']               # URL of the first image
    page = soup.find("div", attrs={"class":"pagenavi"}).find_all("a")[-2].find("span").text   # total number of pages
    ext = '.' + img_url.split('.')[-1]                                                        # image file extension

    # Create a folder for this album
    if not os.path.isdir(path + folder + title):
        os.makedirs(path + folder + title)

    # Download the first image; the referer must point at the album page
    img_url2 = img_url
    headers2['referer'] = i
    down_img(img_url2, path + folder + title, headers2)
    

    # Download the remaining images: the image file name ends with a two-digit
    # page index, so rebuild the URL for each page
    for j in range(2, int(page) + 1):
        img_url2 = img_url[0:-6] + str(j).zfill(2) + ext

        headers2['referer'] = i + "/" + str(j)   # referer is the matching album page
        down_img(img_url2, path + folder + title, headers2)
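
The loop above relies on the first image URL ending with a two-digit index right before the extension (for example ...01.jpg), which is the naming pattern the original script assumes. A small helper makes that assumption explicit; the sample URL in the comment is a made-up placeholder.

# Illustrative helper (assumption: the image URL ends with a two-digit index, e.g. ...01.jpg)
def nth_img_url(first_img_url, n):
    ext = '.' + first_img_url.split('.')[-1]
    return first_img_url[:-6] + str(n).zfill(2) + ext

# nth_img_url('https://i.example.com/2019/01/12345a01.jpg', 7)
#   -> 'https://i.example.com/2019/01/12345a07.jpg'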
        


Reposted from blog.csdn.net/m0_37711659/article/details/86646853