Scraping images from mzitu.com: the Referer header

The Referer header has to match the URL of the current detail page (the page the image is embedded in); with a missing or wrong Referer, the image host's anti-hotlink check refuses the download.
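Before the full crawler below, a minimal sketch of just that idea. The page and image URLs here are hypothetical placeholders; substitute a real detail page and the image URL scraped from it.

import requests

# Hypothetical URLs for illustration only
page_url = 'http://www.mzitu.com/12345/3'              # detail page that embeds the image
img_url = 'http://i.meizitu.net/2018/08/example.jpg'   # image linked from that page

headers = {
    'Referer': page_url,   # must match the embedding page, or the image host rejects the request
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}

resp = requests.get(img_url, headers=headers)
with open('example.jpg', 'wb') as f:
    f.write(resp.content)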

from lxml import etree
import requests
import os



# Download every image in one gallery
def get_page_detail(url, title):

    # Create a download folder for this gallery
    if not os.path.exists('mzituDownload/' + title):
        os.makedirs('mzituDownload/' + title)

    response = requests.get(url)
    html_ele = etree.HTML(response.text)

    total_a = html_ele.xpath('//div[@class="pagenavi"]/a')
    # The second-to-last pagination link gives the number of images in this gallery
    total = total_a[-2].xpath('./span')
    print(total[0].text)
    total = int(total[0].text)
    for num in range(1, total + 1):
        # Every image of the gallery sits on its own page: <gallery url>/<num>
        full_url = url + '/' + str(num)
        print(full_url)
        response1 = requests.get(full_url)
        html_ele1 = etree.HTML(response1.text)
        src_page = html_ele1.xpath('//div[@class="main-image"]/p/a/img/@src')
        src_page = src_page[0]

        filename = 'mzitu' + src_page.split('/')[-1]
        print(src_page, filename)
        # Only these two headers are needed; the Referer must point at the detail
        # page that embeds the image, otherwise the image server refuses the request
        headers = {
            'Referer': full_url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        }
        response = requests.get(src_page, headers=headers)
        with open('mzituDownload/' + title + '/' + filename, 'wb') as f:
            f.write(response.content)


# Crawl the gallery list pages
def get_page(count):
    # Scrape `count` pages of the gallery listing
    url = 'http://www.mzitu.com/page/%d/'
    for page_num in range(1, count + 1):
        full_url = url % page_num

        response = requests.get(full_url)
        html_ele = etree.HTML(response.text)

        # Each <li> in the post list is one gallery: link plus title
        li_list = html_ele.xpath('//div[@class="postlist"]/ul/li')
        for li in li_list:
            href = li.xpath('./a/@href')
            print(href[0])
            title = li.xpath('./span[1]/a')
            print(title[0].text)

            get_page_detail(href[0], title[0].text)
if __name__ == '__main__':
    get_page(5)
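The same download also works with the standard library's urllib.request instead of requests; the trick is identical, the Referer just has to be attached to the Request before opening it. A sketch under the same assumptions (download_with_urllib is an illustrative helper name; src_page, full_url and save_path are the values computed inside the download loop above):

from urllib import request

def download_with_urllib(src_page, full_url, save_path):
    # Same anti-hotlink rule: the Referer must be the detail page that embeds the image
    req = request.Request(src_page, headers={
        'Referer': full_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    })
    with request.urlopen(req) as resp, open(save_path, 'wb') as f:
        f.write(resp.read())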


Reposted from blog.csdn.net/qq_41847171/article/details/81865226