# NOTE: the Referer header must match the URL of the page currently being fetched (Referer需要匹配最新页面的url)
from lxml import etree
import requests
import os
from urllib import request
# Download all images of a single gallery
def get_page_detail(url, title):
    """Download every image of one gallery into mzituDownload/<title>/.

    url   -- the gallery detail-page URL (e.g. http://www.mzitu.com/12345)
    title -- gallery title, used as the download sub-directory name
    """
    # Browser-like headers: the site rejects requests without a User-Agent,
    # and image downloads additionally need a Referer matching the page the
    # image is embedded in (anti-hotlinking).
    base_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }
    # Create the per-gallery download directory.
    target_dir = os.path.join('mzituDownload', title)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    page_response = requests.get(url, headers=base_headers)
    html_ele = etree.HTML(page_response.text)
    # The second-to-last pagination link contains the total number of
    # image pages in this gallery.
    total_a = html_ele.xpath('//div[@class="pagenavi"]/a')
    total = total_a[-2].xpath('./span')
    print(total[0].text)
    total = int(total[0].text)
    for num in range(1, total + 1):
        # Each image lives on its own sub-page: <gallery-url>/<num>
        full_url = url + '/' + str(num)
        print(full_url)
        detail_response = requests.get(full_url, headers=base_headers)
        html_ele1 = etree.HTML(detail_response.text)
        src_list = html_ele1.xpath('//div[@class="main-image"]/p/a/img/@src')
        if not src_list:
            # Layout changed or the request was blocked -- skip this page
            # instead of crashing with an IndexError.
            continue
        src_page = src_list[0]
        filename = 'mzitu' + src_page.split('/')[-1]
        print(src_page, filename)
        # Referer must match the page this image belongs to, otherwise the
        # server returns a placeholder instead of the real image.
        img_headers = dict(base_headers, Referer=full_url)
        img_response = requests.get(src_page, headers=img_headers)
        with open(os.path.join(target_dir, filename), 'wb') as f:
            f.write(img_response.content)
# Fetch the gallery listing pages
def get_page(count):
    """Crawl *count* listing pages and download every gallery on them.

    count -- number of listing pages to crawl, starting at page 1.
             (The original hard-coded range(1, 2) ignored this parameter
             despite the caller passing 5 -- fixed to honour it.)
    """
    headers = {
        # The site rejects requests without a browser-like User-Agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }
    url = 'http://www.mzitu.com/page/%d/'
    for page_num in range(1, count + 1):
        full_url = url % page_num
        response = requests.get(full_url, headers=headers)
        html_ele = etree.HTML(response.text)
        # Each <li> in the post list is one gallery entry.
        li_list = html_ele.xpath('//div[@class="postlist"]/ul/li')
        for li in li_list:
            href = li.xpath('./a/@href')
            title = li.xpath('./span[1]/a')
            print(href[0])
            print(title[0].text)
            get_page_detail(href[0], title[0].text)
# Script entry point: kick off the crawl with a page-count argument.
if __name__ == '__main__':
    get_page(5)