# Full crawl of all images from mzitu.com (妹子图)

import requests,os,time,random
from lxml import etree
from urllib import request

# Check whether the download folder exists; create it if it does not.

# Make sure the download directory exists before anything is written to it.
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking and creating in which another process could create it.
os.makedirs('xiazai', exist_ok=True)

# Crawl all images from all listing pages.

def allpage_allpages(pages):
    """Crawl the galleries linked from listing pages 1 .. ``pages - 1``.

    Each listing page is fetched, the gallery links are extracted with XPath
    and the resulting list is handed to ``onepage_allpages``.

    Args:
        pages: Exclusive upper bound of the listing page numbers to visit
            (the loop is ``range(1, pages)``), so ``pages=3`` visits pages
            1 and 2. Values of 1 or less visit nothing.

    NOTE(review): if ``pages`` is meant to be a page *count*, the last page
    is skipped -- confirm the intended bound with the author.
    """
    for page_no in range(1, pages):
        # Build the URL of this listing page from its page number.
        url = 'http://www.mzitu.com/page/{}'.format(page_no)
        response = requests.get(url)
        # Parse the HTML and collect the link of every gallery on the page.
        tree = etree.HTML(response.text)
        gallery_urls = tree.xpath('//ul[@id="pins"]/li/a/@href')
        # Download every image of every gallery found on this page.
        onepage_allpages(gallery_urls)

# Crawl all images in one gallery (one entry of a listing page).

def onepage_allpages(onepage_allpages_url):
    """Download every image of every gallery in *onepage_allpages_url*.

    Each gallery shows one image per sub-page (``<gallery>/1``,
    ``<gallery>/2``, ...). The number of sub-pages is read from the gallery's
    pager and every sub-page is passed to ``onepage``.

    Args:
        onepage_allpages_url: List of gallery URLs. An empty list or ``None``
            is a no-op.

    Raises:
        IndexError: If the pager yields fewer than five ``<span>`` labels.
        ValueError: If the fifth pager label is not an integer.
    """
    if not onepage_allpages_url:
        return
    for gallery_url in onepage_allpages_url:
        response = requests.get(gallery_url)
        tree = etree.HTML(response.text)
        # The fifth pager label is taken as the total number of images
        # (presumably the number of the last page -- verify against the site).
        pager_labels = tree.xpath('//div[@class="pagenavi"]/a/span/text()')
        image_count = int(pager_labels[4])
        # Every image lives on its own sub-page, numbered from 1.
        for index in range(1, image_count + 1):
            onepage('{}/{}'.format(gallery_url, index))

# Crawl the image on a single page.

def onepage(onepage_url):
    """Download the image shown on *onepage_url* into the ``xiazai`` folder.

    The page is fetched, the ``src`` of its main image is extracted with
    XPath, and the image is requested and written to disk.

    Args:
        onepage_url: URL of a gallery sub-page that displays one image.

    Raises:
        IndexError: If the page contains no main image element.
    """
    page = requests.get(onepage_url)
    tree = etree.HTML(page.text)
    image_url = tree.xpath('//div[@class="main-image"]/p/a/img/@src')[0]

    headers = {
        # The image request must name the page it was linked from as referer
        # (presumably the image host rejects requests without it -- confirm).
        'referer': onepage_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    image_response = requests.get(image_url, headers=headers)

    # A random prefix is prepended to the original file name, presumably so
    # that images with the same name do not overwrite each other.
    filename = 'xiazai/' + str(random.random()) + image_url.split('/')[-1]
    with open(filename, 'wb') as image_file:
        image_file.write(image_response.content)

if __name__ == '__main__':
    allpage_allpages(3)

# Notes on crawling mzitu
# Steps:
# 1. Find the pattern shared by the listing-page URLs, generate the URLs
#    dynamically, and use XPath to collect the gallery links into a list.
# 2. Iterate over the URLs in the list, find the total page count, and loop
#    over the URL of each page to find the page that holds the image.
# 3. Take the image URL and request it once more (this last request must carry
#    a referer in its headers), then save it to the local folder with
#    ``with open``.
# 4. The referer is the URL of the request made just before.

# 猜你喜欢 (You may also like)

# Reposted from blog.csdn.net/chengjintao1121/article/details/81950549
# 今日推荐 (Today's recommendations)