版权声明:作者:小白 https://blog.csdn.net/weixin_43687366/article/details/88958235
抓取网页上的全部图片,然后逐一保存到本地。
import requests
from lxml import etree
#面向对象编程
class Spider(object):
    """Crawl the mzitu.com listing pages and save every listed image to disk.

    Each listing page is fetched with ``requests``, parsed with ``lxml`` and
    the images found under ``<ul id="pins">`` are written to the current
    working directory, named after the image's ``alt`` text.
    """

    # Seconds to wait for the server before giving up on a request;
    # without a timeout ``requests`` may block indefinitely.
    _TIMEOUT = 10

    # Characters that are path separators or rejected by common file
    # systems are replaced by "_" before a title is used as a file name.
    _FORBIDDEN_TABLE = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})

    def __init__(self):
        # Anti-scraping workaround: send browser-like request headers
        # (values copied from the browser's network panel).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36',
            'Referer': 'https://www.mzitu.com/'
        }

    @classmethod
    def _safe_file_name(cls, title):
        """Return ``title`` turned into a usable ``.jpg`` file name."""
        cleaned = (title or "").translate(cls._FORBIDDEN_TABLE)
        # Leading/trailing blanks and dots are dropped so the name never
        # degenerates to something like " .jpg".
        cleaned = cleaned.strip(" .")
        return (cleaned or "untitled") + ".jpg"

    def start_request(self, first_page=1, last_page=203):
        """Fetch the listing pages and hand each parsed page to ``xpath_data``.

        Args:
            first_page: Number of the first listing page to fetch.
            last_page: Number of the last listing page to fetch (inclusive).
                The defaults reproduce the original fixed range 1..203.

        A page that cannot be fetched is reported and skipped so that one
        network error does not abort the whole crawl.
        """
        for i in range(first_page, last_page + 1):
            print("==========正在抓取%s页=========" % i)
            url = "https://www.mzitu.com/page/" + str(i) + "/"
            try:
                res = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
                # Do not try to parse an HTTP error page as a listing.
                res.raise_for_status()
            except requests.RequestException as err:
                print("==========页面抓取失败:%s=========" % err)
                continue
            # Undecodable bytes are replaced instead of raising, so a single
            # malformed page cannot stop the crawl.
            html = etree.HTML(res.content.decode("utf-8", errors="replace"))
            # NOTE(review): the parser may yield None for an empty document;
            # guard so xpath_data always receives a parsed tree.
            if html is None:
                continue
            self.xpath_data(html)

    def xpath_data(self, html):
        """Download and save every image referenced on one parsed listing page.

        Args:
            html: Parsed document exposing ``xpath()``; the image elements
                are looked up under ``//ul[@id="pins"]/li/a/img``.
        """
        # Read URL and title from the same <img> element so they cannot get
        # out of step when an element lacks one of the two attributes.
        for img in html.xpath('//ul[@id="pins"]/li/a/img'):
            src = img.get('data-original')
            if not src:
                continue
            file_name = self._safe_file_name(img.get('alt'))
            print("正在抓取图片:" + file_name)
            try:
                res = requests.get(src, headers=self.headers, timeout=self._TIMEOUT)
                # Avoid saving an HTTP error body as if it were an image.
                res.raise_for_status()
            except requests.RequestException as err:
                print("==========图片下载失败:%s=========" % err)
                continue
            try:
                with open(file_name, "wb") as f:
                    f.write(res.content)
            except OSError:
                # Only file-system failures are expected here (bad name,
                # permissions, full disk); anything else should surface.
                print("==========文件名有误!=========")
# Script entry point: build the crawler and start crawling right away.
# There is no __main__ guard, so this also runs when the module is imported.
spider = Spider()
spider.start_request()
代码中的关键步骤基本上都已加了注释。
下面直接展示运行结果。