爬虫抓取网页图片

版权声明:作者:小白 https://blog.csdn.net/weixin_43687366/article/details/88958235

 抓取网页的全部图片!然后再逐一保存!


import re

import requests
from lxml import etree
#面向对象编程
class Spider(object):
    """Scrape gallery thumbnail images from mzitu.com listing pages and save
    each image to the current working directory."""

    def __init__(self):
        # Anti-scraping countermeasure: the site rejects requests without a
        # browser-like User-Agent and a matching Referer (values taken from a
        # real browser session's network tab).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36',

            'Referer': 'https://www.mzitu.com/'

        }

    def start_request(self, start_page=1, end_page=203):
        """Fetch listing pages start_page..end_page (inclusive) and download
        every image found on each page.

        Defaults reproduce the original hard-coded range(1, 204).

        Raises:
            requests.HTTPError: if a listing page returns an error status.
        """
        for page in range(start_page, end_page + 1):
            print("==========正在抓取%s页=========" % page)
            # timeout keeps a stalled connection from hanging the whole crawl
            res = requests.get("https://www.mzitu.com/page/" + str(page) + "/",
                               headers=self.headers, timeout=10)
            # Fail loudly instead of silently parsing an error page.
            res.raise_for_status()
            html = etree.HTML(res.content.decode())
            self.xpath_data(html)

    def xpath_data(self, html):
        """Extract image URLs and titles from a parsed listing page, then
        download and save each image.

        Args:
            html: an lxml element tree of one listing page.
        """
        # Thumbnails are lazy-loaded: the real URL lives in @data-original.
        src_list = html.xpath('//ul[@id="pins"]/li/a/img/@data-original')
        # The @alt text doubles as the on-disk filename.
        alt_list = html.xpath('//ul[@id="pins"]/li/a/img/@alt')
        for src, alt in zip(src_list, alt_list):
            file_name = self._safe_filename(alt)
            res = requests.get(src, headers=self.headers, timeout=10)
            print("正在抓取图片:" + file_name)
            try:
                with open(file_name, "wb") as f:
                    f.write(res.content)
            # Was a bare `except:` — narrow to OSError, the only failure the
            # original message ("bad filename") actually describes.
            except OSError:
                print("==========文件名有误!=========")

    @staticmethod
    def _safe_filename(title):
        """Return a filesystem-safe '.jpg' filename for an image title by
        replacing characters that are illegal on Windows/most filesystems."""
        return re.sub(r'[\\/:*?"<>|]', '_', title) + '.jpg'
	


if __name__ == "__main__":
    # Guard the entry point so importing this module does not immediately
    # fire off ~200 HTTP requests.
    spider = Spider()
    spider.start_request()

里面代码基本上已经标注了!

下面直接上结果了

猜你喜欢

转载自blog.csdn.net/weixin_43687366/article/details/88958235