不小心看到了一个爬取图片的Python帖子,看了一下评论,各种夸赞楼主好人,本着学习的心态,点进www.mzitu.com去一看,果然好福利,嗯,这种技术值得学习!!说做就做,安装pyCharm,这个网上破解教程比较多,安装以后,我们新建工程,代码如下:
import requests
from lxml import etree
# 设计模式 --》面向对象编程
class Spider(object):
    """Crawl mzitu.com listing pages and download each gallery's cover image."""

    def __init__(self):
        # Anti-anti-scraping measure: send browser-like headers.
        # The Referer is required or the image host rejects the request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Referer": "https://www.mzitu.com/xinggan/"
        }

    def start_request(self):
        # 1. Fetch each listing page (1..203) with requests.
        for i in range(1, 204):
            print("==========正在抓取%s页==========" % i)
            try:
                response = requests.get(
                    "https://www.mzitu.com/page/" + str(i) + "/",
                    headers=self.headers,
                    timeout=10,  # don't hang forever on a bad connection
                )
                response.raise_for_status()
            except requests.RequestException:
                # Skip this listing page on network/HTTP failure and keep going.
                continue
            html = etree.HTML(response.content.decode())
            self.xpath_data(html)

    def xpath_data(self, html):
        # 2. Extract image URL + title pairs via XPath.
        src_list = html.xpath('//ul[@id="pins"]/li/a/img/@data-original')
        alt_list = html.xpath('//ul[@id="pins"]/li/a/img/@alt')
        for src, alt in zip(src_list, alt_list):
            file_name = alt + ".jpg"
            try:
                response = requests.get(src, headers=self.headers, timeout=10)
            except requests.RequestException:
                # One failed image should not abort the whole page.
                continue
            print("正在抓取图片:" + file_name)
            # 3. Persist the image bytes to disk.
            try:
                with open('D:\\meizi\\' + file_name, "wb") as f:
                    f.write(response.content)
            except OSError:
                # Gallery titles may contain characters illegal in Windows
                # filenames; was a bare `except:` that hid every error.
                print("==========文件名有误!==========")
# Script entry: instantiate the crawler and start fetching immediately
# (runs on import too — there is no __main__ guard in this snippet).
spider = Spider()
spider.start_request()
因为之前写的脚本被服务器拒绝,所以重点是伪装请求头,即Referer,这个伪装好以后,可以开启多线程进行爬取,粘贴好该代码以后,可能会报错,提示找不到lxml模块,楼主用File>Settings>Project:untitled1> Project Interpreter点击+号,搜索lxml模块安装,结果提示失败,没办法,采取解决办法如下:
按Win+R打开“运行”窗口,输入cmd打开命令行,然后执行:pip install lxml
这一步时间有点长,结果最终竟然导入模块成功,但是波浪线报错消息并没有消失,执行代码的时候,仍旧是提示找不到模块,找到了一个解决办法:
点击add,找到pip install路径,引入该路径下的python.exe即可,我们查看pip路径,方法如下:
找到路径,并且复制该路径(module导入的路径),如下:
好了,到了见证奇迹的时刻了,我们执行代码,结果真的就取到了,结果图就不贴了,总结一下代码缺点:
1)未用多线程,爬取速度不是特别快
2)未有容错机制,如果爬取过程中网络不好,需要重试,未增加重试代码
3)文件未分类,实在不方便管理
根据这个总结,本着学习的态度,重新整理代码如下:
# -*- coding: utf-8 -*-
import requests
import os
from lxml import etree
from threading import *
from time import sleep
nMaxThread = 3  # How many downloader threads may run at once
# Semaphore capping concurrency at nMaxThread: acquired in main() before each
# thread starts, released in Meizitu.run() when the thread finishes.
ThreadLock = BoundedSemaphore(nMaxThread)
# Shared headers for listing/gallery page requests (image requests build
# their own per-request headers with a Referer).
gHeads = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
class Meizitu(Thread):
    """Worker thread that downloads every image of one gallery.

    One instance handles one gallery URL: it discovers the image URL prefix
    and page count, then saves pages 1..N under D:/meizi/photo/<title>.
    """

    def __init__(self, url, title):
        Thread.__init__(self)
        self.url = url      # gallery page URL; reused as the Referer below
        self.title = title  # gallery title, used as the folder name

    def run(self):
        try:
            PhotoUrl, Page = self.GetPhotoUrlAndPageNum()
            if PhotoUrl and Page > 0:
                self.SavePhoto(PhotoUrl, Page)
        finally:
            # Always return the slot so main() can start the next thread,
            # even when the download raised.
            ThreadLock.release()

    def GetPhotoUrlAndPageNum(self):
        """Return (image URL prefix, page count) for the gallery, or (None, None)."""
        try:
            html = requests.get(self.url, headers=gHeads, timeout=10)
        except requests.RequestException:
            return None, None
        if html.status_code != 200:
            return None, None
        xmlContent = etree.HTML(html.text)
        srcs = xmlContent.xpath("//div[@class='main-image']/p/a/img/@src")
        nums = xmlContent.xpath("//div[@class='pagenavi']/a[5]/span/text()")
        if not srcs or not nums:
            # Layout changed or an anti-bot page was served: the original
            # `xpath(...)[0]` would raise IndexError here and kill the thread.
            return None, None
        # Strip the trailing "01.jpg" (6 characters) to get the URL prefix.
        PhotoUrl = srcs[0][:-6]
        try:
            return PhotoUrl, int(nums[0])
        except ValueError:
            return None, None

    def SavePhoto(self, url, page):
        savePath = "D:/meizi/photo/%s" % self.title
        if not os.path.exists(savePath):
            os.makedirs(savePath)
        for i in range(page):
            # Per-image headers: the image host validates Referer against the
            # gallery page, so it must be rebuilt for every page number.
            heads = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Referer": "%s/%d" % (self.url, i + 1),
                "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
            }
            j = 0
            while j < 5:  # retry the same image up to 5 times
                print(u"Download : %s/%d.jpg" % (self.title, i + 1))
                try:
                    html = requests.get("%s%02d.jpg" % (url, i + 1),
                                        headers=heads, timeout=10)
                except requests.RequestException:
                    # Transient network failure counts as a retry too; the
                    # original let the exception kill the thread mid-gallery.
                    j += 1
                    sleep(0.05)
                    continue
                if html.status_code == 200:
                    with open(savePath + "/%d.jpg" % (i + 1), "wb") as f:
                        f.write(html.content)
                    break
                elif html.status_code == 404:
                    j += 1
                    sleep(0.05)
                    continue
                else:
                    # Unexpected status (e.g. 403 ban): abandon this gallery.
                    return None
def main():
    """Crawl the first nNum listing pages and spawn one thread per gallery."""
    # NOTE: the surrounding dead `while True`/`except ValueError` machinery
    # suggests this was once `int(input(...))`; it is hard-coded to 20 here,
    # so `int(20)` could never raise and the prompt branch was unreachable.
    nNum = 20
    for pageIdx in range(nNum):
        url = "https://www.mzitu.com/xinggan/page/%d/" % (pageIdx + 1)
        try:
            html = requests.get(url, headers=gHeads, timeout=10)
        except requests.RequestException:
            # Skip an unreachable listing page and keep crawling.
            continue
        if html.status_code != 200:
            continue
        xmlContent = etree.HTML(html.content)
        hrefList = xmlContent.xpath("//ul[@id='pins']/li/a/@href")
        titleList = xmlContent.xpath("//ul[@id='pins']/li/a/img/@alt")
        # zip() pairs URL with title and avoids the original's inner
        # `for i in range(len(hrefList))`, which shadowed the outer `i`.
        for href, title in zip(hrefList, titleList):
            # Block until a thread slot is free; released in Meizitu.run().
            ThreadLock.acquire()
            t = Meizitu(href, title)
            t.start()


if __name__ == '__main__':
    main()