Scraping Images with a Python Crawler in Practice (Multithreading and File Organization)

I stumbled across a Python post about scraping images, and the comments were full of praise for the author. In the spirit of learning, I visited www.mzitu.com to take a look; sure enough, quite the treat, so this is clearly a technique worth studying. Let's get to it: install PyCharm (there are plenty of tutorials for that online), create a new project, and start with the following code:

import requests
from lxml import etree


# Design pattern --> object-oriented programming
class Spider(object):
    def __init__(self):
        # Counter-measure against anti-scraping: add request headers
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Referer": "https://www.mzitu.com/xinggan/"
        }

    def start_request(self):
        # 1. Fetch each listing page with requests
        for i in range(1, 204):
            print("========== Scraping page %s ==========" % i)
            response = requests.get("https://www.mzitu.com/page/" + str(i) + "/", headers=self.headers)
            html = etree.HTML(response.content.decode())
            self.xpath_data(html)

    def xpath_data(self, html):
        # 2. Extract the data we want (title and image URL) with XPath
        src_list = html.xpath('//ul[@id="pins"]/li/a/img/@data-original')
        alt_list = html.xpath('//ul[@id="pins"]/li/a/img/@alt')
        for src, alt in zip(src_list, alt_list):
            file_name = alt + ".jpg"
            response = requests.get(src, headers=self.headers)
            print("Downloading image: " + file_name)
            # 3. Save the data as a .jpg with open()
            try:
                with open('D:\\meizi\\' + file_name, "wb") as f:
                    f.write(response.content)
            except OSError:
                # the alt text may contain characters that are illegal in a Windows file name
                print("========== Invalid file name! ==========")


spider = Spider()
spider.start_request()

An earlier version of this script was rejected by the server, so the key point is disguising the request headers, in particular the Referer. With that disguise in place, you can even move on to multithreaded scraping later.
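Why does the Referer matter? Hotlink protection: the image server checks that a request claims to come from one of the site's own pages. Here is a minimal sketch to see the effect for yourself; note that the image URL below is a made-up placeholder, and the status codes in the comments are what I would expect, not guaranteed behavior:

import requests

# Hypothetical image URL, purely for illustration
img_url = "https://i.meizitu.net/2019/01/01a01.jpg"

# Without a Referer, hotlink-protected servers typically refuse the request
plain = requests.get(img_url)
print(plain.status_code)   # e.g. 403

# With a Referer pointing at an on-site page, the request is usually accepted
disguised = requests.get(img_url, headers={
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.mzitu.com/xinggan/",
})
print(disguised.status_code)  # e.g. 200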
After pasting the code in, PyCharm may complain that the lxml module cannot be found. I went to File > Settings > Project: untitled1 > Project Interpreter, clicked the + button, and searched for the lxml module, but the install failed. So the workaround: press Win+R, type cmd, and in the console run: pip install lxml
This step takes a while, but in the end the module did install successfully. The squiggly error underline did not go away, though, and running the code still reported that the module could not be found. The fix I found was this:

Click Add, locate the path that pip installed into, and point the project interpreter at the python.exe under that path.

Find that path (the one the module is actually imported from) and copy it.
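A quick way to find that path is to ask Python itself; a minimal check, run with the same python that the cmd window's pip used:

import sys
import lxml

# The interpreter that pip installed into; point PyCharm's
# Project Interpreter at this python.exe
print(sys.executable)

# Where the lxml package actually lives on disk
print(lxml.__file__)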

Now for the moment of truth. Run the code, and the images really do come down (I'll spare you the result screenshots). Summing up the code's shortcomings:
1) No multithreading, so scraping is not especially fast
2) No fault tolerance: if the network hiccups mid-scrape a request should be retried, but there is no retry code (a standalone sketch of the retry idea follows this list)
3) Files are not organized into folders, which makes them hard to manage
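To make point 2 concrete before the full rewrite, here is the retry idea in isolation: a small helper of my own devising, not from the original post, where the retry count and delay are arbitrary choices:

import requests
from time import sleep

def get_with_retry(url, headers, retries=5, delay=0.5):
    """Fetch url, retrying up to `retries` times on network errors or a bad status."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass  # network hiccup: fall through to the retry
        sleep(delay)
    return None  # the caller decides what to do when all retries fail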

With that summary in mind, and still in the spirit of learning, I reorganized the code as follows:

# -*- coding: utf-8 -*-

import requests
import os
from lxml import etree
from threading import Thread, BoundedSemaphore
from time import sleep

nMaxThread = 3  # how many threads to allow at once
ThreadLock = BoundedSemaphore(nMaxThread)

gHeads = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}


class Meizitu(Thread):
    def __init__(self, url, title):
        Thread.__init__(self)
        self.url = url  # this url is reused as the Referer later
        self.title = title

    def run(self):
        try:
            PhotoUrl, Page = self.GetPhotoUrlAndPageNum()
            if PhotoUrl and Page > 0:
                self.SavePhoto(PhotoUrl, Page)
        finally:
            # always give the slot back so main() can start the next thread
            ThreadLock.release()

    def GetPhotoUrlAndPageNum(self):
        html = requests.get(self.url, headers=gHeads)
        if html.status_code == 200:
            xmlContent = etree.HTML(html.text)
            # "01.jpg" is exactly 6 characters, so [:-6] leaves the URL prefix
            PhotoUrl = xmlContent.xpath("//div[@class='main-image']/p/a/img/@src")[0][:-6]
            # the total page count sits in the 5th pagination link
            PageNum = xmlContent.xpath("//div[@class='pagenavi']/a[5]/span/text()")[0]
            return PhotoUrl, int(PageNum)
        else:
            # return 0 rather than None so the "Page > 0" check in run() is safe
            return None, 0

    def SavePhoto(self, url, page):
        savePath = "D:/meizi/photo/%s" % self.title
        if not os.path.exists(savePath):
            os.makedirs(savePath)  # one folder per gallery title
        for i in range(page):
            heads = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Referer": "%s/%d" % (self.url, i + 1),
                "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
            }
            j = 0
            while j < 5:  # retry each image up to 5 times
                print(u"Download : %s/%d.jpg" % (self.title, i + 1))
                html = requests.get("%s%02d.jpg" % (url, i + 1), headers=heads)
                if html.status_code == 200:
                    with open(savePath + "/%d.jpg" % (i + 1), "wb") as f:
                        f.write(html.content)
                    break
                elif html.status_code == 404:
                    j += 1
                    sleep(0.05)
                    continue
                else:
                    return None  # any other status: give up on this gallery


def main():
    # keep asking until we get a positive integer
    while True:
        try:
            nNum = int(input(u"How many listing pages to scrape: "))
            if nNum > 0:
                break
        except ValueError:
            print(u"Please enter a number.")
            continue
    for i in range(nNum):
        url = "https://www.mzitu.com/xinggan/page/%d/" % (i + 1)
        html = requests.get(url, headers=gHeads)
        if html.status_code == 200:
            xmlContent = etree.HTML(html.content)
            hrefList = xmlContent.xpath("//ul[@id='pins']/li/a/@href")
            titleList = xmlContent.xpath("//ul[@id='pins']/li/a/img/@alt")
            for href, title in zip(hrefList, titleList):
                ThreadLock.acquire()  # block until a thread slot is free
                t = Meizitu(href, title)
                t.start()


if __name__ == '__main__':
    main()
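A note on the concurrency design: main() acquires the BoundedSemaphore before starting each thread and run() releases it in a finally block, which caps the number of live download threads at nMaxThread. For comparison, the standard library's concurrent.futures achieves the same cap with less bookkeeping; a minimal sketch, where download_gallery and the gallery list are placeholders rather than the code above:

from concurrent.futures import ThreadPoolExecutor

def download_gallery(href, title):
    # stands in for the per-gallery work done by Meizitu.run()
    print("downloading", title, "from", href)

galleries = [("https://example.com/1", "a"), ("https://example.com/2", "b")]

# at most 3 galleries download at once, mirroring BoundedSemaphore(3)
with ThreadPoolExecutor(max_workers=3) as pool:
    for href, title in galleries:
        pool.submit(download_gallery, href, title)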