Python scraper practice (1): batch-downloading images of any category from 素材公社 (https://www.tooopen.com/img)

The overall idea: type in the image category you want to scrape plus the pages you want to download, and the script batch-downloads the images.

Below is the code broken into steps, together with the reasoning behind each one:

1. Create a folder to save the images

Here the folder is created in the same directory as the .py file; adjust the path to suit yourself.

path = os.getcwd()     # directory containing the current file
path_name = path + '/' + '素材公社'
if not os.path.exists(path_name):
    os.mkdir(path_name)
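
As an aside, a slightly more robust variant (my suggestion, not part of the original script) is os.makedirs with exist_ok=True, which also creates intermediate directories and makes the existence check unnecessary:

import os

# Equivalent variant: creates intermediate directories as needed and
# silently does nothing when the folder already exists.
path_name = os.path.join(os.getcwd(), '素材公社')
os.makedirs(path_name, exist_ok=True)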

2. Fetch the category names and URLs and build a dict

The categories we download do not include the top-level groups such as 风景图片 (landscape) or 人物图片 (people).

Looking at the category URLs, only the sub-category links contain an underscore, so an if condition in the dict comprehension filters the top-level groups out.

def get_meun():
    url = 'https://www.tooopen.com/img'
    res = requests.get(url, headers=headers)
    html = etree.HTML(res.text)
    urls = html.xpath('/html/body/div[3]/div/div/ul/li/a/@href')
    names = html.xpath('/html/body/div[3]/div//div/ul/li/a/text()')
    dic = {k: v for k, v in zip(names, urls) if '_' in v}
    return dic
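
To make the filter concrete, here is a tiny sketch with made-up data (the names and URLs are hypothetical; the real ones come from the site's menu):

# Hypothetical menu data, for illustration only.
names = ['风景图片', '自然风光', '花草图片']
urls = ['https://www.tooopen.com/img/87.aspx',      # top-level group: no '_'
        'https://www.tooopen.com/img/87_874.aspx',  # sub-category: contains '_'
        'https://www.tooopen.com/img/87_875.aspx']

dic = {k: v for k, v in zip(names, urls) if '_' in v}
print(dic)  # only the two sub-categories survive the filter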

3. Download the images

Inspecting the page source, we find that each thumbnail on a listing page is wrapped in an <a class="pic"> link pointing to that image's own detail page.

So we follow that link to the image's real page and look for the download link of the high-resolution version: the <img> element there carries the src URL of the full-size image, which is exactly what we need to download.

The code:

# Download the images
def download_img(url, start, end):
    count = 0  # number of images downloaded
    url1 = url.replace('.aspx', '_1_{}.aspx')  # real URL pattern of each listing page
    img_urls1 = []  # detail-page URLs of the images
    img_names = []  # image names, taken from the alt attribute
    for i in range(start, end + 1):
        url2 = url1.format(i)
        res = requests.get(url2, headers=headers).text
        img_url = re.findall(r'a class="pic" href="(.*?)"', res)  # links to the detail pages
        img_name = etree.HTML(res).xpath('/html/body/div[5]/ul/li/div/a/img/@alt')
        img_urls1 += img_url
        img_names += img_name
    img_urls2 = []  # direct URLs of the full-size images
    for j in img_urls1:
        res2 = requests.get(j, headers=headers).text
        img_url2 = etree.HTML(res2).xpath('/html/body/table/tr/td/img/@src')  # the real download link
        img_urls2 += img_url2
    for u, n in zip(img_urls2, img_names):
        file_name = n + '.jpg'
        img_content = requests.get(u, headers=headers).content
        with open(path_name + '/' + file_name, 'wb') as f:
            f.write(img_content)
        count += 1
        time.sleep(random.randint(1, 2))  # sleep after each image so we don't hit the site too fast and trip anti-scraping
    print('\n' + '-' * 15 + ' Downloaded {} images for you '.format(count) + '-' * 15 + '\n')
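
One caveat: the alt text becomes the file name, and it can contain characters that are illegal in file names (for example / or * on Windows). A minimal helper you could slot in (my addition, not in the original code):

import re

def safe_name(name):
    # Strip characters that are illegal in Windows/Unix file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip() or 'unnamed'

# inside download_img, replace the file-name line with:
# file_name = safe_name(n) + '.jpg'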

4. Enter the category and page range, then start the downloader

def main():
    pic_dic = get_meun()
    choice = input('Enter the image category you want to download: ')
    url3 = pic_dic.get(choice)
    print('=' * 15 + ' Image downloader started ' + '=' * 15)
    start_page = int(input('Enter the start page:   '))
    end_page = int(input('Enter the end page:   '))
    print('~ Starting your download ~')
    download_img(url3, start_page, end_page)
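
Note that pic_dic.get(choice) returns None when the typed category does not exist, and download_img would then crash on url.replace. A small guard you might add inside main (my suggestion, not in the original script):

url3 = pic_dic.get(choice)
if url3 is None:
    print('Unknown category. Available options: ' + ', '.join(pic_dic))
    return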

5. To make anti-scraping blocks less likely, rotate among several user agents when faking the request headers

user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"    
]
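
In the script below the user agent is chosen once at startup. If you would rather pick a fresh one per request, one option is a tiny helper that rebuilds the headers each time (a sketch, not what the original code does):

def make_headers():
    # Pick a new user agent on every call.
    return {
        "Referer": "https://www.tooopen.com/img",
        "User-Agent": random.choice(user_agent),
    }

# usage:  requests.get(url, headers=make_headers())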

6. The complete code I wrote

import requests
from lxml import etree
import re
import os
import time
import random


# Create the save folder in the same directory as this .py file
path = os.getcwd()
path_name = path + '/' + '素材公社'
if not os.path.exists(path_name):
    os.mkdir(path_name)

# Fetch the category menu and build a {name: url} dict
def get_meun():
    url = 'https://www.tooopen.com/img'
    res = requests.get(url, headers=headers)
    html = etree.HTML(res.text)
    urls = html.xpath('/html/body/div[3]/div/div/ul/li/a/@href')
    names = html.xpath('/html/body/div[3]/div//div/ul/li/a/text()')
    dic = {k: v for k, v in zip(names, urls) if '_' in v}
    return dic

# Download the images
def download_img(url, start, end):
    count = 0  # number of images downloaded
    url1 = url.replace('.aspx', '_1_{}.aspx')  # real URL pattern of each listing page
    img_urls1 = []  # detail-page URLs of the images
    img_names = []  # image names, taken from the alt attribute
    for i in range(start, end + 1):
        url2 = url1.format(i)
        res = requests.get(url2, headers=headers).text
        img_url = re.findall(r'a class="pic" href="(.*?)"', res)  # links to the detail pages
        img_name = etree.HTML(res).xpath('/html/body/div[5]/ul/li/div/a/img/@alt')
        img_urls1 += img_url
        img_names += img_name
    img_urls2 = []  # direct URLs of the full-size images
    for j in img_urls1:
        res2 = requests.get(j, headers=headers).text
        img_url2 = etree.HTML(res2).xpath('/html/body/table/tr/td/img/@src')  # the real download link
        img_urls2 += img_url2
    for u, n in zip(img_urls2, img_names):
        file_name = n + '.jpg'
        img_content = requests.get(u, headers=headers).content
        with open(path_name + '/' + file_name, 'wb') as f:
            f.write(img_content)
        count += 1
        time.sleep(random.randint(1, 2))  # sleep after each image so we don't hit the site too fast and trip anti-scraping
    print('\n' + '-' * 15 + ' Downloaded {} images for you '.format(count) + '-' * 15 + '\n')

def main():
    pic_dic = get_meun()
    choice = input('Enter the image category you want to download: ')
    url3 = pic_dic.get(choice)
    print('=' * 15 + ' Image downloader started ' + '=' * 15)
    start_page = int(input('Enter the start page:   '))
    end_page = int(input('Enter the end page:   '))
    print('~ Starting your download ~')
    download_img(url3, start_page, end_page)

user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"    
]

if __name__ == '__main__':
    headers = {
        "Referer": "https://www.tooopen.com/img",
        "User-Agent": random.choice(user_agent)  # note: the header name is "User-Agent" with a hyphen, not "User_Agent"
    }
    main()
      

Summary:

The basic idea behind simple data scraping:
fetch the page -> parse the page -> simulate pagination -> match the needed data -> save to local files
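
As a compact illustration of that pipeline (a generic sketch with a hypothetical base_url containing a {} page placeholder, not tied to this site):

import requests
from lxml import etree

def scrape(base_url, pages):
    for page in range(1, pages + 1):                       # simulate pagination
        text = requests.get(base_url.format(page)).text    # fetch the page
        html = etree.HTML(text)                            # parse it
        for src in html.xpath('//img/@src'):               # match the needed data
            with open(src.rsplit('/', 1)[-1], 'wb') as f:  # save to a local file
                f.write(requests.get(src).content)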


Reposted from blog.csdn.net/weixin_46940290/article/details/108692274