爬取4k图片网图片

注意：运行前请把代码中的图片保存目录改成你自己电脑上的路径，否则图片会保存到作者本机的目录（在其他电脑上可能无法创建）。

  1 import os
  2 import requests
  3 from lxml import etree
  4 from urllib.request import urlopen, Request
  5 import time
  6 
  7 class BiAnImage():
  8     def __init__(self):
  9         self.base_url = "http://pic.netbian.com"
 10         self.header = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
 11     def get_html(self, url):
 12         response = requests.get(url, self.header)
 13         if response.status_code == 200:
 14             response.encoding = response.apparent_encoding
 15             return response.text
 16         return None
 17     def get_url_1_list(self, html_1):
 18         url_1_items = []
 19         title_1_items = []
 20         x_html = etree.HTML(html_1)
 21         url_list = x_html.xpath('//div[@id="main"]/div[2]/a/@href')
 22         title_list = x_html.xpath('//div[@id="main"]/div[2]/a/text()')
 23         for url, title in zip(url_list, title_list):
 24             url_1_items.append(self.base_url + url)
 25             title_1_items.append(title)
 26         return title_1_items, url_1_items
 27     def get_url_2_list(self, html_2):
 28         url_2_items = []
 29         title_2_items = []
 30         x_html = etree.HTML(html_2)
 31         url_list = x_html.xpath('//ul[@class="clearfix"]/li/a/@href')
 32         title_list = x_html.xpath('//ul[@class="clearfix"]/li/a/b/text()')
 33         last_page = x_html.xpath('//a[text()="下一页"]/preceding-sibling::a[1]/text()')  # 直接查找下一页 => 上一个元素
 34         for url, title in zip(url_list, title_list):
 35             url_2_items.append(self.base_url + url)
 36             title_2_items.append(title)
 37         return url_2_items, title_2_items, last_page
 38     def get_image_url(self, image_html):
 39         x_image_html = etree.HTML(image_html)
 40         image_url = x_image_html.xpath('//a[@id="img"]/img/@src')
 41         return self.base_url + image_url[0]
 42     def save_image(self, save_path, image_name, image_url):
 43         req = Request(url=image_url, headers=self.header)
 44 
 45         content = urlopen(req).read()
 46         img_name = image_name.replace(' ', '') + image_url[-4:]
 47         with open(save_path + img_name, 'wb') as f:
 48             f.write(content)
 49             print(img_name, "下载完成...")
 50     def run(self):
 51         # 获取所有分类标题, 链接
 52         html = self.get_html(self.base_url)
 53         title_1_items, url_1_items = self.get_url_1_list(html)
 54         for title_1, url_1 in zip(title_1_items, url_1_items):
 55             if title_1 == "4K动漫":
 56             # if title_1 == "4K风景": TODO: 这里加一个判断就可以下载指定分类下的图片
 57                 html_2 = self.get_html(url_1)
 58                 url_2_items, title_2_items, last_page = self.get_url_2_list(html_2)
 59 
 60                 # 通过拿到分类页面中的last_page, 获取该分类下所有页面链接
 61                 for page in range(1, int(last_page[0])):
 62                     if page == 1:
 63                         more_url_1 = url_1  # more_url_1 是每个分类下每一页的链接
 64                     else:
 65                         more_url_1 = url_1 + "index_{}.html".format(page)
 66                     detail_html = self.get_html(more_url_1)
 67                     url_2_items, title_2_items, last_page = self.get_url_2_list(detail_html)
 68 
 69                     # 获取当前页面中所有图片链接
 70                     for url_2, title_2 in zip(url_2_items, title_2_items):
 71 
 72                         # print(title_1, url_1, last_page[0], more_url_1, title_2, url_2)
 73                         pictures = "C:/Users/25766/AppData/Local/Programs/Python/Python38/imgs/"
 74 
 75                         time.sleep(2)
 76                         # 在这里对下载的文件进行分类, 如果文件不存在, 就直接新建一个文件夹
 77                         if os.path.exists(pictures + title_1) is False:
 78                             os.makedirs(pictures + title_1)
 79                         save_path = pictures + title_1 + "/"
 80                         image_html = self.get_html(url_2)
 81                         img_url = self.get_image_url(image_html)
 82                         self.save_image(save_path, title_2, img_url)
 83                         #print(save_path)
 84 
 85                           # 跳出一个页面中所有图片链接
 86                      # 跳出一个分类的所有页面
 87                  # 跳出所有分类
 88 
 89 bian = BiAnImage()
 90 bian.run()
 91 

猜你喜欢

转载自www.cnblogs.com/rstz/p/12704537.html