python 爬取煎蛋ooxx妹子图

煎蛋网妹子图首页(http://jandan.net/ooxx),这个链接看起来怎么那么邪恶呢?经分析网站隐藏了图片地址。心一横,采取曲线路线,成功爬取大量妹子图~

源码如下:

 1 import requests
 2 import re
 3 import os
 4 import base64
 5 from urllib.request import urlretrieve
 6 
 7 
 8 class JianDan:
 9     def __init__(self):
10         self.url_temp = "http://jandan.net/ooxx/page-{}#comments"
11         self.header = {
12             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
13 
14     def get_url_list(self):
15         url_list = [self.url_temp.format(page) for page in range(1, 51)]
16         return url_list
17 
18     def parse_url(self, url):
19         try:
20             response = requests.get(url, headers=self.header)
21             html_str = response.content.decode()
22             img_base_urls = re.findall(
23                 r'<span class="img-hash">(.*?)</span>', html_str)
24             img_urls = list(
25                 map(lambda base_url: "http:" + base64.b64decode(base_url).decode('utf-8'), img_base_urls))
26             return img_urls
27         except Exception as e:
28             print(f"请求目标网站异常:{e}")
29 
30     def make_file(self):
31         dir_name = '煎蛋IMG'
32         get_path = os.getcwd()
33         path_dir = get_path + "/" + dir_name
34         if not os.path.isdir(path_dir):
35             print(f"创建煎{dir_name}文件夹成功")
36             os.mkdir(path_dir)
37         else:
38             print(f"{dir_name}G文件夹已存在创建失败")
39         return path_dir
40 
41     def download(self, img_url, file_path):
42         file_name = "/" + img_url.split('/')[-1]
43         print(f"###### 正在保存 -> {file_name} ")
44         try:
45             urlretrieve(img_url, file_path + file_name)
46             print(f"###### 保存成功 -> {file_name} ")
47         except Exception as e:
48             print(f'下载图片失败:{file_name}')
49 
50     def run(self):
51         url_list = self.get_url_list()
52         file_path = self.make_file()
53         for url in url_list:
54             print("#### 获取第{}页图片 ####".format(url_list.index(url) + 1))
55             img_urls = self.parse_url(url)
56             for img_url in img_urls:
57                 self.download(img_url, file_path)
58 
59         print("end...")
60 
61 
if __name__ == '__main__':
    # Script entry point: build the crawler and start downloading.
    JianDan().run()

执行结果:

初学python与爬虫,要学习的还很多。以后还会尝试用更高效的方式来爬取煎蛋网做测试~

猜你喜欢

转载自www.cnblogs.com/cecej/p/9790011.html