爬取妹子图

本文转自 https://blog.csdn.net/baidu_35085676/article/details/68958267 

文中的代码我自己跑了一遍,主要的解析方式用的是 BeautifulSoup。但是代码跑起来可能会出现一些问题,例如:TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。个人觉得这应该是网站反爬虫机制导致的,可以尝试更换 IP 地址后再运行。

import os
import time

import requests
from bs4 import BeautifulSoup

  6 all_url = 'http://www.mzitu.com'
  7 #http请求头
  8 Hostreferer = {
  9     'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
 10     'Referer': 'http://www.mzitu.com'
 11                }
 12 Picreferer = {
 13     'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
 14     'Referer': 'http://i.meizitu.net'
 15 }
 16 #此请求头破解盗链
 17 #保存地址
 18 path = 'E:/pythonFile/meititu/mei/'
 19 #记录文件
 20 data = 'E:/pythonFile/meititu/mei/.data'
 21 #读取保存记录
 22 def get_log(file):
 23     page = 1
 24     line = 0
 25     try:
 26         with open(file, 'r') as f:
 27             l = f.readline()
 28             page, line = [int(i) for i in l.split('|')]
 29     except Exception as e:
 30         print(e)
 31         print('读取记录失败,从初始开始')
 32     return page, line
 33 
 34 #保存记录
 35 def put_log(file, page, line):
 36     try:
 37         with open(file, "w") as f:
 38             f.write('{}|{}'.format(page, line))
 39     except Exception as e:
 40         print('保存记录失败:[{}]'.format(e))
 41 
 42 #找寻最大页数
 43 def find_max_page():
 44     start_html = requests.get(all_url, headers=Hostreferer)
 45     soup = BeautifulSoup(start_html.text, "html.parser")
 46     page = soup.find_all('a', class_='page-numbers')
 47     max_page = page[-2].text
 48     max_page = int(max_page)
 49     return max_page
 50 
 51 if __name__ == "__main__":
 52     same_url = 'http://www.mzitu.com/page/'
 53     max_page = find_max_page()
 54     page, line = get_log(data)
 55     print('从{}页,{}行开始缓存'.format(page, line))
 56     for n in range(page, int(max_page)+1):
 57         ul = same_url+str(n)
 58         start_html = requests.get(ul, headers=Hostreferer)
 59         soup = BeautifulSoup(start_html.text, "html.parser")
 60         all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
 61         for lines in range(line, len(all_a)):
 62             a = all_a[lines]
 63             title = a.get_text() #提取文本
 64             if(title != ''):
 65                 print("准备扒取:"+title)
 66                 #win不能创建带?的目录
 67                 if(os.path.exists(path+title.strip().replace('?',''))):
 68                         #print('目录已存在')
 69                         flag = 1
 70                 else:
 71                     os.makedirs(path+title.strip().replace('?',''))
 72                     flag = 0
 73                 os.chdir(path + title.strip().replace('?', ''))
 74                 href = a['href']
 75                 html = requests.get(href, headers=Hostreferer)
 76                 mess = BeautifulSoup(html.text, "html.parser")
 77                 # 最大也在class='pagenavi'div中的第6个span
 78                 pic_max = mess.find("div", class_='pagenavi').find_all('span')
 79                 print(pic_max)
 80                 print(len(pic_max)) #确定最大页数在第几个span标签,网页可能会变动
 81                 pic_max = pic_max[6].text #最大页数
 82                 print(pic_max)
 83                 if(flag == 1 and len(os.listdir(path+title.strip().replace('?',''))) >= int(pic_max)):
 84                     print('已经保存完毕,跳过')
 85                     continue
 86                 for num in range(1, int(pic_max)+1):
 87                     while True:
 88                         pic = href+'/'+str(num)
 89                         html = requests.get(pic, headers=Hostreferer)
 90                         mess = BeautifulSoup(html.text, "html.parser")
 91                         pic_url = mess.find('img', alt=title)
 92                         if(pic_url):
 93                             break
 94                     # print(pic_url['src'])
 95                     html = requests.get(pic_url['src'], headers=Picreferer)
 96                     file_name = pic_url['src'].split(r'/')[-1]
 97                     f = open(file_name, 'wb')
 98                     f.write(html.content)
 99                     f.close()
100                 put_log(data, n, lines)
101                 time.sleep(0.5)
102         print('',n,'页完成')
103         line = 0
104         time.sleep(10)

猜你喜欢

转载自www.cnblogs.com/tianqianlan/p/11332724.html