import urllib.request
import re
import os
import socket
from urllib.error import URLError
from urllib.error import HTTPError


class PaChong():
    """Simple image crawler: walks a gallery site's listing pages and
    downloads every referenced ``.jpg`` into ``c:/image/<group>/<page>``.
    """

    def url_open(self, url, encodestr):
        """Fetch *url* with a browser User-Agent and return the decoded HTML.

        Returns None when the request fails (the error is printed, not raised),
        so callers must check the result before using it.
        """
        try:
            req = urllib.request.Request(url)
            req.add_header('User-Agent',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) '
                           'Gecko/20100101 Firefox/46.0')
            response = urllib.request.urlopen(req)
            html = response.read().decode(encodestr)
            return html
        except socket.timeout:
            # BUG FIX: the original wrote `except socket:` — catching the
            # module object itself, which would raise TypeError if reached.
            print("时间超时")
        except HTTPError as e:
            # HTTPError before URLError: HTTPError is a URLError subclass.
            print('HTTPError code:', e.code)
        except URLError as e:
            print('URLError Reason:', e.reason)

    def work_dir(self, stringname):
        """Create (if needed) and chdir into ``c:/image/<stringname>``."""
        try:
            os.makedirs(r'c:/image/' + stringname)
        except FileExistsError:
            pass
        os.chdir(r'c:/image/' + stringname)

    def page_bufen(self, html):
        """Return partial page addresses: single-quoted hrefs in <li><a> tags."""
        p = r"<li><a href='(.*?(?:\.html){0,1})'"
        page_bufen_list = re.findall(p, html)
        return page_bufen_list

    def save_photo(self, html):
        """Download every .jpg referenced in *html*.

        Each image is saved under a directory named from the last two path
        components of its URL; individual download failures are printed
        and skipped.
        """
        print("Save picture")
        p = r'<img.*?src="(.*?\.jpg)"'
        photo_list = re.findall(p, html)
        print(photo_list)
        for each in photo_list:
            # Directory name: the two path components just above the filename.
            strdir = each.split("/")[-3] + "/" + each.split("/")[-2]
            self.work_dir(strdir)
            filename = each.split("/")[-1]
            try:
                urllib.request.urlretrieve(each, filename)
            except HTTPError as e:
                print('HTTPError urlretrieve阶段Error code:', e.code)
            except URLError as e:
                print('URLError urlretrieve阶段Reason:', e.reason)

    def section_del(self, del_str, fenge):
        """Drop the last *fenge*-separated section of *del_str*.

        e.g. section_del('a/b/c.html', '/') -> 'a/b/' (trailing separator kept).
        """
        section_del_list = del_str.split(fenge)
        section_del_list.pop()
        del_str = fenge.join(section_del_list) + fenge
        return del_str

    def photo_groud_page(self, url, encodestr):
        """Download all pictures from every page of one picture group."""
        html = self.url_open(url, encodestr)
        if html is None:
            # BUG FIX: url_open returns None on failure; the original passed
            # it straight into re.findall and crashed with TypeError.
            return
        page_list = self.page_bufen(html)
        for each in page_list:
            if each == page_list[0]:
                # First entry is the page itself; re-fetch via the full URL.
                each = url
                html_each = self.url_open(each, encodestr)
            else:
                # Other entries are relative; prefix with the URL's directory.
                strhead = self.section_del(url, "/")
                url_each = strhead + each
                html_each = self.url_open(url_each, encodestr)
            if html_each is not None:
                self.save_photo(html_each)

    def page_full(self, html):
        """Return page addresses: double-quoted hrefs in <li><a> tags."""
        p = r'<li><a href="(.*?(?:\.html){0,1})"'
        page_first_list = re.findall(p, html)
        return page_first_list

    def firt_page_yeshu(self, html):
        """Derive the partial URLs of the remaining listing pages.

        The last single-quoted href is assumed to look like
        ``<prefix>_<n>_<total>.html``; pages 2 .. total-1 are generated
        from it (page 1 is the page already fetched).
        """
        p = r"<li><a href='(.*?(?:\.html){0,1})'"
        firt_page_yeshu_list = re.findall(p, html)
        print(firt_page_yeshu_list)
        part1 = firt_page_yeshu_list[-1]
        part2 = part1.split(".")[0]
        part3 = part2.split("_")[-1]
        page_yeshu_list = []
        for each in range(int(part3)):
            if each == 0:
                continue
            if each == 1:
                continue
            else:
                tempstr = (part2.split("_")[0] + "_" + part2.split("_")[1]
                           + "_" + str(each) + ".html")
                page_yeshu_list.append(tempstr)
        return page_yeshu_list

    def workstart(self, url, encodestr):
        """Entry point: crawl the first listing page, then every other page."""
        html = self.url_open(url, encodestr)
        if html is None:
            # BUG FIX: guard against a failed first fetch (url_open -> None).
            return
        first_list = self.page_full(html)
        for each in first_list:
            # Grab pictures linked from the first listing page.
            self.photo_groud_page(each, encodestr)
        page_yeshu_list = self.firt_page_yeshu(html)
        # Grab pictures from the other listing pages.
        for each in page_yeshu_list:
            each_a = url + each
            print(each_a)
            html_a = self.url_open(each_a, encodestr)
            if html_a is None:
                continue
            other_list = self.page_full(html_a)
            print(other_list)
            for each_b in other_list:
                self.photo_groud_page(each_b, encodestr)


if __name__ == '__main__':
    pc = PaChong()
    pc.workstart("http://www.169tp.com/guoneimeinv/", "gbk")