Web crawler: downloading pictures

import urllib.request
import re
import os
import socket
from urllib.error import URLError
from urllib.error import HTTPError


class PaChong:
    def url_open(self, url, encodestr):  # Open a URL and return the decoded HTML
        try:
            req = urllib.request.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0')
            response = urllib.request.urlopen(req, timeout=30)  # timeout so socket.timeout can fire
            html = response.read().decode(encodestr)
            return html
        except socket.timeout:
            print("Request timed out")
        except HTTPError as e:
            print('HTTPError code:', e.code)
        except URLError as e:
            print('URLError Reason:', e.reason)

    def work_dir(self, stringname):  # Create and switch to the working directory
        try:
            os.makedirs(r'c:/image/' + stringname)
        except FileExistsError:
            pass
        os.chdir(r'c:/image/' + stringname)

    def page_bufen(self, html):  # Get the sub-page addresses of one picture group (single-quoted hrefs)
        p = r"<li><a href='(.*?(?:\.html){0,1})'"
        page_bufen_list = re.findall(p, html)
        return page_bufen_list
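    # Example (hypothetical markup): for "<li><a href='9887_2.html'>" the pattern
    # above yields ['9887_2.html'].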

    def save_photo(self, html):  # Save the pictures found on one page
        print("Saving pictures")
        p = r'<img.*?src="(.*?\.jpg)"'
        photo_list = re.findall(p, html)
        print(photo_list)
        for each in photo_list:
            strdir = each.split("/")[-3] + "/" + each.split("/")[-2]
            self.work_dir(strdir)
            filename = each.split("/")[-1]
            try:
                urllib.request.urlretrieve(each, filename)
            except HTTPError as e:
                print('HTTPError during urlretrieve, code:', e.code)
            except URLError as e:
                print('URLError during urlretrieve, reason:', e.reason)
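    # For an image URL such as "http://host/album/set/1.jpg" (hypothetical), strdir
    # above is "album/set", so pictures land in c:/image/album/set/.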

    def section_del(self, del_str, fenge):  # Drop the last section of a URL at the given separator
        section_del_list = del_str.split(fenge)
        section_del_list.pop()
        del_str = fenge.join(section_del_list) + fenge
        return del_str
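    # Example (hypothetical URL): section_del("http://example.com/dir/page.html", "/")
    # returns "http://example.com/dir/".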

    def photo_groud_page(self, url, encodestr):  # Download all pictures from every page of one picture group
        html = self.url_open(url, encodestr)
        page_list = self.page_bufen(html)
        for each in page_list:
            if each == page_list[0]:  # the first link is the current page itself
                each = url
                html_each = self.url_open(each, encodestr)
            else:
                strhead = self.section_del(url, "/")
                url_each = strhead + each
                html_each = self.url_open(url_each, encodestr)
            self.save_photo(html_each)

    def page_full(self, html):  # Get the page addresses (double-quoted hrefs)
        p = r'<li><a href="(.*?(?:\.html){0,1})"'
        page_first_list = re.findall(p, html)
        return page_first_list

    def firt_page_yeshu(self, html):  # Get the total page count and build the other page URLs
        p = r"<li><a href='(.*?(?:\.html){0,1})'"
        firt_page_yeshu_list = re.findall(p, html)
        print(firt_page_yeshu_list)
        part1 = firt_page_yeshu_list[-1]   # last pagination link, e.g. "list_6_25.html"
        part2 = part1.split(".")[0]        # e.g. "list_6_25"
        part3 = part2.split("_")[-1]       # total page count, e.g. "25"
        page_yeshu_list = []
        for each in range(2, int(part3) + 1):  # page 1 is the index page itself
            tempstr = part2.split("_")[0] + "_" + part2.split("_")[1] + "_" + str(each) + ".html"
            page_yeshu_list.append(tempstr)
        return page_yeshu_list
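    # Example (hypothetical links): if the last pagination link found is
    # "list_6_4.html", this returns ['list_6_2.html', 'list_6_3.html', 'list_6_4.html'].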

    def workstart(self, url, encodestr):  # Entry point: crawl the index page, then every other page
        html = self.url_open(url, encodestr)

        first_list = self.page_full(html)
        for each in first_list:  # grab pictures from the first page
            self.photo_groud_page(each, encodestr)

        page_yeshu_list = self.firt_page_yeshu(html)  # grab pictures from the other pages
        for each in page_yeshu_list:
            each_a = url + each
            print(each_a)
            html_a = self.url_open(each_a, encodestr)
            other_list = self.page_full(html_a)
            print(other_list)
            for each_b in other_list:
                self.photo_groud_page(each_b, encodestr)


if __name__ == '__main__':
    pc = PaChong()
    pc.workstart("http://www.169tp.com/guoneimeinv/", "gbk")
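
The two regular expressions do all of the link extraction, so it helps to check them in isolation. A minimal sanity check, run against a made-up HTML fragment (the markup and URLs below are illustrative, not taken from the target site):

import re

sample = ('<li><a href="list_6_2.html">2</a></li>'
          "<li><a href='9887.html'>next</a></li>"
          '<img class="pic" src="http://example.com/album/set/1.jpg">')

print(re.findall(r'<li><a href="(.*?(?:\.html){0,1})"', sample))  # ['list_6_2.html']
print(re.findall(r"<li><a href='(.*?(?:\.html){0,1})'", sample))  # ['9887.html']
print(re.findall(r'<img.*?src="(.*?\.jpg)"', sample))             # ['http://example.com/album/set/1.jpg']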

 
