Python 3.6 Web Crawler: An Introduction

The full code is attached below:

# -*- coding:utf-8 -*-
# Qiushibaike (Embarrassing Things Encyclopedia) image crawler
import urllib.request
import os


class Crawler:

    # Get HTML information
    def open_url(self, url):
        req = urllib.request.Request(url)
        # Spoof a browser User-Agent so the request is not blocked
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                                     ' Chrome/49.0.2623.112 Safari/537.36')
        response = urllib.request.urlopen(req)
        return response.read()

    # Find image URLs in the page
    def find_img(self, url):
        # Fetch the page and decode the bytes to a string
        html = self.open_url(url).decode('utf-8')
        # Get the index of the first 'img src=' in html
        img_start = html.find('img src=')
        img_url_arr = []
        while img_start != -1:
            # Search from img_start to img_start + 200 to get the index of the first '.jpg'
            img_end = html.find('.jpg', img_start, img_start + 200)
            if img_end != -1:
                # Slice between img_start and img_end to get the image URL:
                # +9 skips 'img src="' (8 characters plus the opening quote), +4 keeps '.jpg'
                img_url = html[img_start+9:img_end+4]
                # Add it to the result list
                img_url_arr.append(img_url)
            # Optionally also capture pictures ending in '.JPEG':
            # img_end_jpeg = html.find('.JPEG', img_start, img_start + 200)
            # if img_end_jpeg != -1:
            #     img_url = html[img_start+9:img_end_jpeg+5]
            #     img_url_arr.append(img_url)
            # Continue searching after the current match
            img_start = html.find('img src=', img_start + 9)
        return img_url_arr

    # Download and save each image
    def save_img(self, img_url_arr):
        # Loop over the collected image URLs
        for img_url in img_url_arr:
            print("Download image address: %s" % img_url)
            # Use the part after the last '/' as the file name
            filename = img_url.split('/')[-1]
            with open(filename, 'wb') as f:
                # The extracted URLs have no scheme, so prepend 'http:' (some pages may already include it)
                img = self.open_url('http:' + img_url)
                f.write(img)

    def start_download(self, page=10):
        # %d is a placeholder for the page number, filled in with the % operator below
        url = "http://www.qiushibaike.com/imgrank/page/%d/"
        try:
            # Create the download folder (it may already exist)
            os.mkdir('Encyclopedia of embarrassing things')
        except FileExistsError:
            pass
        # Change the working directory to the new folder
        os.chdir('Encyclopedia of embarrassing things')
        for p in range(1, page + 1):
            print("Start page %d" %p)
            # %Replace the page number to get the picture address
            img_url_arr = self.find_img(url % p)
            # Save the images
            self.save_img(img_url_arr)

crawler = Crawler()
crawler.start_download(3)
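
To make the scanning logic in find_img easier to follow, here is a minimal, self-contained sketch of the same find-and-slice loop run against a hard-coded snippet (the snippet and URLs are made up for illustration, not real qiushibaike markup):

# Standalone demo of the find/slice scan used in find_img
html = '<div><img src="//pic.example.com/a.jpg"><img src="//pic.example.com/b.jpg"></div>'

img_start = html.find('img src=')
urls = []
while img_start != -1:
    img_end = html.find('.jpg', img_start, img_start + 200)
    if img_end != -1:
        # +9 skips 'img src="' (8 characters plus the opening quote),
        # +4 keeps the '.jpg' extension in the slice
        urls.append(html[img_start + 9:img_end + 4])
    img_start = html.find('img src=', img_start + 9)

print(urls)  # ['//pic.example.com/a.jpg', '//pic.example.com/b.jpg']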



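For comparison, the same extraction is often written with a regular expression instead of manual find calls. Here is a minimal sketch using the standard re module; the pattern assumes the same img src="..." markup and is not taken from the original article:

import re

# Hypothetical regex equivalent of find_img: capture everything
# between 'img src="' and a '.jpg' extension (inclusive)
html = '<img src="//pic.example.com/a.jpg"> <img src="//pic.example.com/b.jpg">'
print(re.findall(r'img src="([^"]+?\.jpg)"', html))
# ['//pic.example.com/a.jpg', '//pic.example.com/b.jpg']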