#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Download every ``.jpg`` image referenced by a web page.

The script works in three steps:

1. Open the page and grab its HTML.
2. Match the image URLs (``src="....jpg"``) with a regular expression.
3. Create the output folder and save each image as ``<n>.jpg``.
"""
_author_ = 'BH8ANK'

import urllib.request
import re
import os

# Page to scrape and the substring used to count picture references in it.
wangzhi = "https://www.zhihu.com/question/43551423"
keywords = ".jpg"


def get_html(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The body as ``bytes``. In Python 3 ``urlopen(...).read()`` returns
        bytes, not str, so callers must decode before text processing.

    Raises:
        urllib.error.URLError: If the page cannot be opened.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as page:
        return page.read()


def get_image(html, save_dir="D:/images"):
    """Download every ``.jpg`` referenced by a ``src="..."`` attribute.

    Args:
        html: Page source, either as UTF-8 encoded bytes (what ``get_html``
            returns) or as an already decoded str.
        save_dir: Folder the pictures are written to. It is created when it
            does not exist yet.

    Returns:
        The list of matched image URLs, in page order. The picture at
        position ``n`` (counting from 1) is saved as ``<save_dir>/<n>.jpg``.

    Raises:
        UnicodeDecodeError: If *html* is bytes that are not valid UTF-8.
        urllib.error.URLError: If one of the images cannot be downloaded.
    """
    # The capture group is the URL itself; the non-greedy ``.+?`` makes each
    # match stop at the first ``.jpg"`` that follows ``src="``.
    pattern = re.compile(r'src="(.+?\.jpg)"')

    # A str pattern cannot be applied to a bytes-like object (TypeError), so
    # bytes input is decoded first.
    text = html.decode('utf-8') if isinstance(html, (bytes, bytearray)) else html
    img_list = pattern.findall(text)

    # Create the output folder only when it is missing. exist_ok=True keeps
    # this safe if the folder appears between the check and the call, and
    # makedirs also creates missing parent folders.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
        print(" \n\nCreat Success\n\n ")

    # n both drives the loop and names the saved file: 1.jpg, 2.jpg, ...
    for n, img_url in enumerate(img_list, start=1):
        urllib.request.urlretrieve(img_url, os.path.join(save_dir, '%s.jpg' % n))
    print("一共抓到%d张图" % len(img_list))
    return img_list


if __name__ == "__main__":
    # Fetch the page once, and only when run as a script, so that importing
    # this module does not trigger any network access.
    daima = get_html(wangzhi)

    # Step 1: number of picture references found in the page source.
    content = str(daima)
    a = content.count(keywords)

    print(daima)  # This is the html code of the target page
    get_image(daima)  # Get the matching images from the html and store them
The code itself is relatively simple; the key parts to understand are re.compile and re.findall.
Common pitfalls:
1. Why do the following two lines need to decode?
html1 = html.decode('utf-8') img_list = re.findall(img,html1)
If you do not decode it, re.findall raises an error:
TypeError: cannot use a string pattern on a bytes-like object
The reason is that the regular expression is a str pattern, while the page content is a bytes object.
Calling html.decode('utf-8') converts the page content from bytes to a string.
In Python 3, urlopen(...).read() returns bytes rather than a string, which is different from Python 2.
2. The following line defines the pattern used to filter the page: it matches [src="(.+?\.jpg)"], a form determined by inspecting the HTML of the web page.
t = r'src="(.+?\.jpg)"'
3. Before creating the folder, you need to check whether it already exists at the target path. There are two ways to do this: test with `if not os.path.exists(...)`, or call os.mkdir inside try/except and catch FileExistsError.