# Code attached directly:
# -*- coding:utf-8 -*-
# embarrassing thing encyclopedia crawler
import os
import urllib.request
class Crawler:
    """Scrape image URLs from qiushibaike.com list pages and download the .jpg files."""

    def open_url(self, url):
        """Fetch *url* and return the raw response body as bytes.

        Sends a browser User-Agent header because the site rejects the
        default urllib user agent.
        """
        req = urllib.request.Request(url)
        # Disguise the request as a regular browser.
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                       ' Chrome/49.0.2623.112 Safari/537.36')
        response = urllib.request.urlopen(req)
        return response.read()

    def find_img(self, url):
        """Return a list of .jpg image URLs scraped from the page at *url*.

        Uses naive string scanning for 'img src=' markers; assumes the
        attribute is written as img src="..." with a double quote
        (the +9 offset skips the 9 characters of 'img src="').
        """
        # Fetch the page and decode it to text.
        html = self.open_url(url).decode('utf-8')
        img_url_arr = []
        # Index of the first 'img src=' marker, or -1 when none remain.
        img_start = html.find('img src=')
        while img_start != -1:
            # Look for the closing '.jpg' within 200 chars of the marker.
            img_end = html.find('.jpg', img_start, img_start + 200)
            if img_end != -1:
                # Slice out the URL: skip 'img src="' (9 chars), keep '.jpg' (4 chars).
                img_url_arr.append(html[img_start + 9:img_end + 4])
            # Get pictures that end with '.JPEG'
            # img_end_jpeg = html.find('.JPEG', img_start, img_start + 200)
            # if img_end_jpeg != -1:
            #     img_url = html[img_start+9:img_end_jpeg+5]
            #     img_url_arr.append(img_url)
            # Advance past this marker and keep looking.
            img_start = html.find('img src=', img_start + 9)
        return img_url_arr

    def save_img(self, img_url_arr):
        """Download every URL in *img_url_arr* into the current working directory."""
        for img_url in img_url_arr:
            print("Download image address: %s" % img_url)
            # The last path segment becomes the local file name.
            filename = img_url.split('/')[-1]
            with open(filename, 'wb') as f:
                # Scraped URLs appear to be protocol-relative (start with
                # '//'), so prepend the scheme -- TODO confirm for all pages.
                img = self.open_url('http:' + img_url)
                f.write(img)

    def start_download(self, page=10):
        """Crawl list pages 1..page and save their images into a subfolder.

        Side effect: changes the process working directory into the
        download folder.
        """
        # %d is the page-number placeholder, filled in per iteration.
        url = "http://www.qiushibaike.com/imgrank/page/%d/"
        try:
            os.mkdir('Encyclopedia of embarrassing things')
        except FileExistsError:
            # Folder left over from a previous run -- reuse it.  (Any other
            # OS error, e.g. permissions, is allowed to propagate.)
            pass
        # Move the working directory into the download folder.
        os.chdir('Encyclopedia of embarrassing things')
        for p in range(1, page + 1):
            print("Start page %d" % p)
            # Substitute the page number to get the page URL.
            img_url_arr = self.find_img(url % p)
            self.save_img(img_url_arr)
# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__":
    crawler = Crawler()
    crawler.start_download(3)