Libraries used:
# Libraries used: requests for HTTP, BeautifulSoup for HTML parsing,
# os for filesystem checks, time for throttling, Pool for multiprocessing.
# (The original line fused all imports onto one line, which is a syntax
# error in Python; one import statement per line, grouped per PEP 8.)
import os
import time
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
Custom image storage path;
# Destination folder for downloaded images; the image filename is appended
# directly, so the trailing backslashes matter.
# NOTE(review): the original line was destroyed by HTML-entity mangling
# ("= R & lt path ..."); reconstructed from the complete-code listing with
# the folder name translated -- confirm the exact path before running.
path = r'E:\crawler\0805\\'
Request headers that make the request look like it comes from a real browser; to find your own headers, press F12 in the browser to open developer tools and inspect any network request.
# Pretend to be a desktop Chrome browser so the site serves the normal
# HTML pages instead of blocking the scraper.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
Main function;
def get_images(url):
    """Download every thumbnail image found on one listing page.

    url: a qiushibaike listing-page URL.  Thumbnails are located with the
    CSS selector ``div.thumb > a > img``; images already present under
    ``path`` are skipped, new ones are fetched and written to disk.
    Per-image exceptions are caught and printed so one bad URL does not
    abort the rest of the page.
    """
    scheme = 'https:'  # the page uses protocol-relative //... src URLs
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    url_infos = soup.select('div.thumb > a > img')
    for url_info in url_infos:
        try:
            src = url_info.get('src')
            if src is None:
                # <img> without a src attribute: nothing to download.
                continue
            img_url = scheme + src
            # Use the last URL segment as the local filename.
            filename = path + img_url.split('/')[-1]
            if os.path.exists(filename):
                print('images downloaded')
            else:
                image = requests.get(img_url, headers=headers)
                with open(filename, 'wb') as fp:
                    fp.write(image.content)
                print('downloading: ' + img_url)
                time.sleep(0.5)  # throttle so we do not hammer the server
        except Exception as e:
            # Best-effort: report the error and continue with the next image.
            print(e)
Start the crawler:
if __name__ == '__main__':
    # URL list: listing pages 1..13 of the image-ranking section.
    urls = ['https://www.qiushibaike.com/imgrank/page/{}/'.format(i)
            for i in range(1, 14)]
    # Crawl the pages in parallel, one worker process per CPU core.
    pool = Pool()
    pool.map(get_images, urls)
    print('grabbing completed')
Crawling in progress:
Open folder to view crawling results;
done
Complete Code;
import os
import time
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

"""
************常用爬虫库***********
requests
BeautifulSoup
pyquery
lxml
************爬虫框架***********
scrapy
三大解析方式:re,css,xpath
"""

# Mimic a desktop Chrome browser so the site serves the normal pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
# Destination folder for downloaded images; the filename is appended
# directly, so the trailing backslashes matter.
path = r'E:\爬虫\0805\\'


def get_images(url):
    """Download every thumbnail image found on one listing page.

    url: a qiushibaike listing-page URL.  Thumbnails are located with the
    CSS selector ``div.thumb > a > img``; images already present under
    ``path`` are skipped, new ones are fetched and written to disk.
    Per-image exceptions are caught and printed so one bad URL does not
    abort the rest of the page.
    """
    data = 'https:'  # the page uses protocol-relative //... src URLs
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    url_infos = soup.select('div.thumb > a > img')
    for url_info in url_infos:
        try:
            src = url_info.get('src')
            if src is None:
                # <img> without a src attribute: nothing to download.
                continue
            urls = data + src
            # Use the last URL segment as the local filename.
            if os.path.exists(path + urls.split('/')[-1]):
                print('图片已下载')
            else:
                image = requests.get(urls, headers=headers)
                with open(path + urls.split('/')[-1], 'wb') as fp:
                    fp.write(image.content)
                print('正在下载:' + urls)
                time.sleep(0.5)  # throttle so we do not hammer the server
        except Exception as e:
            # Best-effort: report the error and continue with the next image.
            print(e)


if __name__ == '__main__':
    # 路由列表 -- listing pages 1..13 of the image-ranking section.
    urls = ['https://www.qiushibaike.com/imgrank/page/{}/'.format(i)
            for i in range(1, 14)]
    # 开启多进程爬取 -- crawl the pages in parallel across CPU cores.
    pool = Pool()
    pool.map(get_images, urls)
    print('抓取完毕')