1. Crawling images from the Webmaster Material site (sc.chinaz.com)
# Crawl all "classical beauty" pictures from
# 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html' using a thread pool.
import os
import time
import random
import uuid
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# Build the URL of every listing page; page 1 has no numeric suffix,
# pages 2-6 follow the '..._<i>.html' pattern.
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
    # 'Content-Encoding':'gzip',
    # 'Content-Type': 'text/html',
}

pig_url_list = []  # image URLs collected from all listing pages


def get_pig_url(page_url):
    """Parse one listing page and append its image URLs to pig_url_list."""
    response = requests.get(url=page_url, headers=headers)
    # XPath parsing; the site lazy-loads images, so the real image URL
    # lives in the 'src2' attribute rather than 'src'.
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        pig_url_list.append(div.xpath('.//img/@src2')[0])


def download(img_url):
    """Download one image and return its raw bytes."""
    return requests.get(url=img_url, headers=headers).content


def save_pig(data):
    """Save one image's bytes under a collision-free random file name."""
    # uuid4 gives each thread a unique name; the original
    # random.randrange(0, 1000000) could silently overwrite files.
    name = uuid.uuid4().hex + '.jpg'
    path = 'zhanzhangpig/' + name
    with open(path, 'wb') as f:
        f.write(data)


if not os.path.exists('zhanzhangpig'):
    os.makedirs('zhanzhangpig')  # original had a typo: os.makdirs

# Thread pool: fan out page parsing, downloads, and saves.
print('multi-threaded crawl started')
start_time = time.time()
pool = Pool(8)
pool.map(get_pig_url, page_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)
# Close the thread pool before reading the clock.
pool.close()
pool.join()
end_time = time.time()
print('multi-threaded crawl finished')
print('elapsed:', end_time - start_time)
2. Crawling images from the Meizitu site (https://www.mzitu.com/tag/ugirls/)
# Crawl pictures from https://www.mzitu.com/tag/ugirls/ using a thread pool.
import os
import time
import random
import uuid
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# A session keeps cookies across requests, which this site expects.
session = requests.session()

if not os.path.exists('meizitu'):
    os.makedirs('meizitu')

# Build the URL of every listing page; page 1 has no '/page/<i>/' suffix.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    # Anti-scraping measure: the site rejects requests without a Referer
    # pointing back at its own pages.
    'Referer': 'https://www.mzitu.com/tag/ugirls/'
}

pig_url_list = []  # image URLs collected from all listing pages


def get_pig_url(page_url):
    """Parse one listing page and append its image URLs to pig_url_list."""
    response = session.get(url=page_url, headers=headers)
    # XPath parsing; thumbnails are lazy-loaded, so the real image URL
    # lives in the 'data-original' attribute rather than 'src'.
    tree = etree.HTML(response.content.decode())
    li_list = tree.xpath('//ul[@id="pins"]/li')
    for li in li_list:
        pig_url_list.append(li.xpath('.//img/@data-original')[0])


def download(img_url):
    """Download one image and return its raw bytes."""
    return session.get(url=img_url, headers=headers).content


def save_pig(data):
    """Save one image's bytes under a collision-free random file name."""
    # uuid4 gives each thread a unique name; the original
    # random.randrange(0, 1000000) could silently overwrite files.
    name = uuid.uuid4().hex + '.jpg'
    path = 'meizitu/' + name
    with open(path, 'wb') as f:
        f.write(data)


print('multi-threaded crawl started')
start_time = time.time()
# Thread pool: fan out page parsing, downloads, and saves.
pool = Pool(10)
pool.map(get_pig_url, page_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)
pool.close()
pool.join()
end_time = time.time()
print('multi-threaded crawl finished')
print('elapsed:', end_time - start_time)
# -------------------- count the files in the output folder --------------------
print(len(os.listdir('./meizitu')))
And just like that, you get 384 pictures!
1. Crawling images from the Webmaster Material site (sc.chinaz.com)
# Crawl all classical-beauty pictures from
# 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html' using a thread pool.
import os
import time
import random
import uuid
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# Build the URL of every listing page; page 1 has no numeric suffix.
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
    # 'Content-Encoding':'gzip',
    # 'Content-Type': 'text/html',
}

pig_url_list = []  # image URLs collected from all listing pages


def get_pig_url(page_url):
    """Parse one listing page and append its image URLs to pig_url_list."""
    response = requests.get(url=page_url, headers=headers)
    # XPath parsing; the site lazy-loads images, so the real image URL
    # lives in the 'src2' attribute rather than 'src'.
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # renamed from 'url' to avoid shadowing the module-level 'url'
        img_url = div.xpath('.//img/@src2')[0]
        pig_url_list.append(img_url)


def download(img_url):
    '''下载图片数据'''
    return requests.get(url=img_url, headers=headers).content


def save_pig(data):
    '''保存图片'''
    # uuid4 gives each thread a unique name; the original
    # random.randrange(0, 1000000) could collide across threads and
    # silently overwrite images (the old comment itself flagged this).
    name = uuid.uuid4().hex + '.jpg'
    path = 'zhanzhangpig/' + name
    with open(path, 'wb') as f:
        f.write(data)


if not os.path.exists('zhanzhangpig'):
    os.makedirs('zhanzhangpig')

# Use a thread pool to parallelize parsing, downloading and saving.
print('多线程爬取开始')
start_time = time.time()
pool = Pool(8)
pool.map(get_pig_url, page_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)
# Close the thread pool.
pool.close()
pool.join()
end_time = time.time()
print('多线程爬取结束')
print('耗时:', end_time - start_time)
2. Crawling images from the Meizitu site (https://www.mzitu.com/tag/ugirls/)
# Crawl pictures from https://www.mzitu.com/tag/ugirls/ using a thread pool.
import os
import time
import random
import uuid
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# A session keeps cookies across requests, which this site expects.
session = requests.session()

if not os.path.exists('meizitu'):
    os.makedirs('meizitu')

# Build the URL of every listing page; page 1 has no '/page/<i>/' suffix.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://www.mzitu.com/tag/ugirls/'  # 反爬机制:需携带网页请求的原地址
}

pig_url_list = []  # image URLs collected from all listing pages


def get_pig_url(page_url):
    """Parse one listing page and append its image URLs to pig_url_list."""
    response = session.get(url=page_url, headers=headers)
    # XPath parsing; thumbnails are lazy-loaded, so the real image URL
    # lives in the 'data-original' attribute rather than 'src'.
    tree = etree.HTML(response.content.decode())
    li_list = tree.xpath('//ul[@id="pins"]/li')
    for li in li_list:
        pig_url_list.append(li.xpath('.//img/@data-original')[0])


def download(img_url):
    '''下载图片数据'''
    return session.get(url=img_url, headers=headers).content


def save_pig(data):
    '''保存图片'''
    # uuid4 gives each thread a unique name; the original
    # random.randrange(0, 1000000) could collide across threads and
    # silently overwrite images (the old comment itself flagged this).
    name = uuid.uuid4().hex + '.jpg'
    path = 'meizitu/' + name
    with open(path, 'wb') as f:
        f.write(data)


print('多线程爬取开始')
start_time = time.time()
# Open the thread pool.
pool = Pool(10)
# pig_url_list = get_pig_url(url=url)  # single-page crawl
# Multi-page crawl. NOTE: the original invoked pool.map(get_pig_url, ...)
# TWICE, duplicating every image URL and doubling the downloads — run it once.
pool.map(get_pig_url, page_url_list)
# Fixed: the original's garbled tail called an undefined 'downloads'.
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)
pool.close()
pool.join()
end_time = time.time()
print('多线程爬取结束')
print('耗时:', end_time - start_time)
# -------------------- count the files in the output folder --------------------
print(len(os.listdir('./meizitu')))
And just like that, you get 384 pictures!