Crawler Series --- multi-threaded crawling examples
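
Both examples below rely on multiprocessing.dummy.Pool. Despite the module name, this is a thread pool with the same map() API as the process-based Pool, which suits I/O-bound downloading. A minimal sketch of the pattern (my own illustration, not part of the original examples; the fetch helper and the URL list are placeholders):

import requests
from multiprocessing.dummy import Pool  # thread pool, despite the module name

def fetch(url):
    '''Download the raw bytes of a URL (placeholder helper).'''
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content

urls = ['http://sc.chinaz.com/tupian/gudianmeinvtupian.html']  # pages to fetch
pool = Pool(4)                  # 4 worker threads
pages = pool.map(fetch, urls)   # blocks until every URL has been downloaded
pool.close()
pool.join()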

1. Crawling images from the Zhanzhang Sucai site (sc.chinaz.com)

# Crawl all of the classical-beauty images from 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# Build the URLs of all listing pages
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
    # 'Content-Encoding': 'gzip',
    # 'Content-Type': 'text/html',
}
pig_url_list = []

def get_pig_url(url):
    '''Collect the image URLs from one listing page.'''
    response = requests.get(url=url, headers=headers)
    # Parse the page with XPath
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # The site lazy-loads images, so the real URL sits in the src2 attribute
        url = div.xpath('.//img/@src2')[0]
        pig_url_list.append(url)

def download(url):
    '''Download the image data.'''
    return requests.get(url=url, headers=headers).content

def save_pig(data):
    '''Save one image.'''
    # name = url.split('/')[-1]
    name = str(random.randrange(0, 1000000)) + '.jpg'  # random names from threads may collide; needs improvement
    path = 'zhanzhangpig/' + name
    with open(path, 'wb') as f:
        f.write(data)

if not os.path.exists('zhanzhangpig'):
    os.makedirs('zhanzhangpig')

# Use a thread pool
print('Multi-threaded crawl started')
start_time = time.time()
pool = Pool(8)
pool.map(get_pig_url, page_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)
end_time = time.time()
print('Multi-threaded crawl finished')
print('Elapsed:', end_time - start_time)

# Close the thread pool
pool.close()
pool.join()
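
The comment in save_pig notes that the random file names need improvement: two worker threads can draw the same random number and silently overwrite each other's images. One possible fix (a sketch of my own, building on the commented-out url.split('/') idea above) is to pass the image URL through the pool together with the data and reuse its basename:

import os
import requests
# Assumes the headers dict, pig_url_list and pool from the example above.

def download(url):
    '''Download one image and keep its URL so it can be used for naming.'''
    return url, requests.get(url=url, headers=headers).content

def save_pig(item):
    '''Save an image under the basename of its URL, e.g. '.../abc.jpg' -> 'abc.jpg'.'''
    url, data = item
    name = url.split('/')[-1]
    with open(os.path.join('zhanzhangpig', name), 'wb') as f:
        f.write(data)

# data_list = pool.map(download, pig_url_list)  # list of (url, bytes) tuples
# pool.map(save_pig, data_list)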

 

2. Crawling images from the Meizitu site (https://www.mzitu.com/tag/ugirls/)

import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
session=requests.session()
if not os.path.exists('meizitu'):
    os.makedirs('meizitu')

url='https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://www.mzitu.com/tag/ugirls/'  # anti-crawl measure: requests must carry the referring page address
}
pig_url_list = []

def get_pig_url(url):
    '''Collect the image URLs from one listing page.'''
    response = session.get(url=url, headers=headers)
    # print(response.text)
    # Parse the page with XPath
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//ul[@id="pins"]/li')
    for div in div_list:
        # The image URLs are lazy-loaded via the data-original attribute
        url = div.xpath('.//img/@data-original')[0]
        pig_url_list.append(url)

def download(url):
    '''Download the image data.'''
    # print(url)
    return session.get(url=url, headers=headers).content

def save_pig(data):
    '''Save one image.'''
    name = str(random.randrange(0, 1000000)) + '.jpg'  # random names from threads may collide; needs improvement
    path = 'meizitu/' + name
    with open(path, 'wb') as f:
        f.write(data)

print('Multi-threaded crawl started')
start_time = time.time()
# Start the thread pool
pool = Pool(10)
# pig_url_list = get_pig_url(url=url)  # single-page crawl
# Multi-page crawl
pool.map(get_pig_url, page_url_list)
# print(pig_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)

pool.close()
pool.join()
# Close the thread pool
end_time = time.time()
print('Multi-threaded crawl finished')
print('Elapsed:', end_time - start_time)

# -------------------- count the files in the folder --------------------
print(len(os.listdir('./meizitu')))

And just like that you get 384 beautiful pictures!
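
The Referer header in this second example is what defeats the site's anti-hotlinking check: image requests that do not name the listing page as their origin are typically rejected. A quick way to confirm this (my own sketch; the image URL is a placeholder) is to request one picture with and without the header:

import requests

img_url = 'https://www.mzitu.com/example.jpg'  # placeholder image URL
base_headers = {'User-Agent': 'Mozilla/5.0'}

no_referer = requests.get(img_url, headers=base_headers)
with_referer = requests.get(img_url, headers={**base_headers,
                                              'Referer': 'https://www.mzitu.com/tag/ugirls/'})

# If the Referer check is enforced, the first request usually comes back with an
# error status or a placeholder image, while the second returns the real picture.
print(no_referer.status_code, with_referer.status_code)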

 
