# 进程池爬取汽车之家.py — crawl Autohome (autohome.com.cn) article-list pages with a process pool.

import time
import requests
#线程池、进程池
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
#多线程:
from threading import Thread
#多进程:
from multiprocessing import Process
#进程池:
from multiprocessing import Pool
from bs4 import BeautifulSoup
#导入cpu_count查看CPU信息获取本机CPU核数:
from multiprocessing import cpu_count

def task(url, timeout=10):
    """Fetch one Autohome article-list page and print its first image URL.

    Args:
        url: Value substituted into the listing URL template. Despite the
            name, callers pass a page number, not a full URL.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a pool worker indefinitely.

    Prints the final response URL, then ``"https:"`` followed by the ``src``
    of the first ``<img>`` inside the ``auto-channel-lazyload-article`` div.
    Nothing further is printed when the div, the image, or its ``src``
    attribute is missing.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(
        "https://www.autohome.com.cn/all/{}/#liststart".format(url),
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Force GBK decoding of the body (presumably the site serves GBK and
    # requests' guessed encoding is wrong; verify against the live site).
    response.encoding = "gbk"
    soup = BeautifulSoup(response.text, "html.parser")
    print(response.url)

    container = soup.find(
        name="div", attrs={"id": "auto-channel-lazyload-article"}
    )
    if container is None:
        # Page layout differs (or the page is empty): nothing to report.
        return

    # Only the first image is of interest.
    first_img = container.find(name="img")
    src = first_img.get("src") if first_img is not None else None
    if src:
        # The original code assumes a protocol-relative src ("//host/path").
        print("https:" + src)

if __name__ == '__main__':
    # Rule of thumb from the original author: size a process pool at the CPU
    # core count and a thread pool at 2-5x the core count.
    # The __main__ guard is required because worker processes may re-import
    # this module (spawn start method); without it they would re-run the pool.
    stat = time.time()
    # One worker process per CPU core. Leaving the with-block waits for every
    # submitted page to finish and shuts the pool down, even if submission
    # itself raises.
    with ProcessPoolExecutor(max_workers=cpu_count()) as pool:
        futures = {page: pool.submit(task, page) for page in range(1, 110)}
    elapsed = time.time() - stat
    # Exceptions raised inside a worker are stored on its future; surface them
    # here instead of letting failed pages disappear silently.
    for page, future in futures.items():
        error = future.exception()
        if error is not None:
            print("page %s failed: %r" % (page, error))
    print("耗时:%s" % elapsed)

# 猜你喜欢 ("You may also like" — leftover text from the source web page)

# 转载自 (reposted from): www.cnblogs.com/zhang-da/p/12215525.html