爬天极网燕莹轩多线程.py (multi-threaded scraper for the 燕莹轩 galleries on pic.yesky.com)

import os
import re
import threading

import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the returned HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, '3')   # every gallery folder is created under ./3/

# Grab the URLs of the first five list pages from the pagination bar
url_heard = "http://pic.yesky.com"
url_start = "/c/6_20491_1.shtml"
response = requests.get(url=url_heard + url_start)
soup = BeautifulSoup(response.text, "html.parser")
div_obj = soup.find(name="div", attrs={"class": "flym"})   # the pagination <div>
a_list = div_obj.find_all(name='a')
href_list = []
for a in a_list:
    if a.get('href') not in href_list:    # de-duplicate the pagination links
        href_list.append(a.get('href'))
href_list.insert(0, url_start)            # the first page is not in the bar, add it back
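
# At this point href_list holds the paths of the first five list pages,
# something like (hypothetical, inferred from the url_start pattern):
# ['/c/6_20491_1.shtml', '/c/6_20491_2.shtml', ..., '/c/6_20491_5.shtml']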

def func(url_heard, img_path, u):
    response1 = requests.get(url=url_heard + u)
    soup1 = BeautifulSoup(response1.text, 'html.parser')           # hand the response to bs4
    div_obj1 = soup1.find(name='div', attrs={"class": "lb_box"})   # the <div> holding the gallery list

    list_dd = div_obj1.find_all(name='dd')
    for dd in list_dd:    # one <dd> per gallery
        a_obj = dd.find('a')

        # Build the folder path from the gallery title and create the folder;
        # characters that are illegal in file names are replaced with '_'
        title = re.sub(r'[\\/:*?"<>|]', '_', a_obj.text)
        dir_path = os.path.join(img_path, title)
        os.makedirs(dir_path, exist_ok=True)    # also creates img_path itself on the first run

        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'    # the gallery pages are GBK-encoded (Chinese titles)
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={"class": "overview"})
        print('downloading gallery:', title)

        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get("src")
                # Shortcut found on this site: the thumbnail URL contains '113x113';
                # replacing it with '740x-' yields the full-size image
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception as e:
            print(title, e)    # don't let one broken gallery kill the whole thread
# Launch one thread per list page
t = []
n = 1
for u in href_list:
    t.append(threading.Thread(target=func, name="thread-" + str(n), args=(url_heard, img_path, u)))
    n += 1

for i in t:
    i.start()

for i in t:
    i.join()    # wait for every page thread to finish before exiting
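
One thread per list page is fine for five pages, but with many more pages an unbounded number of threads would hammer the site. A minimal sketch of the same loop on top of a thread pool (not from the original post; the pool size of 5 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as pool:   # at most 5 pages in flight at once
    for u in href_list:
        pool.submit(func, url_heard, img_path, u)
# leaving the with-block waits for every submitted page to finish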

The result: the script creates one folder per gallery under 3/ and saves the full-size images into it.
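
With hypothetical gallery titles and file names, the resulting tree looks like:

3/
    gallery_title_1/
        01.jpg
        02.jpg
    gallery_title_2/
        ...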

Reposted from www.cnblogs.com/zhang-da/p/12210008.html