爬天极网燕莹轩多线程.py (multi-threaded scraper for the 燕莹轩 galleries on pic.yesky.com)

import os
import re
import threading

import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the returned HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, '3')   # every gallery folder is created under ./3/

# Grab the URLs of the first five list pages from the pagination bar
url_heard = "http://pic.yesky.com"
url_start = "/c/6_20491_1.shtml"
response = requests.get(url=url_heard + url_start)
soup = BeautifulSoup(response.text, "html.parser")
div_obj = soup.find(name="div", attrs={"class": "flym"})   # the pagination <div>
a_list = div_obj.find_all(name='a')
href_list = []
for a in a_list:
    if a.get('href') not in href_list:    # de-duplicate the pagination links
        href_list.append(a.get('href'))
href_list.insert(0, url_start)            # the first page is not in the bar, add it back
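
# At this point href_list holds the paths of the first five list pages,
# something like (hypothetical, inferred from the url_start pattern):
# ['/c/6_20491_1.shtml', '/c/6_20491_2.shtml', ..., '/c/6_20491_5.shtml']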

def func(url_heard, img_path, u):
    response1 = requests.get(url=url_heard + u)
    soup1 = BeautifulSoup(response1.text, 'html.parser')           # hand the response to bs4
    div_obj1 = soup1.find(name='div', attrs={"class": "lb_box"})   # the <div> holding the gallery list

    list_dd = div_obj1.find_all(name='dd')
    for dd in list_dd:    # one <dd> per gallery
        a_obj = dd.find('a')

        # Build the folder path from the gallery title and create the folder;
        # characters that are illegal in file names are replaced with '_'
        title = re.sub(r'[\\/:*?"<>|]', '_', a_obj.text)
        dir_path = os.path.join(img_path, title)
        os.makedirs(dir_path, exist_ok=True)    # also creates img_path itself on the first run

        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'    # the gallery pages are GBK-encoded (Chinese titles)
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={"class": "overview"})
        print('downloading gallery:', title)

        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get("src")
                # Shortcut found on this site: the thumbnail URL contains '113x113';
                # replacing it with '740x-' yields the full-size image
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception as e:
            print(title, e)    # don't let one broken gallery kill the whole thread
# Launch one thread per list page
t = []
n = 1
for u in href_list:
    t.append(threading.Thread(target=func, name="thread-" + str(n), args=(url_heard, img_path, u)))
    n += 1

for i in t:
    i.start()

for i in t:
    i.join()    # wait for every page thread to finish before exiting
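
One thread per list page is fine for five pages, but with many more pages an unbounded number of threads would hammer the site. A minimal sketch of the same loop on top of a thread pool (not from the original post; the pool size of 5 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as pool:   # at most 5 pages in flight at once
    for u in href_list:
        pool.submit(func, url_heard, img_path, u)
# leaving the with-block waits for every submitted page to finish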

The result: the script creates one folder per gallery under 3/ and saves the full-size images into it.
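
With hypothetical gallery titles and file names, the resulting tree looks like:

3/
    gallery_title_1/
        01.jpg
        02.jpg
    gallery_title_2/
        ...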

Reposted from www.cnblogs.com/zhang-da/p/12210008.html