Python crawler 1 ------ crawling pictures with multi-page download

1. Crawling tasks

        Obtain pictures from Bing Wallpapers - Bing homepage daily wallpaper download (peapix.com), crawl multiple pages and download the images into corresponding folders, use multi-threading to speed up the crawl, display the crawl progress with a progress bar, and save each image's path and title to a csv file.

2. Use technology

        While crawling the pictures, BeautifulSoup is used to parse the HTML text, ThreadPoolExecutor provides multi-threaded acceleration, tqdm displays the download progress, and the csv module writes the results to csv files.
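
        A minimal sketch of how these pieces fit together on a single listing page is shown below (the peapix URL and the image-list__link selector are taken from the full source in section 4; everything else, such as the output file name, is illustrative):

import csv
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

headers = {'User-Agent': 'Mozilla/5.0'}
# Fetch one listing page and parse the thumbnail links out of the HTML
html = requests.get("https://peapix.com/bing/cn", headers=headers, params={'page': 1}).text
soup = BeautifulSoup(html, 'lxml')
links = soup.find_all('a', class_="image-list__link")

# Write each link's title and absolute URL to a csv file,
# with tqdm drawing a progress bar over the loop
with open("page1.csv", 'w', encoding='utf-8', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["image title", "image link"])
    for a in tqdm(links, desc="parsing links"):
        writer.writerow([a.attrs.get('title'), "https://peapix.com" + a.attrs['href']])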

3. Required libraries

import csv
import os
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
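
        Of these imports, csv, os, time and concurrent.futures are part of the Python standard library; only requests, beautifulsoup4 and tqdm are third-party packages, plus lxml, which the code below uses as the BeautifulSoup parser. Assuming pip is available, they can be installed with something like:

pip install requests beautifulsoup4 lxml tqdm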

4. Source code

import csv
import os
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


# Get the image URLs and write them into a csv file
def getUrlsAndWriteInCsv(pictureLinks, page):
    # Directory that holds the csv files; create it if it does not exist
    csvDirectory = os.path.join("csvfiles", "bing")
    if not os.path.exists(csvDirectory):
        os.makedirs(csvDirectory)
    # Build the csv file path (one csv per page) and write the information into it
    csvFilePath = os.path.join(csvDirectory, f"{page}.csv")
    print(csvFilePath)
    with open(csvFilePath, 'w', encoding='utf-8', newline="") as f:
        # Build a csv writer on top of the file object
        csv_writer = csv.writer(f)
        # Write the header row
        csv_writer.writerow(["image title", "image URL"])
        for k in tqdm(pictureLinks, desc='downloading images'):
            url1 = "https://peapix.com" + k.attrs['href']
            response1 = requests.get(url1, headers=headers)
            if response1.status_code == 200:
                response1 = response1.text
                soup1 = BeautifulSoup(response1, 'lxml')
                pictureSrc = soup1.find('a', "btn btn-secondary w-100").attrs['href']
                pictureTitle = k.attrs['title']
                # Write one row per picture
                csv_writer.writerow([pictureTitle, pictureSrc])
                directory = os.path.join('images', 'bing', str(page))
                if not os.path.exists(directory):
                    os.makedirs(directory)
                    print(f"Created directory {directory}")
                # Time how long a single picture takes to download
                startTime = time.time()
                downloadPictures(directory, pictureTitle, pictureSrc)
                endTime = time.time()
                # If a picture took more than five seconds to download, note it and move on
                if (endTime - startTime) > 5:
                    print(f"{pictureTitle} took more than five seconds to download\n")
                    continue
                print(pictureTitle, 'downloaded successfully!!!\n')


# Download a single picture
def downloadPictures(directory, pictureTitle, pictureSrc):
    # Build the local file path for the picture and add the .jpg extension
    img_path = os.path.join(directory, pictureTitle + '.jpg')
    # Binary content of the image
    img_data = requests.get(url=pictureSrc, headers=headers).content
    with open(img_path, 'wb') as f:
        f.write(img_data)


def main(url, headers, params, page):
    # Copy params so concurrent threads do not mutate a shared dict
    params = {**params, 'page': page}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        response = response.text
        soup1 = BeautifulSoup(response, 'lxml')
        picturesList = soup1.find_all('a', class_="image-list__link")
        getUrlsAndWriteInCsv(picturesList, page)


# Fetching the Bing pictures: contents returns a tag's child nodes, while string and text return the data directly
# Load the picture list first, then download the pictures
if __name__ == '__main__':
    mainStartTime = time.time()
    url = "https://peapix.com/bing/cn"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58'}
    params = {
        'page': 1
    }
    picturesList = []  # the picture tags fetched from the listing page
    pictureDict = {}  # maps a picture title to its relative link
    # Create a thread pool with 8 worker threads
    with ThreadPoolExecutor(max_workers=8) as t:
        for i in tqdm(range(1, 3), desc="loading all pages"):
            # Pass the callable and its arguments to submit; calling main() here would run it in the main thread
            t.submit(main, url, headers, params, i)
    print("All images downloaded")
    mainEndTime = time.time()
    print(f"爬取{url}页面图片总耗时{mainEndTime - mainStartTime}s")

5. Experiment summary

        BeautifulSoup makes it clearer and simpler to locate the elements we need, extract the information, and write it to a file. Because every picture requires an extra request to its own detail page, the crawl is a bit slow, so a thread pool is used to speed up the download rate. Overall the result is still good and worth learning from; friends are welcome to communicate and discuss, and remember to follow and like!!!
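
        As a follow-up on the bottleneck mentioned above: since every picture needs one extra request to its detail page, those per-picture requests are the natural unit of work for the thread pool. Below is a minimal sketch of that layout (fetch_detail and resolve_all are illustrative helper names, not part of the original script; the detail-page selector is the one used in the source above):

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

headers = {'User-Agent': 'Mozilla/5.0'}

# Resolve one detail page to the full-size image URL (one HTTP request per picture)
def fetch_detail(href):
    html = requests.get("https://peapix.com" + href, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    return soup.find('a', "btn btn-secondary w-100").attrs['href']

# Run the per-picture requests concurrently; as_completed lets tqdm show real progress
def resolve_all(hrefs):
    results = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [pool.submit(fetch_detail, h) for h in hrefs]
        for fut in tqdm(as_completed(futures), total=len(futures), desc="resolving image URLs"):
            results.append(fut.result())
    return results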

Origin: blog.csdn.net/m0_64238843/article/details/131491719