网页自动采集图片

 需要先安装依赖:node-fetch 和 selenium-webdriver,并下载与本机 Chrome 版本匹配的 chromedriver

import {
    createWriteStream
} from "node:fs";
import {
    pipeline
} from "node:stream";
import {
    promisify
} from "node:util";
import fetch from "node-fetch";
import {
    Builder
} from "selenium-webdriver";
import * as chrome from "selenium-webdriver/chrome.js";

// Point Selenium at a locally installed chromedriver binary and start Chrome.
const service = new chrome.ServiceBuilder("C:\\WebDri\\bin\\chromedriver.exe");
const chromeBuilder = new Builder();
chromeBuilder.forBrowser("chrome");
chromeBuilder.setChromeService(service);
const driver = chromeBuilder.build();

// Disabled smoke test: open a page, then close the browser after five seconds.
// driver.get("https://www.bilibili.com");
// setTimeout(() => {
//   driver.quit();
// }, 5000);

/**
 * Fetch `url` and stream the response body into the file `filename`.
 *
 * @param {string} url - Address of the resource to fetch.
 * @param {string} filename - Path of the file the body is written to.
 * @returns {Promise<void>} Resolves once the stream pipeline has finished.
 * @throws {Error} `unexpected response <statusText>` when `response.ok` is falsy.
 */
async function download(url, filename) {
    const res = await fetch(url);
    if (!res.ok) {
        throw new Error(`unexpected response ${res.statusText}`);
    }

    // Promise-returning form of stream.pipeline; the target file is only
    // opened after the response has been accepted.
    const pipeAsync = promisify(pipeline);
    const target = createWriteStream(filename);
    await pipeAsync(res.body, target);
}

// Listing pages to crawl: page 1 is index.html, pages 2-10 are index_<n>.html.
let urls = Array.from({ length: 10 }, (_, idx) => {
    const page = idx + 1;
    if (page === 1) {
        return "https://www.woyaogexing.com/touxiang/index.html";
    }
    return `https://www.woyaogexing.com/touxiang/index_${page}.html`;
});

/**
 * Turn a page-supplied title into a file name that is safe to write and not
 * yet used during this run.
 *
 * Characters that are path separators or otherwise invalid in Windows file
 * names are replaced with "_". A numeric suffix is appended when the name was
 * already handed out, compared case-insensitively.
 *
 * @param {unknown} title - Title scraped from the page.
 * @param {Set<string>} usedNames - Lower-cased names already handed out; updated in place.
 * @returns {string} File name without directory or extension.
 */
function toSafeFileName(title, usedNames) {
    const raw = typeof title === "string" ? title : "";
    const cleaned = raw.replace(/[\\/:*?"<>|\x00-\x1f]/g, "_").trim();
    const base = cleaned === "" ? "untitled" : cleaned;

    let candidate = base;
    for (let n = 2; usedNames.has(candidate.toLowerCase()); n += 1) {
        candidate = `${base}_${n}`;
    }
    usedNames.add(candidate.toLowerCase());
    return candidate;
}

/**
 * Visit every listing page in `urls`, collect the image links on it and save
 * each image as `./imgs/<title>.jpg` through `download`.
 *
 * All downloads of a page are awaited before moving on; a failed download is
 * logged and does not stop the crawl. The browser is always closed at the end,
 * including when navigation or scraping throws.
 *
 * NOTE(review): `download` opens the target with `createWriteStream`, so the
 * `./imgs` directory presumably has to exist before running — confirm.
 *
 * @returns {Promise<void>}
 * @throws Rejects when a page cannot be loaded or no image link shows up
 *         within the timeout.
 */
async function main() {
    const itemSelector =
        "#main > div.list-main.mt10.cl > div.list-left.z > div.pMain > div > a.img";
    const pageTimeoutMs = 30000;
    const pagePauseMs = 1000;
    const usedNames = new Set();

    // For a single page, the same wait-then-collect steps can be run once
    // with that page's URL and selector.
    try {
        for (const pageUrl of urls) {
            console.log(`页面: ${pageUrl}`);
            await driver.get(pageUrl);

            // Wait until at least one image link is present; give up after the
            // timeout instead of blocking forever on an empty or changed page.
            await driver.wait(
                () =>
                    driver.executeScript(
                        "return document.querySelectorAll(arguments[0]).length > 0",
                        itemSelector
                    ),
                pageTimeoutMs,
                `no image links found on ${pageUrl}`
            );

            // Collect { src, title } per link. The <img> is looked up explicitly
            // because the first child node may be whitespace text.
            const collected = await driver.executeScript(
                `
                const anchors = document.querySelectorAll(arguments[0]);
                return Array.from(anchors, (anchor) => {
                    const img = anchor.querySelector("img");
                    return {
                        src: img ? img.src : null,
                        title: anchor.title,
                    };
                });
                `,
                itemSelector
            );
            const list = Array.isArray(collected) ? collected : [];
            console.log(list);

            const jobs = [];
            for (const item of list) {
                if (typeof item.src !== "string" || item.src === "") {
                    console.warn(`skipping "${item.title}": no image source`);
                    continue;
                }
                const filename = `./imgs/${toSafeFileName(item.title, usedNames)}.jpg`;
                jobs.push({ src: item.src, filename });
            }

            // Run the page's downloads in parallel but wait for all of them, so
            // failures are reported here rather than as unhandled rejections.
            const outcomes = await Promise.allSettled(
                jobs.map(({ src, filename }) => download(src, filename))
            );
            outcomes.forEach((outcome, index) => {
                if (outcome.status === "rejected") {
                    console.error(`download failed: ${jobs[index].src}`, outcome.reason);
                }
            });

            // Pause between pages to look more like a human visitor.
            await driver.sleep(pagePauseMs);
        }
    } finally {
        // Always shut the browser down so no Chrome/chromedriver process is left behind.
        await driver.quit();
    }
}
// Start the crawl. Report a failure and exit non-zero instead of leaving the
// returned promise unhandled.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

 获取要采集的标签的选择器路径(在浏览器开发者工具中右键目标元素,选择 Copy → Copy selector)

 

检查路径是否正确(可在控制台用 document.querySelectorAll 验证能否选中目标图片)

猜你喜欢

转载自blog.csdn.net/weixin_70563937/article/details/127639285