我的网络爬虫,nodejs图片爬取

版权声明: https://blog.csdn.net/qq_32858649/article/details/81947841
话不多说,上代码,

其实有一个问题,就是我其实要爬差不多5万张图片,但是,循环次数超过20次那么必定会导致图片下载失败,应该说是图片下载下来之后,图片是脆了的,也就是图裂了。请教哪位da

大神指点一下。

const request = require('request');

const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
const async = require('async');
const mysql=require('../mysql/mysql_pool');

const install = require('superagent-charset');
const request = require('superagent');

const requestn = require('request');

superagent = install(request);

    mysql.select('select img,spbh,spmc from sptp where spbh not in (select distinct spbh from yixiasptp) limit 10','',function (err,results,fields) {

        for(let i=0;i<10;i++){
            let imgul=results[i].img;
            let spbh=results[i].spbh;
            //商品名称去除空格以及特殊字符
            let spmc=results[i].spmc.replace(/\s+/g,"").replace(/[\-\_\,\!\|\~\`\(\)\#\$\%\^\&\*\{\}\:\;\"\L\<\>\?]/g,'');// 去掉特殊字符
            spmc=spmc.replace(/\\/g,"");
            spmc=spmc.replace(/\//g,"");
            spmc=String(spmc);

            let mul="D:\\yyw\\"+spmc+"\\";
            //创建目录
            let $ = cheerio.load(imgul);

            fs.exists(mul,function(exists){
                if(exists){
                    console.log('文件夹存在');
                    imgxiaz($,mul,spbh);
                }
                else
                {
                    fs.mkdir(mul,function(err){
                        if(err) {console.error(err)}
                        console.log('目录'+spmc);
                        imgxiaz($,mul,spbh);
                    });
                }
            });

        }

    });


function imgxiaz($,mul,spbh) {
    console.log(spbh);
    $('img').each(function () {
        let imgurl=$(this).attr("src");
        //console.log(imgurl);
        if(imgurl==''||imgurl==undefined){
            console.log('跳过');
        }else{
            setTimeout(function () {
                request(imgurl).pipe(fs.createWriteStream(mul+Date.now()+'.jpg'));
                mysql.modify("INSERT INTO yixiasptp (spbh) VALUES (?)",spbh,function (err,results,fields) {
                });
            },1000);

}
});
}

 最后吧,还是学艺不精,用async模块

 控制了下并发的问题,确实提高了成功率,没测试更多的,现在手动执行吧,200个连接一爬,手动点100次吧。

上修改之后的代码

//const request = require('request');
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
const async = require('async');
const mysql=require('../mysql/mysql_pool');

const install = require('superagent-charset');
const request = require('superagent');

const requestn = require('request');

superagent = install(request);
var options = [];
    mysql.select('select img,spbh,spmc from sptp where spbh not in (select distinct spbh from yixiasptp) limit 200','',function (err,results,fields) {

        for(let i=0;i<200;i++){
            let imgul=results[i].img;

            let spbh=results[i].spbh;
            //商品名称去除空格以及特殊字符
            let spmc=results[i].spmc.replace(/\s+/g,"").replace(/[\-\_\,\!\|\~\`\(\)\#\$\%\^\&\*\{\}\:\;\"\L\<\>\?]/g,'');// 去掉特殊字符
            spmc=spmc.replace(/\\/g,"");
            spmc=spmc.replace(/\//g,"");
            spmc=String(spmc);

            let mul="D:\\yyw\\"+spmc+"\\";
            //创建目录
            let $ = cheerio.load(imgul);

            let imgurls={
                $:$,
                mul:mul,
                spbh:spbh
            };

            options.push(imgurls);

            fs.exists(mul,function(exists){
                if(exists){
                    console.log('文件夹存在');
                    //imgxiaz($,mul,spbh);
                }
                else
                {
                    fs.mkdir(mul,function(err){
                        if(err) {console.error(err)}
                        console.log('创建目录'+spmc);
                        //imgxiaz($,mul,spbh);
                    });
                }
            });

        }

        async.eachLimit(options, 2, function (url,callback){
            imgxiaz(url.$,url.mul,url.spbh);
            //console.log(url.mul+","+url.spbh);
            callback(null, url);
        });
    });


function imgxiaz($,mul,spbh) {
    //console.log(spbh);
    $('img').each(function () {
        let imgurl=$(this).attr("src");
        //console.log(imgurl);
        if(imgurl==''||imgurl==undefined){
            console.log('跳过');
        }else{
            setTimeout(function () {
                /*
                request(imgurl).pipe(fs.createWriteStream(mul+Date.now()+'.jpg'));
                mysql.modify("INSERT INTO yixiasptp (spbh) VALUES (?)",spbh,function (err,results,fields) {
                });
                */

                requestn({
                    url: imgurl,
                    encoding:null
                }, (err, response, body) => {
                    fs.writeFileSync(
                        mul+Date.now()+'.jpeg',
                        body,
                        {
                            encoding:'binary'
                        }
                    );
                });
                mysql.modify("INSERT INTO yixiasptp (spbh) VALUES (?)",spbh,function (err,results,fields) {
                });

            },1000);

}
});
}

猜你喜欢

转载自blog.csdn.net/qq_32858649/article/details/81947841
今日推荐