版权声明: https://blog.csdn.net/qq_32858649/article/details/81947841
话不多说,上代码,
其实有一个问题,就是我其实要爬差不多5万张图片,但是,循环次数超过20次那么必定会导致图片下载失败,应该说是图片下载下来之后,图片是脆了的,也就是图裂了。请教哪位da
大神指点一下。
const request = require('request');
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
const async = require('async');
const mysql=require('../mysql/mysql_pool');
const install = require('superagent-charset');
const request = require('superagent');
const requestn = require('request');
superagent = install(request);
mysql.select('select img,spbh,spmc from sptp where spbh not in (select distinct spbh from yixiasptp) limit 10','',function (err,results,fields) {
for(let i=0;i<10;i++){
let imgul=results[i].img;
let spbh=results[i].spbh;
//商品名称去除空格以及特殊字符
let spmc=results[i].spmc.replace(/\s+/g,"").replace(/[\-\_\,\!\|\~\`\(\)\#\$\%\^\&\*\{\}\:\;\"\L\<\>\?]/g,'');// 去掉特殊字符
spmc=spmc.replace(/\\/g,"");
spmc=spmc.replace(/\//g,"");
spmc=String(spmc);
let mul="D:\\yyw\\"+spmc+"\\";
//创建目录
let $ = cheerio.load(imgul);
fs.exists(mul,function(exists){
if(exists){
console.log('文件夹存在');
imgxiaz($,mul,spbh);
}
else
{
fs.mkdir(mul,function(err){
if(err) {console.error(err)}
console.log('目录'+spmc);
imgxiaz($,mul,spbh);
});
}
});
}
});
function imgxiaz($,mul,spbh) {
console.log(spbh);
$('img').each(function () {
let imgurl=$(this).attr("src");
//console.log(imgurl);
if(imgurl==''||imgurl==undefined){
console.log('跳过');
}else{
setTimeout(function () {
request(imgurl).pipe(fs.createWriteStream(mul+Date.now()+'.jpg'));
mysql.modify("INSERT INTO yixiasptp (spbh) VALUES (?)",spbh,function (err,results,fields) {
});
},1000);
}
});
}
最后吧,还是学艺不精,用async模块
控制了下并发的问题,确实提高了成功率,没测试更多的,现在手动执行吧,200个连接一爬,手动点100次吧。
上修改之后的代码
//const request = require('request');
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
const async = require('async');
const mysql=require('../mysql/mysql_pool');
const install = require('superagent-charset');
const request = require('superagent');
const requestn = require('request');
superagent = install(request);
var options = [];
mysql.select('select img,spbh,spmc from sptp where spbh not in (select distinct spbh from yixiasptp) limit 200','',function (err,results,fields) {
for(let i=0;i<200;i++){
let imgul=results[i].img;
let spbh=results[i].spbh;
//商品名称去除空格以及特殊字符
let spmc=results[i].spmc.replace(/\s+/g,"").replace(/[\-\_\,\!\|\~\`\(\)\#\$\%\^\&\*\{\}\:\;\"\L\<\>\?]/g,'');// 去掉特殊字符
spmc=spmc.replace(/\\/g,"");
spmc=spmc.replace(/\//g,"");
spmc=String(spmc);
let mul="D:\\yyw\\"+spmc+"\\";
//创建目录
let $ = cheerio.load(imgul);
let imgurls={
$:$,
mul:mul,
spbh:spbh
};
options.push(imgurls);
fs.exists(mul,function(exists){
if(exists){
console.log('文件夹存在');
//imgxiaz($,mul,spbh);
}
else
{
fs.mkdir(mul,function(err){
if(err) {console.error(err)}
console.log('创建目录'+spmc);
//imgxiaz($,mul,spbh);
});
}
});
}
async.eachLimit(options, 2, function (url,callback){
imgxiaz(url.$,url.mul,url.spbh);
//console.log(url.mul+","+url.spbh);
callback(null, url);
});
});
function imgxiaz($,mul,spbh) {
//console.log(spbh);
$('img').each(function () {
let imgurl=$(this).attr("src");
//console.log(imgurl);
if(imgurl==''||imgurl==undefined){
console.log('跳过');
}else{
setTimeout(function () {
/*
request(imgurl).pipe(fs.createWriteStream(mul+Date.now()+'.jpg'));
mysql.modify("INSERT INTO yixiasptp (spbh) VALUES (?)",spbh,function (err,results,fields) {
});
*/
requestn({
url: imgurl,
encoding:null
}, (err, response, body) => {
fs.writeFileSync(
mul+Date.now()+'.jpeg',
body,
{
encoding:'binary'
}
);
});
mysql.modify("INSERT INTO yixiasptp (spbh) VALUES (?)",spbh,function (err,results,fields) {
});
},1000);
}
});
}