版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/funkstill/article/details/85332890
1.初始化项目
cd NodeJS/demo/CSDNSpider
npm init -y
npm install request cheerio --save
2.分析页面
3.代码实现
var request = require('request');
var cheerio = require('cheerio');
// Base URL of the blog's paginated article list; the page number is appended to it to build each list-page URL.
var articlelisturl = 'https://blog.csdn.net/funkstill/article/list/';
/**
 * Downloads one article-list page and extracts the article links found on it.
 *
 * @param {string} starturl - URL of the article-list page to download.
 * @param {function} callback - Invoked exactly once. On a request failure it
 *     receives (err). Otherwise it receives (null, links, hasArticles), where
 *     links is an array of href strings and hasArticles is false when the page
 *     contains no article list.
 * @param {boolean} [flag] - Its incoming value is never read; the parameter is
 *     kept so existing three-argument calls keep working.
 */
function getArticleLink(starturl,callback,flag){
    // Fetch the page that was asked for (not whatever URL an outer variable holds).
    request(starturl,function(err,res){
        if(err){
            // Stop here: without a response there is nothing to parse, and the
            // callback must not be invoked a second time further down.
            return callback(err);
        }
        var $ = cheerio.load(res.body.toString());
        var temparticleLinksArray = [];
        if($('div').hasClass('article-list')){
            // The page contains an article list.
            flag = true;
            $('.article-item-box').each(function(){
                var artList = $(this);
                // The article with this id is deliberately left out of the result.
                if(artList.attr('data-articleid')!=="82762601"){
                    var articleLink = artList.find('h4').find('a').attr('href');
                    // Skip items without a link instead of collecting undefined.
                    if(articleLink){
                        temparticleLinksArray.push(articleLink);
                    }
                }
            });
        }else{
            // No article list on this page.
            flag = false;
        }
        callback(null,temparticleLinksArray,flag);
    });
}
var articleLinksArray=[];// All article links collected so far, in page order
var pagenum =1;// Number of the next list page to fetch
var flag = true;// Set to false once a page without an article list is reached
var curArticleListUrl;// URL of the list page currently being fetched

/**
 * Fetches the list pages one after another, starting at the current pagenum.
 *
 * The next page is requested only from inside the previous page's callback.
 * A synchronous while loop cannot do this job: its callbacks would run only
 * after the loop had finished, so flag could never stop it and every page
 * would be requested at once.
 *
 * Stops when a page has no article list, when a request fails, or when
 * pagenum reaches 1000.
 */
function crawlNextPage(){
    if(!((pagenum<1000)&&(flag===true))){
        return;
    }
    // Build the URL of the current list page
    curArticleListUrl=articlelisturl+pagenum.toString();
    getArticleLink(curArticleListUrl,function(err,resultArray,rflag){
        if(err){
            return console.log(err);
        }
        if(rflag){
            console.log(resultArray);
            // Merge this page's links into the overall list, then move on.
            articleLinksArray=articleLinksArray.concat(resultArray);
            pagenum++;
            crawlNextPage();
        }else{
            // Empty page: every article has been seen, so stop crawling.
            flag=rflag;
        }
    });
}
crawlNextPage();
PS G:\NodeJS\demo\CSDNSpider> node CSDNSpider.js
[ 'https://blog.csdn.net/funkstill/article/details/84797594',
'https://blog.csdn.net/funkstill/article/details/84798115',
'https://blog.csdn.net/funkstill/article/details/84787172',
'https://blog.csdn.net/funkstill/article/details/84780232',
'https://blog.csdn.net/funkstill/article/details/84777151',
'https://blog.csdn.net/funkstill/article/details/84758390',
'https://blog.csdn.net/funkstill/article/details/84728094',
'https://blog.csdn.net/funkstill/article/details/84727337',
'https://blog.csdn.net/funkstill/article/details/84722906',
'https://blog.csdn.net/funkstill/article/details/84679054',
'https://blog.csdn.net/funkstill/article/details/84678381',
'https://blog.csdn.net/funkstill/article/details/84678134',
'https://blog.csdn.net/funkstill/article/details/84674435',
'https://blog.csdn.net/funkstill/article/details/79783358' ]
[ 'https://blog.csdn.net/funkstill/article/details/85035217',
'https://blog.csdn.net/funkstill/article/details/85031492',
'https://blog.csdn.net/funkstill/article/details/85018491',
'https://blog.csdn.net/funkstill/article/details/84996399',
'https://blog.csdn.net/funkstill/article/details/84996134',
'https://blog.csdn.net/funkstill/article/details/84993100',
'https://blog.csdn.net/funkstill/article/details/84988185',
'https://blog.csdn.net/funkstill/article/details/84985889',
'https://blog.csdn.net/funkstill/article/details/84983518',
'https://blog.csdn.net/funkstill/article/details/84977448',
'https://blog.csdn.net/funkstill/article/details/84972508',
'https://blog.csdn.net/funkstill/article/details/84971210',
'https://blog.csdn.net/funkstill/article/details/84965168',
'https://blog.csdn.net/funkstill/article/details/84842933',
'https://blog.csdn.net/funkstill/article/details/84958700',
'https://blog.csdn.net/funkstill/article/details/84945793',
'https://blog.csdn.net/funkstill/article/details/84885201',
'https://blog.csdn.net/funkstill/article/details/84833490',
'https://blog.csdn.net/funkstill/article/details/84817166',
'https://blog.csdn.net/funkstill/article/details/84801300' ]
[ 'https://blog.csdn.net/funkstill/article/details/85240473',
'https://blog.csdn.net/funkstill/article/details/85330187',
'https://blog.csdn.net/funkstill/article/details/85235941',
'https://blog.csdn.net/funkstill/article/details/85172942',
'https://blog.csdn.net/funkstill/article/details/85160539',
'https://blog.csdn.net/funkstill/article/details/85139078',
'https://blog.csdn.net/funkstill/article/details/85125248',
'https://blog.csdn.net/funkstill/article/details/85118114',
'https://blog.csdn.net/funkstill/article/details/85090172',
'https://blog.csdn.net/funkstill/article/details/85089320',
'https://blog.csdn.net/funkstill/article/details/85058228',
'https://blog.csdn.net/funkstill/article/details/85056194',
'https://blog.csdn.net/funkstill/article/details/85055823',
'https://blog.csdn.net/funkstill/article/details/85054055',
'https://blog.csdn.net/funkstill/article/details/85050247',
'https://blog.csdn.net/funkstill/article/details/85040860',
'https://blog.csdn.net/funkstill/article/details/85040849',
'https://blog.csdn.net/funkstill/article/details/85040321',
'https://blog.csdn.net/funkstill/article/details/85040001',
'https://blog.csdn.net/funkstill/article/details/85034990' ]
4.存在问题
回调函数的使用不是很熟练!!!
异步操作很难控制!!!
导致无法将每页遍历结果合并到一起,也没办法在无文章时停止循环。
5.彩蛋
// Excerpt from the link-extraction loop: the list item whose data-articleid is
// "82762601" is skipped, so that one article never ends up in the collected links.
if(artList.attr('data-articleid')!="82762601"){
articleLink = artList.find('h4').find('a').attr('href');
//console.log(articleLink+"articleLink");
temparticleLinksArray.push(articleLink);
}