NodeJS简单爬虫获取指定用户CSDN博客所有文章链接

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/funkstill/article/details/85332890

1.初始化项目

cd NodeJS/demo/CSDNSpider
npm init -y
npm install request cheerio -save

2.分析页面

文章列表页面
文章链接
列表页无文章情况

 3.代码实现

var request = require('request');
var cheerio = require('cheerio');

var articlelisturl = 'https://blog.csdn.net/funkstill/article/list/';

function getArticleLink(starturl,callback,flag){
    //获取页面
    request(curArticleListUrl,function(err,res){
            if(err){
                callback(err);
            }
            
            var $ = cheerio.load(res.body.toString());
            var temparticleLinksArray = new Array();
            if($('div').hasClass('article-list')){//如果列表非空
                flag = true;
                //console.log('I am in');
                var articleList = $('.article-item-box');
                articleList.each(function(item){
                    var artList = $(this);
                    var articleLink;
                    if(artList.attr('data-articleid')!="82762601"){
                        articleLink = artList.find('h4').find('a').attr('href');
                        //console.log(articleLink+"articleLink");
                        temparticleLinksArray.push(articleLink);
                    }
                })
            }else{//如果列表为空
                //console.log('I am out');
                flag = false;
            }
            callback(null,temparticleLinksArray,flag);
    });
}
var articleLinksArray=new Array();//文章列表
//遍历所有文章列表页面
var pagenum =1;
var flag = true;
while((pagenum<1000)&&(flag==true)){
        //拼接文章列表页面链接
        var curArticleListUrl=articlelisturl+pagenum.toString();
        getArticleLink(curArticleListUrl,function(err,resultArray,rflag){
            if(err){
                return console.log(err);
            }
            if(rflag){
                console.log(resultArray);
            }else{
                flag=rflag;
            }
        });
        pagenum++;
}
PS G:\NodeJS\demo\CSDNSpider> node CSDNSpider.js
[ 'https://blog.csdn.net/funkstill/article/details/84797594',
  'https://blog.csdn.net/funkstill/article/details/84798115',
  'https://blog.csdn.net/funkstill/article/details/84787172',
  'https://blog.csdn.net/funkstill/article/details/84780232',
  'https://blog.csdn.net/funkstill/article/details/84777151',
  'https://blog.csdn.net/funkstill/article/details/84758390',
  'https://blog.csdn.net/funkstill/article/details/84728094',
  'https://blog.csdn.net/funkstill/article/details/84727337',
  'https://blog.csdn.net/funkstill/article/details/84722906',
  'https://blog.csdn.net/funkstill/article/details/84679054',
  'https://blog.csdn.net/funkstill/article/details/84678381',
  'https://blog.csdn.net/funkstill/article/details/84678134',
  'https://blog.csdn.net/funkstill/article/details/84674435',
  'https://blog.csdn.net/funkstill/article/details/79783358' ]
[ 'https://blog.csdn.net/funkstill/article/details/85035217',
  'https://blog.csdn.net/funkstill/article/details/85031492',
  'https://blog.csdn.net/funkstill/article/details/85018491',
  'https://blog.csdn.net/funkstill/article/details/84996399',
  'https://blog.csdn.net/funkstill/article/details/84996134',
  'https://blog.csdn.net/funkstill/article/details/84993100',
  'https://blog.csdn.net/funkstill/article/details/84988185',
  'https://blog.csdn.net/funkstill/article/details/84985889',
  'https://blog.csdn.net/funkstill/article/details/84983518',
  'https://blog.csdn.net/funkstill/article/details/84977448',
  'https://blog.csdn.net/funkstill/article/details/84972508',
  'https://blog.csdn.net/funkstill/article/details/84971210',
  'https://blog.csdn.net/funkstill/article/details/84965168',
  'https://blog.csdn.net/funkstill/article/details/84842933',
  'https://blog.csdn.net/funkstill/article/details/84958700',
  'https://blog.csdn.net/funkstill/article/details/84945793',
  'https://blog.csdn.net/funkstill/article/details/84885201',
  'https://blog.csdn.net/funkstill/article/details/84833490',
  'https://blog.csdn.net/funkstill/article/details/84817166',
  'https://blog.csdn.net/funkstill/article/details/84801300' ]
[ 'https://blog.csdn.net/funkstill/article/details/85240473',
  'https://blog.csdn.net/funkstill/article/details/85330187',
  'https://blog.csdn.net/funkstill/article/details/85235941',
  'https://blog.csdn.net/funkstill/article/details/85172942',
  'https://blog.csdn.net/funkstill/article/details/85160539',
  'https://blog.csdn.net/funkstill/article/details/85139078',
  'https://blog.csdn.net/funkstill/article/details/85125248',
  'https://blog.csdn.net/funkstill/article/details/85118114',
  'https://blog.csdn.net/funkstill/article/details/85090172',
  'https://blog.csdn.net/funkstill/article/details/85089320',
  'https://blog.csdn.net/funkstill/article/details/85058228',
  'https://blog.csdn.net/funkstill/article/details/85056194',
  'https://blog.csdn.net/funkstill/article/details/85055823',
  'https://blog.csdn.net/funkstill/article/details/85054055',
  'https://blog.csdn.net/funkstill/article/details/85050247',
  'https://blog.csdn.net/funkstill/article/details/85040860',
  'https://blog.csdn.net/funkstill/article/details/85040849',
  'https://blog.csdn.net/funkstill/article/details/85040321',
  'https://blog.csdn.net/funkstill/article/details/85040001',
  'https://blog.csdn.net/funkstill/article/details/85034990' ]

4.存在问题

   回调函数的使用不是很熟练!!!

   异步操作很难控制!!!

    导致无法将每页遍历结果合并到一起,也没办法在无文章时停止循环。

5.彩蛋

人生前十几年浑浑噩噩、毫无目标的状态算是告了一个段落

if(artList.attr('data-articleid')!="82762601"){
  articleLink = artList.find('h4').find('a').attr('href');
  //console.log(articleLink+"articleLink");
  temparticleLinksArray.push(articleLink);
}

猜你喜欢

转载自blog.csdn.net/funkstill/article/details/85332890