Node.js:request&cheerio爬虫获取免费代理

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/funkstill/article/details/86107473

1.初始化项目

    本文在之前项目的基础上进行,故不需要重新安装依赖,具体步骤请参考之前文章中的“初始化项目”一节。

2.动态 userAgent

    每次爬取时,从下面的 User-Agent 列表中随机选取一个作为请求头。

//./src/userAgent.js
/**
 * Pool of User-Agent header values; the crawler picks one at random for
 * each list-page request.
 *
 * Every entry is unique so that a uniform random index gives each
 * User-Agent the same chance of being selected.
 */
const userAgents = [
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

3.具体代码

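    这里尝试使用标志量(一个记录未完成请求数的计数器)来判断所有异步请求是否都已结束,以应对 Node.js 的单线程异步特性。注意:下面的代码直接使用了 userAgents,实际运行时需要先在 ./src/userAgent.js 中将其导出,并在本文件中引入。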

var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");

// Candidate proxies scraped from the list page; each entry is an object
// with `ip`, `port` and `type` string fields.
const proxys = [];
// Proxy URLs that answered the test request with HTTP 200.
const useful = [];

/**
 * Parse the numeric part of a duration title.
 *
 * `parseFloat` reads the leading number and ignores any trailing unit
 * suffix. Presumably the titles look like "0.123秒" - confirm against the
 * scraped page.
 *
 * @param {string|undefined} title - value of the cell's `title` attribute.
 * @returns {number} the parsed number, or NaN when the title is missing,
 *     empty, or not numeric (NaN fails every `<=` comparison).
 */
function parseSeconds(title){
    if(typeof title !== "string"){
        return NaN;
    }
    return parseFloat(title);
}

/**
 * Fetch one page of the proxy list, collect the fast entries into
 * `proxys`, then start the validity check.
 *
 * A row is kept only when its speed value is <= 5 and its connect-time
 * value is <= 1. Rows whose timing cells have no usable `title` attribute
 * are skipped instead of throwing. `check()` is called once the response
 * has been handled, whether or not the request succeeded.
 *
 * @param {number|string} pageNum - page number appended to the list URL.
 */
function getProxys(pageNum){
    // Math.floor gives a uniform integer index into the User-Agent pool.
    const userAgent = userAgents[Math.floor(Math.random()*userAgents.length)];
    const url = "https://www.xicidaili.com/nn/"+pageNum;

    request({
        url:url,
        method:"GET",
        headers:{
            'User-Agent':userAgent
        }
    },function(err,res,body){
        if(err){
            // Report the failure instead of dropping it silently.
            console.error("Failed to fetch proxy list:",err);
        }else{
            const $ = cheerio.load(body);
            const trs = $("#ip_list tr");
            // Index 0 is skipped - presumably the table's header row.
            for(let i=1;i<trs.length;i++){
                const tds = trs.eq(i).children("td");
                const speed = parseSeconds(tds.eq(6).children("div").attr("title"));
                const connectTime = parseSeconds(tds.eq(7).children("div").attr("title"));
                if(speed<=5&&connectTime<=1){
                    // Trim so stray whitespace cannot end up in the proxy URL.
                    proxys.push({
                        ip: tds.eq(1).text().trim(),
                        port: tds.eq(2).text().trim(),
                        type: tds.eq(5).text().trim()
                    });
                }
            }
        }
        check();
    });
}
/**
 * Check which of the collected proxies actually work.
 *
 * Sends one GET request per entry of `proxys` through that proxy and
 * appends the proxy href of every HTTP 200 response to `useful`. A counter
 * of outstanding requests triggers `saveProxys()` after the last callback
 * has fired. When `proxys` is empty no callback would ever run, so the
 * (empty) result is saved immediately.
 */
function check(){
    const url = "http://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js";
    // Number of requests whose callback has not fired yet.
    let flag = proxys.length;
    if(flag===0){
        saveProxys();
        return;
    }
    proxys.forEach(function(proxy){
        // Captured per iteration so the error branch can name the proxy.
        const proxyUrl = proxy['type'].toLowerCase()+"://"+proxy['ip']+":"+proxy['port'];
        request({
            url:url,
            proxy: proxyUrl,
            method:'GET',
            timeout:20000
        },function(err,res){
            if(err){
                // Dead or slow proxy: report it rather than ignoring it.
                console.log(proxyUrl,"failed:",err.message);
            }else if(res.statusCode===200){
                useful.push(res.request['proxy']['href']);
                console.log(res.request['proxy']['href'],"useful");
            }else{
                console.log(res.request['proxy']['href'],"failed");
            }
            flag--;
            if(flag===0){
                saveProxys();
            }
        });
    });
}
/**
 * Persist the working proxies.
 *
 * Serializes the `useful` array to JSON, writes it synchronously to
 * "proxys.json" (replacing any previous content), and logs a confirmation.
 */
function saveProxys(){
    const serialized = JSON.stringify(useful);
    const target = "proxys.json";
    fs.writeFileSync(target,serialized);
    console.log("Save finished!");
}
// Script entry point: start the crawl with page number 1.
getProxys(1);

4.实战效果

猜你喜欢

转载自blog.csdn.net/funkstill/article/details/86107473