A Node.js crawler that follows links across pages to download images (it reads a listing page to get the URL of each page that holds an image, then visits that page and downloads the image)

Recently my company has been working on travel projects (some of them overseas tours) that need national flag images. The designer took the lazy route: instead of giving me the pictures, they just sent me a link and told me to scrape them myself... what else can I say, so I scraped them (the front end gets stuck with everything...)
https://data.countryflags.com/products/en/category/2996352/tiles/population-desc.html
The approach: from this listing page, get the URL of the page where each image lives, open that page, and download the image from it. A counter variable selects which link to follow; the script loops, picking the next URL each time and entering the new page to download its image.

git repository https://gitee.com/shuah153/Node.git

A few points to explain

1. The `https` module must be required rather than `http`, because the pages and images are served over https (use `http` only for http URLs)
2. Set User Agent

Some websites do not like being accessed by crawler programs, so they inspect the connecting client. If it is a crawler, that is, not a human clicking through, they will refuse further access. For the program to run normally, you therefore need to hide the fact that it is a bot. We can do this by setting the User Agent. The term translates literally as "user agent" in Chinese, and it is abbreviated UA.
The User Agent is stored in the headers, and the server determines who is accessing by checking the User Agent in the headers. If the server checks the User Agent, programs that do not have the User Agent set up will not be able to access the website normally.

Enough talk, here is the code.

var http = require('https');
var fs = require('fs');
var cheerio = require('cheerio');
var request = require('request');
// Index of the listing-page link to follow next; incremented after each detail page is processed.
var q = 0;
var url = "https://data.countryflags.com/products/en/category/2996352/tiles/population-desc.html"; 
// Initial URL: the listing page the crawl starts from.

/**
 * Entry point for one crawl step: hands the listing-page URL to startRequest.
 * Exists only as a thin wrapper so callers have a single name to invoke.
 *
 * @param {string} url - URL of the listing page to crawl.
 */
function fetchPage(url) {
  startRequest(url);
}
/**
 * Step 1: download the listing page and follow the q-th detail-page link.
 *
 * The listing page is fetched with a GET request, parsed with cheerio, and the
 * `href` of the q-th `div.thumbnail a.clearfix` anchor is handed to
 * secondRequest. The href is prefixed with "https:", so it is expected to be
 * protocol-relative ("//host/path").
 *
 * If no anchor exists at index q, the crawl stops here instead of requesting
 * the bogus URL "https:undefined".
 *
 * @param {string} url - URL of the listing page.
 */
function startRequest(url) {
  // `http` is the https module in this file; issue a GET request for the listing page.
  http.get(url, function (res) {
      var html = '';        // accumulates the full HTML of the response
      res.setEncoding('utf-8'); // decode chunks as UTF-8 so non-ASCII text is not garbled
      // Collect the body chunk by chunk.
      res.on('data', function (chunk) {
          html += chunk;
      });
      // Once the whole page has arrived, parse it and move on to the detail page.
      res.on('end', function () {
        var $ = cheerio.load(html);
        var href = $("div.thumbnail a.clearfix").eq(q).attr('href');
        if (!href) {
          // Nothing to follow: the page layout changed, the request was blocked,
          // or q ran past the last entry.
          console.log('No detail-page link found at index ' + q + ' on ' + url);
          return;
        }
        secondRequest('https:' + href);
      });

  }).on('error', function (err) {
      console.log(err);
  });

}
/**
 * Step 2: download a detail page, extract the image URL and its title, save
 * the image, then trigger the next crawl step.
 *
 * The title comes from the first `div.panel-heading h3.panel-title` element
 * and the image URL from the `src` of `.download-example img` (prefixed with
 * "https:", so it is expected to be protocol-relative).
 *
 * The module-level counter `q` is incremented once per processed page, and the
 * crawl continues via fetchPage(url) while q <= 5. Since q starts at 0, that
 * is six detail pages in total (indices 0 through 5).
 *
 * If the title or the image URL is missing, the page is skipped (nothing is
 * saved) but the counter still advances so the crawl moves on.
 *
 * @param {string} link - URL of the detail page.
 */
function secondRequest(link){
  http.get(link, function (res) {
    var html = '';        // accumulates the full HTML of the response
    res.setEncoding('utf-8'); // decode chunks as UTF-8 so non-ASCII text is not garbled
    // Collect the body chunk by chunk.
    res.on('data', function (chunk) {
        html += chunk;
    });

    // Once the whole page has arrived, parse it and extract the image data.
    res.on('end', function () {
      var $ = cheerio.load(html);
      var title = $('div.panel-heading h3.panel-title').eq(0).text().trim();
      var src = $(".download-example img").attr('src');

      // Advance the shared counter as its own statement so the side effect is visible.
      q = q + 1;

      var news_item = {
        // name shown on the page; used as the image file name
        title: title,
        // URL of the small PNG; null when the page has no matching <img>
        link: src ? 'https:' + src : null,
        // number of detail pages processed so far
        q: q
      };

      if (title && src) {
        savedImg($, news_item);    // store the image under its title
      } else {
        console.log('Skipping ' + link + ': missing title or image URL');
      }

      console.log(news_item);
      if (q <= 5) {
        fetchPage(url);
      }
    });

  }).on('error', function (err) {
      console.log(err);
  });

}


/**
 * Downloads the image described by `news_item` and stores it locally as
 * ./image/<title>.png.
 *
 * The title is scraped from a remote page, so characters that are path
 * separators or are not allowed in file names are replaced with "_" before it
 * is used as a file name. The ./image/ directory is created if it does not
 * exist. Download and write failures are logged rather than left to crash the
 * process as unhandled stream errors.
 *
 * @param {*} $ - cheerio handle of the detail page; unused, kept for callers.
 * @param {{title: string, link: string}} news_item - image title and image URL.
 */
function savedImg($,news_item) {
  // Keep the file inside ./image/ and valid on common file systems.
  var safe_title = String(news_item.title).replace(/[\\/:*?"<>|]/g, '_');
  var img_filename = safe_title + '.png';
  var img_src = news_item.link; // URL of the image to download
  var img_dir = './image/';

  // createWriteStream fails if the target directory is missing.
  if (!fs.existsSync(img_dir)) {
    fs.mkdirSync(img_dir);
  }

  // Send a browser-like User-Agent; some servers reject clients without one.
  var options = {
    url: img_src,
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
  };

  // Stream the image straight to disk. pipe() returns the destination, so the
  // first handler covers the download and the second covers the file write.
  request(options)
    .on('error', function (err) {
      console.log(err);
    })
    .pipe(fs.createWriteStream(img_dir + img_filename))
    .on('error', function (err) {
      console.log(err);
    });

}

fetchPage(url);      // start the crawl from the initial listing page

It's time to say good night... Come on

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325509440&siteId=291194637