Use Node.js to crawl specific data from web pages and write the crawled data to an Excel file (server part)

This article builds on my previous one (linked below): after the front end sends its request, the server receives the submitted data, parses it, and then requests the listed pages one by one.

https://blog.csdn.net/qq_45104282/article/details/127669095

Please see the code for details:

Some of the logic is encapsulated in helper functions to improve readability and reduce code duplication.

Starting the service requires Node.js, which you need to install yourself. The service is started with the node command followed by the script name.

For example: node scrapingPage.js

//引入模块
const express=require('express'),  
urls  = require('url'),
querystring=require('querystring'),
morgan=require('morgan'),
request = require('superagent'),
fs = require('fs'),
cheerio = require('cheerio'),
excelJS = require('exceljs')

// Module-level scratch state. `list` is declared but not assigned anywhere in
// the code shown; `data` starts out as an empty array for collected page text.
let list
let data = []

// Express application that serves the scraping endpoint.
const app = express()

// Access log: opened in append mode so existing log entries are kept.
const accessLogStream = fs.createWriteStream('./access.log', {flags: 'a'})

// Log every incoming request in morgan's "short" format to the access log.
app.use(morgan('short', {stream: accessLogStream}))

/**
 * Endpoint (any HTTP method) that starts a scraping run.
 *
 * The raw request body is parsed as a URL-encoded query string with two fields:
 *   code - JSON array of page URLs to scrape
 *   time - interval between consecutive page requests, in seconds
 *
 * The response is sent as soon as the run has been scheduled; the scraped
 * results are written out later by the scheduled captures. A missing or
 * malformed `code` field is answered with HTTP 400 instead of throwing inside
 * the stream listener.
 */
app.all('/scrapingPage', (req, res) => {
    // Express has already matched the route, so no second path comparison is
    // needed (a stricter string check here would leave requests that Express
    // matches case-insensitively, or with a trailing slash, without a reply).
    let body = ''
    req.on('data', (chunk) => {
        body += chunk
    })
    req.on('end', () => {
        const params = querystring.parse(body)

        let urlList
        try {
            urlList = JSON.parse(params.code)
        } catch (e) {
            res.status(400).end('Invalid "code" parameter: expected a JSON array of URLs')
            return
        }
        if (!Array.isArray(urlList)) {
            res.status(400).end('Invalid "code" parameter: expected a JSON array of URLs')
            return
        }

        console.log('---------------链接', urlList)
        // params.time is the request interval in seconds.
        resultData(urlList, params.time)
        res.end()
    })
})

// Bind the HTTP server to port 8888 and log a message once it is listening.
app.listen(8888, function onListening() {
    console.log('服务器已启动,监听端口中')
})

/**
 * Schedule one page capture per URL, spaced `intervalTimeValue` seconds apart.
 *
 * The first URL is scheduled with no delay, the second after one interval,
 * the third after two intervals, and so on.
 *
 * @param urlArr            array of page URLs
 * @param intervalTimeValue interval in seconds (number or numeric string);
 *                          anything that is not a positive finite number is
 *                          treated as 0
 */
function resultData(urlArr,intervalTimeValue) {
    // The interval may arrive as a string when taken from a parsed request body.
    const seconds = Number(intervalTimeValue)
    const stepMs = (Number.isFinite(seconds) && seconds > 0 ? seconds : 0) * 1000

    for (let index = 0; index < urlArr.length; index++) {
        // The delay must be derived from the loop variable `index`.
        setTimeout(() => {
            capturePageResult(urlArr[index])
        }, index * stepMs)
    }
}

/**
 * Fetch one product page and record its "Best Sellers Rank" text.
 *
 * The fetched HTML is probed for two layouts:
 *   1. elements matching `#productDetails_detailBullets_sections1`
 *   2. list items matching `#detailBulletsWrapper_feature_div > ul > li`
 * Every outcome (rank text, a "no rank" placeholder, or a request failure
 * message) is passed to readXlsxFile together with the URL and the current
 * timestamp from formattingDate().
 *
 * @param urlArr a single page URL (despite the name); it is converted with
 *               toString() for the request and also stored as the row's
 *               first column
 */
function capturePageResult(urlArr) {
    request.get(urlArr.toString())
        .end((err, res) => {
            if (err) {
                console.log(urlArr + '请求延时稍后再试');
                readXlsxFile(urlArr,"The request timeout please try again later(请求延时)",formattingDate())
                return;
            }

            // Parse the fetched HTML and locate both candidate layouts.
            const $ = cheerio.load(res.text, {decodeEntities: false})
            const $itemMod = $('#productDetails_detailBullets_sections1')
            const $itemMods = $('#detailBulletsWrapper_feature_div > ul > li')
            const tableLen = $itemMod.length
            const spanLen = $itemMods.length
            console.log('tableLen长度', tableLen)
            console.log('spanLen长度', spanLen)

            if (tableLen > 0) {
                // Local scratch list: accumulates across matched elements of
                // this page only, so nothing has to be cleared afterwards.
                const tableData = []

                $itemMod.each((i, e) => {
                    const spanText = $(e).find('span').text()
                    // Text that follows the last occurrence of "ratings".
                    const afterRatings = spanText.split("ratings").slice(-1).toString()

                    if (afterRatings === "") {
                        // Nothing after "ratings": use the first line of the
                        // span text, but only when it has at least six
                        // '#'-separated pieces.
                        const rankPieces = spanText.split("\n")[0].split("#")
                        if (rankPieces.length >= 6) {
                            const rankStrArr = noRepeat(rankPieces)
                            const rankStrSave = rankStrArr[0] + '\n#' + rankStrArr[1] +'\n#'+ rankStrArr[2] +'\n#'+ rankStrArr[3] +'\n#'+ rankStrArr[4]
                            readXlsxFile(urlArr, rankStrSave, formattingDate())
                        }
                    } else {
                        tableData.push(afterRatings);

                        const rankNewStr = noRepeat(tableData.toString().split('#'))
                        console.log('rankNewStr',rankNewStr)

                        // Keep up to three ranks; the piece before the first
                        // '#' (index 0) is dropped.
                        let rankData = ''
                        if (rankNewStr.length >= 3) {
                            rankData = '#' + rankNewStr[1] + '\n#' + rankNewStr[2] + (rankNewStr[3] === undefined ? '' : '\n#' + rankNewStr[3])
                        } else if (rankNewStr.length === 2) {
                            rankData = '#' + rankNewStr[1]
                        }
                        console.log('rankData',rankData)
                        readXlsxFile(urlArr, rankData, formattingDate())
                    }
                });
            }

            if (spanLen === 0 && tableLen === 0){
                console.log(urlArr + '没有商品排名')
                readXlsxFile(urlArr,"Not Data",formattingDate())
            }

            if (spanLen > 0) {
                const bulletTexts = []
                $itemMods.each((i, e) => {
                    bulletTexts.push($(e).find('span').text().trim())
                });

                // Drop everything from "Customer Reviews:" onwards, then take
                // what follows the "Best Sellers Rank:" label.
                const removeBest = bulletTexts.toString().split("Customer Reviews:")[0].toString()
                const rankText = removeBest.split("Best Sellers Rank:  ")[1]
                console.log(rankText)

                if (rankText === undefined){
                    console.log(urlArr + '没有商品排名')
                    readXlsxFile(urlArr,"The commodity doesn't rank(该商品没有排名)",formattingDate())
                } else {
                    // Put every "#..." rank on its own line.
                    readXlsxFile(urlArr,rankText.replace(/#{1}\s*/g,"\n#"),formattingDate())
                }
            }
        });
}

/**
 * Remove duplicates from a list after trimming each entry.
 *
 * Every element is converted with toString(), stripped of leading and
 * trailing whitespace, and kept only the first time its trimmed form occurs.
 *
 * @param arr array of values that have a toString() method
 * @returns {string[]} trimmed, de-duplicated strings in first-seen order
 */
function noRepeat(arr) {
    // A Set keeps insertion order, so the first occurrence of each value wins.
    const unique = new Set()
    for (const entry of arr) {
        unique.add(entry.toString().replace(/^\s*|\s*$/g,""))
    }
    return [...unique]
}

/**
 * Format the current local time as "Y/M/D H:mm:ss".
 *
 * Only minutes and seconds are zero-padded to two digits; month, day and
 * hour are written without padding.
 *
 * @returns {string} the formatted timestamp
 */
function formattingDate() {
    const now = new Date()
    const twoDigits = (value) => (value < 10 ? '0' + value : value)

    const datePart = [now.getFullYear(), now.getMonth() + 1, now.getDate()].join('/')
    const timePart = [now.getHours(), twoDigits(now.getMinutes()), twoDigits(now.getSeconds())].join(':')
    return datePart + ' ' + timePart
}

/**
 * Centre the A, B and C cells of one worksheet row, both vertically and
 * horizontally.
 *
 * @param sheet     worksheet that owns the row
 * @param rowNumber 1-based row number
 */
function centerRowCells(sheet, rowNumber) {
    for (const column of ['A', 'B', 'C']) {
        sheet.getCell(`${column}${rowNumber}`).alignment = {vertical: 'middle', horizontal: 'center'}
    }
}

/**
 * Append one result row to ./readFile.xlsx, creating the workbook on first use.
 *
 * - File missing: a workbook with one worksheet (frozen, filterable header
 *   row) is created and the row is written below the header.
 * - File present: the workbook is loaded and the row is appended to the
 *   worksheet whose id equals the number of worksheets; when that sheet's
 *   last row number exceeds 20000, a new worksheet with its own header row is
 *   started and receives the row instead.
 *
 * The function returns immediately; success and failure are reported through
 * console.log only.
 * NOTE(review): each call loads and rewrites the whole file, so calls that
 * overlap in time can overwrite each other's rows — confirm callers keep
 * writes spaced out.
 *
 * @param code value for the first column (callers in this file pass the page URL)
 * @param name value for the second column (rank text or a status message)
 * @param date value for the third column (timestamp string)
 */
function readXlsxFile(code,name,date) {
    const files = './readFile.xlsx'
    const headerTitles = ['编号', '商品排名详细信息', '获取时间']
    const rowValues = [code,name,date]
    console.log([rowValues])

    // Only the file's existence matters here; ExcelJS reads the content
    // itself further down, so the file is not loaded a second time as text.
    fs.access(files, async function (err) {
        const workbook = new excelJS.Workbook()

        try {
            if (err) {
                // First write: build the workbook from scratch.
                const sheet = workbook.addWorksheet('商品排名统计表', {views: [{state: 'frozen', ySplit: 1}]})
                sheet.addRow(headerTitles).height = 30
                sheet.columns = [
                    {header: headerTitles[0], key: 'code', width: 40},
                    {header: headerTitles[1], key: 'name', width: 130},
                    {header: headerTitles[2], key: 'date', width: 20}
                ]
                // Filter buttons and centred text on the header row.
                sheet.autoFilter = 'A1:C1'
                centerRowCells(sheet, 1)

                const firstRow = sheet.addRow(rowValues)
                firstRow.height = 100
                centerRowCells(sheet, firstRow.number)

                await workbook.xlsx.writeFile(files)
                console.log(`${code}的内容已保存到${files.substring(2,10)}文件`)
                return
            }

            // Later writes: load the existing workbook and append to it.
            const loaded = await workbook.xlsx.readFile(files)

            const sheetNames = []
            loaded.eachSheet(function (worksheet) {
                sheetNames.push(worksheet.name)
            })

            let target = loaded.getWorksheet(sheetNames.length)
            const sheetLine = target.lastRow.number
            console.log('sheetLine=====>', sheetLine)

            // Start a new worksheet once the current one holds more than 20000 rows.
            if (sheetLine > 20000) {
                target = loaded.addWorksheet(`商品排名统计表${sheetNames.length + 1}`, {views: [{state: 'frozen', ySplit: 1}]})
                target.addRow(headerTitles).height = 30
            }

            // Style the row that was actually appended: after a new worksheet
            // is started its number is no longer the old sheet's last row + 1.
            const appended = target.addRow(rowValues)
            appended.height = 100
            centerRowCells(target, appended.number)

            await loaded.xlsx.writeFile(files)
            console.log(`${code}的内容已保存到${files.substring(2,10)}文件(2号入口)`)
        } catch (e) {
            // Report the underlying error as well, so read/write failures can be diagnosed.
            console.log(`${code}失效  请重新获取`, e)
        }
    })
}


After the server has started, use the front end from my previous article to submit four links with a five-second interval and check whether the data is written successfully.

 

The crawled data is written successfully, with one request sent every five seconds. That's all for this article!

If you have any questions, please leave a message in the comment area and I will answer them one by one, or send me a private message. See you next time!

Guess you like

Origin blog.csdn.net/qq_45104282/article/details/127671520