Use Node.js to crawl specific data from web pages and write the crawled data into Excel (this article covers the front-end part; see the next article for the server-side part)

The links to be crawled are all inside the long block of text on the left, one per line. Since I was too lazy to pick them out by hand, I wrote a small parsing function and let the code find them for me. The idea is to split the text on line breaks into an array, traverse the array while matching the format I want, and then, when Parse is clicked, automatically print the matching links into the box on the right.

Enter the request interval time (seconds): this tells the server how long to wait after one link request has completed before requesting the next link.

Please see the code for details:

<!DOCTYPE html>
<html lang="zh-CN">

<head>
    <meta charset="UTF-8">
    <!-- The visible text of this page is Simplified Chinese, so the document
         language is declared as zh-CN for screen readers and translation tools. -->
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>获取排名信息</title>
    <style>
        /* Reset the default spacing of every element. */
        * {
            padding: 0;
            margin: 0;
        }

        /* Full-size positioning context for the two panels. */
        body {
            position: absolute;
            width: 100%;
            height: 100%;
        }

        /* Main panel: zero offsets on all four sides plus auto margins centre
           the fixed-size box inside the body. */
        #dis {
            position: absolute;
            top: 0;
            right: 0;
            bottom: 0;
            left: 0;
            margin: auto;
            width: 420px;
            height: 570px;
            padding: 25px;
            border-radius: 5px;
            box-shadow: 0 0 8px 4px #888;
        }

        /* Parse panel: same centring technique, but the negative left offset
           widens the centring area to the left, which moves the box to the
           left of the main panel. */
        #batch {
            position: absolute;
            top: 0;
            right: 0;
            bottom: 0;
            left: -1000px;
            margin: auto;
            width: 370px;
            height: 350px;
            padding: 25px;
            border-radius: 5px;
        }

        /* Common size of both textareas. */
        textarea {
            padding: 8px;
            height: 200px;
        }

        /* Shared look of the two link textareas and the single-line inputs.
           (#timerHour and #checkCount have no matching element in this page's
           markup.) */
        #code,
        #codetype,
        #timer,
        #timerHour,
        #checkCount {
            margin-top: 6px;
            width: 400px;
            border: 3px solid #0073c6;
            border-radius: 10px;
            color: rgb(65, 62, 62);
            font-size: 18px;
        }

        /* Single-line inputs: fixed height, horizontal padding only. */
        #timer,
        #timerHour,
        #checkCount {
            padding: 0 8px;
            height: 30px;
        }

        /* Vertical gap below each captioned input group. */
        .input {
            margin-bottom: 10px;
        }

        /* The browser's default focus outline is removed here, so an equally
           visible replacement is required for keyboard users: a ring drawn
           with box-shadow, which follows the rounded border of the fields. */
        textarea:focus,
        input:focus {
            outline: none;
            box-shadow: 0 0 0 3px rgba(0, 115, 198, 0.45);
        }

        /* Shared appearance of the confirm and parse buttons, both pinned
           20px above the bottom edge of their panel. */
        #btn,
        #btnParse {
            position: absolute;
            right: 0;
            bottom: 20px;
            margin: auto;
            width: 80px;
            height: 40px;
            border: 3px solid #0073c6;
            border-radius: 5px;
            background-color: blue;
            color: #fff;
            font-size: 16px;
            text-align: center;
        }

        /* The two buttons differ only in their left offset. */
        #btn {
            left: 0;
        }

        #btnParse {
            left: 50px;
        }

        /* Note: the id rules above set `color` with higher specificity, so
           this hover colour does not take effect on #btn or #btnParse. */
        button:hover {
            color: rgb(65, 62, 62);
        }
        /* Rules for .select, #upload and #cancel: no element in this page's
           markup uses these selectors. */
        .select {
            margin-top: 20px;
            padding: 5px;
            height: 30px;
            line-height: 15px;
            border: 3px solid #0073c6;
            border-radius: 10px;
            background-color: #e84807;
            color: #fff;
        }

        #upload {
            padding: 5px;
            color: rgb(114, 109, 109);
        }

        #cancel {
            float: right;
        }

        /* Grey caption text shown above each field. */
        span {
            color: rgb(114, 109, 109);
        }
    </style>
    <script src='https://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js'></script>
</head>

<body>
    <!-- Parse panel: raw text is pasted here and the button extracts the
         product links from it. The caption has its own unique id and is a real
         label, so clicking it focuses the textarea and assistive technology
         announces it. -->
    <div id="batch">
        <div class="input">
            <label for="codetype"><span id="parseText">输入要解析的链接</span></label>
            <br />
            <textarea name="scrapingPage" id="codetype"></textarea>
        </div>
        <button type="button" id="btnParse">解析</button>
    </div>
    <!-- Main panel: the links to crawl and the delay between two requests.
         Each caption is a label bound to its control, and the caption id is
         unique within the document. -->
    <div id="dis">
        <div class="input">
            <label for="code"><span id="codeText">输入链接 标准格式</span></label>
            <br />
            https://www.amazon.com/dp/B09ZTZZ315
            <textarea name="scrapingPage" id="code"></textarea>
        </div>
        <div class="input">
            <label for="timer"><span>输入请求间隔时间(秒)</span></label><br />
            <!-- inputmode only hints a numeric keyboard on touch devices; the
                 value is still validated by the script. -->
            <input type="text" id="timer" inputmode="numeric" value=""><br /><br />
        </div>
        <button type="button" id="btn">确认</button>
    </div>
    <script>
        // URL fragment used to recognise product links while parsing.
        var matchUrl = 'https://www.amazon.com/dp/'
        // Links collected by the parse step.
        var urlArr = []

        // Form controls, looked up once when the script runs.
        var text = document.querySelector('#code')
        var codeTypeValue = document.querySelector('#codetype')
        var timer = document.querySelector('#timer')
        var btn = document.querySelector('#btn')
        var btnParse = document.querySelector('#btnParse')

        // Wire each button to its click handler.
        btnParse.addEventListener('click', sendCodeTypeValue)
        btn.addEventListener('click', sendCode)

        
        /**
         * Click handler of the parse button.
         *
         * Splits the text of the parse textarea into lines, keeps the lines
         * that contain `matchUrl`, stores them in `urlArr` and writes them,
         * one per line, into the link textarea.
         */
        function sendCodeTypeValue() {
            var lines = codeTypeValue.value.split('\n')

            // Start from an empty list on every click; otherwise the links
            // found by earlier clicks stay in the array and are written again.
            urlArr.length = 0
            for (let index = 0; index < lines.length; index++) {
                // Literal substring test. String.prototype.match() would turn
                // the prefix into a regular expression, in which every "."
                // matches any character.
                if (lines[index].includes(matchUrl)) {
                    urlArr.push(lines[index])
                }
            }

            // Each link is followed by a line break, including the last one.
            let re = urlArr.map(url => url + '\n').join('')

            // Assign through `value`, not `innerHTML`: once the user has typed
            // into a textarea, changing its child text no longer changes what
            // is displayed or what `value` returns.
            text.value = re
        }

        /**
         * Click handler of the confirm button.
         *
         * Validates the request interval and the link list, normalises the
         * links and hands them (JSON-encoded) together with the interval to
         * `send`. Shows an alert and stops when a check fails.
         */
        function sendCode() {
            var isnull = /^\s*$/
            // One or more digits and nothing else; this rejects empty input,
            // whitespace and any non-digit character in a single test.
            var isNum = /^[0-9]+$/

            // Read the interval entered by the user.
            var timerValue = timer.value
            if (!isNum.test(timerValue)) {
                // A plain built-in alert is used here instead of a UI framework.
                return alert('输入请求间隔时间(秒)尽量大于5秒 只能为数字且不能为空')
            }

            // Read the link list entered by the user.
            var textValue = text.value
            if (isnull.test(textValue)) {
                return alert('Code输入不能为空')
            }

            // One link per line: strip surrounding whitespace from every line
            // (spaces, tabs, stray carriage returns) and drop lines that end
            // up empty.
            var codes = textValue
                .split('\n')
                .map(line => line.trim())
                .filter(line => line !== '')

            if (codes.length > 20) {
                return alert('Code最大个数不能超过20个')
            }

            // Send the links as a JSON string together with the interval.
            send(JSON.stringify(codes), timerValue)
        }

        /**
         * POSTs the link list and the request interval to the service
         * listening on 127.0.0.1:8888.
         *
         * @param {string} textValue JSON-encoded array of links.
         * @param {string} timerValue Interval between two requests, in seconds.
         * @param {*} [timerHourValue] Not used; kept so the signature is unchanged.
         * @param {*} [checkCountValue] Not used; kept so the signature is unchanged.
         * @returns {jqXHR} The jQuery request object, so callers can attach
         *     further callbacks.
         */
        function send(textValue, timerValue, timerHourValue, checkCountValue) {
            return $.ajax({
                type: 'post',
                url: 'http://127.0.0.1:8888/scrapingPage',
                data: { 'code': textValue, 'time': timerValue },
                // `dataType` is deliberately not set: jQuery then infers the
                // response type from the Content-Type header, so a non-JSON
                // reply does not raise a parse error. (A lower-case `datatype`
                // key is not a jQuery option and has no effect.)
                success: function (result) {
                    // Nothing is done with the response yet.
                },
                error: function (jqXHR, textStatus, errorThrown) {
                    // Report failures instead of swallowing them silently.
                    console.error('scrapingPage request failed:', textStatus, errorThrown)
                }
            })
        }
    </script>
</body>

</html>

 If you have any questions, please leave a message in the comment area and I will answer them one by one.

Guess you like

Origin blog.csdn.net/qq_45104282/article/details/127669095