Node.js 入门(一):爬虫

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014744118/article/details/85565386
var https = require('https')

var cheerio = require('cheerio')

// Prefix of every course page URL; a course id is appended to it per request.
var baseUrl = 'https://www.imooc.com/learn/'

// URL of a single course page. NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
var url = 'https://www.imooc.com/learn/348'

// Ids of the courses whose pages are fetched.
var videoIds = [348, 259, 197, 134, 75]



/**
 * Extracts the course title, learner count and chapter/video listing from a
 * course page.
 *
 * @param {string} html - Markup of a course page; it is handed to cheerio.load.
 * @returns {{title: string, number: number, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}}
 *   `number` is NaN when the learner-count text does not start with digits;
 *   a video's `id` is undefined when its link has no href or the href
 *   contains no 'video/' segment.
 */
function filterChapters(html) {
    var $ = cheerio.load(html)

    var coursesData = {
        // NOTE(review): 'hd h2' matches an <hd> element, not a class; this may
        // have been meant as '.hd h2' — confirm against the page markup.
        title: $('hd h2').text(),
        // Explicit radix so the count is always read as decimal.
        number: parseInt($('.js-learn-num').text(), 10),
        videos: []
    }

    $('.chapter').each(function () {
        var chapter = $(this)
        var chapterData = {
            chapterTitle: chapter.find('.chapter-description').text(),
            videos: []
        }

        chapter.find('.video').children('li').each(function () {
            var video = $(this).find('.J-media-item')
            // attr() yields undefined when the link is missing or has no href;
            // guard so one malformed entry does not abort the whole page.
            var href = video.attr('href')
            var id = typeof href === 'string' ? href.split('video/')[1] : undefined

            chapterData.videos.push({
                title: video.text(),
                id: id
            })
        })

        coursesData.videos.push(chapterData)
    })

    return coursesData
}



/**
 * Prints a summary line for every course, followed by each course's full
 * chapter and video listing, to the console.
 *
 * @param {Array<{title: string, number: number, videos: Array}>} coursesData
 *   Course records shaped like the return value of filterChapters.
 */
function printCourseInfo(coursesData) {
    coursesData.forEach((courseData) => {
        // The whole message must be built inside the call; concatenating
        // after console.log(...) only logs the number and discards the rest.
        console.log(courseData.number + '人学过' + courseData.title + '\n')
    })

    coursesData.forEach((courseData) => {
        console.log('###' + courseData.title + '\n')
        courseData.videos.forEach((chapter) => {
            console.log(chapter.chapterTitle + '\n')
            chapter.videos.forEach((video) => {
                console.log('【' + video.id + '】' + video.title + '\n')
            })
        })
    })
}



/**
 * Downloads a page over HTTPS and resolves with its body as text.
 *
 * @param {string} url - Address passed to https.get.
 * @returns {Promise<string>} Resolves with the full response body once the
 *   response ends; rejects with the error emitted by the request or the
 *   response stream. The HTTP status code is not inspected.
 */
function getPageAsync(url) {
    return new Promise(function (resolve, reject) {
        console.log("正在爬取" + url)

        https.get(url, function (res) {
            var html = ''

            // Decode as UTF-8 at the stream level so a multi-byte character
            // split across two chunks is not corrupted by per-chunk
            // Buffer-to-string conversion.
            res.setEncoding('utf8')
            res.on('data', function (data) {
                html += data
            })
            res.on('end', function () {
                resolve(html)
            })
            res.on('error', reject)
        }).on('error', function (e) {
            console.log('获取出错')
            // The handler must declare the error parameter; referencing an
            // undeclared `e` would throw instead of rejecting.
            reject(e)
        })
    })
}

// Start one download per course id; every request is issued immediately and
// the resulting promises are collected in videoIds order.
var fetchCourseArray = videoIds.map(function (id) {
    return getPageAsync(baseUrl + id)
})



// Once every page has been downloaded, parse each one, order the courses by
// learner count (most learners first) and print the result.
Promise.all(fetchCourseArray).then(function (pages) {
    var courseData = pages.map(function (html) {
        return filterChapters(html)
    })

    // A sort comparator must return a negative/zero/positive number; the
    // boolean `a.number < b.number` never signals "a before b", so the
    // resulting order was engine-dependent. NaN counts are ranked as 0.
    courseData.sort((a, b) => {
        return (b.number || 0) - (a.number || 0)
    })

    printCourseInfo(courseData)
}).catch(function (err) {
    // Without a handler a failed download or parse error surfaces only as an
    // unhandled rejection.
    console.error(err)
    process.exitCode = 1
})

猜你喜欢

转载自blog.csdn.net/u014744118/article/details/85565386