python爬取极客学院课程信息实例

通过正则表达式和requests库来爬取极客学院课程的名称、
课程简介、课程时长、课时数和上课人数。

以下为python3编写的代码

import requests
import string
import re

# HTTP request headers sent with every page fetch: a browser-like User-Agent.
kv = {
    'user-agent': 'Mozilla/5.0',
}

# 爬虫类,用来实现各种功能
class spider(object):
    """Regex-based scraper for paginated course-listing pages.

    The methods are independent steps of one pipeline: build the page URLs
    (``changepage``), download a page (``getsource``), split it into
    per-course HTML fragments (``geteveryclass``), extract the fields of one
    fragment (``getinfo``) and append the records to ``info.txt``
    (``saveinfo``).
    """

    # Patterns are compiled once and written as raw strings so that ``\d`` is
    # a regex escape rather than an (invalid) string escape.
    _PAGE_NUM = re.compile(r'pageNum=(\d+)')
    _COURSE_ITEM = re.compile(r'<li id=.*?</li>', re.S)
    _TITLE = re.compile(r'class="lessonimg" title=(.*?) alt=', re.S)
    _INTRO = re.compile(
        r'<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>', re.S)
    _TIME_AND_LEVEL = re.compile(r'<em>(.*?)</em>', re.S)
    _LEARN_NUM = re.compile(r'<em class="learn-number">(.*?)</em>', re.S)

    def __init__(self):
        print('开始爬取内容......')

    @staticmethod
    def _first(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, or ''."""
        match = pattern.search(text)
        return match.group(1) if match else ''

    def getsource(self, url, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up; without
                a timeout ``requests.get`` may block indefinitely.

        Returns:
            The decoded response body.
        """
        response = requests.get(url, headers=kv, timeout=timeout)
        return response.text

    def changepage(self, url, total_page):
        """Build the list of page URLs from the page in *url* up to *total_page*.

        Args:
            url: A listing URL containing a ``pageNum=<n>`` parameter.
            total_page: Number of the last page to include (inclusive).

        Returns:
            One URL per page, in ascending page order. Empty when the page
            number in *url* is already greater than *total_page*.

        Raises:
            ValueError: If *url* contains no ``pageNum=<n>`` parameter.
        """
        match = self._PAGE_NUM.search(url)
        if match is None:
            raise ValueError('url has no pageNum parameter: %r' % (url,))
        now_page = int(match.group(1))
        # The 4th positional argument of re.sub() is ``count``, not ``flags``;
        # using the compiled pattern's sub() avoids capping the replacements.
        return [
            self._PAGE_NUM.sub('pageNum=%s' % page, url)
            for page in range(now_page, total_page + 1)
        ]

    def geteveryclass(self, source):
        """Split page HTML into one ``<li id=...>...</li>`` fragment per course.

        Args:
            source: HTML text of a listing page.

        Returns:
            List of matching HTML fragments (possibly empty).
        """
        return self._COURSE_ITEM.findall(source)

    def getinfo(self, eachclass):
        """Extract the fields of one course from its HTML fragment.

        Args:
            eachclass: HTML fragment as returned by ``geteveryclass``.

        Returns:
            Dict with the keys ``title``, ``content``, ``classtime``,
            ``classlevel`` and ``learnnum``. A field whose markup is not
            found is set to ``''`` instead of raising, so one unexpected
            ``<li>`` element does not abort the whole crawl.
        """
        info = {}
        info['title'] = self._first(self._TITLE, eachclass)
        info['content'] = self._first(self._INTRO, eachclass).strip()
        # Duration and level are the first two bare <em> tags of the fragment;
        # pad so that a fragment with fewer tags yields empty strings.
        time_and_level = self._TIME_AND_LEVEL.findall(eachclass) + ['', '']
        info['classtime'] = time_and_level[0]
        info['classlevel'] = time_and_level[1]
        info['learnnum'] = self._first(self._LEARN_NUM, eachclass)
        return info

    def saveinfo(self, classinfo):
        """Append every record in *classinfo* to ``info.txt`` (UTF-8).

        Args:
            classinfo: Iterable of dicts as returned by ``getinfo``.
        """
        # ``with`` guarantees the file is closed even if a record is malformed.
        with open('info.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                f.write('title:' + each['title'] + '\n')
                f.write('content:' + each['content'] + '\n')
                f.write('classtime:' + each['classtime'] + '\n')
                # NOTE: the 'classleve:' label spelling is kept unchanged so
                # that new records stay consistent with existing info.txt files.
                f.write('classleve:' + each['classlevel'] + '\n')
                f.write('learnnum:' + each['learnnum'] + '\n\n\n')


# Script entry point: crawl listing pages 1..20 and append every course record
# to info.txt, one page at a time.
if __name__ == '__main__':
    classinfo = []              # all course records collected during this run
    url = 'https://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    all_links = jikespider.changepage(url, 20)
    for link in all_links:
        print('正在处理页面,' + link)
        html = jikespider.getsource(link)
        page_info = [
            jikespider.getinfo(each)
            for each in jikespider.geteveryclass(html)
        ]
        # info.txt is opened in append mode, so write only this page's
        # records; writing the cumulative list on every iteration would
        # duplicate all earlier pages in the file.
        jikespider.saveinfo(page_info)
        classinfo.extend(page_info)

猜你喜欢

转载自blog.csdn.net/qq_40258748/article/details/88021756