Python3 crawler with urllib + bs4 (3): batch-crawl MOOC's free courses and their detailed chapter information

This crawler collects course information, and you can customize the range of course IDs it searches.
First of all, it does not crawl course video content; it only crawls the course title and chapter information.
Currently (as of 2019/10/15) MOOC has only 1189 free courses, and any ID beyond that returns 404, so enter 0 for the first input and 1189 for the second.
The complete code is given below. It may look complicated, but it is easy to follow after careful analysis. If the copied code fails to run, the most likely reason is that tabs and 4-space indentation got mixed when the article was edited; just correct the indentation as the interpreter prompts you.

from urllib import request
from bs4 import BeautifulSoup
import time
import sys
import re
import os

def mkdir(path):
    # strip leading/trailing whitespace
    path = path.strip()
    # strip a trailing \ character
    path = path.rstrip("\\")
    # does the path already exist?
    isExists = os.path.exists(path)
    if not isExists:
        # create the directory if it does not exist
        os.makedirs(path)
        print('Created directory:', path)
        return True
    else:
        print('Directory already exists:', path)
        return False
        
# Replace characters that are illegal in file names with spaces.
# Run this on every file name; an illegal character in the name raises OSError
# (e.g. course 343).
def validateTitle(title):
    # \t and \n were added because course 1166 ends with \t, which aborted the program
    rstr = r"[\/\\\:\*\?\"\<\>\|\t\n]"  # / \ : * ? " < > |
    # replace with a space
    new_title = re.sub(rstr, " ", title)
    return new_title
# An unhandled exception during crawling aborts the program, so these lists are
# written in append ('a') mode: existing data is not overwritten when the
# program is run again after the exception has been dealt with.
def get_not_found_list(nflist):
    f = open("courses_not_found.txt", 'a')
    for l in nflist:
        f.write(l)
        f.write("\n")
    f.write("---------------------------------------------------------\n")
    f.close()

def get_not_available_list(nalist):
    f = open("courses_taken_down.txt", 'a')
    for l in nalist:
        f.write(l)
        f.write("\n")
    f.write("---------------------------------------------------------\n")
    f.close()

def get_available_list(alist):
    f = open("courses_available.txt", 'a')
    for l in alist:
        f.write(l)
        f.write("\n")
    f.write("---------------------------------------------------------\n")
    f.close()

# Build the request headers. Without them urllib sends its default User-Agent
# (Python-urllib), which tells the server outright that a crawler is visiting.
# If a request fails, it is usually because the headers are missing parameters
# the site requires. Use Inspect Element (F12) or a packet-capture tool to see
# what the target page needs; this example needs nothing extra, so only the
# User-Agent is set.
def get_headers():
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'}
    return headers
    
def spider_active(begin, end, alist):
    headers = get_headers()
    for i in range(begin, end):
        i += 1
        lesson_id = str(i)
        url = basic_url + lesson_id + "/"
        print("Looking up:", url)
        req = request.Request(url=url, headers=headers)
        try:
            # fetch the page (pass the Request object so the headers are actually sent)
            content = request.urlopen(req)
        except Exception as e:
            print(e)
            # urlopen may raise an HTTPError, e.g. 404
            not_found_list.append(lesson_id)
            time.sleep(1)
            continue
        encode_html = content.read()
        html = encode_html.decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")

        tip = soup.find_all("div", "tip")
        # the tip div appears when the course has been taken down
        if len(tip) != 0:
            not_available_list.append(lesson_id)
            print("This course has been taken down")
            time.sleep(1)
            continue

        title = soup.find_all('h2', 'l')
        t = title[0].get_text()
        # record the course that is currently available
        alist.append(lesson_id + t)
        # encoding="utf-8" avoids an encoding error (course 30 fails without it)
        # zfill(5) left-pads the course ID with zeros so the files sort nicely in the folder view
        # validateTitle replaces illegal file-name characters with spaces
        # a space between the course ID and the course title avoids confusion
        # when the title itself starts with a digit (e.g. courses 693 and 707)
        f = open("lessons\\" + lesson_id.zfill(5) + " "
                 + validateTitle(t) + ".txt", 'w', encoding="utf-8")

        f.write(url)
        f.write("\n\n")
        chapters = soup.find_all("a", "J-media-item")

        for c in chapters:
            f.write(c.get_text().replace(' ', '').replace("开始学习", '').replace('\n', ''))
            f.write("\n")
        f.close()
        print("Write complete")
        # sleep for 1 second
        time.sleep(1)


if __name__ == "__main__":

    # courses that were not found (404)
    not_found_list = []
    # courses that have been taken down
    not_available_list = []
    # courses that can currently be accessed normally
    available_list = []
    basic_url = "https://www.imooc.com/learn/"
    # crawl the courses whose IDs range from begin+1 to end
    # start of the crawl range
    begin = input("Enter the starting course ID for the crawl (an integer): ")
    begin = int(begin)
    # end of the crawl range
    end = input("Enter the ending course ID for the crawl (an integer): ")
    end = int(end)
    if begin < 0:
        begin = 0
    if begin > end:
        print("The start value must be less than or equal to the end value")
        sys.exit(0)
    # directory to create
    mkpath = "lessons"
    # create it
    try:
        mkdir(mkpath)
    except Exception as e:
        print(e)
        print("Check whether the path has one of the following problems:")
        print("1. It contains an illegal character: / \\ : * ? \" < > |")
        print("2. The absolute path is longer than 255 characters")
        print("3. Very unlikely: the file system's limit on the number of directories has been reached")
    try:
        spider_active(begin, end, available_list)
    except Exception as e:
        print(e)
        # on an unexpected exception, write out the lists and then exit
        # (a deliberate user abort is not handled here); after fixing the bug,
        # adjust begin manually to resume from the point where the crawl broke off
        get_not_found_list(not_found_list)
        get_not_available_list(not_available_list)
        get_available_list(available_list)
        sys.exit(1)
    # also write out the lists after a normal, complete run
    get_not_found_list(not_found_list)
    get_not_available_list(not_available_list)
    get_available_list(available_list)
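
A typical run, using the 0 to 1189 range mentioned above, looks roughly like this (a sketch; the file name mooc_spider.py is just an example):

python mooc_spider.py
Enter the starting course ID for the crawl (an integer): 0
Enter the ending course ID for the crawl (an integer): 1189
Looking up: https://www.imooc.com/learn/1/
...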

That was the complete code. Next, it is broken down function by function so that each part is easier to understand.

# Helper that creates a directory. It is not the focus of this article; see the comments in the code.
def mkdir(path):
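
For reference, on Python 3 the same check-then-create behaviour can also be had in a single call (a sketch, not part of the original script):

import os

# exist_ok=True makes makedirs a no-op if the directory already exists
os.makedirs("lessons", exist_ok=True)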

The for loop that fetches each course page:

# base URL
basic_url = "https://www.imooc.com/learn/"
i += 1
lesson_id = str(i)
# build the course URL
url = basic_url + lesson_id + "/"
# build the request with our headers
req = request.Request(url=url, headers=headers)
# fetch the page content (pass the Request object so the headers are sent)
content = request.urlopen(req)
encode_html = content.read()
html = encode_html.decode("utf-8")
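
The decode step assumes the page is UTF-8, which holds for imooc.com. If you would rather not hard-code that, the charset the server declares can be read from the response headers instead (a variant sketch; it falls back to utf-8 when no charset is declared):

charset = content.headers.get_content_charset() or "utf-8"
html = encode_html.decode(charset)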

Use bs4 to parse the page content:

soup = BeautifulSoup(html, "html.parser")
# find all div tags whose class is "tip"; this tag appears when the course has been taken down
tip = soup.find_all("div", "tip")
# a non-empty result means the course has been taken down
if len(tip) != 0:
    not_available_list.append(lesson_id)
    print("This course has been taken down")
    time.sleep(1)
    # skip this iteration
    continue
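
For clarity: passing a plain string as the second argument of find_all filters on the CSS class, so find_all("div", "tip") is shorthand for find_all("div", class_="tip"). A minimal, self-contained illustration (the HTML snippet is made up):

from bs4 import BeautifulSoup

# made-up fragment resembling the "course taken down" tip
html = '<div class="tip">该课程已下架</div>'
soup = BeautifulSoup(html, "html.parser")
print(len(soup.find_all("div", "tip")))          # 1 -> the tip is present
print(len(soup.find_all("div", class_="tip")))   # same result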

When a course does not exist, the request raises an HTTPError (e.g. 404); in that case the loop iteration is skipped:

try:
    content = request.urlopen(req)
except Exception as e:
    print(e)
    continue
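
The broad except Exception above mainly catches urllib.error.HTTPError, the 404 raised for a non-existent course. A narrower variant could distinguish the cases explicitly (a sketch; try_fetch is not part of the original script):

from urllib import request, error

def try_fetch(req):
    try:
        return request.urlopen(req)
    except error.HTTPError as e:   # e.g. 404 when the course ID does not exist
        print("HTTP error:", e.code)
    except error.URLError as e:    # DNS failures, connection problems, etc.
        print("URL error:", e.reason)
    return None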

If the course exists and has not been taken down, write the extracted information to a file:

title = soup.find_all('h2', 'l')
t = title[0].get_text()
# add the course that currently exists to the alist list
alist.append(lesson_id + t)
# encoding="utf-8" avoids an encoding error (course 30 fails without it)
# zfill(5) left-pads the course ID with zeros so the files sort nicely in the folder view
# validateTitle replaces illegal file-name characters with spaces
# a space between the course ID and the course title avoids confusion
# when the title itself starts with a digit (e.g. courses 693 and 707)
f = open("lessons\\" + lesson_id.zfill(5) + " "
         + validateTitle(t) + ".txt", 'w', encoding="utf-8")
f.write(url)
f.write("\n\n")
# the a tags with class J-media-item contain the chapter names
chapters = soup.find_all("a", "J-media-item")
for c in chapters:
    # strip spaces, newlines and the "开始学习" (start learning) label
    f.write(c.get_text().replace(' ', '').replace("开始学习", '').replace('\n', ''))
    f.write("\n")
f.close()
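
A minimal illustration of the get_text() clean-up chain, using a made-up anchor that mimics a chapter entry ("开始学习" is the "start learning" label on the real page):

from bs4 import BeautifulSoup

html = '<a class="J-media-item"> 第1章 课程介绍\n 开始学习 </a>'   # made-up fragment
soup = BeautifulSoup(html, "html.parser")
c = soup.find_all("a", "J-media-item")[0]
print(c.get_text().replace(' ', '').replace("开始学习", '').replace('\n', ''))
# prints: 第1章课程介绍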

validateTitle() replaces the illegal characters in a course name; without it, an error is raised when course 343 is crawled:

validateTitle(t)
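
For example, with a hypothetical title containing illegal characters, the generated file name looks like this:

lesson_id = "343"
t = "Some/Course:Title"                  # hypothetical course title
name = "lessons\\" + lesson_id.zfill(5) + " " + validateTitle(t) + ".txt"
print(name)   # lessons\00343 Some Course Title.txt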

Everything else is a matter of detail; read the comments slowly and you will understand it. If the comments are not enough, please read the previous articles in this series.
This program only crawls chapter information, and MOOC has no crawler-detection mechanism (as of 2019/10/15 testing still shows none), so it would work even without forged headers and sleep calls, but please consider the load on the server anyway. The purpose of this program is to let you quickly review the information of all free courses and find the ones you want to learn that may not show up in search, instead of checking courses one by one to see whether each exists and whether it is what you want.
The running effect is shown in screenshots in the original post (omitted here): the console prints each URL as it is looked up, and opening one of the generated txt files shows the detailed chapter information of that course.


Origin blog.csdn.net/qq_36376711/article/details/102571508