Python爬虫之路-爬取在线课程并保存到Excel

网易云课堂 :study.163.com
输入 python 关键字 搜索
显示全部python课程
本次实现 :提取课程信息并保存到Excel

使用xlsxwriter模块实现该功能。
使用前一定要导入该模块:

import  xlsxwriter

如果要将数据写入到Excel,首先需要创建一个Excel文件,然后在Excel中创建sheet,最后在sheet中写入数据。下面分别介绍实现方法:

1)创建Excel。代码如下:

workbook = xlsxwriter.Workbook("网易云课堂Python课程数据.xlsx") # 创建excel

第一个参数就是Excel名称。

2)使用workbook对象创建sheet。代码如下:

worksheet = workbook.add_worksheet("first_sheet") # 创建sheet

第一个参数就是sheet名称。

3)写入数据。代码如下:

worksheet.write(0, 0, '商品ID')
worksheet.write(0, 1, '课程ID')
worksheet.write(0, 2, '商品名称')
worksheet.write(0, 3, '商品类型')
worksheet.write(0, 4, '机构名称')
worksheet.write(0, 5, '评分')

上述代码中,worksheet.write(0, 0, '商品ID')的第一个参数表示行(从0开始),第二个参数表示列(从0开始),第三个参数是该单元格的内容。

4)关闭Excel。代码如下:

workbook.close()  # 关闭excel写入

运行程序后,使用xlsxwriter模块生成的Excel表格如下图所示:

在这里插入图片描述
实例代码

import requests
import xlsxwriter

def get_json(index):
    """Fetch one page of "python" course search results as decoded JSON.

    Args:
        index: Page number sent to the API as ``pageIndex``. The script's
            entry point in this file starts paging at 1.

    Returns:
        The decoded response dict when the request succeeds and the response
        reports ``code == 0``; otherwise ``None``.
    """
    url = "https://study.163.com/p/search/studycourse.json"
    # Search parameters posted as the JSON request body.
    payload = {
        "activityId": 0,
        "keyword": "python",
        "orderType": 5,
        "pageIndex": index,
        "pageSize": 50,
        "priceType": -1,
        "qualityType": 0,
        "relativeOffset": 0,
        "searchTimeType": -1,
    }
    # Request headers sent along with the JSON body.
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "origin": "https://study.163.com",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
    }

    try:
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.post(url, json=payload, headers=headers, timeout=10)
        response.raise_for_status()
        content = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP failures and non-JSON bodies end up here. Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        print("出错了", exc)
        return None

    # Only a response that explicitly reports code 0 is treated as success.
    if isinstance(content, dict) and content.get("code") == 0:
        return content
    return None

def get_course(content):
    """Extract the list of course records from a search response.

    Args:
        content: Decoded response dict as returned by ``get_json``, or
            ``None`` when the request failed.

    Returns:
        The value stored under ``content["result"]["list"]``. An empty list
        is returned when ``content`` is empty/``None`` or when ``result`` or
        ``list`` is missing or null, so callers can always iterate the result.
    """
    if not content:
        return []
    result = content.get("result") or {}
    return result.get("list") or []


# Course fields written to each row, in column order (column 0 first).
_COURSE_FIELDS = (
    "productId",
    "courseId",
    "productName",
    "provider",
    "score",
    "learnerCount",
    "lectorName",
    "originalPrice",
    "discountPrice",
    "bigImgUrl",
    "description",
)


def save_excel(course_list, page_index=None):
    """Write one page of course records into the module-level ``worksheet``.

    Row 0 is left for the header, so page 1 fills rows 1-50, page 2 fills
    rows 51-100, and so on: ``row = 50 * (page - 1) + position + 1``.

    Args:
        course_list: Iterable of course dicts for a single page.
        page_index: 1-based page number the records belong to. When omitted,
            the module-level ``index`` variable is used, which keeps the
            older one-argument call style working.
    """
    if page_index is None:
        # Legacy behaviour: rely on the loop variable of the calling script.
        page_index = index

    page_size = 50  # must match the "pageSize" requested in get_json
    first_row = page_size * (page_index - 1) + 1

    for offset, course in enumerate(course_list):
        row = first_row + offset
        for column, field in enumerate(_COURSE_FIELDS):
            # .get() yields None for a missing field, so an incomplete record
            # does not abort the rest of the page with a KeyError.
            worksheet.write(row, column, course.get(field))


def main(index):
    """Fetch page ``index`` of the search results and write it to the sheet.

    Args:
        index: 1-based page number to download and store.
    """
    content = get_json(index)          # decoded JSON, or None on failure
    if content is None:
        # A failed page is skipped; its rows simply stay empty.
        print(f"第{index}页获取失败,已跳过")
        return
    course_list = get_course(content)  # up to 50 records for this page
    save_excel(course_list, index)     # place them at the page's row offset

if __name__ == "__main__":
    # Ask for the page count before creating the workbook, so that a failed
    # first request stops with a clear message instead of a TypeError.
    first_page = get_json(1)
    if first_page is None:
        raise SystemExit("获取课程总页数失败")
    # NOTE(review): "totlePageCount" is kept exactly as the original code
    # spells it; it appears to be the API's own key name. Confirm before
    # "fixing" it.
    total_page_count = first_page["result"]["query"]["totlePageCount"]

    # Create the output workbook and its single sheet. Both names stay at
    # module level because save_excel writes to the global `worksheet`.
    workbook = xlsxwriter.Workbook("网易云课堂Python课程数据.xlsx")
    worksheet = workbook.add_worksheet("first_sheet")
    try:
        # Header row (row 0), one title per column.
        column_titles = (
            "商品id",
            "课程id",
            "课程名称",
            "机构名称",
            "评分",
            "学习人数",
            "讲师名称",
            "原价",
            "折扣价",
            "图片",
            "课程描述",
        )
        for column, title in enumerate(column_titles):
            worksheet.write(0, column, title)

        # The loop variable must stay named `index`: save_excel can read the
        # module-level `index` to work out which rows a page occupies.
        for index in range(1, total_page_count + 1):
            main(index)
    finally:
        # Always close the workbook, even if a page raised part-way through.
        workbook.close()

猜你喜欢

转载自blog.csdn.net/Yuyu920716/article/details/114175732