网易云课堂 :study.163.com
输入 python 关键字 搜索
显示全部python课程
本次实现 :提取课程信息并保存到Excel
使用xlsxwriter模块实现该功能。
使用前一定要导入该模块:
import xlsxwriter
如果要将数据写入到Excel,首先需要创建一个Excel文件,然后在Excel中创建sheet,最后在sheet中写入数据。下面分别介绍实现方法:
1)创建Excel。代码如下:
workbook = xlsxwriter.Workbook("网易云课堂Python课程数据.xlsx") # 创建excel
第一个参数就是Excel名称。
2)使用workbook对象创建sheet。代码如下:
worksheet = workbook.add_worksheet("first_sheet") # 创建sheet
第一个参数就是sheet名称。
3)写入数据。代码如下:
worksheet.write(0, 0, '商品ID')
worksheet.write(0, 1, '课程ID')
worksheet.write(0, 2, '商品名称')
worksheet.write(0, 3, '商品类型')
worksheet.write(0, 4, '机构名称')
worksheet.write(0, 5, '评分')
上述代码中,worksheet.write(0, 0, '商品ID')的第一个参数表示行(从0开始),第二个参数表示列(从0开始),第三个参数是写入该单元格的内容。
4)关闭Excel。代码如下:
workbook.close() # 关闭excel写入
运行程序,即可得到使用xlsxwriter模块生成的Excel表格。
实例代码
import requests
import xlsxwriter
def get_json(index):
    """Fetch one page of "python" course search results as decoded JSON.

    Sends a JSON POST request to the study.163.com course search endpoint.

    :param index: page number sent as ``pageIndex``. The script section of
        this file passes 1-based page numbers.
    :return: the decoded response when its ``code`` field equals 0;
        ``None`` when the request fails, the body is not valid JSON, or the
        response carries any other ``code``.
    """
    url = "https://study.163.com/p/search/studycourse.json"
    # Search parameters; only pageIndex varies between calls.
    payload = {
        "activityId": 0,
        "keyword": "python",
        "orderType": 5,
        "pageIndex": index,
        "pageSize": 50,
        "priceType": -1,
        "qualityType": 0,
        "relativeOffset": 0,
        "searchTimeType": -1,
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "origin": "https://study.163.com",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
    }
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.post(url, json=payload, headers=headers, timeout=10)
        content = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding failures on older requests versions.
        print("出错了", exc)
        return None
    if isinstance(content, dict) and content.get("code") == 0:
        return content
    return None
def get_course(content):
    """Extract the list of course records from a search response.

    :param content: decoded response as returned by ``get_json``; may be
        ``None`` when the fetch failed.
    :return: the value stored under ``content["result"]["list"]``, or an
        empty list when ``content`` is empty or either level is missing/null.
    """
    if not content:
        return []
    result = content.get("result") or {}
    return result.get("list") or []
def save_excel(course_list, page_index=None, sheet=None, page_size=50):
    """Write one page of course records into the worksheet.

    Row 0 is left for the header, so page ``p`` starts at row
    ``page_size * (p - 1) + 1`` (page 1 -> row 1, page 2 -> row 51, ...).

    :param course_list: iterable of course dicts for a single page.
    :param page_index: 1-based page number; defaults to the module-level
        ``index`` set by the script's page loop.
    :param sheet: object with a ``write(row, col, value)`` method; defaults
        to the module-level ``worksheet``.
    :param page_size: number of rows reserved per page.
    """
    if page_index is None:
        page_index = index  # loop variable of the script section
    if sheet is None:
        sheet = worksheet  # worksheet created by the script section

    # Column order matches the header row written by the script section.
    columns = (
        "productId",
        "courseId",
        "productName",
        "provider",
        "score",
        "learnerCount",
        "lectorName",
        "originalPrice",
        "discountPrice",
        "bigImgUrl",
        "description",
    )
    first_row = page_size * (page_index - 1) + 1
    for offset, course in enumerate(course_list):
        row = first_row + offset
        for col, key in enumerate(columns):
            # .get(): a record lacking one field yields None for that cell
            # instead of raising KeyError and aborting the whole export.
            sheet.write(row, col, course.get(key))
def main(index):
    """Fetch page ``index``, extract its courses and write them to Excel.

    A page whose fetch fails is skipped so one bad request does not abort
    the remaining pages.

    :param index: page number forwarded to ``get_json``.
    """
    content = get_json(index)
    if content is None:
        # get_json returns None on request/decoding failure or a non-zero code.
        print(f"第{index}页获取失败,已跳过")
        return
    course_list = get_course(content)
    # save_excel is called with the course list only, as in the script's
    # original flow; it locates the target rows from module-level state.
    save_excel(course_list)
if __name__ == "__main__":
    # Fetch page 1 up front to learn how many pages exist; stop early with a
    # clear message instead of a TypeError when that request fails.
    first_page = get_json(1)
    if first_page is None:
        raise SystemExit("无法获取课程总页数")
    # NOTE(review): the key spelling "totlePageCount" is kept exactly as the
    # original lookup had it; presumably it mirrors the API's field name.
    total_page_count = first_page["result"]["query"]["totlePageCount"]

    header = (
        "商品id",
        "课程id",
        "课程名称",
        "机构名称",
        "评分",
        "学习人数",
        "讲师名称",
        "原价",
        "折扣价",
        "图片",
        "课程描述",
    )

    # The context manager closes (and thereby saves) the workbook even if a
    # page raises midway. `worksheet` and `index` must stay module-level
    # names with these exact spellings: save_excel reads them as globals.
    with xlsxwriter.Workbook("网易云课堂Python课程数据.xlsx") as workbook:
        worksheet = workbook.add_worksheet("first_sheet")
        for col, title in enumerate(header):
            worksheet.write(0, col, title)
        for index in range(1, total_page_count + 1):
            main(index)