使用正则表达式和requests库爬取极客学院课程的名称、
课程简介、课程时长、课时数和上课人数。
以下为Python 3编写的代码:
import requests
import string
import re
# HTTP request headers: identify the client with a browser-like User-Agent.
kv = {
    'user-agent': 'Mozilla/5.0',
}
# 爬虫类,用来实现各种功能
class spider(object):
    """Scrape course listings from paginated HTML pages with requests and regexes.

    The methods keep no state on the instance: each one receives its input
    explicitly and returns its result, so they can be used independently.
    """

    def __init__(self):
        print('开始爬取内容......')

    def getsource(self, url, timeout=10):
        """Download *url* and return the response body decoded as text.

        The module-level ``kv`` headers are sent with the request.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as a string.

        Raises:
            requests.exceptions.RequestException: if the connection fails or
                the server does not answer within *timeout* seconds.
        """
        # Without a timeout a stalled server would block the crawl forever.
        html = requests.get(url, headers=kv, timeout=timeout)
        return html.text

    def changepage(self, url, total_page):
        """Build the URLs of every page from the one in *url* up to *total_page*.

        Args:
            url: A listing URL containing a ``pageNum=<digits>`` parameter;
                its number is the first page of the result.
            total_page: Number of the last page to include (inclusive).

        Returns:
            A list of URLs, one per page, in ascending page order. The list
            is empty when the page in *url* is already past *total_page*.

        Raises:
            AttributeError: if *url* has no ``pageNum=<digits>`` parameter.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
        # No flags are needed here (the pattern has no "."), and the fourth
        # positional parameter of re.sub is ``count``, not ``flags``.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            for i in range(now_page, total_page + 1)
        ]

    def geteveryclass(self, source):
        """Split a listing page into the HTML fragments of its courses.

        Args:
            source: HTML text of one listing page.

        Returns:
            A list with every ``<li id=...>...</li>`` fragment found in
            *source*, in document order; fragments may span several lines.
        """
        return re.findall(r'<li id=.*?</li>', source, re.S)

    def getinfo(self, eachclass):
        """Extract the fields of one course from its ``<li>`` fragment.

        Args:
            eachclass: HTML fragment of a single course.

        Returns:
            A dict with the string values ``title``, ``content``,
            ``classtime``, ``classlevel`` and ``learnnum``. The title is
            captured verbatim between ``title=`` and `` alt=``, so quote
            characters present in the markup are kept.

        Raises:
            AttributeError: if the title, description or learner-count
                markup is not present in *eachclass*.
            IndexError: if fewer than two bare ``<em>`` elements are found.
        """
        title = re.search(
            r'class="lessonimg" title=(.*?) alt=', eachclass, re.S
        ).group(1)
        description = re.search(
            r'<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>',
            eachclass,
            re.S,
        ).group(1)
        # Duration and level are both plain <em> elements, in that order;
        # the learner count uses <em class=...> and is not matched here.
        timeandlevel = re.findall(r'<em>(.*?)</em>', eachclass, re.S)
        learnnum = re.search(
            r'<em class="learn-number">(.*?)</em>', eachclass, re.S
        ).group(1)
        return {
            'title': title,
            'content': description.strip(),
            'classtime': timeandlevel[0],
            'classlevel': timeandlevel[1],
            'learnnum': learnnum,
        }

    def saveinfo(self, classinfo):
        """Append every course record in *classinfo* to ``info.txt``.

        Args:
            classinfo: Iterable of dicts as returned by :meth:`getinfo`.

        Raises:
            KeyError: if a record lacks one of the expected keys.
            OSError: if ``info.txt`` cannot be opened for appending.
        """
        # "a" appends to earlier output; the context manager closes the file
        # even when a malformed record raises part-way through.
        with open('info.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                # The 'classleve:' label is kept as-is so new records match
                # the ones already present in an existing info.txt.
                f.write(
                    'title:' + each['title'] + '\n'
                    + 'content:' + each['content'] + '\n'
                    + 'classtime:' + each['classtime'] + '\n'
                    + 'classleve:' + each['classlevel'] + '\n'
                    + 'learnnum:' + each['learnnum'] + '\n\n\n'
                )
if __name__ == '__main__':
    # Crawl listing pages 1 through 20, collect the fields of every course
    # found on them, then write all records to disk.
    classinfo = []  # accumulated course records across all pages
    url = 'https://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    all_links = jikespider.changepage(url, 20)
    for link in all_links:
        print('正在处理页面,' + link)
        html = jikespider.getsource(link)
        classinfo.extend(
            jikespider.getinfo(each)
            for each in jikespider.geteveryclass(html)
        )
    # Saved once, after the crawl: saveinfo appends to the file, so calling
    # it per page with the growing list would write earlier records again.
    jikespider.saveinfo(classinfo)