从零开始学习--Python 6月30日

Python

                                                                                                                     ---小白121的记录笔记

class main_():
    """Scrape course information from paginated listing pages.

    The methods cover the whole pipeline: download a page, build the list of
    page URLs, pull the course fields out of the HTML with regular
    expressions, and append the collected records to a text file.
    """

    def get_source(self, url):
        """Download ``url`` and return the ``requests`` response object."""
        # Imported locally because the surrounding file has no import section.
        import requests

        return requests.get(url)

    def page_num(self, url, total_page):
        """Build the page URLs from the page in ``url`` up to ``total_page``.

        Args:
            url: A page URL containing a ``pageNum=<digits>`` part; its
                number is the first page of the result.
            total_page: Number of the last page to include (inclusive).

        Returns:
            A list of URLs, one per page, each equal to ``url`` with every
            ``pageNum`` value replaced by that page's number. The list is
            empty when the page number in ``url`` exceeds ``total_page``.

        Raises:
            AttributeError: If ``url`` contains no ``pageNum=<digits>`` part.
        """
        import re

        now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
        # No flags are passed: the pattern contains no ``.`` so DOTALL would
        # change nothing, and the fourth positional argument of ``re.sub`` is
        # ``count`` (a replacement limit), not ``flags``.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            for i in range(now_page, total_page + 1)
        ]

    def get_info(self, html):
        """Extract the course fields from a downloaded page.

        Args:
            html: Response-like object whose ``text`` attribute holds the
                page HTML.

        Returns:
            A dict with the keys ``'title'``, ``'content'``,
            ``'h_m_person'`` and ``'h_long'``. Each value is the first match
            of the corresponding pattern with newline and tab characters
            removed.

        Raises:
            AttributeError: If any of the patterns is not found in the page.
        """
        import re

        # (result key, pattern, regex flags). Only 'h_m_person' is matched
        # without DOTALL, so its value cannot span several lines.
        fields = (
            ('title',
             r'class="lessonimg" title="(.*?)" alt="',
             re.S),
            ('content',
             r'<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>',
             re.S),
            ('h_m_person',
             r'<em class="learn-number">(.*?)</em>',
             0),
            ('h_long',
             r'<dd class="mar-b8"><i class="time-icon"></i><em>(.*?)</em>',
             re.S),
        )

        page = html.text
        info = {}
        for key, pattern, flags in fields:
            value = re.search(pattern, page, flags).group(1)
            info[key] = value.replace('\n', '').replace('\t', '')
        return info

    def save_info(self, all_info, directory='F:\\python测试\\',
                  filename='极客学院课程info.txt'):
        """Append the collected course records to a UTF-8 text file.

        The file is opened by its full path, so the working directory of the
        process is left untouched.

        Args:
            all_info: Iterable of dicts as returned by ``get_info``.
            directory: Folder that holds the output file.
            filename: Name of the output file inside ``directory``.
        """
        import os

        path = os.path.join(directory, filename)
        # ``with`` closes the file even if a record is missing a key.
        with open(path, 'a', encoding='utf-8') as f:
            for each in all_info:
                f.write('题目为:' + each['title'] + '\n')
                f.write('内容为:' + each['content'] + '\n')
                f.write('学习人数:' + each['h_m_person'] + '\n')
                # Blank lines separate one course record from the next.
                f.write('课程时间:' + each['h_long'] + '\n\n\n\n\n')


a = main_()  # Crawler instance used for every step below.

# ``page_num`` needs a start URL that contains ``pageNum=<digits>``; it is
# read here because nothing earlier in the file defines ``url``.
url = input('please input the start url (must contain pageNum=N) : ').strip()

page_num = input('please input crawl web the page : ')  # Last page number to crawl.
page_num = int(page_num)
classinfo = []  # One dict of course fields per crawled page.
change_page = a.page_num(url, page_num)  # One URL per page to visit.

for i in change_page:
    html = a.get_source(i)
    info = a.get_info(html)
    classinfo.append(info)
    print('正在提取:%s '%i)

# Everything is written in one go after all pages have been processed.
a.save_info(classinfo)




猜你喜欢

转载自blog.csdn.net/qq_42184699/article/details/80870138