Daily web-scraping practice - scraping PPT templates

Foreword

The 1ppt site is a real treasure and ideal for novices who want to practice writing web crawlers: not only can you pick up PPT templates to use at work and for study, you can also practice scraping at the same time.

The site has a clear directory structure and does not do much anti-scraping; only some of the links are protected against hotlinking. Even so, crawling the whole site would take a certain amount of work.
Based on this site, I would like to achieve the following objectives:
use Python to call Thunder (Xunlei) to batch-download the site's download links, renaming the files at the same time.
Today's goal is just to get the basic idea working; I will have to find time later to implement the rest of what I need. Let's go!

'''
Scrape PPT templates, first version
version:01
author:金鞍少年
date:2020-02-25


'''
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class first_ppt():
    """Scraper that downloads the files linked from 1ppt.com tag listing pages.

    The crawl has four stages, each exposed as a method:

    1. ``get_Url`` fetches the paginated tag listing pages.
    2. ``get_page`` extracts the detail-page URLs from one listing page.
    3. ``get_Content`` fetches and parses one detail page.
    4. ``downloadYour`` saves every file linked from one detail page.

    ``fun`` chains the four stages together.
    """

    # Root used to resolve relative links found in the scraped pages.
    BASE_URL = 'http://www.1ppt.com/'
    # Extension used when the download URL does not end in a known one.
    DEFAULT_EXTENSION = '.zip'
    KNOWN_EXTENSIONS = ('.zip', '.rar', '.7z', '.ppt', '.pptx')

    def __init__(self, tag_url='http://www.1ppt.com/tags/127/', max_pages=2,
                 save_dir='./ppt', timeout=10):
        """Configure the scraper.

        Args:
            tag_url: URL of the tag listing to crawl; also sent as Referer.
            max_pages: Number of the last listing page to fetch (inclusive).
            save_dir: Directory the downloaded files are written to.
            timeout: Per-request timeout in seconds.
        """
        self.pages = 1  # number of the next listing page to fetch
        # Normalise to exactly one trailing slash so page numbers can be appended.
        self.tag_url = tag_url.rstrip('/') + '/'
        self.max_pages = max_pages
        self.save_dir = save_dir
        self.timeout = timeout
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
            "Referer": self.tag_url
        }

    @staticmethod
    def _safe_name(title):
        """Return ``title`` with characters that are invalid in file names replaced."""
        table = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})
        cleaned = title.translate(table).strip()
        return cleaned or 'untitled'

    @classmethod
    def _extension(cls, url):
        """Return the file extension of ``url`` if it is a known one, else the default."""
        ext = os.path.splitext(urlparse(url).path)[1].lower()
        return ext if ext in cls.KNOWN_EXTENSIONS else cls.DEFAULT_EXTENSION

    # Fetch the listing pages
    def get_Url(self):
        """Yield one parsed listing page per page number up to ``max_pages``.

        Pages that fail to load (network error or non-200 status) are reported
        and skipped instead of aborting the crawl.
        """
        while self.pages <= self.max_pages:
            url = self.tag_url + str(self.pages) + '/'
            # Advance first so a failing page can never be retried forever.
            self.pages += 1
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
            except requests.RequestException:
                print('链接失败!')
                continue
            if response.status_code == 200:
                # Same decoding as get_Content, so both stages parse consistently.
                response.encoding = response.apparent_encoding
                yield BeautifulSoup(response.text, 'html.parser')
            else:
                print('链接失败!')

    # Extract the detail-page URLs
    def get_page(self, pages):
        """Yield the absolute URL of every ``a.preview`` link in a listing page.

        Args:
            pages: Parsed listing page (as yielded by ``get_Url``).
        """
        for anchor in pages.find_all('a', class_="preview"):
            href = anchor.get('href')
            if not href:
                continue  # anchor without a target: nothing to follow
            # urljoin handles both "/x.html" and "x.html" without doubling slashes.
            yield urljoin(self.BASE_URL, href)

    # Fetch a detail page
    def get_Content(self, page_url):
        """Yield the parsed detail page at ``page_url`` (nothing if it fails to load).

        Args:
            page_url: Absolute URL of a detail page (as yielded by ``get_page``).
        """
        try:
            res = requests.get(page_url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException:
            print('链接失败!')
            return
        if res.status_code == 200:
            # Let requests guess the charset from the body before decoding.
            res.encoding = res.apparent_encoding
            yield BeautifulSoup(res.text, 'html.parser')

    # Download and save
    def downloadYour(self, Content_page):
        """Download every file linked from the page's ``ul.downurllist`` element.

        Files are written to ``save_dir`` and named after the page title. The
        first saved file is ``<title><ext>``; any further ones get a ``_2``,
        ``_3``, ... suffix so they do not overwrite each other.

        Args:
            Content_page: Parsed detail page (as yielded by ``get_Content``).
        """
        title_tag = Content_page.title
        raw_title = title_tag.string if title_tag is not None and title_tag.string else ''
        title = self._safe_name(raw_title.replace(' - 第一PPT', ''))

        link_list = Content_page.find('ul', class_="downurllist")
        if link_list is None:
            print('未找到下载链接: %s' % title)
            return

        os.makedirs(self.save_dir, exist_ok=True)
        saved = 0
        for anchor in link_list.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            file_url = urljoin(self.BASE_URL, href)
            try:
                # Send the same headers as the page requests (includes the Referer).
                response = requests.get(file_url, headers=self.headers, timeout=self.timeout)
            except requests.RequestException as exc:
                print('下载 %s 失败: %s' % (file_url, exc))
                continue
            if response.status_code != 200:
                print('下载 %s 失败: %s' % (file_url, response.status_code))
                continue
            saved += 1
            # NOTE(review): the links may be mirrors of one file - confirm; until
            # then every successful download is kept under its own name.
            suffix = '' if saved == 1 else '_%d' % saved
            file_path = os.path.join(self.save_dir, title + suffix + self._extension(file_url))
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print('下载 %s %s 成功' % (file_url, title))

    # Main workflow
    def fun(self):
        """Run the whole crawl: listing pages -> detail pages -> downloads."""
        for page_html in self.get_Url():
            for page_url in self.get_page(page_html):
                for content_page in self.get_Content(page_url):
                    self.downloadYour(content_page)

if __name__ == '__main__':
    # Script entry point: build the scraper and run the complete crawl.
    scraper = first_ppt()
    scraper.fun()
Published 46 original articles · won praise 37 · views 4517

Guess you like

Origin blog.csdn.net/weixin_42444693/article/details/104507405