Batch-downloading This American Life audio transcripts

Batch download

After the download completes, the script writes one PDF per episode into the given directory, named after the episode's title. Note that pdfkit drives the wkhtmltopdf command-line tool, so that binary must be installed separately.

#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
# @Time    : 2019/11/18 下午10:48
# @Author  : yon
# @Email   : [email protected]
# @File    : day1.py 

import time

import pdfkit
import requests
from bs4 import BeautifulSoup


def gethtml(url):
    filepath = '/home/yon/Desktop/pdf/'
    # Browser-like headers so the site serves the normal transcript page.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',
        'Cache-Control': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'Referer': 'https://www.google.com/'
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    # The transcript is contained in the page's <article> element;
    # its <h1> holds the episode title, which becomes the file name.
    soup = BeautifulSoup(resp.content, "html.parser")
    article = soup.find("article")
    outfile = filepath + article.h1.text.replace(" ", "") + ".pdf"
    pdfkit.from_string(str(article), outfile)


if __name__ == '__main__':
    # gethtml("https://www.thisamericanlife.org/664/transcript")
    for number in range(665, 687):
        urltoget = "https://www.thisamericanlife.org/" + str(number) + "/transcript"
        gethtml(urltoget)
        time.sleep(10)

Merge

Merge the per-episode PDFs into a single file, with a bookmark generated for each episode (a sketch follows below).
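The merge code itself isn't reproduced in this post. Below is a minimal sketch of one way to do it with the pypdf library (the maintained successor to PyPDF2); the source directory and output path are assumptions carried over from the download script, not taken from the original post.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch: merge all per-episode PDFs into one file, adding a bookmark
# per episode. Assumes pypdf is installed and the PDFs live in the
# same directory the download script wrote to.

from pathlib import Path

from pypdf import PdfReader, PdfWriter


def merge_pdfs(src_dir, out_file):
    writer = PdfWriter()
    page_index = 0
    for pdf_path in sorted(Path(src_dir).glob("*.pdf")):
        reader = PdfReader(str(pdf_path))
        for page in reader.pages:
            writer.add_page(page)
        # Bookmark points at the first page of this episode; the title
        # is the file name, which the download step set to the episode title.
        writer.add_outline_item(pdf_path.stem, page_index)
        page_index += len(reader.pages)
    with open(out_file, "wb") as f:
        writer.write(f)


if __name__ == '__main__':
    merge_pdfs('/home/yon/Desktop/pdf/', '/home/yon/Desktop/merged.pdf')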


Reposted from www.cnblogs.com/g2thend/p/12003179.html