想通过听实时新闻来提高英语听力,学了那么多年的英语,不能落下啊,不然就白背了那么多年的单词。
NPR News是美国国家公共电台(National Public Radio)的新闻节目,发音纯正,音频每日更新,以美国新闻为主,世界新闻为辅,比如最近我国武汉发生的新型冠状病毒肺炎,每天的新闻都会涉及China,Wuhan,Coronavirus等词。
废话不多说,直接上代码:
"""Download the daily NPR News audio clips as MP3 files.

The script fetches NPR's recommendation feed, extracts the title, date and
MP3 link of every audio item with a regular expression, and saves each clip
as ``downloads/<yyyymmdd>_<title>.mp3``. Clips whose file name already exists
in the download folder are skipped. Failures are appended to
``npr_news_error.txt`` instead of aborting the whole run.
"""
__author__ = "jayson"

import os
import re
import sys
import time
import traceback
from multiprocessing import Pool

import requests

# Feed that lists the daily updated audio items.
FEED_URL = 'https://www.npr.org/proxy/listening/v2/recommendations?channel=cleplayer'
DOWNLOAD_DIR = 'downloads'
ERROR_LOG = 'npr_news_error.txt'
# Seconds to wait for the server; without a timeout a stalled connection
# would block a worker (and therefore pool.join()) forever.
REQUEST_TIMEOUT = 30

# Captures (title, date, href) for every audio item in the raw feed text.
# The href group still carries its leading quote and JSON-escaped slashes.
_AUDIO_PATTERN = re.compile(
    r'title":"(.*?)".*?"date":"(.*?)T.*?audio\\\/mp3","href":(.*?mp3)\?',
    re.S,
)

# Characters that Windows does not allow in file names, mapped to harmless
# look-alikes. Backslashes left over from JSON escaping are dropped.
_FILENAME_TABLE = str.maketrans({
    '\\': None,
    '/': ' ',
    ':': '\uff1a',   # fullwidth colon
    '?': '\uff1f',   # fullwidth question mark
    '<': '《',
    '>': '》',
    '*': ' ',
    '"': "'",
    '|': ' ',
})


def _clean_title(raw_title):
    """Convert a raw, JSON-escaped feed title into a safe file-name stem."""
    title = raw_title.replace('\\/', ' ')
    # Resolve \uXXXX escapes.
    # NOTE(review): unicode_escape reads the UTF-8 bytes as Latin-1, so raw
    # (unescaped) non-ASCII characters would be garbled; this assumes the
    # feed escapes them - TODO confirm.
    title = title.encode('utf-8').decode('unicode_escape')
    title = title.translate(_FILENAME_TABLE)
    # Windows rejects names ending in a space or a dot, so strip those from
    # the last three characters.
    return title[:-3] + title[-3:].replace(' ', '').replace('.', '')


def _parse_audio_entries(feed_text):
    """Return a list of ``[title, url, date]`` for each audio item found."""
    entries = []
    for raw_title, raw_date, raw_href in _AUDIO_PATTERN.findall(feed_text):
        title = _clean_title(raw_title)
        date = raw_date.replace('-', '')
        # Drop the leading quote captured by the pattern and un-escape "\/".
        audio_url = raw_href[1:].replace('\\', '')
        entries.append([title, audio_url, date])
    return entries


def download_mp3(audios, music_index, headers, already_downloads):
    """Download one audio clip into the download folder.

    Args:
        audios: Sequence ``[title, url, date]`` describing the clip.
        music_index: Position of the clip in the feed; used only in the
            progress messages.
        headers: HTTP headers sent with the download request.
        already_downloads: Collection of file names already present in the
            download folder; a clip whose name is in it is skipped.

    Any exception is caught and appended, with a timestamp and traceback, to
    the error log so that one failing clip does not stop the others.
    """
    # Fallback label so the error handler can always name the failing item,
    # even when building the real file name is what failed.
    music_name = f'<item {music_index}>'
    try:
        title, music_url, date = audios[0], audios[1], audios[2]
        # Prefix the date so the files sort chronologically.
        music_name = date + '_' + title + '.mp3'
        if music_name in already_downloads:
            # Avoid downloading the same clip twice.
            print('已存在,跳过下载')
            return

        print(f'{music_index}下载中...')
        response = requests.get(music_url, headers=headers,
                                timeout=REQUEST_TIMEOUT)
        # Do not save an HTTP error page under an .mp3 name.
        response.raise_for_status()
        # .content is the raw bytes of the body, written out unchanged.
        with open(os.path.join(DOWNLOAD_DIR, music_name), 'wb') as f:
            f.write(response.content)
        print(f'{music_index}下载完成!')
    except Exception:
        # Record the failure in the error log and carry on.
        error_info = sys.exc_info()
        with open(ERROR_LOG, 'a', encoding='utf-8') as f:
            f.write(music_name + ',' + time.strftime("%Y-%m-%d %H:%M:%S") + ':\n')
            print(error_info[0], ':', error_info[1], '\n', file=f)
            traceback.print_tb(error_info[2], file=f)
            f.write('\n' + '=' * 50 + '\n')


def main():
    """Fetch the feed and download every clip that is not yet on disk."""
    # NOTE(review): the Referer points at an unrelated site, presumably left
    # over from another script - confirm whether it is needed at all.
    headers = {
        'Referer': 'https://item.jd.com/5239477.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    response = requests.get(FEED_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Titles, download links and dates of the audio items.
    audio_urls = _parse_audio_entries(response.text)

    # Create the folder on first run; listing a missing folder would raise.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    already_downloads = set(os.listdir(DOWNLOAD_DIR))

    # Download with a pool of worker processes. The original author notes
    # that multiprocessing cannot be used inside Jupyter on Windows; to avoid
    # it, call download_mp3 directly for each entry instead.
    with Pool(6) as pool:
        for music_index, audios in enumerate(audio_urls):
            pool.apply_async(
                download_mp3,
                (audios, music_index, headers, already_downloads),
            )
        # Leaving the with-block terminates the pool, so wait for the
        # queued downloads to finish first.
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()