运用爬虫爬取古诗句,并播放
代码为:
from urllib import request
from urllib import parse
from bs4 import BeautifulSoup
from pygame import mixer
import sys
import time
from mutagen.mp3 import MP3
def Time_1(steps=50, delay=0.125):  # progress-bar function
    """Draw a text progress bar on stdout, one tick at a time.

    Each tick rewrites the current terminal line (via a carriage return)
    with the percentage reached so far followed by one '■' per completed
    tick, then sleeps for ``delay`` seconds.  A newline is written once the
    bar is complete.

    Args:
        steps: Number of ticks to draw.  The default of 50 advances the
            percentage by 2 per tick.
        delay: Seconds to sleep after each tick.  With the defaults the
            whole bar takes 50 * 0.125 = 6.25 seconds.
    """
    for tick in range(1, steps + 1):
        percent = tick * 100 // steps
        # '\r' moves the cursor back to column 0 so the bar redraws in place.
        sys.stdout.write('\r' + '{0}% |{1}'.format(percent, tick * '■'))
        sys.stdout.flush()
        time.sleep(delay)
    sys.stdout.write('\n')
# Non-ideograph characters (punctuation plus the newline) that Chinese() keeps.
_KEPT_SYMBOLS = frozenset(
    ['、', '\n', '·', ',', '。', ';', ':', '!', '?', '【', '】']
)


def Chinese(word):
    """Strip everything except Chinese text and basic punctuation from *word*.

    ``<br/>`` tags are first turned into newlines.  After that only
    characters in the range U+4E00..U+9FFF and the symbols listed in
    ``_KEPT_SYMBOLS`` survive; every other character (HTML markup, Latin
    letters, digits, spaces, ...) is dropped.

    Args:
        word: Text to filter; the script passes serialized HTML fragments.

    Returns:
        The filtered string, possibly empty.
    """
    # NOTE(review): the kept commas/colons/marks are the ASCII forms; confirm
    # whether the full-width forms used in Chinese text should be kept too.
    word = word.replace('<br/>', '\n')
    return ''.join(
        ch for ch in word
        if '\u4e00' <= ch <= '\u9fff' or ch in _KEPT_SYMBOLS
    )
def Chineses(word):
    """Count the Chinese characters in *word* (the length of a poem).

    Only characters in the range U+4E00..U+9FFF are counted; punctuation,
    newlines and any other characters are ignored.

    Args:
        word: Text to inspect.

    Returns:
        The number of matching characters as an int (0 for empty input).
    """
    return sum(1 for ch in word if '\u4e00' <= ch <= '\u9fff')
# ---------------------------------------------------------------------------
# Script body: search so.gushiwen.org for poems whose title matches a keyword,
# print the results, then download and play the recitation of one chosen poem
# while printing its text character by character.
# ---------------------------------------------------------------------------

keyword = input('请输入一个关键字:')
# URL-encode the keyword so it can be embedded in the query string.
key_word = parse.quote_plus(keyword)

# 1 = show every result page, 2 = show only the first page.
try:
    cishu = int(input('是否想看全部相关诗句:(1是,2否)'))
except ValueError:
    cishu = 0  # not a number; rejected by the check just below
if cishu not in (1, 2):
    print('你的输入有问题,程序已经自动退出了!')
    sys.exit()

# Fetch the first result page only to learn how many pages exist.
url = 'https://so.gushiwen.org/search.aspx?type=title&page=1&value=' + key_word
with request.urlopen(url=url) as content:
    content_1 = BeautifulSoup(content.read().decode('utf-8'), 'lxml')

titles_list_1 = []
contents_list_1 = []

# Total number of result pages, cut out of the serialized pager <span>.
# NOTE(review): the slice assumes the span text looks like '...">/ N页' --
# confirm against the live markup.
pages = str(content_1.select('div.pagesright>span'))
pages = pages[pages.find('">/') + 4:pages.find('页')]
if cishu == 2:
    pages = '1'
try:
    page_count = int(pages)
except ValueError:
    # No pager found (or unexpected markup): fall back to a single page
    # instead of crashing.
    page_count = 1

Downlad_href = []
musics_list = []
for pages_1 in range(1, page_count + 1):
    url = ('https://so.gushiwen.org/search.aspx?type=title&page='
           + str(pages_1) + '&value=' + key_word)
    with request.urlopen(url=url) as content:
        content_1 = BeautifulSoup(content.read().decode('utf-8'), 'lxml')

    # Poem titles on this page (a list of filtered strings).
    titles_list_1.extend(
        Chinese(str(title)) for title in content_1.select('div.cont >p>a>b')
    )
    # Poem bodies on this page.
    contents_list_1.extend(
        Chinese(str(body)) for body in content_1.select('div.cont>div.contson')
    )

    # Identifiers taken from the "背诵" (recite) buttons.
    for img in content_1.select('div.sons>div>div>img'):
        str_1 = str(img)
        if '背诵' in str_1:
            start = str_1.find('OnBeisong')
            Downlad_href.append(str_1[start + 11:start + 23])

    # musics_list holds the playback id of every poem block.
    for block in content_1.select('div.sons'):
        block_html = str(block)
        str_2 = block_html[block_html.find('toolPlay'):]
        # Skip the 8 characters of 'toolPlay' and read up to the next quote.
        musics_list.append(str_2[8:str_2.find('"')])

# Print every collected poem with a 1-based index.  zip() stops at the
# shorter list, so a title without a matching body cannot raise IndexError.
for i, (shown_title, shown_poem) in enumerate(
        zip(titles_list_1, contents_list_1), 1):
    print('-' * 25)
    print('{}|-------{}-------|'.format(i, shown_title[1:-1]))
    print('{}'.format(shown_poem))
    print('*' * 25)

try:
    key_name = int(input('请输入你想播放的诗的序号:'))
except ValueError:
    key_name = 0  # not a number; falls through to the error branch

# The choice must be a valid index into every list that is indexed below.
playable = min(len(titles_list_1), len(contents_list_1), len(musics_list))
if 1 <= key_name <= playable:
    chosen_title = titles_list_1[key_name - 1][1:-1]
    chosen_poem = contents_list_1[key_name - 1]
    # The recording is saved in the current directory, named after the poem.
    mp3_path = '{}.mp3'.format(chosen_title)
    try:
        music_url_1 = ('https://so.gushiwen.org/viewplay.aspx?id='
                       + musics_list[key_name - 1])
        with request.urlopen(url=music_url_1) as response:
            href_content = response.read().decode('utf-8')
        soup = BeautifulSoup(href_content, 'lxml')
        # Downlad_url is the download link of the recitation.  Reading the
        # attribute avoids slicing the serialized tag; a page without an
        # <audio> element raises here and is reported by the handler below.
        Downlad_url = soup.select('body>div>audio')[0].get('src')
        request.urlretrieve(url=Downlad_url, filename=mp3_path)
        print('正在加载进程!')
        len_1 = Chineses(chosen_poem)  # number of characters in the poem
        Time_1()
        audio = MP3(mp3_path)
        time_1 = int(audio.info.length)
        mixer.init()
        mixer.music.load(mp3_path)
        mixer.music.play()
        # NOTE(review): mixer.stop() is documented as stopping Sound
        # channels; confirm it is not meant to be mixer.music.stop() after
        # the text has been shown.
        mixer.stop()
        print('该段文字总共有{}字数,需要{}秒'.format(len_1, time_1))
        # Spread the recording length over the poem's characters; guard
        # against a poem with no counted characters.
        delay = time_1 / (len_1 or 1) + 0.1
        for text in chosen_poem:
            # Flush so each character appears as it is "read", not only at
            # the next newline.
            print(text, end='', flush=True)
            time.sleep(delay)
        print('播放完毕!')
    except Exception as e:
        # Top-level boundary of the script: report the failure and finish.
        print('对不起,该诗到目前为止,还没有找到播放路径{}'.format(e))
else:
    print('你的输入有问题,程序将退出!')
运行结果:
注意:部分诗句没有朗诵。
如果大家觉得这个还可以,记得点赞和关注,谢谢!