# 调用百度语音AI实现语音的识别和合成

#coding:utf-8

## 先去ffmpeg官网下载(https://ffmpeg.zeranoe.com/builds/),好了之后解压缩,配一下环境变量

## 打开cmd,运行命令,安装如下的包
## pip install baidu-aip
## pip install pydub
## pip install PyAudio
## pip install Wave

""" 调用百度语音api """
from aip import AipSpeech
# Credentials handed to the AipSpeech constructor below. The values here are
# blank placeholders — presumably they must be replaced with the real
# application ID / API key / secret key before any request can succeed
# (NOTE(review): confirm against the Baidu AI console).
APP_ID = " "
API_KEY = " "
SECRET_KEY = " "
# Module-level client shared by speech_synthesis() and speech_recognition().
client = AipSpeech(APP_ID,API_KEY,SECRET_KEY)
 
 
 
def speech_synthesis(text, filepath):
    """Convert ``text`` to speech and write the returned audio to ``filepath``.

    The request is sent through the module-level ``client`` with language
    ``'zh'`` and the volume / speed / pitch / voice options set below.

    Args:
        text: The text to synthesize.
        filepath: Destination file, opened in binary write mode (an existing
            file is overwritten).

    Returns:
        True if audio data was written to ``filepath``; False if the service
        answered with a dict instead of audio data, in which case nothing is
        written.
    """
    result = client.synthesis(text, 'zh', 1, {
        'vol': 5,
        'spd': 5,
        'pit': 5,
        'per': 0,
    })
    if isinstance(result, dict):
        # A dict is not audio data; per the SDK documentation it carries the
        # error details. Report it instead of failing silently so the caller
        # does not go on to play a file that was never written.
        print("语音合成没有成功:", result)
        return False
    with open(filepath, 'wb') as audio_file:
        audio_file.write(result)
    return True



def play_speech(filepath):
    """Play the audio file at ``filepath`` with the external ``ffplay`` program.

    Blocks until ffplay exits. ``ffplay`` must be reachable through PATH; if
    it cannot be started a message is printed and the function returns
    normally.

    Args:
        filepath: Path of the audio file to play.
    """
    import subprocess
    try:
        # Pass an argument list instead of a shell string so that paths
        # containing spaces or shell metacharacters reach ffplay unchanged.
        subprocess.run(["ffplay", filepath])
    except OSError as exc:
        # The original shell call never raised when ffplay was missing;
        # keep that best-effort behaviour but say what went wrong.
        print("无法启动 ffplay: %s" % exc)

# def play_speech(filepath):
    # """ 播放语音 """
    # import pyaudio
    # import wave 
    # wf = wave.open(filepath, 'rb') #二进制只读方式打开wav文件
    # p = pyaudio.PyAudio()
    # stream=p.open(format=p.get_format_from_width(wf.getsampwidth()),channels=wf.getnchannels(),rate=wf.getframerate(),output=True)
    # stream = p.open(format=pyaudio.paInt16,
                    # channels=1,
                    # rate=16000,
                    # output=True) #打开数据流
    # data = wf.readframes(1024) #读取数据
    # while data != '': #播放  
        # stream.write(data)
        # data = wf.readframes(1024)
    # stream.stop_stream()
    # stream.close()
    # p.terminate()
 
 


# def Conversion_sampling_rate(filepath, newfilepath):
    # """ 转换采样率 """
    # from pydub import AudioSegment
    # setframefp = AudioSegment.from_file(filepath)
    # setframefp.set_frame_rate(16000)
    # setframefp.export(newfilepath, format='wav')



def wav_to_pcm(wav_file):
    """Convert ``wav_file`` to a 16 kHz, mono, signed 16-bit raw PCM file.

    The conversion is done by the external ``ffmpeg`` program (must be on
    PATH). The output is written next to the input, using the input's name
    with its extension replaced by ``.pcm``; an existing output file is
    overwritten (``-y``).

    Args:
        wav_file: Path of the audio file to convert.

    Returns:
        The path of the PCM file. The path is returned even when ffmpeg could
        not be started or reported a failure (a message is printed in that
        case), so the file is not guaranteed to exist.
    """
    import os
    import subprocess
    # splitext only strips the final extension, so dots elsewhere in the path
    # ("./rec.wav", "my.dir/rec.wav", "take.1.wav") no longer truncate the name.
    pcm_file = os.path.splitext(wav_file)[0] + ".pcm"
    command = [
        "ffmpeg", "-y",
        "-i", wav_file,
        "-acodec", "pcm_s16le",
        "-f", "s16le",
        "-ac", "1",
        "-ar", "16000",
        pcm_file,
    ]
    try:
        # Argument list, no shell: paths with spaces or shell metacharacters
        # are passed to ffmpeg intact.
        completed = subprocess.run(command)
    except OSError as exc:
        print("无法启动 ffmpeg: %s" % exc)
    else:
        if completed.returncode != 0:
            print("ffmpeg 转换失败, 返回码: %s" % completed.returncode)
    return pcm_file




def sound_record(file_name, record_seconds=3):
    """Record audio through PyAudio and save it to ``file_name`` as a WAV file.

    The recording is mono, 16000 Hz, 16-bit (``pyaudio.paInt16``), read in
    chunks of 1024 frames. Progress messages are printed before and after
    the recording.

    Args:
        file_name: Path of the WAV file to write (overwritten if present).
        record_seconds: Length of the recording in seconds. Only whole
            chunks are read, so the actual length is rounded down to a
            multiple of 1024 frames. Defaults to 3.
    """
    import pyaudio
    import wave
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive
        # rather than after terminate().
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("开始录音,请说话......")
            chunk_count = int(RATE / CHUNK * record_seconds)
            frames = [stream.read(CHUNK) for _ in range(chunk_count)]
            print("录音结束!")
            stream.stop_stream()
        finally:
            # Release the stream even if reading failed part-way through.
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(file_name, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


def speech_recognition(filepath):
    """Send the audio file at ``filepath`` for recognition and return the text.

    The file content is submitted through the module-level ``client`` as
    16000 Hz ``'pcm'`` data with ``dev_pid`` 1536.

    Args:
        filepath: Path of the PCM file to recognize.

    Returns:
        The first entry of the response's ``"result"`` list, or the string
        ``"error"`` when the response contains no usable result. Callers
        compare against ``"error"``, so that sentinel is part of the
        interface.

    Raises:
        OSError: If ``filepath`` cannot be opened or read.
    """
    with open(filepath, 'rb') as fp:
        speechfile = fp.read()
    result = client.asr(speechfile, 'pcm', 16000, {
        'dev_pid': 1536,
    })
    try:
        # Only the lookup is guarded: a missing/empty "result" (or a response
        # that is not a dict) means the recognition did not succeed.
        res_str = result.get("result")[0]
    except (AttributeError, IndexError, KeyError, TypeError):
        print("识别没有成功")
        return "error"
    # Printing happens outside the try so that a console error here is not
    # misreported as a recognition failure.
    print(res_str)
    return res_str

 
# 测试
# text = "世界很复杂百度更懂你"
# synthesisfilepath = "synthesisspeech.pcm"
# synthesisfilepath = "16k.pcm"
# speech_synthesis(text, synthesisfilepath)
# wav_file = pcm_to_wav(synthesisfilepath)
# play_speech(wav_file)

# recordfilepath = "recordspeech.wav"
# sound_record(recordfilepath)
# pcm_file = wav_to_pcm(recordfilepath)
# speech_recognition(pcm_file)




""" 控制面板 """
from tkinter import *
from tkinter import ttk
from tkinter import messagebox
import os
class App:
    """Tk control panel: a record button and a text box for recognition results."""

    def __init__(self, master):
        """Configure the window ``master`` and build the widgets.

        The button image 'luyin - small.gif' is loaded from the directory
        that contains this file.
        """
        self.master = master
        self.master.title("调用百度AI识别语音")
        self.master.geometry("500x400")
        image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'luyin - small.gif')
        # Stored on the instance; initWidgets() hands it to the record button.
        self.buttonimg = PhotoImage(file=image_path)
        self.initWidgets()

    def initWidgets(self):
        """Create the record button, the caption label and the result text box.

        All three widgets are laid out top to bottom with pack(). The
        earlier place() calls on the label and text box were dropped because
        each was immediately followed by pack() on the same widget.
        """
        self.button = Button(self.master, text='开始录音', image=self.buttonimg,
                             command=self.open_sound_record, height=100, width=100)
        self.button.pack(ipadx=5, ipady=5, pady=20)

        self.label = Label(self.master, text="语音识别结果:")
        self.label.pack()

        self.text = Text(self.master, height=3, width=200)
        self.text.pack()

    def open_sound_record(self):
        """Button callback: record, convert to PCM, recognize, show the result.

        The recording is written to 'recordspeech.wav' in the current
        working directory. On failure a dialog is shown; on success the
        recognized text is inserted at the text box's insertion cursor.
        """
        # NOTE(review): everything below runs synchronously inside the button
        # callback, so the window is presumably unresponsive until the
        # recording and the recognition request have finished.
        recordfilepath = "recordspeech.wav"
        sound_record(recordfilepath)
        pcm_file = wav_to_pcm(recordfilepath)
        res_str = speech_recognition(pcm_file)
        if res_str == "error":
            messagebox.showinfo("出错", "没有成功识别语音!")
            return
        self.text.insert("insert", res_str)
        
# Start the control panel only when this file is executed as a script, so
# that importing the module does not open a window and block in mainloop().
if __name__ == "__main__":
    root = Tk()
    App(root)
    root.mainloop()

# 猜你喜欢

# 转载自www.cnblogs.com/yejifeng/p/11428936.html