Asistente de voz personal con LLM: desde la palabra de activación hasta la reproducción de voz con TTS

Referencia:
uso de la API chatglm2:
https://blog.csdn.net/weixin_42357472/article/details/130342799?spm=1001.2014.3001.5501

Reconocimiento de voz a texto sherpa:
https://blog.csdn.net/weixin_42357472/article/details/131269539?spm=1001.2014.3001.5502

Reproducción de voz TTS:
https://blog.csdn.net/weixin_42357472/article/details/132256328?spm=1001.2014.3001.5501

Proceso marco

Se define una palabra de activación personalizada (aquí: Xiaole Xiaole) => el audio se convierte a texto mediante el reconocimiento de voz de sherpa (ahí se detecta y procesa la palabra de activación) => el texto es procesado por el modelo grande LLM => el resultado se reproduce por voz mediante TTS

Si desea que los resultados que devuelve el modelo grande LLM se lean en voz alta en tiempo real mediante TTS, puede consultar (lo principal es que el cliente reciba en tiempo real el contenido que el servidor envía por el protocolo SSE): https://blog.csdn.net/weixin_42357472/article/details/132336046

código

#!/usr/bin/env python3

# Real-time speech recognition from a microphone with sherpa-ncnn Python API
#
# Please refer to
# https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
# to download pre-trained models

import sys

try:
    import sounddevice as sd
except ImportError as e:
    print("Please install sounddevice first. You can use")
    print()
    print("  pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

import sherpa_ncnn
import pyttsx3

def create_recognizer():
    """Build the sherpa-ncnn recognizer from a model directory on disk.

    The model directory is looked up relative to the current working
    directory. Pre-trained models can be downloaded from
    https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html

    Returns:
        The ``sherpa_ncnn.Recognizer`` built from the model files.
    """
    # Replace the model directory if needed. Alternatives listed upstream:
    #   sherpa-ncnn-conv-emformer-transducer-2022-12-06
    #   sherpa-ncnn-lstm-transducer-small-2023-02-13
    #   sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16
    #   sherpa-ncnn-streaming-zipformer-20M-2023-02-17
    base_file = "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13"
    model_dir = "./" + base_file

    # Each network part is stored as a <part>_jit_trace-pnnx.ncnn.{param,bin} pair.
    model_files = {}
    for part in ("encoder", "decoder", "joiner"):
        for ext in ("param", "bin"):
            model_files[f"{part}_{ext}"] = (
                f"{model_dir}/{part}_jit_trace-pnnx.ncnn.{ext}"
            )

    return sherpa_ncnn.Recognizer(
        tokens=f"{model_dir}/tokens.txt",
        num_threads=4,
        **model_files,
    )

import requests
import json
def chatglm(payload, timeout=60.0):
    """POST ``payload`` as JSON to the ChatGLM2 API service and return the reply.

    Args:
        payload: JSON-serializable request body. ``main`` in this listing
            sends a dict with ``"prompt"`` and ``"history"`` keys.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the host is masked with asterisks in this listing and
    # must be replaced with the real server address before running.
    url = "http://192*****4:8000"
    json_payload = json.dumps(payload)

    # Set the headers to indicate that the request contains JSON data
    headers = {'Content-Type': 'application/json'}

    # Send the POST request with the JSON payload
    response = requests.post(
        url, data=json_payload, headers=headers, timeout=timeout
    )
    # Fail with a clear HTTP error instead of a JSON decode error on an
    # error page.
    response.raise_for_status()

    return response.json()

def main():
    """Run the assistant loop: listen, detect the wake phrase, ask the LLM, speak.

    Audio is read from the input stream in fixed-size chunks and fed to the
    recognizer. The text that is new since the last handled result is then
    dispatched:

    * if it contains a wake phrase, the assistant acknowledges, echoes any
      text that followed the phrase, and arms itself for the next utterance;
    * otherwise, if armed, the text is echoed, sent to the LLM together with
      the running chat history, and the reply is spoken. The assistant is
      then disarmed until the next wake phrase.

    The assistant starts armed, so the first utterance is handled without a
    wake phrase. The loop only ends when an exception (for example
    ``KeyboardInterrupt``) propagates out of it.
    """
    # Seed exchange that sets the assistant's persona for the LLM.
    history = [["你名字叫*****;每次回答请都简要回答不超过30个字", "好的,小乐很乐意为你服务"]]
    # Longest phrase first, so a triple repetition is consumed as a whole.
    wake_phrases = ("小乐小乐小乐", "小乐小乐")

    print("Started! Please speak")
    recognizer = create_recognizer()
    sample_rate = recognizer.sample_rate
    samples_per_read = int(3 * sample_rate)  # 3 seconds of audio per read
    print(samples_per_read, sample_rate)
    last_result = ""
    armed = True
    with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            print("##" * 18)

            samples = samples.reshape(-1)
            print(samples.shape, samples)
            recognizer.accept_waveform(sample_rate, samples)
            result = recognizer.text
            if result == last_result:
                continue

            # Only the text appended since the last handled result is new.
            words = result[len(last_result):]
            print("words:", words)
            if not words.strip():
                # Nothing usable yet; do not echo or query the LLM with a
                # blank prompt, and stay armed.
                continue

            wake = next((p for p in wake_phrases if p in words), None)
            if wake is not None:
                pyttsx3.speak("在的呢")
                new_word = words[words.index(wake) + len(wake):]
                print("new_word:", new_word)
                if new_word:
                    pyttsx3.speak(new_word)
                last_result = result
                armed = True
                continue

            if not armed:
                # last_result is left untouched, so this text is seen again
                # together with whatever is recognized next.
                continue

            print("speak:", words)
            pyttsx3.speak(words)
            last_result = result
            armed = False

            # Query the LLM and speak its reply. A failed request is reported
            # and skipped so that one network error does not end the loop.
            try:
                results = chatglm({"prompt": words, "history": history})
                print(results)
                reply = results["response"]
                updated_history = results["history"]
            except (requests.RequestException, ValueError, KeyError) as err:
                print("LLM request failed:", err)
                continue
            pyttsx3.speak(reply)
            history = updated_history


                 

            
          


                  





if __name__ == "__main__":
    devices = sd.query_devices()
    # Use device index 0 as the default input device.
    sd.default.device[0] = 0
    print(len(devices), devices, sd.default.device)

    # Print the number of input channels of the default input device.
    channels = sd.query_devices(kind="input")["max_input_channels"]
    print(f"输入设备的通道数: {channels}")
    print(sd.default.channels)

    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")

Lectura en voz alta en tiempo real de la respuesta en streaming de la API; referencia: https://blog.csdn.net/weixin_42357472/article/details/132336046

import httpx
import asyncio

async def chatglm_chat(word):
    """Stream an answer for ``word`` from the ChatGLM API and speak it as it arrives.

    The first six characters of every streamed line are dropped (presumably
    the SSE ``data: `` prefix; confirm against the server). The rest is
    treated as the whole answer generated so far. Only the part that has not
    been spoken yet is sent to TTS, and only once it contains a punctuation
    mark, so speech comes out phrase by phrase. Whatever is left when the
    stream ends is spoken as well.

    Args:
        word: The user's prompt text.

    Raises:
        httpx.HTTPError: On connection failure or a non-success HTTP status.
    """
    # NOTE(review): the host is masked with asterisks in this listing and
    # must be replaced with the real server address before running.
    url = "http://192*****4:8000"
    data = {
        "input": word,
        "max_length": 2048,
        "top_p": 0.7,
        "temperature": 0.95,
        "history": [["你名字叫******,让世界更安全;每次回答请都简要回答不超过30个字","好的,小***乐意为你服务"]],
        "html_entities": True,
    }
    # Marks that close a phrase, in both ASCII and full-width/CJK forms.
    break_marks = (",", ":", "。", "、", "!", ",", ":", "!", "?", "?")
    spoken_len = 0  # how many characters of the answer were already spoken
    answer = ""  # latest non-empty answer text received
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=data) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                print(line)
                payload = line[6:]
                if not payload:
                    # Blank or too-short lines carry no answer text.
                    continue
                answer = payload
                pending = payload[spoken_len:]
                if any(mark in pending for mark in break_marks):
                    # NOTE(review): speak() is called synchronously here; if
                    # it blocks during playback, stream reading waits too.
                    pyttsx3.speak(pending)
                    spoken_len += len(pending)

    # Flush the tail that never reached a punctuation mark.
    tail = answer[spoken_len:]
    if tail.strip():
        pyttsx3.speak(tail)

# Invoke the async function.
# NOTE(review): `words` is not defined in this snippet; presumably it is the
# recognized text produced inside main()'s loop.
asyncio.run(chatglm_chat(words))

Código completo:

#!/usr/bin/env python3

# Real-time speech recognition from a microphone with sherpa-ncnn Python API
#
# Please refer to
# https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
# to download pre-trained models

import sys

try:
    import sounddevice as sd
except ImportError as e:
    print("Please install sounddevice first. You can use")
    print()
    print("  pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

import sherpa_ncnn
import pyttsx3

def create_recognizer():
    """Build the sherpa-ncnn recognizer from a model directory on disk.

    The model directory is looked up relative to the current working
    directory. Pre-trained models can be downloaded from
    https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html

    Returns:
        The ``sherpa_ncnn.Recognizer`` built from the model files.
    """
    # Replace the model directory if needed. Alternatives listed upstream:
    #   sherpa-ncnn-conv-emformer-transducer-2022-12-06
    #   sherpa-ncnn-lstm-transducer-small-2023-02-13
    #   sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16
    #   sherpa-ncnn-streaming-zipformer-20M-2023-02-17
    base_file = "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13"
    model_dir = "./" + base_file

    # Each network part is stored as a <part>_jit_trace-pnnx.ncnn.{param,bin} pair.
    model_files = {}
    for part in ("encoder", "decoder", "joiner"):
        for ext in ("param", "bin"):
            model_files[f"{part}_{ext}"] = (
                f"{model_dir}/{part}_jit_trace-pnnx.ncnn.{ext}"
            )

    return sherpa_ncnn.Recognizer(
        tokens=f"{model_dir}/tokens.txt",
        num_threads=4,
        **model_files,
    )

import requests
import json
def chatglm(payload, timeout=60.0):
    """POST ``payload`` as JSON to the ChatGLM API service and return the reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    url = "http://192.168.19.14:8000"
    json_payload = json.dumps(payload)

    # Set the headers to indicate that the request contains JSON data
    headers = {'Content-Type': 'application/json'}

    # Send the POST request with the JSON payload
    response = requests.post(
        url, data=json_payload, headers=headers, timeout=timeout
    )
    # Fail with a clear HTTP error instead of a JSON decode error on an
    # error page.
    response.raise_for_status()

    return response.json()



import httpx
import asyncio

async def chatglm_chat(word):
    """Stream an answer for ``word`` from the ChatGLM API and speak it as it arrives.

    The first six characters of every streamed line are dropped (presumably
    the SSE ``data: `` prefix; confirm against the server). The rest is
    treated as the whole answer generated so far. Only the part that has not
    been spoken yet is sent to TTS, and only once it contains a punctuation
    mark, so speech comes out phrase by phrase. Whatever is left when the
    stream ends is spoken as well.

    Args:
        word: The user's prompt text.

    Raises:
        httpx.HTTPError: On connection failure or a non-success HTTP status.
    """
    url = "http://192.168.19.14:8000"
    data = {
        "input": word,
        "max_length": 2048,
        "top_p": 0.7,
        "temperature": 0.95,
        "history": [["你名字****过30个字","好的,小杰很乐意为你服务"]],
        "html_entities": True,
    }
    # Marks that close a phrase, in both ASCII and full-width/CJK forms.
    break_marks = (",", ":", "。", "、", "!", ",", ":", "!", "?", "?")
    spoken_len = 0  # how many characters of the answer were already spoken
    answer = ""  # latest non-empty answer text received
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=data) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                print(line)
                payload = line[6:]
                if not payload:
                    # Blank or too-short lines carry no answer text.
                    continue
                answer = payload
                pending = payload[spoken_len:]
                if any(mark in pending for mark in break_marks):
                    # NOTE(review): speak() is called synchronously here; if
                    # it blocks during playback, stream reading waits too.
                    pyttsx3.speak(pending)
                    spoken_len += len(pending)

    # Flush the tail that never reached a punctuation mark.
    tail = answer[spoken_len:]
    if tail.strip():
        pyttsx3.speak(tail)


def main():
    """Run the assistant loop: listen, detect the wake phrase, stream the LLM answer.

    Audio is read from the input stream in fixed-size chunks and fed to the
    recognizer. The text that is new since the last handled result is then
    dispatched:

    * if it contains a wake phrase, the assistant acknowledges, echoes any
      text that followed the phrase, and arms itself for the next utterance;
    * otherwise, if armed, the text is passed to ``chatglm_chat``, which
      speaks the streamed answer. The assistant is then disarmed until the
      next wake phrase.

    The assistant starts armed, so the first utterance is handled without a
    wake phrase. The persona prompt is defined inside ``chatglm_chat``. The
    loop only ends when an exception (for example ``KeyboardInterrupt``)
    propagates out of it.
    """
    # Longest phrase first, so a triple repetition is consumed as a whole.
    wake_phrases = ("小杰小杰小杰", "小杰小杰")

    print("Started! Please speak")
    recognizer = create_recognizer()
    sample_rate = recognizer.sample_rate
    samples_per_read = int(5 * sample_rate)  # 5 seconds of audio per read
    print(samples_per_read, sample_rate)
    last_result = ""
    armed = True
    with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            print("##" * 18)

            samples = samples.reshape(-1)
            print(samples.shape, samples)
            recognizer.accept_waveform(sample_rate, samples)
            result = recognizer.text
            if result == last_result:
                continue

            # Only the text appended since the last handled result is new.
            words = result[len(last_result):]
            print("words:", words)
            if not words.strip():
                # Nothing usable yet; do not query the LLM with a blank
                # prompt, and stay armed.
                continue

            wake = next((p for p in wake_phrases if p in words), None)
            if wake is not None:
                pyttsx3.speak("在的呢")
                new_word = words[words.index(wake) + len(wake):]
                print("new_word:", new_word)
                if new_word:
                    pyttsx3.speak(new_word)
                last_result = result
                armed = True
                continue

            if not armed:
                # last_result is left untouched, so this text is seen again
                # together with whatever is recognized next.
                continue

            print("speak:", words)
            last_result = result
            armed = False

            # Stream the LLM answer to TTS. A failed request is reported and
            # skipped so that one network error does not end the loop.
            try:
                asyncio.run(chatglm_chat(words))
            except httpx.HTTPError as err:
                print("LLM request failed:", err)

                 

            
          


            




if __name__ == "__main__":
    devices = sd.query_devices()
    # Use device index 0 as the default input device.
    sd.default.device[0] = 0
    print(len(devices), devices, sd.default.device)

    # Print the number of input channels of the default input device.
    channels = sd.query_devices(kind="input")["max_input_channels"]
    print(f"输入设备的通道数: {channels}")
    print(sd.default.channels)

    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")

Supongo que te gusta

Origin blog.csdn.net/weixin_42357472/article/details/132324360
Recomendado
Clasificación