NeMo Chinese ASR speaker diarization (speaker log) practice

Speaker diarization, also called the speaker log, answers the question of who spoke when and what they said. Typical application scenarios include multi-person meetings and agent sales / customer-service calls.

        

The typical implementation is a pipeline: first, speech segments are detected with the MarbleNet VAD (Voice Activity Detection) model; then speaker embeddings are extracted with TitaNet-L; next, speakers are separated by clustering the embeddings; and finally, speaker labels are assigned and refined by a neural diarizer.
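If you only need who-spoke-when (without transcription), NeMo also exposes this pipeline directly as a clustering diarizer. The sketch below is not the path this post follows (the ASR-coupled path comes later); the pretrained model names and relative paths are assumptions based on the NeMo examples, and it reuses the manifest and YAML config prepared in steps 7 and 8 below.

# A minimal sketch of the clustering-only path (no ASR), assuming the
# input_manifest.json and diar_infer_meeting.yaml prepared in steps 7 and 8 below.
from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer

cfg = OmegaConf.load("data/diar_infer_meeting.yaml")
cfg.diarizer.manifest_filepath = "data/input_manifest.json"
cfg.diarizer.out_dir = "data"
cfg.diarizer.speaker_embeddings.model_path = "titanet_large"   # NGC pretrained TitaNet-L
cfg.diarizer.vad.model_path = "vad_multilingual_marblenet"     # MarbleNet VAD
cfg.diarizer.oracle_vad = False

sd_model = ClusteringDiarizer(cfg=cfg)
sd_model.diarize()   # writes RTTM files under <out_dir>/pred_rttms/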

1. NeMo environment installation

Refer to the earlier post "NeMo Chinese/English ASR model fine-tuning practice" (wxl781227, CSDN blog).
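If that post is not at hand, a typical NeMo ASR installation looks roughly like the following. This is only a sketch based on the official NeMo installation instructions, not the original post's exact steps; adapt the packages to your own PyTorch/CUDA setup.

# Rough installation sketch (not from the original post): assumes an existing
# Python 3 + PyTorch environment with a matching CUDA toolkit.
!apt-get update && apt-get install -y libsndfile1 ffmpeg
!pip install Cython
!pip install "nemo_toolkit[asr]"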

2. Import dependencies

import nemo.collections.asr as nemo_asr
import numpy as np
from IPython.display import Audio, display
import librosa
import os
import wget
import matplotlib.pyplot as plt

import nemo
import glob

import pprint
pp = pprint.PrettyPrinter(indent=4)

3. Local audio sample to be diarized

ROOT = os.getcwd()
data_dir = os.path.join(ROOT,'data')
print(data_dir)
os.makedirs(data_dir, exist_ok=True)

AUDIO_FILENAME = os.path.join(data_dir,'test.wav')

audio_file_list = glob.glob(f"{data_dir}/test.wav")
print("Input audio file list: \n", audio_file_list)

signal, sample_rate = librosa.load(AUDIO_FILENAME, sr=16000)
print(sample_rate)
display(Audio(signal,rate=sample_rate))

4. Load the ASR model trained in the "NeMo Chinese/English ASR model fine-tuning practice" post (wxl781227, CSDN blog)

my_asr_model = nemo_asr.models.EncDecCTCModel.restore_from("my_stt_zh_quartznet15x5.nemo")

5. Define the waveform display and coloring functions

def display_waveform(signal,text='Audio',overlay_color=[]):
    fig,ax = plt.subplots(1,1)
    fig.set_figwidth(20)
    fig.set_figheight(2)
    plt.scatter(np.arange(len(signal)),signal,s=1,marker='o',c='k')
    if len(overlay_color):
        plt.scatter(np.arange(len(signal)),signal,s=1,marker='o',c=overlay_color)
    fig.suptitle(text, fontsize=16)
    plt.xlabel('time (secs)', fontsize=18)
    plt.ylabel('signal strength', fontsize=14);
    plt.axis([0,len(signal),-0.5,+0.5])
    time_axis,_ = plt.xticks();
    plt.xticks(time_axis[:-1],time_axis[:-1]/sample_rate);
    
COLORS="b g c m y".split()
import soundfile
def get_color(signal,speech_labels,sample_rate=sample_rate):
    c=np.array(['k']*len(signal))
    for time_stamp in speech_labels:
        start,end,label=time_stamp.split()
        start,end = int(float(start)*sample_rate),int(float(end)*sample_rate),
        corp_wav = signal[start:end]
        soundfile.write(f"/NeMo/tutorials/speaker_tasks/data/test_{start}_{end}_{label}.wav", corp_wav, sample_rate)
        print(label,my_asr_model.transcribe([f"/NeMo/tutorials/speaker_tasks/data/test_{start}_{end}_{label}.wav"]))
        if label == "speech":
            code = 'red'
        else:
            code = COLORS[int(label.split('_')[-1])]
        c[start:end]=code
    
    return c 

6. Display the waveform before diarization

display_waveform(signal)

7. Download the diarization inference YAML configuration file

from omegaconf import OmegaConf
import shutil
DOMAIN_TYPE = "meeting" # Can be meeting or telephonic based on domain type of the audio file
CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"

CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"

if not os.path.exists(os.path.join(data_dir,CONFIG_FILE_NAME)):
    CONFIG = wget.download(CONFIG_URL, data_dir)
else:
    CONFIG = os.path.join(data_dir,CONFIG_FILE_NAME)

cfg = OmegaConf.load(CONFIG)
print(OmegaConf.to_yaml(cfg))

8. Create the manifest file

# Create a manifest file for the input with the format below.
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
#  "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/file"}
import json
meta = {
    'audio_filepath': AUDIO_FILENAME, 
    'offset': 0, 
    'duration':None, 
    'label': 'infer', 
    'text': '-', 
    'num_speakers': None, 
    'rttm_filepath': None, 
    'uem_filepath' : None
}
with open(os.path.join(data_dir,'input_manifest.json'),'w') as fp:
    json.dump(meta,fp)
    fp.write('\n')

cfg.diarizer.manifest_filepath = os.path.join(data_dir,'input_manifest.json')
!cat {cfg.diarizer.manifest_filepath}
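If several recordings under data/ need to be diarized in one run, the manifest can simply hold one JSON line per audio file. The sketch below reuses the fields of meta above; the file name multi_manifest.json is hypothetical and not part of the original post.

# Sketch: write one manifest entry per wav file in data_dir (reuses `meta` from above).
multi_manifest = os.path.join(data_dir, 'multi_manifest.json')   # hypothetical file name
with open(multi_manifest, 'w') as fp:
    for wav_path in sorted(glob.glob(f"{data_dir}/*.wav")):
        entry = dict(meta, audio_filepath=wav_path)
        fp.write(json.dumps(entry) + '\n')
# cfg.diarizer.manifest_filepath = multi_manifest  # point the diarizer at it instead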

9. Configure the models used in the pipeline

pretrained_speaker_model = 'titanet-l.nemo'
cfg.diarizer.out_dir = data_dir  # Directory to store intermediate files and prediction outputs
cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
cfg.diarizer.clustering.parameters.oracle_num_speakers = None

# Use neural VAD (MarbleNet) and the fine-tuned QuartzNet ASR model
cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
cfg.diarizer.asr.model_path = 'my_stt_zh_quartznet15x5.nemo'
cfg.diarizer.oracle_vad = False  # ----> Not using oracle VAD
cfg.diarizer.asr.parameters.asr_based_vad = False
cfg.batch_size = 2

10. Run ASR to get word-level timestamps

from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
asr_model = asr_decoder_ts.set_asr_model()
word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)

print("Decoded word output dictionary: \n", word_hyp['test'])
print("Word-level timestamps dictionary: \n", word_ts_hyp['test'])

11. Create the offline diarization-with-ASR object

from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset

12. Run offline diarization

diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
print("Diarization hypothesis output: \n", diar_hyp['test'])

13. Display the diarization results

def read_file(path_to_file):
    with open(path_to_file) as f:
        contents = f.read().splitlines()
    return contents

predicted_speaker_label_rttm_path = f"{data_dir}/pred_rttms/test.rttm"
pred_rttm = read_file(predicted_speaker_label_rttm_path)

pp.pprint(pred_rttm)

from nemo.collections.asr.parts.utils.speaker_utils import rttm_to_labels
pred_labels = rttm_to_labels(predicted_speaker_label_rttm_path)

color = get_color(signal, pred_labels)
display_waveform(signal,'Audio with Speaker Labels', color)
display(Audio(signal,rate=sample_rate))

Example output (predicted RTTM segments and per-segment ASR transcriptions):

[   'SPEAKER test2 1   0.000   4.125 <NA> <NA> speaker_1 <NA> <NA>',
    'SPEAKER test2 1   4.125   4.565 <NA> <NA> speaker_0 <NA> <NA>']
speaker_1 ['Hey, you told me the day before yesterday what the 12-period interest rate is']
speaker_0 ['If the job number is 908262, then it will be 0.810,000, then it will be divided into 120,000, and the interest will be 80']

 

Original post: https://blog.csdn.net/wxl781227/article/details/132292782