Speaker diarization (also called speaker separation or a "speaker log") answers the question of who spoke when and, combined with ASR, what each speaker said. Typical application scenarios include multi-person meetings and agent sales/customer-service calls.
A typical implementation is pipeline-based: speech segments are first detected with a MarbleNet VAD (Voice Activity Detection) model, speaker embeddings are then extracted with TitaNet-L, speakers are separated by clustering the embeddings, and finally the speaker labels are refined by a neural model.
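For orientation, NeMo also packages this VAD + embedding + clustering chain as a single ClusteringDiarizer object (with no ASR attached). The sketch below is only meant to show how the stages hang together; it assumes a config file and manifest like the ones prepared step by step later in this post.
# Minimal sketch: clustering-only diarization with NeMo (assumes diar_infer_meeting.yaml
# and input_manifest.json exist, as created in the later steps of this post).
from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer
cfg = OmegaConf.load("diar_infer_meeting.yaml")          # MarbleNet VAD + TitaNet-L + clustering settings
cfg.diarizer.manifest_filepath = "input_manifest.json"   # one JSON line per audio file
cfg.diarizer.out_dir = "diar_output"                     # predicted RTTMs land in out_dir/pred_rttms
sd_model = ClusteringDiarizer(cfg=cfg)
sd_model.diarize()                                       # VAD -> speaker embeddings -> clustering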
1. NeMo environment installation
For environment setup, refer to the earlier post "NeMo Chinese/English ASR model fine-tuning training practice" (wxl781227, CSDN blog).
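If that environment is not already in place, a typical installation of the ASR collection looks like the lines below; the Cython prerequisite and the extras tag follow NeMo's install instructions, and any version pin should match the one used in that post.
!pip install Cython
!pip install "nemo_toolkit[asr]"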
2. Import dependencies
import nemo.collections.asr as nemo_asr
import numpy as np
from IPython.display import Audio, display
import librosa
import os
import wget
import matplotlib.pyplot as plt
import nemo
import glob
import pprint
pp = pprint.PrettyPrinter(indent=4)
3. Load the local audio sample to be diarized
ROOT = os.getcwd()
data_dir = os.path.join(ROOT,'data')
print(data_dir)
os.makedirs(data_dir, exist_ok=True)
AUDIO_FILENAME = os.path.join(data_dir,'test.wav')
audio_file_list = glob.glob(f"{data_dir}/test.wav")
print("Input audio file list: \n", audio_file_list)
signal, sample_rate = librosa.load(AUDIO_FILENAME, sr=16000)
print(sample_rate)
display(Audio(signal,rate=sample_rate))
4. Load the ASR model trained in "NeMo Chinese/English ASR model fine-tuning training practice" (wxl781227, CSDN blog)
my_asr_model = nemo_asr.models.EncDecCTCModel.restore_from("my_stt_zh_quartznet15x5.nemo")
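my_stt_zh_quartznet15x5.nemo is the checkpoint produced in the fine-tuning post above. If you have not trained it yourself, a pretrained NGC checkpoint can stand in; the model name below is only an example, so list the available models first.
# Fallback sketch: load a pretrained checkpoint instead of the fine-tuned .nemo file.
# The model name is an assumption -- verify it against list_available_models().
print(nemo_asr.models.EncDecCTCModel.list_available_models())
my_asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_zh_citrinet_512")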
5. Define the waveform display function and the waveform coloring function
def display_waveform(signal, text='Audio', overlay_color=[]):
    fig, ax = plt.subplots(1, 1)
    fig.set_figwidth(20)
    fig.set_figheight(2)
    plt.scatter(np.arange(len(signal)), signal, s=1, marker='o', c='k')
    if len(overlay_color):
        plt.scatter(np.arange(len(signal)), signal, s=1, marker='o', c=overlay_color)
    fig.suptitle(text, fontsize=16)
    plt.xlabel('time (secs)', fontsize=18)
    plt.ylabel('signal strength', fontsize=14)
    plt.axis([0, len(signal), -0.5, +0.5])
    time_axis, _ = plt.xticks()
    plt.xticks(time_axis[:-1], time_axis[:-1] / sample_rate)
COLORS="b g c m y".split()
import soundfile
def get_color(signal, speech_labels, sample_rate=sample_rate):
    c = np.array(['k'] * len(signal))  # default color: black (non-speech)
    for time_stamp in speech_labels:
        start, end, label = time_stamp.split()
        start, end = int(float(start) * sample_rate), int(float(end) * sample_rate)
        # Crop the labeled segment, save it, and transcribe it with the fine-tuned ASR model
        corp_wav = signal[start:end]
        seg_path = f"{data_dir}/test_{start}_{end}_{label}.wav"
        soundfile.write(seg_path, corp_wav, sample_rate)
        print(label, my_asr_model.transcribe([seg_path]))
        if label == "speech":
            code = 'r'  # single-character color code (the array dtype holds one character)
        else:
            code = COLORS[int(label.split('_')[-1])]
        c[start:end] = code
    return c
6. Display the waveform before diarization
display_waveform(signal)
7. Download the diarization inference YAML configuration file
from omegaconf import OmegaConf
import shutil
DOMAIN_TYPE = "meeting" # Can be meeting or telephonic based on domain type of the audio file
CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
if not os.path.exists(os.path.join(data_dir, CONFIG_FILE_NAME)):
    CONFIG = wget.download(CONFIG_URL, data_dir)
else:
    CONFIG = os.path.join(data_dir, CONFIG_FILE_NAME)
cfg = OmegaConf.load(CONFIG)
print(OmegaConf.to_yaml(cfg))
8. Create the manifest file
# Create a manifest file for input with below format.
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-",
# "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath"="/path/to/uem/filepath"}
import json
meta = {
    'audio_filepath': AUDIO_FILENAME,
    'offset': 0,
    'duration': None,
    'label': 'infer',
    'text': '-',
    'num_speakers': None,
    'rttm_filepath': None,
    'uem_filepath': None
}
with open(os.path.join(data_dir, 'input_manifest.json'), 'w') as fp:
    json.dump(meta, fp)
    fp.write('\n')
cfg.diarizer.manifest_filepath = os.path.join(data_dir,'input_manifest.json')
!cat {cfg.diarizer.manifest_filepath}
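Because the manifest holds one JSON object per line, diarizing several recordings only requires writing one line per file. A sketch, assuming data_dir contains only the 16 kHz .wav recordings you want to process:
# Batch-manifest sketch: one JSON line per audio file in data_dir.
manifest_path = os.path.join(data_dir, 'input_manifest.json')
with open(manifest_path, 'w') as fp:
    for wav_path in sorted(glob.glob(f"{data_dir}/*.wav")):
        entry = dict(meta)                 # reuse the template defined above
        entry['audio_filepath'] = wav_path
        fp.write(json.dumps(entry) + '\n')
cfg.diarizer.manifest_filepath = manifest_path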
9. Configure the models used in the pipeline
pretrained_speaker_model = 'titanet-l.nemo'
cfg.diarizer.out_dir = data_dir  # Directory to store intermediate files and prediction outputs
cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
cfg.diarizer.clustering.parameters.oracle_num_speakers = None
# Use the multilingual MarbleNet neural VAD and the fine-tuned QuartzNet ASR model
cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
cfg.diarizer.asr.model_path = 'my_stt_zh_quartznet15x5.nemo'
cfg.diarizer.oracle_vad = False  # ----> Not using oracle VAD
cfg.diarizer.asr.parameters.asr_based_vad = False
cfg.batch_size = 2
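Beyond the model paths, a few other fields in the diarizer config are worth knowing about. The names below are assumed from the diar_infer_*.yaml schema downloaded in step 7, so confirm them against the config printed there before overriding anything.
# Optional overrides (field names assumed from diar_infer_meeting.yaml -- confirm
# with print(OmegaConf.to_yaml(cfg)) before relying on them).
cfg.diarizer.clustering.parameters.max_num_speakers = 8   # upper bound when the speaker count is unknown
cfg.diarizer.vad.parameters.onset = 0.8                   # VAD speech-onset threshold
cfg.diarizer.vad.parameters.offset = 0.6                  # VAD speech-offset threshold
print(OmegaConf.to_yaml(cfg.diarizer.clustering))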
10. Run ASR to get word-level timestamps
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
asr_model = asr_decoder_ts.set_asr_model()
word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)
print("Decoded word output dictionary: \n", word_hyp['test'])
print("Word-level timestamps dictionary: \n", word_ts_hyp['test'])
11. Create the offline diarization-with-ASR object
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
12. Run offline diarization
diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
print("Diarization hypothesis output: \n", diar_hyp['test'])
13. Display the offline diarization results
def read_file(path_to_file):
    with open(path_to_file) as f:
        contents = f.read().splitlines()
    return contents
predicted_speaker_label_rttm_path = f"{data_dir}/pred_rttms/test.rttm"
pred_rttm = read_file(predicted_speaker_label_rttm_path)
pp.pprint(pred_rttm)
from nemo.collections.asr.parts.utils.speaker_utils import rttm_to_labels
pred_labels = rttm_to_labels(predicted_speaker_label_rttm_path)
color = get_color(signal, pred_labels)
display_waveform(signal,'Audio with Speaker Labels', color)
display(Audio(signal,rate=sample_rate))
Sample output: the predicted RTTM speaker labels, followed by the per-segment ASR transcriptions printed by get_color:
[ 'SPEAKER test2 1 0.000 4.125 <NA> <NA> speaker_1 <NA> <NA>', 'SPEAKER test2 1 4.125 4.565 <NA> <NA> speaker_0 <NA> <NA>']
speaker_1 ['Hey, you told me the day before yesterday what the 12-period interest rate is']
speaker_0 ['If the job number is 908262, then it will be 0.810,000, then it will be divided into 120,000, and the interest will be 80']