import os
# Root directory of the dataset that is scanned for wav files.
# Raw string: the backslashes are literal path separators, and sequences such
# as "\P" or "\d" are invalid escapes that newer Python versions warn about.
source_file = r'D:\PythonTest\ASR_work\data_thchs30\data'
def source_get(source_file):
    """Collect every wav file under *source_file* and its label-file path.

    Walks the directory tree rooted at ``source_file``. For each file whose
    name ends in ``.wav`` or ``.WAV`` it records the wav path and the path of
    its label file, which is the wav path with ``.trn`` appended. The label
    files are not checked for existence here.

    Args:
        source_file: Root directory to search.

    Returns:
        A ``(label_lst, wav_lst)`` tuple of two lists of equal length, where
        ``label_lst[i]`` is the label path belonging to ``wav_lst[i]``. Both
        lists are empty when nothing matches.
    """
    label_lst = []
    wav_lst = []
    for root, _dirs, files in os.walk(source_file):
        for file in files:
            # str.endswith accepts a tuple, so one call covers both spellings.
            if file.endswith(('.wav', '.WAV')):
                # os.path.join avoids a doubled separator when root already
                # ends with one (e.g. the caller passed "data/").
                wav_file = os.path.join(root, file)
                wav_lst.append(wav_file)
                label_lst.append(wav_file + '.trn')
    return label_lst, wav_lst
# Scan the dataset directory once. source_get appends to both lists in the
# same iteration, so label_lst[i] is the label path for wav_lst[i].
label_lst, wav_lst = source_get(source_file)
# Disabled snippet: wrote, for every wav path, the file name up to its first
# dot (one per line) to s.txt.
# with open('s.txt','w+',encoding="utf-8") as f:
#     for i in wav_lst:
#         i = i.split("\\")[-1]
#         i = i.split('.')[0]
#         f.write(i+'\n')
def read_label(label_file):
    """Return the first line of *label_file*, trailing newline included.

    Only the first line is read instead of loading the whole file. An empty
    file yields ``''`` rather than raising ``IndexError``.

    Args:
        label_file: Path of a UTF-8 encoded text file.

    Returns:
        The first line as a string, or ``''`` if the file is empty.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(label_file, 'r', encoding='utf8') as f:
        return f.readline()
# print(read_label(label_lst[0]))
def gen_label_data(label_lst):
    """Read the first line of every label file, without its newline.

    Args:
        label_lst: Iterable of label-file paths, each passed to
            ``read_label``.

    Returns:
        A list with one string per input path, in input order, with leading
        and trailing ``'\\n'`` characters stripped. An empty input gives an
        empty list.
    """
    return [read_label(label_file).strip('\n') for label_file in label_lst]
# Load the first line of every label file, then print the first two labels
# and the first two wav paths (one list per output line) as a quick preview.
label_data = gen_label_data(label_lst)
print(label_data[:2], wav_lst[:2], sep='\n')
# Write one line per wav file to ss.txt in the form
# "<file name up to its first dot> <c1> <c2> ...", where the characters are
# those of the label with all whitespace removed.
with open('ss.txt', 'w+', encoding="utf-8") as f:
    for wav_path, transcript in zip(wav_lst, label_data):
        # os.path.basename understands the platform's path separators;
        # splitting on a literal backslash only worked on Windows.
        utt_id = os.path.basename(wav_path).split('.')[0]
        # Drop every whitespace character, then space-separate what is left.
        chars = ''.join(transcript.split())
        f.write(utt_id + ' ' + ' '.join(chars) + '\n')
# Build the character vocabulary from the transcript file: every distinct
# character in order of first appearance, followed by a final '_' entry,
# written one entry per line to sss.txt.
vocab = []
seen = set()  # O(1) membership test; vocab itself keeps first-seen order
with open('./aishell_transcript_v0.8.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # The first whitespace-separated token is skipped; the remaining
        # tokens are concatenated and walked character by character.
        # Iterating the string directly means a blank or single-token line
        # contributes nothing (no empty-string vocabulary entry).
        for pny in ''.join(line.split()[1:]):
            if pny not in seen:
                seen.add(pny)
                vocab.append(pny)
vocab.append('_')
with open('./sss.txt', 'w', encoding='utf-8') as fr:
    fr.writelines(entry + '\n' for entry in vocab)
# Speech-recognition data list and vocabulary (dictionary) processing script
# Guess you like
# Origin: blog.csdn.net/weixin_44885180/article/details/117326272
# Recommended
# Ranking