thchs30 local/thchs-30_data_prep.sh 详细注释

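这个脚本需要两个输入参数:$H 和 $thchs。
H 代表当前工作路径(脚本会在其下创建 data/train、data/dev、data/test);thchs 代表语料库路径,该路径下应包含 train、dev、test 三个音频文件夹以及存放标注文件(*.wav.trn)的 data 文件夹。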

#!/bin/bash
# Copyright 2016  Tsinghua University (Author: Dong Wang, Xuewei Zhang).  Apache 2.0.
#           2016  LeSpeech (Author: Xingyu Na)

# This script prepares the data directory for the thchs30 recipe.
# It reads the corpus and generates wav.scp and the transcriptions.

# Positional arguments:
#   $1  working directory of the recipe; data/{train,dev,test} is created in it
#   $2  corpus directory; it holds the train/, dev/ and test/ sub-directories
# Without this check an empty $1 turns the `cd` below into a bare `cd`, which
# silently switches to $HOME and builds the data/ tree there.
if [ $# -lt 2 ]; then
  echo "Usage: $0 <work-dir> <corpus-dir>" >&2
  exit 1
fi

dir=$1
corpus_dir=$2

# The script changes directory more than once, so a relative argument would
# stop resolving after the first cd. Anchor relative paths to the directory the
# script was started from; absolute paths are left exactly as given.
case $dir in
  /*) ;;
  *) dir=$PWD/$dir ;;
esac
case $corpus_dir in
  /*) ;;
  *) corpus_dir=$PWD/$corpus_dir ;;
esac

# Switch to the working directory; everything below uses paths relative to it.
cd "$dir" || { echo "$0: cannot cd to $dir" >&2; exit 1; }

echo "creating data/{train,dev,test}"
# Creates data/train, data/dev and data/test (and data/ itself if missing).
mkdir -p data/{train,dev,test} || exit 1

# Create wav.scp, utt2spk, word.txt, phone.txt and text in $dir/data/<set> for
# each of train, dev and test.
#
# Reads:  $corpus_dir/<set>/*.wav            audio files, named e.g. A2_0.wav
#         $corpus_dir/data/<name>.wav.trn    line 1: word transcript,
#                                            line 3: phone transcript
# Writes: wav.scp    "<uttid> <wav path>"
#         utt2spk    "<uttid> <spkid>"
#         word.txt   "<uttid> <words>"   (text is a copy of it)
#         phone.txt  "<uttid> <phones>"
# Ids are zero padded: A2_0 becomes speaker A02 and utterance A02_000.
#
# The work runs in a subshell so the cd calls do not leak into the rest of the
# script. Every critical step is checked explicitly: the status of a subshell
# is that of its last command only, and `set -e` would be ignored here because
# the subshell is the left-hand side of `||`.
(
  # Byte-wise ordering for sort and ASCII-only [A-Z] matching, whatever locale
  # the caller uses.
  export LC_ALL=C

  for x in train dev test; do
    echo "cleaning data/$x"
    # Never run the rm / file creation below in the wrong directory.
    cd "$dir/data/$x" || { echo "$0: cannot cd to $dir/data/$x" >&2; exit 1; }
    # Remove only the files this script generates.
    rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text

    echo "preparing scps and text in data/$x"
    n_utts=0
    # find lists every .wav under the set directory and sort -u orders the
    # list and drops duplicates. The names are read line by line so that no
    # word splitting or globbing is applied to them.
    while IFS= read -r wav_path; do
      # .../train/A2_0.wav -> A2_0
      nn=${wav_path##*/}
      nn=${nn%.wav}

      # Expected shape: <speaker letter><speaker number>_<utterance number>.
      if ! [[ $nn =~ ^([A-Z])([0-9]+)_([0-9]+)$ ]]; then
        echo "$0: skipping $wav_path: unexpected file name" >&2
        continue
      fi
      spk_char=${BASH_REMATCH[1]}
      # 10# forces base 10; otherwise numbers such as 08 or 09 would be
      # rejected by printf as invalid octal.
      spk_num=$((10#${BASH_REMATCH[2]}))
      utt_num=$((10#${BASH_REMATCH[3]}))
      printf -v spkid '%s%.2d' "$spk_char" "$spk_num"   # A2   -> A02
      printf -v uttid '%s_%.3d' "$spkid" "$utt_num"     # A2_0 -> A02_000

      trn=$corpus_dir/data/$nn.wav.trn
      if [[ ! -r $trn ]]; then
        echo "$0: cannot read transcription $trn" >&2
        exit 1
      fi
      # Line 1 holds the word transcript, line 3 the phone transcript; line 2
      # is read only to be skipped.
      word_line=''
      phone_line=''
      {
        IFS= read -r word_line
        IFS= read -r _
        IFS= read -r phone_line
      } < "$trn"
      # Split on blanks and re-join with single spaces: same whitespace
      # normalisation as an unquoted expansion, but without globbing.
      read -r -a words <<< "$word_line"
      read -r -a phones <<< "$phone_line"
      word_fields=("$uttid" "${words[@]}")
      phone_fields=("$uttid" "${phones[@]}")

      # The path reported by find is used, so a file found in a nested
      # directory is still referenced correctly.
      printf '%s %s\n' "$uttid" "$wav_path" >> wav.scp || exit 1
      printf '%s %s\n' "$uttid" "$spkid" >> utt2spk || exit 1
      printf '%s\n' "${word_fields[*]}" >> word.txt || exit 1
      printf '%s\n' "${phone_fields[*]}" >> phone.txt || exit 1
      n_utts=$((n_utts + 1))
    done < <(find "$corpus_dir/$x" -name '*.wav' | sort -u)

    if [[ $n_utts -eq 0 ]]; then
      echo "$0: no usable .wav files found under $corpus_dir/$x" >&2
      exit 1
    fi

    # The word-level transcript is the default text.
    cp word.txt text || exit 1
    # Sort in place; `-o FILE` comes before the operand for portability.
    for f in wav.scp utt2spk text phone.txt; do
      sort -o "$f" "$f" || exit 1
    done
  done
) || exit 1

# Build the speaker -> utterances map (spk2utt) of each data set from its
# utt2spk file with the utils/utt2spk_to_spk2utt.pl tool. The tool path is
# relative to the current directory. Stop on the first failure instead of
# continuing with an empty or partial spk2utt.
for part in train dev test; do
  utils/utt2spk_to_spk2utt.pl "data/$part/utt2spk" > "data/$part/spk2utt" || {
    echo "$0: failed to create data/$part/spk2utt" >&2
    exit 1
  }
done
echo "creating test_phone for phone decoding"
# data/test_phone is a copy of data/test whose text file is replaced by the
# phone-level transcript (phone.txt). Runs in a subshell so the cd stays local.
(
  # Start from a fresh copy of the test set.
  rm -rf data/test_phone || exit 1
  cp -R data/test data/test_phone || exit 1

  # Swap the word transcript for the phone transcript.
  cd data/test_phone || exit 1
  rm text || exit 1
  cp phone.txt text || exit 1
)

猜你喜欢

转载自blog.csdn.net/x603560617/article/details/83114424
30