语音识别、语音合成、歌声合成自动标注工具

语音合成与歌声合成

  • 我一直从事语音方面的工作。
  • 目前行业存在的难点:人工标注比较繁琐、难度大,成本高,而且不够准确。
  • 目前我的标注工具已经过试验,确认可用,希望对大家有用。

标注歌声的样本实例

  • 歌词音素的发声时段与音频中对应时段的对齐结果(每行依次为:起始时间、结束时间、音素)
0	2400000	pau
2400000	2750000	u
2750000	2850000	sh
2850000	2920000	iii
2920000	2970000	l
2970000	3220000	iu
3220000	3260000	g
3260000	3420000	e
3420000	3500000	m
3500000	3720000	in
3720000	3820000	z
3820000	4170000	u
4170000	4570000	pau
4570000	4760000	u
4760000	4900000	sh
4900000	5080000	iii
5080000	5140000	l
5140000	5310000	iu
5310000	5410000	zh
5410000	5590000	iii
5590000	5700000	h
5700000	6240000	ua
6680000	6980000	u
6980000	7060000	sh
7060000	7120000	iii
7120000	7160000	l
7160000	7350000	iu
7350000	7430000	g
7430000	7560000	e
7560000	7680000	x
7680000	7880000	iong
7880000	7910000	d
7910000	8090000	i
8090000	8180000	j
8180000	8340000	ie
8340000	8410000	m
8410000	8580000	ei
8580000	8710000	sh
8710000	9070000	iii
9070000	9070000	xx
9070000	9070000	xx
9070000	9620000	i
9620000	9620000	xx
9620000	9730000	j
9730000	10320000 ia
10320000 10740000 pau
10740000 11050000 u
11050000 11150000 sh
11150000 11190000 iii
11190000 11240000 l
11240000 11450000 iu
11450000 11530000 z
11530000 11680000 u
11680000 11980000 iu
11980000 12400000 ian
12400000 12760000 pau
12760000 12860000 h
12860000 13010000 ui
13010000 13120000 ch
13120000 13250000 eng
13250000 13560000 i
13560000 13670000 j
13670000 13850000 v
13850000 13950000 h
13950000 14540000 ua
14930000 15170000 ai
15170000 15400000 uo
15400000 15470000 zh
15470000 15640000 ong
15640000 15730000 h
15730000 15920000 ua
15920000 16180000 ai
16180000 16420000 uo
16420000 16520000 zh
16520000 16660000 ong
16660000 16760000 h
16760000 16990000 ua
16990000 17230000 ai
17230000 17740000 uo
17740000 17740000 pau
17740000 17860000 zh
17860000 18040000 ong
18040000 18150000 h
18150000 18580000 ua
18580000 18870000 pau
18870000 18960000 h
18960000 19000000 ei
19000000 19060000 l
19060000 19140000 uo
19140000 19210000 h
19210000 19430000 ei
19430000 19500000 l
19500000 19650000 uo
19650000 19730000 h
19730000 19930000 ei
19930000 20000000 l
20000000 20170000 uo
20170000 20270000 h
20270000 20700000 ei
20700000 21010000 pau
21010000 21070000 h
21070000 21140000 ei
21140000 21180000 l
21180000 21260000 uo
21260000 21340000 h
21340000 21560000 ei
21560000 21610000 l
21610000 21730000 uo
21730000 21820000 h
21820000 22000000 ei
22000000 22120000 l
22120000 22270000 uo
22270000 22370000 h
22370000 22820000 ei
22820000 23130000 pau
23130000 23180000 h
23180000 23230000 ei
23230000 23280000 l
23280000 23340000 uo
23340000 23420000 h
23420000 23620000 ei
23620000 23700000 l
23700000 23880000 uo
23880000 23960000 h
23960000 24140000 ei
24140000 24190000 l
24190000 24360000 uo
24360000 24460000 h
24460000 24660000 ei
24660000 24730000 l
24730000 24890000 uo
24890000 24980000 h
24980000 25150000 ei
25150000 25240000 l
25240000 25430000 uo
25430000 25680000 ai
25680000 25680000 xx
25680000 26180000 uo
26180000 26280000 zh
26280000 26470000 ong
26470000 26570000 h
26570000 27010000 ua
27010000 27660000 pau
27660000 28430000 ai
28430000 28700000 uo
28700000 28780000 zh
28780000 28970000 ong
28970000 29060000 h
29060000 29410000 ua
29410000 29820000 pau
29820000 29870000 j
29870000 30080000 ian
30080000 30290000 er
30290000 30380000 f
30380000 30550000 en
30550000 30640000 q
30640000 30820000 i
30820000 30820000 xx
30820000 30890000 b
30890000 31050000 u
31050000 31170000 f
31170000 31540000 a
32040000 32800000 ai
32800000 33080000 uo
33080000 33150000 zh
33150000 33330000 ong
33330000 33430000 h
33430000 33790000 ua
33790000 34110000 pau
34110000 34230000 j
34230000 34370000 ian
34370000 34490000 sh
34490000 34640000 e
34640000 34920000 uo
34920000 34980000 m
34980000 35040000 en
35040000 35100000 d
35100000 35170000 i
35170000 35230000 g
35230000 35410000 uo
35410000 35510000 j
35510000 35950000 ia
36360000 37150000 ai
37150000 37400000 uo
37400000 37480000 zh
37480000 37660000 ong
37660000 37750000 h
37750000 38090000 ua
38090000 38440000 pau
38440000 38510000 zh
38510000 38670000 ong
38670000 38730000 h
38730000 38900000 ua
38900000 38990000 x
38990000 39180000 iong
39180000 39250000 z
39250000 39430000 ii
39430000 39430000 xx
39430000 39700000 ing
39700000 39800000 f
39800000 40200000 a
40630000 41360000 ai
41360000 41650000 uo
41650000 41730000 zh
41730000 41900000 ong
41900000 42010000 h
42010000 42320000 ua
42320000 42810000 pau
42810000 42980000 u
42980000 43090000 sh
43090000 43310000 iii
43310000 43370000 l
43370000 43530000 iu
43530000 43610000 z
43610000 43770000 u
43770000 43860000 x
43860000 44050000 iong
44050000 44090000 d
44090000 44280000 i
44280000 44370000 j
44370000 44540000 ie
44540000 44610000 m
44610000 44770000 ei
44770000 44990000 u
44990000 45100000 sh
45100000 45290000 iii
45290000 45340000 l
45340000 45520000 iu
45520000 45590000 z
45590000 45770000 u
45770000 46000000 iu
46000000 46280000 ian
46280000 46360000 h
46360000 46530000 ui
46530000 46620000 ch
46620000 46780000 eng
46780000 47370000 i
47370000 47370000 xx
47370000 47680000 pau
47680000 47750000 j
47750000 47920000 v
47920000 48010000 h
48010000 48370000 ua
48370000 48940000 pau
48940000 49370000 ai
49370000 49880000 uo
49880000 50000000 zh
50000000 50180000 ong
50180000 50280000 h
50280000 50770000 ua
50770000 51050000 pau

语音合成标注

  • 文本拼音的音素与说话声音 wav 中对应时段的对齐结果
  • 适用于 Merlin 合成方法以及参数化语音训练方法 HTS
0 2500000 xx^xx-sil+k=a3@xx@/A:xx-xx^xx@/B:xx+xx@xx^xx^xx+xx#xx-xx-/C:xx_xx^xx#xx+xx+xx&/D:xx=xx!xx@xx-xx&/E:xx|xx-xx@xx#xx&xx!xx-xx#/F:xx^xx=xx_xx-xx!
2500000 4000000 xx^sil-k+a3=er3@a@/A:xx-3^3@/B:0+8@1^3^1+3#1-9-/C:xx_n^v#xx+3+1&/D:xx=3!6@1-2&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
4000000 4500000 sil^k-a3+er3=p@a@/A:xx-3^3@/B:0+8@1^3^1+3#1-9-/C:xx_n^v#xx+3+1&/D:xx=3!6@1-2&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
4500000 6000000 k^a3-er3+p=u3@er@/A:3-3^3@/B:1+7@2^2^2+2#2-8-/C:xx_n^v#xx+3+1&/D:xx=3!6@1-2&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
6000000 7300000 a3^er3-p+u3=p@u@/A:3-3^2@/B:2+6@3^1^3+1#3-7-/C:xx_n^v#xx+3+1&/D:xx=3!6@1-2&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
7300000 8600000 er3^p-u3+p=ei2@u@/A:3-3^2@/B:2+6@3^1^3+1#3-7-/C:xx_n^v#xx+3+1&/D:xx=3!6@1-2&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
8600000 9800000 p^u3-p+ei2=w@ei@/A:3-2^4@/B:3+5@1^1^1+6#4-6-/C:n_v^n#3+1+2&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
9800000 10600000 u3^p-ei2+w=uai4@ei@/A:3-2^4@/B:3+5@1^1^1+6#4-6-/C:n_v^n#3+1+2&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
10600000 11500000 p^ei2-w+uai4=s@uai@/A:2-4^1@/B:4+4@1^2^2+5#5-5-/C:v_n^v#1+2+1&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
11500000 12800000 ei2^w-uai4+s=un1@uai@/A:2-4^1@/B:4+4@1^2^2+5#5-5-/C:v_n^v#1+2+1&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
12800000 14200000 w^uai4-s+un1=w@un@/A:4-1^2@/B:5+3@2^1^3+4#6-4-/C:v_n^v#1+2+1&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
14200000 15700000 uai4^s-un1+w=uan2@un@/A:4-1^2@/B:5+3@2^1^3+4#6-4-/C:v_n^v#1+2+1&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
15700000 16399999 s^un1-w+uan2=h@uan@/A:1-2^2@/B:6+2@1^1^4+3#7-3-/C:n_v^n#2+1+2&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
16399999 17500000 un1^w-uan2+h=ua2@uan@/A:1-2^2@/B:6+2@1^1^4+3#7-3-/C:n_v^n#2+1+2&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
17500000 18800000 w^uan2-h+ua2=t@ua@/A:2-2^1@/B:7+1@1^2^5+2#8-2-/C:v_n^xx#1+2+xx&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
18800000 20099999 uan2^h-ua2+t=i1@ua@/A:2-2^1@/B:7+1@1^2^5+2#8-2-/C:v_n^xx#1+2+xx&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
20099999 21900000 h^ua2-t+i1=sil@i@/A:2-1^xx@/B:8+0@2^1^6+1#9-1-/C:v_n^xx#1+2+xx&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
21900000 24200000 ua2^t-i1+sil=xx@i@/A:2-1^xx@/B:8+0@2^1^6+1#9-1-/C:v_n^xx#1+2+xx&/D:3=6!xx@2-1&/E:xx|9-xx@xx#2&xx!1-1#/F:xx^9=5_2-1!
24200000 26400000 t^i1-sil+xx=xx@xx@/A:xx-xx^xx@/B:xx+xx@xx^xx^xx+xx#xx-xx-/C:xx_xx^xx#xx+xx+xx&/D:xx=xx!xx@xx-xx&/E:xx|xx-xx@xx#xx&xx!xx-xx#/F:xx^xx=xx_xx-xx!

核心源码

  • textgrid to mono label
#!/usr/bin/python
# -*- coding: utf-8 -*-
# __author__ = "errrolyan"
# Date: 18-10-16
# Describe = "Convert a Praat TextGrid output file into a mono label file"
import sys
import re

usage = 'Usage: ./textgrid_to_lab.py input.TextGrid output.lab'


def _field_value(line):
    """Return the trimmed text after the first '=' of a 'key = value' line."""
    return line.partition('=')[2].strip()


def textgrid_to_lab(ifname, ofname):
    """Write the intervals of a long-format Praat TextGrid as a label file.

    The output starts with the three header lines ``separator ;``,
    ``nfields 1`` and ``#``.  After that, one line is written per ``text``
    entry found in the TextGrid: a tab and a space, the interval end time
    (the most recent ``xmax``) with five decimals, the constant field
    `` 26 ``, a tab, and the interval text without its surrounding quotes.

    Everything before the first line that starts with ``intervals`` is
    skipped, so the file-level and tier-level ``xmin``/``xmax`` are ignored.
    Interval start times are not part of the output format and are not read.

    Args:
        ifname: Path of the TextGrid file to read.
        ofname: Path of the label file to create or overwrite.

    Raises:
        IOError/OSError: If either file cannot be opened.
        ValueError: If a ``text`` line is reached before a numeric ``xmax``.
    """
    # Open the input first so that a missing input file does not leave a
    # header-only output file behind; ``with`` closes both files on errors.
    with open(ifname) as inf, open(ofname, 'w') as outf:
        # boilerplate
        outf.write('separator ;\n')
        outf.write('nfields 1\n')
        outf.write('#\n')

        in_intervals = False
        end = ''
        for line in inf:
            stripped = line.strip()
            if not in_intervals:
                # A line starting with "intervals" can never be an
                # xmin/xmax/text line, so it is safe to move on right away.
                in_intervals = stripped.startswith('intervals')
                continue
            if stripped.startswith('xmax'):
                end = _field_value(stripped)
            elif stripped.startswith('text'):
                # "text" is the last field of an interval: emit the record.
                text = _field_value(stripped).strip('"')
                outf.write('\t %.5f' % float(end) + ' 26 \t' + text + '\n')


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 after a successful conversion, 1 when the arguments are wrong
        (the usage string is printed in that case).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        print(usage)
        return 1
    textgrid_to_lab(argv[1], argv[2])
    return 0


if __name__ == '__main__':
    sys.exit(main())

  • lab to textgrid
#!/usr/bin/python
# -*- coding: utf-8 -*-
# __author__ = "errrolyan"
# Date: 18-10-16
# Describe = "Convert the .lab files produced by lab_to_lab into Praat TextGrid files so the vowel/consonant boundaries of the labels can be corrected in Praat"
import sys,os
import re

# usage = 'Usage: ./lab_to_textgrid.py input.lab output.TextGrid'
#
# if len(sys.argv) != 3:
#     print usage
#     exit()
#
# ifname = sys.argv[1]
# ofname = sys.argv[2]

def labtotextgrid(ifname,ofname):
    """Convert a "start end label" file into a one-tier Praat TextGrid.

    Every input line of the form ``<int> <int> <label>`` (whitespace
    separated) becomes one interval of an ``IntervalTier`` named "labels".
    Only the second number (the end time) is used: it is divided by
    10,000,000 (presumably converting HTS-style 100 ns units to seconds),
    and each interval starts where the previous one ended, the first one
    at 0.  Lines that do not match the pattern are ignored.

    Args:
        ifname: Path of the label file to read.
        ofname: Path of the TextGrid file to create or overwrite.

    Raises:
        ValueError: If the input contains no usable label line.  This is
            checked before the output file is opened, so no empty or
            truncated TextGrid is left behind.
        IOError/OSError: If either file cannot be opened.
    """
    # Raw string so that \s and \d are regex escapes rather than (invalid)
    # string escapes.  \s+ between the fields requires three real tokens;
    # with \s* a two-token line such as "123 abc" also matched and then
    # failed on tokens[2].
    lab_line = re.compile(r'^\s*\d+\s+\d+\s+\S+')

    # get info from .lab
    labs = []
    with open(ifname, 'r') as inf:
        for line in inf:
            if not lab_line.search(line):
                continue
            tokens = line.split()
            labs.append((str(int(tokens[1]) / 10000000.0), tokens[2]))

    if not labs:
        raise ValueError('no "start end label" lines found in ' + ifname)

    # The end of the last interval is the end of the whole tier.
    maxtime = labs[-1][0]

    with open(ofname, 'w') as outf:
        # boilerplate
        outf.write('File type = "ooTextFile"\n')
        outf.write('Object class = "TextGrid"\n')
        outf.write('\n')
        outf.write('xmin = 0\n')
        outf.write('xmax = ' + maxtime + '\n')
        outf.write('tiers? <exists>\n')
        outf.write('size = 1\n')
        outf.write('item []:\n')
        outf.write('    item [1]:\n')
        outf.write('        class = "IntervalTier"\n')
        outf.write('        name = "labels"\n')
        outf.write('        xmin = 0\n')
        outf.write('        xmax = ' + maxtime + '\n')
        outf.write('        intervals: size = ' + str(len(labs)) + '\n')

        # intervals
        prevtime = '0'
        for count, (endtime, label) in enumerate(labs, 1):
            # TextGrid text files represent a double quote inside a string
            # by doubling it.
            text = label.replace('"', '""')
            outf.write('        intervals [' + str(count) + ']:\n')
            outf.write('            xmin = ' + prevtime + '\n')
            outf.write('            xmax = ' + endtime + '\n')
            outf.write('            text = "' + text + '"\n')
            prevtime = endtime


def fileread_mono(filepath,filechange):
    """Convert every ``.lab`` file directly inside a directory to a TextGrid.

    Each regular file ``<name>.lab`` in ``filepath`` is passed to
    ``labtotextgrid`` and written to ``<filechange>/<name>.TextGrid``.
    Sub-directories and files with other extensions are skipped.  The path
    of every converted file is printed.

    Args:
        filepath: Directory containing the ``.lab`` files.
        filechange: Existing directory that receives the ``.TextGrid`` files.
    """
    # Sorted so the processing (and printing) order is deterministic.
    for s in sorted(os.listdir(filepath)):
        lab_in = os.path.join(filepath, s)
        if not os.path.isfile(lab_in):
            continue
        # Take the stem from the file name itself; slicing a fixed number of
        # characters off the joined path only worked for 26-character stems.
        name, ext = os.path.splitext(s)
        if ext != ".lab":
            continue
        print(lab_in)
        # os.path.join works whether or not the directory arguments end
        # with a path separator.
        lab_out = os.path.join(filechange, name + ".TextGrid")
        labtotextgrid(lab_in, lab_out)
if __name__ == "__main__":
    # Run the batch conversion with its fixed input/output directories only
    # when this file is executed as a script, so that importing the module
    # for its functions does not scan the disk or fail on a missing directory.
    fileread_mono("./labels_new/mono/", "textgridin/")
发布了299 篇原创文章 · 获赞 129 · 访问量 8万+

猜你喜欢

转载自blog.csdn.net/weixin_32393347/article/details/102829011