[API Analysis] Steps for calling the official Microsoft text-to-speech demo

1. Source

Microsoft text-to-speech official demo page: https://azure.microsoft.com/en-us/services/cognitive-services/text-to-speech

2. Preparation

  • Feature source: the Edge browser
  • Packet capture tool: Fiddler
  • Request simulation: Postman

3. Main analysis steps

  • Step 1: Click the text-to-speech play button. In the developer tools Network panel you can directly find the wss connection wss://eastus.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization=bearer%20{token}, and Fiddler captures the same wss request.
// Source: https://azure.microsoft.com/en-us/services/cognitive-services/text-to-speech
// The page initializes the token and region
var localizedResources = {
	token: "eyJhbGciOiJodHRwOi8vd3d3LnczLm9yZy8yMDAxLzA0L3htbGRzaWctbW9yZSNobWFjLXNoYTI1NiIsInR5cCI6IkpXVCJ9.eyJyZWdpb24iOiJlYXN0dXMiLCJzdWJzY3JpcHRpb24taWQiOiI2MWIxODBlMmJkOGU0YWI2OGNiNmQxN2UxOWE5NjAwMiIsInByb2R1Y3QtaWQiOiJTcGVlY2hTZXJ2aWNlcy5TMCIsImNvZ25pdGl2ZS1zZXJ2aWNlcy1lbmRwb2ludCI6Imh0dHBzOi8vYXBpLmNvZ25pdGl2ZS5taWNyb3NvZnQuY29tL2ludGVybmFsL3YxLjAvIiwiYXp1cmUtcmVzb3VyY2UtaWQiOiIvc3Vic2NyaXB0aW9ucy9jMjU1ZGYzNi05NzRjLTQ2MGEtODMwYi0yNTE2NTEzYWNlYjIvcmVzb3VyY2VHcm91cHMvY3MtY29nbml0aXZlc2VydmljZXMtcHJvZC13dXMyL3Byb3ZpZGVycy9NaWNyb3NvZnQuQ29nbml0aXZlU2VydmljZXMvYWNjb3VudHMvYWNvbS1zcGVlY2gtcHJvZC1lYXN0dXMiLCJzY29wZSI6InNwZWVjaHNlcnZpY2VzIiwiYXVkIjoidXJuOm1zLnNwZWVjaHNlcnZpY2VzLmVhc3R1cyIsImV4cCI6MTY1NzU0MjgyMywiaXNzIjoidXJuOm1zLmNvZ25pdGl2ZXNlcnZpY2VzIn0.vI3ferw2AowktlDmmrMLvr-XVJicjm8gagPie59UZbc",
	region: "eastus",
	srComplete: "Done Recognizing Speech",
	srStartFailure: "Cannot Recognize Speech",
	srCanceledError: "Recognition was canceled due to error ",
	srStartSpeaking: "Start Speaking",
	srTryAgain: "An error occurred while loading this demo, please reload and try again",
	srTooManyFiles: "This demo supports a maximum of 5 files.",
	ttsPitch: "Pitch",
	ttsSpeed: "Speaking speed",
	ttsPreview: "Preview",
	ttsDefaultText: { ... }
}
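
As a side note, the token embedded in the page is a JWT: its payload already carries the region and an exp (expiry) claim, which is why a scraped token stops working after a while. A small C# sketch for checking the expiry (the helper name GetTokenExpiry is my own, and it assumes the same usings as the console program in section 4):

// Hypothetical helper: decode the JWT payload of the scraped token and return its "exp" claim (Unix seconds).
static long GetTokenExpiry(string jwt)
{
    // The payload is the middle, base64url-encoded segment of the JWT.
    var payload = jwt.Split('.')[1].Replace('-', '+').Replace('_', '/');
    switch (payload.Length % 4)
    {
        case 2: payload += "=="; break;
        case 3: payload += "="; break;
    }
    var json = Encoding.UTF8.GetString(Convert.FromBase64String(payload));
    return long.Parse(Regex.Match(json, "\"exp\":(?<exp>\\d+)").Groups["exp"].Value);
}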

// Source: https://azurecomcdn.azureedge.net/cvt-f187b0e8321af2f3c7299619208c62b4c1e44f0eb595e2abd9bc3207f2c90b3e/scripts/Acom/Components/cognitiveServicesDemos/speechJsSdk/textToSpeech.js
// Code that fetches the voice list
$.ajax({
    url: 'https://' + localizedResources.region + '.tts.speech.microsoft.com/cognitiveservices/voices/list',
    type: 'GET',
    beforeSend: function textToSpeechVoiceListBeforeAjaxSend(xhr) {
        xhr.setRequestHeader('Authorization', 'Bearer ' + localizedResources.token);
    },
    success: function textToSpeechVoiceListAjaxSuccess(data) {
        // put neural voices in front.
        var sorted = data.sort(function (a, b) {
            return a.VoiceType.localeCompare(b.VoiceType);
        });
        $.each(sorted, function (_index, element) {
            var displayName = element.DisplayName;
            if (element.Status === 'Deprecated') {
                // Don't show deprecated voices.
                return;
            }
            if (!voiceList[element.Locale]) {
                voiceList[element.Locale] = '';
            }
            if (element.VoiceType === 'Neural') {
                displayName += ' (Neural)';
            }
            if (element.LocalName !== element.DisplayName) {
                displayName += ' - ' + element.LocalName;
            }
            if (element.Status === 'Preview') {
                displayName += ' - ' + localizedResources.ttsPreview;
            }
            voiceList[element.Locale] += '<option value="' + element.ShortName + '">' + displayName + '</option>';
            styleList[element.ShortName] = element.StyleList;
            rolePlayList[element.ShortName] = element.RolePlayList;
            secondaryLocaleList[element.ShortName] = element.SecondaryLocaleList;
        });
        language.onchange();
    },
    error: function textToSpeechVoiceListAjaxError(_jqXHR, _textStatus, error) {
        status.innerText = localizedResources.srTryAgain;
        global.Core.Util.TrackException('A Text To Speech voice list API Ajax error occurred: ' + error);
    }
});

// Source: https://azurecomcdn.azureedge.net/cvt-f187b0e8321af2f3c7299619208c62b4c1e44f0eb595e2abd9bc3207f2c90b3e/scripts/Acom/Components/cognitiveServicesDemos/speechJsSdk/textToSpeech.js
// Function triggered by the play button
function SpeakOnce() {
    var config = SpeechSDK.SpeechTranslationConfig.fromAuthorizationToken(localizedResources.token, localizedResources.region),
        synthesizer,
        audioConfig;

    // due to a bug in Chromium (https://bugs.chromium.org/p/chromium/issues/detail?id=1028206)
    // mp3 playback has some beeps, using a higher bitrate here as a workaround.
    config.speechSynthesisOutputFormat = SpeechSDK.SpeechSynthesisOutputFormat.Audio24Khz160KBitRateMonoMp3;

    player = new SpeechSDK.SpeakerAudioDestination();
    player.onAudioEnd = function () {
        stopli.hidden = true;
        playli.hidden = false;
    };

    audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(player);

    synthesizer = new SpeechSDK.SpeechSynthesizer(config, audioConfig);

    synthesizer.synthesisCompleted = function () {
        synthesizer.close();
        synthesizer = null;
    };

    synthesizer.SynthesisCanceled = function (s, e) {
        var details;
        stopli.hidden = true;
        playli.hidden = false;
        details = SpeechSDK.CancellationDetails.fromResult(e);
        if (details.reason === SpeechSDK.CancellationReason.Error) {
            status.innerText = localizedResources.srTryAgain;
        }
    };

    synthesizer.speakSsmlAsync(ssml.value, function () { }, function (error) {
        status.innerText = localizedResources.srTryAgain + ' ' + error;
    });
}
  • Step 2: Comparing this with the earlier analysis of the API calls behind the Edge browser's Read Aloud feature, the steps are quite similar, except that here a token first has to be obtained from the Microsoft text-to-speech demo page before the later requests can be made.
/*
 * Reproduced successfully in Postman.
 * Fetch the token and region from the official demo page (they are embedded in the page's localizedResources object).
 * http url: https://azure.microsoft.com/en-us/services/cognitive-services/text-to-speech
 * method: GET
 */
var region, token // filled in from the localizedResources object embedded in the page
{
	uri: "https://azure.microsoft.com/en-us/services/cognitive-services/text-to-speech",
	method: "GET"
}

/*
 * Reproduced successfully in Postman.
 * Fetch the list of available voices, using the token and region obtained from the demo page.
 * http url: https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list
 * headers: { Authorization: `bearer ${token}` }
 * method: GET
 */
{
	uri: `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`,
	headers: {
		Authorization: `bearer ${token}`
	},
	method: "GET"
}
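
The console program in section 4 below only implements the wss part. For reference, a minimal C# sketch of this voices/list call could look as follows (the helper name GetVoiceList and the use of HttpWebRequest are my own choices, not taken from the demo code):

// Hypothetical helper: fetch the available voices using the token and region scraped from the demo page.
static string GetVoiceList(string region, string token)
{
    var request = WebRequest.CreateHttp("https://" + region + ".tts.speech.microsoft.com/cognitiveservices/voices/list");
    request.Method = "GET";
    request.Headers["Authorization"] = "Bearer " + token;
    using (var response = request.GetResponse())
    using (var reader = new StreamReader(response.GetResponseStream()))
        return reader.ReadToEnd(); // JSON array describing each voice (ShortName, Locale, VoiceType, StyleList, ...)
}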

/*
 * Reproduced successfully in Postman.
 * Open the wss connection and exchange the text and audio data, mimicking what the Speech SDK sends inside SpeakOnce().
 * wss url: wss://${region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization=bearer%20{token}
 * send: first generate a random requestId (a GUID with the "-" separators removed), then send three messages
 *       (1st: the Speech SDK environment, 2nd: the audio format, 3rd: the SSML-marked-up text)
 * receive: the audio bytes are carried in the body of frames with the same requestId; use Path:audio\r\n to locate the start of the body
 * note: unlike the Edge Read Aloud endpoint, the audio format can be chosen freely according to the official documentation
 */
{
	uri: `wss://${region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1`,
	query: {
		Authorization: `bearer%20{token}`
	},
	sendmessage: {
		speechconfig: `
Path: speech.config
X-RequestId: 095E1E12004641208D62F656AC26CED6
X-Timestamp: 2022-07-11T10:45:52.938Z
Content-Type: application/json

{"context":{"system":{"name":"SpeechSDK","version":"1.19.0","build":"JavaScript","lang":"JavaScript"},"os":{"platform":"Browser/Win32","name":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49","version":"5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49"}}}`,
		synthesiscontext: `
Path: synthesis.context
X-RequestId: 095E1E12004641208D62F656AC26CED6
Content-Type: application/json

{"synthesis":{"audio":{"metadataOptions":{"bookmarkEnabled":false,"sentenceBoundaryEnabled":false,"visemeEnabled":false,"wordBoundaryEnabled":false},"outputFormat":"audio-24khz-160kbitrate-mono-mp3"},"language":{"autoDetection":false}}}`,
		ssml: `
Path: ssml
X-RequestId: 095E1E12004641208D62F656AC26CED6
Content-Type: application/ssml+xml

<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="en-US-JennyNeural"><prosody rate="0%" pitch="0%">You can replace this text with any text you wish. You can either write in this text box or paste your own text here.
Try different languages and voices. Change the speed and the pitch of the voice. You can even tweak the SSML (Speech Synthesis Markup Language) to control how the different sections of the text sound. Click on SSML above to give it a try!
Enjoy using Text to Speech!</prosody></voice></speak>`
	}
}

4. Writing the code

  • WebSocket library: WebSocketSharp. If the latest version fails to install, fall back to an older one; at the time of writing the latest preview release is 1.0.3-rc11.
using System;
using System.Collections.Generic;
using System.Linq;
using System.IO;
using System.Text;
using System.Security.Authentication;
using System.Web;
using System.Net;
using System.Text.RegularExpressions;
using WebSocketSharp;

namespace ConsoleTest
{
    internal class Program
    {
        static string UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49";

        static Dictionary<string, string> GetToken()
        {
            var url = "https://azure.microsoft.com/en-us/services/cognitive-services/text-to-speech";
            var request = WebRequest.CreateHttp(url);
            request.Method = "GET";
            request.UserAgent = UserAgent;
            var response = request.GetResponse();
            using (var stream = response.GetResponseStream())
            using (var rd = new StreamReader(stream))
            {
                var content = rd.ReadToEnd();
                var match = Regex.Match(content, @"localizedResources\s?=\s?{\r?\n\s+token:\s?""(?<token>.*?)"",\r?\n\s+region:\s?""(?<region>.*?)""");
                return new Dictionary<string, string>
                {
                    { "token", match.Groups["token"].Value },
                    { "region", match.Groups["region"].Value }
                };
            }
        }

        static void Main(string[] args)
        {
            var localres = GetToken();
            var AudioDelimeter = "Path:audio\r\n";
            var url = $"wss://{localres["region"]}.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization={HttpUtility.UrlPathEncode("bearer " + localres["token"])}";
            var dataBuffers = new Dictionary<string, List<byte>>();
            
            // Audio output format
            var audioOutputFormat = "audio-24khz-160kbitrate-mono-mp3";
            // SSML parameters
            var Language = "en-US";
            var Voice = "zh-CN-XiaoxiaoNeural";
            var Rate = 0;
            var Pitch = 0;
            var msg = "Hello world";

            // Generate a requestId (a GUID with the "-" separators removed)
            var sendRequestId = Guid.NewGuid().ToString().Replace("-", "").ToUpper();
            // The three messages to send
            var speechconfig = $"Path: speech.config\r\nX-RequestId: {
      
      sendRequestId}\r\nContent-Type: application/json\r\n\r\n"
                + ("{'context':{'system':{'name':'SpeechSDK','version':'1.19.0','build':'JavaScript','lang':'JavaScript'},'os':{'platform':'Browser/Win32','name':'"
                   + UserAgent + "','version':'" + UserAgent.Split("/".ToCharArray(), 2)[1] +"'}}}").Replace("'", "\"");
            var speechcontext = $"Path: synthesis.context\r\nX-RequestId: {sendRequestId}\r\nContent-Type: application/json\r\n\r\n"
                + ("{'synthesis':{'audio':{'metadataOptions':{'bookmarkEnabled':false,'sentenceBoundaryEnabled':false,'visemeEnabled':false,'wordBoundaryEnabled':false},'outputFormat':'" + audioOutputFormat + "'},'language':{'autoDetection':false}}}").Replace("'", "\"");
            var ssmltext = $"Path: ssml\r\nX-RequestId: {sendRequestId}\r\nContent-Type: application/ssml+xml\r\n\r\n"
                + $"<speak xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xmlns:emo='http://www.w3.org/2009/10/emotionml' version='1.0' xml:lang='{Language}'><voice name='{Voice}'><prosody rate='{Rate}%' pitch='{Pitch}%'>{msg}</prosody></voice></speak>";

            Console.WriteLine(url);
            var webSocket = new WebSocket(url);
            webSocket.SslConfiguration.ServerCertificateValidationCallback = (sender, certificate, chain, sslPolicyErrors) => true;
            // Enable TLS 1.2; without it the connection fails with:
            // System.Security.Authentication.AuthenticationException: A call to SSPI failed, see inner exception.
            webSocket.SslConfiguration.EnabledSslProtocols = SslProtocols.Tls | SslProtocols.Tls11 | SslProtocols.Tls12 | SslProtocols.Ssl2;
            webSocket.OnOpen += (sender, e) => Console.WriteLine("[Log] WebSocket Open");
            webSocket.OnClose += (sender, e) => Console.WriteLine("[Log] WebSocket Close");
            webSocket.OnError += (sender, e) => Console.WriteLine("[Error] error message: " + e.Message);
            webSocket.OnMessage += (sender, e) =>
            {
                if (e.IsText)
                {
                    var data = e.Data;
                    var requestId = Regex.Match(data, @"X-RequestId:(?<requestId>.*?)\r?\n").Groups["requestId"].Value;
                    Console.WriteLine("- [" + requestId + "]:\n" + e.Data);
                    if (data.Contains("Path:turn.start"))
                    {
                        // start of turn, ignore.
                    }
                    else if (data.Contains("Path:turn.end"))
                    {
                        // end of turn, close stream.
                        // dataBuffers[requestId] = null;
                        // Don't copy the line above from MsEdgeTTS: after the audio has been sent in full,
                        // one final text message indicating the end of the audio still arrives.
                        webSocket.Close();
                    }
                    else if (data.Contains("Path:response"))
                    {
                        // context response, ignore.
                    }
                    else
                    {
                        Console.WriteLine("unknown message: " + data); // should not normally happen
                    }
                }
                else if (e.IsBinary)
                {
                    var data = e.RawData;
                    var message = Encoding.UTF8.GetString(e.RawData);
                    var requestId = Regex.Match(message, @"X-RequestId:(?<requestId>.*?)\r?\n").Groups["requestId"].Value;
                    Console.WriteLine("- [" + requestId + "]:\nbyte array size: " + data.Length);
                    if (!dataBuffers.ContainsKey(requestId))
                        dataBuffers[requestId] = new List<byte>();
                    if (data[0] == 0x00 && data[1] == 0x67 && data[2] == 0x58)
                    {
                        // Last (empty) audio fragment, marks the end of the audio stream.
                    }
                    else
                    {
                        var index = message.IndexOf(AudioDelimeter) + AudioDelimeter.Length;
                        dataBuffers[requestId].AddRange(data.Skip(index));
                        Console.WriteLine("buffer size: " + dataBuffers[requestId].Count);
                    }
                }
            };

            webSocket.Connect();
            Console.WriteLine("--- speech.config ---\n" + speechconfig);
            webSocket.Send(speechconfig);
            Console.WriteLine("--- speech.context ---\n" + speechcontext);
            webSocket.Send(speechcontext);
            Console.WriteLine("--- ssml ---\n" + ssmltext);
            webSocket.Send(ssmltext);

            // Busy-wait until the server closes the connection after turn.end.
            while (webSocket.IsAlive) { }
            Console.WriteLine("Received audio bytes: " + dataBuffers[sendRequestId].Count);
            Console.ReadKey(true);
        }
    }
}
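
The program above only prints how many audio bytes arrived. To actually use the audio, the buffer can be flushed to disk once the socket has closed; a minimal follow-up to place after the while loop in Main (the file name output.mp3 is arbitrary and should match the chosen audioOutputFormat):

// Save the collected audio bytes to a file; System.IO is already imported above.
if (dataBuffers.ContainsKey(sendRequestId))
{
    File.WriteAllBytes("output.mp3", dataBuffers[sendRequestId].ToArray());
    Console.WriteLine("Audio written to output.mp3");
}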

5. Conclusion

The output format can be customized, but whenever the connection fails a fresh token has to be fetched from the official demo page first.
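
One way to handle that, sketched under the assumption that the GetToken() method from section 4 is available (the helper name ConnectWithFreshToken and the retry count are made up):

// Hypothetical helper: scrape a fresh token and retry the wss connection a few times.
static WebSocket ConnectWithFreshToken(int maxRetries = 3)
{
    for (var attempt = 0; attempt < maxRetries; attempt++)
    {
        var localres = GetToken();   // re-scrape token and region from the demo page
        var url = $"wss://{localres["region"]}.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization="
                  + HttpUtility.UrlPathEncode("bearer " + localres["token"]);
        var ws = new WebSocket(url);
        ws.SslConfiguration.EnabledSslProtocols = SslProtocols.Tls12;
        ws.Connect();
        if (ws.IsAlive)
            return ws;               // connected before the token expired
    }
    return null;                     // still failing after maxRetries attempts
}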


Reprinted from blog.csdn.net/qq_41755979/article/details/125729919