Unity in Practice (7): iFlytek (科大讯飞) TTS Offline Speech Synthesis, Playing PCM-format WAV in Unity Without Third-Party Libraries

  • Preparation

1. Register an iFlytek developer account.
2. Create an application and get the APPID.
3. Download the offline SDK (pick the package for whichever platform you are on).
4. Unpack the archive.
5. From it we need bin\msc.dll (bin\msc_x64.dll for 64-bit) and the .jet files in the bin\msc\res\tts directory.
6. Create a Unity project and copy the plugin and the tts files into it, as sketched below.
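For reference, one project layout that works (the folder names other than StreamingAssets are our own choice, not required by the SDK):

Assets/
  Plugins/
    msc_x64.dll          // native iFlytek library (an x86 build would use msc.dll)
  StreamingAssets/
    tts/
      common.jet         // shared offline-synthesis resources
      xiaoyan.jet        // voice resource for speaker "xiaoyan"
      xiaofeng.jet       // voice resource for speaker "xiaofeng"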





  • Understanding the msc.dll functions

/// <summary>
/// Class that imports the TTS functions from msc.dll
/// </summary>
private class TTSDll
{
    // Log in: user name, password, configuration string (appid etc.)
    [DllImport("msc_x64", CallingConvention = CallingConvention.StdCall)]
    public static extern int MSPLogin(string usr, string pwd, string parameters);

    // Log out
    [DllImport("msc_x64", CallingConvention = CallingConvention.Winapi)]
    public static extern int MSPLogout();

    // Start a TTS session and get the session ID
    [DllImport("msc_x64", CallingConvention = CallingConvention.Winapi)]
    public static extern IntPtr QTTSSessionBegin(string _params, ref int errorCode);

    // Submit the text to be synthesized
    [DllImport("msc_x64", CallingConvention = CallingConvention.Winapi)]
    public static extern int QTTSTextPut(string sessionID, string textString, uint textLen, string _params);

    // Fetch synthesized audio; watch the SynthStatus value
    [DllImport("msc_x64", CallingConvention = CallingConvention.Winapi)]
    public static extern IntPtr QTTSAudioGet(string sessionID, ref int audioLen, ref SynthStatus synthStatus, ref int errorCode);

    // End the session
    [DllImport("msc_x64", CallingConvention = CallingConvention.Winapi)]
    public static extern int QTTSSessionEnd(string sessionID, string hints);

    // Not used here
    [DllImport("msc_x64", CallingConvention = CallingConvention.Winapi)]
    public static extern int QTTSGetParam(string sessionID, string paramName, string paramValue, ref uint valueLen);
}
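Before any session can start, MSPLogin must succeed, and every login should be paired with a logout. Here is a minimal sketch of that lifecycle (the MonoBehaviour wrapper and the appid placeholder are ours, not from the post; only MSPLogin/MSPLogout come from the SDK, and TTSDll is assumed accessible here):

using UnityEngine;

public class TTSLifecycle : MonoBehaviour
{
    void Awake()
    {
        // User name and password may be null; the appid must be the one
        // bound to the downloaded msc library.
        int ret = TTSDll.MSPLogin(null, null, "appid = XXXXXXXX, work_dir = .");
        if (ret != 0) Debug.LogError("MSPLogin failed, error code: " + ret);
    }

    void OnDestroy()
    {
        TTSDll.MSPLogout(); // pair every successful login with a logout
    }
}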

Flowchart (figure omitted)

  • The difficulties

  1. How do you know which speakers are available?
    Look at the .jet files in the tts folder: xiaofeng and xiaoyan.
  2. SynthStatus reports the synthesis state
      public enum SynthStatus
      {
          // there is still audio data to fetch
          TTS_FLAG_STILL_HAVE_DATA = 1,
          // synthesis has finished
          TTS_FLAG_DATA_END,
          // synthesis was canceled
          TTS_FLAG_CMD_CANCELED
      }

  3. The first parameter of QTTSSessionBegin (an expanded example follows this list)

    string szParams = $"engine_type = local, voice_name = { m_Speeker }, text_encoding = UTF8, tts_res_path = { TTSPath(m_Speeker) }, sample_rate = 16000, speed = { m_Speed }, volume = { m_Vol }, pitch = 50, rdn = 2";

    Offline Speech Synthesis Windows SDK documentation | iFlytek Open Platform Documentation Center: https://www.xfyun.cn/doc/tts/offline_tts/Windows-SDK.html#_2-3-%E5%8F%82%E6%95%B0%E4%B8%8E%E8%AF%B4%E6%98%8E

  4. The SDK is supposed to produce wav audio by default, yet the streamed bytes saved straight to a .wav file will not play. What do we do about that?

  5. How do we play the resulting stream through an AudioSource in Unity?
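For concreteness, here is roughly what the interpolated szParams string from point 3 expands to at runtime (the absolute path below is illustrative; yours comes from Application.streamingAssetsPath):

engine_type = local, voice_name = xiaoyan, text_encoding = UTF8, tts_res_path = fo|C:\MyProject\Assets\StreamingAssets\tts\xiaoyan.jet;fo|C:\MyProject\Assets\StreamingAssets\tts\common.jet, sample_rate = 16000, speed = 50, volume = 100, pitch = 50, rdn = 2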

  • Generate correct WAV files

Let's look at the SDK sample first.

/*
* Text To Speech (TTS) technology automatically converts arbitrary text into
* continuous natural speech in real time. It is an efficient, convenient way
* to deliver spoken information to anyone, anytime, anywhere, well suited to
* the information age's demand for massive, dynamic, personalized data.
*/

#include <stdlib.h>
#include <stdio.h>
#include <windows.h>
#include <conio.h>
#include <errno.h>

#include "qtts.h"
#include "msp_cmn.h"
#include "msp_errors.h"

#ifdef _WIN64
#pragma comment(lib,"../../libs/msc_x64.lib")//x64
#else
#pragma comment(lib,"../../libs/msc.lib")//x86
#endif

/* WAV audio header layout */
typedef struct _wave_pcm_hdr
{
	char            riff[4];                // = "RIFF"
	int				size_8;                 // = FileSize - 8
	char            wave[4];                // = "WAVE"
	char            fmt[4];                 // = "fmt "
	int				fmt_size;				// = size of the sub-structure below: 16

	short int       format_tag;             // = PCM: 1
	short int       channels;               // = number of channels: 1
	int				samples_per_sec;        // = sample rate: 8000 | 6000 | 11025 | 16000
	int				avg_bytes_per_sec;      // = bytes per second: samples_per_sec * bits_per_sample / 8
	short int       block_align;            // = bytes per sample frame: wBitsPerSample / 8
	short int       bits_per_sample;        // = bit depth: 8 | 16

	char            data[4];                // = "data";
	int				data_size;              // = raw data length: FileSize - 44 
} wave_pcm_hdr;

/* Default WAV header values */
wave_pcm_hdr default_wav_hdr = 
{
	{ 'R', 'I', 'F', 'F' },
	0,
	{'W', 'A', 'V', 'E'},
	{'f', 'm', 't', ' '},
	16,
	1,
	1,
	16000,
	32000,
	2,
	16,
	{'d', 'a', 't', 'a'},
	0  
};
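/* Sanity check on the defaults above:
   avg_bytes_per_sec = 16000 samples/s * 1 channel * 16 bits / 8 = 32000 bytes/s
   block_align       = 1 channel * 16 bits / 8 = 2 bytes per sample frame       */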
/* Text synthesis */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
	int          ret          = -1;
	FILE*        fp           = NULL;
	const char*  sessionID    = NULL;
	unsigned int audio_len    = 0;
	wave_pcm_hdr wav_hdr      = default_wav_hdr;
	int          synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;

	if (NULL == src_text || NULL == des_path)
	{
		printf("params is error!\n");
		return ret;
	}
	fp = fopen(des_path, "wb");
	if (NULL == fp)
	{
		printf("open %s error.\n", des_path);
		return ret;
	}
	/* begin synthesis */
	sessionID = QTTSSessionBegin(params, &ret);
	if (MSP_SUCCESS != ret)
	{
		printf("QTTSSessionBegin failed, error code: %d.\n", ret);
		fclose(fp);
		return ret;
	}
	ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
	if (MSP_SUCCESS != ret)
	{
		printf("QTTSTextPut failed, error code: %d.\n",ret);
		QTTSSessionEnd(sessionID, "TextPutError");
		fclose(fp);
		return ret;
	}
	printf("Synthesizing ...\n");
	fwrite(&wav_hdr, sizeof(wav_hdr) ,1, fp); //write the WAV header (sample rate 16000)
	while (1) 
	{
		/* fetch the synthesized audio */
		const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
		if (MSP_SUCCESS != ret)
			break;
		if (NULL != data)
		{
			fwrite(data, audio_len, 1, fp);
		    wav_hdr.data_size += audio_len; //accumulate data_size
		}
		if (MSP_TTS_FLAG_DATA_END == synth_status)
			break;
	}
	printf("\n");
	if (MSP_SUCCESS != ret)
	{
		printf("QTTSAudioGet failed, error code: %d.\n",ret);
		QTTSSessionEnd(sessionID, "AudioGetError");
		fclose(fp);
		return ret;
	}
	/* fix up the size fields in the WAV header */
	wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
	
	/* write the corrected sizes back into the header; the file is now a valid WAV */
	fseek(fp, 4, 0);
	fwrite(&wav_hdr.size_8,sizeof(wav_hdr.size_8), 1, fp); //write the size_8 value
	fseek(fp, 40, 0); //seek to where the data_size value is stored
	fwrite(&wav_hdr.data_size,sizeof(wav_hdr.data_size), 1, fp); //write the data_size value
	fclose(fp);
	fp = NULL;
	/* synthesis complete */
	ret = QTTSSessionEnd(sessionID, "Normal");
	if (MSP_SUCCESS != ret)
	{
		printf("QTTSSessionEnd failed, error code: %d.\n",ret);
	}

	return ret;
}

int main(int argc, char* argv[])
{
	int         ret                  = MSP_SUCCESS;
	const char* login_params         = "appid = XXXXXXXX, work_dir = .";//login params; the appid is bound to the msc library, do not change it arbitrarily
	/*
	* rdn:           how digits in the text are read out
	* volume:        volume of the synthesized audio
	* pitch:         pitch of the synthesized audio
	* speed:         speaking rate of the synthesized audio
	* voice_name:    speaker
	* sample_rate:   sample rate of the synthesized audio
	* text_encoding: encoding of the input text
	*
	*/
	const char* session_begin_params = "engine_type = local, voice_name = xiaoyan, text_encoding = GB2312, tts_res_path = fo|res\\tts\\xiaoyan.jet;fo|res\\tts\\common.jet, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
	const char* filename             = "tts_sample.wav"; //name of the output audio file
	const char* text                 = "科大讯飞作为智能语音技术提供商,在智能语音技术领域有着长期的研究积累,并在中文语音合成、语音识别、口语评测等多项技术上拥有技术成果。科大讯飞是我国以语音技术为产业化方向的国家863计划产业化基地"; //Chinese demo text to synthesize
	/* user login */
	ret = MSPLogin(NULL, NULL, login_params); //params: user name, password, login params; an account can be registered at http://www.xfyun.cn
	if (MSP_SUCCESS != ret)
	{
		printf("MSPLogin failed, error code: %d.\n", ret);
		goto exit; //login failed, exit
	}

	printf("\n###########################################################################\n");
	printf("## Text To Speech (TTS) automatically converts arbitrary text into       ##\n");
	printf("## continuous natural speech in real time, a convenient way to deliver   ##\n");
	printf("## spoken information to anyone, anytime, anywhere.                      ##\n");
	printf("###########################################################################\n\n");

	/* text synthesis */
	printf("Starting synthesis ...\n");
	ret = text_to_speech(text, filename, session_begin_params);
	if (MSP_SUCCESS != ret)
	{
		printf("text_to_speech failed, error code: %d.\n", ret);
	}
	printf("Synthesis finished.\n");

exit:
	printf("Press any key to exit ...\n");
	_getch();
	MSPLogout(); //log out

	return 0;
}

After reading the sample, the penny drops: the SDK hands back raw audio data, and the sample itself has to prepend a WAV header before the file will play.

So let's write the equivalent WAV header in C#.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
using UnityEngine;

/// <summary>
/// WAV file header
/// </summary>
public struct WAVE_Header
{
    public int RIFF_ID;           //4 bytes, 'RIFF'
    public int File_Size;         //4 bytes, file length - 8
    public int RIFF_Type;         //4 bytes, 'WAVE'

    public int FMT_ID;            //4 bytes, 'fmt '
    public int FMT_Size;          //4 bytes, 16, or 18 if extra info is appended
    public short FMT_Tag;         //2 bytes, encoding, normally 0x0001 (PCM)
    public ushort FMT_Channel;    //2 bytes, channel count: 1 = mono, 2 = stereo
    public int FMT_SamplesPerSec; //4 bytes, sample rate
    public int AvgBytesPerSec;    //4 bytes, bytes of audio data per second
    public ushort BlockAlign;     //2 bytes, block alignment (bytes per sample frame)
    public ushort BitsPerSample;  //2 bytes, bits per sample

    public int DATA_ID;           //4 bytes, 'data'
    public int DATA_Size;         //4 bytes, length of the raw audio data
}

public class ToWave
{
    /// <summary>
    /// Build a WAV header for a data segment of the given length
    /// </summary>
    /// <param name="data_len">length of the audio data in bytes</param>
    /// <returns>the populated WAV header struct</returns>
    public static WAVE_Header getWave_Header(int data_len)
    {
        WAVE_Header wav_Header = new WAVE_Header();
        wav_Header.RIFF_ID = 0x46464952;        //the characters "RIFF"
        wav_Header.File_Size = data_len + 36;
        wav_Header.RIFF_Type = 0x45564157;      //the characters "WAVE"

        wav_Header.FMT_ID = 0x20746D66;         //the characters "fmt "
        wav_Header.FMT_Size = 16;
        wav_Header.FMT_Tag = 0x0001;            //PCM
        wav_Header.FMT_Channel = 1;             //mono
        wav_Header.FMT_SamplesPerSec = 16000;   //sample rate
        wav_Header.AvgBytesPerSec = 32000;      //bytes per second
        wav_Header.BlockAlign = 2;              //2 bytes per sample frame
        wav_Header.BitsPerSample = 16;          //16 bits per sample

        wav_Header.DATA_ID = 0x61746164;        //the characters "data"
        wav_Header.DATA_Size = data_len;

        return wav_Header;
    }

    /// <summary>
    /// Serialize a struct into a byte array
    /// </summary>
    /// <param name="structure">the struct to convert</param>
    /// <returns>the resulting byte sequence</returns>
    public static byte[] StructToBytes(object structure)
    {
        int size = Marshal.SizeOf(structure);
        IntPtr buffer = Marshal.AllocHGlobal(size);
        try
        {
            Marshal.StructureToPtr(structure, buffer, false);
            byte[] bytes = new byte[size];
            Marshal.Copy(buffer, bytes, 0, size);
            return bytes;
        }
        finally
        {
            Marshal.FreeHGlobal(buffer);
        }
    }

}
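A quick sanity check of the helpers (a sketch of ours; 32000 bytes stands for one second of 16 kHz, 16-bit mono PCM):

// One second of 16 kHz, 16-bit mono PCM is 32000 bytes.
WAVE_Header header = ToWave.getWave_Header(32000);
byte[] headerBytes = ToWave.StructToBytes(header);
UnityEngine.Debug.Log(headerBytes.Length); // 44: the standard PCM WAV header size
UnityEngine.Debug.Log(header.File_Size);   // 32036, i.e. total file size - 8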
  • Putting the pieces together: C# offline speech synthesis

    public class XunFeiTemplate
    {
        private string m_SessionID;

        private string m_Speed;

        private string m_Vol;

        private string m_Speeker;

        private Dictionary<Speeker, string> DSpeeker = new Dictionary<Speeker, string>();

        /// <summary>
        /// Constructor: initializes the engine
        /// </summary>
        /// <param name="name">user name (may be null)</param>
        /// <param name="password">password (may be null)</param>
        /// <param name="configs">engine initialization parameters</param>
        public XunFeiTemplate(string name, string password, string configs)
        {
            DSpeeker.Add(Speeker.XiaoYan_Woman_Mandarin, "xiaoyan");
            DSpeeker.Add(Speeker.XiaoFeng_Man_Mandarin, "xiaofeng");

            int ret = TTSDll.MSPLogin(name, password, configs);
            if (ret != 0) throw new Exception("Failed to initialize the TTS engine, error code: " + ret);

            m_Speed = "50";
            m_Vol = "100";
            SetSpeaker(Speeker.XiaoYan_Woman_Mandarin);

            UnityEngine.Debug.Log(TTSPath(m_Speeker));
        }

        public void SetSpeaker(Speeker speeker)
        {
            if (DSpeeker.ContainsKey(speeker))
                m_Speeker = DSpeeker[speeker];
        }

        public void CloseXunFei()
        {
            int ret = TTSDll.MSPLogout();
            if (ret != 0) throw new Exception("Failed to uninitialize the TTS engine, error code: " + ret);
        }

        public string TTSPath(string speeker)
        {
            //e.g. fo|C:\\Users\\Administrator\\Desktop\\tts\\xiaofeng.jet;fo|C:\\Users\\Administrator\\Desktop\\tts\\common.jet
            string path = UnityEngine.Application.streamingAssetsPath;
            path = path.Replace("/", "\\");
            string combine = "fo|{0}\\tts\\{1}.jet;fo|{2}\\tts\\common.jet";

            return string.Format(combine, path, speeker, path);
        }

        /// <summary>
        /// Convert text to speech, writing the audio into the given memory stream
        /// </summary>
        /// <param name="speekText">the text to synthesize</param>
        /// <param name="mStream">the stream that receives the synthesized audio</param>
        private void Speek(string speekText, ref MemoryStream mStream)
        {
            if (speekText == "" || m_Speed == "" || m_Vol == "" || m_Speeker == "") return;

            string szParams = $"engine_type = local, voice_name = { m_Speeker }, text_encoding = UTF8, tts_res_path = { TTSPath(m_Speeker) }, sample_rate = 16000, speed = { m_Speed }, volume = { m_Vol }, pitch = 50, rdn = 2";

            int ret = 0;
            try
            {
                m_SessionID = Marshal.PtrToStringAnsi(TTSDll.QTTSSessionBegin(szParams, ref ret));
                if (ret != 0)
                {
                    UnityEngine.Debug.Log(ret);
                    throw new Exception("Failed to start the TTS session, error code: " + ret);
                }
                ret = TTSDll.QTTSTextPut(m_SessionID, speekText, (uint)Encoding.Default.GetByteCount(speekText), null);
                if (ret != 0)
                {
                    UnityEngine.Debug.Log(ret);
                    throw new Exception("Failed to submit text, error code: " + ret);
                }

                SynthStatus synth_status = SynthStatus.TTS_FLAG_STILL_HAVE_DATA;
                while (true)
                {
                    int audio_len = 0;
                    IntPtr source = TTSDll.QTTSAudioGet(m_SessionID, ref audio_len, ref synth_status, ref ret);
                    byte[] array = new byte[audio_len];
                    if (audio_len > 0)
                    {
                        Marshal.Copy(source, array, 0, audio_len);
                    }
                    //append the synthesized audio bytes to the memory stream
                    mStream.Write(array, 0, audio_len);
                    //Thread.Sleep(15); //avoid hammering the CPU
                    if (synth_status == SynthStatus.TTS_FLAG_DATA_END || ret != 0)
                        break;
                }
            }
            catch (Exception ex)
            {
                UnityEngine.Debug.Log(ex.Message);
            }
            finally
            {
                ret = TTSDll.QTTSSessionEnd(m_SessionID, "");
                if (ret != 0) throw new Exception("Failed to end the TTS session, error code: " + ret);
            }
        }

        public MemoryStream SpeechSynthesis(string SpeekText)
        {
            MemoryStream mStream = new MemoryStream(1024 * 8);

            //reserve 44 bytes for the WAV header
            mStream.Write(new byte[44], 0, 44);

            Speek(SpeekText, ref mStream);

            //build the WAV header from the actual data length
            WAVE_Header header = ToWave.getWave_Header((int)mStream.Length - 44);
            //serialize the header struct to bytes
            byte[] headerByte = ToWave.StructToBytes(header);
            //seek back to the start of the stream
            mStream.Position = 0;
            //write the header
            mStream.Write(headerByte, 0, headerByte.Length);
            return mStream;
        }


    }
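Note that the class above also needs using directives for System, System.Collections.Generic, System.IO, System.Runtime.InteropServices and System.Text, and it references a Speeker enum the post never shows. A minimal definition consistent with the usage would be (hypothetical; any member names work as long as the dictionary maps them to the speaker strings used by the .jet resources):

public enum Speeker
{
    XiaoYan_Woman_Mandarin,
    XiaoFeng_Man_Mandarin
}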

The resulting MemoryStream can then be saved as a normal wav file (note we go through SpeechSynthesis, which prepends the header, not through Speek alone):

MemoryStream mStream = m_XunFeiTemplate.SpeechSynthesis(SpeekText);

using (FileStream wavFile = new FileStream("music/test.wav", FileMode.Create))
{
    mStream.WriteTo(wavFile);
}

PS: In TTSDll.MSPLogin(name, password, configs), the configs string is the same login parameter string shown in the SDK sample program.

The username and password can both be passed as null.

TTSPath builds an absolute path: the SDK sample gets away with a relative path, but that does not work from inside Unity, so we use an absolute path based on Application.streamingAssetsPath.
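Putting those notes together, constructing the template might look like this (replace the appid placeholder with your own):

// user name and password are null; configs matches the SDK sample's login_params
XunFeiTemplate m_XunFeiTemplate = new XunFeiTemplate(null, null, "appid = XXXXXXXX, work_dir = .");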


  • Playing the stream in Unity

Of course, you could save the audio as a wav file and have AudioSource play it via dynamic loading.

But since we already have the audio as a stream, why not feed it to the AudioSource directly?

AudioClip exposes a low-level SetData method, which proves the idea is feasible: we only need to convert the binary stream into the sample data an AudioClip holds.

The WAV format layout itself is easy to look up online.

public bool SetData(float[] data, int offsetSamples);
Sets the sample data of the clip.
The samples are floats in the range -1.0 to 1.0 (going beyond that causes artifacts or undefined behavior), and the sample count is determined by the length of the float array. Use offsetSamples to start writing at an arbitrary position in the clip; if the data written from that offset runs past the end of the clip, the write wraps around and continues from the start of the clip.

PS: Only uncompressed PCM wav files are supported.
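As a quick illustration of SetData itself (unrelated to TTS, just the API): a sketch that generates one second of a 440 Hz tone and plays it.

using UnityEngine;

public class SetDataDemo : MonoBehaviour
{
    void Start()
    {
        const int sampleRate = 16000;
        float[] samples = new float[sampleRate]; // one second of mono samples in -1.0..1.0
        for (int i = 0; i < samples.Length; i++)
            samples[i] = Mathf.Sin(2f * Mathf.PI * 440f * i / sampleRate);

        AudioClip clip = AudioClip.Create("tone", samples.Length, 1, sampleRate, false);
        clip.SetData(samples, 0);

        AudioSource source = gameObject.AddComponent<AudioSource>();
        source.clip = clip;
        source.Play();
    }
}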

public static AudioClip Clip(byte[] fileBytes, int offsetSamples = 0, string name = "ifly")
{
    //string riff = Encoding.ASCII.GetString (fileBytes, 0, 4);
    //string wave = Encoding.ASCII.GetString (fileBytes, 8, 4);
    int subchunk1 = BitConverter.ToInt32(fileBytes, 16);
    ushort audioFormat = BitConverter.ToUInt16(fileBytes, 20);

    // NB: Only uncompressed PCM wav files are supported.
    string formatCode = FormatCode(audioFormat);
    //Debug.AssertFormat(audioFormat == 1 || audioFormat == 65534, "Detected format code '{0}' {1}, but only PCM and WaveFormatExtensable uncompressed formats are currently supported.", audioFormat, formatCode);

    ushort channels = BitConverter.ToUInt16(fileBytes, 22);
    int sampleRate = BitConverter.ToInt32(fileBytes, 24);
    //int byteRate = BitConverter.ToInt32 (fileBytes, 28);
    //UInt16 blockAlign = BitConverter.ToUInt16 (fileBytes, 32);
    ushort bitDepth = BitConverter.ToUInt16(fileBytes, 34);

    int headerOffset = 16 + 4 + subchunk1 + 4; // RIFF header (12) + fmt chunk header (8) + fmt body + "data" tag (4): offset of the data-chunk size field (assumes the data chunk directly follows fmt)
    int subchunk2 = BitConverter.ToInt32(fileBytes, headerOffset);
    //Debug.LogFormat ("riff={0} wave={1} subchunk1={2} format={3} channels={4} sampleRate={5} byteRate={6} blockAlign={7} bitDepth={8} headerOffset={9} subchunk2={10} filesize={11}", riff, wave, subchunk1, formatCode, channels, sampleRate, byteRate, blockAlign, bitDepth, headerOffset, subchunk2, fileBytes.Length);

    //Log.Info(bitDepth);

    float[] data;
    switch (bitDepth)
    {
        case 8:
            data = Convert8BitByteArrayToAudioClipData(fileBytes, headerOffset, subchunk2);
            break;
        case 16:
            data = Convert16BitByteArrayToAudioClipData(fileBytes, headerOffset, subchunk2);
            break;
        case 24:
            data = Convert24BitByteArrayToAudioClipData(fileBytes, headerOffset, subchunk2);
            break;
        case 32:
            data = Convert32BitByteArrayToAudioClipData(fileBytes, headerOffset, subchunk2);
            break;
        default:
            throw new Exception(bitDepth + " bit depth is not supported.");
    }

    AudioClip audioClip = AudioClip.Create(name, data.Length, channels, sampleRate, false);
    audioClip.SetData(data, 0);
    return audioClip;
}

private static string FormatCode(UInt16 code)
{
    switch (code)
    {
        case 1:
            return "PCM";
        case 2:
            return "ADPCM";
        case 3:
            return "IEEE";
        case 7:
            return "μ-law";
        case 65534:
            return "WaveFormatExtensable";
        default:
            Debug.LogWarning("Unknown wav code format:" + code);
            return "";
    }
}

#region wav file bytes to Unity AudioClip conversion methods

private static float[] Convert8BitByteArrayToAudioClipData(byte[] source, int headerOffset, int dataSize)
{
    int wavSize = BitConverter.ToInt32(source, headerOffset);
    headerOffset += sizeof(int);
    Debug.AssertFormat(wavSize > 0 && wavSize == dataSize, "Failed to get valid 8-bit wav size: {0} from data bytes: {1} at offset: {2}", wavSize, dataSize, headerOffset);

    float[] data = new float[wavSize];

    int i = 0;
    while (i < wavSize)
    {
        // 8-bit WAV samples are unsigned bytes (0..255) centered on 128;
        // read from the data region, past the header
        data[i] = (source[i + headerOffset] - 128) / 128f;
        ++i;
    }

    return data;
}

private static float[] Convert16BitByteArrayToAudioClipData(byte[] source, int headerOffset, int dataSize)
{
    int wavSize = BitConverter.ToInt32(source, headerOffset);
    headerOffset += sizeof(int);
    Debug.AssertFormat(wavSize > 0 && wavSize == dataSize, "Failed to get valid 16-bit wav size: {0} from data bytes: {1} at offset: {2}", wavSize, dataSize, headerOffset);

    int x = sizeof(Int16); // block size = 2
    int convertedSize = wavSize / x;

    float[] data = new float[convertedSize];

    Int16 maxValue = Int16.MaxValue;

    int offset = 0;
    int i = 0;
    while (i < convertedSize)
    {
        offset = i * x + headerOffset;
        data[i] = (float)BitConverter.ToInt16(source, offset) / maxValue;
        ++i;
    }

    Debug.AssertFormat(data.Length == convertedSize, "AudioClip .wav data is wrong size: {0} == {1}", data.Length, convertedSize);

    return data;
}

private static float[] Convert24BitByteArrayToAudioClipData(byte[] source, int headerOffset, int dataSize)
{
    int wavSize = BitConverter.ToInt32(source, headerOffset);
    headerOffset += sizeof(int);
    Debug.AssertFormat(wavSize > 0 && wavSize == dataSize, "Failed to get valid 24-bit wav size: {0} from data bytes: {1} at offset: {2}", wavSize, dataSize, headerOffset);

    int x = 3; // block size = 3
    int convertedSize = wavSize / x;

    int maxValue = Int32.MaxValue;

    float[] data = new float[convertedSize];

    byte[] block = new byte[sizeof(int)]; // using a 4 byte block for copying 3 bytes, then copy bytes with 1 offset

    int offset = 0;
    int i = 0;
    while (i < convertedSize)
    {
        offset = i * x + headerOffset;
        Buffer.BlockCopy(source, offset, block, 1, x);
        data[i] = (float)BitConverter.ToInt32(block, 0) / maxValue;
        ++i;
    }

    Debug.AssertFormat(data.Length == convertedSize, "AudioClip .wav data is wrong size: {0} == {1}", data.Length, convertedSize);

    return data;
}

private static float[] Convert32BitByteArrayToAudioClipData(byte[] source, int headerOffset, int dataSize)
{
    int wavSize = BitConverter.ToInt32(source, headerOffset);
    headerOffset += sizeof(int);
    Debug.AssertFormat(wavSize > 0 && wavSize == dataSize, "Failed to get valid 32-bit wav size: {0} from data bytes: {1} at offset: {2}", wavSize, dataSize, headerOffset);

    int x = sizeof(float); //  block size = 4
    int convertedSize = wavSize / x;

    Int32 maxValue = Int32.MaxValue;

    float[] data = new float[convertedSize];

    int offset = 0;
    int i = 0;
    while (i < convertedSize)
    {
        offset = i * x + headerOffset;
        data[i] = (float)BitConverter.ToInt32(source, offset) / maxValue;
        ++i;
    }

    Debug.AssertFormat(data.Length == convertedSize, "AudioClip .wav data is wrong size: {0} == {1}", data.Length, convertedSize);

    return data;
}

#endregion

Final playback code:

 m_XunFeiTemplate.SetSpeaker(Speeker.XiaoYan_Woman_Mandarin);
 MemoryStream memoryStream = m_XunFeiTemplate.SpeechSynthesis(text.text);

 byte[] datas = memoryStream.ToArray();
 m_AudioSource.clip = ToAudio.Clip(datas);
 m_AudioSource.loop = false;
 m_AudioSource.volume = 1;
 m_AudioSource.Play();
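
For completeness, here is a sketch of how the pieces could be wired into one MonoBehaviour (the class and field names are ours; ToAudio is assumed to be the class holding the Clip method above):

using System.IO;
using UnityEngine;
using UnityEngine.UI;

public class TTSPlayer : MonoBehaviour
{
    public AudioSource m_AudioSource;
    public Text text; // UI text whose contents we read aloud

    private XunFeiTemplate m_XunFeiTemplate;

    void Start()
    {
        // appid placeholder: use the one bound to your msc library
        m_XunFeiTemplate = new XunFeiTemplate(null, null, "appid = XXXXXXXX, work_dir = .");
    }

    public void Speak()
    {
        m_XunFeiTemplate.SetSpeaker(Speeker.XiaoYan_Woman_Mandarin);
        MemoryStream memoryStream = m_XunFeiTemplate.SpeechSynthesis(text.text);

        m_AudioSource.clip = ToAudio.Clip(memoryStream.ToArray());
        m_AudioSource.loop = false;
        m_AudioSource.volume = 1;
        m_AudioSource.Play();
    }

    void OnDestroy()
    {
        m_XunFeiTemplate.CloseXunFei(); // pairs the logout with the login in the constructor
    }
}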

  If you found this useful, a follow is appreciated.

 o(* ̄▽ ̄*)b

Origin blog.csdn.net/flj135792468/article/details/121416597