微软TTS 使用

Win7环境下测试。

首先安装 Windows Speech SDK，下载地址为：http://www.microsoft.com/download/en/details.aspx?id=10121 。其中 SpeechSDK51.exe 是简体中文语音引擎，SpeechSDK51LangPack.exe 是中文男声语音库。

微软自带语音库的中文发音比较生硬，可以另外安装 NeoSpeech 的中文语音库，在百度上直接搜索“NeoSpeech 中文语音”即可找到。

下面的代码整理自网络并封装成了一个类，基于 Qt 5.5，在简单环境下可以满足需求，直接上代码。

头文件

<pre name="code" class="cpp">#include <QObject>
#include <QList>
#include <sapi.h>

class ISpVoice;
class ISpObjectToken;
class ISpAudio;
class SpFormat;
/**
 * QObject wrapper around the Microsoft SAPI text-to-speech voice.
 *
 * The constructor initialises COM, creates the SAPI voice and the default
 * audio output object, enumerates the installed voices and applies the
 * voice / audio-format indices read from the application configuration.
 * If any mandatory step fails the instance is marked unusable and speak()
 * becomes a no-op.
 */
class SimpleTTS : public QObject
{
	Q_OBJECT

public:
	// explicit: a QObject* must never convert silently into a SimpleTTS.
	// The parent defaults to null so the object can be created standalone.
	explicit SimpleTTS(QObject *parent = nullptr);
	~SimpleTTS();


	// Queues `text` for speech output on the SAPI voice.
	void speak(QString text);
	
	// Returns the audio format of the voice's current output stream.
	SPSTREAMFORMAT currFormat();
private:
	void initVoices();				// collect the installed voice tokens into m_voices
	void initSpFormat();				// fill m_spFmts with the selectable audio formats

private:

	bool m_bTTSEnable;				// true when the TTS engine is usable
	ISpVoice* m_pVoice;			    // SAPI COM voice object
	ISpAudio* m_pAudio;				// audio output object used by the voice
	QList<ISpObjectToken*> m_voices;	// tokens of the available voices

	QList<SpFormat> m_spFmts;       // audio formats selectable via the configured audio index
};



/**
 * Value type pairing a SAPI stream-format enumerator with its textual
 * label and a byte rate derived from that label by the constructor.
 */
class SpFormat
{
public:
	// vl: SAPI format enumerator; sz: its label, e.g. "SPSF_16kHz16BitMono".
	SpFormat(SPSTREAMFORMAT vl, QString sz);
	~SpFormat() {}

	// Label that was passed to the constructor.
	QString discription() const
	{
		return m_discription;
	}

	// SAPI format enumerator that was passed to the constructor.
	SPSTREAMFORMAT getFormat() const
	{
		return m_val;
	}

	// Byte rate computed by the constructor from the label.
	DWORD rate() const
	{
		return m_bytePS;
	}

private:
	SPSTREAMFORMAT m_val;       // SAPI format enumerator
	QString m_discription;      // textual label of the format
	DWORD m_bytePS;             // bytes per second
};

cpp

#include "simpletts.h"
#include <conio.h>
#include <sphelper.h>
#include <vector>
#include <queue>
#include <string>
#include <QDebug>
#include "jiontctrllmgr.h"
#include "simplelog.h"
#include "jiontctrllmgr.h"

#include <sphelper.h>
#include <spuihelp.h>

#pragma comment(lib,"sapi.lib")    //sapi.lib在SDK的lib目录,必需正确配置

SimpleTTS::SimpleTTS(QObject *parent)
	: QObject(parent), m_pVoice(NULL), m_pAudio(NULL)
{
	m_bTTSEnable = true;
	TCHAR szBuf[80] = { 0 };
	LPVOID lpMsgBuf = NULL;
	HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);

	if (FAILED(hr))
	{
		hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
		if (FAILED(hr))
		{
			FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |FORMAT_MESSAGE_FROM_SYSTEM,NULL,hr,MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),(LPTSTR)&lpMsgBuf,0, NULL);
			wsprintf(szBuf, _T("error %d: %s"), hr, lpMsgBuf);
			LocalFree(lpMsgBuf);
#ifdef QT_DEBUG
			qDebug().noquote() << "Error to intiliaze COM reason:" + QString::fromStdWString(szBuf);
#else
			LOGERROR("Error to intiliaze COM reason:"+ QString::fromStdWString(szBuf));
#endif // QT_DEBUG
			return;
		}
	}

	hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&m_pVoice);

	if (FAILED(hr))
	{
		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, hr, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&lpMsgBuf, 0, NULL);
		wsprintf(szBuf, _T("error %d: %s"), hr, lpMsgBuf);
		LocalFree(lpMsgBuf);
#ifdef QT_DEBUG
		qDebug().noquote() << "Error to intiliaze ISPVoice component,reason: " + QString::fromStdWString(szBuf);
#else
		LOGERROR("Error to intiliaze ISPVoice component,reason: " + QString::fromStdWString(szBuf));
#endif // QT_DEBUG
		m_bTTSEnable = false;
		return;
	}

	hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOOUT, &m_pAudio);
	if (FAILED(hr))
	{
		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, hr, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&lpMsgBuf, 0, NULL);
		wsprintf(szBuf, _T("error %d: %s"), hr, lpMsgBuf);
		LocalFree(lpMsgBuf);
#ifdef QT_DEBUG
		qDebug().noquote() << "SpCreateDefaultObjectFromCategoryId failed,reason:" + QString::fromStdWString(szBuf);
#else
		LOGERROR("SpCreateDefaultObjectFromCategoryId failed,reason:" + QString::fromStdWString(szBuf));
#endif // QT_DEBUG
		m_bTTSEnable = false;
		return;
	}

	initVoices();

	WCHAR* szDesc;

	QStringList list;
	for (size_t i = 0; i < m_voices.size(); i++)
	{
		SpGetDescription(m_voices[i], &szDesc); //获取语音库描述
	
		list << QString::fromStdWString(std::wstring(szDesc));
		qDebug() << szDesc;
	}
	if (list.empty())LOGERROR(QString::fromLocal8Bit("TTS 没有可用的语音库"));
	else LOGINFO(QString::fromLocal8Bit("TTS 支持语音库：%1").arg(list.join(",")));

	int index = JiontCtrllMgr::getInstance()->config()->voiceIndex();
	if (index > 0 && index < m_voices.size())
	{
		hr = m_pVoice->SetVoice(m_voices[index]);
	}
	else
	{
#ifdef QT_DEBUG
		qDebug().noquote() <<QString::fromLocal8Bit("SetVoice index 无效，语音为正确设置！");
#else
		LOGERROR(QString::fromLocal8Bit("SetVoice index 无效，语音为正确设置！"));
#endif // QT_DEBUG
	}

	initSpFormat();
	//输出音频设置
	CSpStreamFormat Fmt;
	index = JiontCtrllMgr::getInstance()->config()->audioIndex();
	if (index >= 0 && index < m_spFmts.size())
	{
		Fmt.AssignFormat(m_spFmts[index].getFormat());
		hr = m_pAudio->SetFormat(Fmt.FormatId(), Fmt.WaveFormatExPtr());
		if (FAILED(hr))
		{
#ifdef QT_DEBUG
			qDebug().noquote() << QString::fromLocal8Bit("SetFormat TTS 初始化失败,TTS 不可用！");
#else
			LOGERROR(QString::fromLocal8Bit("SetFormat TTS 初始化失败,TTS 不可用！"));
#endif // QT_DEBUG
			m_bTTSEnable = false;
			return;

		}

		hr = m_pVoice->SetOutput(m_pAudio, FALSE);
		if (FAILED(hr))
		{
#ifdef QT_DEBUG
			qDebug().noquote() << QString::fromLocal8Bit("SetOutput TTS 初始化失败，TTS 不可用！");
#else
			LOGERROR(QString::fromLocal8Bit("SetOutput TTS 初始化失败，TTS 不可用！"));
#endif // QT_DEBUG
			m_bTTSEnable = false;
			return;
		}
	}
}

/**
 * Releases the COM objects held by this instance and uninitialises COM.
 *
 * The constructor can return early and leave m_pVoice / m_pAudio null, so
 * each pointer is checked before Release() is called on it.
 */
SimpleTTS::~SimpleTTS()
{
	if (m_pVoice != NULL)
	{
		m_pVoice->Release();
		m_pVoice = NULL;
	}
	if (m_pAudio != NULL)
	{
		m_pAudio->Release();
		m_pAudio = NULL;
	}

	// Every token stored by initVoices() holds a COM reference of its own.
	for (int i = 0; i < m_voices.size(); ++i)
	{
		if (m_voices[i] != NULL)
			m_voices[i]->Release();
	}
	m_voices.clear();

	// NOTE(review): this is also reached when CoInitializeEx failed in the
	// constructor; tracking that would need an extra member flag.
	CoUninitialize();
}



/**
 * Speaks `text` asynchronously through the SAPI voice.
 *
 * Before speaking, every decimal digit is wrapped in square brackets and the
 * configured replacement list is applied to the text.
 * Does nothing (apart from printing a notice) when TTS is unavailable.
 *
 * @param text  Text to speak; taken by value because it is modified locally.
 */
void SimpleTTS::speak(QString text)
{
	// m_pVoice is checked as well: construction may have stopped before the
	// voice object was created.
	if (!m_bTTSEnable || m_pVoice == NULL)
	{
		printf("tts library cant use!");
		return;
	}

	// Wrap each digit as "[d]".
	// NOTE(review): presumably so the engine reads digits one by one — confirm.
	for (int digit = 0; digit <= 9; ++digit)
	{
		const QString d = QString::number(digit);
		text.replace(d, "[" + d + "]");
	}

	// Apply the user-configured substitutions in list order.
	const QList<QPair<QString, QString> > &list = JiontCtrllMgr::getInstance()->config()->replaceList();
	for (int i = 0; i < list.size(); ++i)
	{
		text.replace(list[i].first, list[i].second);
	}

	// ISpVoice::Speak takes a wide string, so no TCHAR cast is needed.
	const std::wstring wide = text.toStdWString();
	const HRESULT hr = m_pVoice->Speak(wide.c_str(), SPF_ASYNC, NULL);
	if (FAILED(hr))
	{
#ifdef QT_DEBUG
		qDebug().noquote() << QString("Speak failed, error %1").arg(hr);
#else
		LOGERROR(QString("Speak failed, error %1").arg(hr));
#endif // QT_DEBUG
	}
	//m_pVoice->WaitUntilDone(INFINITE);
}



/**
 * Reports the audio format of the voice's current output stream.
 *
 * @return The matching SPSTREAMFORMAT enumerator, or SPSF_Default when the
 *         voice is not available or the stream format cannot be obtained.
 */
SPSTREAMFORMAT SimpleTTS::currFormat()
{
	SPSTREAMFORMAT fmt = SPSF_Default;

	// The voice may never have been created if construction failed.
	if (m_pVoice == NULL)
		return fmt;

	CComPtr<ISpStreamFormat> cpStream;
	HRESULT hr = m_pVoice->GetOutputStream(&cpStream);
	if (hr != S_OK)
		return fmt;

	CSpStreamFormat Fmt;
	hr = Fmt.AssignFormat(cpStream);
	if (SUCCEEDED(hr))
	{
		fmt = Fmt.ComputeFormatEnum();
	}
	return fmt;
}

void SimpleTTS::initVoices()
{
	IEnumSpObjectTokens* cpEnum;
	HRESULT hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &cpEnum);
	ULONG i = 0, ulCount = 0;
	hr = cpEnum->GetCount(&ulCount);

	ISpObjectToken* tok;
	while (SUCCEEDED(hr) && i < ulCount)
	{
		hr = cpEnum->Next(1, &tok, NULL);
		m_voices.push_back(tok);
		i++;
	}
	cpEnum->Release();
}

void SimpleTTS::initSpFormat()
{
	m_spFmts.push_back(SpFormat(SPSF_12kHz16BitStereo, "SPSF_12kHz16BitStereo"));
	m_spFmts.push_back(SpFormat(SPSF_16kHz16BitMono, "SPSF_16kHz16BitMono"));
	m_spFmts.push_back(SpFormat(SPSF_16kHz16BitStereo, "SPSF_16kHz16BitStereo"));
	m_spFmts.push_back(SpFormat(SPSF_22kHz16BitMono, "SPSF_22kHz16BitMono"));
	m_spFmts.push_back(SpFormat(SPSF_22kHz16BitStereo, "SPSF_22kHz16BitStereo"));
	m_spFmts.push_back(SpFormat(SPSF_24kHz16BitStereo, "SPSF_24kHz16BitStereo"));
	m_spFmts.push_back(SpFormat(SPSF_32kHz16BitStereo, "SPSF_32kHz16BitStereo"));
	m_spFmts.push_back(SpFormat(SPSF_44kHz16BitMono, "SPSF_44kHz16BitMono"));
	m_spFmts.push_back(SpFormat(SPSF_44kHz16BitStereo, "SPSF_44kHz16BitStereo"));
	m_spFmts.push_back(SpFormat(SPSF_48kHz16BitMono, "SPSF_48kHz16BitMono"));
	m_spFmts.push_back(SpFormat(SPSF_48kHz16BitStereo, "SPSF_48kHz16BitStereo"));
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Stores the format enumerator and its label, and derives the byte rate
 * from the label.
 *
 * The label is expected to look like "SPSF_<kHz>kHz<bits>Bit<Mono|Stereo>".
 * If it does not parse, the byte rate is left at 0.
 *
 * @param vl  SAPI stream-format enumerator.
 * @param sz  Textual label of that enumerator.
 */
SpFormat::SpFormat(SPSTREAMFORMAT vl, QString  sz)
	: m_val(vl), m_discription(sz), m_bytePS(0)
{
	// %u requires unsigned int targets, so parse into those rather than DWORD.
	unsigned int sampleKHz = 0;
	unsigned int bitsPerSample = 0;
	const std::string label = m_discription.toStdString();
	if (sscanf(label.c_str(), "SPSF_%ukHz%uBit", &sampleKHz, &bitsPerSample) != 2)
		return;

	// The 11/22/44 "kHz" labels stand for the 11025/22050/44100 Hz rates.
	DWORD sampleRate;
	if (sampleKHz == 11)
		sampleRate = 11025;
	else if (sampleKHz == 22)
		sampleRate = 22050;
	else if (sampleKHz == 44)
		sampleRate = 44100;
	else
		sampleRate = sampleKHz * 1000;

	m_bytePS = sampleRate * bitsPerSample / 8;

	// The channel layout is the label's suffix; two channels double the rate.
	if (m_discription.endsWith("Stereo"))
		m_bytePS *= 2;
}

猜你喜欢