At WWDC 2016, Apple introduced a very useful speech recognition API: the Speech framework. The Speech framework lets you quickly integrate speech input into your app.
WWDC 2016 session video on the Speech framework
Below is a quick walkthrough of a simple integration.
1 Requesting app authorization
- Microphone access: NSMicrophoneUsageDescription
- Speech recognition: NSSpeechRecognitionUsageDescription
You can set the permission prompt strings for these two keys in the target's Info tab, or add them directly to the Info.plist source:
<key>NSMicrophoneUsageDescription</key>
<string>Your microphone will be used to record your speech when you press the "Start Recording" button.</string>
<key>NSSpeechRecognitionUsageDescription</key>
<string>Speech recognition will be used to determine which words you speak into this device's microphone.</string>
2 Implementing Speech
First, import the required frameworks in your Swift file:
import Foundation
import UIKit
import Speech
import AudioToolbox
import AVFoundation
Then create the instances used for speech recognition:
// MARK: Properties
// Recognizer configured for Simplified Chinese (zh_CN).
private let speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "zh_CN"))!
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var audioEngine = AVAudioEngine()
private var result = ""
public var delegate: SpeechDelegate?
private var timer: Timer?
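startRecording() below begins with a call to checkSpeech(), which this post never defines. Here is a minimal sketch of what such a helper might look like, assuming it simply requests both permissions and reports a denial through the delegate; the body is an assumption, not the original implementation:

// Hypothetical sketch of the checkSpeech() helper used by startRecording().
private func checkSpeech() {
    // Triggers the NSSpeechRecognitionUsageDescription prompt on first call.
    SFSpeechRecognizer.requestAuthorization { status in
        OperationQueue.main.addOperation {
            if status != .authorized {
                // Report denial through the delegate protocol shown later.
                self.delegate?.authorizeDenied()
            }
        }
    }
    // Triggers the NSMicrophoneUsageDescription prompt on first call.
    AVAudioSession.sharedInstance().requestRecordPermission { granted in
        if !granted {
            OperationQueue.main.addOperation {
                self.delegate?.authorizeDenied()
            }
        }
    }
}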
Start speech recognition:
public func startRecording() throws {
    self.checkSpeech()

    // Cancel the previous task if it's running.
    if let recognitionTask = recognitionTask {
        recognitionTask.cancel()
        self.recognitionTask = nil
    }

    // Configure the audio session for recording.
    let audioSession = AVAudioSession.sharedInstance()
    try audioSession.setCategory(.record, mode: .measurement)
    try audioSession.setActive(true, options: .notifyOthersOnDeactivation)

    recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
    let inputNode = audioEngine.inputNode
    guard let recognitionRequest = recognitionRequest else {
        print("Unable to create a SFSpeechAudioBufferRecognitionRequest object")
        return
    }

    if inputNode.numberOfInputs > 0 {
        // Configure the request so that results are returned before audio recording is finished.
        recognitionRequest.shouldReportPartialResults = true

        // A recognition task represents a speech recognition session.
        // We keep a reference to the task so that it can be cancelled.
        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false
            if let result = result {
                if !result.isFinal && !result.bestTranscription.formattedString.isEmpty {
                    // Restart the silence timer: if no new voice input arrives
                    // within 2 seconds, end the audio automatically.
                    self.timer?.invalidate()
                    self.timer = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: false) { _ in
                        self.audioEngine.stop()
                        self.recognitionRequest?.endAudio()
                        self.audioEngine.inputNode.removeTap(onBus: 0)
                    }
                } else {
                    self.timer?.invalidate()
                    self.timer = nil
                }
                isFinal = result.isFinal
                self.delegate?.voiceChanged(result: result.bestTranscription.formattedString)
                self.result = result.bestTranscription.formattedString
                print("---isFinal", isFinal, result.bestTranscription.formattedString)
                if isFinal {
                    self.delegate?.didStopRecording(result: result.bestTranscription.formattedString)
                }
            }
            if error != nil || isFinal {
                // Tear down the session when recognition finishes or fails.
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)
                self.recognitionRequest = nil
                self.recognitionTask = nil
                self.timer?.invalidate()
                self.timer = nil
                print("---audioEngine stopped", isFinal)
                if error != nil {
                    self.delegate?.speechTaskError()
                }
            }
        }

        // Feed microphone buffers into the recognition request.
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()
        try audioEngine.start()
        self.result = ""
    }
}
Methods for starting and stopping recording:
// MARK: Start record
public func record() {
    do {
        try startRecording()
    } catch {
        // Audio session configuration can throw; report the error rather than crash.
        print("---- Speech failed to start recording:", error)
        return
    }
    if audioEngine.isRunning {
        print("---- Speech start recording")
    }
}
// MARK: Stop record
public func stop() {
    if audioEngine.isRunning {
        audioEngine.stop()
        recognitionRequest?.endAudio()
        audioEngine.inputNode.removeTap(onBus: 0)
        audioEngine.reset()
        self.timer?.invalidate()
        self.timer = nil
        print("---- Speech end recording")
    }
}
public protocol SpeechDelegate {
    // Called with the final transcription when recognition ends.
    func didStopRecording(result: String)
    // Called with each partial transcription as it arrives.
    func voiceChanged(result: String)
    // Called when the user denies speech or microphone authorization.
    func authorizeDenied()
    // Called when the recognition task fails.
    func speechTaskError()
}
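For completeness, here is one way a view controller might drive this wrapper. The post never shows the enclosing class declaration, so the name SpeechRecorder is an assumption for the class holding the code above:

import UIKit

final class RecordViewController: UIViewController, SpeechDelegate {
    // SpeechRecorder is a hypothetical name for the wrapper class shown above.
    private let recorder = SpeechRecorder()

    override func viewDidLoad() {
        super.viewDidLoad()
        recorder.delegate = self
        recorder.record()
    }

    // Receives every partial transcription.
    func voiceChanged(result: String) {
        print("partial:", result)
    }

    // Receives the final transcription when the task ends.
    func didStopRecording(result: String) {
        print("final:", result)
    }

    func authorizeDenied() {
        print("speech authorization was denied")
    }

    func speechTaskError() {
        recorder.stop()
    }
}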
Finally, a few things worth noting:
- Apple limits recognition on a per-device basis. The details are not published, but you can try contacting Apple for more information.
- Apple also limits recognition on a per-app basis.
- If you routinely run into these limits, contact Apple; they may be able to resolve it.
- Speech recognition consumes a fair amount of battery and network data.
- Each recognition session lasts only about one minute.
- Siri Speech's accuracy on Chinese is fairly poor: without support for specialized vocabulary, domain terms are basically not recognized.
- Siri Speech is also not very stable: recognition performance varies across device models, and under identical conditions it sometimes works and sometimes does not.
- Siri Speech depends heavily on the network; because recognition runs on Apple's servers, a result often never comes back.
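Given how network-dependent the service is, it can help to observe availability instead of assuming it. A small sketch using SFSpeechRecognizerDelegate (not part of the original post):

import Speech

final class AvailabilityWatcher: NSObject, SFSpeechRecognizerDelegate {
    private let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "zh_CN"))!

    override init() {
        super.init()
        recognizer.delegate = self
    }

    // Check before starting a recognition task.
    var canRecognize: Bool {
        recognizer.isAvailable
    }

    // Called when the recognizer gains or loses availability (e.g. network loss).
    func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer,
                          availabilityDidChange available: Bool) {
        print("speech recognition available:", available)
    }
}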