This article shows how to send a recorded audio Blob from the front end to a server, run it through the Baidu AI speech recognition Node.js SDK on the server, and return the recognition result to the front end for display. It is a complete walkthrough: the front end records and compresses the audio into a Blob, uploads it to the server, the server calls Baidu AI speech recognition, and the final result is sent back and displayed.
This article uses the third-party library Recorder.js. For how to use that library to capture WAV audio in HTML5 and upload it to a server or download it locally, see my earlier blog post — note, however, that the earlier post uploads to a PHP server, whereas here I build a WebSocket server on Node instead.
This is the speech recognition results of this blog:
From the documentation: if you need real-time speech recognition, long-form audio, wake-word support, or semantic analysis, you must use the Android SDK, the iOS SDK, or the Linux C++ SDK; the Node.js SDK I am using here does not support those features.
-
Audio must be at most 60s long; longer recordings return an error
-
The original recording file must be in pcm, wav or amr format (case-insensitive); pcm is recommended
-
The recording sample rate must be 16000 Hz, single channel (mono)
-
Mandarin, English, Cantonese and the Sichuan dialect are supported
-
Project structure
To call Baidu AI platform speech recognition via the Node.js SDK, see the Quick Start section of the documentation. First download nodejs-sdk, then copy the `speech` folder from the download directory into your project folder (`assets` is where recorded audio files are stored). Enter the `node` folder and install the dependencies with `npm install`.
My project folder directory as follows:
audio_asr_baidu ├─ package-lock.json └─ speech ├─ .gitignore ├─ assets │ ├─ 16k_test.pcm │ └─ recorder.wav ├─ cpp │ ├─ .gitignore │ ├─ README.md │ ├─ build.sh │ └─ main.cpp └─ node ├─ .gitignore ├─ README.md ├─ index.html ├─ main.js ├─ node_modules ├─ package-lock.json ├─ package.json └─ style.css
Then in the
node
folder ofindex.html
my client file,main.js
my file server.
In the main.js
build file websocket
server, first install its dependencies modules:
npm install ws
Then set up:
// WebSocket server setup (main.js).
// Fixes versus the original: the error/close handlers were registered
// outside the 'connection' callback, where `ws` is not in scope (a
// ReferenceError) and the braces did not balance; and 'disconnection'
// is not an event emitted by ws.Server, so the message handler
// registered inside it never fired. All per-client handlers now live
// inside the 'connection' callback.
const Server = require('ws').Server;
const wss = new Server({ port: 9001 });

// A new client connected; register per-client handlers on `ws`.
wss.on('connection', ws => {
    console.log('server connected');

    ws.on('message', msg => {
        console.log('server recived msg:' + msg);
    });

    ws.on('error', error => {
        console.log('Error:' + error);
    });

    ws.on('close', () => {
        console.log('Websocket is closed');
    });
});
Then in the index.html
middle:
// Open a WebSocket connection to the local recognition server and log
// once the handshake completes.
let ws = new WebSocket('ws://localhost:9001');
ws.onopen = () => {
    console.log('Connection to server opened');
};
Start the service:
node main.js
You can see this print in the console:
// client console output: Connection to server opened
// server console output: server connected
Next, implement recording on the client and pass the compressed audio Blob to the server:
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Simple Recorder.js demo with record, stop and pause</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
    <div id="controls">
        <button id="recordButton">Record</button>
        <button id="stopButton" disabled>Stop</button>
    </div>
    <p id="out-txt">You said:</p>
    <h3>Recordings</h3>
    <ol id="recordingsList"></ol>
    <script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
    <!-- The inline script below was missing its <script> wrapper in the
         original, and the second half of the code was garbled by machine
         translation; it is reconstructed here. -->
    <script>
        // Connect to the WebSocket server.
        let ws = new WebSocket('ws://localhost:9001');
        ws.onopen = e => {
            console.log('Connection to server opened');
        };

        URL = window.URL || window.webkitURL;

        var gumStream;   // stream from getUserMedia()
        var rec;         // Recorder.js object
        var input;       // MediaStreamAudioSourceNode

        var AudioContext = window.AudioContext || window.webkitAudioContext;
        var audioContext;

        var recordButton = document.getElementById("recordButton");
        var stopButton = document.getElementById("stopButton");

        recordButton.addEventListener("click", startRecording);
        stopButton.addEventListener("click", stopRecording);

        // Ask for microphone permission, then start recording.
        function startRecording() {
            console.log("recordButton clicked");
            var constraints = {
                audio: true,
                video: false
            };
            recordButton.disabled = true;
            stopButton.disabled = false;
            navigator.mediaDevices.getUserMedia(constraints).then(function(stream) {
                console.log("getUserMedia() success, stream created, initializing Recorder.js ...");
                audioContext = new AudioContext();
                gumStream = stream;
                input = audioContext.createMediaStreamSource(stream);
                rec = new Recorder(input, {
                    numChannels: 1 // mono, as required by the recognizer
                });
                rec.record();
                console.log("Recording started");
            }).catch(function(err) {
                // Permission denied or no device: restore button state.
                recordButton.disabled = false;
                stopButton.disabled = true;
            });
        }

        // Stop recording, release the microphone, and export a WAV blob.
        function stopRecording() {
            console.log("stopButton clicked");
            stopButton.disabled = true;
            recordButton.disabled = false;
            rec.stop();
            gumStream.getAudioTracks()[0].stop();
            // Create a WAV blob and hand it to createDownloadLink.
            rec.exportWAV(createDownloadLink);
        }

        // Display the recognition result sent back by the server.
        // NOTE(review): the original targeted "OUT-TXT", but the element id
        // is "out-txt" (ids are case-sensitive) — fixed here.
        ws.onmessage = e => {
            console.log(e.data);
            setTimeout(() => {
                document.getElementById("out-txt").innerHTML += e.data;
            }, 3000);
        };

        // Send the blob to the server and create an in-page download link
        // named after the recording date. The original never appended the
        // <li> to the list, so no link ever appeared — fixed here.
        function createDownloadLink(blob) {
            console.log(blob);
            ws.send(blob);
            var url = URL.createObjectURL(blob);
            var au = document.createElement('audio');
            var li = document.createElement('li');
            var link = document.createElement('a');
            var filename = new Date().toISOString();
            au.controls = true;
            au.src = url;
            link.href = url;
            link.download = filename + ".wav";
            link.innerHTML = "Save to disk";
            li.appendChild(au);
            li.appendChild(document.createTextNode(filename + ".wav "));
            li.appendChild(link);
            document.getElementById("recordingsList").appendChild(li);
        }
    </script>
</body>
</html>
This way the page creates a download link named with the recording date, so you can choose to download the file, and at the same time the audio Blob is sent to the server. On the back end the uploaded audio is not saved as a wav file; the binary data received over the WebSocket is passed directly to the Baidu speech recognition SDK, which returns the recognition result.
// main.js — WebSocket server that forwards received audio to Baidu AI
// speech recognition and returns the result to the client.
let AipSpeech = require("baidu-aip-sdk").speech;
let fs = require('fs');
let Server = require('ws').Server;

const wss = new Server({ port: 9001 });

// 务必替换百度云控制台中新建百度语音应用的 Api Key 和 Secret Key
// Created once and reused for every request, instead of per message.
const client = new AipSpeech(0, 'Api Key', 'Secret Key');

wss.on('connection', ws => {
    console.log('server connected');

    // Send a recognition result back to this client.
    const transTxt = resTxt => {
        ws.send(resTxt);
    };

    ws.on('message', data => {
        console.log('server recived audio blob');
        // The binary WebSocket message holds the raw WAV audio.
        // Buffer.from() replaces the deprecated `new Buffer()` constructor.
        const voiceBuffer = Buffer.from(data);
        // 16000 Hz mono wav, as required by the API.
        client.recognize(voiceBuffer, 'wav', 16000).then(function(result) {
            console.log('语音识别本地音频文件结果: ' + JSON.stringify(result));
            // Round-trip through JSON to get a plain serializable object,
            // then push it to the client.
            transTxt(JSON.parse(JSON.stringify(result)));
        }, function(err) {
            console.log(err);
        });
    });

    ws.on('error', error => {
        console.log('Error:' + error);
    });

    ws.on('close', () => {
        console.log('Websocket is closed');
    });
});
// NOTE: the original also registered wss.on('disconnection', ...), but
// ws.Server emits no such event — that handler was dead code and has
// been removed; the 'message' logging lives in the connection handler.
This is how the front-end recording is passed to the back end for speech recognition; the result is then sent back to the front end over the WebSocket and displayed in the page element: