JS front end: implementing speech recognition (ASR) and recording

Preface

During my internship, my team lead asked me to verify that recording and speech recognition could be implemented on the web front end. After looking around, I found there are almost no tutorials on speech recognition, i.e. speech-to-text, in the browser.

There is a conventional solution: the front end records audio and sends it to a back end, which calls a speech-to-text API such as Baidu's for recognition. But that means writing an extra back-end service. Calling the Baidu API directly from the front end runs into cross-origin (CORS) problems, and embedding the API key in front-end code is not safe anyway. On top of that, Baidu's recognition accuracy is not particularly high.

Hence this article: implementing speech recognition with the browser's native Web Speech API.
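
As a preview, here is a minimal sketch of that native API (the webkit prefix is still required by Chrome; the full Vue integration comes later in the article):

const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
const recognition = new SpeechRecognition()
recognition.lang = 'cmn-Hans-CN' // Mandarin (mainland China)
recognition.interimResults = true // emit partial results while the user is still speaking
recognition.onresult = function (event) {
  console.log(event.results[0][0].transcript)
}
recognition.start() // triggers the microphone permission prompt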

Environment

Name       Version
node       v17.1.0
npm        8.1.4
@vue/cli   4.5.15
vue        2
vant       2

Browser compatibility

[Image: browser compatibility table for the Web Speech API]

As the picture shows, the commonly used browsers basically all support it. In actual testing, though, Chrome sometimes fails for network reasons (its recognition service runs on Google's servers); Edge performs best on PC but is almost unusable on Android; and Safari on iOS works perfectly.
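
Because support varies this much, it is worth feature-detecting before wiring anything up. A minimal check along these lines:

const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
const hasRecognition = typeof SpeechRecognition !== 'undefined'
const hasGetUserMedia = !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)

if (!hasRecognition) console.log('This browser does not support speech recognition')
if (!hasGetUserMedia) console.log('This browser does not support microphone capture')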

Recording and speech recognition

HZRecorder.js package

Below is a wrapper script found online.

HZRecorder.js

function HZRecorder (stream, config) {
  config = config || {}
  config.sampleBits = config.sampleBits || 16 // sample size in bits: 8 or 16
  config.sampleRate = config.sampleRate || 16000 // sample rate: 16 kHz

  let context = new (window.AudioContext || window.webkitAudioContext)()
  let audioInput = context.createMediaStreamSource(stream)
  let createScript = context.createScriptProcessor || context.createJavaScriptNode
  let recorder = createScript.apply(context, [4096, 1, 1])

  let audioData = {
    size: 0 // length of the recording
    , buffer: [] // recording buffer
    , inputSampleRate: context.sampleRate // input sample rate
    , inputSampleBits: 16 // input sample size: 8 or 16 bits
    , outputSampleRate: config.sampleRate // output sample rate
    , outputSampleBits: config.sampleBits // output sample size: 8 or 16 bits
    , input: function (data) {
      this.buffer.push(new Float32Array(data))
      this.size += data.length
    }
    , compress: function () {
      // merge the buffered chunks
      let data = new Float32Array(this.size)
      let offset = 0
      for (let i = 0; i < this.buffer.length; i++) {
        data.set(this.buffer[i], offset)
        offset += this.buffer[i].length
      }
      // downsample by keeping every n-th sample
      let compression = parseInt(this.inputSampleRate / this.outputSampleRate)
      let length = Math.floor(data.length / compression) // must be an integer, or the typed array throws
      let result = new Float32Array(length)
      // eslint-disable-next-line one-var
      let index = 0, j = 0
      while (index < length) {
        result[index] = data[j]
        j += compression
        index++
      }
      return result
    }
    , encodeWAV: function () {
      let sampleRate = Math.min(this.inputSampleRate, this.outputSampleRate)
      let sampleBits = Math.min(this.inputSampleBits, this.outputSampleBits)
      let bytes = this.compress()
      let dataLength = bytes.length * (sampleBits / 8)
      let buffer = new ArrayBuffer(44 + dataLength)
      let data = new DataView(buffer)

      let channelCount = 1 // mono
      let offset = 0

      let writeString = function (str) {
        for (let i = 0; i < str.length; i++) {
          data.setUint8(offset + i, str.charCodeAt(i))
        }
      }

      // RIFF chunk identifier
      writeString('RIFF')
      offset += 4
      // total bytes from the next address to the end of file, i.e. file size - 8
      data.setUint32(offset, 36 + dataLength, true)
      offset += 4
      // WAV file marker
      writeString('WAVE')
      offset += 4
      // format chunk marker
      writeString('fmt ')
      offset += 4
      // format chunk length, usually 0x10 = 16
      data.setUint32(offset, 16, true)
      offset += 4
      // audio format (1 = PCM)
      data.setUint16(offset, 1, true)
      offset += 2
      // number of channels
      data.setUint16(offset, channelCount, true)
      offset += 2
      // sample rate: samples per second per channel
      data.setUint32(offset, sampleRate, true)
      offset += 4
      // byte rate: channels × sample rate × bits per sample / 8
      data.setUint32(offset, channelCount * sampleRate * (sampleBits / 8), true)
      offset += 4
      // block align: bytes per sample frame, channels × bits per sample / 8
      data.setUint16(offset, channelCount * (sampleBits / 8), true)
      offset += 2
      // bits per sample
      data.setUint16(offset, sampleBits, true)
      offset += 2
      // data chunk identifier
      writeString('data')
      offset += 4
      // total size of the sample data, i.e. file size - 44
      data.setUint32(offset, dataLength, true)
      offset += 4
      // write the sample data
      if (sampleBits === 8) {
        for (let i = 0; i < bytes.length; i++, offset++) {
          let s = Math.max(-1, Math.min(1, bytes[i]))
          let val = s < 0 ? s * 0x8000 : s * 0x7FFF
          val = parseInt(255 / (65535 / (val + 32768)))
          data.setUint8(offset, val) // 8-bit WAV samples are unsigned
        }
      } else {
        for (let i = 0; i < bytes.length; i++, offset += 2) {
          let s = Math.max(-1, Math.min(1, bytes[i]))
          data.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true)
        }
      }

      return new Blob([data], { type: 'audio/wav' })
    }
  }

  // start recording
  this.start = function () {
    audioInput.connect(recorder)
    recorder.connect(context.destination)
  }

  // stop recording
  this.stop = function () {
    recorder.disconnect()
  }

  // stop and return the recording as a WAV Blob
  this.getBlob = function () {
    this.stop()
    return audioData.encodeWAV()
  }

  // play the recording back through an <audio> element
  this.play = function (audio) {
    audio.src = window.URL.createObjectURL(this.getBlob())
  }

  // return the Blob for uploading
  this.upload = function () {
    return this.getBlob()
  }

  // audio capture callback: buffer each chunk of channel 0
  recorder.onaudioprocess = function (e) {
    audioData.input(e.inputBuffer.getChannelData(0))
  }

  return this
}

export {
  HZRecorder
}
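
For reference outside Vue, here is a minimal standalone usage sketch of the wrapper (the three-second timeout is purely illustrative):

import { HZRecorder } from './HZRecorder'

let recorder

// ask for microphone access, then build the recorder from the stream
navigator.mediaDevices.getUserMedia({ audio: true, video: false })
  .then(stream => {
    recorder = new HZRecorder(stream)
    recorder.start() // begin capturing
    setTimeout(() => {
      const wavBlob = recorder.getBlob() // stop and encode to WAV
      console.log('recorded', wavBlob.size, 'bytes')
    }, 3000)
  })
  .catch(err => console.log('microphone access failed:', err.name))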

Vue.js component

<template>
  <div id="page">
    <div class="content">
      <div>
        <div style="display: block;align-items: center;text-align: center;">
          <label>Recognition result: {{ result }}</label>
        </div>
        <div style="display: block;align-items: center;text-align: center;margin: 20px 0 20px 0">
          <label>Recognition result 2: {{ result2 }}</label>
        </div>
        <audio ref="audiodiv" type="audio/wav" controls />
      </div>
      <div style="display: inline-flex;margin: 20px 0 20px 0">
        <van-button
          type="warning"
          @click="speakClick"
          square
        >Start recognition
        </van-button>
        <van-button
          type="warning"
          @click="speakEndClick"
          square
        >Stop recognition
        </van-button>
      </div>
      <div>
        <van-button
          type="warning"
          @click="speakClick2"
          square
        >Start recording
        </van-button>
        <van-button
          type="warning"
          @click="speakEndClick2"
          square
        >Stop recording
        </van-button>
      </div>
    </div>
  </div>
</template>

<script>
import { HZRecorder } from '../js/HZRecorder'

export default {
  name: 'home',
  data () {
    return {
      recorder: '',
      recognition: '',
      audioSrc: '',
      result: '',
      result2: ''
    }
  },
  created () {
    if ((navigator.mediaDevices && navigator.mediaDevices.getUserMedia) || navigator.getUserMedia || navigator.webkitGetUserMedia || navigator.mozGetUserMedia) {
      this.getUserMedia({ video: false, audio: true }) // request access to the microphone
    } else {
      console.log('Your browser does not support accessing user media devices')
    }
  },
  methods: {
    speakClick () {
      const vue = this
      vue.result2 = ''
      vue.result = ''
      console.log('start recognition')

      let SpeechRecognition = window.SpeechRecognition || window.mozSpeechRecognition || window.webkitSpeechRecognition || window.msSpeechRecognition || window.oSpeechRecognition
      if (SpeechRecognition) {
        vue.recognition = new SpeechRecognition()
        vue.recognition.continuous = true
        vue.recognition.interimResults = true
        vue.recognition.lang = 'cmn-Hans-CN' // Mandarin (mainland China)
      }

      vue.recognition.start()
      vue.recognition.onstart = function () {
        console.log('recognition started...')
      }
      // eslint-disable-next-line one-var
      let final_transcript = '', interim_transcript = ''
      vue.recognition.onerror = function (event) {
        console.log('recognition error')
        console.log(event)
        if (event.error === 'no-speech') {
          console.log('no-speech')
        }
        if (event.error === 'audio-capture') {
          console.log('audio-capture')
        }
        if (event.error === 'not-allowed') {
          console.log('not-allowed')
        }
      }
      vue.recognition.onresult = function (event) {
        console.log('recognition succeeded')
        if (typeof (event.results) === 'undefined') {
          console.log('recognition result undefined')
          vue.recognition.onend = null
          vue.recognition.stop()
        } else {
          console.log(event.results)
          interim_transcript = '' // interim results are rebuilt on every event
          for (let i = event.resultIndex; i < event.results.length; ++i) {
            if (event.results[i].isFinal) {
              final_transcript += event.results[i][0].transcript
            } else {
              interim_transcript += event.results[i][0].transcript
            }
          }
          final_transcript = capitalize(final_transcript)
          console.log('final_transcript: ' + final_transcript)
          console.log('interim_transcript: ' + interim_transcript)
          if (final_transcript) {
            vue.result = final_transcript
          }
          if (interim_transcript) {
            vue.result2 = interim_transcript
          }
        }
      }

      let first_char = /\S/

      // capitalize the first non-whitespace character
      function capitalize (s) {
        return s.replace(first_char, function (m) {
          return m.toUpperCase()
        })
      }
    },
    speakEndClick () {
      const vue = this
      console.log('end recognition')
      vue.recognition.stop() // stop recognizing
      vue.recognition.onend = function () {
        console.log('recognition ended')
      }
    },
    speakClick2 () {
      const vue = this
      console.log('start')
      vue.recorder.start() // start recording
    },
    speakEndClick2 () {
      const vue = this
      console.log('end')
      let audioData = new FormData()
      audioData.append('speechFile', vue.recorder.getBlob()) // ready to POST to a back end if needed
      vue.recorder.play(this.$refs.audiodiv) // play the recording back in the <audio> element
    },
    getUserMedia (constrains) {
      let that = this
      if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
        // current standard API, promise based
        navigator.mediaDevices.getUserMedia(constrains).then(stream => {
          that.success(stream)
          that.recorder = new HZRecorder(stream)
          console.log('recorder initialized')
        }).catch(err => {
          that.error(err)
        })
      } else {
        // legacy vendor-prefixed APIs are callback based, not promise based
        let legacyGetUserMedia = navigator.getUserMedia || navigator.webkitGetUserMedia || navigator.mozGetUserMedia
        if (legacyGetUserMedia) {
          legacyGetUserMedia.call(navigator, constrains, stream => {
            that.success(stream)
            that.recorder = new HZRecorder(stream)
            console.log('recorder initialized')
          }, err => {
            that.error(err)
          })
        }
      }
    },
    // success callback
    success (stream) {
      console.log('permission granted, microphone opened')
    },
    // error callback
    error (error) {
      console.log('failed to access user media devices:', error.name, error.message)
    }
  }
}
</script>

<style scoped>
#page{
  position: absolute;
  display: flex;
  width: 100%;
  height: 100%;
  align-items: center;
  text-align: center;
  vertical-align: middle;
}
.content{
  width: 30%;
  height: 30%;
  margin: 0 auto;
}
</style>

Recognition results

Chrome

[Screenshot: recognition result in Chrome]

Safari

[Screenshot: recognition result in Safari]

iOS Safari

[Screenshot: recognition result in iOS Safari]

Important

Microphone permission can be requested and granted on localhost. In any other environment, the page must be served over HTTPS before the browser will let you request microphone access.
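
With @vue/cli, one way to get HTTPS during development is the dev server's https option. A minimal sketch of a vue.config.js in the project root (the self-signed certificate will show a browser warning you have to click through):

// vue.config.js
module.exports = {
  devServer: {
    https: true // serve the dev build over HTTPS with a self-signed certificate
  }
}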

Reference documentation

Web APIs | MDN

The original text is posted on my blog, whitemoon.top.
