webRTC AEC 滤波器的自适应

权值更新就是使用SSE指令集加速完成NLMS算法的频域计算,具体实现细节见代码注释。

static void FilterAdaptationSSE2(AecCore* aec,
                                 float* fft,
                                 float ef[2][PART_LEN1]) {
  int i, j;
  const int num_partitions = aec->num_partitions;  // 分块数
  for (i = 0; i < num_partitions; i++) {
	  // xpos指向分块的第一个元素,每块65个元素
    int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
    int pos = i * PART_LEN1;
    // 边界检查,如果分块位置超过块数,回到缓存的第一块的起始地址
    if (i + aec->xfBufBlockPos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load xfBuf(远场数据的频域表达) and ef(消回声估计误差的频域表达).

		/*_mm_load_ps它的输入是一个指向float的指针,返回的就是一个__m128类型的数据,从函数的角度理解,
		就是把一个float数组的四个元素依次读取,返回一个组合的__m128类型的SSE数据类型,
		从而可以使用这个返回的结果传递给其它的SSE指令进行运算,比如加法等;从汇编的角度理解,
		它对应的就是读取内存中连续四个地址的float数据,将其放入SSE新的暂存器(XMM0~8)中,
		从而给其他的指令准备好数据进行计算,_ps表示并行。*/

		// 加载数据
      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
      const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
      const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);

      // Calculate the product of conjugate(xfBuf) by ef.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm 实部
      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe 虚部
	  /*__m128 _mm_mul_ps(__m128 a , __m128 b )将a和b的四个单精度浮点值相乘。
	  返回值r0 := a0 * b0;r1 := a1 * b1;r2 := a2 * b2;r3 := a3 * b3*/
      const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);	//	远场数据实部与误差实部相乘
      const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);	//  远场数据虚部与误差虚部相乘
      const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); //  远场数据实部与误差虚部相乘
      const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); //  远场数据虚部与误差实部相乘
      const __m128 e = _mm_add_ps(a, b);			//  a+b
      const __m128 f = _mm_sub_ps(c, d);			//  c-d

      // Interleave real and imaginary parts.
		// extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B);
	    // 返回一个__m128的寄存器,Selects and interleaves the lower two single-precision,  
		// floating-point values from a and b  
		// r0=_A0, r1=_B0, r2=_A1, r3=_B1  
      const __m128 g = _mm_unpacklo_ps(e, f);

	  //	extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B);
	  //	返回一个__m128的寄存器,Selects and interleaves the upper two single-precision,  
	  //	floating-point values from a and b  
	  //	r0=_A2, r1=_B2, r2=_A3, r3=_B3 
      const __m128 h = _mm_unpackhi_ps(e, f);

      // Store,extern void _mm_storeu_ps(float *_V, __m128 _A); 
	  //返回为空,Stores four single-precision, floating-point values,  
	  //The address does not need to be 16-byte aligned  
	  //_V[0]=_A0, _V[1]=_A1, _V[2]=_A2, _V[3]=_A3  
      _mm_storeu_ps(&fft[2 * j + 0], g);
      _mm_storeu_ps(&fft[2 * j + 4], h);
    }

    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
                   -aec->xfBuf[1][xPos + PART_LEN],
                   ef[0][PART_LEN],
                   ef[1][PART_LEN]);

	// 对fft进行傅里叶逆变换
    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      float scale = 2.0f / PART_LEN2;
      const __m128 scale_ps = _mm_load_ps1(&scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
        _mm_storeu_ps(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

	// 权值更新,权值为频域表达形式
    {
      float wt1 = aec->wfBuf[1][pos];
      aec->wfBuf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
        __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
        const __m128 fft_re =
            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
        const __m128 fft_im =
            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
        _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
        _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
      }
      aec->wfBuf[1][pos] = wt1;
    }
  }
}

猜你喜欢

转载自blog.csdn.net/ljl86400/article/details/81080000