如何快速计算图像梯度、幅值以及梯度方向角 -- 使用SSE指令集

给定一副图像I，如何有效地计算图像上每个位置的梯度Ix,Iy，梯度幅值M，方向角Theta：

Ix(x,y) =I(x+1,y) - I(x-1,y)

Iy(x,y) =I(x,y+1) -I(x,y-1)

M(x,y) = sqrt(Ix(x,y)*Ix(x,y) +Iy(x,y)*Iy(x,y) )

Theta(x,y) = atan2(Iy(x,y),Ix(x,y) )

从上面的公示来看，计算起来并不困难，oepnCV里有Sobel函数，可以直接计算出Ix,Iy。这里我们主要讲的是高效率编程，自己编程来解决这个问题。

Intel生产的CPU基本上都支持SSE指令集，这一指令集允许同时对四个单精度浮点数进行运算，比如在一个CPU的clock里完成计算4个float与另外4个float的分别对应相乘。我在之前的博文里公开了我写的DPM目标检测代码，其中用到了SSE，那是我第一次使用SSE进行加速计算，效果颇好。之后有幸看到了Piotr Dollar大神的关于行人检测的公开代码（文章是：The Fastest Pedestrian Detector in the West），里面的mex函数文件大量使用了SSE指令集，让我对SSE编程有了进一步了解。

我以前用OpenCV的Sobel函数来计算Ix,Iy，然后再算M和Theta，一方面效率不够理想，另一方面程序的可控性较差——自己想在计算过程中增加一两个简单运算不太方便。由此我自己写了相关代码，几经修改，形成了几个不同的版本，下面将这些代码一一贴出来并作简单讲解。

------------------------------------

代码一：oriented_gradient.cpp

说明：该cpp中定义了两个版本的yuOrientedGradient函数，该函数的功能是，输入一幅灰度或彩色图像，计算图像上每个像素位置的梯度幅值和方向角。若输入是多通道图像，梯度幅值是取各个颜色通道梯度幅值的最大值。输出的方向角不是实数值，而是离散的整数值，比如指定orientation_bins=9，sensitive=true，则将一个取值在[0,2*pi)的方向角划分到[0,20)(度),[20,40),[40,60),...,[340,360)共18个bin中的一个。这种做法其实是为了后续进一步计算HOG特征服务的。如果将orientation的数据类型改为int型，将orientation_bins设置为360，则可以计算出每个像素位置方向角的角度，精度为1°. 继续提高orientation_bins的值，可以增加方向角的估计精度。

#include "cv.h"
using namespace cv;

// The two versions of function proposed here share the same functionality whereas the V2 is relatively faster.
void  yuOrientedGradient( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive );
void  yuOrientedGradient_V2( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive );
/* test:
	Mat im = imread("0001.jpg");
	Mat imF; im.convertTo(imF,CV_32F);
	Mat O1, O2, G1, G2;
	yuOrientedGradient( imF, O1, G1, 9, true );
	yuOrientedGradient( imF, O2, G2, 9, true );
	absdiff( O1, O2, O1 );
	absdiff( G1, G2, G1 );
	double a, b;
	minMaxLoc( G1, 0, &a );
	minMaxLoc( O1, 0, &b );
	cout<<a<<endl<<b<<endl; // a==0, b==0
*/


/*
 Calculate the orientation at each pixel.
 
 The calculated orientations are snapped to one of the N bins which are equally spaced in:
 [0,180), if sensitive==true, then values of orientation are between [0,num_orientation_bins-1);
 [0,360), if sensitive==false, then values of orientation are between [0,2*num_orientation_bins-1).

 The output orientation (CV_8UC1) & gradient (CV_32FC1) are 2 pixels smaller both in rows
 and in cols than input img (multi-channel,float type).
 
 theta = angle( OP ), where O = (0,0), P = (dx,dy)
 theta is then snapped to one of nine orientations [0,20) [20,40), ... , [160 180)
 How to snap:
 e.g. we set the bins as [0,20), [20,40), ..., [160,180),
      then for any theta in [0,180), cos(theta-i*20) achieves max when theta is in [i*20,i*20+20)
      as: cos(a-b) = cos(a)*cos(b) + sin(a)*sin(b)
      so: cos(theta-i*20) = cos(theta)*cos(i*20) + sin(theta)*sin(i*20)
      now that: cos(theta) = x/sqrt(x^2+y^2), sin(theta) = y/sqrt(x^2+y^2)
      so: x*cos(i*20)+y*sin(i*20) will achieve max when the orientation of (x,y) is in [i*20,i*20+20)
      make: uu = [cos(0) cos(20) ... cos(160)]; vv = [sin(0) sin(20) ... sin(180)];
      then: x*uu(i)+y*vv(i) achieves max when theta is in [i*20,i*20+20), namely the i-th orientation bin.

 by: YU Xianguo, 2015/06/24
*/
void  yuOrientedGradient( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive )
{
	assert( img.depth()==CV_32F );
	assert( img.rows>2 && img.cols>2 );
	assert( orientation_bins>1 && orientation_bins<256 );

	// create output: only calc for img(Rect(1,1,cols-2,rows-2)).
	int rows = img.rows, cols = img.cols, chans = img.channels();
	orientation.create( rows-2, cols-2, CV_8UC1 );
	gradient.create( rows-2, cols-2, CV_32FC1 );
	
	// calculate gradient for img(Rect(1,1,cols-2,rows-2))
	// multi-channel operation
	Mat Left = img( Rect(0,1,cols-2,rows-2) );
	Mat Right = img( Rect(2,1,cols-2,rows-2) );
	Mat Up = img( Rect(1,0,cols-2,rows-2) );
	Mat Down = img( Rect(1,2,cols-2,rows-2) );
	Mat _Dx = Right - Left;
	Mat _Dy = Down - Up;

	Mat Dx, Dy;
	if( chans==1 ){
		Dx = _Dx;
		Dy = _Dy;
		gradient = 0;
		accumulateSquare( Dx, gradient );
		accumulateSquare( Dy, gradient );
	}
	else{
		rows = _Dx.rows, cols = _Dx.cols;
		//	for each element in Dx & Dy: <dx0,dx1,dx2> & <dy0,dy1,dy2>
		//	calculate the square sum: <d0,d1,d2>, where d = dx^2 + dy^2
		//	select d(i) = max(d0,d1,d2)
		//	then set the corresponding value of DDx by dx(i), set DDy by dy(i)
		Dx.create( gradient.size(), gradient.type() );
		Dy.create( gradient.size(), gradient.type() );
		float *a = (float*)_Dx.data;
		float *b = (float*)_Dy.data;
		float *c = (float*)Dx.data;
		float *d = (float*)Dy.data;
		float *g = (float*)gradient.data;
		int pg = gradient.step1()-cols;
		int x, y, chn;
		float dv, mdx, mdy, mdv;
		for( y=0; y++<rows; g+=pg ){
			for( x=0; x++<cols; ){
				for( mdv=-1, chn=0; chn++<chans; ){
					float &dx = *(a++);
					float &dy = *(b++);
					dv = dx*dx + dy*dy;
					if( mdv<dv ){
						mdv = dv;
						mdx = dx;
						mdy = dy;
					}
				}
				*(c++) = mdx;
				*(d++) = mdy;
				*(g++) = mdv; // gradient = Dx.^2 + Dy.^2
			}
		}
	}	

	// construct orientation snaps
	vector<double> uu(orientation_bins);
	vector<double> vv(orientation_bins);
	double bin_span = CV_PI / orientation_bins;
	for( int k=0; k<orientation_bins; k++ ){
		double theta = k * bin_span;
		uu[k] = cos( theta );
		vv[k] = sin( theta );
	}

	// val = DDx * uu[0] + DDy * vv[0] = DDx
	Mat val, maxval, bw;	
	Dx.copyTo( val ); // Dx.convertTo( val, CV_32FC1 );
	maxval = abs( val );
	orientation = 0;
	if( !sensitive ){
		for( int i=1; i<orientation_bins; i++ ){
			val = Dx*uu[i] + Dy*vv[i]; //addWeighted( Dx, uu[i], Dy, vv[i], 0, val, CV_32FC1 );
			val = abs(val);
			bw = maxval < val;
			if( i<orientation_bins-1 )
				val.copyTo( maxval, bw );
			orientation.setTo(i,bw);
		}
	}
	else{
		bw = val < 0;
		orientation.setTo( orientation_bins, bw );
		for( int i=1; i<orientation_bins; i++ ){
			val = Dx*uu[i] + Dy*vv[i]; //addWeighted( Dx, uu[i], Dy, vv[i], 0, val, CV_32FC1 );
			bw = maxval < val;
			if( i<orientation_bins-1 )
				val.copyTo( maxval, bw );
			orientation.setTo( i, bw );
			val = -val;
			bw = maxval < val;
			if( i<orientation_bins-1 )
				val.copyTo( maxval, bw );
			orientation.setTo( i+orientation_bins, bw );
		}
	}

	cv::sqrt( gradient, gradient );

	return;
}


void  yuOrientedGradient_V2( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive )
{
	typedef uchar T; // data type of orientation
	const int orientation_type = CV_8UC1; // must accord with T

	assert( img.depth()==CV_32F );
	assert( img.rows>2 && img.cols>2 );
	assert( orientation_bins>1 && orientation_bins<256 ); // cause we use uchar type orientation

	int rows = img.rows, cols = img.cols, channels = img.channels();
	int result_rows = rows - 2, result_cols = cols - 2;
	orientation.create( result_rows, result_cols, orientation_type );
	gradient.create( result_rows, result_cols, CV_32FC1 );

	// construct orientation snaps
	vector<float> uu(orientation_bins);
	vector<float> vv(orientation_bins);
	float bin_span = float(CV_PI) / orientation_bins;
	for( int k=0; k<orientation_bins; k++ ){
		float theta = k * bin_span;
		uu[k] = cosf( theta );
		vv[k] = sinf( theta );
	}

	T *orient = (T*)orientation.data; int gap1=orientation.step1()-result_cols;
	float *grad = (float*)gradient.data; int gap2=gradient.step1()-result_cols;

	// orientation(y,x) is from: img(y,x),img(y+2,x),img(y,x+2),img(y+2,x+2)	
	const float *Up = (float*)img.data + channels;
	const float *Down = Up + img.step1()*2;
	const float *Left = (float*)img.data + img.step1();
	const float *Right = Left + 2*channels;
	int gap = img.step1() - result_cols*channels;

	float dx, dy, dv, mdx, mdy, mdv;
	for( int y=0; y<result_rows; y++ ){
		for( int x=0; x<result_cols; x++ ){
			mdx = *(Right++) - *(Left++);
			mdy = *(Down++) - *(Up++);
			mdv = mdx*mdx + mdy*mdy;
			for( int c=1; c<channels; c++ ){
				dx = *(Right++) - *(Left++);
				dy = *(Down++) - *(Up++);
				dv = dx*dx + dy*dy;
				if( mdv<dv ){
					mdv = dv;
					mdx = dx;
					mdy = dy;
				}
			}
			// snap to one orientation bin
			float maxVal = mdx < 0 ? -mdx : mdx; // uu[0]*mdx + vv[0]*mdy == mdx
			int maxOrient = 0;
			if( sensitive ){
				if( mdx<0 ) maxOrient += orientation_bins;
				for( int k=1; k<orientation_bins; k++ ){
					float val = uu[k]*mdx + vv[k]*mdy;
					if( maxVal<val ){
						maxVal = val;
						maxOrient = k;
					}
					else if( maxVal<-val ){
						maxVal = -val;
						maxOrient = k + orientation_bins;
					}
				}				
			}
			else{
				for( int k=1; k<orientation_bins; k++ ){
					float val = uu[k]*mdx + vv[k]*mdy;
					if( val<0 ) val = -val;
					if( maxVal<val ){
						maxVal = val;
						maxOrient = k;
					}
				}
			}

			*(orient++) = maxOrient;
			*(grad++) = mdv;
		}
		Up+=gap, Down+=gap, Left+=gap, Right+=gap;
		orient+=gap1, grad+=gap2;
	}

	cv::sqrt( gradient, gradient );

	return;
}

代码二：sse.h

说明：将常用sse指令打包放入一个头文件中，方便使用。这个头文件是从P'Dollar的工具箱中拿出来的，其中注释部分是我写的，一些函数的形式被我修改了，另外我还加入了若干个函数。这个头文件极大地方便了sse编程，对我用处颇大。

/*******************************************************************************
* Piotr's Image&Video Toolbox      Version 3.23
* Copyright 2013 Piotr Dollar & Ron Appel.  [pdollar-at-caltech.edu]
* Please email me if you find bugs, or have suggestions or questions!
* Licensed under the Simplified BSD License [see external/bsd.txt]
*******************************************************************************/

/* The interpretations are written by YU Xianguo, 2015/06/26.
 * Notification:
 * The defined functions accords with the form: __m128(i) fun( dst, src ); OR: __m128(i) fun( src1, src2 );
 * In my interpretation, x[4] means x is a float* (or int*), or it is a __m128 (or __m128i), and the 4 values 
 * of x are treated independently. x_4 means x is a __m128 (or __m128i), and it is treated as a 128 byte variable.
 *
 * Besides the comments, some functions' parameters form are changed:
 * if the input parameter is a float[4], in original form, it is formed as float&, here I changed it to float*.
 *
 * I also add some new functions by checking the SSE instructions presented in online MSDN:
 * https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
*/

#pragma once

#include <xmmintrin.h>
#include <emmintrin.h> // SSE2:<e*.h>, SSE3:<p*.h>, SSE4:<s*.h>

#define RETf inline __m128
#define RETi inline __m128i

/* set, load and store values */
// return all zeros
RETf SSE_ZERO() { return _mm_setzero_ps(); }
// Set 4 float in __m128 to the same value.
RETf SSE_SET( const float &x ) { return _mm_set1_ps(x); } // Same as _mm_set_ps1, Sets the 4 floats to x. Return: r0 := r1 := r2 := r3 := x 
// Set 4 floats in __m128 to 4 different values.
RETf SSE_SET( float x, float y, float z, float w ) { return _mm_set_ps(x,y,z,w); } // Set the 4 floats to the four inputs. Return: r0 := x r1 := y r2 := z r3 := w 
// Set 4 int in __m128i to the same value.
RETi SSE_SET( const int &x ) { return _mm_set1_epi32(x); } // Sets the 4 signed 32-bit integer values to x. Return: r0 := r1 := r2 := r3 := x
// Loads 4 float (address 16-byte aligned) into __m128
RETf SSE_LD( const float *x ) { return _mm_load_ps(x); } // Loads 4 floats. The address must be 16-byte aligned. Return: r0 := x[0] r1 := x[1] r2 := x[2] r3 := x[3] 
// Loads 4 float (address need not be aligned) into __m128
RETf SSE_LDu( const float *x ) { return _mm_loadu_ps(x); } // Load 4 floats, The address does not need to be 16-byte aligned.
// x[4] = y[4] (address aligned)
RETf SSE_STR( float *x, const __m128 y ) { _mm_store_ps(x,y); return y; } // Stores 4 floats. The address must be 16-byte aligned. Return: x[0] = y0 x[1] = y1 x[2] = y2 x[3] = y3
// x = y[0]
RETf SSE_STR1( float *x, const __m128 y ) { _mm_store_ss(x,y); return y; } // Stores the lower float value. Return: x[0] := y0
// x[4] = y[4] (no address aligned)
RETf SSE_STRu( float *x, const __m128 y ) { _mm_storeu_ps(x,y); return y; } // Stores 4 floats. The address does not need to be 16-byte aligned. Return: x[0] = y0 x[1] = y[1] x[2] = y[2] x[3] = y3
// x[4] = y (address aligned)
RETf SSE_STR( float *x, const float y ) { return SSE_STR(x,SSE_SET(y)); }

/* arithmetic operators */
// return x[4] + y[4] (int)
RETi SSE_ADD( const __m128i x, const __m128i y ) { return _mm_add_epi32(x,y); } // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. Return: r0 := a0 + b0 ~ r3 := a3 + b3
// return x[4] + y[4]
RETf SSE_ADD( const __m128 x, const __m128 y ) { return _mm_add_ps(x,y); } // Adds the 4 float values of a and b. Return: r0 := a0 + b0 ~ r3 := a3 + b3
// return x[4] + y[4] + z[4]
RETf SSE_ADD( const __m128 x, const __m128 y, const __m128 z ) { return SSE_ADD(SSE_ADD(x,y),z); }
// return x[4] + y[4] + z[4]
RETf SSE_ADD( const __m128 a, const __m128 b, const __m128 c, const __m128 &d ) { return SSE_ADD(SSE_ADD(SSE_ADD(a,b),c),d); }
// return x[4] - y[4]
RETf SSE_SUB( const __m128 x, const __m128 y ) { return _mm_sub_ps(x,y); } // Subtract the 4 floats of a and b. Return: r0 := a0 - b0 ~ r3 := a3 - b3
// return x[4] * y[4]
RETf SSE_MUL( const __m128 x, const __m128 y ) { return _mm_mul_ps(x,y); } // Multiplies the 4 floats of a and b. Return: r0 := a0 * b0 ~ r3 := a3 * b3
// return x[4] * y
RETf SSE_MUL( const __m128 x, const float y ) { return SSE_MUL(x,SSE_SET(y)); }
// return x * y[4]
RETf SSE_MUL( const float x, const __m128 y ) { return SSE_MUL(SSE_SET(x),y); }
// x[4] = x[4] + y[4]
RETf SSE_INC( __m128 &x, const __m128 y ) { return x = SSE_ADD(x,y); }
// x[4] = x[4] + y[4]
RETf SSE_INC( float *x, const __m128 y ) { __m128 t=SSE_ADD(SSE_LD(x),y); return SSE_STR(x,t); }
// x[4] = x[4] - y[4]
RETf SSE_DEC( __m128 &x, const __m128 y ) { return x = SSE_SUB(x,y); }
// x[4] = x[4] - y[4]
RETf SSE_DEC( float *x, const __m128 y ) { __m128 t=SSE_SUB(SSE_LD(x),y); return SSE_STR(x,t); }
// return min( x[4], y[4] )
RETf SSE_MIN( const __m128 x, const __m128 y ) { return _mm_min_ps(x,y); } // Computes the minima of the 4 floats of a and b. Return: r0 := min(a0, b0), r3 := min(a3, b3)
// return max( x[4], y[4] )
RETf SSE_MAX( const __m128 x, const __m128 y ) { return _mm_max_ps(x,y); } // Computes the maximums of the four single-precision, floating-point values of a and b.
// return 1.f / x[4]
RETf SSE_RCP( const __m128 x ) { return _mm_rcp_ps(x); } // Computes the approximations of reciprocals (inverse value) of the 4 values of a. Return: r0 := recip(a0), r3 := recip(a3)
// return sqrtf( x[4] )
RETf SSE_SQRT( const __m128 x ) { return _mm_sqrt_ps(x); } // Computes the square roots of the four single-precision, floating-point values of a.
// return 1.f / sqrtf(x[4])
RETf SSE_RCPSQRT( const __m128 x ) { return _mm_rsqrt_ps(x); } // Computes the approximations of the reciprocals of the square roots of the 4 floats of a. Return: r0 := recip(sqrt(a0)), r3 := recip(sqrt(a3))

/* logical operators */
// return x[4] & y[4]
RETf SSE_AND( const __m128 x, const __m128 y ) { return _mm_and_ps(x,y); } // Bitwise AND of the 4 values of a and b. Return: r0 := a0 & b0, r3 := a3 & b3
// return x_4 & y_4
RETi SSE_AND( const __m128i x, const __m128i y ) { return _mm_and_si128(x,y); } // Bitwise AND of the 128-bit value in a and the 128-bit value in b. Return: r := a & b
// return ~x[4] & y[4]
RETf SSE_ANDNOT( const __m128 x, const __m128 y ) { return _mm_andnot_ps(x,y); } // Bitwise AND-NOT of the 4 values of a and b. Return: r0 := ~a0 & b0, r3 := ~a3 & b3
// return x[4] | y[4]
RETf SSE_OR( const __m128 x, const __m128 y ) { return _mm_or_ps(x,y); } // Bitwise OR of the 4 values of a and b. Return: r0 := a0 | b0, r3 := a3 | b3
// return x[4] xor y[4]
RETf SSE_XOR( const __m128 x, const __m128 y ) { return _mm_xor_ps(x,y); } // Bitwise EXOR (exclusive-or) of the 4 values of a and b. Return: r0 := a0 ^ b0, r3 := a3 ^ b3

/* comparison operators */
// return x[4] > y[4]
RETf SSE_CMPGT( const __m128 x, const __m128 y ) { return _mm_cmpgt_ps(x,y); } // Greater than. Return: r0 := (a0 > b0) ? 0xffffffff : 0x0, r3 := (a3 > b3) ? 0xffffffff : 0x0
// return x[4] < y[4]
RETf SSE_CMPLT( const __m128 x, const __m128 y ) { return _mm_cmplt_ps(x,y); } // Less than. Return: r0 := (a0 < b0) ? 0xffffffff : 0x0, r3 := (a3 < b3) ? 0xffffffff : 0x0
// return x[4] > y[4] (int)
RETi SSE_CMPGT( const __m128i x, const __m128i y ) { return _mm_cmpgt_epi32(x,y); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. Return: r0 := (a0 > b0) ? 0xffffffff : 0x0, r3 := (a3 > b3) ? 0xffffffff : 0x0
// return x[4] < y[4] (int)
RETi SSE_CMPLT( const __m128i x, const __m128i y ) { return _mm_cmplt_epi32(x,y); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than.

/* conversion operators */
// return float( x[4] )
RETf SSE_CVT( const __m128i x ) { return _mm_cvtepi32_ps(x); } // Converts the four signed 32-bit integer values of a to single-precision, floating-point values. Return: r0 := (float) a0, r3 := (float) a3
// return int( x[4] )
RETi SSE_CVT( const __m128 x ) { return _mm_cvttps_epi32(x); } // r0 := (int) a0, r1 := (int) a1, r2 := (int) a2, r3 := (int) a3

#undef RETf
#undef RETi

代码三：gradient.cpp

说明：计算梯度图像。即给定I，如何计算Ix，Iy。当输入图像满足16字节对齐时，则调用SSE版本，否则使用openCV的Sobel函数来算。所谓的16字节对齐是指矩阵数据的首地址，将它转化为整型，则要求这个整数能被16整除。一般来说，我们new出来的内存一般不满足16字节对齐的条件，但是openCV分配内存时，通过特殊的处理让矩阵的首地址16字节对齐了。在使用SSE命令对矩阵每一行处理时，要求每行数据的首地址都是对齐的，从而要求（1）矩阵首地址是对齐的（2）矩阵的step是16字节的整数倍（对于float类型的矩阵来说，仅要求每行的元素个数是4的整数倍即可）。

#include "sse.h"
#include "cv.h"
using namespace cv;


/* void  yuGradientSSE( const Mat &I, Mat &Gx, Mat &Gy );
 *
 * Calculate the Sobel gradient [-1 0 1] for each pixel.
 * The results are two gradient images (along x axis and
 * y axis respectively) of CV_32FC1 type and the same
 * size as input image.
 *
 * (1) Require input image to be CV_32FC1 type, and the 
 * initial address of all the rows are 16-byte aligned. 
 * Generally, this is easily achieved when we use openCV's
 * Mat::create function to allocate the matrix data and
 * the cols of matrix is integer times of 4.
 * (2) Require Gx & Gy are preallocated and the initial
 * address of all the rows are 16-byte aligned.
 * (3) DO NOTICE that the data alignment will NEVER be
 * a problem if we use openCV to allocate memory storage
 * and make sure the step-length (in bytes) of matrix 
 * is integer times of 16.
 * (4) If we want to "new" a memory storage by ourself
 * and we hope the initial address is 16-byte aligned, 
 * we can use:
    int sz = 100, k;
	char *buf = new char [sz*sizeof(float)+15];
	for( k=0; k<16; k++ )
		if( !(size_t(buf+k)&15) )
			break; // will certainly break when 0<=k<=15
	float *data = (float*)(buf+k);
	assert( !(size_t(data)&15) );
 * This is a dump method but is easy to understand.
 * More professional way can be found from internet.
 * But I prefer to take advantage of openCV:
    int sz = 100; // sz should be times of 4
	Mat data_mat( 1, sz, CV_32FC1 );
	float *data = (float*)data_mat.data;
	assert( !(size_t(data)&15) );
 * The openCV will also deallocate the memory if needed.
 * This is very helpful when we forget to free data sometimes.
 *
 * by: YU Xianguo, 2015/06/27.
*/
void  yuGradientSSE( const Mat &I, Mat &Gx, Mat &Gy )
{
	assert( I.type()==CV_32FC1 && Gx.type()==CV_32FC1 && Gy.type()==CV_32FC1 );
	assert( I.size()==Gx.size() && I.size()==Gy.size() );
	// To ensure the initial address of every row is 16 byte aligned,
	// we require that the initial address of the mat is aligned,
	// meanwhile the step-size of the mat is integer times of 16.
	assert( !(size_t(I.data)&15) && !(I.step%16) );
	assert( !(size_t(Gx.data)&15)  && !(I.step%16) );
	assert( !(size_t(Gy.data)&15) && !(I.step%16) );
	int rows = I.rows, cols = I.cols;

	// The special trick used in P.Dollar's toolbox.	
	//int n1 = ~( size_t(I.data) ) + 1;
	//int n = ( n1 & 15 ) / 4;
	//if( n==0 ) n = 4; // no smaller than 1
	//else if( n>rows-1 ) n = rows - 1;
	// I think n can also be got from:
	//float *p = (float*)I.data;
	//for( n=1; n<rows-2; n++, p++ )
	//	if( !(size_t(p)&15) )
	//		break;
	// With the info of n1 & n, we know that data address will be 16-byte
	// aligned for all column (n), and we won't need I.data to be 16-byte aligned.
	// But employing such info will make the code complex -- difficult to
	// read. So I won't use it here.

	const float *Up, *Down, *Left, *Right; int gap;
	float *Dx, *Dy; int gap1, gap2;
	__m128 *_Up, *_Down, *_Dx, *_Dy;

	// Compute Gx
	Left = (float*)I.data; gap = I.step1()-cols;
	Dx = (float*)Gx.data; gap1 = Gx.step1()-cols;
	int num_sse = ( cols - 4 - 1 ) / 4;
	int res = cols - ( 4 + 4*num_sse );
	for( int r=0; r<rows; r++ ){
		// the first n columns
		*(Dx++) = ( Left[1] - Left[0] ) * 2.f; // first column
		Right = Left + 2;
		for( int k=1; k<4; k++ )
			*(Dx++) = *(Right++) - *(Left++);
		//assert( !(size_t(Dx)&15) ); // check if address is 16-byte aligned
		_Dx = (__m128*)Dx;
		for( int c=0; c<num_sse; c++, Right+=4, Left+=4 )
			*(_Dx++) = SSE_SUB( SSE_LDu(Right), SSE_LDu(Left) ); // address of Left & Right could be non-aligned
		// the last res columns (1<=res<=4)
		Dx = (float*)_Dx;
		for( int k=1; k<res; k++ )
			*(Dx++) = *(Right++) - *(Left++);
		*(Dx++) = ( Left[1] - Left[0] ) * 2.f; // last column
		Left += gap+2, Dx += gap1; // Left = Right + gap
	}

	// Compute Gy
	Up = (float*)I.data;
	Down = Up + 2*I.step1();
	Dy = (float*)Gy.data+Gy.step1(); gap2 = Gy.step1()-cols;
	num_sse = cols / 4;
	res = cols - 4*num_sse;	
	for( int r=2; r<rows; r++ ){
		_Up = (__m128*)Up;
		_Down = (__m128*)Down;
		_Dy = (__m128*)Dy;
		for( int c=0; c<num_sse; c++ )
			*(_Dy++) = SSE_SUB( *(_Down++), *(_Up++) );
		// the last res rows (res<4)
		Up = (float*)_Up;
		Down = (float*)_Down;
		Dy = (float*)_Dy;
		for( int k=0; k<res; k++ )
			*(Dy++) = *(Down++) - *(Up++);
		Up+=gap, Down+=gap, Dy+=gap2;
	}
	// the first row and the last row
	Up = (float*)I.data;
	Down = Up + I.step1();
	Dy = (float*)Gy.data;
	__m128 factor = SSE_SET( 2.f );
	for( int r=0; r<rows; r+=rows-1 ){		
		_Up = (__m128*)Up;
		_Down = (__m128*)Down;
		_Dy = (__m128*)Dy;
		for( int c=0; c<num_sse; c++ )
			*(_Dy++) = SSE_MUL( SSE_SUB( *(_Down++), *(_Up++) ), factor );
		Up = (float*)_Up;
		Down = (float*)Down;
		Dy = (float*)_Dy;
		for( int k=0; k<res; k++ )
			*(Dy++) = ( *(Down++) - *(Up++) ) * 2.f;
		Up = I.ptr<float>(rows-2);
		Down = I.ptr<float>(rows-1);
		Dy = Gy.ptr<float>(rows-1);
	}
	
	return;
}


void  yuGradient( const Mat &I, Mat &Gx, Mat &Gy )
{
	assert( I.type()==CV_32FC1 );
	Gx.create( I.size(), CV_32FC1 );
	Gy.create( I.size(), CV_32FC1 );
	if( !(size_t(I.data)&15) && !(I.step%16) 
		&& !(size_t(Gx.data)&15) && !(Gx.step%16) 
		&& !(size_t(Gy.data)&15) && !(Gy.step%16) )
	{
		yuGradientSSE( I, Gx, Gy );
		return;
	}
	// NOTICE: the Sobel method doesn't produces exactly
	// the same result as SSE method. It is indeed the
	// gradient_sse result combined with a linear blurring method.
	Sobel( I, Gx, CV_32FC1, 1, 0 );
	Sobel( I, Gy, CV_32FC1, 0, 1 );
	return;
}

代码四：magnitude_orientation.cpp

说明：给定图像I的两幅梯度图像Ix,Iy,如何利用SSE快速计算梯度幅值M和梯度方向Theta。这里的代码计算出的Theta与前面不同，不是离散的整数值，而是连续实值，取值在[0,pi)或[0,2*pi)。

#include "sse.h"
#include "cv.h"
using namespace cv;

//-------------------------------------------------------------------------------------------------
void  yuMagOrientSSE( const Mat &Gx, const Mat &Gy, Mat &Magnitude, Mat &Orientation, bool full );
//-------------------------------------------------------------------------------------------------


// build lookup table a[] s.t. a[x*n]~=acos(x) for x in [-1,1]
float * acosTable()
{
	const int n=10000, b=10; 
	int i;
	static float a[n*2+b*2]; 
	static bool init = false;
	float *a1 = a + n + b; 
	if( init ) 
		return a1;
	for( i=-n-b; i<-n; i++ )   
		a1[i] = (float)CV_PI;
	for( i=-n; i<n; i++ )      
		a1[i] = float( acos(i/float(n)) );
	for( i=n; i<n+b; i++ )     
		a1[i] = 0;
	for( i=-n-b; i<n/10; i++ ) 
		if( a1[i] > float(CV_PI)-1e-6f ) 
			a1[i] = (float)CV_PI-1e-6f;
	init = true; 
	return a1;
}


/* Calculate the gradient magnitude of each pixel.
 * The result is a gradient image of CV_32FC1 type
 * and the same size as input.
 * Require preallocated output matrix. if Orientation
 * is empty, then it will not be calculated. Else,
 * orientation is calculated for each pixel using
 * an approximation method (see P.Dollar's toolbox).
 * If full==false, then orientation will be [0,pi).
 * Else, orientation will be [0,2pi).
*/
void  yuMagOrientSSE( const Mat &Gx, const Mat &Gy, Mat &Magnitude, Mat &Orientation, bool full )
{
	assert( Gx.type()==CV_32FC1 && Gx.type()==Gy.type() && Gx.type()==Magnitude.type() );
	assert( Gx.size()==Gy.size() && Gx.size()==Magnitude.size() );
	assert( !(size_t(Gx.data)&15) && !(Gx.step%16) ); // check if address is 16-byte aligned
	assert( !(size_t(Gy.data)&15) && !(Gy.step%16) );
	assert( !(size_t(Magnitude.data)&15) && !(Magnitude.step%16) );
	bool Ont = !Orientation.empty();
	if( Ont ){
		assert( Orientation.type()==CV_32FC1 && Orientation.size()==Gx.size() );
		assert( !(size_t(Orientation.data)&15) && !(Orientation.step%16) );
	}
	int rows = Gx.rows, cols = Gx.cols;
	float *gx = (float*)Gx.data; int gapx = Gx.step1()-cols;
	float *gy = (float*)Gy.data; int gapy = Gy.step1()-cols;
	float *mg = (float*)Magnitude.data; int gapm = Magnitude.step1()-cols;
	float *ot = (float*)Orientation.data; int gapo = Orientation.step1()-cols;
	__m128 *_gx, *_gy, *_mg, *_ot;	

	float *acost = acosTable(), acMult=10000.0f;
	__m128 mult128 = SSE_SET( acMult );
	__m128 eps128 = SSE_SET(0.00001f);
	__m128 zero128 = SSE_SET(0.f);
	__m128 pi128 = SSE_SET((float)CV_PI);

	Mat tmpM( 1, 4, CV_32FC1 );
	assert( !(size_t(tmpM.data)&15) );
	float *tmpD = (float*)tmpM.data;
	__m128 *tmp = (__m128*)(tmpD);

	int cols4 = cols/4;
	int res = cols%4;
	for( int r=0; r<rows; r++ ){
		//assert( gx==Gx.ptr<float>(r) );
		//assert( mg==Magnitude.ptr<float>(r) );
		//if( Ont) assert( ot==Orientation.ptr<float>(r) );
		_gx = (__m128*)gx;
		_gy = (__m128*)gy;
		_mg = (__m128*)mg;		
		for( int c=0; c<cols4; c++ ){
			// mag = sqrt( gx*gx + gy*gy )
			*_mg = SSE_SQRT( SSE_ADD( SSE_MUL(*_gx,*_gx), SSE_MUL(*_gy,*_gy) ) );
			if( Ont ){
				_ot = (__m128*)ot;
				// tmp = acMult * gx / max( mag, eps )
				*tmp = SSE_MUL( mult128, SSE_MUL(*_gx, SSE_RCP(SSE_MAX(*_mg,eps128))) );
				// acos( gx/mag ) == acostable[ gx/mag * acMult ]
				for( int k=0; k<4; k++ )
					*(ot++) = acost[ int(tmpD[k]) ]; // theta in [0,pi]
				if( full ) // orient += (gy<0) & pi
					*_ot = SSE_ADD( *_ot, SSE_AND(SSE_CMPLT(*_gy,zero128),pi128) ); // theta in [0,2*pi]
			}
			_gx++, _gy++, _mg++;
		}
		gx = (float*)_gx;
		gy = (float*)_gy;
		mg = (float*)_mg;
		// the last res cols
		for( int k=0; k<res; k++ ){
			float &gradx = *(gx++);
			float &grady = *(gy++);
			*mg = sqrtf( gradx*gradx + grady*grady );
			if( Ont ){
				float ux = acMult * gradx / max( *mg, 0.00001f );
				*ot = acost[ int(ux) ];
				if( full && grady<0 )
					*ot += (float)CV_PI;
				ot++;
			}
			mg++;
		}
		gx+=gapx, gy+=gapy, mg+=gapm;
		if( Ont) ot+=gapo;		
	}
}


/* Quantize orientations in [0,pi) or [0,2*pi) (float) into equally space orientation bins.
 * The quantization method is from Felzenswalb's (DPM) HOG feature.
 * contrast sensitive (full==true):
 * B1(x,y) = round( p*theta(x,y) / 2pi ) mode p
 * contrast insensitive (full==false):
 * B2(x,y) = round( p*theta(x,y) / pi ) mode p
*/
void  yuQuantizeOrientSSE( const Mat &Orientation, Mat &Quantized, int nOrients, bool full )
{
	assert( nOrients>0 );

	// Need preallocated memory storage
	assert( Orientation.type()==CV_32FC1 );
	assert( !(size_t(Orientation.data)&15) && !(Orientation.step%16) ); // check if address is 16-byte aligned
	
	int rows = Orientation.rows, cols = Orientation.cols;
	Quantized.create( rows, cols, CV_32SC1 );
	assert( !(size_t(Quantized.data)&15) && !(Quantized.step%16) ); // check if address is 16-byte aligned

	float *pOrient = (float*)Orientation.data; int gap1 = Orientation.step1()-cols;
	int *pQuant = (int*)Quantized.data; int gap2 = Quantized.step1()-cols;

	const float mult = (float)( nOrients / (full?2*CV_PI:CV_PI) );
	__m128 mult128 = SSE_SET( mult );
	__m128i maxOrients128 = SSE_SET( nOrients );
	__m128i tmp;

	int cols4 = cols/4;
	int res = cols%4;
	for( int r=0; r<rows; r++ ){
		__m128 *_pOrient = (__m128*)pOrient;
		__m128i *_pQuant = (__m128i*)pQuant;
		for( int c=0; c<cols4; c++ ){
			tmp = SSE_CVT( SSE_MUL( *(_pOrient++), mult128 ) );
			*(_pQuant++) = SSE_AND( tmp, SSE_CMPLT(tmp,maxOrients128) );
		}
		pOrient = (float*)_pOrient;
		pQuant = (int*)_pQuant;
		for( int k=0; k<res; k++ ){
			int a = int( *(pOrient++) * mult );
			*(pQuant++) = a < nOrients ? a : 0;
		}
		pOrient+=gap1, pQuant+=gap2;
	}
}

------------------------------------

以上代码我都编译调试通过，并有过使用，但是不保证里面没有bug，代码贴在这里而不是以上传文件的方式发布到csdn资源，就是希望集思广益，大家有看到里面的错误和不足的地方给我指正，我直接在网页里改，惠及后来人。

如何快速计算图像梯度、幅值以及梯度方向角 -- 使用SSE指令集

猜你喜欢