给定一幅图像I,如何有效地计算图像上每个位置的梯度Ix,Iy,梯度幅值M,方向角Theta:
Ix(x,y) =I(x+1,y) - I(x-1,y)
Iy(x,y) =I(x,y+1) -I(x,y-1)
M(x,y) = sqrt(Ix(x,y)*Ix(x,y) +Iy(x,y)*Iy(x,y) )
Theta(x,y) = atan2(Iy(x,y),Ix(x,y) )
从上面的公式来看,计算起来并不困难,OpenCV里有Sobel函数,可以直接计算出Ix,Iy。这里我们主要讲的是高效率编程,自己编程来解决这个问题。
Intel生产的CPU基本上都支持SSE指令集,这一指令集允许同时对四个单精度浮点数进行运算,比如在一个CPU的clock里完成计算4个float与另外4个float的分别对应相乘。我在之前的博文里公开了我写的DPM目标检测代码,其中用到了SSE,那是我第一次使用SSE进行加速计算,效果颇好。之后有幸看到了Piotr Dollar大神的关于行人检测的公开代码(文章是:The Fastest Pedestrian Detector in the West),里面的mex函数文件大量使用了SSE指令集,让我对SSE编程有了进一步了解。
我以前用OpenCV的Sobel函数来计算Ix,Iy,然后再算M和Theta,一方面效率不够理想,另一方面程序的可控性较差——自己想在计算过程中增加一两个简单运算不太方便。由此我自己写了相关代码,几经修改,形成了几个不同的版本,下面将这些代码一一贴出来并作简单讲解。
------------------------------------
代码一:oriented_gradient.cpp
说明:该cpp中定义了两个版本的yuOrientedGradient函数,该函数的功能是,输入一幅灰度或彩色图像,计算图像上每个像素位置的梯度幅值和方向角。若输入是多通道图像,梯度幅值是取各个颜色通道梯度幅值的最大值。输出的方向角不是实数值,而是离散的整数值,比如指定orientation_bins=9,sensitive=true,则将一个取值在[0,2*pi)的方向角划分到[0,20)(度),[20,40),[40,60),...,[340,360)共18个bin中的一个。这种做法其实是为了后续进一步计算HOG特征服务的。如果将orientation的数据类型改为int型,将orientation_bins设置为360,则可以计算出每个像素位置方向角的角度,精度为1°. 继续提高orientation_bins的值,可以增加方向角的估计精度。
#include "cv.h"
using namespace cv;
// Forward declarations of the two implementations defined below.
// Both take a float (CV_32F) image and write a per-pixel orientation label and a
// per-pixel gradient magnitude; the outputs are 2 rows and 2 cols smaller than img.
// The two versions of function proposed here share the same functionality whereas the V2 is relatively faster.
void yuOrientedGradient( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive );
void yuOrientedGradient_V2( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive );
/* test:
Mat im = imread("0001.jpg");
Mat imF; im.convertTo(imF,CV_32F);
Mat O1, O2, G1, G2;
yuOrientedGradient( imF, O1, G1, 9, true );
yuOrientedGradient_V2( imF, O2, G2, 9, true );
absdiff( O1, O2, O1 );
absdiff( G1, G2, G1 );
double a, b;
minMaxLoc( G1, 0, &a );
minMaxLoc( O1, 0, &b );
cout<<a<<endl<<b<<endl; // a==0, b==0
*/
/*
Calculate the orientation at each pixel.
The calculated orientations are snapped to one of the N bins which are equally spaced in:
[0,180), if sensitive==false, then values of orientation are in [0,orientation_bins-1];
[0,360), if sensitive==true, then values of orientation are in [0,2*orientation_bins-1].
The output orientation (CV_8UC1) & gradient (CV_32FC1) are 2 pixels smaller both in rows
and in cols than input img (multi-channel,float type).
theta = angle( OP ), where O = (0,0), P = (dx,dy)
theta is then snapped to one of nine orientations [0,20) [20,40), ... , [160 180)
How to snap:
e.g. we set the bins as [0,20), [20,40), ..., [160,180),
then for any theta in [0,180), cos(theta-i*20) achieves max when theta is in [i*20,i*20+20)
as: cos(a-b) = cos(a)*cos(b) + sin(a)*sin(b)
so: cos(theta-i*20) = cos(theta)*cos(i*20) + sin(theta)*sin(i*20)
now that: cos(theta) = x/sqrt(x^2+y^2), sin(theta) = y/sqrt(x^2+y^2)
so: x*cos(i*20)+y*sin(i*20) will achieve max when the orientation of (x,y) is in [i*20,i*20+20)
make: uu = [cos(0) cos(20) ... cos(160)]; vv = [sin(0) sin(20) ... sin(160)];
then: x*uu(i)+y*vv(i) achieves max when theta is in [i*20,i*20+20), namely the i-th orientation bin.
by: YU Xianguo, 2015/06/24
*/
void yuOrientedGradient( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive )
{
assert( img.depth()==CV_32F );
assert( img.rows>2 && img.cols>2 );
assert( orientation_bins>1 && orientation_bins<256 );
// create output: only calc for img(Rect(1,1,cols-2,rows-2)).
int rows = img.rows, cols = img.cols, chans = img.channels();
orientation.create( rows-2, cols-2, CV_8UC1 );
gradient.create( rows-2, cols-2, CV_32FC1 );
// calculate gradient for img(Rect(1,1,cols-2,rows-2))
// multi-channel operation
Mat Left = img( Rect(0,1,cols-2,rows-2) );
Mat Right = img( Rect(2,1,cols-2,rows-2) );
Mat Up = img( Rect(1,0,cols-2,rows-2) );
Mat Down = img( Rect(1,2,cols-2,rows-2) );
Mat _Dx = Right - Left;
Mat _Dy = Down - Up;
Mat Dx, Dy;
if( chans==1 ){
Dx = _Dx;
Dy = _Dy;
gradient = 0;
accumulateSquare( Dx, gradient );
accumulateSquare( Dy, gradient );
}
else{
rows = _Dx.rows, cols = _Dx.cols;
// for each element in Dx & Dy: <dx0,dx1,dx2> & <dy0,dy1,dy2>
// calculate the square sum: <d0,d1,d2>, where d = dx^2 + dy^2
// select d(i) = max(d0,d1,d2)
// then set the corresponding value of DDx by dx(i), set DDy by dy(i)
Dx.create( gradient.size(), gradient.type() );
Dy.create( gradient.size(), gradient.type() );
float *a = (float*)_Dx.data;
float *b = (float*)_Dy.data;
float *c = (float*)Dx.data;
float *d = (float*)Dy.data;
float *g = (float*)gradient.data;
int pg = gradient.step1()-cols;
int x, y, chn;
float dv, mdx, mdy, mdv;
for( y=0; y++<rows; g+=pg ){
for( x=0; x++<cols; ){
for( mdv=-1, chn=0; chn++<chans; ){
float &dx = *(a++);
float &dy = *(b++);
dv = dx*dx + dy*dy;
if( mdv<dv ){
mdv = dv;
mdx = dx;
mdy = dy;
}
}
*(c++) = mdx;
*(d++) = mdy;
*(g++) = mdv; // gradient = Dx.^2 + Dy.^2
}
}
}
// construct orientation snaps
vector<double> uu(orientation_bins);
vector<double> vv(orientation_bins);
double bin_span = CV_PI / orientation_bins;
for( int k=0; k<orientation_bins; k++ ){
double theta = k * bin_span;
uu[k] = cos( theta );
vv[k] = sin( theta );
}
// val = DDx * uu[0] + DDy * vv[0] = DDx
Mat val, maxval, bw;
Dx.copyTo( val ); // Dx.convertTo( val, CV_32FC1 );
maxval = abs( val );
orientation = 0;
if( !sensitive ){
for( int i=1; i<orientation_bins; i++ ){
val = Dx*uu[i] + Dy*vv[i]; //addWeighted( Dx, uu[i], Dy, vv[i], 0, val, CV_32FC1 );
val = abs(val);
bw = maxval < val;
if( i<orientation_bins-1 )
val.copyTo( maxval, bw );
orientation.setTo(i,bw);
}
}
else{
bw = val < 0;
orientation.setTo( orientation_bins, bw );
for( int i=1; i<orientation_bins; i++ ){
val = Dx*uu[i] + Dy*vv[i]; //addWeighted( Dx, uu[i], Dy, vv[i], 0, val, CV_32FC1 );
bw = maxval < val;
if( i<orientation_bins-1 )
val.copyTo( maxval, bw );
orientation.setTo( i, bw );
val = -val;
bw = maxval < val;
if( i<orientation_bins-1 )
val.copyTo( maxval, bw );
orientation.setTo( i+orientation_bins, bw );
}
}
cv::sqrt( gradient, gradient );
return;
}
void yuOrientedGradient_V2( const Mat &img, Mat &orientation, Mat &gradient, int orientation_bins, bool sensitive )
{
typedef uchar T; // data type of orientation
const int orientation_type = CV_8UC1; // must accord with T
assert( img.depth()==CV_32F );
assert( img.rows>2 && img.cols>2 );
assert( orientation_bins>1 && orientation_bins<256 ); // cause we use uchar type orientation
int rows = img.rows, cols = img.cols, channels = img.channels();
int result_rows = rows - 2, result_cols = cols - 2;
orientation.create( result_rows, result_cols, orientation_type );
gradient.create( result_rows, result_cols, CV_32FC1 );
// construct orientation snaps
vector<float> uu(orientation_bins);
vector<float> vv(orientation_bins);
float bin_span = float(CV_PI) / orientation_bins;
for( int k=0; k<orientation_bins; k++ ){
float theta = k * bin_span;
uu[k] = cosf( theta );
vv[k] = sinf( theta );
}
T *orient = (T*)orientation.data; int gap1=orientation.step1()-result_cols;
float *grad = (float*)gradient.data; int gap2=gradient.step1()-result_cols;
// orientation(y,x) is from: img(y,x),img(y+2,x),img(y,x+2),img(y+2,x+2)
const float *Up = (float*)img.data + channels;
const float *Down = Up + img.step1()*2;
const float *Left = (float*)img.data + img.step1();
const float *Right = Left + 2*channels;
int gap = img.step1() - result_cols*channels;
float dx, dy, dv, mdx, mdy, mdv;
for( int y=0; y<result_rows; y++ ){
for( int x=0; x<result_cols; x++ ){
mdx = *(Right++) - *(Left++);
mdy = *(Down++) - *(Up++);
mdv = mdx*mdx + mdy*mdy;
for( int c=1; c<channels; c++ ){
dx = *(Right++) - *(Left++);
dy = *(Down++) - *(Up++);
dv = dx*dx + dy*dy;
if( mdv<dv ){
mdv = dv;
mdx = dx;
mdy = dy;
}
}
// snap to one orientation bin
float maxVal = mdx < 0 ? -mdx : mdx; // uu[0]*mdx + vv[0]*mdy == mdx
int maxOrient = 0;
if( sensitive ){
if( mdx<0 ) maxOrient += orientation_bins;
for( int k=1; k<orientation_bins; k++ ){
float val = uu[k]*mdx + vv[k]*mdy;
if( maxVal<val ){
maxVal = val;
maxOrient = k;
}
else if( maxVal<-val ){
maxVal = -val;
maxOrient = k + orientation_bins;
}
}
}
else{
for( int k=1; k<orientation_bins; k++ ){
float val = uu[k]*mdx + vv[k]*mdy;
if( val<0 ) val = -val;
if( maxVal<val ){
maxVal = val;
maxOrient = k;
}
}
}
*(orient++) = maxOrient;
*(grad++) = mdv;
}
Up+=gap, Down+=gap, Left+=gap, Right+=gap;
orient+=gap1, grad+=gap2;
}
cv::sqrt( gradient, gradient );
return;
}
代码二:sse.h
说明:将常用sse指令打包放入一个头文件中,方便使用。这个头文件是从P'Dollar的工具箱中拿出来的,其中注释部分是我写的,一些函数的形式被我修改了,另外我还加入了若干个函数。这个头文件极大地方便了sse编程,对我用处颇大。
/*******************************************************************************
* Piotr's Image&Video Toolbox Version 3.23
* Copyright 2013 Piotr Dollar & Ron Appel. [pdollar-at-caltech.edu]
* Please email me if you find bugs, or have suggestions or questions!
* Licensed under the Simplified BSD License [see external/bsd.txt]
*******************************************************************************/
/* The interpretations are written by YU Xianguo, 2015/06/26.
* Notification:
* The defined functions accords with the form: __m128(i) fun( dst, src ); OR: __m128(i) fun( src1, src2 );
* In my interpretation, x[4] means x is a float* (or int*), or it is a __m128 (or __m128i), and the 4 values
* of x are treated independently. x_4 means x is a __m128 (or __m128i), and it is treated as a single 128-bit variable.
*
* Besides the comments, some functions' parameters form are changed:
* if the input parameter is a float[4], in original form, it is formed as float&, here I changed it to float*.
*
* I also add some new functions by checking the SSE instructions presented in online MSDN:
* https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
*/
#pragma once
#include <xmmintrin.h>
#include <emmintrin.h> // SSE2:<e*.h>, SSE3:<p*.h>, SSE4:<s*.h>
// Thin inline wrappers around SSE/SSE2 intrinsics.
// Notation: r0 is the lowest lane of a register, r3 the highest; x[4] denotes four
// independent 32-bit lanes.

/* ---------------- set, load and store values ---------------- */

// All four float lanes set to zero.
inline __m128 SSE_ZERO()
{
    return _mm_setzero_ps();
}

// Broadcast one float to all four lanes: r0 = r1 = r2 = r3 = x.
inline __m128 SSE_SET( const float &x )
{
    return _mm_set1_ps( x );
}

// Four different floats. NOTE the lane order of _mm_set_ps: the LAST argument ends up
// in the lowest lane, i.e. r0 = w, r1 = z, r2 = y, r3 = x.
inline __m128 SSE_SET( float x, float y, float z, float w )
{
    return _mm_set_ps( x, y, z, w );
}

// Broadcast one signed 32-bit integer to all four lanes.
inline __m128i SSE_SET( const int &x )
{
    return _mm_set1_epi32( x );
}

// Load x[0..3]; the address must be 16-byte aligned.
inline __m128 SSE_LD( const float *x )
{
    return _mm_load_ps( x );
}

// Load x[0..3]; the address does not need to be aligned.
inline __m128 SSE_LDu( const float *x )
{
    return _mm_loadu_ps( x );
}

// x[0..3] = y; the address must be 16-byte aligned. Returns y.
inline __m128 SSE_STR( float *x, const __m128 y )
{
    _mm_store_ps( x, y );
    return y;
}

// x[0] = lowest lane of y. Returns y.
inline __m128 SSE_STR1( float *x, const __m128 y )
{
    _mm_store_ss( x, y );
    return y;
}

// x[0..3] = y; the address does not need to be aligned. Returns y.
inline __m128 SSE_STRu( float *x, const __m128 y )
{
    _mm_storeu_ps( x, y );
    return y;
}

// x[0..3] = scalar y (aligned store). Returns the broadcast register.
inline __m128 SSE_STR( float *x, const float y )
{
    return SSE_STR( x, SSE_SET(y) );
}

/* ---------------- arithmetic operators ---------------- */

// Lane-wise 32-bit integer x + y.
inline __m128i SSE_ADD( const __m128i x, const __m128i y )
{
    return _mm_add_epi32( x, y );
}

// Lane-wise float x + y.
inline __m128 SSE_ADD( const __m128 x, const __m128 y )
{
    return _mm_add_ps( x, y );
}

// Lane-wise float x + y + z.
inline __m128 SSE_ADD( const __m128 x, const __m128 y, const __m128 z )
{
    return SSE_ADD( SSE_ADD(x,y), z );
}

// Lane-wise float a + b + c + d.
inline __m128 SSE_ADD( const __m128 a, const __m128 b, const __m128 c, const __m128 &d )
{
    return SSE_ADD( SSE_ADD( SSE_ADD(a,b), c ), d );
}

// Lane-wise float x - y.
inline __m128 SSE_SUB( const __m128 x, const __m128 y )
{
    return _mm_sub_ps( x, y );
}

// Lane-wise float x * y.
inline __m128 SSE_MUL( const __m128 x, const __m128 y )
{
    return _mm_mul_ps( x, y );
}

// Every lane of x multiplied by the scalar y.
inline __m128 SSE_MUL( const __m128 x, const float y )
{
    return SSE_MUL( x, SSE_SET(y) );
}

// Scalar x multiplied with every lane of y.
inline __m128 SSE_MUL( const float x, const __m128 y )
{
    return SSE_MUL( SSE_SET(x), y );
}

// In-place x += y on a register. Returns the new x.
inline __m128 SSE_INC( __m128 &x, const __m128 y )
{
    return x = SSE_ADD( x, y );
}

// In-place x[0..3] += y on 16-byte aligned memory. Returns the stored value.
inline __m128 SSE_INC( float *x, const __m128 y )
{
    __m128 t = SSE_ADD( SSE_LD(x), y );
    return SSE_STR( x, t );
}

// In-place x -= y on a register. Returns the new x.
inline __m128 SSE_DEC( __m128 &x, const __m128 y )
{
    return x = SSE_SUB( x, y );
}

// In-place x[0..3] -= y on 16-byte aligned memory. Returns the stored value.
inline __m128 SSE_DEC( float *x, const __m128 y )
{
    __m128 t = SSE_SUB( SSE_LD(x), y );
    return SSE_STR( x, t );
}

// Lane-wise min( x, y ).
inline __m128 SSE_MIN( const __m128 x, const __m128 y )
{
    return _mm_min_ps( x, y );
}

// Lane-wise max( x, y ).
inline __m128 SSE_MAX( const __m128 x, const __m128 y )
{
    return _mm_max_ps( x, y );
}

// Lane-wise APPROXIMATE 1.f / x.
inline __m128 SSE_RCP( const __m128 x )
{
    return _mm_rcp_ps( x );
}

// Lane-wise sqrtf( x ).
inline __m128 SSE_SQRT( const __m128 x )
{
    return _mm_sqrt_ps( x );
}

// Lane-wise APPROXIMATE 1.f / sqrtf( x ).
inline __m128 SSE_RCPSQRT( const __m128 x )
{
    return _mm_rsqrt_ps( x );
}

/* ---------------- logical operators (bitwise) ---------------- */

// x & y on float registers.
inline __m128 SSE_AND( const __m128 x, const __m128 y )
{
    return _mm_and_ps( x, y );
}

// x & y on the whole 128-bit integer register.
inline __m128i SSE_AND( const __m128i x, const __m128i y )
{
    return _mm_and_si128( x, y );
}

// (~x) & y on float registers.
inline __m128 SSE_ANDNOT( const __m128 x, const __m128 y )
{
    return _mm_andnot_ps( x, y );
}

// x | y on float registers.
inline __m128 SSE_OR( const __m128 x, const __m128 y )
{
    return _mm_or_ps( x, y );
}

// x ^ y on float registers.
inline __m128 SSE_XOR( const __m128 x, const __m128 y )
{
    return _mm_xor_ps( x, y );
}

/* ---------------- comparison operators ----------------
 * Each lane of the result is 0xffffffff when the comparison holds and 0x0 otherwise. */

// Lane-wise float x > y.
inline __m128 SSE_CMPGT( const __m128 x, const __m128 y )
{
    return _mm_cmpgt_ps( x, y );
}

// Lane-wise float x < y.
inline __m128 SSE_CMPLT( const __m128 x, const __m128 y )
{
    return _mm_cmplt_ps( x, y );
}

// Lane-wise signed 32-bit integer x > y.
inline __m128i SSE_CMPGT( const __m128i x, const __m128i y )
{
    return _mm_cmpgt_epi32( x, y );
}

// Lane-wise signed 32-bit integer x < y.
inline __m128i SSE_CMPLT( const __m128i x, const __m128i y )
{
    return _mm_cmplt_epi32( x, y );
}

/* ---------------- conversion operators ---------------- */

// Lane-wise (float) x for signed 32-bit integers.
inline __m128 SSE_CVT( const __m128i x )
{
    return _mm_cvtepi32_ps( x );
}

// Lane-wise (int) x for floats, truncating toward zero.
inline __m128i SSE_CVT( const __m128 x )
{
    return _mm_cvttps_epi32( x );
}
代码三:gradient.cpp
说明:计算梯度图像。即给定I,如何计算Ix,Iy。当输入图像满足16字节对齐时,则调用SSE版本,否则使用openCV的Sobel函数来算。所谓的16字节对齐是指矩阵数据的首地址,将它转化为整型,则要求这个整数能被16整除。一般来说,我们new出来的内存一般不满足16字节对齐的条件,但是openCV分配内存时,通过特殊的处理让矩阵的首地址16字节对齐了。在使用SSE命令对矩阵每一行处理时,要求每行数据的首地址都是对齐的,从而要求(1)矩阵首地址是对齐的(2)矩阵的step是16字节的整数倍(对于float类型的矩阵来说,仅要求每行的元素个数是4的整数倍即可)。
#include "sse.h"
#include "cv.h"
using namespace cv;
/* void yuGradientSSE( const Mat &I, Mat &Gx, Mat &Gy );
*
* Calculate the Sobel gradient [-1 0 1] for each pixel.
* The results are two gradient images (along x axis and
* y axis respectively) of CV_32FC1 type and the same
* size as input image.
*
* (1) Require input image to be CV_32FC1 type, and the
* initial address of all the rows are 16-byte aligned.
* Generally, this is easily achieved when we use openCV's
* Mat::create function to allocate the matrix data and
* the cols of matrix is integer times of 4.
* (2) Require Gx & Gy are preallocated and the initial
* address of all the rows are 16-byte aligned.
* (3) DO NOTICE that the data alignment will NEVER be
* a problem if we use openCV to allocate memory storage
* and make sure the step-length (in bytes) of matrix
* is integer times of 16.
* (4) If we want to "new" a memory storage by ourself
* and we hope the initial address is 16-byte aligned,
* we can use:
int sz = 100, k;
char *buf = new char [sz*sizeof(float)+15];
for( k=0; k<16; k++ )
if( !(size_t(buf+k)&15) )
break; // will certainly break when 0<=k<=15
float *data = (float*)(buf+k);
assert( !(size_t(data)&15) );
* This is a dumb method but is easy to understand.
* More professional way can be found from internet.
* But I prefer to take advantage of openCV:
int sz = 100; // sz should be times of 4
Mat data_mat( 1, sz, CV_32FC1 );
float *data = (float*)data_mat.data;
assert( !(size_t(data)&15) );
* The openCV will also deallocate the memory if needed.
* This is very helpful when we forget to free data sometimes.
*
* by: YU Xianguo, 2015/06/27.
*/
void yuGradientSSE( const Mat &I, Mat &Gx, Mat &Gy )
{
assert( I.type()==CV_32FC1 && Gx.type()==CV_32FC1 && Gy.type()==CV_32FC1 );
assert( I.size()==Gx.size() && I.size()==Gy.size() );
// To ensure the initial address of every row is 16 byte aligned,
// we require that the initial address of the mat is aligned,
// meanwhile the step-size of the mat is integer times of 16.
assert( !(size_t(I.data)&15) && !(I.step%16) );
assert( !(size_t(Gx.data)&15) && !(I.step%16) );
assert( !(size_t(Gy.data)&15) && !(I.step%16) );
int rows = I.rows, cols = I.cols;
// The special trick used in P.Dollar's toolbox.
//int n1 = ~( size_t(I.data) ) + 1;
//int n = ( n1 & 15 ) / 4;
//if( n==0 ) n = 4; // no smaller than 1
//else if( n>rows-1 ) n = rows - 1;
// I think n can also be got from:
//float *p = (float*)I.data;
//for( n=1; n<rows-2; n++, p++ )
// if( !(size_t(p)&15) )
// break;
// With the info of n1 & n, we know that data address will be 16-byte
// aligned for all column (n), and we won't need I.data to be 16-byte aligned.
// But employing such info will make the code complex -- difficult to
// read. So I won't use it here.
const float *Up, *Down, *Left, *Right; int gap;
float *Dx, *Dy; int gap1, gap2;
__m128 *_Up, *_Down, *_Dx, *_Dy;
// Compute Gx
Left = (float*)I.data; gap = I.step1()-cols;
Dx = (float*)Gx.data; gap1 = Gx.step1()-cols;
int num_sse = ( cols - 4 - 1 ) / 4;
int res = cols - ( 4 + 4*num_sse );
for( int r=0; r<rows; r++ ){
// the first n columns
*(Dx++) = ( Left[1] - Left[0] ) * 2.f; // first column
Right = Left + 2;
for( int k=1; k<4; k++ )
*(Dx++) = *(Right++) - *(Left++);
//assert( !(size_t(Dx)&15) ); // check if address is 16-byte aligned
_Dx = (__m128*)Dx;
for( int c=0; c<num_sse; c++, Right+=4, Left+=4 )
*(_Dx++) = SSE_SUB( SSE_LDu(Right), SSE_LDu(Left) ); // address of Left & Right could be non-aligned
// the last res columns (1<=res<=4)
Dx = (float*)_Dx;
for( int k=1; k<res; k++ )
*(Dx++) = *(Right++) - *(Left++);
*(Dx++) = ( Left[1] - Left[0] ) * 2.f; // last column
Left += gap+2, Dx += gap1; // Left = Right + gap
}
// Compute Gy
Up = (float*)I.data;
Down = Up + 2*I.step1();
Dy = (float*)Gy.data+Gy.step1(); gap2 = Gy.step1()-cols;
num_sse = cols / 4;
res = cols - 4*num_sse;
for( int r=2; r<rows; r++ ){
_Up = (__m128*)Up;
_Down = (__m128*)Down;
_Dy = (__m128*)Dy;
for( int c=0; c<num_sse; c++ )
*(_Dy++) = SSE_SUB( *(_Down++), *(_Up++) );
// the last res rows (res<4)
Up = (float*)_Up;
Down = (float*)_Down;
Dy = (float*)_Dy;
for( int k=0; k<res; k++ )
*(Dy++) = *(Down++) - *(Up++);
Up+=gap, Down+=gap, Dy+=gap2;
}
// the first row and the last row
Up = (float*)I.data;
Down = Up + I.step1();
Dy = (float*)Gy.data;
__m128 factor = SSE_SET( 2.f );
for( int r=0; r<rows; r+=rows-1 ){
_Up = (__m128*)Up;
_Down = (__m128*)Down;
_Dy = (__m128*)Dy;
for( int c=0; c<num_sse; c++ )
*(_Dy++) = SSE_MUL( SSE_SUB( *(_Down++), *(_Up++) ), factor );
Up = (float*)_Up;
Down = (float*)Down;
Dy = (float*)_Dy;
for( int k=0; k<res; k++ )
*(Dy++) = ( *(Down++) - *(Up++) ) * 2.f;
Up = I.ptr<float>(rows-2);
Down = I.ptr<float>(rows-1);
Dy = Gy.ptr<float>(rows-1);
}
return;
}
void yuGradient( const Mat &I, Mat &Gx, Mat &Gy )
{
assert( I.type()==CV_32FC1 );
Gx.create( I.size(), CV_32FC1 );
Gy.create( I.size(), CV_32FC1 );
if( !(size_t(I.data)&15) && !(I.step%16)
&& !(size_t(Gx.data)&15) && !(Gx.step%16)
&& !(size_t(Gy.data)&15) && !(Gy.step%16) )
{
yuGradientSSE( I, Gx, Gy );
return;
}
// NOTICE: the Sobel method doesn't produces exactly
// the same result as SSE method. It is indeed the
// gradient_sse result combined with a linear blurring method.
Sobel( I, Gx, CV_32FC1, 1, 0 );
Sobel( I, Gy, CV_32FC1, 0, 1 );
return;
}
代码四:magnitude_orientation.cpp
说明:给定图像I的两幅梯度图像Ix,Iy,如何利用SSE快速计算梯度幅值M和梯度方向Theta。这里的代码计算出的Theta与前面不同,不是离散的整数值,而是连续实值,取值在[0,pi)或[0,2*pi)。
#include "sse.h"
#include "cv.h"
using namespace cv;
//-------------------------------------------------------------------------------------------------
// Forward declaration (defined below): writes the per-pixel gradient magnitude of (Gx,Gy)
// into Magnitude and, when Orientation is not empty, the per-pixel angle into Orientation.
void yuMagOrientSSE( const Mat &Gx, const Mat &Gy, Mat &Magnitude, Mat &Orientation, bool full );
//-------------------------------------------------------------------------------------------------
// build lookup table a[] s.t. a[x*n]~=acos(x) for x in [-1,1]
// Returns a pointer to the centre of a lazily built acos lookup table.
// With n = 10000 the valid index range is [-n-b, n+b) and table[i] ~= acos(i/n):
//   - b = 10 guard entries on each side absorb small overshoots of |x| > 1
//     (pi on the negative side, 0 on the positive side);
//   - every entry is capped at pi - 1e-6 so the result stays strictly below pi.
// The table is built on the first call only (plain static flag, no locking).
float * acosTable()
{
    const int n = 10000, b = 10;
    static float storage[n*2+b*2];
    static bool ready = false;
    float * const table = storage + n + b; // table[0] corresponds to x == 0
    if( !ready ){
        int i = -n-b;
        // left guard band: acos of anything <= -1
        while( i < -n )
            table[i++] = (float)CV_PI;
        // regular samples of acos on [-1,1)
        for( ; i<n; i++ )
            table[i] = float( acos(i/float(n)) );
        // right guard band: acos of anything >= 1
        for( ; i<n+b; i++ )
            table[i] = 0;
        // keep every value strictly below pi (only small indices can reach it)
        for( i=-n-b; i<n/10; i++ ){
            if( table[i] > float(CV_PI)-1e-6f )
                table[i] = (float)CV_PI-1e-6f;
        }
        ready = true;
    }
    return table;
}
/* Calculate the gradient magnitude of each pixel.
* The result is a gradient image of CV_32FC1 type
* and the same size as input.
* Require preallocated output matrix. if Orientation
* is empty, then it will not be calculated. Else,
* orientation is calculated for each pixel using
* an approximation method (see P.Dollar's toolbox).
* If full==false, then orientation will be [0,pi).
* Else, orientation will be [0,2pi).
*/
void yuMagOrientSSE( const Mat &Gx, const Mat &Gy, Mat &Magnitude, Mat &Orientation, bool full )
{
assert( Gx.type()==CV_32FC1 && Gx.type()==Gy.type() && Gx.type()==Magnitude.type() );
assert( Gx.size()==Gy.size() && Gx.size()==Magnitude.size() );
assert( !(size_t(Gx.data)&15) && !(Gx.step%16) ); // check if address is 16-byte aligned
assert( !(size_t(Gy.data)&15) && !(Gy.step%16) );
assert( !(size_t(Magnitude.data)&15) && !(Magnitude.step%16) );
bool Ont = !Orientation.empty();
if( Ont ){
assert( Orientation.type()==CV_32FC1 && Orientation.size()==Gx.size() );
assert( !(size_t(Orientation.data)&15) && !(Orientation.step%16) );
}
int rows = Gx.rows, cols = Gx.cols;
float *gx = (float*)Gx.data; int gapx = Gx.step1()-cols;
float *gy = (float*)Gy.data; int gapy = Gy.step1()-cols;
float *mg = (float*)Magnitude.data; int gapm = Magnitude.step1()-cols;
float *ot = (float*)Orientation.data; int gapo = Orientation.step1()-cols;
__m128 *_gx, *_gy, *_mg, *_ot;
float *acost = acosTable(), acMult=10000.0f;
__m128 mult128 = SSE_SET( acMult );
__m128 eps128 = SSE_SET(0.00001f);
__m128 zero128 = SSE_SET(0.f);
__m128 pi128 = SSE_SET((float)CV_PI);
Mat tmpM( 1, 4, CV_32FC1 );
assert( !(size_t(tmpM.data)&15) );
float *tmpD = (float*)tmpM.data;
__m128 *tmp = (__m128*)(tmpD);
int cols4 = cols/4;
int res = cols%4;
for( int r=0; r<rows; r++ ){
//assert( gx==Gx.ptr<float>(r) );
//assert( mg==Magnitude.ptr<float>(r) );
//if( Ont) assert( ot==Orientation.ptr<float>(r) );
_gx = (__m128*)gx;
_gy = (__m128*)gy;
_mg = (__m128*)mg;
for( int c=0; c<cols4; c++ ){
// mag = sqrt( gx*gx + gy*gy )
*_mg = SSE_SQRT( SSE_ADD( SSE_MUL(*_gx,*_gx), SSE_MUL(*_gy,*_gy) ) );
if( Ont ){
_ot = (__m128*)ot;
// tmp = acMult * gx / max( mag, eps )
*tmp = SSE_MUL( mult128, SSE_MUL(*_gx, SSE_RCP(SSE_MAX(*_mg,eps128))) );
// acos( gx/mag ) == acostable[ gx/mag * acMult ]
for( int k=0; k<4; k++ )
*(ot++) = acost[ int(tmpD[k]) ]; // theta in [0,pi]
if( full ) // orient += (gy<0) & pi
*_ot = SSE_ADD( *_ot, SSE_AND(SSE_CMPLT(*_gy,zero128),pi128) ); // theta in [0,2*pi]
}
_gx++, _gy++, _mg++;
}
gx = (float*)_gx;
gy = (float*)_gy;
mg = (float*)_mg;
// the last res cols
for( int k=0; k<res; k++ ){
float &gradx = *(gx++);
float &grady = *(gy++);
*mg = sqrtf( gradx*gradx + grady*grady );
if( Ont ){
float ux = acMult * gradx / max( *mg, 0.00001f );
*ot = acost[ int(ux) ];
if( full && grady<0 )
*ot += (float)CV_PI;
ot++;
}
mg++;
}
gx+=gapx, gy+=gapy, mg+=gapm;
if( Ont) ot+=gapo;
}
}
/* Quantize orientations in [0,pi) or [0,2*pi) (float) into equally spaced orientation bins.
* The quantization idea is from Felzenszwalb's (DPM) HOG feature. Note that this
* implementation truncates toward zero instead of rounding. With p = nOrients:
* contrast sensitive (full==true):
* B1(x,y) = int( p*theta(x,y) / 2pi ), and a result >= p is mapped to 0
* contrast insensitive (full==false):
* B2(x,y) = int( p*theta(x,y) / pi ), and a result >= p is mapped to 0
*/
void yuQuantizeOrientSSE( const Mat &Orientation, Mat &Quantized, int nOrients, bool full )
{
assert( nOrients>0 );
// Need preallocated memory storage
assert( Orientation.type()==CV_32FC1 );
assert( !(size_t(Orientation.data)&15) && !(Orientation.step%16) ); // check if address is 16-byte aligned
int rows = Orientation.rows, cols = Orientation.cols;
Quantized.create( rows, cols, CV_32SC1 );
assert( !(size_t(Quantized.data)&15) && !(Quantized.step%16) ); // check if address is 16-byte aligned
float *pOrient = (float*)Orientation.data; int gap1 = Orientation.step1()-cols;
int *pQuant = (int*)Quantized.data; int gap2 = Quantized.step1()-cols;
const float mult = (float)( nOrients / (full?2*CV_PI:CV_PI) );
__m128 mult128 = SSE_SET( mult );
__m128i maxOrients128 = SSE_SET( nOrients );
__m128i tmp;
int cols4 = cols/4;
int res = cols%4;
for( int r=0; r<rows; r++ ){
__m128 *_pOrient = (__m128*)pOrient;
__m128i *_pQuant = (__m128i*)pQuant;
for( int c=0; c<cols4; c++ ){
tmp = SSE_CVT( SSE_MUL( *(_pOrient++), mult128 ) );
*(_pQuant++) = SSE_AND( tmp, SSE_CMPLT(tmp,maxOrients128) );
}
pOrient = (float*)_pOrient;
pQuant = (int*)_pQuant;
for( int k=0; k<res; k++ ){
int a = int( *(pOrient++) * mult );
*(pQuant++) = a < nOrients ? a : 0;
}
pOrient+=gap1, pQuant+=gap2;
}
}
------------------------------------
以上代码我都编译调试通过,并有过使用,但是不保证里面没有bug,代码贴在这里而不是以上传文件的方式发布到csdn资源,就是希望集思广益,大家有看到里面的错误和不足的地方给我指正,我直接在网页里改,惠及后来人。