SSE2 指令

数据类型

__m128 : 四位 float
__m128d : 两位 double

双精度浮点指令

数学运算

__m128d _mm_add_sd(__m128d a, __m128d b);
__m128d _mm_sub_sd(__m128d a, __m128d b);
__m128d _mm_mul_sd(__m128d a, __m128d b);
__m128d _mm_div_sd(__m128d a, __m128d b);
低位双精度浮点数加法 减法 乘法 除法(高位直接取自 a)
result = [ a0 op b0 , a1 ] (op 分别为 + - * /)

__m128d _mm_add_pd(__m128d a, __m128d b);
__m128d _mm_sub_pd(__m128d a, __m128d b);
__m128d _mm_mul_pd(__m128d a, __m128d b);
__m128d _mm_div_pd(__m128d a, __m128d b);
两位双精度浮点数加法 减法 乘法 除法
result = [ a0 op b0 , a1 op b1 ] (op 分别为 + - * /)

__m128d _mm_sqrt_sd(__m128d a, __m128d b);
低位双精度浮点数开平方(对 b 的低位开方,高位取自 a)
result = [ sqrt(b0) , a1 ]

__m128d _mm_sqrt_pd(__m128d a);
两位双精度浮点数开平方
result = [ sqrt(a0) , sqrt(a1) ]

__m128d _mm_min_sd(__m128d a, __m128d b);
低位双精度浮点数最小值(高位取自 a)
result = [ min(a0,b0) , a1 ]

__m128d _mm_min_pd(__m128d a, __m128d b);
两位双精度浮点数最小值
result = [ min(a0,b0) , min(a1,b1) ]

__m128d _mm_max_sd(__m128d a, __m128d b);
低位双精度浮点数最大值(高位取自 a)
result = [ max(a0,b0) , a1 ]

__m128d _mm_max_pd(__m128d a, __m128d b);
两位双精度浮点数最大值
result = [ max(a0,b0) , max(a1,b1) ]

逻辑指令

__m128d _mm_and_pd(__m128d a, __m128d b);
两位双精度浮点数按位与
result = [ a0&b0 , a1&b1 ]

__m128d _mm_andnot_pd(__m128d a, __m128d b);
两位双精度浮点数按位与 b & (~a)
result = [ (~a0)&b0 , (~a1)&b1 ]

__m128d _mm_or_pd(__m128d a, __m128d b);
两位双精度浮点数按位或
result = [ a0|b0 , a1|b1 ]

__m128d _mm_xor_pd(__m128d a, __m128d b);
两位双精度浮点数按位异或
result = [ a0^b0 , a1^b1 ]

比较运算

__m128d _mm_cmpeq_pd(__m128d a, __m128d b);
比较两位双精度浮点数是否一样
result = [ (a0 == b0)? 0xffffffffffffffff : 0x0 ,(a1 == b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmplt_pd(__m128d a, __m128d b);
比较两位双精度浮点数 a < b
result = [ (a0 < b0)? 0xffffffffffffffff : 0x0 ,(a1 < b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmple_pd(__m128d a, __m128d b);
比较两位双精度浮点数 a<=b
result = [ (a0 <= b0)? 0xffffffffffffffff : 0x0 ,(a1 <= b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmpgt_pd(__m128d a, __m128d b);
比较两位双精度浮点数 a>b
result = [ (a0 > b0)? 0xffffffffffffffff : 0x0 ,(a1 > b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmpge_pd(__m128d a, __m128d b);
比较两位双精度浮点数 a>=b
result = [ (a0 >= b0)? 0xffffffffffffffff : 0x0 ,(a1 >= b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmpneq_pd( __m128d a, __m128d b);
比较两位双精度浮点数 a!=b
result = [ (a0 != b0)? 0xffffffffffffffff : 0x0 ,(a1!= b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
比较两位双精度浮点数 !(a < b)
result = [ !(a0 < b0)? 0xffffffffffffffff : 0x0 , !(a1 < b1)? 0xffffffffffffffff : 0x0 ]

__m128d _mm_cmpnle_pd(__m128d a, __m128d b);
__m128d _mm_cmpngt_pd(__m128d a, __m128d b);
__m128d _mm_cmpnge_pd(__m128d a, __m128d b);
比较两位双精度浮点数 !(a <= b) !(a>b) !(a>=b)

__m128d _mm_cmpeq_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位
result = [ (a0 == b0) ? 0xffffffffffffffff : 0x0 , a1 ]

__m128d _mm_cmple_sd(__m128d a, __m128d b);
__m128d _mm_cmpgt_sd(__m128d a, __m128d b);
__m128d _mm_cmpge_sd(__m128d a, __m128d b);
__m128d _mm_cmpneq_sd(__m128d a, __m128d b);
__m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
__m128d _mm_cmpnle_sd(__m128d a, __m128d b);
__m128d _mm_cmpngt_sd(__m128d a, __m128d b);
__m128d _mm_cmpnge_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位
result = [ a0<=b0 a0>b0 a0>=b0 a0!=b0 !(a0 < b0) !(a0 <= b0) !(a0 > b0) !(a0 >= b0) , a1 ]

int _mm_comieq_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位
result = (a0 == b0) ? 0x1 : 0x0

int _mm_comilt_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位
result = (a0 < b0) ? 0x1 : 0x0

int _mm_comile_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位
result = (a0 <= b0) ? 0x1 : 0x0

int _mm_comigt_sd(__m128d a, __m128d b);
int _mm_comige_sd(__m128d a, __m128d b);
int _mm_comineq_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位 (a0 > b0) (a0 >= b0) (a0 != b0)

int _mm_ucomieq_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位
result = (a0 == b0) ? 0x1 : 0x0

int _mm_ucomilt_sd(__m128d a, __m128d b);
int _mm_ucomile_sd(__m128d a, __m128d b);
int _mm_ucomigt_sd(__m128d a, __m128d b);
int _mm_ucomige_sd(__m128d a, __m128d b);
int _mm_ucomineq_sd(__m128d a, __m128d b);
比较两位双精度浮点数 低位 (a0 < b0) (a0 <= b0) (a0 > b0) (a0 >= b0) (a0 != b0)

转换指令

__m128 _mm_cvtpd_ps(__m128d a);
双精度浮点类型转化为单精度浮点类型
result = [ (float) a0 , (float) a1 , 0 , 0 ]

__m128d _mm_cvtps_pd(__m128 a);
单精度浮点类型转化为双精度浮点类型
result = [ (double) a0 , (double) a1 ]

__m128d _mm_cvtepi32_pd(__m128i a);
有符号32位整数转化为双精度浮点类型
result = [ (double) a0 , (double) a1 ]

__m128i _mm_cvtpd_epi32(__m128d a);
双精度浮点类型转化为有符号32位整数
result = [ (int) a0 , (int) a1 , 0x0 , 0x0 ]

int _mm_cvtsd_si32(__m128d a);
双精度浮点类型转化为有符号32位整数
result = (int)a0

__m128 _mm_cvtsd_ss(__m128 a, __m128d b);
低位双精度浮点类型转化为单精度浮点类型
result = [ (float)b0 , a1 , a2 , a3 ]

__m128d _mm_cvtsi32_sd(__m128d a, int b);
有符号32位整数转化为双精度浮点类型
result = [ (double) b , a1 ]

__m128d _mm_cvtss_sd(__m128d a, __m128 b);
单精度浮点类型转化为双精度浮点类型
result = [ (double) b0 , a1 ]

__m128i _mm_cvttpd_epi32(__m128d a);
双精度浮点类型转化为有符号32位整数,使用截断
result = [ (int) a0 , (int) a1 , 0x0 , 0x0 ]

int _mm_cvttsd_si32(__m128d a);
双精度浮点类型转化为有符号32位整数,使用截断
result = (int)a0

__m64 _mm_cvtpd_pi32(__m128d a);
双精度浮点类型转化为有符号32位整数
result = [ (int) a0 , (int)a1 ]

__m64 _mm_cvttpd_pi32(__m128d a);
双精度浮点类型转化为有符号32位整数,使用截断
result = [ (int) a0 , (int)a1 ]

__m128d _mm_cvtpi32_pd(__m64 a);
有符号32位整数转化为双精度浮点类型
result = [ (double)a0 , (double)a1 ]

double _mm_cvtsd_f64(__m128d a);
返回第一个值

加载指令

__m128d _mm_load_pd(double const*dp);
加载两个双精度浮点类型,必须16字节对齐
result = [ p[0] , p[1] ]

__m128d _mm_load1_pd(double const*dp);
加载一个双精度浮点类型,复制给两个元素,不需要16字节对齐
result = [ p[0] , p[0] ]

__m128d _mm_loadr_pd(double const*dp);
加载两个双精度浮点类型反转,必须16字节对齐
result = [ p[1] , p[0] ]

__m128d _mm_loadu_pd(double const*dp);
加载两个双精度浮点类型,不需要16字节对齐
result = [ p[0] , p[1] ]

__m128d _mm_load_sd(double const*dp);
加载一个双精度浮点类型到低位,高位清零,不需要16字节对齐
result = [ p[0] , 0.0 ]

__m128d _mm_loadh_pd(__m128d a, double const*dp);
加载一个双精度浮点作为高位,不需要16字节对齐
result = [ a[0] , p[0] ]

__m128d _mm_loadl_pd(__m128d a, double const*dp);
加载一个双精度浮点作为低位,不需要16字节对齐
result = [ p[0] , a[1] ]

设置指令

__m128d _mm_set_sd(double w);
设置低位双精度浮点
result = [ w , 0.0 ]

__m128d _mm_set1_pd(double w);
设置两位双精度浮点
result = [ w , w ]

__m128d _mm_set_pd(double w, double x);
设置两位双精度浮点(第一个参数为高位)
result = [ x , w ]

__m128d _mm_setr_pd(double w, double x);
设置两位双精度浮点(逆序,第一个参数为低位)
result = [ w , x ]

__m128d _mm_setzero_pd(void);
设置两位双精度浮点为0
result = [ 0.0 , 0.0 ]

__m128d _mm_move_sd( __m128d a, __m128d b);
设置两位双精度浮点
result = [ b[0] , a[1] ]

存储指令

void _mm_store_sd(double *dp, __m128d a);
存储一位双精度浮点,不需要16字节对齐
result = [ a0 ]

void _mm_store1_pd(double *dp, __m128d a);
存储两次双精度浮点,地址必须16字节对齐
result = [ a0 , a0 ]

void _mm_store_pd(double *dp, __m128d a);
存储两个双精度浮点,地址必须16字节对齐
result = [ a0 , a1 ]

void _mm_storeu_pd(double *dp, __m128d a);
存储两个双精度浮点,地址不需要16字节对齐
result = [ a0 , a1 ]

void _mm_storer_pd(double *dp, __m128d a);
存储两个双精度浮点翻转,地址必须16字节对齐
result = [ a1 , a0 ]

void _mm_storeh_pd(double *dp, __m128d a);
存储高位双精度浮点
result = [ a1 ]

void _mm_storel_pd(double *dp, __m128d a);
存储低位双精度浮点
result = [ a0 ]

整型指令

数学计算

__m128i _mm_add_epi8(__m128i a, __m128i b);
16位 有符号或无符号8 bit加法
result = [ a0+b0 , a1+b1 , … , a15+b15 ]

__m128i _mm_add_epi16(__m128i a, __m128i b);
8位 有符号或无符号16 bit加法
result = [ a0+b0 , a1+b1 , … , a7+b7 ]

__m128i _mm_add_epi32(__m128i a, __m128i b);
4位 有符号或无符号32 bit加法
result = [ a0+b0 , a1+b1 , … , a3+b3 ]

__m64 _mm_add_si64(__m64 a, __m64 b);
1位 有符号或无符号64 bit加法
result = [ a+b ]

__m128i _mm_add_epi64(__m128i a, __m128i b);
2位 有符号或无符号64 bit整数加法
result = [ a0+b0 , a1+b1 ]

__m128i _mm_adds_epi8(__m128i a, __m128i b);
16位 有符号8 bit加法,饱和算法
result = [ SignedSaturate(a0+b0) , … ,SignedSaturate(a15+b15) ]

__m128i _mm_adds_epi16(__m128i a, __m128i b);
8位 有符号16 bit加法,饱和算法
result = [ SignedSaturate(a0+b0) , … ,SignedSaturate(a7+b7) ]

__m128i _mm_adds_epu8(__m128i a, __m128i b);
16位 无符号8 bit加法,饱和算法
result = [ UnsignedSaturate(a0+b0) , … ,UnsignedSaturate(a15+b15) ]

__m128i _mm_adds_epu16(__m128i a, __m128i b);
8位 无符号16 bit加法,饱和算法
result = [ UnsignedSaturate(a0+b0) , … ,UnsignedSaturate(a7+b7) ]

__m128i _mm_avg_epu8(__m128i a, __m128i b);
16位 无符号8 bit整数均值(四舍五入,向上取整)
result = [ (a0+b0+1)>>1 , (a1+b1+1)>>1 , … ,(a15+b15+1)>>1 ]

__m128i _mm_avg_epu16(__m128i a, __m128i b);
8位 无符号16 bit整数均值(四舍五入,向上取整)
result = [ (a0+b0+1)>>1 , (a1+b1+1)>>1 , … ,(a7+b7+1)>>1 ]

__m128i _mm_madd_epi16(__m128i a, __m128i b);
8位 有符号16 bit整数相乘,32 bit结果再配对相加,得到4个32bit整型
result = [ (a0*b0)+(a1*b1) , (a2*b2)+(a3*b3) , … ,(a6*b6)+(a7*b7)]

__m128i _mm_max_epi16(__m128i a, __m128i b);
8位 有符号16 bit整数最大值
result = [ max(a0,b0) , max(a1,b1) , … ,max(a7,b7)]

__m128i _mm_max_epu8(__m128i a, __m128i b);
16位 无符号8 bit整形最大值
result = [ max(a0,b0) , max(a1,b1) , … ,max(a15,b15)]

__m128i _mm_min_epi16(__m128i a, __m128i b);
8位 有符号16 bit整数最小值
result = [ min(a0,b0) , min(a1,b1) , … ,min(a7,b7)]

__m128i _mm_min_epu8(__m128i a, __m128i b);
16位 无符号8 bit整形最小值
result = [ min(a0,b0) , min(a1,b1) , … , min(a15,b15)]

__m128i _mm_mulhi_epi16(__m128i a, __m128i b);
8位 有符号16bit整型相乘,取高16位
result = [ (a0*b0) [31:16], (a1*b1) [31:16], … , (a7*b7) [31:16] ]

__m128i _mm_mulhi_epu16(__m128i a, __m128i b);
8位 无符号16bit整型相乘,取高16位
result = [ (a0*b0) [31:16], (a1*b1) [31:16], … , (a7*b7) [31:16] ]

__m128i _mm_mullo_epi16(__m128i a, __m128i b);
8位 有符号或无符号16bit整型相乘,取低16位
result = [ (a0*b0) [15:0], (a1*b1) [15:0], … , (a7*b7) [15:0] ]

__m64 _mm_mul_su32(__m64 a, __m64 b);
低32 bit相乘
result = [ a0*b0 ]

__m128i _mm_mul_epu32(__m128i a, __m128i b);
无符号32 bit 整型相乘,得到64 bit整型
result = [ a0 * b0 , a2*b2 ]

__m128i _mm_sad_epu8(__m128i a, __m128i b);
16位 无符号 8 bit整型绝对差,前8个相加,后八个相加,得到无符号16bit整型
result = [ abs(a0-b0)+…+abs(a7-b7) , 0x0 , 0x0 , 0x0 , abs(a8-b8)+…+abs(a15-b15) , 0x0 , 0x0 , 0x0 ]

__m128i _mm_sub_epi8(__m128i a, __m128i b);
16位 有符号或无符号 8 bit 整型减法
result = [ a0-b0 , a1-b1 , … , a15-b15 ]

__m128i _mm_sub_epi16(__m128i a, __m128i b);
8位 有符号或无符号 16 bit 整型减法
result = [ a0-b0 , a1-b1 , … , a7-b7 ]

__m128i _mm_sub_epi32(__m128i a, __m128i b);
4位 有符号或无符号 32 bit 整型减法
result = [ a0-b0 , a1-b1 , … , a3-b3 ]

__m64 _mm_sub_si64 (__m64 a, __m64 b);
有符号或无符号 64 bit 整型减法
result = [ a-b ]

__m128i _mm_sub_epi64(__m128i a, __m128i b);
2位 有符号或无符号 64 bit 整型减法
result = [ a0-b0 , a1-b1 ]

__m128i _mm_subs_epi8(__m128i a, __m128i b);
16位 有符号 8 bit 整型减法,使用饱和算法
result = [ SignedSaturate(a0-b0) , SignedSaturate(a1-b1) , … , SignedSaturate(a15-b15) ]

__m128i _mm_subs_epi16(__m128i a, __m128i b);
8位 有符号 16 bit 整型减法,使用饱和算法
result = [ SignedSaturate(a0-b0) , SignedSaturate(a1-b1) , … , SignedSaturate(a7-b7) ]

__m128i _mm_subs_epu8(__m128i a, __m128i b);
16位 无符号 8 bit 整型减法,使用饱和算法
result = [ UnsignedSaturate(a0-b0) , UnsignedSaturate(a1-b1) , … , UnsignedSaturate(a15-b15) ]

__m128i _mm_subs_epu16(__m128i a, __m128i b);
8位 无符号 16 bit 整型减法,使用饱和算法
result = [ UnsignedSaturate(a0-b0) , UnsignedSaturate(a1-b1) , … , UnsignedSaturate(a7-b7) ]

逻辑指令

__m128i _mm_and_si128(__m128i a, __m128i b);
位与
result = [ a&b ]

__m128i _mm_andnot_si128(__m128i a, __m128i b);
位a非 与B
result = [ (~a)&b ]

__m128i _mm_or_si128(__m128i a, __m128i b);
位或
result = [ a|b ]

__m128i _mm_xor_si128(__m128i a, __m128i b);
异或
result = [ a^b ]

移位指令

__m128i _mm_slli_si128(__m128i a, int imm);
128-bit 左移 imm字节,低位填0
result = [ a << (imm*8) ]

__m128i _mm_slli_epi16(__m128i a, int count);
8位 有符号或无符号16bit 左移 count bit,低位填0
result = [ a0 << count , a1 << count , … ,a7 << count ]

__m128i _mm_sll_epi16(__m128i a, __m128i count);
8位 有符号或无符号16bit 左移 count bit,低位填0
result = [ a0 << count , a1 << count , … , a7 << count ]

__m128i _mm_slli_epi32(__m128i a, int count);
4位 有符号或无符号32bit 左移 count bit,低位填0
result = [ a0 << count , a1 << count , … , a3 << count ]

__m128i _mm_sll_epi32(__m128i a, __m128i count);
4位 有符号或无符号32bit 左移 count bit,低位填0
result = [ a0 << count , a1 << count , … , a3 << count ]

__m128i _mm_slli_epi64(__m128i a, int count);
2位 有符号或无符号64bit 左移 count bit,低位填0
result = [ a0 << count , a1 << count ]

__m128i _mm_sll_epi64(__m128i a, __m128i count);
2位 有符号或无符号64bit 左移 count bit,低位填0
result = [ a0 << count , a1 << count ]

__m128i _mm_srai_epi16(__m128i a, int count);
8位 有符号或无符号16bit 右移 count bit,高位填符号位
result = [ a0 >> count , a1 >> count , … , a7 >> count ]

__m128i _mm_sra_epi16(__m128i a, __m128i count);
8位 有符号或无符号16bit 右移 count bit,高位填符号位
result = [ a0 >> count , a1 >> count , … , a7 >> count ]

__m128i _mm_srai_epi32(__m128i a, int count);
4位 有符号或无符号32bit 右移 count bit,高位填符号位
result = [ a0 >> count , a1 >> count , … , a3 >> count ]

__m128i _mm_sra_epi32(__m128i a, __m128i count);
4位 有符号或无符号32bit 右移 count bit,高位填符号位
result = [ a0 >> count , a1 >> count , … , a3 >> count ]

__m128i _mm_srli_si128(__m128i a, int imm);
128 bit 右移 imm 字节,高位填0
result = [ a >> (imm*8) ]

__m128i _mm_srli_epi16(__m128i a, int count);
8位 有符号或无符号16bit 右移 count bit,高位填0
result = [ a0 >> count , a1 >> count , … , a7 >> count ]

__m128i _mm_srl_epi16(__m128i a, __m128i count);
8位 有符号或无符号16bit 右移 count bit,高位填0
result = [ a0 >> count , a1 >> count , … , a7 >> count ]

__m128i _mm_srli_epi32(__m128i a, int count);
4位 有符号或无符号32bit 右移 count bit,高位填0
result = [ a0 >> count , a1 >> count , … , a3 >> count ]

__m128i _mm_srl_epi32(__m128i a, __m128i count);
4位 有符号或无符号32bit 右移 count bit,高位填0
result = [ a0 >> count , a1 >> count , … , a3 >> count ]

__m128i _mm_srli_epi64(__m128i a, int count);
2位 有符号或无符号64bit 右移 count bit,高位填0
result = [ a0 >> count , a1 >> count ]

__m128i _mm_srl_epi64(__m128i a, __m128i count);
2位 有符号或无符号64bit 右移 count bit,高位填0
result = [ a0 >> count , a1 >> count ]

比较指令

__m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
__m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
__m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
__m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
__m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
__m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
__m128i _mm_cmplt_epi8( __m128i a, __m128i b);
__m128i _mm_cmplt_epi16( __m128i a, __m128i b);
__m128i _mm_cmplt_epi32( __m128i a, __m128i b);
比较 a , b ,成功返回 0xff…ff ,失败返回0x0

转换指令

__m128d _mm_cvtsi64_sd(__m128d a, __int64 b);
64 bit整型转化为 双精度浮点
result = [ (double)b , a1 ]

__int64 _mm_cvtsd_si64(__m128d a);
低位双精度浮点转化为 64 bit整型,按当前舍入模式取整
result = [ (__int64)a0 ]

__int64 _mm_cvttsd_si64(__m128d a);
低位双精度浮点转化为 64 bit整型,使用截断
result = [ (__int64)a0 ]

__m128 _mm_cvtepi32_ps(__m128i a);
4位 有符号32 bit整型转单精度浮点
result = [ (float)a0 , (float)a1 , … , (float)a3 ]

__m128i _mm_cvtps_epi32(__m128 a);
4位 单精度浮点转有符号32 bit整型
result = [ (int)a0 , (int)a1 , … , (int)a3 ]

__m128i _mm_cvttps_epi32(__m128 a);
4位 单精度浮点转有符号32 bit整型,使用截断
result = [ (int)a0 , (int)a1 , … , (int)a3 ]

移动指令

__m128i _mm_cvtsi32_si128(int a);
移动32 bit整型 到低位
result = [ a , 0x0 , 0x0 , 0x0 ]

__m128i _mm_cvtsi64_si128(__int64 a);
移动64 bit整型 到低位
result = [ a , 0x0 ]

int _mm_cvtsi128_si32(__m128i a);
移动低位到32 bit整型
result = [ a0 ]

__int64 _mm_cvtsi128_si64(__m128i a);
移动低位到64 bit整型
result = [ a0 ]

加载指令

__m128i _mm_load_si128(__m128i const*p);
加载128 bit值,必须16字节对齐
result = [ p0 ]

__m128i _mm_loadu_si128(__m128i const*p);
加载128 bit值,不需要16字节对齐
result = [ p0 ]

__m128i _mm_loadl_epi64(__m128i const*p);
加载低 64 bit值,不需要16字节对齐
result = [ p0[ 63:0 ] , 0x0 ]

设置指令

__m128i _mm_set_epi64(__m64 q1, __m64 q0);
设置两个64 bit整型值
result = [ q0 , q1 ]

__m128i _mm_set_epi64x(__int64 b, __int64 a);
设置两个64 bit整型值
result = [ a , b ]

__m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
设置4个有符号32bit整型
result = [ i0 , i1 , i2 , i3 ]

__m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short
w1, short w0);
设置8个有符号16bit整型
result = [ w0 , w1 , … , w7 ]

__m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char
b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
设置16个有符号8bit整型
result = [ b0 , b1 , … , b15 ]

__m128i _mm_set1_epi64(__m64 q);
设置2个64bit整型
result = [ q , q ]

__m128i _mm_set1_epi64x(__int64 a);
设置2个64bit整型
result = [ a , a ]

__m128i _mm_set1_epi32(int i);
设置4个有符号32bit整型
result = [ i , i , i , i ]

__m128i _mm_set1_epi16(short w);
设置8个有符号16bit整型
result = [ w , w , … , w ]

__m128i _mm_set1_epi8(char b);
设置16个有符号8bit整型
result = [ b , b , … , b ]

__m128i _mm_setr_epi64(__m64 q0, __m64 q1);
设置2个64bit整型翻转
result = [ q0 , q1 ]

__m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
设置4个有符号32bit整型翻转
result = [ i0 , i1 , i2 , i3 ]

__m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5,
short w6, short w7);
设置8个有符号16bit整型翻转
result = [ w0 , w1 , … , w7 ]

__m128i _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6,
char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15);
设置16个有符号8bit整型翻转
result = [ b0 , b1 , … , b15 ]

__m128i _mm_setzero_si128();
设置128 bit 0值

存储指令

void _mm_stream_si128(__m128i *p, __m128i a);
将a存入p,必须16字节对齐
result = [ a ]

void _mm_stream_si32(int *p, int a);
将a存入p
result = [ a ]

void _mm_store_si128(__m128i *p, __m128i b);
存储128 bit值,必须16字节对齐
result = [ b ]

void _mm_storeu_si128(__m128i *p, __m128i b);
存储128 bit值,不需要16字节对齐
result = [ b ]

void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p);
有条件存储d的字节到p,n 中每个字节的最高位决定是否存储对应字节,不需要16字节对齐
result = [ if(n0[7]) p[0] = d0 , if(n1[7]) p[1] = d1 , … , if(n15[7]) p[15] = d15 ]

void _mm_storel_epi64(__m128i *p, __m128i a);
存储低64bit 到p
result = [ a0 ]

其他指令

void _mm_stream_pd(double *p, __m128d a);
存储 a 到 p,必须16字节对齐
result = [ a0 , a1 ]

void _mm256_stream_pd(double *p, __m256d a);
存储 a 到 p,必须32字节对齐
result = [ a0 , a1 , a2 , a3 ]

void _mm_stream_si128(__m128i *p, __m128i a);
存储 a 到 p,必须16字节对齐
result = [ a ]

void _mm256_stream_si256(__m256i *p, __m256i a);
存储 a 到 p,必须32字节对齐
result = [ a ]

void _mm_stream_si32(int *p, int a);
a 存入 p
result = [ a ]

void _mm_stream_si64(__int64 *p, __int64 a);
a 存入 p
result = [ a ]

void _mm_clflush(void const*p);
Cache line containing p is flushed and invalidated from all caches in the coherency domain.
(无返回值,仅刷新包含 p 的缓存行)

void _mm_clflushopt(void const *p);
Cache line containing p is flushed and invalidated from all caches in the coherency domain. This optimized version of the _mm_clflush is available if indicated by the CPUID feature flag CLFLUSHOPT .
(无返回值,仅刷新包含 p 的缓存行)

void _mm_lfence(void);
Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally visible before any load instruction which follows the fence in program order.

void _mm_mfence(void);
Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.

杂项指令

__m128i _mm_packs_epi16(__m128i a, __m128i b);
将16位有符号16bit整型打包成 8 bit整型,饱和处理
result = [ SignedSaturate(a0) , SignedSaturate(a1) , … ,SignedSaturate(a7) , SignedSaturate(b0) , … , SignedSaturate(b7) ]

__m128i _mm_packs_epi32(__m128i a, __m128i b);
将8位有符号32bit整型打包成 有符号16 bit整型,饱和处理
result = [ SignedSaturate(a0) , SignedSaturate(a1) , … ,SignedSaturate(a3) , SignedSaturate(b0) , … , SignedSaturate(b3) ]

__m128i _mm_packus_epi16(__m128i a, __m128i b);
将16位有符号16bit整型打包成 无符号8 bit整型,饱和处理
result = [ UnsignedSaturate(a0) , UnsignedSaturate(a1) , … ,UnsignedSaturate(a7) , UnsignedSaturate(b0) , … , UnsignedSaturate(b7) ]

int _mm_extract_epi16(__m128i a, int imm);
提取a中有符号或无符号16 bit整型
result = [ (imm == 0) ? a0: ( (imm == 1) ? a1: … (imm==7) ? a7) ]

__m128i _mm_insert_epi16(__m128i a, int b, int imm);
将b插入a
result = [ (imm == 0) ? b :a0 ,(imm == 1) ? b :a1 , … , (imm == 7) ? b :a7 ]

int _mm_movemask_epi8(__m128i a);
创建 16bit 掩码
result = [ a15[7] << 15 | a14[7] << 14 | … a1[7] << 1 | a0[7] ]

__m128i _mm_shuffle_epi32(__m128i a, int imm);
__m128i _mm_shufflehi_epi16(__m128i a, int imm);
__m128i _mm_shufflelo_epi16(__m128i a, int imm);
重排(shuffle):按 imm 指定的顺序从 a 中选取元素

__m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
交替高八位有符号或无符号8bit整数
result = [ a8 , b8 , a9 , b9 , … , a15 , b15 ]

__m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
交替高4位有符号或无符号16bit整数
result = [ a4 , b4 , a5 , b5 , … , a7 , b7 ]

__m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
交替高2位有符号或无符号32bit整数
result = [ a2 , b2 , a3, b3 ]

__m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
交替高位有符号或无符号64bit整数
result = [ a1 , b1]

__m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
交替低八位有符号或无符号8bit整数
result = [ a0 , b0 , a1 , b1 , … , a7 , b7 ]

__m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
交替低4位有符号或无符号16bit整数
result = [ a0 , b0 , a1 , b1 , … , a3 , b3 ]

__m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
交替低2位有符号或无符号32bit整数
result = [ a0 , b0 , a1 , b1 ]

__m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
交替低位有符号或无符号64bit整数
result = [ a0 , b0 ]

__m64 _mm_movepi64_pi64(__m128i a);
移动低64位
result = [ a0 ]

__m128i _mm_movpi64_epi64(__m64 a);
移动到低64位
result = [ a , 0x0 ]

__m128i _mm_move_epi64(__m128i a);
移动到低64位
result = [ a0 , 0x0 ]

__m128d _mm_unpackhi_pd(__m128d a, __m128d b);
交替高位双精度浮点
result = [ a1 , b1 ]

__m128d _mm_unpacklo_pd(__m128d a, __m128d b);
交替低位双精度浮点
result = [ a0 , b0 ]

int _mm_movemask_pd(__m128d a);
创建两位掩码 从符号位
result = [ sign(a1) << 1 | sign(a0) ]

__m128d _mm_shuffle_pd(__m128d a, __m128d b, int i);
从a b 选择两位浮点值,由i指定

强制转化操作(不改变内存内容)

__m128 _mm_castpd_ps(__m128d in);
__m128i _mm_castpd_si128(__m128d in);
__m128d _mm_castps_pd(__m128 in);
__m128i _mm_castps_si128(__m128 in);
__m128 _mm_castsi128_ps(__m128i in);
__m128d _mm_castsi128_pd(__m128i in);

猜你喜欢

转载自blog.csdn.net/huajun998/article/details/78425179