SSE指令集
include库
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h> //所有版本(include immintrin.h);该头文件面向MSVC,GCC/Clang下通常改用<x86intrin.h>
基本操作
- 使用SSE专门的LOAD指令将数据从内存加载一个向量到寄存器;
- 使用SSE专门的OP指令对两个向量进行某种计算;
- 使用SSE专门的STORE指令把计算结果从寄存器写回到内存;
数据类型
- __m128表示128bit的向量,包含4个32bit的单精度浮点数
typedef union __declspec(intrin_type) __declspec(align(16)) __m128 {float m128_f32[4];unsigned __int64 m128_u64[2];__int8 m128_i8[16];__int16 m128_i16[8];__int32 m128_i32[4];__int64 m128_i64[2];unsigned __int8 m128_u8[16];unsigned __int16 m128_u16[8];unsigned __int32 m128_u32[4];} __m128;
- __m128i表示128bit的整数向量,可视为16个8bit、8个16bit、4个32bit或2个64bit的整数
typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {__int8 m128i_i8[16];__int16 m128i_i16[8];__int32 m128i_i32[4];__int64 m128i_i64[2];unsigned __int8 m128i_u8[16];unsigned __int16 m128i_u16[8];unsigned __int32 m128i_u32[4];unsigned __int64 m128i_u64[2];
} __m128i;
- __m128d表示128bit的向量,包含2个64bit的双精度浮点数
typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {double m128d_f64[2];
} __m128d;
指令函数命名
SSE指令的函数从命名上,主要分成三部分,以_mm_loadu_pd为例:
- 第一部分均以_mm开头,表示属于SSE指令集,_mm256或_mm512是AVX或AVX-512指令集的Intrinsic函数前缀;
- 第二部分表明操作类型,比如load,add,store等。但部分指令后面跟有[l|h|u|r]等字母,比如u表示mem_addr不需要内存对齐,r表示反向读取等;
- 第三部分为操作的对象名及数据类型:
_ps:packed操作所有的单精度浮点数;
_pd:packed操作所有的双精度浮点数;
_pixx:(xx为长度,可以是8,16,32,64)packed操作所有的xx位有符号整数,使用的寄存器长度为64位;
_epixx:(xx为长度)packed操作所有的xx位的有符号整数,使用的寄存器长度为128位;
_epuxx:(xx为长度)packed操作所有的xx位的无符号整数,使用的寄存器长度为128位;
_ss:scalar操作第一个单精度浮点数;
_sd:scalar操作第一个双精度浮点数;
_si128:把整个128位寄存器作为一个整体(整数)进行操作;
p表示packed,即对128bits中的所有元素执行相同的操作;s表示scalar,只对128bits中的第一组数据(即最低位的那个元素)执行操作,如下图所示。
1、load加载
__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
__m128d _mm_load_pd (double const* mem_addr)
__m128d _mm_load_pd1 (double const* mem_addr)
__m128 _mm_load_ps (float const* mem_addr)
__m128 _mm_load_ps1 (float const* mem_addr)
__m128d _mm_load_sd (double const* mem_addr)
__m128i _mm_load_si128 (__m128i const* mem_addr)
__m128 _mm_load_ss (float const* mem_addr)
__m128d _mm_load1_pd (double const* mem_addr)
__m128 _mm_load1_ps (float const* mem_addr)
__m128d _mm_loaddup_pd (double const* mem_addr)
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
__m128d _mm_loadr_pd (double const* mem_addr)
__m128 _mm_loadr_ps (float const* mem_addr)
__m128d _mm_loadu_pd (double const* mem_addr)
__m128 _mm_loadu_ps (float const* mem_addr)
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
__m128i _mm_loadu_si16 (void const* mem_addr)
__m128i _mm_loadu_si32 (void const* mem_addr)
__m128i _mm_loadu_si64 (void const* mem_addr)
2、OP操作
Arithmetic算术
__m128i _mm_add_epi16 (__m128i a, __m128i b)
__m128i _mm_add_epi32 (__m128i a, __m128i b)
__m128i _mm_add_epi64 (__m128i a, __m128i b)
__m128i _mm_add_epi8 (__m128i a, __m128i b)
__m128d _mm_add_pd (__m128d a, __m128d b)
__m128 _mm_add_ps (__m128 a, __m128 b)
__m128d _mm_add_sd (__m128d a, __m128d b)
__m64 _mm_add_si64 (__m64 a, __m64 b)
__m128 _mm_add_ss (__m128 a, __m128 b)
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
__m128d _mm_addsub_pd (__m128d a, __m128d b)
__m128 _mm_addsub_ps (__m128 a, __m128 b)
__m128d _mm_div_pd (__m128d a, __m128d b)
__m128 _mm_div_ps (__m128 a, __m128 b)
__m128d _mm_div_sd (__m128d a, __m128d b)
__m128 _mm_div_ss (__m128 a, __m128 b)
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
__m128d _mm_hadd_pd (__m128d a, __m128d b)
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
__m128 _mm_hadd_ps (__m128 a, __m128 b)
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
__m128d _mm_hsub_pd (__m128d a, __m128d b)
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
__m128 _mm_hsub_ps (__m128 a, __m128 b)
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
__m128d _mm_mul_pd (__m128d a, __m128d b)
__m128 _mm_mul_ps (__m128 a, __m128 b)
__m128d _mm_mul_sd (__m128d a, __m128d b)
__m128 _mm_mul_ss (__m128 a, __m128 b)
__m64 _mm_mul_su32 (__m64 a, __m64 b)
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
__m64 _m_pmulhuw (__m64 a, __m64 b)
__m64 _m_psadbw (__m64 a, __m64 b)
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
__m128d _mm_sub_pd (__m128d a, __m128d b)
__m128 _mm_sub_ps (__m128 a, __m128 b)
__m128d _mm_sub_sd (__m128d a, __m128d b)
__m64 _mm_sub_si64 (__m64 a, __m64 b)
__m128 _mm_sub_ss (__m128 a, __m128 b)
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
__m128i _mm_subs_epu8 (__m128i a, __m128i b)
Compare比较
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
__m128d _mm_cmple_pd (__m128d a, __m128d b)
__m128 _mm_cmple_ps (__m128 a, __m128 b)
__m128d _mm_cmple_sd (__m128d a, __m128d b)
__m128 _mm_cmple_ss (__m128 a, __m128 b)
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
int _mm_comieq_sd (__m128d a, __m128d b)
int _mm_comieq_ss (__m128 a, __m128 b)
int _mm_comige_sd (__m128d a, __m128d b)
int _mm_comige_ss (__m128 a, __m128 b)
int _mm_comigt_sd (__m128d a, __m128d b)
int _mm_comigt_ss (__m128 a, __m128 b)
int _mm_comile_sd (__m128d a, __m128d b)
int _mm_comile_ss (__m128 a, __m128 b)
int _mm_comilt_sd (__m128d a, __m128d b)
int _mm_comilt_ss (__m128 a, __m128 b)
int _mm_comineq_sd (__m128d a, __m128d b)
int _mm_comineq_ss (__m128 a, __m128 b)
int _mm_ucomieq_sd (__m128d a, __m128d b)
int _mm_ucomieq_ss (__m128 a, __m128 b)
int _mm_ucomige_sd (__m128d a, __m128d b)
int _mm_ucomige_ss (__m128 a, __m128 b)
int _mm_ucomigt_sd (__m128d a, __m128d b)
int _mm_ucomigt_ss (__m128 a, __m128 b)
int _mm_ucomile_sd (__m128d a, __m128d b)
int _mm_ucomile_ss (__m128 a, __m128 b)
int _mm_ucomilt_sd (__m128d a, __m128d b)
int _mm_ucomilt_ss (__m128 a, __m128 b)
int _mm_ucomineq_sd (__m128d a, __m128d b)
int _mm_ucomineq_ss (__m128 a, __m128 b)
Convert转换
__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
__m64 _mm_cvt_ps2pi (__m128 a)
__m128 _mm_cvt_si2ss (__m128 a, int b)
int _mm_cvt_ss2si (__m128 a)
__m128i _mm_cvtepi16_epi32 (__m128i a)
__m128i _mm_cvtepi16_epi64 (__m128i a)
__m128i _mm_cvtepi32_epi64 (__m128i a)
__m128d _mm_cvtepi32_pd (__m128i a)
__m128 _mm_cvtepi32_ps (__m128i a)
__m128i _mm_cvtepi8_epi16 (__m128i a)
__m128i _mm_cvtepi8_epi32 (__m128i a)
__m128i _mm_cvtepi8_epi64 (__m128i a)
__m128i _mm_cvtepu16_epi32 (__m128i a)
__m128i _mm_cvtepu16_epi64 (__m128i a)
__m128i _mm_cvtepu32_epi64 (__m128i a)
__m128i _mm_cvtepu8_epi16 (__m128i a)
__m128i _mm_cvtepu8_epi32 (__m128i a)
__m128i _mm_cvtepu8_epi64 (__m128i a)
__m128i _mm_cvtpd_epi32 (__m128d a)
__m64 _mm_cvtpd_pi32 (__m128d a)
__m128 _mm_cvtpd_ps (__m128d a)
__m128 _mm_cvtpi16_ps (__m64 a)
__m128d _mm_cvtpi32_pd (__m64 a)
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
__m128 _mm_cvtpi8_ps (__m64 a)
__m128i _mm_cvtps_epi32 (__m128 a)
__m128d _mm_cvtps_pd (__m128 a)
__m64 _mm_cvtps_pi16 (__m128 a)
__m64 _mm_cvtps_pi32 (__m128 a)
__m64 _mm_cvtps_pi8 (__m128 a)
__m128 _mm_cvtpu16_ps (__m64 a)
__m128 _mm_cvtpu8_ps (__m64 a)
double _mm_cvtsd_f64 (__m128d a)
int _mm_cvtsd_si32 (__m128d a)
__int64 _mm_cvtsd_si64 (__m128d a)
__int64 _mm_cvtsd_si64x (__m128d a)
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
int _mm_cvtsi128_si32 (__m128i a)
__int64 _mm_cvtsi128_si64 (__m128i a)
__int64 _mm_cvtsi128_si64x (__m128i a)
__m128d _mm_cvtsi32_sd (__m128d a, int b)
__m128i _mm_cvtsi32_si128 (int a)
__m128 _mm_cvtsi32_ss (__m128 a, int b)
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64_si128 (__int64 a)
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64x_si128 (__int64 a)
float _mm_cvtss_f32 (__m128 a)
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
int _mm_cvtss_si32 (__m128 a)
__int64 _mm_cvtss_si64 (__m128 a)
__m64 _mm_cvtt_ps2pi (__m128 a)
int _mm_cvtt_ss2si (__m128 a)
__m128i _mm_cvttpd_epi32 (__m128d a)
__m64 _mm_cvttpd_pi32 (__m128d a)
__m128i _mm_cvttps_epi32 (__m128 a)
__m64 _mm_cvttps_pi32 (__m128 a)
int _mm_cvttsd_si32 (__m128d a)
__int64 _mm_cvttsd_si64 (__m128d a)
__int64 _mm_cvttsd_si64x (__m128d a)
int _mm_cvttss_si32 (__m128 a)
__int64 _mm_cvttss_si64 (__m128 a)
__m128i _mm_packus_epi32 (__m128i a, __m128i b)
Logical逻辑
__m128d _mm_and_pd (__m128d a, __m128d b)
__m128 _mm_and_ps (__m128 a, __m128 b)
__m128i _mm_and_si128 (__m128i a, __m128i b)
__m128d _mm_andnot_pd (__m128d a, __m128d b)
__m128 _mm_andnot_ps (__m128 a, __m128 b)
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
__m128d _mm_or_pd (__m128d a, __m128d b)
__m128 _mm_or_ps (__m128 a, __m128 b)
__m128i _mm_or_si128 (__m128i a, __m128i b)
int _mm_test_all_ones (__m128i a)
int _mm_test_all_zeros (__m128i mask, __m128i a)
int _mm_test_mix_ones_zeros (__m128i mask, __m128i a)
int _mm_testc_si128 (__m128i a, __m128i b)
int _mm_testnzc_si128 (__m128i a, __m128i b)
int _mm_testz_si128 (__m128i a, __m128i b)
__m128d _mm_xor_pd (__m128d a, __m128d b)
__m128 _mm_xor_ps (__m128 a, __m128 b)
__m128i _mm_xor_si128 (__m128i a, __m128i b)
Set设置
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_set_pd (double e1, double e0)
__m128d _mm_set_pd1 (double a)
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
__m128 _mm_set_ps1 (float a)
__m128d _mm_set_sd (double a)
__m128 _mm_set_ss (float a)
__m128i _mm_set1_epi16 (short a)
__m128i _mm_set1_epi32 (int a)
__m128i _mm_set1_epi64 (__m64 a)
__m128i _mm_set1_epi64x (__int64 a)
__m128i _mm_set1_epi8 (char a)
__m128d _mm_set1_pd (double a)
__m128 _mm_set1_ps (float a)
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_setr_pd (double e1, double e0)
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
__m128d _mm_setzero_pd (void)
__m128 _mm_setzero_ps (void)
__m128i _mm_setzero_si128 (void)
3、Store存储
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
void _mm_store_pd (double* mem_addr, __m128d a)
void _mm_store_pd1 (double* mem_addr, __m128d a)
void _mm_store_ps (float* mem_addr, __m128 a)
void _mm_store_ps1 (float* mem_addr, __m128 a)
void _mm_store_sd (double* mem_addr, __m128d a)
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
void _mm_store_ss (float* mem_addr, __m128 a)
void _mm_store1_pd (double* mem_addr, __m128d a)
void _mm_store1_ps (float* mem_addr, __m128 a)
void _mm_storeh_pd (double* mem_addr, __m128d a)
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
void _mm_storel_pd (double* mem_addr, __m128d a)
void _mm_storel_pi (__m64* mem_addr, __m128 a)
void _mm_storer_pd (double* mem_addr, __m128d a)
void _mm_storer_ps (float* mem_addr, __m128 a)
void _mm_storeu_pd (double* mem_addr, __m128d a)
void _mm_storeu_ps (float* mem_addr, __m128 a)
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
void _mm_storeu_si16 (void* mem_addr, __m128i a)
void _mm_storeu_si32 (void* mem_addr, __m128i a)
void _mm_storeu_si64 (void* mem_addr, __m128i a)
void _mm_stream_pd (double* mem_addr, __m128d a)
void _mm_stream_pi (__m64* mem_addr, __m64 a)
void _mm_stream_ps (float* mem_addr, __m128 a)
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
void _mm_stream_si32 (int* mem_addr, int a)
void _mm_stream_si64 (__int64* mem_addr, __int64 a)
参考
1、https://www.zhihu.com/column/c_1550937293912748032
2、https://zhuanlan.zhihu.com/p/409973153
3、https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=4880,3865,6557&techs=SSE_ALL