18#if !defined(__SSE4_1__)
19# if defined(__i686__) || defined(__x86_64__)
20# error Implementation only for SSE 4.1 capable architectures
24# include <emmintrin.h>
25# include <smmintrin.h>
33#if defined(FAST_FLOAT)
// Computes and returns the dot product of the n-element float vectors u and v
// using 128-bit SSE registers (four products per step).
//
// u, v: input arrays holding at least n floats each.
// n:    number of elements; n <= 0 yields 0.
//
// NOTE(review): the function header was reconstructed from the parallel
// double-precision overload and the FAST_FLOAT guard above; confirm it against
// the declaration in the project header.
float DotProductSSE(const float *u, const float *v, int n) {
  // Largest start index from which a full group of 4 floats can still be read.
  const int max_offset = n - 4;
  int offset = 0;
  // Four partial sums, accumulated in parallel.
  __m128 sum = _mm_setzero_ps();
  if (offset <= max_offset) {
    offset = 4;
    // _mm_load_ps requires 16-byte aligned addresses, so it is only used when
    // both inputs are aligned; otherwise fall back to the unaligned loads.
    const bool aligned = (reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
                         (reinterpret_cast<uintptr_t>(v) & 15) == 0;
    if (aligned) {
      // The first group initialises the accumulator directly.
      sum = _mm_mul_ps(_mm_load_ps(u), _mm_load_ps(v));
      while (offset <= max_offset) {
        const __m128 floats1 = _mm_load_ps(u + offset);
        const __m128 floats2 = _mm_load_ps(v + offset);
        sum = _mm_add_ps(sum, _mm_mul_ps(floats1, floats2));
        offset += 4;
      }
    } else {
      sum = _mm_mul_ps(_mm_loadu_ps(u), _mm_loadu_ps(v));
      while (offset <= max_offset) {
        const __m128 floats1 = _mm_loadu_ps(u + offset);
        const __m128 floats2 = _mm_loadu_ps(v + offset);
        sum = _mm_add_ps(sum, _mm_mul_ps(floats1, floats2));
        offset += 4;
      }
    }
  }
  // Horizontal add of the four lanes as (s0 + s1) + (s2 + s3). This is the
  // same pairing that two _mm_hadd_ps(sum, zero) steps produce, written with
  // plain shuffles so that only SSE/SSE2 instructions are needed.
  const __m128 swapped = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128 pairs = _mm_add_ps(sum, swapped);         // lane0 = s0+s1, lane2 = s2+s3
  const __m128 upper = _mm_movehl_ps(pairs, pairs);      // lane0 = s2+s3
  float result = _mm_cvtss_f32(_mm_add_ss(pairs, upper));
  // Add on the left-over products that did not fill a whole group of 4.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
// Computes and returns the dot product of the n-element double vectors u and v
// using 128-bit SSE2 registers (two products per step).
//
// u, v: input arrays holding at least n doubles each.
// n:    number of elements; n <= 0 yields 0.
double DotProductSSE(const double *u, const double *v, int n) {
  // Largest start index from which a full pair of doubles can still be read.
  const int max_offset = n - 2;
  int offset = 0;
  // Two partial sums, accumulated in parallel.
  __m128d sum = _mm_setzero_pd();
  if (offset <= max_offset) {
    offset = 2;
    // _mm_load_pd requires 16-byte aligned addresses, so it is only used when
    // both inputs are aligned; otherwise fall back to the unaligned loads.
    const bool aligned = (reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
                         (reinterpret_cast<uintptr_t>(v) & 15) == 0;
    if (aligned) {
      // The first pair initialises the accumulator directly.
      sum = _mm_mul_pd(_mm_load_pd(u), _mm_load_pd(v));
      while (offset <= max_offset) {
        const __m128d floats1 = _mm_load_pd(u + offset);
        const __m128d floats2 = _mm_load_pd(v + offset);
        sum = _mm_add_pd(sum, _mm_mul_pd(floats1, floats2));
        offset += 2;
      }
    } else {
      sum = _mm_mul_pd(_mm_loadu_pd(u), _mm_loadu_pd(v));
      while (offset <= max_offset) {
        const __m128d floats1 = _mm_loadu_pd(u + offset);
        const __m128d floats2 = _mm_loadu_pd(v + offset);
        sum = _mm_add_pd(sum, _mm_mul_pd(floats1, floats2));
        offset += 2;
      }
    }
  }
  // Horizontal add of the two lanes (s0 + s1); equivalent to
  // _mm_hadd_pd(sum, sum) but needs only SSE2.
  const __m128d upper = _mm_unpackhi_pd(sum, sum);
  double result = _mm_cvtsd_f64(_mm_add_sd(sum, upper));
  // Add on the left-over product when n is odd.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)