19# if defined(__i686__) || defined(__x86_64__)
20# error Implementation only for AVX capable architectures
24# include <immintrin.h>
// Single-precision path: this code is selected only when FAST_FLOAT is defined.
// It accumulates products of elements read from u and v into a 512-bit
// accumulator, reduces that accumulator to a scalar, then adds the leftover
// products one at a time.
// NOTE(review): this view of the file is sparse -- the embedded original line
// numbers skip (e.g. 40 -> 44), so the loop closers, the return statement and
// any advance of u/v inside the vector loop are not visible here. Confirm
// against the full file before changing anything.
32# if defined(FAST_FLOAT)
// Split n into whole groups of 16 elements (quot) and a remainder (rem).
// NOTE(review): quot/rem are unsigned while n appears to be a signed int;
// presumably callers always pass n >= 0 -- confirm.
34 const unsigned quot = n / 16;
35 const unsigned rem = n % 16;
// Accumulator starts with every lane set to zero.
36 __m512 t0 = _mm512_setzero_ps();
// Vector loop: executes quot times.
37 for (
unsigned k = 0; k < quot; k++) {
// Unaligned loads from the current positions of u and v.
38 __m512 f0 = _mm512_loadu_ps(u);
39 __m512 f1 = _mm512_loadu_ps(v);
// Fused multiply-add: t0 = f0 * f1 + t0, lane by lane.
40 t0 = _mm512_fmadd_ps(f0, f1, t0);
// Horizontal reduction: add all lanes of t0 into one float.
44 float result = _mm512_reduce_add_ps(t0);
// Scalar tail: executes rem times, adding one product per iteration and
// post-incrementing both pointers.
45 for (
unsigned k = 0; k < rem; k++) {
46 result += *u++ * *v++;
// Double-precision path: same structure as a vector loop followed by a scalar
// tail, but using the _pd intrinsics and groups of 8 elements.
// NOTE(review): this view of the file is sparse -- the embedded original line
// numbers skip (e.g. 56 -> 60), so the loop closers, the return statement and
// any advance of u/v inside the vector loop are not visible here. Confirm
// against the full file before changing anything.
// Split n into whole groups of 8 elements (quot) and a remainder (rem).
// NOTE(review): quot/rem are unsigned while n appears to be a signed int;
// presumably callers always pass n >= 0 -- confirm.
52 const unsigned quot = n / 8;
53 const unsigned rem = n % 8;
// Accumulator starts with every lane set to zero.
54 __m512d t0 = _mm512_setzero_pd();
// Vector loop: executes quot times.
55 for (
unsigned k = 0; k < quot; k++) {
// Unaligned loads from u and v feed a fused multiply-add directly:
// t0 = load(u) * load(v) + t0, lane by lane.
56 t0 = _mm512_fmadd_pd(_mm512_loadu_pd(u), _mm512_loadu_pd(v), t0);
// Horizontal reduction: add all lanes of t0 into one double.
60 double result = _mm512_reduce_add_pd(t0);
// Scalar tail: executes rem times, adding one product per iteration and
// post-incrementing both pointers.
61 for (
unsigned k = 0; k < rem; k++) {
62 result += *u++ * *v++;
// Signature of DotProductAVX512F: takes two read-only TFloat pointers and an
// int element count, and returns a TFloat.
// NOTE(review): TFloat is declared outside this view -- presumably float when
// FAST_FLOAT is defined and double otherwise; confirm. No body or terminator
// is visible for this line in this view.
TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n)