18#if !defined(__SSE4_1__)
19# if defined(__i686__) || defined(__x86_64__)
20# error Implementation only for SSE 4.1 capable architectures
26# include <emmintrin.h>
27# include <smmintrin.h>
// Computes the integer dot product of the int8_t vectors u and v over n
// elements, processing groups of 8 elements with SSE intrinsics and
// accumulating individual products into `result` with scalar code.
// NOTE(review): this excerpt does not show the declarations of `offset` and
// `result`, any updates to `offset`, the closing braces, or the return
// statement; comments below describe only the visible statements.
34static int32_t IntDotProductSSE(
const int8_t *u,
const int8_t *v,
int n) {
// Largest offset at which a full group of 8 elements still fits within n.
35 int max_offset = n - 8;
// Vector path: entered only if a full group of 8 fits at the current offset.
40 if (offset <= max_offset) {
// Load the first 8 bytes (64 bits) of u and v into the low half of a register.
42 __m128i packed1 = _mm_loadl_epi64(
reinterpret_cast<const __m128i *
>(u));
43 __m128i packed2 = _mm_loadl_epi64(
reinterpret_cast<const __m128i *
>(v));
// Sign-extend the 8 packed 8-bit values to 8 16-bit values.
44 __m128i sum = _mm_cvtepi8_epi16(packed1);
45 packed2 = _mm_cvtepi8_epi16(packed2);
// Multiply the 8 pairs of 16-bit values and add adjacent products, leaving
// 4 partial sums of 32 bits each in `sum`.
49 sum = _mm_madd_epi16(sum, packed2);
// Process further groups of 8 while a full group fits, adding each group's
// 4 partial sums into `sum`.
50 while (offset <= max_offset) {
51 packed1 = _mm_loadl_epi64(
reinterpret_cast<const __m128i *
>(u + offset));
52 packed2 = _mm_loadl_epi64(
reinterpret_cast<const __m128i *
>(v + offset));
54 packed1 = _mm_cvtepi8_epi16(packed1);
55 packed2 = _mm_cvtepi8_epi16(packed2);
56 packed1 = _mm_madd_epi16(packed1, packed2);
57 sum = _mm_add_epi32(sum, packed1);
// Two horizontal adds fold the 4 partial sums into the low 32-bit lane,
// which is then extracted as the scalar result.
60 sum = _mm_hadd_epi32(sum, sum);
61 sum = _mm_hadd_epi32(sum, sum);
62 result = _mm_cvtsi128_si32(sum);
// Scalar accumulation of a single product at `offset`.
// NOTE(review): presumably this sits in a loop over the remaining (< 8)
// tail elements; the enclosing loop is not visible here - confirm.
65 result += u[offset] * v[offset];
// Computes a single output value: the dot product of u with the first num_in
// entries of wi, plus wi[num_in] * INT8_MAX, all multiplied by *scales, and
// stores it through v.
// NOTE(review): the parameter list is truncated in this excerpt; `num_in` and
// `v` are used below but their declarations are not visible.
72static void PartialMatrixDotVector1(
const int8_t *wi,
const TFloat *scales,
const int8_t *u,
// Integer dot product over num_in elements, converted to TFloat.
74 TFloat total = IntDotProductSSE(u, wi, num_in);
// wi[num_in] is the entry immediately after the num_in weights; it is weighted
// by INT8_MAX (presumably a bias term - confirm) before scaling.
76 *v = (total + wi[num_in] * INT8_MAX) * *scales;
// Matrix-vector entry point: derives the output and input counts from
// dim1/dim2 and delegates each output computation to PartialMatrixDotVector1.
// NOTE(review): the loop over outputs and any pointer advancement between
// calls are not visible in this excerpt; only one call is shown.
79static void matrixDotVector(
int dim1,
int dim2,
const int8_t *wi,
const TFloat *scales,
80 const int8_t *u,
TFloat *v) {
// Number of outputs is taken directly from dim1.
81 const int num_out = dim1;
// One less than dim2: PartialMatrixDotVector1 reads wi[num_in] as an extra
// entry beyond the num_in inputs.
82 const int num_in = dim2 - 1;
86 PartialMatrixDotVector1(wi, scales, u, num_in, v);
static const IntSimdMatrix intSimdMatrixSSE