tesseract v5.3.3.20231005
dotproductsse.cpp
Go to the documentation of this file.
1
2// File: dotproductsse.cpp
3// Description: Architecture-specific dot-product function.
4// Author: Ray Smith
5//
6// (C) Copyright 2015, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18#if !defined(__SSE4_1__)
19# if defined(__i686__) || defined(__x86_64__)
20# error Implementation only for SSE 4.1 capable architectures
21# endif
22#else
23
24# include <emmintrin.h>
25# include <smmintrin.h>
26# include <cstdint>
27# include "dotproduct.h"
28
29namespace tesseract {
30
31// Computes and returns the dot product of the n-vectors u and v.
32// Uses Intel SSE intrinsics to access the SIMD instruction set.
33#if defined(FAST_FLOAT)
// Computes and returns the dot product of the first n elements of u and v
// (single precision variant).
//
// Groups of 4 floats are multiplied and accumulated lane-wise in one SSE
// register; the 4 partial sums are then added together and any remaining
// (n % 4) products are added with scalar code.
//
// u, v: input vectors with at least n readable elements. They need not be
//       aligned; aligned loads are used only if both are 16 byte aligned.
// n:    number of elements. For n <= 0 nothing is read and 0 is returned.
float DotProductSSE(const float *u, const float *v, int n) {
  constexpr int kLanes = 4;
  int offset = 0;
  // Holds 4 running partial sums, one per lane.
  __m128 sum = _mm_setzero_ps();
  if (n >= kLanes) {
    // The first group initializes sum directly (instead of being added to
    // zero), so the partial sums consist of products only.
    // The loop tests use n - offset, which cannot overflow because
    // 0 <= offset <= n holds here.
    const bool both_aligned =
        (reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
        (reinterpret_cast<uintptr_t>(v) & 15) == 0;
    if (both_aligned) {
      // Aligned load is reputedly faster but requires 16 byte aligned input.
      sum = _mm_mul_ps(_mm_load_ps(u), _mm_load_ps(v));
      for (offset = kLanes; n - offset >= kLanes; offset += kLanes) {
        const __m128 products =
            _mm_mul_ps(_mm_load_ps(u + offset), _mm_load_ps(v + offset));
        sum = _mm_add_ps(sum, products);
      }
    } else {
      // Use unaligned load.
      sum = _mm_mul_ps(_mm_loadu_ps(u), _mm_loadu_ps(v));
      for (offset = kLanes; n - offset >= kLanes; offset += kLanes) {
        const __m128 products =
            _mm_mul_ps(_mm_loadu_ps(u + offset), _mm_loadu_ps(v + offset));
        sum = _mm_add_ps(sum, products);
      }
    }
  }
  // Add the 4 sums in sum = (s0, s1, s2, s3) horizontally as
  // (s0 + s1) + (s2 + s3), using only shuffles and adds.
  // swapped = (s1, s0, s3, s2).
  const __m128 swapped = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
  // pairs = (s0 + s1, s1 + s0, s2 + s3, s3 + s2).
  const __m128 pairs = _mm_add_ps(sum, swapped);
  // Move the upper pair sum down to lane 0 and add it to the lower pair sum.
  const __m128 upper = _mm_movehl_ps(pairs, pairs);
  // Extract the low result.
  float result = _mm_cvtss_f32(_mm_add_ss(pairs, upper));
  // Add on any left-over products.
  for (; offset < n; ++offset) {
    result += u[offset] * v[offset];
  }
  return result;
}
92#else
// Computes and returns the dot product of the first n elements of u and v
// (double precision variant).
//
// Pairs of doubles are multiplied and accumulated lane-wise in one SSE
// register; the 2 partial sums are then added together and a remaining odd
// element, if any, is added with scalar code.
//
// u, v: input vectors with at least n readable elements. They need not be
//       aligned; aligned loads are used only if both are 16 byte aligned.
// n:    number of elements. For n <= 0 nothing is read and 0 is returned.
double DotProductSSE(const double *u, const double *v, int n) {
  constexpr int kLanes = 2;
  int offset = 0;
  // Holds 2 running partial sums, one per lane.
  __m128d sum = _mm_setzero_pd();
  if (n >= kLanes) {
    // The first pair initializes sum directly (instead of being added to
    // zero), so the partial sums consist of products only.
    // The loop tests use n - offset, which cannot overflow because
    // 0 <= offset <= n holds here.
    const bool both_aligned =
        (reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
        (reinterpret_cast<uintptr_t>(v) & 15) == 0;
    if (both_aligned) {
      // Aligned load is reputedly faster but requires 16 byte aligned input.
      sum = _mm_mul_pd(_mm_load_pd(u), _mm_load_pd(v));
      for (offset = kLanes; n - offset >= kLanes; offset += kLanes) {
        const __m128d products =
            _mm_mul_pd(_mm_load_pd(u + offset), _mm_load_pd(v + offset));
        sum = _mm_add_pd(sum, products);
      }
    } else {
      // Use unaligned load.
      sum = _mm_mul_pd(_mm_loadu_pd(u), _mm_loadu_pd(v));
      for (offset = kLanes; n - offset >= kLanes; offset += kLanes) {
        const __m128d products =
            _mm_mul_pd(_mm_loadu_pd(u + offset), _mm_loadu_pd(v + offset));
        sum = _mm_add_pd(sum, products);
      }
    }
  }
  // Add the 2 sums in sum = (s0, s1) horizontally: copy s1 down to lane 0
  // and add it to s0.
  const __m128d high = _mm_unpackhi_pd(sum, sum);
  // Extract the low result.
  double result = _mm_cvtsd_f64(_mm_add_sd(sum, high));
  // Add on any left-over products.
  for (; offset < n; ++offset) {
    result += u[offset] * v[offset];
  }
  return result;
}
142#endif
143
144} // namespace tesseract.
145
146#endif
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)