tesseract v5.3.3.20231005
simddetect.cpp
Go to the documentation of this file.
1
2// File: simddetect.cpp
3// Description: Architecture detector.
4// Author: Stefan Weil (based on code from Ray Smith)
5//
6// (C) Copyright 2014, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18#ifdef HAVE_CONFIG_H
19# include "config_auto.h" // for HAVE_AVX, ...
20#endif
21#include <numeric> // for std::inner_product
22#include "dotproduct.h"
23#include "intsimdmatrix.h" // for IntSimdMatrix
24#include "params.h" // for STRING_VAR
25#include "simddetect.h"
26#include "tprintf.h" // for tprintf
27
28#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29// The GNU compiler g++ fails to compile with the Accelerate framework
30// (tested with versions 10 and 11), so unconditionally disable it.
31#undef HAVE_FRAMEWORK_ACCELERATE
32#endif
33
34#if defined(HAVE_FRAMEWORK_ACCELERATE)
35
36// Use Apple Accelerate framework.
37// https://developer.apple.com/documentation/accelerate/simd
38
39#include <Accelerate/Accelerate.h>
40
41#endif
42
43#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44// See https://en.wikipedia.org/wiki/CPUID.
45# define HAS_CPUID
46#endif
47
48#if defined(HAS_CPUID)
49# if defined(__GNUC__)
50# include <cpuid.h>
51# elif defined(_WIN32)
52# include <intrin.h>
53# endif
54#endif
55
56#if defined(HAVE_NEON) && !defined(__aarch64__)
57# if defined(HAVE_ANDROID_GETCPUFAMILY)
58# include <cpu-features.h>
59# elif defined(HAVE_GETAUXVAL)
60# include <asm/hwcap.h>
61# include <sys/auxv.h>
62# elif defined(HAVE_ELF_AUX_INFO)
63# include <sys/auxv.h>
64# include <sys/elf.h>
65# endif
66#endif
67
68namespace tesseract {
69
70// Computes and returns the dot product of the two n-vectors u and v.
71// Note: because the order of addition is different among the different dot
72// product functions, the results can (and do) vary slightly (although they
73// agree to within about 4e-15). This produces different results when running
74// training, despite all random inputs being precisely equal.
75// To get consistent results, use just one of these dot product functions.
76// On a test multi-layer network, serial is 57% slower than SSE, and AVX
77// is about 8% faster than SSE. This suggests that the time is memory
78// bandwidth constrained and could benefit from holding the reused vector
79// in AVX registers.
81
// Config variable that names the dot product implementation to use. Its
// default value is "auto"; the SIMDDetect constructor overrides it with the
// value of the DOTPRODUCT environment variable when that variable is set.
82static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
83
// The static detector object. Its constructor performs the SIMD detection and
// selects the dot product function.
84SIMDDetect SIMDDetect::detector;
85
// Definitions of the static availability flags. Which flags exist depends on
// the target: only the NEON flag on ARM builds, the x86 flags otherwise.
// Flags without an initializer have static storage and therefore start as
// false until the constructor assigns them.
86#if defined(__aarch64__)
87// ARMv8 always has NEON.
88bool SIMDDetect::neon_available_ = true;
89#elif defined(HAVE_NEON)
90// If true, then NEON has been detected.
91bool SIMDDetect::neon_available_;
92#else
93// If true, then AVX has been detected.
94bool SIMDDetect::avx_available_;
// If true, then AVX2 / AVX512F / AVX512BW / AVX512VNNI has been detected.
95bool SIMDDetect::avx2_available_;
96bool SIMDDetect::avx512F_available_;
97bool SIMDDetect::avx512BW_available_;
98bool SIMDDetect::avx512VNNI_available_;
99// If true, then FMA has been detected.
100bool SIMDDetect::fma_available_;
101// If true, then SSE4.1 has been detected.
102bool SIMDDetect::sse_available_;
103#endif
104
#if defined(HAVE_FRAMEWORK_ACCELERATE)
// Computes and returns the dot product of the two n-vectors u and v by
// delegating to the vDSP routines of the Apple Accelerate framework.
// Both vectors are read contiguously (stride 1). The FAST_FLOAT build uses
// vDSP_dotpr, all other builds use vDSP_dotprD.
static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
  constexpr int kUnitStride = 1;
  TFloat result = 0;
#  if defined(FAST_FLOAT)
  vDSP_dotpr(u, kUnitStride, v, kUnitStride, &result, n);
#  else
  vDSP_dotprD(u, kUnitStride, v, kUnitStride, &result, n);
#  endif
  return result;
}
#endif
117
118// Computes and returns the dot product of the two n-vectors u and v.
119static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
120 TFloat total = 0;
121 for (int k = 0; k < n; ++k) {
122 total += u[k] * v[k];
123 }
124 return total;
125}
126
127// Compute dot product using std::inner_product.
128static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
129 return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
130}
131
// Installs f as the active dot product implementation by assigning it to the
// function pointer DotProduct.
// NOTE(review): the parameter m is not used in the body shown here, and the
// embedded listing numbers jump from 133 to 135, so one statement (presumably
// the one that stores m) appears to have been lost when this listing was
// extracted. Confirm against the upstream file before relying on this text.
132static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
133 DotProduct = f;
135}
136
137// Constructor.
138// Tests the architecture in a system-dependent way to detect AVX, SSE and
139// any other available SIMD equipment.
140// __GNUC__ is also defined by compilers that include GNU extensions such as
141// clang.
// Steps, in order:
//  1. install the generic dot product as fallback,
//  2. fill in the static *_available_ flags (CPUID on x86, OS facilities for
//     32 bit ARM NEON),
//  3. choose an implementation from those flags,
//  4. if the environment variable DOTPRODUCT is set, copy it into the config
//     variable dotproduct and call Update().
142SIMDDetect::SIMDDetect() {
143 // The fallback is a generic dot product calculation.
144 SetDotProduct(DotProductGeneric);
145
146#if defined(HAS_CPUID)
147# if defined(__GNUC__)
148 unsigned int eax, ebx, ecx, edx;
  // Query CPUID leaf 1; the feature bits tested below are all in ECX.
149 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
150 // Note that these tests all use hex because the older compilers don't have
151 // the newer flags.
152# if defined(HAVE_SSE4_1)
  // ECX bit 19.
153 sse_available_ = (ecx & 0x00080000) != 0;
154# endif
155# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
  // Executes XGETBV with ECX = 0 and returns the value delivered in EAX
  // (EDX is declared as clobbered and its content is discarded).
156 auto xgetbv = []() {
157 uint32_t xcr0;
158 __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
159 return xcr0;
160 };
  // ECX bit 27 must be set, and bits 1 and 2 of the XGETBV result must both
  // be set, before any of the FMA / AVX flags are trusted.
161 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
162 // OSXSAVE bit is set, XMM state and YMM state are fine.
163# if defined(HAVE_FMA)
  // ECX bit 12.
164 fma_available_ = (ecx & 0x00001000) != 0;
165# endif
166# if defined(HAVE_AVX)
  // ECX bit 28.
167 avx_available_ = (ecx & 0x10000000) != 0;
168 if (avx_available_) {
169 // There is supposed to be a __get_cpuid_count function, but this is all
170 // there is in my cpuid.h. It is a macro for an asm statement and cannot
171 // be used inside an if.
  // CPUID leaf 7, subleaf 0. This overwrites eax..ecx from the leaf 1 query.
  // In this branch the AVX2 / AVX512 flags are only evaluated when HAVE_AVX
  // is defined and AVX was detected; the _WIN32 branch below guards the same
  // flags with HAVE_AVX2 and a maximum leaf check instead.
172 __cpuid_count(7, 0, eax, ebx, ecx, edx);
173 avx2_available_ = (ebx & 0x00000020) != 0;
174 avx512F_available_ = (ebx & 0x00010000) != 0;
175 avx512BW_available_ = (ebx & 0x40000000) != 0;
176 avx512VNNI_available_ = (ecx & 0x00000800) != 0;
177 }
178# endif
179 }
180# endif
181 }
182# elif defined(_WIN32)
  // cpuInfo receives the four CPUID result registers; the code below reads
  // index 0 as the maximum function id and applies the ECX masks of the GNU
  // branch to index 2 and the EBX masks to index 1.
183 int cpuInfo[4];
184 int max_function_id;
185 __cpuid(cpuInfo, 0);
186 max_function_id = cpuInfo[0];
187 if (max_function_id >= 1) {
188 __cpuid(cpuInfo, 1);
189# if defined(HAVE_SSE4_1)
190 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
191# endif
192# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
193 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
194 // OSXSAVE bit is set, XMM state and YMM state are fine.
195# if defined(HAVE_FMA)
196 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
197# endif
198# if defined(HAVE_AVX)
199 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
200# endif
201# if defined(HAVE_AVX2)
  // Leaf 7 is only queried when the CPU reports it as supported.
202 if (max_function_id >= 7) {
203 __cpuid(cpuInfo, 7);
204 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
205 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
206 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
207 avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0;
208 }
209# endif
210 }
211# endif
212 }
213# else
214# error "I don't know how to test for SIMD with this compiler"
215# endif
216#endif
217
  // Runtime NEON detection is only needed on 32 bit ARM. Three alternative
  // sources are supported: the Android cpu-features API, getauxval() and
  // elf_aux_info().
218#if defined(HAVE_NEON) && !defined(__aarch64__)
219# if defined(HAVE_ANDROID_GETCPUFAMILY)
220 {
221 AndroidCpuFamily family = android_getCpuFamily();
222 if (family == ANDROID_CPU_FAMILY_ARM)
223 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
224 }
225# elif defined(HAVE_GETAUXVAL)
226 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
227# elif defined(HAVE_ELF_AUX_INFO)
  // hwcap stays 0 (NEON not available) if elf_aux_info() does not fill it in;
  // the return value of the call is not checked.
228 unsigned long hwcap = 0;
229 elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
230 neon_available_ = hwcap & HWCAP_NEON;
231# endif
232#endif
233
234 // Select code for calculation of dot product based on autodetection.
  // The branches are tested in the order AVX512F, AVX2, AVX, SSE, NEON, and
  // each branch is only compiled when the matching HAVE_* macro is defined.
  // NOTE(review): every branch below contains only a comment, and the
  // embedded listing numbers skip one line per branch (240, 245, 250, 255,
  // 260), so the statement that installs the selected function appears to
  // have been lost when this listing was extracted. Confirm against the
  // upstream file.
235 if (false) {
236 // This is a dummy to support conditional compilation.
237#if defined(HAVE_AVX512F)
238 } else if (avx512F_available_) {
239 // AVX512F detected.
241#endif
242#if defined(HAVE_AVX2)
243 } else if (avx2_available_) {
244 // AVX2 detected.
246#endif
247#if defined(HAVE_AVX)
248 } else if (avx_available_) {
249 // AVX detected.
251#endif
252#if defined(HAVE_SSE4_1)
253 } else if (sse_available_) {
254 // SSE detected.
256#endif
257#if defined(HAVE_NEON) || defined(__aarch64__)
258 } else if (neon_available_) {
259 // NEON detected.
261#endif
262 }
263
  // An explicit DOTPRODUCT environment variable takes precedence over the
  // automatic choice made above.
264 const char *dotproduct_env = getenv("DOTPRODUCT");
265 if (dotproduct_env != nullptr) {
266 // Override automatic settings by value from environment variable.
267 dotproduct = dotproduct_env;
268 Update();
269 }
270}
271
// NOTE(review): the line that opens this function (listing line 272) is
// missing from this listing. The cross-reference index at the end of the
// listing gives "static TESS_API void Update()" as defined at
// simddetect.cpp:272, so this is taken to be the body of SIMDDetect::Update().
// Confirm against the upstream file.
//
// The body compares the config variable dotproduct with the known method
// names and finally writes the name held in dotproduct_method back into the
// config variable.
273 // Select code for calculation of dot product based on the
274 // value of the config variable if that value is not empty.
  // dotproduct_method starts as "generic" and is only changed by the branches
  // that assign it below. For "auto", for "accelerate" and for an unsupported
  // value no branch assigns it, so "generic" is what gets written back at the
  // end of this body.
275 const char *dotproduct_method = "generic";
276 if (dotproduct == "auto") {
277 // Automatic detection. Nothing to be done.
278 } else if (dotproduct == "generic") {
279 // Generic code selected by config variable.
280 SetDotProduct(DotProductGeneric);
281 dotproduct_method = "generic";
  // NOTE(review): in the native, avx2, avx, fma, sse and neon branches only
  // the assignment of dotproduct_method is visible, and the embedded listing
  // numbers skip one line in each of them (284, 289, 295, 301, 307, 317).
  // The statement that installs the function was presumably lost when this
  // listing was extracted. Confirm against the upstream file.
282 } else if (dotproduct == "native") {
283 // Native optimized code selected by config variable.
285 dotproduct_method = "native";
286#if defined(HAVE_AVX2)
287 } else if (dotproduct == "avx2") {
288 // AVX2 selected by config variable.
290 dotproduct_method = "avx2";
291#endif
292#if defined(HAVE_AVX)
293 } else if (dotproduct == "avx") {
294 // AVX selected by config variable.
296 dotproduct_method = "avx";
297#endif
298#if defined(HAVE_FMA)
299 } else if (dotproduct == "fma") {
300 // FMA selected by config variable.
302 dotproduct_method = "fma";
303#endif
304#if defined(HAVE_SSE4_1)
305 } else if (dotproduct == "sse") {
306 // SSE selected by config variable.
308 dotproduct_method = "sse";
309#endif
310#if defined(HAVE_FRAMEWORK_ACCELERATE)
311 } else if (dotproduct == "accelerate") {
  // Accelerate framework selected by config variable. This branch does not
  // assign dotproduct_method.
312 SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
313#endif
314#if defined(HAVE_NEON) || defined(__aarch64__)
  // "neon" is only accepted when NEON was detected; otherwise the value falls
  // through to the remaining branches and ends in the warning below.
315 } else if (dotproduct == "neon" && neon_available_) {
316 // NEON selected by config variable.
318 dotproduct_method = "neon";
319#endif
320 } else if (dotproduct == "std::inner_product") {
321 // std::inner_product selected by config variable.
322 SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
323 dotproduct_method = "std::inner_product";
324 } else {
325 // Unsupported value of config variable.
  // The list of supported values is assembled at compile time from the same
  // HAVE_* macros that guard the branches above. It has no entry for "neon".
326 tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
327 dotproduct.c_str());
328 tprintf(
329 "Supported values for dotproduct: auto generic native"
330#if defined(HAVE_AVX2)
331 " avx2"
332#endif
333#if defined(HAVE_AVX)
334 " avx"
335#endif
336#if defined(HAVE_FMA)
337 " fma"
338#endif
339#if defined(HAVE_SSE4_1)
340 " sse"
341#endif
342#if defined(HAVE_FRAMEWORK_ACCELERATE)
343 " accelerate"
344#endif
345 " std::inner_product.\n");
346 }
347
  // Store the resulting method name in the config variable.
348 dotproduct.set_value(dotproduct_method);
349}
350
351} // namespace tesseract
#define STRING_VAR(name, val, comment)
Definition: params.h:363
TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n)
TFloat(*)(const TFloat *, const TFloat *, int) DotProductFunction
Definition: simddetect.h:26
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n)
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n)
Definition: dotproduct.cpp:22
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n)
DotProductFunction DotProduct
Definition: simddetect.cpp:80
double TFloat
Definition: tesstypes.h:39
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)
static const IntSimdMatrix intSimdMatrixAVX2
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
static const IntSimdMatrix intSimdMatrixNEON
static TESS_API void Update()
Definition: simddetect.cpp:272