19# include "config_auto.h"
28#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
31#undef HAVE_FRAMEWORK_ACCELERATE
34#if defined(HAVE_FRAMEWORK_ACCELERATE)
39#include <Accelerate/Accelerate.h>
43#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
56#if defined(HAVE_NEON) && !defined(__aarch64__)
57# if defined(HAVE_ANDROID_GETCPUFAMILY)
58# include <cpu-features.h>
59# elif defined(HAVE_GETAUXVAL)
60# include <asm/hwcap.h>
62# elif defined(HAVE_ELF_AUX_INFO)
82static STRING_VAR(dotproduct,
"auto",
"Function used for calculation of dot product");
86#if defined(__aarch64__)
88bool SIMDDetect::neon_available_ =
true;
89#elif defined(HAVE_NEON)
91bool SIMDDetect::neon_available_;
94bool SIMDDetect::avx_available_;
95bool SIMDDetect::avx2_available_;
96bool SIMDDetect::avx512F_available_;
97bool SIMDDetect::avx512BW_available_;
98bool SIMDDetect::avx512VNNI_available_;
100bool SIMDDetect::fma_available_;
102bool SIMDDetect::sse_available_;
// Fragment of a dot product routine built on Apple's Accelerate framework.
// NOTE(review): the function header, the declaration of `total`, the return
// statement and the closing #else/#endif lines are not part of this listing.
105#if defined(HAVE_FRAMEWORK_ACCELERATE)
// Both input vectors are walked with a stride of one element.
108 const int stride = 1;
109#if defined(FAST_FLOAT)
// vDSP_dotpr is the single-precision vDSP dot product.
110 vDSP_dotpr(u, stride, v, stride, &total, n);
// vDSP_dotprD is the double-precision variant; presumably this call sits
// under an #else that is missing here - TODO confirm.
112 vDSP_dotprD(u, stride, v, stride, &total, n);
// Fragment of a scalar dot product loop: adds u[k] * v[k] to `total` for
// k = 0 .. n-1.
// NOTE(review): the enclosing function header is not in this listing;
// presumably this is DotProductGeneric - confirm against the full file.
121 for (
int k = 0; k < n; ++k) {
122 total += u[k] * v[k];
// Fragment of a dot product routine that delegates to std::inner_product over
// the n elements starting at u and v. The initial value is cast to TFloat, so
// the accumulator type is TFloat.
// NOTE(review): the enclosing function header is not in this listing.
129 return std::inner_product(u, u + n, v,
static_cast<TFloat>(0));
// SIMDDetect constructor: passes DotProductGeneric to SetDotProduct, then
// fills the *_available_ flags from CPUID (x86) or from OS capability queries
// (32-bit ARM NEON), and finally walks an else-if chain over those flags.
// NOTE(review): this listing is incomplete - closing braces, #else/#endif
// lines, the xgetbv wrapper and the bodies of the final else-if chain are
// absent - so the comments below describe only the lines that are present.
142SIMDDetect::SIMDDetect() {
// Install the generic implementation before any detection runs.
144 SetDotProduct(DotProductGeneric);
146#if defined(HAS_CPUID)
147# if defined(__GNUC__)
// GCC/Clang path: query CPUID leaf 1; a zero return from __get_cpuid skips
// all of the x86 feature tests below.
148 unsigned int eax, ebx, ecx, edx;
149 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
152# if defined(HAVE_SSE4_1)
// Leaf 1 ECX bit 19 is used as the SSE4.1 flag.
153 sse_available_ = (ecx & 0x00080000) != 0;
155# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
// Reads extended control register 0 (ECX = 0) via the xgetbv instruction;
// only the low half (EAX) is kept, EDX is declared clobbered.
// Presumably this statement is the body of the xgetbv() helper called below.
158 __asm__(
"xgetbv" :
"=a"(xcr0) :
"c"(0) :
"%edx");
// Requires leaf 1 ECX bit 27 (OSXSAVE in the Intel SDM) and XCR0 bits 1 and 2
// (value 6) before any FMA/AVX flag is trusted.
// NOTE(review): only XCR0 bits 1-2 are tested in the visible code, yet the
// AVX-512 flags are set inside this block - confirm whether the AVX-512 state
// bits of XCR0 should be checked as well.
161 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
163# if defined(HAVE_FMA)
// Leaf 1 ECX bit 12 is used as the FMA flag.
164 fma_available_ = (ecx & 0x00001000) != 0;
166# if defined(HAVE_AVX)
// Leaf 1 ECX bit 28 is used as the AVX flag.
167 avx_available_ = (ecx & 0x10000000) != 0;
168 if (avx_available_) {
// Leaf 7, sub-leaf 0: this overwrites eax..edx, so ebx/ecx below hold leaf 7
// values, not leaf 1 values.
// NOTE(review): no maximum-leaf check is visible before this query, unlike the
// max_function_id >= 7 test in the _WIN32 branch - confirm.
172 __cpuid_count(7, 0, eax, ebx, ecx, edx);
// Leaf 7 EBX bits 5, 16 and 30 and ECX bit 11 are used as the AVX2, AVX512F,
// AVX512BW and AVX512VNNI flags respectively.
173 avx2_available_ = (ebx & 0x00000020) != 0;
174 avx512F_available_ = (ebx & 0x00010000) != 0;
175 avx512BW_available_ = (ebx & 0x40000000) != 0;
176 avx512VNNI_available_ = (ecx & 0x00000800) != 0;
182# elif defined(_WIN32)
// Windows path: the same bit tests as above, read from a cpuInfo array.
// cpuInfo[1] and cpuInfo[2] are used where the GCC path uses ebx and ecx.
// Presumably cpuInfo is filled by __cpuid calls on lines missing from this
// listing - TODO confirm.
186 max_function_id = cpuInfo[0];
187 if (max_function_id >= 1) {
189# if defined(HAVE_SSE4_1)
190 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
192# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
// Same OSXSAVE + XCR0 bits 1-2 gate as the GCC path, using the _xgetbv
// intrinsic.
193 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
195# if defined(HAVE_FMA)
196 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
198# if defined(HAVE_AVX)
199 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
201# if defined(HAVE_AVX2)
// The leaf 7 flags are only read when the reported maximum function id is at
// least 7.
202 if (max_function_id >= 7) {
204 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
205 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
206 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
207 avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0;
// Any other compiler with HAS_CPUID defined is rejected at compile time.
214# error "I don't know how to test for SIMD with this compiler"
// NEON needs a run-time probe only on non-AArch64 ARM builds; three
// OS-specific mechanisms are supported.
218#if defined(HAVE_NEON) && !defined(__aarch64__)
219# if defined(HAVE_ANDROID_GETCPUFAMILY)
// Android: the NEON feature bit is consulted only when the CPU family is ARM.
221 AndroidCpuFamily family = android_getCpuFamily();
222 if (family == ANDROID_CPU_FAMILY_ARM)
223 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
225# elif defined(HAVE_GETAUXVAL)
// getauxval: test HWCAP_NEON in the AT_HWCAP auxiliary vector entry.
226 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
227# elif defined(HAVE_ELF_AUX_INFO)
// elf_aux_info: same test; hwcap stays 0 (NEON off) if the call stores
// nothing, because its return value is not checked.
228 unsigned long hwcap = 0;
229 elf_aux_info(AT_HWCAP, &hwcap,
sizeof hwcap);
230 neon_available_ = hwcap & HWCAP_NEON;
// Selection chain: the flags are tested in the order AVX512F, AVX2, AVX, SSE,
// NEON, each branch compiled only when its HAVE_* macro is defined.
// NOTE(review): the opening `if` of the chain and every branch body are
// missing from this listing, so what each branch installs cannot be stated.
237#if defined(HAVE_AVX512F)
238 }
else if (avx512F_available_) {
242#if defined(HAVE_AVX2)
243 }
else if (avx2_available_) {
248 }
else if (avx_available_) {
252#if defined(HAVE_SSE4_1)
253 }
else if (sse_available_) {
257#if defined(HAVE_NEON) || defined(__aarch64__)
258 }
else if (neon_available_) {
// Body of the routine that applies the "dotproduct" setting (presumably
// SIMDDetect::Update(); the function header is not part of this listing).
// It maps the value of `dotproduct` to an implementation name and writes the
// name that was finally chosen back into `dotproduct`.
// NOTE(review): most branch bodies, the #endif lines and the final `else` are
// missing here; comments describe only the visible lines.
// The DOTPRODUCT environment variable, when set, replaces the configured value.
264 const char *dotproduct_env = getenv(
"DOTPRODUCT");
265 if (dotproduct_env !=
nullptr) {
267 dotproduct = dotproduct_env;
// Name reported back at the end; stays "generic" unless a branch below
// changes it.
275 const char *dotproduct_method =
"generic";
// "auto": no statement is visible in this branch.
276 if (dotproduct ==
"auto") {
278 }
else if (dotproduct ==
"generic") {
// "generic": explicitly install the generic implementation.
280 SetDotProduct(DotProductGeneric);
281 dotproduct_method =
"generic";
282 }
else if (dotproduct ==
"native") {
285 dotproduct_method =
"native";
// The ISA-specific names below exist only when the matching HAVE_* macro is
// defined at build time.
// NOTE(review): only the "neon" branch shows a run-time availability check;
// whether "avx2", "avx", "fma" and "sse" are guarded on lines missing from
// this listing should be confirmed.
286#if defined(HAVE_AVX2)
287 }
else if (dotproduct ==
"avx2") {
290 dotproduct_method =
"avx2";
293 }
else if (dotproduct ==
"avx") {
296 dotproduct_method =
"avx";
299 }
else if (dotproduct ==
"fma") {
302 dotproduct_method =
"fma";
304#if defined(HAVE_SSE4_1)
305 }
else if (dotproduct ==
"sse") {
308 dotproduct_method =
"sse";
310#if defined(HAVE_FRAMEWORK_ACCELERATE)
311 }
else if (dotproduct ==
"accelerate") {
314#if defined(HAVE_NEON) || defined(__aarch64__)
315 }
// "neon" is honoured only when the NEON flag is set; otherwise control falls
// through to the later branches.
else if (dotproduct ==
"neon" && neon_available_) {
318 dotproduct_method =
"neon";
320 }
else if (dotproduct ==
"std::inner_product") {
323 dotproduct_method =
"std::inner_product";
// Unrecognised value: print a warning followed by the list of supported
// names; the list is assembled from string literals selected by the same
// HAVE_* macros as the branches above.
326 tprintf(
"Warning, ignoring unsupported config variable value: dotproduct=%s\n",
329 "Supported values for dotproduct: auto generic native"
330#
if defined(HAVE_AVX2)
339#
if defined(HAVE_SSE4_1)
342#
if defined(HAVE_FRAMEWORK_ACCELERATE)
345 " std::inner_product.\n");
// Store the name of the method actually selected back into the variable.
348 dotproduct.set_value(dotproduct_method);
#define STRING_VAR(name, val, comment)
TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n)
TFloat(*)(const TFloat *, const TFloat *, int) DotProductFunction
void tprintf(const char *format,...)
TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n)
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n)
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n)
DotProductFunction DotProduct
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)
static const IntSimdMatrix intSimdMatrixAVX2
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
static const IntSimdMatrix intSimdMatrixNEON
static TESS_API void Update()