tesseract v5.3.3.20231005
openclwrapper.cpp
Go to the documentation of this file.
1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software
6// distributed under the License is distributed on an "AS IS" BASIS,
7// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8// See the License for the specific language governing permissions and
9// limitations under the License.
10
11// Include automatically generated configuration file
12#ifdef HAVE_CONFIG_H
13# include "config_auto.h"
14#endif
15
16#ifdef USE_OPENCL
17
18# ifdef _WIN32
19# include <io.h>
20# include <windows.h>
21# else
22# include <sys/types.h>
23# include <unistd.h>
24# endif
25# include <cfloat>
26# include <ctime> // for clock_gettime
27
28# include "oclkernels.h"
29# include "openclwrapper.h"
30
31// for micro-benchmark
32# include "otsuthr.h"
33# include "thresholder.h"
34
35// platform preprocessor commands
36# if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(__CYGWIN__) || \
37 defined(__MINGW32__)
38# define ON_WINDOWS 1
39# define ON_APPLE 0
40# elif defined(__linux__)
41# define ON_WINDOWS 0
42# define ON_APPLE 0
43# elif defined(__APPLE__)
44# define ON_WINDOWS 0
45# define ON_APPLE 1
46# else
47# define ON_WINDOWS 0
48# define ON_APPLE 0
49# endif
50
51# if ON_APPLE
52# include <mach/mach_time.h>
53# endif
54
55# include <cstdio>
56# include <cstdlib>
57# include <cstring> // for memset, strcpy, ...
58# include <vector>
59
60# include "errcode.h" // for ASSERT_HOST
61# include "image.h" // for Image
62
63namespace tesseract {
64
65GPUEnv OpenclDevice::gpuEnv;
66
67bool OpenclDevice::deviceIsSelected = false;
68ds_device OpenclDevice::selectedDevice;
69
70int OpenclDevice::isInited = 0;
71
72static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
73
74static const l_uint32 lmask32[] = {
75 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
76 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
77 0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
78 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
79
80static const l_uint32 rmask32[] = {
81 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
82 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
83 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
84 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
85
86static cl_mem pixsCLBuffer, pixdCLBuffer,
87 pixdCLIntermediate; // Morph operations buffers
88static cl_mem pixThBuffer; // output from thresholdtopix calculation
89static cl_int clStatus;
90static KernelEnv rEnv;
91
92# define DS_TAG_VERSION "<version>"
93# define DS_TAG_VERSION_END "</version>"
94# define DS_TAG_DEVICE "<device>"
95# define DS_TAG_DEVICE_END "</device>"
96# define DS_TAG_SCORE "<score>"
97# define DS_TAG_SCORE_END "</score>"
98# define DS_TAG_DEVICE_TYPE "<type>"
99# define DS_TAG_DEVICE_TYPE_END "</type>"
100# define DS_TAG_DEVICE_NAME "<name>"
101# define DS_TAG_DEVICE_NAME_END "</name>"
102# define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
103# define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
104
105# define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
106
107# define DS_DEVICE_NAME_LENGTH 256
108
109enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
110
111struct ds_profile {
112 std::vector<ds_device> devices;
113 unsigned int numDevices;
114 const char *version;
115};
116
117enum ds_status {
118 DS_SUCCESS = 0,
119 DS_INVALID_PROFILE = 1000,
120 DS_MEMORY_ERROR,
121 DS_INVALID_PERF_EVALUATOR_TYPE,
122 DS_INVALID_PERF_EVALUATOR,
123 DS_PERF_EVALUATOR_ERROR,
124 DS_FILE_ERROR,
125 DS_UNKNOWN_DEVICE_TYPE,
126 DS_PROFILE_FILE_ERROR,
127 DS_SCORE_SERIALIZER_ERROR,
128 DS_SCORE_DESERIALIZER_ERROR
129};
130
131// Pointer to a function that calculates the score of a device (ex:
132// device->score) update the data size of score. The encoding and the format
133// of the score data is implementation defined. The function should return
134// DS_SUCCESS if there's no error to be reported.
135typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
136
137// deallocate memory used by score
138typedef ds_status (*ds_score_release)(TessDeviceScore *score);
139
140static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
141 ds_status status = DS_SUCCESS;
142 if (profile != nullptr) {
143 if (sr != nullptr) {
144 unsigned int i;
145 for (i = 0; i < profile->numDevices; i++) {
146 free(profile->devices[i].oclDeviceName);
147 free(profile->devices[i].oclDriverVersion);
148 status = sr(profile->devices[i].score);
149 if (status != DS_SUCCESS)
150 break;
151 }
152 }
153 delete profile;
154 }
155 return status;
156}
157
158static ds_status initDSProfile(ds_profile **p, const char *version) {
159 int numDevices;
160 cl_uint numPlatforms;
161 std::vector<cl_platform_id> platforms;
162 std::vector<cl_device_id> devices;
163 ds_status status = DS_SUCCESS;
164 unsigned int next;
165 unsigned int i;
166
167 if (p == nullptr)
168 return DS_INVALID_PROFILE;
169
170 ds_profile *profile = new ds_profile;
171
172 memset(profile, 0, sizeof(ds_profile));
173
174 clGetPlatformIDs(0, nullptr, &numPlatforms);
175
176 if (numPlatforms > 0) {
177 platforms.resize(numPlatforms);
178 clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
179 }
180
181 numDevices = 0;
182 for (i = 0; i < numPlatforms; i++) {
183 cl_uint num;
184 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
185 numDevices += num;
186 }
187
188 if (numDevices > 0) {
189 devices.resize(numDevices);
190 }
191
192 profile->numDevices = numDevices + 1; // +1 to numDevices to include the native CPU
193 profile->devices.resize(profile->numDevices);
194
195 next = 0;
196 for (i = 0; i < numPlatforms; i++) {
197 cl_uint num;
198 unsigned j;
199 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
200 for (j = 0; j < num; j++, next++) {
201 char buffer[DS_DEVICE_NAME_LENGTH];
202 size_t length;
203
204 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
205 profile->devices[next].oclDeviceID = devices[j];
206
207 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH,
208 &buffer, nullptr);
209 length = strlen(buffer);
210 profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
211 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
212
213 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH,
214 &buffer, nullptr);
215 length = strlen(buffer);
216 profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
217 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
218 }
219 }
220 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
221 profile->version = version;
222
223 *p = profile;
224 return status;
225}
226
227static ds_status profileDevices(ds_profile *profile, const ds_evaluation_type type,
228 ds_perf_evaluator evaluator, void *evaluatorData,
229 unsigned int *numUpdates) {
230 ds_status status = DS_SUCCESS;
231 unsigned int i;
232 unsigned int updates = 0;
233
234 if (profile == nullptr) {
235 return DS_INVALID_PROFILE;
236 }
237 if (evaluator == nullptr) {
238 return DS_INVALID_PERF_EVALUATOR;
239 }
240
241 for (i = 0; i < profile->numDevices; i++) {
242 ds_status evaluatorStatus;
243
244 switch (type) {
245 case DS_EVALUATE_NEW_ONLY:
246 if (profile->devices[i].score != nullptr)
247 break;
248 // else fall through
249 case DS_EVALUATE_ALL:
250 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
251 if (evaluatorStatus != DS_SUCCESS) {
252 status = evaluatorStatus;
253 return status;
254 }
255 updates++;
256 break;
257 default:
258 return DS_INVALID_PERF_EVALUATOR_TYPE;
259 break;
260 };
261 }
262 if (numUpdates)
263 *numUpdates = updates;
264 return status;
265}
266
267static const char *findString(const char *contentStart, const char *contentEnd,
268 const char *string) {
269 size_t stringLength;
270 const char *currentPosition;
271 const char *found = nullptr;
272 stringLength = strlen(string);
273 currentPosition = contentStart;
274 for (currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
275 if (*currentPosition == string[0]) {
276 if (currentPosition + stringLength < contentEnd) {
277 if (strncmp(currentPosition, string, stringLength) == 0) {
278 found = currentPosition;
279 break;
280 }
281 }
282 }
283 }
284 return found;
285}
286
287static ds_status readProFile(const char *fileName, char **content, size_t *contentSize) {
288 *contentSize = 0;
289 *content = nullptr;
290 ds_status status = DS_SUCCESS;
291 FILE *input = fopen(fileName, "rb");
292 if (input == nullptr) {
293 status = DS_FILE_ERROR;
294 } else {
295 fseek(input, 0L, SEEK_END);
296 auto pos = std::ftell(input);
297 rewind(input);
298 if (pos > 0) {
299 size_t size = pos;
300 char *binary = new char[size];
301 if (fread(binary, sizeof(char), size, input) != size) {
302 status = DS_FILE_ERROR;
303 delete[] binary;
304 } else {
305 *contentSize = size;
306 *content = binary;
307 }
308 }
309 fclose(input);
310 }
311 return status;
312}
313
314typedef ds_status (*ds_score_deserializer)(ds_device *device, const uint8_t *serializedScore,
315 unsigned int serializedScoreSize);
316
317static ds_status readProfileFromFile(ds_profile *profile, ds_score_deserializer deserializer,
318 const char *file) {
319 ds_status status = DS_SUCCESS;
320 char *contentStart;
321 size_t contentSize;
322
323 if (profile == nullptr)
324 return DS_INVALID_PROFILE;
325
326 status = readProFile(file, &contentStart, &contentSize);
327 if (status == DS_SUCCESS) {
328 const char *currentPosition;
329 const char *dataStart;
330 const char *dataEnd;
331
332 const char *contentEnd = contentStart + contentSize;
333 currentPosition = contentStart;
334
335 // parse the version string
336 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
337 if (dataStart == nullptr) {
338 status = DS_PROFILE_FILE_ERROR;
339 goto cleanup;
340 }
341 dataStart += strlen(DS_TAG_VERSION);
342
343 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
344 if (dataEnd == nullptr) {
345 status = DS_PROFILE_FILE_ERROR;
346 goto cleanup;
347 }
348
349 size_t versionStringLength = strlen(profile->version);
350 if (versionStringLength + dataStart != dataEnd ||
351 strncmp(profile->version, dataStart, versionStringLength) != 0) {
352 // version mismatch
353 status = DS_PROFILE_FILE_ERROR;
354 goto cleanup;
355 }
356 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
357
358 // parse the device information
359 while (1) {
360 unsigned int i;
361
362 const char *deviceTypeStart;
363 const char *deviceTypeEnd;
364 ds_device_type deviceType;
365
366 const char *deviceNameStart;
367 const char *deviceNameEnd;
368
369 const char *deviceScoreStart;
370 const char *deviceScoreEnd;
371
372 const char *deviceDriverStart;
373 const char *deviceDriverEnd;
374
375 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
376 if (dataStart == nullptr) {
377 // nothing useful remain, quit...
378 break;
379 }
380 dataStart += strlen(DS_TAG_DEVICE);
381 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
382 if (dataEnd == nullptr) {
383 status = DS_PROFILE_FILE_ERROR;
384 goto cleanup;
385 }
386
387 // parse the device type
388 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
389 if (deviceTypeStart == nullptr) {
390 status = DS_PROFILE_FILE_ERROR;
391 goto cleanup;
392 }
393 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
394 deviceTypeEnd = findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
395 if (deviceTypeEnd == nullptr) {
396 status = DS_PROFILE_FILE_ERROR;
397 goto cleanup;
398 }
399 memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
400
401 // parse the device name
402 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
403 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
404 if (deviceNameStart == nullptr) {
405 status = DS_PROFILE_FILE_ERROR;
406 goto cleanup;
407 }
408 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
409 deviceNameEnd = findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
410 if (deviceNameEnd == nullptr) {
411 status = DS_PROFILE_FILE_ERROR;
412 goto cleanup;
413 }
414
415 deviceDriverStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
416 if (deviceDriverStart == nullptr) {
417 status = DS_PROFILE_FILE_ERROR;
418 goto cleanup;
419 }
420 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
421 deviceDriverEnd =
422 findString(deviceDriverStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION_END);
423 if (deviceDriverEnd == nullptr) {
424 status = DS_PROFILE_FILE_ERROR;
425 goto cleanup;
426 }
427
428 // check if this device is on the system
429 for (i = 0; i < profile->numDevices; i++) {
430 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
431 size_t actualDeviceNameLength;
432 size_t driverVersionLength;
433
434 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
435 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
436 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
437 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
438 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
439 actualDeviceNameLength) == 0 &&
440 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
441 driverVersionLength) == 0) {
442 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
443 deviceScoreStart += strlen(DS_TAG_SCORE);
444 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
445 status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
446 deviceScoreEnd - deviceScoreStart);
447 if (status != DS_SUCCESS) {
448 goto cleanup;
449 }
450 }
451 }
452 }
453 } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
454 for (i = 0; i < profile->numDevices; i++) {
455 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
456 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
457 if (deviceScoreStart == nullptr) {
458 status = DS_PROFILE_FILE_ERROR;
459 goto cleanup;
460 }
461 deviceScoreStart += strlen(DS_TAG_SCORE);
462 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
463 status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
464 deviceScoreEnd - deviceScoreStart);
465 if (status != DS_SUCCESS) {
466 goto cleanup;
467 }
468 }
469 }
470 }
471
472 // skip over the current one to find the next device
473 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
474 }
475 }
476cleanup:
477 delete[] contentStart;
478 return status;
479}
480
481typedef ds_status (*ds_score_serializer)(ds_device *device, uint8_t **serializedScore,
482 unsigned int *serializedScoreSize);
483static ds_status writeProfileToFile(ds_profile *profile, ds_score_serializer serializer,
484 const char *file) {
485 ds_status status = DS_SUCCESS;
486
487 if (profile == nullptr)
488 return DS_INVALID_PROFILE;
489
490 FILE *profileFile = fopen(file, "wb");
491 if (profileFile == nullptr) {
492 status = DS_FILE_ERROR;
493 } else {
494 unsigned int i;
495
496 // write version string
497 fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
498 fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
499 fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
500 fwrite("\n", sizeof(char), 1, profileFile);
501
502 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
503 uint8_t *serializedScore;
504 unsigned int serializedScoreSize;
505
506 fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
507
508 fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
509 fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
510 fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
511
512 switch (profile->devices[i].type) {
513 case DS_DEVICE_NATIVE_CPU: {
514 // There's no need to emit a device name for the native CPU device.
515 /*
516fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
517 profileFile);
518fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
519 strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
520fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
521 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
522*/
523 } break;
524 case DS_DEVICE_OPENCL_DEVICE: {
525 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), profileFile);
526 fwrite(profile->devices[i].oclDeviceName, sizeof(char),
527 strlen(profile->devices[i].oclDeviceName), profileFile);
528 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
529
530 fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), strlen(DS_TAG_DEVICE_DRIVER_VERSION),
531 profileFile);
532 fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
533 strlen(profile->devices[i].oclDriverVersion), profileFile);
534 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
535 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
536 } break;
537 default:
538 status = DS_UNKNOWN_DEVICE_TYPE;
539 continue;
540 };
541
542 fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
543 status = serializer(&profile->devices[i], &serializedScore, &serializedScoreSize);
544 if (status == DS_SUCCESS && serializedScore != nullptr && serializedScoreSize > 0) {
545 fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
546 delete[] serializedScore;
547 }
548 fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
549 fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
550 fwrite("\n", sizeof(char), 1, profileFile);
551 }
552 fclose(profileFile);
553 }
554 return status;
555}
556
557// substitute invalid characters in device name with _
558static void legalizeFileName(char *fileName) {
559 // tprintf("fileName: %s\n", fileName);
560 const char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
561 // for each invalid char
562 for (unsigned i = 0; i < strlen(invalidChars); i++) {
563 char invalidStr[4];
564 invalidStr[0] = invalidChars[i];
565 invalidStr[1] = '\0';
566 // tprintf("eliminating %s\n", invalidStr);
567 // char *pos = strstr(fileName, invalidStr);
568 // initial ./ is valid for present directory
569 // if (*pos == '.') pos++;
570 // if (*pos == '/') pos++;
571 for (char *pos = strstr(fileName, invalidStr); pos != nullptr;
572 pos = strstr(pos + 1, invalidStr)) {
573 // tprintf("\tfound: %s, ", pos);
574 pos[0] = '_';
575 // tprintf("fileName: %s\n", fileName);
576 }
577 }
578}
579
580static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
581 // tprintf("[DS] populateGPUEnvFromDevice\n");
582 size_t size;
583 gpuInfo->mnIsUserCreated = 1;
584 // device
585 gpuInfo->mpDevID = device;
586 gpuInfo->mpArryDevsID = new cl_device_id[1];
587 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
588 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, sizeof(cl_device_type),
589 &gpuInfo->mDevType, &size);
590 CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
591 // platform
592 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
593 &gpuInfo->mpPlatformID, &size);
594 CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
595 // context
596 cl_context_properties props[3];
597 props[0] = CL_CONTEXT_PLATFORM;
598 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
599 props[2] = 0;
600 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
601 CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
602 // queue
603 cl_command_queue_properties queueProperties = 0;
604 gpuInfo->mpCmdQueue =
605 clCreateCommandQueue(gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
606 CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
607}
608
609int OpenclDevice::LoadOpencl() {
610# ifdef WIN32
611 HINSTANCE HOpenclDll = nullptr;
612 void *OpenclDll = nullptr;
613 // fprintf(stderr, " LoadOpenclDllxx... \n");
614 OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
615 OpenclDll = LoadLibrary("openCL.dll");
616 if (!static_cast<HINSTANCE>(OpenclDll)) {
617 fprintf(stderr, "[OD] Load opencl.dll failed!\n");
618 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
619 return 0;
620 }
621 fprintf(stderr, "[OD] Load opencl.dll successful!\n");
622# endif
623 return 1;
624}
625int OpenclDevice::SetKernelEnv(KernelEnv *envInfo) {
626 envInfo->mpkContext = gpuEnv.mpContext;
627 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
628 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
629
630 return 1;
631}
632
633static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer, size_t nElements,
634 cl_mem_flags flags, cl_int *pStatus) {
635 cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
636 nElements * sizeof(l_uint32), hostbuffer, pStatus);
637
638 return membuffer;
639}
640
641static Image mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Image pixd, Image pixs,
642 int elements, cl_mem_flags flags, bool memcopy = false,
643 bool sync = true) {
644 if (!pixd) {
645 if (memcopy) {
646 if ((pixd = pixCreateTemplate(pixs)) == nullptr)
647 tprintf("pixd not made\n");
648 } else {
649 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) ==
650 nullptr)
651 tprintf("pixd not made\n");
652 }
653 }
654 l_uint32 *pValues =
655 (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
656 elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
657
658 if (memcopy) {
659 memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
660 } else {
661 pixSetData(pixd, pValues);
662 }
663
664 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr, nullptr);
665
666 if (sync) {
667 clFinish(rEnv.mpkCmdQueue);
668 }
669
670 return pixd;
671}
672
673void OpenclDevice::releaseMorphCLBuffers() {
674 if (pixdCLIntermediate != nullptr)
675 clReleaseMemObject(pixdCLIntermediate);
676 if (pixsCLBuffer != nullptr)
677 clReleaseMemObject(pixsCLBuffer);
678 if (pixdCLBuffer != nullptr)
679 clReleaseMemObject(pixdCLBuffer);
680 if (pixThBuffer != nullptr)
681 clReleaseMemObject(pixThBuffer);
682 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
683}
684
685int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Image pixs) {
686 SetKernelEnv(&rEnv);
687
688 if (pixThBuffer != nullptr) {
689 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
690
691 // Get the output from ThresholdToPix operation
692 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
693 sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
694 } else {
695 // Get data from the source image
696 l_uint32 *srcdata = reinterpret_cast<l_uint32 *>(malloc(wpl * h * sizeof(l_uint32)));
697 memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
698
699 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, CL_MEM_USE_HOST_PTR, &clStatus);
700 }
701
702 pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
703
704 pixdCLIntermediate =
705 allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
706
707 return (int)clStatus;
708}
709
710int OpenclDevice::InitEnv() {
711// tprintf("[OD] OpenclDevice::InitEnv()\n");
712# ifdef SAL_WIN32
713 while (1) {
714 if (1 == LoadOpencl())
715 break;
716 }
717# endif
718 // sets up environment, compiles programs
719
720 InitOpenclRunEnv_DeviceSelection(0);
721 return 1;
722}
723
724int OpenclDevice::ReleaseOpenclRunEnv() {
725 ReleaseOpenclEnv(&gpuEnv);
726# ifdef SAL_WIN32
727 FreeOpenclDll();
728# endif
729 return 1;
730}
731
732inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
733 ASSERT_HOST(kCount > 0);
734 ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
735 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
736 gpuEnv.mnKernelCount++;
737 return 0;
738}
739
740int OpenclDevice::RegistOpenclKernel() {
741 if (!gpuEnv.mnIsUserCreated)
742 memset(&gpuEnv, 0, sizeof(gpuEnv));
743
744 gpuEnv.mnFileCount = 0; // argc;
745 gpuEnv.mnKernelCount = 0UL;
746
747 AddKernelConfig(1, "oclAverageSub1");
748 return 0;
749}
750
751int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) {
752 if (!isInited) {
753 // after programs compiled, selects best device
754 ds_device bestDevice_DS = getDeviceSelection();
755 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
756 // overwrite global static GPUEnv with new device
757 if (selectedDeviceIsOpenCL()) {
758 // tprintf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
759 // for selected device\n");
760 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
761 gpuEnv.mnFileCount = 0; // argc;
762 gpuEnv.mnKernelCount = 0UL;
763 CompileKernelFile(&gpuEnv, "");
764 } else {
765 // tprintf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
766 // b/c native cpu selected\n");
767 }
768 isInited = 1;
769 }
770 return 0;
771}
772
773OpenclDevice::OpenclDevice() {
774 // InitEnv();
775}
776
777OpenclDevice::~OpenclDevice() {
778 // ReleaseOpenclRunEnv();
779}
780
781int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
782 int i = 0;
783 int clStatus = 0;
784
785 if (!isInited) {
786 return 1;
787 }
788
789 for (i = 0; i < gpuEnv.mnFileCount; i++) {
790 if (gpuEnv.mpArryPrograms[i]) {
791 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
792 CHECK_OPENCL(clStatus, "clReleaseProgram");
793 gpuEnv.mpArryPrograms[i] = nullptr;
794 }
795 }
796 if (gpuEnv.mpCmdQueue) {
797 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
798 gpuEnv.mpCmdQueue = nullptr;
799 }
800 if (gpuEnv.mpContext) {
801 clReleaseContext(gpuEnv.mpContext);
802 gpuEnv.mpContext = nullptr;
803 }
804 isInited = 0;
805 gpuInfo->mnIsUserCreated = 0;
806 delete[] gpuInfo->mpArryDevsID;
807 return 1;
808}
809int OpenclDevice::BinaryGenerated(const char *clFileName, FILE **fhandle) {
810 unsigned int i = 0;
811 cl_int clStatus;
812 int status = 0;
813 FILE *fd = nullptr;
814 char fileName[256];
815 char cl_name[128];
816 char deviceName[1024];
817 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName,
818 nullptr);
819 CHECK_OPENCL(clStatus, "clGetDeviceInfo");
820 const char *str = strstr(clFileName, ".cl");
821 memcpy(cl_name, clFileName, str - clFileName);
822 cl_name[str - clFileName] = '\0';
823 snprintf(fileName, sizeof(fileName), "%s-%s.bin", cl_name, deviceName);
824 legalizeFileName(fileName);
825 fd = fopen(fileName, "rb");
826 status = (fd != nullptr) ? 1 : 0;
827 if (fd != nullptr) {
828 *fhandle = fd;
829 }
830 return status;
831}
832int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached, const char *clFileName) {
833 int i;
834 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
835 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
836 if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
837 return 1;
838 }
839 }
840 }
841
842 return 0;
843}
844int OpenclDevice::WriteBinaryToFile(const char *fileName, const char *birary, size_t numBytes) {
845 FILE *output = nullptr;
846 output = fopen(fileName, "wb");
847 if (output == nullptr) {
848 return 0;
849 }
850
851 fwrite(birary, sizeof(char), numBytes, output);
852 fclose(output);
853
854 return 1;
855}
856
857int OpenclDevice::GeneratBinFromKernelSource(cl_program program, const char *clFileName) {
858 unsigned int i = 0;
859 cl_int clStatus;
860 cl_uint numDevices;
861
862 clStatus =
863 clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, nullptr);
864 CHECK_OPENCL(clStatus, "clGetProgramInfo");
865
866 std::vector<cl_device_id> mpArryDevsID(numDevices);
867
868 /* grab the handles to all of the devices in the program. */
869 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices,
870 &mpArryDevsID[0], nullptr);
871 CHECK_OPENCL(clStatus, "clGetProgramInfo");
872
873 /* figure out the sizes of each of the binaries. */
874 std::vector<size_t> binarySizes(numDevices);
875
876 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices,
877 &binarySizes[0], nullptr);
878 CHECK_OPENCL(clStatus, "clGetProgramInfo");
879
880 /* copy over all of the generated binaries. */
881 std::vector<char *> binaries(numDevices);
882
883 for (i = 0; i < numDevices; i++) {
884 if (binarySizes[i] != 0) {
885 binaries[i] = new char[binarySizes[i]];
886 } else {
887 binaries[i] = nullptr;
888 }
889 }
890
891 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices,
892 &binaries[0], nullptr);
893 CHECK_OPENCL(clStatus, "clGetProgramInfo");
894
895 /* dump out each binary into its own separate file. */
896 for (i = 0; i < numDevices; i++) {
897 if (binarySizes[i] != 0) {
898 char fileName[256];
899 char cl_name[128];
900 char deviceName[1024];
901 clStatus =
902 clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
903 CHECK_OPENCL(clStatus, "clGetDeviceInfo");
904
905 const char *str = strstr(clFileName, ".cl");
906 memcpy(cl_name, clFileName, str - clFileName);
907 cl_name[str - clFileName] = '\0';
908 snprintf(fileName, sizeof(fileName), "%s-%s.bin", cl_name, deviceName);
909 legalizeFileName(fileName);
910 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
911 tprintf("[OD] write binary[%s] failed\n", fileName);
912 return 0;
913 } // else
914 tprintf("[OD] write binary[%s] successfully\n", fileName);
915 }
916 }
917
918 // Release all resources and memory
919 for (i = 0; i < numDevices; i++) {
920 delete[] binaries[i];
921 }
922
923 return 1;
924}
925
926int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
927 cl_int clStatus = 0;
928 const char *source;
929 size_t source_size[1];
930 int binary_status, binaryExisted, idx;
931 cl_uint numDevices;
932 FILE *fd, *fd1;
933 const char *filename = "kernel.cl";
934 // fprintf(stderr, "[OD] CompileKernelFile ... \n");
935 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
936 return 1;
937 }
938
939 idx = gpuInfo->mnFileCount;
940
941 source = kernel_src;
942
943 source_size[0] = strlen(source);
944 binaryExisted = 0;
945 binaryExisted = BinaryGenerated(filename, &fd); // don't check for binary during microbenchmark
946 if (binaryExisted == 1) {
947 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices),
948 &numDevices, nullptr);
949 CHECK_OPENCL(clStatus, "clGetContextInfo");
950
951 std::vector<cl_device_id> mpArryDevsID(numDevices);
952 bool b_error = fseek(fd, 0, SEEK_END) < 0;
953 auto pos = std::ftell(fd);
954 b_error |= (pos <= 0);
955 size_t length = pos;
956 b_error |= fseek(fd, 0, SEEK_SET) < 0;
957 if (b_error) {
958 fclose(fd);
959 return 0;
960 }
961
962 std::vector<uint8_t> binary(length + 2);
963
964 memset(&binary[0], 0, length + 2);
965 b_error |= fread(&binary[0], 1, length, fd) != length;
966
967 fclose(fd);
968 fd = nullptr;
969 // grab the handles to all of the devices in the context.
970 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
971 sizeof(cl_device_id) * numDevices, &mpArryDevsID[0], nullptr);
972 CHECK_OPENCL(clStatus, "clGetContextInfo");
973 // fprintf(stderr, "[OD] Create kernel from binary\n");
974 const uint8_t *c_binary = &binary[0];
975 gpuInfo->mpArryPrograms[idx] =
976 clCreateProgramWithBinary(gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length,
977 &c_binary, &binary_status, &clStatus);
978 CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
979 } else {
980 // create a CL program using the kernel source
981 // fprintf(stderr, "[OD] Create kernel from source\n");
982 gpuInfo->mpArryPrograms[idx] =
983 clCreateProgramWithSource(gpuInfo->mpContext, 1, &source, source_size, &clStatus);
984 CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
985 }
986
987 if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
988 return 0;
989 }
990
991 // char options[512];
992 // create a cl program executable for all the devices specified
993 // tprintf("[OD] BuildProgram.\n");
994 if (!gpuInfo->mnIsUserCreated) {
995 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption,
996 nullptr, nullptr);
997 } else {
998 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption,
999 nullptr, nullptr);
1000 }
1001 if (clStatus != CL_SUCCESS) {
1002 tprintf("BuildProgram error!\n");
1003 size_t length;
1004 if (!gpuInfo->mnIsUserCreated) {
1005 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1006 CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1007 } else {
1008 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1009 CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1010 }
1011 if (clStatus != CL_SUCCESS) {
1012 tprintf("opencl create build log fail\n");
1013 return 0;
1014 }
1015 std::vector<char> buildLog(length);
1016 if (!gpuInfo->mnIsUserCreated) {
1017 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1018 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1019 } else {
1020 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1021 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1022 }
1023 if (clStatus != CL_SUCCESS) {
1024 tprintf("opencl program build info fail\n");
1025 return 0;
1026 }
1027
1028 fd1 = fopen("kernel-build.log", "w+");
1029 if (fd1 != nullptr) {
1030 fwrite(&buildLog[0], sizeof(char), length, fd1);
1031 fclose(fd1);
1032 }
1033
1034 return 0;
1035 }
1036
1037 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1038 if (binaryExisted == 0) {
1039 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1040 }
1041
1042 gpuInfo->mnFileCount += 1;
1043 return 1;
1044}
1045
1046l_uint32 *OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata, l_int32 w, l_int32 h, l_int32 wpl,
1047 l_uint32 *line) {
1048 cl_int clStatus;
1049 KernelEnv rEnv;
1050 size_t globalThreads[2];
1051 size_t localThreads[2];
1052 int gsize;
1053 cl_mem valuesCl;
1054 cl_mem outputCl;
1055
1056 // global and local work dimensions for Horizontal pass
1057 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1058 globalThreads[0] = gsize;
1059 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1060 globalThreads[1] = gsize;
1061 localThreads[0] = GROUPSIZE_X;
1062 localThreads[1] = GROUPSIZE_Y;
1063
1064 SetKernelEnv(&rEnv);
1065
1066 l_uint32 *pResult = (l_uint32 *)malloc(w * h * sizeof(l_uint32));
1067 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1068 CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1069
1070 // Allocate input and output OCL buffers
1071 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1072 &clStatus);
1073 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
1074 &clStatus);
1075
1076 // Kernel arguments
1077 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1078 CHECK_OPENCL(clStatus, "clSetKernelArg");
1079 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1080 CHECK_OPENCL(clStatus, "clSetKernelArg");
1081 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1082 CHECK_OPENCL(clStatus, "clSetKernelArg");
1083 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1084 CHECK_OPENCL(clStatus, "clSetKernelArg");
1085 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1086 CHECK_OPENCL(clStatus, "clSetKernelArg");
1087
1088 // Kernel enqueue
1089 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1090 localThreads, 0, nullptr, nullptr);
1091 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1092
1093 /* map results back from gpu */
1094 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0,
1095 w * h * sizeof(l_uint32), 0, nullptr, nullptr, &clStatus);
1096 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1097 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1098
1099 // Sync
1100 clFinish(rEnv.mpkCmdQueue);
1101 return pResult;
1102}
1103
1104// Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
1105// OpenCL kernels
1106static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1107 size_t globalThreads[2];
1108 cl_mem pixtemp;
1109 cl_int status;
1110 int gsize;
1111 size_t localThreads[2];
1112
1113 // Horizontal pass
1114 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1115 globalThreads[0] = gsize;
1116 globalThreads[1] = GROUPSIZE_HMORY;
1117 localThreads[0] = GROUPSIZE_HMORX;
1118 localThreads[1] = GROUPSIZE_HMORY;
1119
1120 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1121 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1122
1123 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1124 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1125 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1126 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1127
1128 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1129 localThreads, 0, nullptr, nullptr);
1130
1131 // Swap source and dest buffers
1132 pixtemp = pixsCLBuffer;
1133 pixsCLBuffer = pixdCLBuffer;
1134 pixdCLBuffer = pixtemp;
1135
1136 // Vertical
1137 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1138 globalThreads[0] = gsize;
1139 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1140 globalThreads[1] = gsize;
1141 localThreads[0] = GROUPSIZE_X;
1142 localThreads[1] = GROUPSIZE_Y;
1143
1144 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1145 CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1146
1147 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1148 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1149 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1150 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1151 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1152 localThreads, 0, nullptr, nullptr);
1153
1154 return status;
1155}
1156
1157// Morphology Erode operation for 5x5 structuring element. Invokes the relevant
1158// OpenCL kernels
1159static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1160 size_t globalThreads[2];
1161 cl_mem pixtemp;
1162 cl_int status;
1163 int gsize;
1164 l_uint32 fwmask, lwmask;
1165 size_t localThreads[2];
1166
1167 lwmask = lmask32[31 - 2];
1168 fwmask = rmask32[31 - 2];
1169
1170 // Horizontal pass
1171 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1172 globalThreads[0] = gsize;
1173 globalThreads[1] = GROUPSIZE_HMORY;
1174 localThreads[0] = GROUPSIZE_HMORX;
1175 localThreads[1] = GROUPSIZE_HMORY;
1176
1177 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1178 CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1179
1180 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1181 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1182 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1183 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1184
1185 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1186 localThreads, 0, nullptr, nullptr);
1187
1188 // Swap source and dest buffers
1189 pixtemp = pixsCLBuffer;
1190 pixsCLBuffer = pixdCLBuffer;
1191 pixdCLBuffer = pixtemp;
1192
1193 // Vertical
1194 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1195 globalThreads[0] = gsize;
1196 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1197 globalThreads[1] = gsize;
1198 localThreads[0] = GROUPSIZE_X;
1199 localThreads[1] = GROUPSIZE_Y;
1200
1201 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1202 CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1203
1204 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1205 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1206 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1207 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1208 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1209 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1210 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1211 localThreads, 0, nullptr, nullptr);
1212
1213 return status;
1214}
1215
1216// Morphology Dilate operation. Invokes the relevant OpenCL kernels
1217static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1218 l_int32 xp, yp, xn, yn;
1219 SEL *sel;
1220 size_t globalThreads[2];
1221 cl_mem pixtemp;
1222 cl_int status = 0;
1223 int gsize;
1224 size_t localThreads[2];
1225 char isEven;
1226
1227 OpenclDevice::SetKernelEnv(&rEnv);
1228
1229 if (hsize == 5 && vsize == 5) {
1230 // Specific case for 5x5
1231 status = pixDilateCL_55(wpl, h);
1232 return status;
1233 }
1234
1235 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1236
1237 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1238 selDestroy(&sel);
1239 // global and local work dimensions for Horizontal pass
1240 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1241 globalThreads[0] = gsize;
1242 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1243 globalThreads[1] = gsize;
1244 localThreads[0] = GROUPSIZE_X;
1245 localThreads[1] = GROUPSIZE_Y;
1246
1247 if (xp > 31 || xn > 31) {
1248 // Generic case.
1249 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1250 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1251
1252 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1253 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1254 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1255 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1256 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1257 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1258 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1259 localThreads, 0, nullptr, nullptr);
1260
1261 if (yp > 0 || yn > 0) {
1262 pixtemp = pixsCLBuffer;
1263 pixsCLBuffer = pixdCLBuffer;
1264 pixdCLBuffer = pixtemp;
1265 }
1266 } else if (xp > 0 || xn > 0) {
1267 // Specific Horizontal pass kernel for half width < 32
1268 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1269 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1270 isEven = (xp != xn);
1271
1272 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1273 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1274 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1275 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1276 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1277 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1278 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1279 localThreads, 0, nullptr, nullptr);
1280
1281 if (yp > 0 || yn > 0) {
1282 pixtemp = pixsCLBuffer;
1283 pixsCLBuffer = pixdCLBuffer;
1284 pixdCLBuffer = pixtemp;
1285 }
1286 }
1287
1288 if (yp > 0 || yn > 0) {
1289 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1290 CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1291
1292 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1293 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1294 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1295 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1296 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1297 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1298 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1299 localThreads, 0, nullptr, nullptr);
1300 }
1301
1302 return status;
1303}
1304
1305// Morphology Erode operation. Invokes the relevant OpenCL kernels
1306static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1307 l_int32 xp, yp, xn, yn;
1308 SEL *sel;
1309 size_t globalThreads[2];
1310 size_t localThreads[2];
1311 cl_mem pixtemp;
1312 cl_int status = 0;
1313 int gsize;
1314 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1315 l_uint32 rwmask, lwmask;
1316 char isEven;
1317
1318 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1319
1320 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1321 selDestroy(&sel);
1322 OpenclDevice::SetKernelEnv(&rEnv);
1323
1324 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1325 // Specific kernel for 5x5
1326 status = pixErodeCL_55(wpl, h);
1327 return status;
1328 }
1329
1330 lwmask = lmask32[31 - (xn & 31)];
1331 rwmask = rmask32[31 - (xp & 31)];
1332
1333 // global and local work dimensions for Horizontal pass
1334 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1335 globalThreads[0] = gsize;
1336 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1337 globalThreads[1] = gsize;
1338 localThreads[0] = GROUPSIZE_X;
1339 localThreads[1] = GROUPSIZE_Y;
1340
1341 // Horizontal Pass
1342 if (xp > 31 || xn > 31) {
1343 // Generic case.
1344 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1345
1346 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1347 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1348 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1349 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1350 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1351 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1352 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1353 status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1354 status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1355 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1356 localThreads, 0, nullptr, nullptr);
1357
1358 if (yp > 0 || yn > 0) {
1359 pixtemp = pixsCLBuffer;
1360 pixsCLBuffer = pixdCLBuffer;
1361 pixdCLBuffer = pixtemp;
1362 }
1363 } else if (xp > 0 || xn > 0) {
1364 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1365 isEven = (xp != xn);
1366
1367 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1368 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1369 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1370 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1371 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1372 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1373 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1374 status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1375 status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1376 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1377 localThreads, 0, nullptr, nullptr);
1378
1379 if (yp > 0 || yn > 0) {
1380 pixtemp = pixsCLBuffer;
1381 pixsCLBuffer = pixdCLBuffer;
1382 pixdCLBuffer = pixtemp;
1383 }
1384 }
1385
1386 // Vertical Pass
1387 if (yp > 0 || yn > 0) {
1388 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1389 CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1390
1391 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1392 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1393 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1394 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1395 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1396 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1397 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1398 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1399 localThreads, 0, nullptr, nullptr);
1400 }
1401
1402 return status;
1403}
1404
1405// Morphology Open operation. Invokes the relevant OpenCL kernels
1406static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1407 cl_int status;
1408 cl_mem pixtemp;
1409
1410 // Erode followed by Dilate
1411 status = pixErodeCL(hsize, vsize, wpl, h);
1412
1413 pixtemp = pixsCLBuffer;
1414 pixsCLBuffer = pixdCLBuffer;
1415 pixdCLBuffer = pixtemp;
1416
1417 status = pixDilateCL(hsize, vsize, wpl, h);
1418
1419 return status;
1420}
1421
1422// Morphology Close operation. Invokes the relevant OpenCL kernels
1423static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1424 cl_int status;
1425 cl_mem pixtemp;
1426
1427 // Dilate followed by Erode
1428 status = pixDilateCL(hsize, vsize, wpl, h);
1429
1430 pixtemp = pixsCLBuffer;
1431 pixsCLBuffer = pixdCLBuffer;
1432 pixdCLBuffer = pixtemp;
1433
1434 status = pixErodeCL(hsize, vsize, wpl, h);
1435
1436 return status;
1437}
1438
1439// output = buffer1 & ~(buffer2)
1440static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2) {
1441 cl_int status;
1442 size_t globalThreads[2];
1443 int gsize;
1444 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1445
1446 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1447 globalThreads[0] = gsize;
1448 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1449 globalThreads[1] = gsize;
1450
1451 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1452 CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1453
1454 // Enqueue a kernel run call.
1455 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1456 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1457 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1458 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1459 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1460 localThreads, 0, nullptr, nullptr);
1461
1462 return status;
1463}
1464
1465// OpenCL implementation of Get Lines from pix function
1466// Note: Assumes the source and dest opencl buffer are initialized. No check
1467// done
1468void OpenclDevice::pixGetLinesCL(Image pixd, Image pixs, Image *pix_vline, Image *pix_hline,
1469 Image *pixClosed, bool getpixClosed, l_int32 close_hsize,
1470 l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize,
1471 l_int32 line_hsize, l_int32 line_vsize) {
1472 l_uint32 wpl, h;
1473 cl_mem pixtemp;
1474
1475 wpl = pixGetWpl(pixs);
1476 h = pixGetHeight(pixs);
1477
1478 // First step : Close Morph operation: Dilate followed by Erode
1479 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1480
1481 // Copy the Close output to CPU buffer
1482 if (getpixClosed) {
1483 *pixClosed =
1484 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl * h, CL_MAP_READ, true, false);
1485 }
1486
1487 // Store the output of close operation in an intermediate buffer
1488 // this will be later used for pixsubtract
1489 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1490 sizeof(int) * wpl * h, 0, nullptr, nullptr);
1491
1492 // Second step: Open Operation - Erode followed by Dilate
1493 pixtemp = pixsCLBuffer;
1494 pixsCLBuffer = pixdCLBuffer;
1495 pixdCLBuffer = pixtemp;
1496
1497 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1498
1499 // Third step: Subtract : (Close - Open)
1500 pixtemp = pixsCLBuffer;
1501 pixsCLBuffer = pixdCLBuffer;
1502 pixdCLBuffer = pixdCLIntermediate;
1503 pixdCLIntermediate = pixtemp;
1504
1505 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1506
1507 // Store the output of Hollow operation in an intermediate buffer
1508 // this will be later used
1509 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1510 sizeof(int) * wpl * h, 0, nullptr, nullptr);
1511
1512 pixtemp = pixsCLBuffer;
1513 pixsCLBuffer = pixdCLBuffer;
1514 pixdCLBuffer = pixtemp;
1515
1516 // Fourth step: Get vertical line
1517 // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1518 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1519
1520 // Copy the vertical line output to CPU buffer
1521 *pix_vline =
1522 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, CL_MAP_READ, true, false);
1523
1524 pixtemp = pixsCLBuffer;
1525 pixsCLBuffer = pixdCLIntermediate;
1526 pixdCLIntermediate = pixtemp;
1527
1528 // Fifth step: Get horizontal line
1529 // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1530 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1531
1532 // Copy the horizontal line output to CPU buffer
1533 *pix_hline =
1534 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, CL_MAP_READ, true, true);
1535
1536 return;
1537}
1538
1539/*************************************************************************
1540 * HistogramRect
1541 * Otsu Thresholding Operations
1542 * histogramAllChannels is laid out as all channel 0, then all channel 1...
1543 * only supports 1 or 4 channels (bytes_per_pixel)
1544 ************************************************************************/
1545int OpenclDevice::HistogramRectOCL(void *imageData, int bytes_per_pixel, int bytes_per_line,
1546 int left, // always 0
1547 int top, // always 0
1548 int width, int height, int kHistogramSize,
1549 int *histogramAllChannels) {
1550 cl_int clStatus;
1551 int retVal = 0;
1552 KernelEnv histKern;
1553 SetKernelEnv(&histKern);
1554 KernelEnv histRedKern;
1555 SetKernelEnv(&histRedKern);
1556 /* map imagedata to device as read only */
1557 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1558 // coherent which we don't need.
1559 // faster option would be to allocate initial image buffer
1560 // using a garlic bus memory type
1561 cl_mem imageBuffer =
1562 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1563 width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1564 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1565
1566 /* setup work group size parameters */
1567 int block_size = 256;
1568 cl_uint numCUs;
1569 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1570 nullptr);
1571 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1572
1573 int requestedOccupancy = 10;
1574 int numWorkGroups = numCUs * requestedOccupancy;
1575 int numThreads = block_size * numWorkGroups;
1576 size_t local_work_size[] = {static_cast<size_t>(block_size)};
1577 size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1578 size_t red_global_work_size[] = {
1579 static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1580
1581 /* map histogramAllChannels as write only */
1582
1583 cl_mem histogramBuffer = clCreateBuffer(
1584 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1585 kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels, &clStatus);
1586 CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1587
1588 /* intermediate histogram buffer */
1589 int histRed = 256;
1590 int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1591
1592 cl_mem tmpHistogramBuffer =
1593 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins * sizeof(cl_uint),
1594 nullptr, &clStatus);
1595 CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1596
1597 /* atomic sync buffer */
1598 int *zeroBuffer = new int[1];
1599 zeroBuffer[0] = 0;
1600 cl_mem atomicSyncBuffer =
1601 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int),
1602 zeroBuffer, &clStatus);
1603 CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1604 delete[] zeroBuffer;
1605 // Create kernel objects based on bytes_per_pixel
1606 if (bytes_per_pixel == 1) {
1607 histKern.mpkKernel =
1608 clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1609 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1610
1611 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1612 "kernel_HistogramRectOneChannelReduction", &clStatus);
1613 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
1614 } else {
1615 histKern.mpkKernel =
1616 clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
1617 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1618
1619 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1620 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1621 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1622 }
1623
1624 void *ptr;
1625
1626 // Initialize tmpHistogramBuffer buffer
1627 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1628 tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
1629 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1630
1631 memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
1632 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, nullptr, nullptr);
1633
1634 /* set kernel 1 arguments */
1635 clStatus = clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1636 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1637 cl_uint numPixels = width * height;
1638 clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1639 CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
1640 clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), &tmpHistogramBuffer);
1641 CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1642
1643 /* set kernel 2 arguments */
1644 int n = numThreads / bytes_per_pixel;
1645 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1646 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1647 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), &tmpHistogramBuffer);
1648 CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1649 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), &histogramBuffer);
1650 CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer");
1651
1652 /* launch histogram */
1653 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, nullptr,
1654 global_work_size, local_work_size, 0, nullptr, nullptr);
1655 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1656 clFinish(histKern.mpkCmdQueue);
1657 if (clStatus != 0) {
1658 retVal = -1;
1659 }
1660 /* launch histogram */
1661 clStatus = clEnqueueNDRangeKernel(histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1662 red_global_work_size, local_work_size, 0, nullptr, nullptr);
1663 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1664 clFinish(histRedKern.mpkCmdQueue);
1665 if (clStatus != 0) {
1666 retVal = -1;
1667 }
1668
1669 /* map results back from gpu */
1670 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0,
1671 kHistogramSize * bytes_per_pixel * sizeof(int), 0, nullptr, nullptr,
1672 &clStatus);
1673 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1674 if (clStatus != 0) {
1675 retVal = -1;
1676 }
1677 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, nullptr, nullptr);
1678
1679 clReleaseMemObject(histogramBuffer);
1680 clReleaseMemObject(imageBuffer);
1681 return retVal;
1682}
1683
1684/*************************************************************************
1685 * Threshold the rectangle, taking everything except the image buffer pointer
1686 * from the class, using thresholds/hi_values to the output IMAGE.
1687 * only supports 1 or 4 channels
1688 ************************************************************************/
1689int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, int bytes_per_pixel,
1690 int bytes_per_line, int *thresholds, int *hi_values,
1691 Image *pix, int height, int width, int top, int left) {
1692 int retVal = 0;
1693 /* create pix result buffer */
1694 *pix = pixCreate(width, height, 1);
1695 uint32_t *pixData = pixGetData(*pix);
1696 int wpl = pixGetWpl(*pix);
1697 int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1698
1699 cl_int clStatus;
1700 KernelEnv rEnv;
1701 SetKernelEnv(&rEnv);
1702
1703 /* setup work group size parameters */
1704 int block_size = 256;
1705 cl_uint numCUs = 6;
1706 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1707 nullptr);
1708 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1709
1710 int requestedOccupancy = 10;
1711 int numWorkGroups = numCUs * requestedOccupancy;
1712 int numThreads = block_size * numWorkGroups;
1713 size_t local_work_size[] = {(size_t)block_size};
1714 size_t global_work_size[] = {(size_t)numThreads};
1715
1716 /* map imagedata to device as read only */
1717 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1718 // coherent which we don't need.
1719 // faster option would be to allocate initial image buffer
1720 // using a garlic bus memory type
1721 cl_mem imageBuffer =
1722 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1723 width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1724 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1725
1726 /* map pix as write only */
1727 pixThBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize,
1728 pixData, &clStatus);
1729 CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1730
1731 /* map thresholds and hi_values */
1732 cl_mem thresholdsBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1733 bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1734 CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1735 cl_mem hiValuesBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1736 bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1737 CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1738
1739 /* compile kernel */
1740 if (bytes_per_pixel == 4) {
1741 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1742 CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1743 } else {
1744 rEnv.mpkKernel =
1745 clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1746 CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1747 }
1748
1749 /* set kernel arguments */
1750 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1751 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1752 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1753 CHECK_OPENCL(clStatus, "clSetKernelArg height");
1754 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1755 CHECK_OPENCL(clStatus, "clSetKernelArg width");
1756 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1757 CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1758 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1759 CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1760 clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1761 CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1762 clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1763 CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1764
1765 /* launch kernel & wait */
1766 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1, nullptr, global_work_size,
1767 local_work_size, 0, nullptr, nullptr);
1768 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1769 clFinish(rEnv.mpkCmdQueue);
1770 if (clStatus != 0) {
1771 tprintf("Setting return value to -1\n");
1772 retVal = -1;
1773 }
1774 /* map results back from gpu */
1775 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0,
1776 nullptr, nullptr, &clStatus);
1777 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1778 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr, nullptr);
1779
1780 clReleaseMemObject(imageBuffer);
1781 clReleaseMemObject(thresholdsBuffer);
1782 clReleaseMemObject(hiValuesBuffer);
1783
1784 return retVal;
1785}
1786
1787/******************************************************************************
1788 * Data Types for Device Selection
1789 *****************************************************************************/
1790
1791struct TessScoreEvaluationInputData {
1792 int height;
1793 int width;
1794 int numChannels;
1795 unsigned char *imageData;
1796 Image pix;
1797};
1798
1799static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
1800 srand(1);
1801 // 8.5x11 inches @ 300dpi rounded to clean multiples
1802 int height = 3328; // %256
1803 int width = 2560; // %512
1804 int numChannels = 4;
1805 input->height = height;
1806 input->width = width;
1807 input->numChannels = numChannels;
1808 unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
1809 height * width * numChannels * sizeof(unsigned char)); // new unsigned char[4][height*width];
1810 input->imageData = (unsigned char *)&imageData4[0];
1811
1812 // zero out image
1813 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1814 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1815 for (int p = 0; p < height * width; p++) {
1816 // unsigned char tmp[4] = imageData4[0];
1817 imageData4[p][0] = pixelWhite[0];
1818 imageData4[p][1] = pixelWhite[1];
1819 imageData4[p][2] = pixelWhite[2];
1820 imageData4[p][3] = pixelWhite[3];
1821 }
1822 // random lines to be eliminated
1823 int maxLineWidth = 64; // pixels wide
1824 int numLines = 10;
1825 // vertical lines
1826 for (int i = 0; i < numLines; i++) {
1827 int lineWidth = rand() % maxLineWidth;
1828 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1829 // tprintf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
1830 for (int row = vertLinePos - lineWidth / 2; row < vertLinePos + lineWidth / 2; row++) {
1831 for (int col = 0; col < height; col++) {
1832 // imageData4[row*width+col] = pixelBlack;
1833 imageData4[row * width + col][0] = pixelBlack[0];
1834 imageData4[row * width + col][1] = pixelBlack[1];
1835 imageData4[row * width + col][2] = pixelBlack[2];
1836 imageData4[row * width + col][3] = pixelBlack[3];
1837 }
1838 }
1839 }
1840 // horizontal lines
1841 for (int i = 0; i < numLines; i++) {
1842 int lineWidth = rand() % maxLineWidth;
1843 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1844 // tprintf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
1845 for (int row = 0; row < width; row++) {
1846 for (int col = horLinePos - lineWidth / 2; col < horLinePos + lineWidth / 2;
1847 col++) { // for (int row = vertLinePos-lineWidth/2; row <
1848 // vertLinePos+lineWidth/2; row++) {
1849 // tprintf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
1850 // imageData4[row*width+col] = pixelBlack;
1851 imageData4[row * width + col][0] = pixelBlack[0];
1852 imageData4[row * width + col][1] = pixelBlack[1];
1853 imageData4[row * width + col][2] = pixelBlack[2];
1854 imageData4[row * width + col][3] = pixelBlack[3];
1855 }
1856 }
1857 }
1858 // spots (noise, squares)
1859 float fractionBlack = 0.1; // how much of the image should be blackened
1860 int numSpots = (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1861 for (int i = 0; i < numSpots; i++) {
1862 int lineWidth = rand() % maxLineWidth;
1863 int col = lineWidth + rand() % (width - 2 * lineWidth);
1864 int row = lineWidth + rand() % (height - 2 * lineWidth);
1865 // tprintf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
1866 for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1867 for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1868 // tprintf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
1869 // imageData4[row*width+col] = pixelBlack;
1870 imageData4[r * width + c][0] = pixelBlack[0];
1871 imageData4[r * width + c][1] = pixelBlack[1];
1872 imageData4[r * width + c][2] = pixelBlack[2];
1873 imageData4[r * width + c][3] = pixelBlack[3];
1874 }
1875 }
1876 }
1877
1878 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1879}
1880
1881struct TessDeviceScore {
1882 float time; // small time means faster device
1883 bool clError; // were there any opencl errors
1884 bool valid; // was the correct response generated
1885};
1886
1887/******************************************************************************
1888 * Micro Benchmarks for Device Selection
1889 *****************************************************************************/
1890
1891static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1892 ds_device_type type) {
1893 double time = 0;
1894# if ON_WINDOWS
1895 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1896 QueryPerformanceFrequency(&freq);
1897# elif ON_APPLE
1898 mach_timebase_info_data_t info = {0, 0};
1899 mach_timebase_info(&info);
1900 long long start, stop;
1901# else
1902 timespec time_funct_start, time_funct_end;
1903# endif
1904 // input data
1905 l_uint32 *tiffdata = (l_uint32 *)input.imageData; // same size and random data; data doesn't
1906 // change workload
1907
1908 // function call
1909 if (type == DS_DEVICE_OPENCL_DEVICE) {
1910# if ON_WINDOWS
1911 QueryPerformanceCounter(&time_funct_start);
1912# elif ON_APPLE
1913 start = mach_absolute_time();
1914# else
1915 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1916# endif
1917
1918 OpenclDevice::gpuEnv = *env;
1919 int wpl = pixGetWpl(input.pix);
1920 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, nullptr);
1921# if ON_WINDOWS
1922 QueryPerformanceCounter(&time_funct_end);
1923 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1924# elif ON_APPLE
1925 stop = mach_absolute_time();
1926 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1927# else
1928 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1929 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1930 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1931# endif
1932
1933 } else {
1934# if ON_WINDOWS
1935 QueryPerformanceCounter(&time_funct_start);
1936# elif ON_APPLE
1937 start = mach_absolute_time();
1938# else
1939 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1940# endif
1941 Image pix = pixCreate(input.width, input.height, 32);
1942 l_uint32 *pixData = pixGetData(pix);
1943 int i, j;
1944 int idx = 0;
1945 for (i = 0; i < input.height; i++) {
1946 for (j = 0; j < input.width; j++) {
1947 l_uint32 tiffword = tiffdata[i * input.width + j];
1948 l_int32 rval = ((tiffword)&0xff);
1949 l_int32 gval = (((tiffword) >> 8) & 0xff);
1950 l_int32 bval = (((tiffword) >> 16) & 0xff);
1951 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
1952 pixData[idx] = value;
1953 idx++;
1954 }
1955 }
1956# if ON_WINDOWS
1957 QueryPerformanceCounter(&time_funct_end);
1958 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1959# elif ON_APPLE
1960 stop = mach_absolute_time();
1961 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1962# else
1963 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1964 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1965 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1966# endif
1967 pix.destroy();
1968 }
1969
1970 return time;
1971}
1972
1973static double histogramRectMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1974 ds_device_type type) {
1975 double time;
1976# if ON_WINDOWS
1977 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1978 QueryPerformanceFrequency(&freq);
1979# elif ON_APPLE
1980 mach_timebase_info_data_t info = {0, 0};
1981 mach_timebase_info(&info);
1982 long long start, stop;
1983# else
1984 timespec time_funct_start, time_funct_end;
1985# endif
1986
1987 const int left = 0;
1988 const int top = 0;
1989 int kHistogramSize = 256;
1990 int bytes_per_line = input.width * input.numChannels;
1991 int *histogramAllChannels = new int[kHistogramSize * input.numChannels];
1992 // function call
1993 if (type == DS_DEVICE_OPENCL_DEVICE) {
1994# if ON_WINDOWS
1995 QueryPerformanceCounter(&time_funct_start);
1996# elif ON_APPLE
1997 start = mach_absolute_time();
1998# else
1999 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2000# endif
2001
2002 OpenclDevice::gpuEnv = *env;
2003 int retVal = OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line,
2004 left, top, input.width, input.height,
2005 kHistogramSize, histogramAllChannels);
2006
2007# if ON_WINDOWS
2008 QueryPerformanceCounter(&time_funct_end);
2009 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2010# elif ON_APPLE
2011 stop = mach_absolute_time();
2012 if (retVal == 0) {
2013 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2014 } else {
2015 time = FLT_MAX;
2016 }
2017# else
2018 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2019 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2020 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2021# endif
2022 } else {
2023 int *histogram = new int[kHistogramSize];
2024# if ON_WINDOWS
2025 QueryPerformanceCounter(&time_funct_start);
2026# elif ON_APPLE
2027 start = mach_absolute_time();
2028# else
2029 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2030# endif
2031 for (int ch = 0; ch < input.numChannels; ++ch) {
2032 tesseract::HistogramRect(input.pix, input.numChannels, left, top, input.width, input.height,
2033 histogram);
2034 }
2035# if ON_WINDOWS
2036 QueryPerformanceCounter(&time_funct_end);
2037 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2038# elif ON_APPLE
2039 stop = mach_absolute_time();
2040 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2041# else
2042 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2043 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2044 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2045# endif
2046 delete[] histogram;
2047 }
2048
2049 // cleanup
2050 delete[] histogramAllChannels;
2051 return time;
2052}
2053
2054// Reproducing the ThresholdRectToPix native version
2055static void ThresholdRectToPix_Native(const unsigned char *imagedata, int bytes_per_pixel,
2056 int bytes_per_line, const int *thresholds,
2057 const int *hi_values, Image *pix) {
2058 int top = 0;
2059 int left = 0;
2060 int width = pixGetWidth(*pix);
2061 int height = pixGetHeight(*pix);
2062
2063 *pix = pixCreate(width, height, 1);
2064 uint32_t *pixdata = pixGetData(*pix);
2065 int wpl = pixGetWpl(*pix);
2066 const unsigned char *srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel;
2067 for (int y = 0; y < height; ++y) {
2068 const uint8_t *linedata = srcdata;
2069 uint32_t *pixline = pixdata + y * wpl;
2070 for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2071 bool white_result = true;
2072 for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2073 if (hi_values[ch] >= 0 && (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2074 white_result = false;
2075 break;
2076 }
2077 }
2078 if (white_result)
2079 CLEAR_DATA_BIT(pixline, x);
2080 else
2081 SET_DATA_BIT(pixline, x);
2082 }
2083 srcdata += bytes_per_line;
2084 }
2085}
2086
2087static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2088 ds_device_type type) {
2089 double time;
2090# if ON_WINDOWS
2091 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2092 QueryPerformanceFrequency(&freq);
2093# elif ON_APPLE
2094 mach_timebase_info_data_t info = {0, 0};
2095 mach_timebase_info(&info);
2096 long long start, stop;
2097# else
2098 timespec time_funct_start, time_funct_end;
2099# endif
2100
2101 // input data
2102 unsigned char pixelHi = (unsigned char)255;
2103 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2104
2105 // Pix* pix = pixCreate(width, height, 1);
2106 int top = 0;
2107 int left = 0;
2108 int bytes_per_line = input.width * input.numChannels;
2109
2110 // function call
2111 if (type == DS_DEVICE_OPENCL_DEVICE) {
2112# if ON_WINDOWS
2113 QueryPerformanceCounter(&time_funct_start);
2114# elif ON_APPLE
2115 start = mach_absolute_time();
2116# else
2117 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2118# endif
2119
2120 OpenclDevice::gpuEnv = *env;
2121 int hi_values[4];
2122 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2123 input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix,
2124 input.height, input.width, top, left);
2125
2126# if ON_WINDOWS
2127 QueryPerformanceCounter(&time_funct_end);
2128 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2129# elif ON_APPLE
2130 stop = mach_absolute_time();
2131 if (retVal == 0) {
2132 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2133 } else {
2134 time = FLT_MAX;
2135 }
2136
2137# else
2138 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2139 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2140 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2141# endif
2142 } else {
2143 tesseract::ImageThresholder thresholder;
2144 thresholder.SetImage(input.pix);
2145# if ON_WINDOWS
2146 QueryPerformanceCounter(&time_funct_start);
2147# elif ON_APPLE
2148 start = mach_absolute_time();
2149# else
2150 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2151# endif
2152 int hi_values[4] = {};
2153 ThresholdRectToPix_Native(input.imageData, input.numChannels, bytes_per_line, thresholds,
2154 hi_values, &input.pix);
2155
2156# if ON_WINDOWS
2157 QueryPerformanceCounter(&time_funct_end);
2158 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2159# elif ON_APPLE
2160 stop = mach_absolute_time();
2161 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2162# else
2163 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2164 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2165 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2166# endif
2167 }
2168
2169 return time;
2170}
2171
2172static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2173 ds_device_type type) {
2174 double time = 0;
2175# if ON_WINDOWS
2176 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2177 QueryPerformanceFrequency(&freq);
2178# elif ON_APPLE
2179 mach_timebase_info_data_t info = {0, 0};
2180 mach_timebase_info(&info);
2181 long long start, stop;
2182# else
2183 timespec time_funct_start, time_funct_end;
2184# endif
2185
2186 // input data
2187 int resolution = 300;
2188 int wpl = pixGetWpl(input.pix);
2189 int kThinLineFraction = 20; // tess constant
2190 int kMinLineLengthFraction = 4; // tess constant
2191 int max_line_width = resolution / kThinLineFraction;
2192 int min_line_length = resolution / kMinLineLengthFraction;
2193 int closing_brick = max_line_width / 3;
2194
2195 // function call
2196 if (type == DS_DEVICE_OPENCL_DEVICE) {
2197# if ON_WINDOWS
2198 QueryPerformanceCounter(&time_funct_start);
2199# elif ON_APPLE
2200 start = mach_absolute_time();
2201# else
2202 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2203# endif
2204 OpenclDevice::gpuEnv = *env;
2205 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2206 Image pix_vline = nullptr, pix_hline = nullptr, pix_closed = nullptr;
2207 OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2208 closing_brick, closing_brick, max_line_width, max_line_width,
2209 min_line_length, min_line_length);
2210
2211 OpenclDevice::releaseMorphCLBuffers();
2212
2213# if ON_WINDOWS
2214 QueryPerformanceCounter(&time_funct_end);
2215 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2216# elif ON_APPLE
2217 stop = mach_absolute_time();
2218 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2219# else
2220 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2221 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2222 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2223# endif
2224 } else {
2225# if ON_WINDOWS
2226 QueryPerformanceCounter(&time_funct_start);
2227# elif ON_APPLE
2228 start = mach_absolute_time();
2229# else
2230 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2231# endif
2232
2233 // native serial code
2234 Image src_pix = input.pix;
2235 Image pix_closed = pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2236 Image pix_solid = pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2237 Image pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2238 pix_solid.destroy();
2239 Image pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2240 Image pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2241 pix_hline.destroy();
2242 pix_vline.destroy();
2243 pix_hollow.destroy();
2244
2245# if ON_WINDOWS
2246 QueryPerformanceCounter(&time_funct_end);
2247 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2248# elif ON_APPLE
2249 stop = mach_absolute_time();
2250 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2251# else
2252 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2253 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2254 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2255# endif
2256 }
2257
2258 return time;
2259}
2260
2261/******************************************************************************
2262 * Device Selection
2263 *****************************************************************************/
2264
2265// encode score object as byte string
2266static ds_status serializeScore(ds_device *device, uint8_t **serializedScore,
2267 unsigned int *serializedScoreSize) {
2268 *serializedScoreSize = sizeof(TessDeviceScore);
2269 *serializedScore = new uint8_t[*serializedScoreSize];
2270 memcpy(*serializedScore, device->score, *serializedScoreSize);
2271 return DS_SUCCESS;
2272}
2273
2274// parses byte string and stores in score object
2275static ds_status deserializeScore(ds_device *device, const uint8_t *serializedScore,
2276 unsigned int serializedScoreSize) {
2277 // check that serializedScoreSize == sizeof(TessDeviceScore);
2278 device->score = new TessDeviceScore;
2279 memcpy(device->score, serializedScore, serializedScoreSize);
2280 return DS_SUCCESS;
2281}
2282
2283static ds_status releaseScore(TessDeviceScore *score) {
2284 delete score;
2285 return DS_SUCCESS;
2286}
2287
2288// evaluate devices
2289static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) {
2290 // overwrite statuc gpuEnv w/ current device
2291 // so native opencl calls can be used; they use static gpuEnv
2292 tprintf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2293 device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2294 GPUEnv *env = nullptr;
2295 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2296 env = &OpenclDevice::gpuEnv;
2297 memset(env, 0, sizeof(*env));
2298 // tprintf("[DS] populating tmp GPUEnv from device\n");
2299 populateGPUEnvFromDevice(env, device->oclDeviceID);
2300 env->mnFileCount = 0; // argc;
2301 env->mnKernelCount = 0UL;
2302 // tprintf("[DS] compiling kernels for tmp GPUEnv\n");
2303 OpenclDevice::CompileKernelFile(env, "");
2304 }
2305
2306 TessScoreEvaluationInputData *input = static_cast<TessScoreEvaluationInputData *>(inputData);
2307
2308 // pixReadTiff
2309 double composeRGBPixelTime = composeRGBPixelMicroBench(env, *input, device->type);
2310
2311 // HistogramRect
2312 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2313
2314 // ThresholdRectToPix
2315 double thresholdRectToPixTime = thresholdRectToPixMicroBench(env, *input, device->type);
2316
2317 // getLineMasks
2318 double getLineMasksMorphTime = getLineMasksMorphMicroBench(env, *input, device->type);
2319
2320 // weigh times (% of cpu time)
2321 // these weights should be the % execution time that the native cpu code took
2322 float composeRGBPixelWeight = 1.2f;
2323 float histogramRectWeight = 2.4f;
2324 float thresholdRectToPixWeight = 4.5f;
2325 float getLineMasksMorphWeight = 5.0f;
2326
2327 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2328 histogramRectWeight * histogramRectTime +
2329 thresholdRectToPixWeight * thresholdRectToPixTime +
2330 getLineMasksMorphWeight * getLineMasksMorphTime;
2331 device->score = new TessDeviceScore;
2332 device->score->time = weightedTime;
2333
2334 tprintf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2335 device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2336 tprintf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight);
2337 tprintf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight);
2338 tprintf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime,
2339 thresholdRectToPixWeight);
2340 tprintf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2341 getLineMasksMorphWeight);
2342 tprintf("[DS]%25s: %f\n", "Score", device->score->time);
2343 return DS_SUCCESS;
2344}
2345
2346// initial call to select device
2347ds_device OpenclDevice::getDeviceSelection() {
2348 if (!deviceIsSelected) {
2349 // check if opencl is available at runtime
2350 if (1 == LoadOpencl()) {
2351 // opencl is available
2352 // setup devices
2353 ds_status status;
2354 ds_profile *profile;
2355 status = initDSProfile(&profile, "v0.1");
2356 // try reading scores from file
2357 const char *fileName = "tesseract_opencl_profile_devices.dat";
2358 status = readProfileFromFile(profile, deserializeScore, fileName);
2359 if (status != DS_SUCCESS) {
2360 // need to run evaluation
2361 tprintf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
2362
2363 // create input data
2364 TessScoreEvaluationInputData input;
2365 populateTessScoreEvaluationInputData(&input);
2366 // perform evaluations
2367 unsigned int numUpdates;
2368 status =
2369 profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, &input, &numUpdates);
2370 // write scores to file
2371 if (status == DS_SUCCESS) {
2372 status = writeProfileToFile(profile, serializeScore, fileName);
2373 if (status == DS_SUCCESS) {
2374 tprintf("[DS] Scores written to file (%s).\n", fileName);
2375 } else {
2376 tprintf(
2377 "[DS] Error saving scores to file (%s); scores not written to "
2378 "file.\n",
2379 fileName);
2380 }
2381 } else {
2382 tprintf(
2383 "[DS] Unable to evaluate performance; scores not written to "
2384 "file.\n");
2385 }
2386 } else {
2387 tprintf("[DS] Profile read from file (%s).\n", fileName);
2388 }
2389
2390 // we now have device scores either from file or evaluation
2391 // select fastest using custom Tesseract selection algorithm
2392 float bestTime = FLT_MAX; // begin search with worst possible time
2393 int bestDeviceIdx = -1;
2394 for (unsigned d = 0; d < profile->numDevices; d++) {
2395 ds_device device = profile->devices[d];
2396 if (device.score == nullptr)
2397 continue;
2398 TessDeviceScore score = *device.score;
2399
2400 float time = score.time;
2401 tprintf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, device.oclDeviceName,
2402 time);
2403 if (time < bestTime) {
2404 bestTime = time;
2405 bestDeviceIdx = d;
2406 }
2407 }
2408 if (bestDeviceIdx >= 0) {
2409 tprintf(
2410 "[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2411 profile->devices[bestDeviceIdx].oclDeviceName,
2412 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2413 }
2414 // cleanup
2415 // TODO: call destructor for profile object?
2416
2417 bool overridden = false;
2418 char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2419 if (overrideDeviceStr != nullptr) {
2420 int overrideDeviceIdx = atoi(overrideDeviceStr);
2421 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2422 tprintf(
2423 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2424 "%i)\n",
2425 overrideDeviceStr, overrideDeviceIdx);
2426 bestDeviceIdx = overrideDeviceIdx - 1;
2427 overridden = true;
2428 } else {
2429 tprintf(
2430 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2431 "valid devices).\n",
2432 overrideDeviceStr, profile->numDevices);
2433 }
2434 }
2435
2436 if (overridden) {
2437 tprintf(
2438 "[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2439 profile->devices[bestDeviceIdx].oclDeviceName,
2440 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2441 }
2442 selectedDevice = profile->devices[bestDeviceIdx];
2443 // cleanup
2444 releaseDSProfile(profile, releaseScore);
2445 } else {
2446 // opencl isn't available at runtime, select native cpu device
2447 tprintf("[DS] OpenCL runtime not available.\n");
2448 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2449 selectedDevice.oclDeviceName = "(null)";
2450 selectedDevice.score = nullptr;
2451 selectedDevice.oclDeviceID = nullptr;
2452 selectedDevice.oclDriverVersion = nullptr;
2453 }
2454 deviceIsSelected = true;
2455 }
2456 return selectedDevice;
2457}
2458
2459bool OpenclDevice::selectedDeviceIsOpenCL() {
2460 ds_device device = getDeviceSelection();
2461 return (device.type == DS_DEVICE_OPENCL_DEVICE);
2462}
2463
2464} // namespace
2465
2466#endif
#define ASSERT_HOST(x)
Definition: errcode.h:54
int value
const double y
const char * p
const int kHistogramSize
Definition: otsuthr.h:30
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void HistogramRect(Image src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:146
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:41
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:39
def next(obj)
Definition: ast.py:56
type
Definition: upload.py:458
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:76