tesseract  4.00.00dev
openclwrapper.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 #ifdef _WIN32
11 #include <io.h>
12 #else
13 #include <sys/types.h>
14 #include <unistd.h>
15 #endif
16 #include <float.h>
17 
18 #include "openclwrapper.h"
19 #include "oclkernels.h"
20 
21 // for micro-benchmark
22 #include "otsuthr.h"
23 #include "thresholder.h"
24 
25 #if ON_APPLE
26 #include <mach/mach_time.h>
27 #include <stdio.h>
28 #endif
29 
30 #define CALLOC LEPT_CALLOC
31 #define FREE LEPT_FREE
32 
33 #ifdef USE_OPENCL
34 
36 GPUEnv OpenclDevice::gpuEnv;
37 
38 bool OpenclDevice::deviceIsSelected = false;
39 ds_device OpenclDevice::selectedDevice;
40 
41 int OpenclDevice::isInited = 0;
42 
43 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
44 
45 static const l_uint32 lmask32[] = {
46  0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
47  0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
48  0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
49  0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
50  0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
51  0xfffffffe, 0xffffffff};
52 
53 static const l_uint32 rmask32[] = {
54  0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
55  0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
56  0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
57  0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
58  0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
59  0x7fffffff, 0xffffffff};
60 
61 static cl_mem pixsCLBuffer, pixdCLBuffer,
62  pixdCLIntermediate; // Morph operations buffers
63 static cl_mem pixThBuffer; // output from thresholdtopix calculation
64 static cl_int clStatus;
65 static KernelEnv rEnv;
66 
67 #define DS_TAG_VERSION "<version>"
68 #define DS_TAG_VERSION_END "</version>"
69 #define DS_TAG_DEVICE "<device>"
70 #define DS_TAG_DEVICE_END "</device>"
71 #define DS_TAG_SCORE "<score>"
72 #define DS_TAG_SCORE_END "</score>"
73 #define DS_TAG_DEVICE_TYPE "<type>"
74 #define DS_TAG_DEVICE_TYPE_END "</type>"
75 #define DS_TAG_DEVICE_NAME "<name>"
76 #define DS_TAG_DEVICE_NAME_END "</name>"
77 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
78 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
79 
80 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
81 
82 #define DS_DEVICE_NAME_LENGTH 256
83 
84 typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type;
85 
86 typedef struct {
87  unsigned int numDevices;
88  ds_device *devices;
89  const char *version;
90 } ds_profile;
91 
92 typedef enum {
93  DS_SUCCESS = 0,
94  DS_INVALID_PROFILE = 1000,
95  DS_MEMORY_ERROR,
96  DS_INVALID_PERF_EVALUATOR_TYPE,
97  DS_INVALID_PERF_EVALUATOR,
98  DS_PERF_EVALUATOR_ERROR,
99  DS_FILE_ERROR,
100  DS_UNKNOWN_DEVICE_TYPE,
101  DS_PROFILE_FILE_ERROR,
102  DS_SCORE_SERIALIZER_ERROR,
103  DS_SCORE_DESERIALIZER_ERROR
104 } ds_status;
105 
106 // Pointer to a function that calculates the score of a device (ex:
107 // device->score) update the data size of score. The encoding and the format
108 // of the score data is implementation defined. The function should return
109 // DS_SUCCESS if there's no error to be reported.
110 typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
111 
112 // deallocate memory used by score
113 typedef ds_status (*ds_score_release)(void *score);
114 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
115  ds_status status = DS_SUCCESS;
116  if (profile != nullptr) {
117  if (profile->devices != nullptr && sr != nullptr) {
118  unsigned int i;
119  for (i = 0; i < profile->numDevices; i++) {
120  free(profile->devices[i].oclDeviceName);
121  free(profile->devices[i].oclDriverVersion);
122  status = sr(profile->devices[i].score);
123  if (status != DS_SUCCESS) break;
124  }
125  free(profile->devices);
126  }
127  free(profile);
128  }
129  return status;
130 }
131 
132 static ds_status initDSProfile(ds_profile **p, const char *version) {
133  int numDevices;
134  cl_uint numPlatforms;
135  cl_platform_id *platforms = nullptr;
136  cl_device_id *devices = nullptr;
137  ds_status status = DS_SUCCESS;
138  unsigned int next;
139  unsigned int i;
140 
141  if (p == nullptr) return DS_INVALID_PROFILE;
142 
143  ds_profile *profile = (ds_profile *)malloc(sizeof(ds_profile));
144  if (profile == nullptr) return DS_MEMORY_ERROR;
145 
146  memset(profile, 0, sizeof(ds_profile));
147 
148  clGetPlatformIDs(0, nullptr, &numPlatforms);
149 
150  if (numPlatforms > 0) {
151  platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
152  if (platforms == nullptr) {
153  status = DS_MEMORY_ERROR;
154  goto cleanup;
155  }
156  clGetPlatformIDs(numPlatforms, platforms, nullptr);
157  }
158 
159  numDevices = 0;
160  for (i = 0; i < (unsigned int)numPlatforms; i++) {
161  cl_uint num;
162  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
163  numDevices += num;
164  }
165 
166  if (numDevices > 0) {
167  devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
168  if (devices == nullptr) {
169  status = DS_MEMORY_ERROR;
170  goto cleanup;
171  }
172  }
173 
174  profile->numDevices =
175  numDevices + 1; // +1 to numDevices to include the native CPU
176  profile->devices =
177  (ds_device *)malloc(profile->numDevices * sizeof(ds_device));
178  if (profile->devices == nullptr) {
179  profile->numDevices = 0;
180  status = DS_MEMORY_ERROR;
181  goto cleanup;
182  }
183  memset(profile->devices, 0, profile->numDevices * sizeof(ds_device));
184 
185  next = 0;
186  for (i = 0; i < (unsigned int)numPlatforms; i++) {
187  cl_uint num;
188  unsigned j;
189  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
190  for (j = 0; j < num; j++, next++) {
191  char buffer[DS_DEVICE_NAME_LENGTH];
192  size_t length;
193 
194  profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
195  profile->devices[next].oclDeviceID = devices[j];
196 
197  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
198  DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
199  length = strlen(buffer);
200  profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
201  memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
202 
203  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
204  DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
205  length = strlen(buffer);
206  profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
207  memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
208  }
209  }
210  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
211  profile->version = version;
212 
213 cleanup:
214  free(platforms);
215  free(devices);
216  if (status == DS_SUCCESS) {
217  *p = profile;
218  } else {
219  if (profile) {
220  free(profile->devices);
221  free(profile);
222  }
223  }
224  return status;
225 }
226 
227 static ds_status profileDevices(ds_profile *profile,
228  const ds_evaluation_type type,
229  ds_perf_evaluator evaluator,
230  void *evaluatorData, unsigned int *numUpdates) {
231  ds_status status = DS_SUCCESS;
232  unsigned int i;
233  unsigned int updates = 0;
234 
235  if (profile == nullptr) {
236  return DS_INVALID_PROFILE;
237  }
238  if (evaluator == nullptr) {
239  return DS_INVALID_PERF_EVALUATOR;
240  }
241 
242  for (i = 0; i < profile->numDevices; i++) {
243  ds_status evaluatorStatus;
244 
245  switch (type) {
246  case DS_EVALUATE_NEW_ONLY:
247  if (profile->devices[i].score != nullptr) break;
248  // else fall through
249  case DS_EVALUATE_ALL:
250  evaluatorStatus = evaluator(profile->devices + i, evaluatorData);
251  if (evaluatorStatus != DS_SUCCESS) {
252  status = evaluatorStatus;
253  return status;
254  }
255  updates++;
256  break;
257  default:
258  return DS_INVALID_PERF_EVALUATOR_TYPE;
259  break;
260  };
261  }
262  if (numUpdates) *numUpdates = updates;
263  return status;
264 }
265 
266 static const char *findString(const char *contentStart, const char *contentEnd,
267  const char *string) {
268  size_t stringLength;
269  const char *currentPosition;
270  const char *found = nullptr;
271  stringLength = strlen(string);
272  currentPosition = contentStart;
273  for (currentPosition = contentStart; currentPosition < contentEnd;
274  currentPosition++) {
275  if (*currentPosition == string[0]) {
276  if (currentPosition + stringLength < contentEnd) {
277  if (strncmp(currentPosition, string, stringLength) == 0) {
278  found = currentPosition;
279  break;
280  }
281  }
282  }
283  }
284  return found;
285 }
286 
287 static ds_status readProFile(const char *fileName, char **content,
288  size_t *contentSize) {
289  size_t size = 0;
290 
291  *contentSize = 0;
292  *content = nullptr;
293 
294  FILE *input = fopen(fileName, "rb");
295  if (input == nullptr) {
296  return DS_FILE_ERROR;
297  }
298 
299  fseek(input, 0L, SEEK_END);
300  size = ftell(input);
301  rewind(input);
302  char *binary = (char *)malloc(size);
303  if (binary == nullptr) {
304  fclose(input);
305  return DS_FILE_ERROR;
306  }
307  fread(binary, sizeof(char), size, input);
308  fclose(input);
309 
310  *contentSize = size;
311  *content = binary;
312  return DS_SUCCESS;
313 }
314 
315 typedef ds_status (*ds_score_deserializer)(ds_device *device,
316  const unsigned char *serializedScore,
317  unsigned int serializedScoreSize);
318 
319 static ds_status readProfileFromFile(ds_profile *profile,
320  ds_score_deserializer deserializer,
321  const char *file) {
322  ds_status status = DS_SUCCESS;
323  char *contentStart = nullptr;
324  const char *contentEnd = nullptr;
325  size_t contentSize;
326 
327  if (profile == nullptr) return DS_INVALID_PROFILE;
328 
329  status = readProFile(file, &contentStart, &contentSize);
330  if (status == DS_SUCCESS) {
331  const char *currentPosition;
332  const char *dataStart;
333  const char *dataEnd;
334 
335  contentEnd = contentStart + contentSize;
336  currentPosition = contentStart;
337 
338  // parse the version string
339  dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
340  if (dataStart == nullptr) {
341  status = DS_PROFILE_FILE_ERROR;
342  goto cleanup;
343  }
344  dataStart += strlen(DS_TAG_VERSION);
345 
346  dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
347  if (dataEnd == nullptr) {
348  status = DS_PROFILE_FILE_ERROR;
349  goto cleanup;
350  }
351 
352  size_t versionStringLength = strlen(profile->version);
353  if (versionStringLength + dataStart != dataEnd ||
354  strncmp(profile->version, dataStart, versionStringLength) != 0) {
355  // version mismatch
356  status = DS_PROFILE_FILE_ERROR;
357  goto cleanup;
358  }
359  currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
360 
361  // parse the device information
362  while (1) {
363  unsigned int i;
364 
365  const char *deviceTypeStart;
366  const char *deviceTypeEnd;
367  ds_device_type deviceType;
368 
369  const char *deviceNameStart;
370  const char *deviceNameEnd;
371 
372  const char *deviceScoreStart;
373  const char *deviceScoreEnd;
374 
375  const char *deviceDriverStart;
376  const char *deviceDriverEnd;
377 
378  dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
379  if (dataStart == nullptr) {
380  // nothing useful remain, quit...
381  break;
382  }
383  dataStart += strlen(DS_TAG_DEVICE);
384  dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
385  if (dataEnd == nullptr) {
386  status = DS_PROFILE_FILE_ERROR;
387  goto cleanup;
388  }
389 
390  // parse the device type
391  deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
392  if (deviceTypeStart == nullptr) {
393  status = DS_PROFILE_FILE_ERROR;
394  goto cleanup;
395  }
396  deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
397  deviceTypeEnd =
398  findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
399  if (deviceTypeEnd == nullptr) {
400  status = DS_PROFILE_FILE_ERROR;
401  goto cleanup;
402  }
403  memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
404 
405  // parse the device name
406  if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
407  deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
408  if (deviceNameStart == nullptr) {
409  status = DS_PROFILE_FILE_ERROR;
410  goto cleanup;
411  }
412  deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
413  deviceNameEnd =
414  findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
415  if (deviceNameEnd == nullptr) {
416  status = DS_PROFILE_FILE_ERROR;
417  goto cleanup;
418  }
419 
420  deviceDriverStart =
421  findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
422  if (deviceDriverStart == nullptr) {
423  status = DS_PROFILE_FILE_ERROR;
424  goto cleanup;
425  }
426  deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
427  deviceDriverEnd = findString(deviceDriverStart, contentEnd,
428  DS_TAG_DEVICE_DRIVER_VERSION_END);
429  if (deviceDriverEnd == nullptr) {
430  status = DS_PROFILE_FILE_ERROR;
431  goto cleanup;
432  }
433 
434  // check if this device is on the system
435  for (i = 0; i < profile->numDevices; i++) {
436  if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
437  size_t actualDeviceNameLength;
438  size_t driverVersionLength;
439 
440  actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
441  driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
442  if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
443  deviceDriverStart + driverVersionLength == deviceDriverEnd &&
444  strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
445  actualDeviceNameLength) == 0 &&
446  strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
447  driverVersionLength) == 0) {
448  deviceScoreStart =
449  findString(dataStart, contentEnd, DS_TAG_SCORE);
450  if (deviceNameStart == nullptr) {
451  status = DS_PROFILE_FILE_ERROR;
452  goto cleanup;
453  }
454  deviceScoreStart += strlen(DS_TAG_SCORE);
455  deviceScoreEnd =
456  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
457  status = deserializer(profile->devices + i,
458  (const unsigned char *)deviceScoreStart,
459  deviceScoreEnd - deviceScoreStart);
460  if (status != DS_SUCCESS) {
461  goto cleanup;
462  }
463  }
464  }
465  }
466  } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
467  for (i = 0; i < profile->numDevices; i++) {
468  if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
469  deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
470  if (deviceScoreStart == nullptr) {
471  status = DS_PROFILE_FILE_ERROR;
472  goto cleanup;
473  }
474  deviceScoreStart += strlen(DS_TAG_SCORE);
475  deviceScoreEnd =
476  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
477  status = deserializer(profile->devices + i,
478  (const unsigned char *)deviceScoreStart,
479  deviceScoreEnd - deviceScoreStart);
480  if (status != DS_SUCCESS) {
481  goto cleanup;
482  }
483  }
484  }
485  }
486 
487  // skip over the current one to find the next device
488  currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
489  }
490  }
491 cleanup:
492  free(contentStart);
493  return status;
494 }
495 
496 typedef ds_status (*ds_score_serializer)(ds_device *device,
497  void **serializedScore,
498  unsigned int *serializedScoreSize);
499 static ds_status writeProfileToFile(ds_profile *profile,
500  ds_score_serializer serializer,
501  const char *file) {
502  ds_status status = DS_SUCCESS;
503 
504  if (profile == nullptr) return DS_INVALID_PROFILE;
505 
506  FILE *profileFile = fopen(file, "wb");
507  if (profileFile == nullptr) {
508  status = DS_FILE_ERROR;
509  } else {
510  unsigned int i;
511 
512  // write version string
513  fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
514  fwrite(profile->version, sizeof(char), strlen(profile->version),
515  profileFile);
516  fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END),
517  profileFile);
518  fwrite("\n", sizeof(char), 1, profileFile);
519 
520  for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
521  void *serializedScore;
522  unsigned int serializedScoreSize;
523 
524  fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
525 
526  fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
527  profileFile);
528  fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
529  fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
530  strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
531 
532  switch (profile->devices[i].type) {
533  case DS_DEVICE_NATIVE_CPU: {
534  // There's no need to emit a device name for the native CPU device.
535  /*
536  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
537  profileFile);
538  fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
539  strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
540  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
541  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
542  */
543  } break;
544  case DS_DEVICE_OPENCL_DEVICE: {
545  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
546  profileFile);
547  fwrite(profile->devices[i].oclDeviceName, sizeof(char),
548  strlen(profile->devices[i].oclDeviceName), profileFile);
549  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
550  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
551 
552  fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
553  strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
554  fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
555  strlen(profile->devices[i].oclDriverVersion), profileFile);
556  fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
557  strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
558  } break;
559  default:
560  status = DS_UNKNOWN_DEVICE_TYPE;
561  break;
562  };
563 
564  fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
565  status = serializer(profile->devices + i, &serializedScore,
566  &serializedScoreSize);
567  if (status == DS_SUCCESS && serializedScore != nullptr &&
568  serializedScoreSize > 0) {
569  fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
570  free(serializedScore);
571  }
572  fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END),
573  profileFile);
574  fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END),
575  profileFile);
576  fwrite("\n", sizeof(char), 1, profileFile);
577  }
578  fclose(profileFile);
579  }
580  return status;
581 }
582 
583 // substitute invalid characters in device name with _
584 static void legalizeFileName(char *fileName) {
585  // printf("fileName: %s\n", fileName);
586  const char *invalidChars =
587  "/\?:*\"><| "; // space is valid but can cause headaches
588  // for each invalid char
589  for (unsigned i = 0; i < strlen(invalidChars); i++) {
590  char invalidStr[4];
591  invalidStr[0] = invalidChars[i];
592  invalidStr[1] = '\0';
593  // printf("eliminating %s\n", invalidStr);
594  // char *pos = strstr(fileName, invalidStr);
595  // initial ./ is valid for present directory
596  // if (*pos == '.') pos++;
597  // if (*pos == '/') pos++;
598  for (char *pos = strstr(fileName, invalidStr); pos != nullptr;
599  pos = strstr(pos + 1, invalidStr)) {
600  // printf("\tfound: %s, ", pos);
601  pos[0] = '_';
602  // printf("fileName: %s\n", fileName);
603  }
604  }
605 }
606 
607 static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
608  // printf("[DS] populateGPUEnvFromDevice\n");
609  size_t size;
610  gpuInfo->mnIsUserCreated = 1;
611  // device
612  gpuInfo->mpDevID = device;
613  gpuInfo->mpArryDevsID = new cl_device_id[1];
614  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
615  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
616  sizeof(cl_device_type), &gpuInfo->mDevType, &size);
617  CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
618  // platform
619  clStatus =
620  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
621  sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
622  CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
623  // context
624  cl_context_properties props[3];
625  props[0] = CL_CONTEXT_PLATFORM;
626  props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
627  props[2] = 0;
628  gpuInfo->mpContext =
629  clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
630  CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
631  // queue
632  cl_command_queue_properties queueProperties = 0;
633  gpuInfo->mpCmdQueue = clCreateCommandQueue(
634  gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
635  CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
636 }
637 
638 int OpenclDevice::LoadOpencl()
639 {
640 #ifdef WIN32
641  HINSTANCE HOpenclDll = nullptr;
642  void *OpenclDll = nullptr;
643  // fprintf(stderr, " LoadOpenclDllxx... \n");
644  OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
645  OpenclDll = LoadLibrary("openCL.dll");
646  if (!static_cast<HINSTANCE>(OpenclDll)) {
647  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
648  FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
649  return 0;
650  }
651  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
652 #endif
653  return 1;
654 }
655 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
656 {
657  envInfo->mpkContext = gpuEnv.mpContext;
658  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
659  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
660 
661  return 1;
662 }
663 
664 static cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer,
665  size_t nElements, cl_mem_flags flags,
666  cl_int *pStatus) {
667  cl_mem membuffer =
668  clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
669  nElements * sizeof(l_uint32), hostbuffer, pStatus);
670 
671  return membuffer;
672 }
673 
674 static Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd,
675  Pix *pixs, int elements, cl_mem_flags flags,
676  bool memcopy = false, bool sync = true) {
677  PROCNAME("mapOutputCLBuffer");
678  if (!pixd) {
679  if (memcopy) {
680  if ((pixd = pixCreateTemplate(pixs)) == nullptr)
681  tprintf("pixd not made\n");
682  } else {
683  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
684  pixGetDepth(pixs))) == nullptr)
685  tprintf("pixd not made\n");
686  }
687  }
688  l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(
689  rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
690  elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
691 
692  if (memcopy) {
693  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
694  } else {
695  pixSetData(pixd, pValues);
696  }
697 
698  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr,
699  nullptr);
700 
701  if (sync) {
702  clFinish(rEnv.mpkCmdQueue);
703  }
704 
705  return pixd;
706 }
707 
708 void OpenclDevice::releaseMorphCLBuffers()
709 {
710  if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate);
711  if (pixsCLBuffer != nullptr) clReleaseMemObject(pixsCLBuffer);
712  if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer);
713  if (pixThBuffer != nullptr) clReleaseMemObject(pixThBuffer);
714  pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
715 }
716 
717 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) {
718  SetKernelEnv(&rEnv);
719 
720  if (pixThBuffer != nullptr) {
721  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
722  CL_MEM_ALLOC_HOST_PTR, &clStatus);
723 
724  // Get the output from ThresholdToPix operation
725  clStatus =
726  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
727  sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
728  } else {
729  // Get data from the source image
730  l_uint32 *srcdata =
731  reinterpret_cast<l_uint32 *>(malloc(wpl * h * sizeof(l_uint32)));
732  memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
733 
734  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
735  CL_MEM_USE_HOST_PTR, &clStatus);
736  }
737 
738  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
739  CL_MEM_ALLOC_HOST_PTR, &clStatus);
740 
741  pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
742  CL_MEM_ALLOC_HOST_PTR, &clStatus);
743 
744  return (int)clStatus;
745 }
746 
747 int OpenclDevice::InitEnv()
748 {
749 //PERF_COUNT_START("OD::InitEnv")
750 // printf("[OD] OpenclDevice::InitEnv()\n");
751 #ifdef SAL_WIN32
752  while( 1 )
753  {
754  if( 1 == LoadOpencl() )
755  break;
756  }
757 PERF_COUNT_SUB("LoadOpencl")
758 #endif
759  // sets up environment, compiles programs
760 
761  InitOpenclRunEnv_DeviceSelection( 0 );
762 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
763 //PERF_COUNT_END
764  return 1;
765 }
766 
767 int OpenclDevice::ReleaseOpenclRunEnv()
768 {
769  ReleaseOpenclEnv( &gpuEnv );
770 #ifdef SAL_WIN32
771  FreeOpenclDll();
772 #endif
773  return 1;
774 }
775 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
776 {
777  if ( kCount < 1 )
778  fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
779  strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
780  gpuEnv.mnKernelCount++;
781  return 0;
782 }
783 int OpenclDevice::RegistOpenclKernel()
784 {
785  if ( !gpuEnv.mnIsUserCreated )
786  memset( &gpuEnv, 0, sizeof(gpuEnv) );
787 
788  gpuEnv.mnFileCount = 0; //argc;
789  gpuEnv.mnKernelCount = 0UL;
790 
791  AddKernelConfig( 1, (const char*) "oclAverageSub1" );
792  return 0;
793 }
794 
795 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
796 //PERF_COUNT_START("InitOpenclRunEnv_DS")
797  if (!isInited) {
798  // after programs compiled, selects best device
799  ds_device bestDevice_DS = getDeviceSelection( );
800 //PERF_COUNT_SUB("called getDeviceSelection()")
801  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
802  // overwrite global static GPUEnv with new device
803  if (selectedDeviceIsOpenCL() ) {
804  //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
805  populateGPUEnvFromDevice( &gpuEnv, bestDevice );
806  gpuEnv.mnFileCount = 0; //argc;
807  gpuEnv.mnKernelCount = 0UL;
808 //PERF_COUNT_SUB("populate gpuEnv")
809  CompileKernelFile(&gpuEnv, "");
810 //PERF_COUNT_SUB("CompileKernelFile")
811  } else {
812  //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
813  }
814  isInited = 1;
815  }
816 //PERF_COUNT_END
817  return 0;
818 }
819 
820 
821 OpenclDevice::OpenclDevice()
822 {
823  //InitEnv();
824 }
825 
826 OpenclDevice::~OpenclDevice()
827 {
828  //ReleaseOpenclRunEnv();
829 }
830 
831 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
832 {
833  int i = 0;
834  int clStatus = 0;
835 
836  if ( !isInited )
837  {
838  return 1;
839  }
840 
841  for ( i = 0; i < gpuEnv.mnFileCount; i++ )
842  {
843  if ( gpuEnv.mpArryPrograms[i] )
844  {
845  clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
846  CHECK_OPENCL( clStatus, "clReleaseProgram" );
847  gpuEnv.mpArryPrograms[i] = nullptr;
848  }
849  }
850  if ( gpuEnv.mpCmdQueue )
851  {
852  clReleaseCommandQueue( gpuEnv.mpCmdQueue );
853  gpuEnv.mpCmdQueue = nullptr;
854  }
855  if ( gpuEnv.mpContext )
856  {
857  clReleaseContext( gpuEnv.mpContext );
858  gpuEnv.mpContext = nullptr;
859  }
860  isInited = 0;
861  gpuInfo->mnIsUserCreated = 0;
862  delete[] gpuInfo->mpArryDevsID;
863  return 1;
864 }
865 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
866 {
867  unsigned int i = 0;
868  cl_int clStatus;
869  int status = 0;
870  char *str = nullptr;
871  FILE *fd = nullptr;
872  char fileName[256] = {0}, cl_name[128] = {0};
873  char deviceName[1024];
874  clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
875  sizeof(deviceName), deviceName, nullptr);
876  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
877  str = (char *)strstr(clFileName, (char *)".cl");
878  memcpy(cl_name, clFileName, str - clFileName);
879  cl_name[str - clFileName] = '\0';
880  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
881  legalizeFileName(fileName);
882  fd = fopen(fileName, "rb");
883  status = (fd != nullptr) ? 1 : 0;
884  if (fd != nullptr) {
885  *fhandle = fd;
886  }
887  return status;
888 
889 }
890 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
891 {
892  int i;
893  for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
894  {
895  if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
896  {
897  if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
898  return 1;
899  }
900  }
901  }
902 
903  return 0;
904 }
905 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
906 {
907  FILE *output = nullptr;
908  output = fopen(fileName, "wb");
909  if (output == nullptr) {
910  return 0;
911  }
912 
913  fwrite( birary, sizeof(char), numBytes, output );
914  fclose( output );
915 
916  return 1;
917 
918 }
919 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
920 {
921  unsigned int i = 0;
922  cl_int clStatus;
923  size_t *binarySizes;
924  cl_uint numDevices;
925  cl_device_id *mpArryDevsID;
926  char **binaries, *str = nullptr;
927 
928  clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
929  sizeof(numDevices), &numDevices, nullptr);
930  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
931 
932  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
933  if (mpArryDevsID == nullptr) {
934  return 0;
935  }
936  /* grab the handles to all of the devices in the program. */
937  clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
938  sizeof(cl_device_id) * numDevices, mpArryDevsID,
939  nullptr);
940  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
941 
942  /* figure out the sizes of each of the binaries. */
943  binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
944 
945  clStatus =
946  clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
947  sizeof(size_t) * numDevices, binarySizes, nullptr);
948  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
949 
950  /* copy over all of the generated binaries. */
951  binaries = (char**) malloc( sizeof(char *) * numDevices );
952  if (binaries == nullptr) {
953  return 0;
954  }
955 
956  for ( i = 0; i < numDevices; i++ )
957  {
958  if ( binarySizes[i] != 0 )
959  {
960  binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
961  if (binaries[i] == nullptr) {
962  return 0;
963  }
964  }
965  else
966  {
967  binaries[i] = nullptr;
968  }
969  }
970 
971  clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
972  sizeof(char *) * numDevices, binaries, nullptr);
973  CHECK_OPENCL(clStatus,"clGetProgramInfo");
974 
975  /* dump out each binary into its own separate file. */
976  for ( i = 0; i < numDevices; i++ )
977  {
978  char fileName[256] = { 0 }, cl_name[128] = { 0 };
979 
980  if ( binarySizes[i] != 0 )
981  {
982  char deviceName[1024];
983  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
984  sizeof(deviceName), deviceName, nullptr);
985  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
986 
987  str = (char*) strstr( clFileName, (char*) ".cl" );
988  memcpy( cl_name, clFileName, str - clFileName );
989  cl_name[str - clFileName] = '\0';
990  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
991  legalizeFileName(fileName);
992  if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
993  {
994  printf("[OD] write binary[%s] failed\n", fileName);
995  return 0;
996  } //else
997  printf("[OD] write binary[%s] successfully\n", fileName);
998  }
999  }
1000 
1001  // Release all resouces and memory
1002  for ( i = 0; i < numDevices; i++ )
1003  {
1004  free(binaries[i]);
1005  binaries[i] = nullptr;
1006  }
1007 
1008  free(binaries);
1009  binaries = nullptr;
1010 
1011  free(binarySizes);
1012  binarySizes = nullptr;
1013 
1014  free(mpArryDevsID);
1015  mpArryDevsID = nullptr;
1016 
1017  return 1;
1018 }
1019 
1020 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
1021 {
1022 //PERF_COUNT_START("CompileKernelFile")
1023  cl_int clStatus = 0;
1024  size_t length;
1025  char *buildLog = nullptr, *binary;
1026  const char *source;
1027  size_t source_size[1];
1028  int b_error, binary_status, binaryExisted, idx;
1029  cl_uint numDevices;
1030  cl_device_id *mpArryDevsID;
1031  FILE *fd, *fd1;
1032  const char* filename = "kernel.cl";
1033  //fprintf(stderr, "[OD] CompileKernelFile ... \n");
1034  if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
1035  {
1036  return 1;
1037  }
1038 
1039  idx = gpuInfo->mnFileCount;
1040 
1041  source = kernel_src;
1042 
1043  source_size[0] = strlen( source );
1044  binaryExisted = 0;
1045  binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
1046 //PERF_COUNT_SUB("BinaryGenerated")
1047  if ( binaryExisted == 1 )
1048  {
1049  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
1050  sizeof(numDevices), &numDevices, nullptr);
1051  CHECK_OPENCL(clStatus, "clGetContextInfo");
1052 
1053  mpArryDevsID = (cl_device_id *)malloc(sizeof(cl_device_id) * numDevices);
1054  if (mpArryDevsID == nullptr) {
1055  return 0;
1056  }
1057 //PERF_COUNT_SUB("get numDevices")
1058  b_error = 0;
1059  length = 0;
1060  b_error |= fseek( fd, 0, SEEK_END ) < 0;
1061  b_error |= ( length = ftell(fd) ) <= 0;
1062  b_error |= fseek( fd, 0, SEEK_SET ) < 0;
1063  if ( b_error )
1064  {
1065  return 0;
1066  }
1067 
1068  binary = (char*) malloc( length + 2 );
1069  if ( !binary )
1070  {
1071  return 0;
1072  }
1073 
1074  memset( binary, 0, length + 2 );
1075  b_error |= fread( binary, 1, length, fd ) != length;
1076 
1077 
1078  fclose( fd );
1079 //PERF_COUNT_SUB("read file")
1080  fd = nullptr;
1081  // grab the handles to all of the devices in the context.
1082  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
1083  sizeof(cl_device_id) * numDevices,
1084  mpArryDevsID, nullptr);
1085  CHECK_OPENCL( clStatus, "clGetContextInfo" );
1086 //PERF_COUNT_SUB("get devices")
1087  //fprintf(stderr, "[OD] Create kernel from binary\n");
1088  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
1089  mpArryDevsID, &length, (const unsigned char**) &binary,
1090  &binary_status, &clStatus );
1091  CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
1092 //PERF_COUNT_SUB("clCreateProgramWithBinary")
1093  free( binary );
1094  free( mpArryDevsID );
1095  mpArryDevsID = nullptr;
1096  // PERF_COUNT_SUB("binaryExisted")
1097  }
1098  else
1099  {
1100  // create a CL program using the kernel source
1101  //fprintf(stderr, "[OD] Create kernel from source\n");
1102  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
1103  source_size, &clStatus);
1104  CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
1105 //PERF_COUNT_SUB("!binaryExisted")
1106  }
1107 
1108  if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
1109  return 0;
1110  }
1111 
1112  //char options[512];
1113  // create a cl program executable for all the devices specified
1114  //printf("[OD] BuildProgram.\n");
1115 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
1116  if (!gpuInfo->mnIsUserCreated)
1117  {
1118  clStatus =
1119  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1120  buildOption, nullptr, nullptr);
1121  // PERF_COUNT_SUB("clBuildProgram notUserCreated")
1122  }
1123  else
1124  {
1125  clStatus =
1126  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1127  buildOption, nullptr, nullptr);
1128  // PERF_COUNT_SUB("clBuildProgram isUserCreated")
1129  }
1131  if ( clStatus != CL_SUCCESS )
1132  {
1133  printf ("BuildProgram error!\n");
1134  if ( !gpuInfo->mnIsUserCreated )
1135  {
1136  clStatus = clGetProgramBuildInfo(
1137  gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1138  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1139  }
1140  else
1141  {
1142  clStatus = clGetProgramBuildInfo(
1143  gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1144  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1145  }
1146  if ( clStatus != CL_SUCCESS )
1147  {
1148  printf("opencl create build log fail\n");
1149  return 0;
1150  }
1151  buildLog = (char*) malloc( length );
1152  if (buildLog == (char *)nullptr) {
1153  return 0;
1154  }
1155  if ( !gpuInfo->mnIsUserCreated )
1156  {
1157  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1158  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1159  }
1160  else
1161  {
1162  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1163  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1164  }
1165  if ( clStatus != CL_SUCCESS )
1166  {
1167  printf("opencl program build info fail\n");
1168  return 0;
1169  }
1170 
1171  fd1 = fopen( "kernel-build.log", "w+" );
1172  if (fd1 != nullptr) {
1173  fwrite(buildLog, sizeof(char), length, fd1);
1174  fclose(fd1);
1175  }
1176 
1177  free( buildLog );
1178 //PERF_COUNT_SUB("build error log")
1179  return 0;
1180  }
1181 
1182  strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
1183 //PERF_COUNT_SUB("strcpy")
1184  if ( binaryExisted == 0 ) {
1185  GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
1186  PERF_COUNT_SUB("GenerateBinFromKernelSource")
1187  }
1188 
1189  gpuInfo->mnFileCount += 1;
1190 //PERF_COUNT_END
1191  return 1;
1192 }
1193 
1194 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
1195 {
1196 PERF_COUNT_START("pixReadFromTiffKernel")
1197  cl_int clStatus;
1198  KernelEnv rEnv;
1199  size_t globalThreads[2];
1200  size_t localThreads[2];
1201  int gsize;
1202  cl_mem valuesCl;
1203  cl_mem outputCl;
1204 
1205  //global and local work dimensions for Horizontal pass
1206  gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1207  globalThreads[0] = gsize;
1208  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1209  globalThreads[1] = gsize;
1210  localThreads[0] = GROUPSIZE_X;
1211  localThreads[1] = GROUPSIZE_Y;
1212 
1213  SetKernelEnv( &rEnv );
1214 
1215  l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
1216  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
1217  CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1218 
1219  //Allocate input and output OCL buffers
1220  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1221  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1222 
1223  //Kernel arguments
1224  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1225  CHECK_OPENCL( clStatus, "clSetKernelArg");
1226  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1227  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1228  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1229  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1230  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1231  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1232  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1233  CHECK_OPENCL( clStatus, "clSetKernelArg");
1234 
1235  //Kernel enqueue
1236 PERF_COUNT_SUB("before")
1237 clStatus =
1238  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1239  globalThreads, localThreads, 0, nullptr, nullptr);
1240 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1241 
1242 /* map results back from gpu */
1243 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
1244  0, w * h * sizeof(l_uint32), 0, nullptr, nullptr,
1245  &clStatus);
1246 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1247 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1248 
1249 // Sync
1250 clFinish(rEnv.mpkCmdQueue);
1251 PERF_COUNT_SUB("kernel & map")
1253  return pResult;
1254 }
1255 
1256 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1257 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1258  size_t globalThreads[2];
1259  cl_mem pixtemp;
1260  cl_int status;
1261  int gsize;
1262  size_t localThreads[2];
1263 
1264  // Horizontal pass
1265  gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1266  globalThreads[0] = gsize;
1267  globalThreads[1] = GROUPSIZE_HMORY;
1268  localThreads[0] = GROUPSIZE_HMORX;
1269  localThreads[1] = GROUPSIZE_HMORY;
1270 
1271  rEnv.mpkKernel =
1272  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1273  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1274 
1275  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1276  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1277  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1278  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1279 
1280  status =
1281  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1282  globalThreads, localThreads, 0, nullptr, nullptr);
1283 
1284  // Swap source and dest buffers
1285  pixtemp = pixsCLBuffer;
1286  pixsCLBuffer = pixdCLBuffer;
1287  pixdCLBuffer = pixtemp;
1288 
1289  // Vertical
1290  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1291  globalThreads[0] = gsize;
1292  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1293  globalThreads[1] = gsize;
1294  localThreads[0] = GROUPSIZE_X;
1295  localThreads[1] = GROUPSIZE_Y;
1296 
1297  rEnv.mpkKernel =
1298  clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1299  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1300 
1301  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1302  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1303  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1304  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1305  status =
1306  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1307  globalThreads, localThreads, 0, nullptr, nullptr);
1308 
1309  return status;
1310 }
1311 
1312 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1313 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1314  size_t globalThreads[2];
1315  cl_mem pixtemp;
1316  cl_int status;
1317  int gsize;
1318  l_uint32 fwmask, lwmask;
1319  size_t localThreads[2];
1320 
1321  lwmask = lmask32[31 - 2];
1322  fwmask = rmask32[31 - 2];
1323 
1324  // Horizontal pass
1325  gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1326  globalThreads[0] = gsize;
1327  globalThreads[1] = GROUPSIZE_HMORY;
1328  localThreads[0] = GROUPSIZE_HMORX;
1329  localThreads[1] = GROUPSIZE_HMORY;
1330 
1331  rEnv.mpkKernel =
1332  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1333  CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1334 
1335  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1336  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1337  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1338  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1339 
1340  status =
1341  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1342  globalThreads, localThreads, 0, nullptr, nullptr);
1343 
1344  // Swap source and dest buffers
1345  pixtemp = pixsCLBuffer;
1346  pixsCLBuffer = pixdCLBuffer;
1347  pixdCLBuffer = pixtemp;
1348 
1349  // Vertical
1350  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1351  globalThreads[0] = gsize;
1352  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1353  globalThreads[1] = gsize;
1354  localThreads[0] = GROUPSIZE_X;
1355  localThreads[1] = GROUPSIZE_Y;
1356 
1357  rEnv.mpkKernel =
1358  clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1359  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1360 
1361  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1362  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1363  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1364  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1365  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1366  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1367  status =
1368  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1369  globalThreads, localThreads, 0, nullptr, nullptr);
1370 
1371  return status;
1372 }
1373 
1374 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
1375 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1376  l_int32 h) {
1377  l_int32 xp, yp, xn, yn;
1378  SEL *sel;
1379  size_t globalThreads[2];
1380  cl_mem pixtemp;
1381  cl_int status;
1382  int gsize;
1383  size_t localThreads[2];
1384  char isEven;
1385 
1386  OpenclDevice::SetKernelEnv(&rEnv);
1387 
1388  if (hsize == 5 && vsize == 5) {
1389  // Specific case for 5x5
1390  status = pixDilateCL_55(wpl, h);
1391  return status;
1392  }
1393 
1394  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1395 
1396  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1397  selDestroy(&sel);
1398  // global and local work dimensions for Horizontal pass
1399  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1400  globalThreads[0] = gsize;
1401  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1402  globalThreads[1] = gsize;
1403  localThreads[0] = GROUPSIZE_X;
1404  localThreads[1] = GROUPSIZE_Y;
1405 
1406  if (xp > 31 || xn > 31) {
1407  // Generic case.
1408  rEnv.mpkKernel =
1409  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1410  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1411 
1412  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1413  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1414  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1415  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1416  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1417  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1418  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1419  nullptr, globalThreads, localThreads, 0,
1420  nullptr, nullptr);
1421 
1422  if (yp > 0 || yn > 0) {
1423  pixtemp = pixsCLBuffer;
1424  pixsCLBuffer = pixdCLBuffer;
1425  pixdCLBuffer = pixtemp;
1426  }
1427  } else if (xp > 0 || xn > 0) {
1428  // Specific Horizontal pass kernel for half width < 32
1429  rEnv.mpkKernel =
1430  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1431  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1432  isEven = (xp != xn);
1433 
1434  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1435  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1436  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1437  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1438  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1439  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1440  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1441  nullptr, globalThreads, localThreads, 0,
1442  nullptr, nullptr);
1443 
1444  if (yp > 0 || yn > 0) {
1445  pixtemp = pixsCLBuffer;
1446  pixsCLBuffer = pixdCLBuffer;
1447  pixdCLBuffer = pixtemp;
1448  }
1449  }
1450 
1451  if (yp > 0 || yn > 0) {
1452  rEnv.mpkKernel =
1453  clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1454  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1455 
1456  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1457  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1458  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1459  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1460  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1461  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1462  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1463  nullptr, globalThreads, localThreads, 0,
1464  nullptr, nullptr);
1465  }
1466 
1467  return status;
1468 }
1469 
1470 //Morphology Erode operation. Invokes the relevant OpenCL kernels
1471 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1472  l_uint32 h) {
1473  l_int32 xp, yp, xn, yn;
1474  SEL *sel;
1475  size_t globalThreads[2];
1476  size_t localThreads[2];
1477  cl_mem pixtemp;
1478  cl_int status;
1479  int gsize;
1480  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1481  l_uint32 rwmask, lwmask;
1482  char isEven;
1483 
1484  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1485 
1486  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1487  selDestroy(&sel);
1488  OpenclDevice::SetKernelEnv(&rEnv);
1489 
1490  if (hsize == 5 && vsize == 5 && isAsymmetric) {
1491  // Specific kernel for 5x5
1492  status = pixErodeCL_55(wpl, h);
1493  return status;
1494  }
1495 
1496  lwmask = lmask32[31 - (xn & 31)];
1497  rwmask = rmask32[31 - (xp & 31)];
1498 
1499  // global and local work dimensions for Horizontal pass
1500  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1501  globalThreads[0] = gsize;
1502  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1503  globalThreads[1] = gsize;
1504  localThreads[0] = GROUPSIZE_X;
1505  localThreads[1] = GROUPSIZE_Y;
1506 
1507  // Horizontal Pass
1508  if (xp > 31 || xn > 31) {
1509  // Generic case.
1510  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1511 
1512  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1513  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1514  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1515  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1516  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1517  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1518  status =
1519  clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1520  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1521  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1522  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1523  nullptr, globalThreads, localThreads, 0,
1524  nullptr, nullptr);
1525 
1526  if (yp > 0 || yn > 0) {
1527  pixtemp = pixsCLBuffer;
1528  pixsCLBuffer = pixdCLBuffer;
1529  pixdCLBuffer = pixtemp;
1530  }
1531  } else if (xp > 0 || xn > 0) {
1532  rEnv.mpkKernel =
1533  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1534  isEven = (xp != xn);
1535 
1536  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1537  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1538  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1539  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1540  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1541  status =
1542  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1543  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1544  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1545  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1546  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1547  nullptr, globalThreads, localThreads, 0,
1548  nullptr, nullptr);
1549 
1550  if (yp > 0 || yn > 0) {
1551  pixtemp = pixsCLBuffer;
1552  pixsCLBuffer = pixdCLBuffer;
1553  pixdCLBuffer = pixtemp;
1554  }
1555  }
1556 
1557  // Vertical Pass
1558  if (yp > 0 || yn > 0) {
1559  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1560  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1561 
1562  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1563  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1564  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1565  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1566  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1567  status =
1568  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1569  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1570  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1571  nullptr, globalThreads, localThreads, 0,
1572  nullptr, nullptr);
1573  }
1574 
1575  return status;
1576 }
1577 
1578 //Morphology Open operation. Invokes the relevant OpenCL kernels
1579 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1580  cl_int status;
1581  cl_mem pixtemp;
1582 
1583  // Erode followed by Dilate
1584  status = pixErodeCL(hsize, vsize, wpl, h);
1585 
1586  pixtemp = pixsCLBuffer;
1587  pixsCLBuffer = pixdCLBuffer;
1588  pixdCLBuffer = pixtemp;
1589 
1590  status = pixDilateCL(hsize, vsize, wpl, h);
1591 
1592  return status;
1593 }
1594 
1595 //Morphology Close operation. Invokes the relevant OpenCL kernels
1596 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1597  cl_int status;
1598  cl_mem pixtemp;
1599 
1600  // Dilate followed by Erode
1601  status = pixDilateCL(hsize, vsize, wpl, h);
1602 
1603  pixtemp = pixsCLBuffer;
1604  pixsCLBuffer = pixdCLBuffer;
1605  pixdCLBuffer = pixtemp;
1606 
1607  status = pixErodeCL(hsize, vsize, wpl, h);
1608 
1609  return status;
1610 }
1611 
1612 //output = buffer1 & ~(buffer2)
1613 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1614  cl_mem buffer2, cl_mem outBuffer = nullptr) {
1615  cl_int status;
1616  size_t globalThreads[2];
1617  int gsize;
1618  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1619 
1620  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1621  globalThreads[0] = gsize;
1622  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1623  globalThreads[1] = gsize;
1624 
1625  if (outBuffer != nullptr) {
1626  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract", &status);
1627  CHECK_OPENCL(status, "clCreateKernel pixSubtract");
1628  } else {
1629  rEnv.mpkKernel =
1630  clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1631  CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1632  }
1633 
1634  // Enqueue a kernel run call.
1635  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1636  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1637  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1638  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1639  if (outBuffer != nullptr) {
1640  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outBuffer);
1641  }
1642  status =
1643  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1644  globalThreads, localThreads, 0, nullptr, nullptr);
1645 
1646  return status;
1647 }
1648 
1649 // OpenCL implementation of Get Lines from pix function
1650 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1651 void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline,
1652  Pix **pix_hline, Pix **pixClosed,
1653  bool getpixClosed, l_int32 close_hsize,
1654  l_int32 close_vsize, l_int32 open_hsize,
1655  l_int32 open_vsize, l_int32 line_hsize,
1656  l_int32 line_vsize) {
1657  l_uint32 wpl, h;
1658  cl_mem pixtemp;
1659 
1660  wpl = pixGetWpl(pixs);
1661  h = pixGetHeight(pixs);
1662 
1663  // First step : Close Morph operation: Dilate followed by Erode
1664  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1665 
1666  // Copy the Close output to CPU buffer
1667  if (getpixClosed) {
1668  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1669  wpl * h, CL_MAP_READ, true, false);
1670  }
1671 
1672  // Store the output of close operation in an intermediate buffer
1673  // this will be later used for pixsubtract
1674  clStatus =
1675  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1676  0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1677 
1678  // Second step: Open Operation - Erode followed by Dilate
1679  pixtemp = pixsCLBuffer;
1680  pixsCLBuffer = pixdCLBuffer;
1681  pixdCLBuffer = pixtemp;
1682 
1683  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1684 
1685  // Third step: Subtract : (Close - Open)
1686  pixtemp = pixsCLBuffer;
1687  pixsCLBuffer = pixdCLBuffer;
1688  pixdCLBuffer = pixdCLIntermediate;
1689  pixdCLIntermediate = pixtemp;
1690 
1691  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1692 
1693  // Store the output of Hollow operation in an intermediate buffer
1694  // this will be later used
1695  clStatus =
1696  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1697  0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1698 
1699  pixtemp = pixsCLBuffer;
1700  pixsCLBuffer = pixdCLBuffer;
1701  pixdCLBuffer = pixtemp;
1702 
1703  // Fourth step: Get vertical line
1704  // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1705  clStatus = pixOpenCL(1, line_vsize, wpl, h);
1706 
1707  // Copy the vertical line output to CPU buffer
1708  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1709  CL_MAP_READ, true, false);
1710 
1711  pixtemp = pixsCLBuffer;
1712  pixsCLBuffer = pixdCLIntermediate;
1713  pixdCLIntermediate = pixtemp;
1714 
1715  // Fifth step: Get horizontal line
1716  // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1717  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1718 
1719  // Copy the horizontal line output to CPU buffer
1720  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1721  CL_MAP_READ, true, true);
1722 
1723  return;
1724 }
1725 
1726 /*************************************************************************
1727  * HistogramRect
1728  * Otsu Thresholding Operations
1729  * histogramAllChannels is laid out as all channel 0, then all channel 1...
1730  * only supports 1 or 4 channels (bytes_per_pixel)
1731  ************************************************************************/
1732 int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
1733  int bytes_per_pixel, int bytes_per_line,
1734  int left, // always 0
1735  int top, // always 0
1736  int width, int height, int kHistogramSize,
1737  int *histogramAllChannels) {
1738  PERF_COUNT_START("HistogramRectOCL")
1739  cl_int clStatus;
1740  int retVal = 0;
1741  KernelEnv histKern;
1742  SetKernelEnv(&histKern);
1743  KernelEnv histRedKern;
1744  SetKernelEnv(&histRedKern);
1745  /* map imagedata to device as read only */
1746  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1747  // coherent which we don't need.
1748  // faster option would be to allocate initial image buffer
1749  // using a garlic bus memory type
1750  cl_mem imageBuffer = clCreateBuffer(
1751  histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1752  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1753  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1754 
1755  /* setup work group size parameters */
1756  int block_size = 256;
1757  cl_uint numCUs;
1758  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1759  sizeof(numCUs), &numCUs, nullptr);
1760  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1761 
1762  int requestedOccupancy = 10;
1763  int numWorkGroups = numCUs * requestedOccupancy;
1764  int numThreads = block_size * numWorkGroups;
1765  size_t local_work_size[] = {static_cast<size_t>(block_size)};
1766  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1767  size_t red_global_work_size[] = {
1768  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1769 
1770  /* map histogramAllChannels as write only */
1771 
1772  cl_mem histogramBuffer = clCreateBuffer(
1773  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1774  kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels,
1775  &clStatus);
1776  CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1777 
1778  /* intermediate histogram buffer */
1779  int histRed = 256;
1780  int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1781 
1782  cl_mem tmpHistogramBuffer =
1783  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1784  tmpHistogramBins * sizeof(cl_uint), nullptr, &clStatus);
1785  CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1786 
1787  /* atomic sync buffer */
1788  int *zeroBuffer = new int[1];
1789  zeroBuffer[0] = 0;
1790  cl_mem atomicSyncBuffer = clCreateBuffer(
1791  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1792  sizeof(cl_int), zeroBuffer, &clStatus);
1793  CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1794  delete[] zeroBuffer;
1795  // Create kernel objects based on bytes_per_pixel
1796  if (bytes_per_pixel == 1) {
1797  histKern.mpkKernel = clCreateKernel(
1798  histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1799  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1800 
1801  histRedKern.mpkKernel =
1802  clCreateKernel(histRedKern.mpkProgram,
1803  "kernel_HistogramRectOneChannelReduction", &clStatus);
1804  CHECK_OPENCL(clStatus,
1805  "clCreateKernel kernel_HistogramRectOneChannelReduction");
1806  } else {
1807  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
1808  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1809 
1810  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
1811  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1812  }
1813 
1814  void *ptr;
1815 
1816  //Initialize tmpHistogramBuffer buffer
1817  ptr = clEnqueueMapBuffer(
1818  histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1819  tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
1820  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1821 
1822  memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
1823  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1824  nullptr, nullptr);
1825 
1826  /* set kernel 1 arguments */
1827  clStatus =
1828  clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1829  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
1830  cl_uint numPixels = width*height;
1831  clStatus =
1832  clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1833  CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
1834  clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
1835  &tmpHistogramBuffer);
1836  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
1837 
1838  /* set kernel 2 arguments */
1839  int n = numThreads/bytes_per_pixel;
1840  clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1841  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
1842  clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem),
1843  &tmpHistogramBuffer);
1844  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
1845  clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem),
1846  &histogramBuffer);
1847  CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
1848 
1849  /* launch histogram */
1850 PERF_COUNT_SUB("before")
1851 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1852  nullptr, global_work_size, local_work_size, 0,
1853  nullptr, nullptr);
1854 CHECK_OPENCL(clStatus,
1855  "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1856 clFinish(histKern.mpkCmdQueue);
1857 if (clStatus != 0) {
1858  retVal = -1;
1859  }
1860  /* launch histogram */
1861  clStatus = clEnqueueNDRangeKernel(
1862  histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1863  red_global_work_size, local_work_size, 0, nullptr, nullptr);
1864  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
1865  clFinish( histRedKern.mpkCmdQueue );
1866  if (clStatus != 0) {
1867  retVal = -1;
1868  }
1869  PERF_COUNT_SUB("redKernel")
1870 
1871  /* map results back from gpu */
1872  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1873  CL_MAP_READ, 0,
1874  kHistogramSize * bytes_per_pixel * sizeof(int), 0,
1875  nullptr, nullptr, &clStatus);
1876  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
1877  if (clStatus != 0) {
1878  retVal = -1;
1879  }
1880  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1881  nullptr, nullptr);
1882 
1883  clReleaseMemObject(histogramBuffer);
1884  clReleaseMemObject(imageBuffer);
1885 PERF_COUNT_SUB("after")
1887 return retVal;
1888 }
1889 
1890 /*************************************************************************
1891  * Threshold the rectangle, taking everything except the image buffer pointer
1892  * from the class, using thresholds/hi_values to the output IMAGE.
1893  * only supports 1 or 4 channels
1894  ************************************************************************/
1895 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData,
1896  int bytes_per_pixel, int bytes_per_line,
1897  int *thresholds, int *hi_values,
1898  Pix **pix, int height, int width,
1899  int top, int left) {
1900  PERF_COUNT_START("ThresholdRectToPixOCL")
1901  int retVal = 0;
1902  /* create pix result buffer */
1903  *pix = pixCreate(width, height, 1);
1904  uint32_t *pixData = pixGetData(*pix);
1905  int wpl = pixGetWpl(*pix);
1906  int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1907 
1908  cl_int clStatus;
1909  KernelEnv rEnv;
1910  SetKernelEnv(&rEnv);
1911 
1912  /* setup work group size parameters */
1913  int block_size = 256;
1914  cl_uint numCUs = 6;
1915  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1916  sizeof(numCUs), &numCUs, nullptr);
1917  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1918 
1919  int requestedOccupancy = 10;
1920  int numWorkGroups = numCUs * requestedOccupancy;
1921  int numThreads = block_size * numWorkGroups;
1922  size_t local_work_size[] = {(size_t)block_size};
1923  size_t global_work_size[] = {(size_t)numThreads};
1924 
1925  /* map imagedata to device as read only */
1926  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1927  // coherent which we don't need.
1928  // faster option would be to allocate initial image buffer
1929  // using a garlic bus memory type
1930  cl_mem imageBuffer = clCreateBuffer(
1931  rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1932  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1933  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1934 
1935  /* map pix as write only */
1936  pixThBuffer =
1937  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1938  pixSize, pixData, &clStatus);
1939  CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1940 
1941  /* map thresholds and hi_values */
1942  cl_mem thresholdsBuffer =
1943  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1944  bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1945  CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1946  cl_mem hiValuesBuffer =
1947  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1948  bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1949  CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1950 
1951  /* compile kernel */
1952  if (bytes_per_pixel == 4) {
1953  rEnv.mpkKernel =
1954  clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1955  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1956  } else {
1957  rEnv.mpkKernel = clCreateKernel(
1958  rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1959  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1960  }
1961 
1962  /* set kernel arguments */
1963  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1964  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1965  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1966  CHECK_OPENCL(clStatus, "clSetKernelArg height");
1967  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1968  CHECK_OPENCL(clStatus, "clSetKernelArg width");
1969  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1970  CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1971  clStatus =
1972  clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1973  CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1974  clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1975  CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1976  clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1977  CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1978 
1979  /* launch kernel & wait */
1980  PERF_COUNT_SUB("before")
1981  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1982  nullptr, global_work_size, local_work_size,
1983  0, nullptr, nullptr);
1984  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1985  clFinish(rEnv.mpkCmdQueue);
1986  PERF_COUNT_SUB("kernel")
1987  if (clStatus != 0) {
1988  printf("Setting return value to -1\n");
1989  retVal = -1;
1990  }
1991  /* map results back from gpu */
1992  void *ptr =
1993  clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1994  pixSize, 0, nullptr, nullptr, &clStatus);
1995  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1996  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr,
1997  nullptr);
1998 
1999  clReleaseMemObject(imageBuffer);
2000  clReleaseMemObject(thresholdsBuffer);
2001  clReleaseMemObject(hiValuesBuffer);
2002 
2003  PERF_COUNT_SUB("after")
2005  return retVal;
2006 }
2007 
2008 
2009 
2010 /******************************************************************************
2011  * Data Types for Device Selection
2012  *****************************************************************************/
2013 
2014 typedef struct _TessScoreEvaluationInputData {
2015  int height;
2016  int width;
2017  int numChannels;
2018  unsigned char *imageData;
2019  Pix *pix;
2020 } TessScoreEvaluationInputData;
2021 
2022 static void populateTessScoreEvaluationInputData(
2023  TessScoreEvaluationInputData *input) {
2024  srand(1);
2025  // 8.5x11 inches @ 300dpi rounded to clean multiples
2026  int height = 3328; // %256
2027  int width = 2560; // %512
2028  int numChannels = 4;
2029  input->height = height;
2030  input->width = width;
2031  input->numChannels = numChannels;
2032  unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
2033  height * width * numChannels *
2034  sizeof(unsigned char)); // new unsigned char[4][height*width];
2035  input->imageData = (unsigned char *)&imageData4[0];
2036 
2037  // zero out image
2038  unsigned char pixelWhite[4] = {0, 0, 0, 255};
2039  unsigned char pixelBlack[4] = {255, 255, 255, 255};
2040  for (int p = 0; p < height * width; p++) {
2041  // unsigned char tmp[4] = imageData4[0];
2042  imageData4[p][0] = pixelWhite[0];
2043  imageData4[p][1] = pixelWhite[1];
2044  imageData4[p][2] = pixelWhite[2];
2045  imageData4[p][3] = pixelWhite[3];
2046  }
2047  // random lines to be eliminated
2048  int maxLineWidth = 64; // pixels wide
2049  int numLines = 10;
2050  // vertical lines
2051  for (int i = 0; i < numLines; i++) {
2052  int lineWidth = rand() % maxLineWidth;
2053  int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
2054  // printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
2055  for (int row = vertLinePos - lineWidth / 2;
2056  row < vertLinePos + lineWidth / 2; row++) {
2057  for (int col = 0; col < height; col++) {
2058  // imageData4[row*width+col] = pixelBlack;
2059  imageData4[row * width + col][0] = pixelBlack[0];
2060  imageData4[row * width + col][1] = pixelBlack[1];
2061  imageData4[row * width + col][2] = pixelBlack[2];
2062  imageData4[row * width + col][3] = pixelBlack[3];
2063  }
2064  }
2065  }
2066  // horizontal lines
2067  for (int i = 0; i < numLines; i++) {
2068  int lineWidth = rand() % maxLineWidth;
2069  int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
2070  // printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
2071  for (int row = 0; row < width; row++) {
2072  for (int col = horLinePos - lineWidth / 2;
2073  col < horLinePos + lineWidth / 2;
2074  col++) { // for (int row = vertLinePos-lineWidth/2; row <
2075  // vertLinePos+lineWidth/2; row++) {
2076  // printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
2077  // imageData4[row*width+col] = pixelBlack;
2078  imageData4[row * width + col][0] = pixelBlack[0];
2079  imageData4[row * width + col][1] = pixelBlack[1];
2080  imageData4[row * width + col][2] = pixelBlack[2];
2081  imageData4[row * width + col][3] = pixelBlack[3];
2082  }
2083  }
2084  }
2085  // spots (noise, squares)
2086  float fractionBlack = 0.1; // how much of the image should be blackened
2087  int numSpots =
2088  (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
2089  for (int i = 0; i < numSpots; i++) {
2090  int lineWidth = rand() % maxLineWidth;
2091  int col = lineWidth + rand() % (width - 2 * lineWidth);
2092  int row = lineWidth + rand() % (height - 2 * lineWidth);
2093  // printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
2094  for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
2095  for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
2096  // printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2097  // imageData4[row*width+col] = pixelBlack;
2098  imageData4[r * width + c][0] = pixelBlack[0];
2099  imageData4[r * width + c][1] = pixelBlack[1];
2100  imageData4[r * width + c][2] = pixelBlack[2];
2101  imageData4[r * width + c][3] = pixelBlack[3];
2102  }
2103  }
2104  }
2105 
2106  input->pix = pixCreate(input->width, input->height, 1);
2107 }
2108 
2109 typedef struct _TessDeviceScore {
2110  float time; // small time means faster device
2111  bool clError; // were there any opencl errors
2112  bool valid; // was the correct response generated
2113 } TessDeviceScore;
2114 
2115 /******************************************************************************
2116  * Micro Benchmarks for Device Selection
2117  *****************************************************************************/
2118 
2119 static double composeRGBPixelMicroBench(GPUEnv *env,
2120  TessScoreEvaluationInputData input,
2121  ds_device_type type) {
2122  double time = 0;
2123 #if ON_WINDOWS
2124  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2125  QueryPerformanceFrequency(&freq);
2126 #elif ON_APPLE
2127  mach_timebase_info_data_t info = {0, 0};
2128  mach_timebase_info(&info);
2129  long long start, stop;
2130 #else
2131  timespec time_funct_start, time_funct_end;
2132 #endif
2133  // input data
2134  l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
2135 
2136  // function call
2137  if (type == DS_DEVICE_OPENCL_DEVICE) {
2138 #if ON_WINDOWS
2139  QueryPerformanceCounter(&time_funct_start);
2140 #elif ON_APPLE
2141  start = mach_absolute_time();
2142 #else
2143  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2144 #endif
2145 
2146  OpenclDevice::gpuEnv = *env;
2147  int wpl = pixGetWpl(input.pix);
2148  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2149  wpl, nullptr);
2150 #if ON_WINDOWS
2151  QueryPerformanceCounter(&time_funct_end);
2152  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2153 #elif ON_APPLE
2154  stop = mach_absolute_time();
2155  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2156 #else
2157  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2158  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2159 #endif
2160 
2161  } else {
2162 #if ON_WINDOWS
2163  QueryPerformanceCounter(&time_funct_start);
2164 #elif ON_APPLE
2165  start = mach_absolute_time();
2166 #else
2167  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2168 #endif
2169  Pix *pix = pixCreate(input.width, input.height, 32);
2170  l_uint32 *pixData = pixGetData(pix);
2171  int i, j;
2172  int idx = 0;
2173  for (i = 0; i < input.height ; i++) {
2174  for (j = 0; j < input.width; j++) {
2175  l_uint32 tiffword = tiffdata[i * input.width + j];
2176  l_int32 rval = ((tiffword) & 0xff);
2177  l_int32 gval = (((tiffword) >> 8) & 0xff);
2178  l_int32 bval = (((tiffword) >> 16) & 0xff);
2179  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2180  pixData[idx] = value;
2181  idx++;
2182  }
2183  }
2184 #if ON_WINDOWS
2185  QueryPerformanceCounter(&time_funct_end);
2186  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2187 #elif ON_APPLE
2188  stop = mach_absolute_time();
2189  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2190 #else
2191  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2192  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2193 #endif
2194  pixDestroy(&pix);
2195  }
2196 
2197 
2198  // cleanup
2199 
2200  return time;
2201 }
2202 
2203 static double histogramRectMicroBench(GPUEnv *env,
2204  TessScoreEvaluationInputData input,
2205  ds_device_type type) {
2206  double time;
2207 #if ON_WINDOWS
2208  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2209  QueryPerformanceFrequency(&freq);
2210 #elif ON_APPLE
2211  mach_timebase_info_data_t info = {0, 0};
2212  mach_timebase_info(&info);
2213  long long start, stop;
2214 #else
2215  timespec time_funct_start, time_funct_end;
2216 #endif
2217 
2218  int left = 0;
2219  int top = 0;
2220  int kHistogramSize = 256;
2221  int bytes_per_line = input.width*input.numChannels;
2222  int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
2223  // function call
2224  if (type == DS_DEVICE_OPENCL_DEVICE) {
2225 #if ON_WINDOWS
2226  QueryPerformanceCounter(&time_funct_start);
2227 #elif ON_APPLE
2228  start = mach_absolute_time();
2229 #else
2230  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2231 #endif
2232 
2233  OpenclDevice::gpuEnv = *env;
2234  int retVal = OpenclDevice::HistogramRectOCL(
2235  input.imageData, input.numChannels, bytes_per_line, top, left,
2236  input.width, input.height, kHistogramSize, histogramAllChannels);
2237 
2238 #if ON_WINDOWS
2239  QueryPerformanceCounter(&time_funct_end);
2240  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2241 #elif ON_APPLE
2242  stop = mach_absolute_time();
2243  if (retVal == 0) {
2244  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2245  } else {
2246  time = FLT_MAX;
2247  }
2248 #else
2249  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2250  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2251 #endif
2252  } else {
2253  int *histogram = new int[kHistogramSize];
2254 #if ON_WINDOWS
2255  QueryPerformanceCounter(&time_funct_start);
2256 #elif ON_APPLE
2257  start = mach_absolute_time();
2258 #else
2259  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2260 #endif
2261  for (int ch = 0; ch < input.numChannels; ++ch) {
2262  tesseract::HistogramRect(input.pix, input.numChannels, left, top,
2263  input.width, input.height, histogram);
2264  }
2265 #if ON_WINDOWS
2266  QueryPerformanceCounter(&time_funct_end);
2267  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2268 #elif ON_APPLE
2269  stop = mach_absolute_time();
2270  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2271 #else
2272  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2273  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2274 #endif
2275  delete[] histogram;
2276  }
2277 
2278  // cleanup
2279  delete[] histogramAllChannels;
2280  return time;
2281 }
2282 
2283 //Reproducing the ThresholdRectToPix native version
2284 static void ThresholdRectToPix_Native(const unsigned char *imagedata,
2285  int bytes_per_pixel, int bytes_per_line,
2286  const int *thresholds,
2287  const int *hi_values, Pix **pix) {
2288  int top = 0;
2289  int left = 0;
2290  int width = pixGetWidth(*pix);
2291  int height = pixGetHeight(*pix);
2292 
2293  *pix = pixCreate(width, height, 1);
2294  uint32_t *pixdata = pixGetData(*pix);
2295  int wpl = pixGetWpl(*pix);
2296  const unsigned char* srcdata = imagedata + top * bytes_per_line +
2297  left * bytes_per_pixel;
2298  for (int y = 0; y < height; ++y) {
2299  const uint8_t *linedata = srcdata;
2300  uint32_t *pixline = pixdata + y * wpl;
2301  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2302  bool white_result = true;
2303  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2304  if (hi_values[ch] >= 0 &&
2305  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2306  white_result = false;
2307  break;
2308  }
2309  }
2310  if (white_result)
2311  CLEAR_DATA_BIT(pixline, x);
2312  else
2313  SET_DATA_BIT(pixline, x);
2314  }
2315  srcdata += bytes_per_line;
2316  }
2317 }
2318 
2319 static double thresholdRectToPixMicroBench(GPUEnv *env,
2320  TessScoreEvaluationInputData input,
2321  ds_device_type type) {
2322  double time;
2323 #if ON_WINDOWS
2324  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2325  QueryPerformanceFrequency(&freq);
2326 #elif ON_APPLE
2327  mach_timebase_info_data_t info = {0, 0};
2328  mach_timebase_info(&info);
2329  long long start, stop;
2330 #else
2331  timespec time_funct_start, time_funct_end;
2332 #endif
2333 
2334  // input data
2335  unsigned char pixelHi = (unsigned char)255;
2336  int* thresholds = new int[4];
2337  thresholds[0] = pixelHi/2;
2338  thresholds[1] = pixelHi/2;
2339  thresholds[2] = pixelHi/2;
2340  thresholds[3] = pixelHi/2;
2341  int *hi_values = new int[4];
2342  thresholds[0] = pixelHi;
2343  thresholds[1] = pixelHi;
2344  thresholds[2] = pixelHi;
2345  thresholds[3] = pixelHi;
2346  //Pix* pix = pixCreate(width, height, 1);
2347  int top = 0;
2348  int left = 0;
2349  int bytes_per_line = input.width*input.numChannels;
2350 
2351  // function call
2352  if (type == DS_DEVICE_OPENCL_DEVICE) {
2353 #if ON_WINDOWS
2354  QueryPerformanceCounter(&time_funct_start);
2355 #elif ON_APPLE
2356  start = mach_absolute_time();
2357 #else
2358  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2359 #endif
2360 
2361  OpenclDevice::gpuEnv = *env;
2362  int retVal = OpenclDevice::ThresholdRectToPixOCL(
2363  input.imageData, input.numChannels, bytes_per_line, thresholds,
2364  hi_values, &input.pix, input.height, input.width, top, left);
2365 
2366 #if ON_WINDOWS
2367  QueryPerformanceCounter(&time_funct_end);
2368  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2369 #elif ON_APPLE
2370  stop = mach_absolute_time();
2371  if (retVal == 0) {
2372  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2373  ;
2374  } else {
2375  time = FLT_MAX;
2376  }
2377 
2378 #else
2379  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2380  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2381 #endif
2382  } else {
2383 
2384 
2385  tesseract::ImageThresholder thresholder;
2386  thresholder.SetImage( input.pix );
2387 #if ON_WINDOWS
2388  QueryPerformanceCounter(&time_funct_start);
2389 #elif ON_APPLE
2390  start = mach_absolute_time();
2391 #else
2392  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2393 #endif
2394  ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
2395  thresholds, hi_values, &input.pix );
2396 
2397 #if ON_WINDOWS
2398  QueryPerformanceCounter(&time_funct_end);
2399  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2400 #elif ON_APPLE
2401  stop = mach_absolute_time();
2402  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2403 #else
2404  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2405  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2406 #endif
2407  }
2408 
2409  // cleanup
2410  delete[] thresholds;
2411  delete[] hi_values;
2412  return time;
2413 }
2414 
2415 static double getLineMasksMorphMicroBench(GPUEnv *env,
2416  TessScoreEvaluationInputData input,
2417  ds_device_type type) {
2418  double time = 0;
2419 #if ON_WINDOWS
2420  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2421  QueryPerformanceFrequency(&freq);
2422 #elif ON_APPLE
2423  mach_timebase_info_data_t info = {0, 0};
2424  mach_timebase_info(&info);
2425  long long start, stop;
2426 #else
2427  timespec time_funct_start, time_funct_end;
2428 #endif
2429 
2430  // input data
2431  int resolution = 300;
2432  int wpl = pixGetWpl(input.pix);
2433  int kThinLineFraction = 20; // tess constant
2434  int kMinLineLengthFraction = 4; // tess constant
2435  int max_line_width = resolution / kThinLineFraction;
2436  int min_line_length = resolution / kMinLineLengthFraction;
2437  int closing_brick = max_line_width / 3;
2438 
2439  // function call
2440  if (type == DS_DEVICE_OPENCL_DEVICE) {
2441 #if ON_WINDOWS
2442  QueryPerformanceCounter(&time_funct_start);
2443 #elif ON_APPLE
2444  start = mach_absolute_time();
2445 #else
2446  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2447 #endif
2448  OpenclDevice::gpuEnv = *env;
2449  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2450  Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr;
2451  OpenclDevice::pixGetLinesCL(
2452  nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2453  closing_brick, closing_brick, max_line_width, max_line_width,
2454  min_line_length, min_line_length);
2455 
2456  OpenclDevice::releaseMorphCLBuffers();
2457 
2458 #if ON_WINDOWS
2459  QueryPerformanceCounter(&time_funct_end);
2460  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2461 #elif ON_APPLE
2462  stop = mach_absolute_time();
2463  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2464 #else
2465  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2466  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2467 #endif
2468  } else {
2469 #if ON_WINDOWS
2470  QueryPerformanceCounter(&time_funct_start);
2471 #elif ON_APPLE
2472  start = mach_absolute_time();
2473 #else
2474  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2475 #endif
2476 
2477  // native serial code
2478  Pix *src_pix = input.pix;
2479  Pix *pix_closed =
2480  pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2481  Pix *pix_solid =
2482  pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2483  Pix *pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2484  pixDestroy(&pix_solid);
2485  Pix *pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2486  Pix *pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2487  pixDestroy(&pix_hollow);
2488 
2489 #if ON_WINDOWS
2490  QueryPerformanceCounter(&time_funct_end);
2491  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2492 #elif ON_APPLE
2493  stop = mach_absolute_time();
2494  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2495 #else
2496  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2497  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2498 #endif
2499  }
2500 
2501  return time;
2502 }
2503 
2504 
2505 
2506 /******************************************************************************
2507  * Device Selection
2508  *****************************************************************************/
2509 
2510 #include "stdlib.h"
2511 
2512 // encode score object as byte string
2513 static ds_status serializeScore(ds_device *device, void **serializedScore,
2514  unsigned int *serializedScoreSize) {
2515  *serializedScoreSize = sizeof(TessDeviceScore);
2516  *serializedScore = new unsigned char[*serializedScoreSize];
2517  memcpy(*serializedScore, device->score, *serializedScoreSize);
2518  return DS_SUCCESS;
2519 }
2520 
2521 // parses byte string and stores in score object
2522 static ds_status deserializeScore(ds_device *device,
2523  const unsigned char *serializedScore,
2524  unsigned int serializedScoreSize) {
2525  // check that serializedScoreSize == sizeof(TessDeviceScore);
2526  device->score = new TessDeviceScore;
2527  memcpy(device->score, serializedScore, serializedScoreSize);
2528  return DS_SUCCESS;
2529 }
2530 
2531 static ds_status releaseScore(void *score) {
2532  delete (TessDeviceScore *)score;
2533  return DS_SUCCESS;
2534 }
2535 
2536 // evaluate devices
2537 static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) {
2538  // overwrite statuc gpuEnv w/ current device
2539  // so native opencl calls can be used; they use static gpuEnv
2540  printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2541  device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2542  GPUEnv *env = nullptr;
2543  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2544  env = new GPUEnv;
2545  // printf("[DS] populating tmp GPUEnv from device\n");
2546  populateGPUEnvFromDevice(env, device->oclDeviceID);
2547  env->mnFileCount = 0; // argc;
2548  env->mnKernelCount = 0UL;
2549  // printf("[DS] compiling kernels for tmp GPUEnv\n");
2550  OpenclDevice::gpuEnv = *env;
2551  OpenclDevice::CompileKernelFile(env, "");
2552  }
2553 
2554  TessScoreEvaluationInputData *input =
2555  static_cast<TessScoreEvaluationInputData *>(inputData);
2556 
2557  // pixReadTiff
2558  double composeRGBPixelTime =
2559  composeRGBPixelMicroBench(env, *input, device->type);
2560 
2561  // HistogramRect
2562  double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2563 
2564  // ThresholdRectToPix
2565  double thresholdRectToPixTime =
2566  thresholdRectToPixMicroBench(env, *input, device->type);
2567 
2568  // getLineMasks
2569  double getLineMasksMorphTime =
2570  getLineMasksMorphMicroBench(env, *input, device->type);
2571 
2572  // weigh times (% of cpu time)
2573  // these weights should be the % execution time that the native cpu code took
2574  float composeRGBPixelWeight = 1.2f;
2575  float histogramRectWeight = 2.4f;
2576  float thresholdRectToPixWeight = 4.5f;
2577  float getLineMasksMorphWeight = 5.0f;
2578 
2579  float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2580  histogramRectWeight * histogramRectTime +
2581  thresholdRectToPixWeight * thresholdRectToPixTime +
2582  getLineMasksMorphWeight * getLineMasksMorphTime;
2583  device->score = new TessDeviceScore;
2584  ((TessDeviceScore *)device->score)->time = weightedTime;
2585 
2586  printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2587  device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2588  printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime,
2589  composeRGBPixelWeight);
2590  printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime,
2591  histogramRectWeight);
2592  printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix",
2593  thresholdRectToPixTime, thresholdRectToPixWeight);
2594  printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2595  getLineMasksMorphWeight);
2596  printf("[DS]%25s: %f\n", "Score",
2597  static_cast<TessDeviceScore *>(device->score)->time);
2598  return DS_SUCCESS;
2599 }
2600 
2601 // initial call to select device
2602 ds_device OpenclDevice::getDeviceSelection( ) {
2603  if (!deviceIsSelected) {
2604  PERF_COUNT_START("getDeviceSelection")
2605  // check if opencl is available at runtime
2606  if (1 == LoadOpencl()) {
2607  // opencl is available
2608  // PERF_COUNT_SUB("LoadOpencl")
2609  // setup devices
2610  ds_status status;
2611  ds_profile *profile;
2612  status = initDSProfile(&profile, "v0.1");
2613  PERF_COUNT_SUB("initDSProfile")
2614  // try reading scores from file
2615  const char *fileName = "tesseract_opencl_profile_devices.dat";
2616  status = readProfileFromFile(profile, deserializeScore, fileName);
2617  if (status != DS_SUCCESS) {
2618  // need to run evaluation
2619  printf("[DS] Profile file not available (%s); performing profiling.\n",
2620  fileName);
2621 
2622  // create input data
2623  TessScoreEvaluationInputData input;
2624  populateTessScoreEvaluationInputData(&input);
2625  // PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
2626  // perform evaluations
2627  unsigned int numUpdates;
2628  status = profileDevices(profile, DS_EVALUATE_ALL,
2629  evaluateScoreForDevice, &input, &numUpdates);
2630  PERF_COUNT_SUB("profileDevices")
2631  // write scores to file
2632  if (status == DS_SUCCESS) {
2633  status = writeProfileToFile(profile, serializeScore, fileName);
2634  PERF_COUNT_SUB("writeProfileToFile")
2635  if (status == DS_SUCCESS) {
2636  printf("[DS] Scores written to file (%s).\n", fileName);
2637  } else {
2638  printf(
2639  "[DS] Error saving scores to file (%s); scores not written to "
2640  "file.\n",
2641  fileName);
2642  }
2643  } else {
2644  printf(
2645  "[DS] Unable to evaluate performance; scores not written to "
2646  "file.\n");
2647  }
2648  } else {
2649  PERF_COUNT_SUB("readProfileFromFile")
2650  printf("[DS] Profile read from file (%s).\n", fileName);
2651  }
2652 
2653  // we now have device scores either from file or evaluation
2654  // select fastest using custom Tesseract selection algorithm
2655  float bestTime = FLT_MAX; // begin search with worst possible time
2656  int bestDeviceIdx = -1;
2657  for (unsigned d = 0; d < profile->numDevices; d++) {
2658  ds_device device = profile->devices[d];
2659  TessDeviceScore score = *(TessDeviceScore *)device.score;
2660 
2661  float time = score.time;
2662  printf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2663  device.oclDeviceName, time);
2664  if (time < bestTime) {
2665  bestTime = time;
2666  bestDeviceIdx = d;
2667  }
2668  }
2669  printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2670  profile->devices[bestDeviceIdx].oclDeviceName,
2671  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2672  ? "OpenCL"
2673  : "Native");
2674  // cleanup
2675  // TODO: call destructor for profile object?
2676 
2677  bool overridden = false;
2678  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2679  if (overrideDeviceStr != nullptr) {
2680  int overrideDeviceIdx = atoi(overrideDeviceStr);
2681  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2682  printf(
2683  "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2684  "%i)\n",
2685  overrideDeviceStr, overrideDeviceIdx);
2686  bestDeviceIdx = overrideDeviceIdx - 1;
2687  overridden = true;
2688  } else {
2689  printf(
2690  "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2691  "valid devices).\n",
2692  overrideDeviceStr, profile->numDevices);
2693  }
2694  }
2695 
2696  if (overridden) {
2697  printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2698  profile->devices[bestDeviceIdx].oclDeviceName,
2699  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2700  ? "OpenCL"
2701  : "Native");
2702  }
2703  selectedDevice = profile->devices[bestDeviceIdx];
2704  // cleanup
2705  releaseDSProfile(profile, releaseScore);
2706  } else {
2707  // opencl isn't available at runtime, select native cpu device
2708  printf("[DS] OpenCL runtime not available.\n");
2709  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2710  selectedDevice.oclDeviceName = "(null)";
2711  selectedDevice.score = nullptr;
2712  selectedDevice.oclDeviceID = nullptr;
2713  selectedDevice.oclDriverVersion = nullptr;
2714  }
2715  deviceIsSelected = true;
2716  PERF_COUNT_SUB("select from Profile")
2718  }
2719  // PERF_COUNT_END
2720  return selectedDevice;
2721 }
2722 
2723 
2724 bool OpenclDevice::selectedDeviceIsOpenCL() {
2725  ds_device device = getDeviceSelection();
2726  return (device.type == DS_DEVICE_OPENCL_DEVICE);
2727 }
2728 
2729 #endif
const char * kernel_src
Definition: oclkernels.h:21
#define PERF_COUNT_SUB(SUB)
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:151
#define PERF_COUNT_END
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
#define tprintf(...)
Definition: tprintf.h:31
const int kHistogramSize
Definition: otsuthr.h:27
#define PERF_COUNT_START(FUNCT_NAME)
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62