All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
openclwrapper.cpp
Go to the documentation of this file.
1 #ifdef _WIN32
2 #include <Windows.h>
3 #include <io.h>
4 #else
5 #include <sys/types.h>
6 #include <unistd.h>
7 #endif
8 #include <float.h>
9 
10 #include "openclwrapper.h"
11 #include "oclkernels.h"
12 
13 // for micro-benchmark
14 #include "otsuthr.h"
15 #include "thresholder.h"
16 
17 #ifdef USE_OPENCL
18 
19 #if ON_APPLE
20 #define TIMESPEC mach_timespec
21 #else
22 #define TIMESPEC timespec
23 #endif
24 
26 GPUEnv OpenclDevice::gpuEnv;
27 
28 #if USE_DEVICE_SELECTION
29 bool OpenclDevice::deviceIsSelected = false;
30 ds_device OpenclDevice::selectedDevice;
31 #endif
32 
33 int OpenclDevice::isInited =0;
34 
35 struct tiff_transform {
36  int vflip; /* if non-zero, image needs a vertical fip */
37  int hflip; /* if non-zero, image needs a horizontal flip */
38  int rotate; /* -1 -> counterclockwise 90-degree rotation,
39  0 -> no rotation
40  1 -> clockwise 90-degree rotation */
41 };
42 
43 static struct tiff_transform tiff_orientation_transforms[] = {
44  {0, 0, 0},
45  {0, 1, 0},
46  {1, 1, 0},
47  {1, 0, 0},
48  {0, 1, -1},
49  {0, 0, 1},
50  {0, 1, 1},
51  {0, 0, -1}
52 };
53 
54 static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000;
55 
56 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
57 cl_mem pixThBuffer; //output from thresholdtopix calculation
58 cl_int clStatus;
59 KernelEnv rEnv;
60 
61 // substitute invalid characters in device name with _
62 void legalizeFileName( char *fileName) {
63  //printf("fileName: %s\n", fileName);
64  char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
65  // for each invalid char
66  for (int i = 0; i < strlen(invalidChars); i++) {
67  char invalidStr[4];
68  invalidStr[0] = invalidChars[i];
69  invalidStr[1] = NULL;
70  //printf("eliminating %s\n", invalidStr);
71  //char *pos = strstr(fileName, invalidStr);
72  // initial ./ is valid for present directory
73  //if (*pos == '.') pos++;
74  //if (*pos == '/') pos++;
75  for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) {
76  //printf("\tfound: %s, ", pos);
77  pos[0] = '_';
78  //printf("fileName: %s\n", fileName);
79  }
80  }
81 }
82 
83 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
84  //printf("[DS] populateGPUEnvFromDevice\n");
85  size_t size;
86  gpuInfo->mnIsUserCreated = 1;
87  // device
88  gpuInfo->mpDevID = device;
89  gpuInfo->mpArryDevsID = new cl_device_id[1];
90  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
91  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE , sizeof(cl_device_type), (void *) &gpuInfo->mDevType , &size);
92  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
93  // platform
94  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID , &size);
95  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
96  // context
97  cl_context_properties props[3];
98  props[0] = CL_CONTEXT_PLATFORM;
99  props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
100  props[2] = 0;
101  gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus);
102  CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
103  // queue
104  cl_command_queue_properties queueProperties = 0;
105  gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
106  CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
107 
108 }
109 
110 int OpenclDevice::LoadOpencl()
111 {
112 #ifdef WIN32
113  HINSTANCE HOpenclDll = NULL;
114  void * OpenclDll = NULL;
115  //fprintf(stderr, " LoadOpenclDllxx... \n");
116  OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
117  OpenclDll = LoadLibrary( "openCL.dll" );
118  if ( !static_cast<HINSTANCE>( OpenclDll ) )
119  {
120  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
121  FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
122  return 0;
123 
124  }
125  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
126 #endif
127  return 1;
128 }
129 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
130 {
131  envInfo->mpkContext = gpuEnv.mpContext;
132  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
133  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
134 
135  return 1;
136 }
137 
138 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
139 {
140  cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
141  nElements * sizeof(l_uint32), hostbuffer, pStatus);
142 
143  return membuffer;
144 }
145 
146 PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true)
147 {
148  PROCNAME("mapOutputCLBuffer");
149  if (!pixd)
150  {
151  if (memcopy)
152  {
153  if ((pixd = pixCreateTemplate(pixs)) == NULL)
154  (PIX *)ERROR_PTR("pixd not made", procName, NULL);
155  }
156  else
157  {
158  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL)
159  (PIX *)ERROR_PTR("pixd not made", procName, NULL);
160  }
161  }
162  l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
163  elements * sizeof(l_uint32), 0, NULL, NULL, NULL );
164 
165  if (memcopy)
166  {
167  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
168  }
169  else
170  {
171  pixSetData(pixd, pValues);
172  }
173 
174  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL);
175 
176  if (sync)
177  {
178  clFinish( rEnv.mpkCmdQueue );
179  }
180 
181  return pixd;
182 }
183 
184  cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
185 {
186  cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
187  nElements * sizeof(l_int32), NULL, pStatus);
188 
189  if (_pValues != NULL)
190  {
191  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
192  nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
193 
194  memcpy(pValues, _pValues, nElements * sizeof(l_int32));
195 
196  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
197 
198  if (sync)
199  clFinish( rEnv.mpkCmdQueue );
200  }
201 
202  return xValues;
203 }
204 
205 int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
206 {
207  size_t length;
208  cl_int clStatus;
209  cl_uint numPlatforms, numDevices;
210  cl_platform_id *platforms;
211  cl_context_properties cps[3];
212  char platformName[256];
213  unsigned int i;
214 
215 
216  // Have a look at the available platforms.
217 
218  if ( !gpuInfo->mnIsUserCreated )
219  {
220  clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
221  if ( clStatus != CL_SUCCESS )
222  {
223  return 1;
224  }
225  gpuInfo->mpPlatformID = NULL;
226 
227  if ( 0 < numPlatforms )
228  {
229  platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
230  if ( platforms == (cl_platform_id*) NULL )
231  {
232  return 1;
233  }
234  clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
235 
236  if ( clStatus != CL_SUCCESS )
237  {
238  return 1;
239  }
240 
241  for ( i = 0; i < numPlatforms; i++ )
242  {
243  clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
244  sizeof( platformName ), platformName, NULL );
245 
246  if ( clStatus != CL_SUCCESS )
247  {
248  return 1;
249  }
250  gpuInfo->mpPlatformID = platforms[i];
251 
252  //if (!strcmp(platformName, "Intel(R) Coporation"))
253  //if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
254  {
255  gpuInfo->mpPlatformID = platforms[i];
256 
257  if ( getenv("SC_OPENCLCPU") )
258  {
259  clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
260  CL_DEVICE_TYPE_CPU, // device_type for CPU device
261  0, // num_entries
262  NULL, // devices
263  &numDevices);
264  printf("Selecting OpenCL device: CPU (a)\n");
265  }
266  else
267  {
268  clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
269  CL_DEVICE_TYPE_GPU, // device_type for GPU device
270  0, // num_entries
271  NULL, // devices
272  &numDevices);
273  printf("Selecting OpenCL device: GPU (a)\n");
274  }
275  if ( clStatus != CL_SUCCESS )
276  continue;
277 
278  if ( numDevices )
279  break;
280  }
281  }
282  if ( clStatus != CL_SUCCESS )
283  return 1;
284  free( platforms );
285  }
286  if ( NULL == gpuInfo->mpPlatformID )
287  return 1;
288 
289  // Use available platform.
290  cps[0] = CL_CONTEXT_PLATFORM;
291  cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
292  cps[2] = 0;
293  // Set device type for OpenCL
294 
295  if ( getenv("SC_OPENCLCPU") )
296  {
297  gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
298  printf("Selecting OpenCL device: CPU (b)\n");
299  }
300  else
301  {
302  gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
303  printf("Selecting OpenCL device: GPU (b)\n");
304  }
305 
306  gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
307 
308  if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
309  {
310  gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
311  gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
312  printf("Selecting OpenCL device: CPU (c)\n");
313  }
314  if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
315  {
316  gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
317  gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
318  printf("Selecting OpenCL device: DEFAULT (c)\n");
319  }
320  if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
321  return 1;
322  // Detect OpenCL devices.
323  // First, get the size of device list data
324  clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
325  if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
326  return 1;
327  // Now allocate memory for device list based on the size we got earlier
328  gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
329  if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
330  return 1;
331  // Now, get the device list data
332  clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
333  gpuInfo->mpArryDevsID, NULL );
334  if ( clStatus != CL_SUCCESS )
335  return 1;
336 
337  // Create OpenCL command queue.
338  gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
339 
340  if ( clStatus != CL_SUCCESS )
341  return 1;
342  }
343 
344  clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
345  // Check device extensions for double type
346  size_t aDevExtInfoSize = 0;
347 
348  clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
349  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
350 
351  char *aExtInfo = new char[aDevExtInfoSize];
352 
353  clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
354  sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
355  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
356 
357  gpuInfo->mnKhrFp64Flag = 0;
358  gpuInfo->mnAmdFp64Flag = 0;
359 
360  if ( strstr( aExtInfo, "cl_khr_fp64" ) )
361  {
362  gpuInfo->mnKhrFp64Flag = 1;
363  }
364  else
365  {
366  // Check if cl_amd_fp64 extension is supported
367  if ( strstr( aExtInfo, "cl_amd_fp64" ) )
368  gpuInfo->mnAmdFp64Flag = 1;
369  }
370  delete []aExtInfo;
371 
372  return 0;
373 }
374 
375 void OpenclDevice::releaseMorphCLBuffers()
376 {
377  if (pixdCLIntermediate != NULL)
378  clReleaseMemObject(pixdCLIntermediate);
379  if (pixsCLBuffer != NULL)
380  clReleaseMemObject(pixsCLBuffer);
381  if (pixdCLBuffer != NULL)
382  clReleaseMemObject(pixdCLBuffer);
383  if (pixThBuffer != NULL)
384  clReleaseMemObject(pixThBuffer);
385 }
386 
387 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
388 {
389  SetKernelEnv( &rEnv );
390 
391  if (pixThBuffer != NULL)
392  {
393  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
394 
395  //Get the output from ThresholdToPix operation
396  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL);
397  }
398  else
399  {
400  //Get data from the source image
401  l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
402  memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
403 
404  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
405  }
406 
407  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
408 
409  pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
410 
411  return (int)clStatus;
412 }
413 
414 int OpenclDevice::InitEnv()
415 {
416 //PERF_COUNT_START("OD::InitEnv")
417 // printf("[OD] OpenclDevice::InitEnv()\n");
418 #ifdef SAL_WIN32
419  while( 1 )
420  {
421  if( 1 == LoadOpencl() )
422  break;
423  }
424 PERF_COUNT_SUB("LoadOpencl")
425 #endif
426  // sets up environment, compiles programs
427 
428 
429 #if USE_DEVICE_SELECTION
430 
431  InitOpenclRunEnv_DeviceSelection( 0 );
432 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
433 #else
434  // init according to device
435  InitOpenclRunEnv( 0 );
436 #endif
437 //PERF_COUNT_END
438  return 1;
439 }
440 
441 int OpenclDevice::ReleaseOpenclRunEnv()
442 {
443  ReleaseOpenclEnv( &gpuEnv );
444 #ifdef SAL_WIN32
445  FreeOpenclDll();
446 #endif
447  return 1;
448 }
449 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
450 {
451  if ( kCount < 1 )
452  fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
453  strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
454  gpuEnv.mnKernelCount++;
455  return 0;
456 }
457 int OpenclDevice::RegistOpenclKernel()
458 {
459  if ( !gpuEnv.mnIsUserCreated )
460  memset( &gpuEnv, 0, sizeof(gpuEnv) );
461 
462  gpuEnv.mnFileCount = 0; //argc;
463  gpuEnv.mnKernelCount = 0UL;
464 
465  AddKernelConfig( 1, (const char*) "oclAverageSub1" );
466  return 0;
467 }
468 int OpenclDevice::InitOpenclRunEnv( int argc )
469 {
470  int status = 0;
471  if ( MAX_CLKERNEL_NUM <= 0 )
472  {
473  return 1;
474  }
475  if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
476  return 1;
477 
478  if ( !isInited )
479  {
480  RegistOpenclKernel();
481  //initialize devices, context, comand_queue
482  status = InitOpenclRunEnv( &gpuEnv );
483  if ( status )
484  {
485  fprintf(stderr,"init_opencl_env failed.\n");
486  return 1;
487  }
488  fprintf(stderr,"init_opencl_env successed.\n");
489  //initialize program, kernelName, kernelCount
490  if( getenv( "SC_FLOAT" ) )
491  {
492  gpuEnv.mnKhrFp64Flag = 0;
493  gpuEnv.mnAmdFp64Flag = 0;
494  }
495  if( gpuEnv.mnKhrFp64Flag )
496  {
497  fprintf(stderr,"----use khr double type in kernel----\n");
498  status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
499  }
500  else if( gpuEnv.mnAmdFp64Flag )
501  {
502  fprintf(stderr,"----use amd double type in kernel----\n");
503  status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
504  }
505  else
506  {
507  fprintf(stderr,"----use float type in kernel----\n");
508  status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
509  }
510  if ( status == 0 || gpuEnv.mnKernelCount == 0 )
511  {
512  fprintf(stderr,"CompileKernelFile failed.\n");
513  return 1;
514  }
515  fprintf(stderr,"CompileKernelFile successed.\n");
516  isInited = 1;
517  }
518  return 0;
519 }
520 
521 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
522 //PERF_COUNT_START("InitOpenclRunEnv_DS")
523 #if USE_DEVICE_SELECTION
524  if (!isInited) {
525  // after programs compiled, selects best device
526  //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
527  ds_device bestDevice_DS = getDeviceSelection( );
528 //PERF_COUNT_SUB("called getDeviceSelection()")
529  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
530  // overwrite global static GPUEnv with new device
531  if (selectedDeviceIsOpenCL() ) {
532  //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
533  populateGPUEnvFromDevice( &gpuEnv, bestDevice );
534  gpuEnv.mnFileCount = 0; //argc;
535  gpuEnv.mnKernelCount = 0UL;
536 //PERF_COUNT_SUB("populate gpuEnv")
537  CompileKernelFile(&gpuEnv, "");
538 //PERF_COUNT_SUB("CompileKernelFile")
539  } else {
540  //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
541  }
542  isInited = 1;
543  }
544 #endif
545 //PERF_COUNT_END
546  return 0;
547 }
548 
549 
550 OpenclDevice::OpenclDevice()
551 {
552  //InitEnv();
553 }
554 
555 OpenclDevice::~OpenclDevice()
556 {
557  //ReleaseOpenclRunEnv();
558 }
559 
560 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
561 {
562  int i = 0;
563  int clStatus = 0;
564 
565  if ( !isInited )
566  {
567  return 1;
568  }
569 
570  for ( i = 0; i < gpuEnv.mnFileCount; i++ )
571  {
572  if ( gpuEnv.mpArryPrograms[i] )
573  {
574  clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
575  CHECK_OPENCL( clStatus, "clReleaseProgram" );
576  gpuEnv.mpArryPrograms[i] = NULL;
577  }
578  }
579  if ( gpuEnv.mpCmdQueue )
580  {
581  clReleaseCommandQueue( gpuEnv.mpCmdQueue );
582  gpuEnv.mpCmdQueue = NULL;
583  }
584  if ( gpuEnv.mpContext )
585  {
586  clReleaseContext( gpuEnv.mpContext );
587  gpuEnv.mpContext = NULL;
588  }
589  isInited = 0;
590  gpuInfo->mnIsUserCreated = 0;
591  free( gpuInfo->mpArryDevsID );
592  return 1;
593 }
594 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
595 {
596  unsigned int i = 0;
597  cl_int clStatus;
598  int status = 0;
599  char *str = NULL;
600  FILE *fd = NULL;
601  cl_uint numDevices=0;
602  if ( getenv("SC_OPENCLCPU") )
603  {
604  clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
605  CL_DEVICE_TYPE_CPU, // device_type for CPU device
606  0, // num_entries
607  NULL, // devices ID
608  &numDevices);
609  }
610  else
611  {
612  clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
613  CL_DEVICE_TYPE_GPU, // device_type for GPU device
614  0, // num_entries
615  NULL, // devices ID
616  &numDevices);
617  }
618  CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
619  for ( i = 0; i < numDevices; i++ )
620  {
621  char fileName[256] = { 0 }, cl_name[128] = { 0 };
622  if ( gpuEnv.mpArryDevsID[i] != 0 )
623  {
624  char deviceName[1024];
625  clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
626  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
627  str = (char*) strstr( clFileName, (char*) ".cl" );
628  memcpy( cl_name, clFileName, str - clFileName );
629  cl_name[str - clFileName] = '\0';
630  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
631  legalizeFileName(fileName);
632  fd = fopen( fileName, "rb" );
633  status = ( fd != NULL ) ? 1 : 0;
634  }
635  }
636  if ( fd != NULL )
637  {
638  *fhandle = fd;
639  }
640  return status;
641 
642 }
643 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
644 {
645  int i;
646  for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
647  {
648  if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
649  {
650  if ( gpuEnvCached->mpArryPrograms[i] != NULL )
651  {
652  return 1;
653  }
654  }
655  }
656 
657  return 0;
658 }
659 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
660 {
661  FILE *output = NULL;
662  output = fopen( fileName, "wb" );
663  if ( output == NULL )
664  {
665  return 0;
666  }
667 
668  fwrite( birary, sizeof(char), numBytes, output );
669  fclose( output );
670 
671  return 1;
672 
673 }
674 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
675 {
676  unsigned int i = 0;
677  cl_int clStatus;
678  size_t *binarySizes, numDevices;
679  cl_device_id *mpArryDevsID;
680  char **binaries, *str = NULL;
681 
682  clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
683  sizeof(numDevices), &numDevices, NULL );
684  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
685 
686  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
687  if ( mpArryDevsID == NULL )
688  {
689  return 0;
690  }
691  /* grab the handles to all of the devices in the program. */
692  clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
693  sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
694  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
695 
696  /* figure out the sizes of each of the binaries. */
697  binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
698 
699  clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
700  sizeof(size_t) * numDevices, binarySizes, NULL );
701  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
702 
703  /* copy over all of the generated binaries. */
704  binaries = (char**) malloc( sizeof(char *) * numDevices );
705  if ( binaries == NULL )
706  {
707  return 0;
708  }
709 
710  for ( i = 0; i < numDevices; i++ )
711  {
712  if ( binarySizes[i] != 0 )
713  {
714  binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
715  if ( binaries[i] == NULL )
716  {
717  // cleanup all memory allocated so far
718  for(int cleanupIndex = 0; cleanupIndex < i; ++cleanupIndex)
719  {
720  free(binaries[cleanupIndex]);
721  }
722  // cleanup binary array
723  free(binaries);
724 
725  return 0;
726  }
727  }
728  else
729  {
730  binaries[i] = NULL;
731  }
732  }
733 
734  clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
735  sizeof(char *) * numDevices, binaries, NULL );
736  CHECK_OPENCL(clStatus,"clGetProgramInfo");
737 
738  /* dump out each binary into its own separate file. */
739  for ( i = 0; i < numDevices; i++ )
740  {
741  char fileName[256] = { 0 }, cl_name[128] = { 0 };
742 
743  if ( binarySizes[i] != 0 )
744  {
745  char deviceName[1024];
746  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
747  sizeof(deviceName), deviceName, NULL);
748  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
749 
750  str = (char*) strstr( clFileName, (char*) ".cl" );
751  memcpy( cl_name, clFileName, str - clFileName );
752  cl_name[str - clFileName] = '\0';
753  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
754  legalizeFileName(fileName);
755  if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
756  {
757  printf("[OD] write binary[%s] failed\n", fileName);
758  return 0;
759  } //else
760  printf("[OD] write binary[%s] succesfully\n", fileName);
761  }
762  }
763 
764  // Release all resouces and memory
765  for ( i = 0; i < numDevices; i++ )
766  {
767  if ( binaries[i] != NULL )
768  {
769  free( binaries[i] );
770  binaries[i] = NULL;
771  }
772  }
773 
774  if ( binaries != NULL )
775  {
776  free( binaries );
777  binaries = NULL;
778  }
779 
780  if ( binarySizes != NULL )
781  {
782  free( binarySizes );
783  binarySizes = NULL;
784  }
785 
786  if ( mpArryDevsID != NULL )
787  {
788  free( mpArryDevsID );
789  mpArryDevsID = NULL;
790  }
791  return 1;
792 }
793 
794 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
795 {
796  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
797  nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
798  clFinish( rEnv.mpkCmdQueue );
799  if (_pValues != NULL)
800  {
801  for ( int i = 0; i < (int)nElements; i++ )
802  pValues[i] = (l_int32)_pValues[i];
803  }
804 
805  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
806  //clFinish( rEnv.mpkCmdQueue );
807  return;
808 }
809 
810 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
811 {
812 //PERF_COUNT_START("CompileKernelFile")
813  cl_int clStatus = 0;
814  size_t length;
815  char *buildLog = NULL, *binary;
816  const char *source;
817  size_t source_size[1];
818  int b_error, binary_status, binaryExisted, idx;
819  size_t numDevices;
820  cl_device_id *mpArryDevsID;
821  FILE *fd, *fd1;
822  const char* filename = "kernel.cl";
823  //fprintf(stderr, "[OD] CompileKernelFile ... \n");
824  if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
825  {
826  return 1;
827  }
828 
829  idx = gpuInfo->mnFileCount;
830 
831  source = kernel_src;
832 
833  source_size[0] = strlen( source );
834  binaryExisted = 0;
835  binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
836 //PERF_COUNT_SUB("BinaryGenerated")
837  if ( binaryExisted == 1 )
838  {
839  clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
840  sizeof(numDevices), &numDevices, NULL );
841  CHECK_OPENCL( clStatus, "clGetContextInfo" );
842 
843  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
844  if ( mpArryDevsID == NULL )
845  {
846  return 0;
847  }
848 //PERF_COUNT_SUB("get numDevices")
849  b_error = 0;
850  length = 0;
851  b_error |= fseek( fd, 0, SEEK_END ) < 0;
852  b_error |= ( length = ftell(fd) ) <= 0;
853  b_error |= fseek( fd, 0, SEEK_SET ) < 0;
854  if ( b_error )
855  {
856  return 0;
857  }
858 
859  binary = (char*) malloc( length + 2 );
860  if ( !binary )
861  {
862  return 0;
863  }
864 
865  memset( binary, 0, length + 2 );
866  b_error |= fread( binary, 1, length, fd ) != length;
867 
868 
869  fclose( fd );
870 //PERF_COUNT_SUB("read file")
871  fd = NULL;
872  // grab the handles to all of the devices in the context.
873  clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
874  sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
875  CHECK_OPENCL( clStatus, "clGetContextInfo" );
876 //PERF_COUNT_SUB("get devices")
877  //fprintf(stderr, "[OD] Create kernel from binary\n");
878  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
879  mpArryDevsID, &length, (const unsigned char**) &binary,
880  &binary_status, &clStatus );
881  CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
882 //PERF_COUNT_SUB("clCreateProgramWithBinary")
883  free( binary );
884  free( mpArryDevsID );
885  mpArryDevsID = NULL;
886 //PERF_COUNT_SUB("binaryExisted")
887  }
888  else
889  {
890  // create a CL program using the kernel source
891  //fprintf(stderr, "[OD] Create kernel from source\n");
892  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
893  source_size, &clStatus);
894  CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
895 //PERF_COUNT_SUB("!binaryExisted")
896  }
897 
898  if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
899  {
900  return 0;
901  }
902 
903  //char options[512];
904  // create a cl program executable for all the devices specified
905  //printf("[OD] BuildProgram.\n");
906 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
907  if (!gpuInfo->mnIsUserCreated)
908  {
909  clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
910  buildOption, NULL, NULL);
911 //PERF_COUNT_SUB("clBuildProgram notUserCreated")
912  }
913  else
914  {
915  clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
916  buildOption, NULL, NULL);
917 //PERF_COUNT_SUB("clBuildProgram isUserCreated")
918  }
920  if ( clStatus != CL_SUCCESS )
921  {
922  printf ("BuildProgram error!\n");
923  if ( !gpuInfo->mnIsUserCreated )
924  {
925  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
926  CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
927  }
928  else
929  {
930  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
931  CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
932  }
933  if ( clStatus != CL_SUCCESS )
934  {
935  printf("opencl create build log fail\n");
936  return 0;
937  }
938  buildLog = (char*) malloc( length );
939  if ( buildLog == (char*) NULL )
940  {
941  return 0;
942  }
943  if ( !gpuInfo->mnIsUserCreated )
944  {
945  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
946  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
947  }
948  else
949  {
950  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
951  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
952  }
953  if ( clStatus != CL_SUCCESS )
954  {
955  printf("opencl program build info fail\n");
956  return 0;
957  }
958 
959  fd1 = fopen( "kernel-build.log", "w+" );
960  if ( fd1 != NULL )
961  {
962  fwrite( buildLog, sizeof(char), length, fd1 );
963  fclose( fd1 );
964  }
965 
966  free( buildLog );
967 //PERF_COUNT_SUB("build error log")
968  return 0;
969  }
970 
971  strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
972 //PERF_COUNT_SUB("strcpy")
973  if ( binaryExisted == 0 ) {
974  GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
975  PERF_COUNT_SUB("GenerateBinFromKernelSource")
976  }
977 
978  gpuInfo->mnFileCount += 1;
979 //PERF_COUNT_END
980  return 1;
981 }
982 
983 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
984 {
985 PERF_COUNT_START("pixReadFromTiffKernel")
986  cl_int clStatus;
987  KernelEnv rEnv;
988  size_t globalThreads[2];
989  size_t localThreads[2];
990  int gsize;
991  cl_mem valuesCl;
992  cl_mem outputCl;
993 
994  //global and local work dimensions for Horizontal pass
995  gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
996  globalThreads[0] = gsize;
997  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
998  globalThreads[1] = gsize;
999  localThreads[0] = GROUPSIZE_X;
1000  localThreads[1] = GROUPSIZE_Y;
1001 
1002  SetKernelEnv( &rEnv );
1003 
1004  l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
1005  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
1006  CHECK_OPENCL( clStatus, "clCreateKernel");
1007 
1008  //Allocate input and output OCL buffers
1009  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1010  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1011 
1012  //Kernel arguments
1013  clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
1014  CHECK_OPENCL( clStatus, "clSetKernelArg");
1015  clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w );
1016  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1017  clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h );
1018  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1019  clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl );
1020  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1021  clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
1022  CHECK_OPENCL( clStatus, "clSetKernelArg");
1023 
1024  //Kernel enqueue
1025 PERF_COUNT_SUB("before")
1026  clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL );
1027  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
1028 
1029  /* map results back from gpu */
1030  void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus);
1031  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl");
1032  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
1033 
1034  //Sync
1035  clFinish( rEnv.mpkCmdQueue );
1036 PERF_COUNT_SUB("kernel & map")
1038  return pResult;
1039 }
1040 
1041 
1042 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
1043 {
1044 PERF_COUNT_START("pixReadTiffCL")
1045  FILE *fp;
1046 PIX *pix;
1047 
1048  //printf("pixReadTiffCl file");
1049  PROCNAME("pixReadTiff");
1050 
1051  if (!filename)
1052  return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
1053 
1054  if ((fp = fopenReadStream(filename)) == NULL)
1055  return (PIX *)ERROR_PTR("image file not found", procName, NULL);
1056  if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
1057  fclose(fp);
1058  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1059  }
1060  fclose(fp);
1062  return pix;
1063 
1064 }
1065 TIFF *
1066 OpenclDevice::fopenTiffCl(FILE *fp,
1067  const char *modestring)
1068 {
1069 l_int32 fd;
1070 
1071  PROCNAME("fopenTiff");
1072 
1073  if (!fp)
1074  return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
1075  if (!modestring)
1076  return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
1077 
1078  if ((fd = fileno(fp)) < 0)
1079  return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
1080  lseek(fd, 0, SEEK_SET);
1081 
1082  return TIFFFdOpen(fd, "TIFFstream", modestring);
1083 }
1084 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif,
1085  l_int32 *pxres,
1086  l_int32 *pyres)
1087 {
1088 l_uint16 resunit;
1089 l_int32 foundxres, foundyres;
1090 l_float32 fxres, fyres;
1091 
1092  PROCNAME("getTiffStreamResolution");
1093 
1094  if (!tif)
1095  return ERROR_INT("tif not opened", procName, 1);
1096  if (!pxres || !pyres)
1097  return ERROR_INT("&xres and &yres not both defined", procName, 1);
1098  *pxres = *pyres = 0;
1099 
1100  TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
1101  foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
1102  foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
1103  if (!foundxres && !foundyres) return 1;
1104  if (!foundxres && foundyres)
1105  fxres = fyres;
1106  else if (foundxres && !foundyres)
1107  fyres = fxres;
1108 
1109  if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */
1110  *pxres = (l_int32)(2.54 * fxres + 0.5);
1111  *pyres = (l_int32)(2.54 * fyres + 0.5);
1112  }
1113  else {
1114  *pxres = (l_int32)fxres;
1115  *pyres = (l_int32)fyres;
1116  }
1117 
1118  return 0;
1119 }
1120 
1121 struct L_Memstream
1122 {
1123 l_uint8 *buffer; /* expands to hold data when written to; */
1124  /* fixed size when read from. */
1125 size_t bufsize; /* current size allocated when written to; */
1126  /* fixed size of input data when read from. */
1127 size_t offset; /* byte offset from beginning of buffer. */
1128 size_t hw; /* high-water mark; max bytes in buffer. */
1129 l_uint8 **poutdata; /* input param for writing; data goes here. */
1130 size_t *poutsize; /* input param for writing; data size goes here. */
1131 };
1132 typedef struct L_Memstream L_MEMSTREAM;
1133 
1134 /* These are static functions for memory I/O */
1135 static L_MEMSTREAM *memstreamCreateForRead(l_uint8 *indata, size_t pinsize);
1136 static L_MEMSTREAM *memstreamCreateForWrite(l_uint8 **poutdata,
1137  size_t *poutsize);
1138 static tsize_t tiffReadCallback(thandle_t handle, tdata_t data, tsize_t length);
1139 static tsize_t tiffWriteCallback(thandle_t handle, tdata_t data,
1140  tsize_t length);
1141 static toff_t tiffSeekCallback(thandle_t handle, toff_t offset, l_int32 whence);
1142 static l_int32 tiffCloseCallback(thandle_t handle);
1143 static toff_t tiffSizeCallback(thandle_t handle);
1144 static l_int32 tiffMapCallback(thandle_t handle, tdata_t *data, toff_t *length);
1145 static void tiffUnmapCallback(thandle_t handle, tdata_t data, toff_t length);
1146 
1147 
1148 static L_MEMSTREAM *
1149 memstreamCreateForRead(l_uint8 *indata,
1150 size_t insize)
1151 {
1152  L_MEMSTREAM *mstream;
1153 
1154  mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
1155  mstream->buffer = indata; /* handle to input data array */
1156  mstream->bufsize = insize; /* amount of input data */
1157  mstream->hw = insize; /* high-water mark fixed at input data size */
1158  mstream->offset = 0; /* offset always starts at 0 */
1159  return mstream;
1160 }
1161 
1162 
1163 static L_MEMSTREAM *
1164 memstreamCreateForWrite(l_uint8 **poutdata,
1165 size_t *poutsize)
1166 {
1167  L_MEMSTREAM *mstream;
1168 
1169  mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
1170  mstream->buffer = (l_uint8 *)CALLOC(8 * 1024, 1);
1171  mstream->bufsize = 8 * 1024;
1172  mstream->poutdata = poutdata; /* used only at end of write */
1173  mstream->poutsize = poutsize; /* ditto */
1174  mstream->hw = mstream->offset = 0;
1175  return mstream;
1176 }
1177 
1178 
1179 static tsize_t
1180 tiffReadCallback(thandle_t handle,
1181 tdata_t data,
1182 tsize_t length)
1183 {
1184  L_MEMSTREAM *mstream;
1185  size_t amount;
1186 
1187  mstream = (L_MEMSTREAM *)handle;
1188  amount = L_MIN((size_t)length, mstream->hw - mstream->offset);
1189  memcpy(data, mstream->buffer + mstream->offset, amount);
1190  mstream->offset += amount;
1191  return amount;
1192 }
1193 
1194 
1195 static tsize_t
1196 tiffWriteCallback(thandle_t handle,
1197 tdata_t data,
1198 tsize_t length)
1199 {
1200  L_MEMSTREAM *mstream;
1201  size_t newsize;
1202 
1203  /* reallocNew() uses calloc to initialize the array.
1204  * If malloc is used instead, for some of the encoding methods,
1205  * not all the data in 'bufsize' bytes in the buffer will
1206  * have been initialized by the end of the compression. */
1207  mstream = (L_MEMSTREAM *)handle;
1208  if (mstream->offset + length > mstream->bufsize) {
1209  newsize = 2 * (mstream->offset + length);
1210  mstream->buffer = (l_uint8 *)reallocNew((void **)&mstream->buffer,
1211  mstream->offset, newsize);
1212  mstream->bufsize = newsize;
1213  }
1214 
1215  memcpy(mstream->buffer + mstream->offset, data, length);
1216  mstream->offset += length;
1217  mstream->hw = L_MAX(mstream->offset, mstream->hw);
1218  return length;
1219 }
1220 
1221 
1222 static toff_t
1223 tiffSeekCallback(thandle_t handle,
1224 toff_t offset,
1225 l_int32 whence)
1226 {
1227  L_MEMSTREAM *mstream;
1228 
1229  PROCNAME("tiffSeekCallback");
1230  mstream = (L_MEMSTREAM *)handle;
1231  switch (whence) {
1232  case SEEK_SET:
1233  /* fprintf(stderr, "seek_set: offset = %d\n", offset); */
1234  mstream->offset = offset;
1235  break;
1236  case SEEK_CUR:
1237  /* fprintf(stderr, "seek_cur: offset = %d\n", offset); */
1238  mstream->offset += offset;
1239  break;
1240  case SEEK_END:
1241  /* fprintf(stderr, "seek end: hw = %d, offset = %d\n",
1242  mstream->hw, offset); */
1243  mstream->offset = mstream->hw - offset; /* offset >= 0 */
1244  break;
1245  default:
1246  return (toff_t)ERROR_INT("bad whence value", procName,
1247  mstream->offset);
1248  }
1249 
1250  return mstream->offset;
1251 }
1252 
1253 
1254 static l_int32
1255 tiffCloseCallback(thandle_t handle)
1256 {
1257  L_MEMSTREAM *mstream;
1258 
1259  mstream = (L_MEMSTREAM *)handle;
1260  if (mstream->poutdata) { /* writing: save the output data */
1261  *mstream->poutdata = mstream->buffer;
1262  *mstream->poutsize = mstream->hw;
1263  }
1264  FREE(mstream); /* never free the buffer! */
1265  return 0;
1266 }
1267 
1268 
1269 static toff_t
1270 tiffSizeCallback(thandle_t handle)
1271 {
1272  L_MEMSTREAM *mstream;
1273 
1274  mstream = (L_MEMSTREAM *)handle;
1275  return mstream->hw;
1276 }
1277 
1278 
1279 static l_int32
1280 tiffMapCallback(thandle_t handle,
1281 tdata_t *data,
1282 toff_t *length)
1283 {
1284  L_MEMSTREAM *mstream;
1285 
1286  mstream = (L_MEMSTREAM *)handle;
1287  *data = mstream->buffer;
1288  *length = mstream->hw;
1289  return 0;
1290 }
1291 
1292 
1293 static void
1294 tiffUnmapCallback(thandle_t handle,
1295 tdata_t data,
1296 toff_t length)
1297 {
1298  return;
1299 }
1300 
1301 
1318 static TIFF *
1319 fopenTiffMemstream(const char *filename,
1320 const char *operation,
1321 l_uint8 **pdata,
1322 size_t *pdatasize)
1323 {
1324  L_MEMSTREAM *mstream;
1325 
1326  PROCNAME("fopenTiffMemstream");
1327 
1328  if (!filename)
1329  return (TIFF *)ERROR_PTR("filename not defined", procName, NULL);
1330  if (!operation)
1331  return (TIFF *)ERROR_PTR("operation not defined", procName, NULL);
1332  if (!pdata)
1333  return (TIFF *)ERROR_PTR("&data not defined", procName, NULL);
1334  if (!pdatasize)
1335  return (TIFF *)ERROR_PTR("&datasize not defined", procName, NULL);
1336  if (!strcmp(operation, "r") && !strcmp(operation, "w"))
1337  return (TIFF *)ERROR_PTR("operation not 'r' or 'w'}", procName, NULL);
1338 
1339  if (!strcmp(operation, "r"))
1340  mstream = memstreamCreateForRead(*pdata, *pdatasize);
1341  else
1342  mstream = memstreamCreateForWrite(pdata, pdatasize);
1343 
1344  return TIFFClientOpen(filename, operation, mstream,
1345  tiffReadCallback, tiffWriteCallback,
1346  tiffSeekCallback, tiffCloseCallback,
1347  tiffSizeCallback, tiffMapCallback,
1348  tiffUnmapCallback);
1349 }
1350 
1351 
1352 
1353 PIX *
1354 OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32 n)
1355 {
1356  l_int32 i, pagefound;
1357  PIX *pix;
1358  TIFF *tif;
1359  L_MEMSTREAM *memStream;
1360  PROCNAME("pixReadMemTiffCl");
1361 
1362  if (!data)
1363  return (PIX *)ERROR_PTR("data pointer is NULL", procName, NULL);
1364 
1365  if ((tif = fopenTiffMemstream("", "r", (l_uint8 **)&data, &size)) == NULL)
1366  return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
1367 
1368  pagefound = FALSE;
1369  pix = NULL;
1370  for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
1371  if (i == n) {
1372  pagefound = TRUE;
1373  if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
1374  TIFFCleanup(tif);
1375  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1376  }
1377  break;
1378  }
1379  if (TIFFReadDirectory(tif) == 0)
1380  break;
1381  }
1382 
1383  if (pagefound == FALSE) {
1384  L_WARNING("tiff page %d not found", procName);
1385  TIFFCleanup(tif);
1386  return NULL;
1387  }
1388 
1389  TIFFCleanup(tif);
1390  return pix;
1391 }
1392 
1393 PIX *
1394 OpenclDevice::pixReadStreamTiffCl(FILE *fp,
1395  l_int32 n)
1396 {
1397 l_int32 i, pagefound;
1398 PIX *pix;
1399 TIFF *tif;
1400 
1401  PROCNAME("pixReadStreamTiff");
1402 
1403  if (!fp)
1404  return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
1405 
1406  if ((tif = fopenTiffCl(fp, "rb")) == NULL)
1407  return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
1408 
1409  pagefound = FALSE;
1410  pix = NULL;
1411  for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
1412  if (i == n) {
1413  pagefound = TRUE;
1414  if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
1415  TIFFCleanup(tif);
1416  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1417  }
1418  break;
1419  }
1420  if (TIFFReadDirectory(tif) == 0)
1421  break;
1422  }
1423 
1424  if (pagefound == FALSE) {
1425  L_WARNING("tiff page %d not found", procName, n);
1426  TIFFCleanup(tif);
1427  return NULL;
1428  }
1429 
1430  TIFFCleanup(tif);
1431  return pix;
1432 }
1433 
1434 static l_int32
1435 getTiffCompressedFormat(l_uint16 tiffcomp)
1436 {
1437 l_int32 comptype;
1438 
1439  switch (tiffcomp)
1440  {
1441  case COMPRESSION_CCITTFAX4:
1442  comptype = IFF_TIFF_G4;
1443  break;
1444  case COMPRESSION_CCITTFAX3:
1445  comptype = IFF_TIFF_G3;
1446  break;
1447  case COMPRESSION_CCITTRLE:
1448  comptype = IFF_TIFF_RLE;
1449  break;
1450  case COMPRESSION_PACKBITS:
1451  comptype = IFF_TIFF_PACKBITS;
1452  break;
1453  case COMPRESSION_LZW:
1454  comptype = IFF_TIFF_LZW;
1455  break;
1456  case COMPRESSION_ADOBE_DEFLATE:
1457  comptype = IFF_TIFF_ZIP;
1458  break;
1459  default:
1460  comptype = IFF_TIFF;
1461  break;
1462  }
1463  return comptype;
1464 }
1465 
1466 void compare(l_uint32 *cpu, l_uint32 *gpu,int size)
1467 {
1468  for(int i=0;i<size;i++)
1469  {
1470  if(cpu[i]!=gpu[i])
1471  {
1472  printf("\ndoesnot match\n");
1473  return;
1474  }
1475  }
1476  printf("\nit matches\n");
1477 
1478 }
1479 
1480 //OpenCL implementation of pixReadFromTiffStream.
1481 //Similar to the CPU implentation of pixReadFromTiffStream
1482 PIX *
1483 OpenclDevice::pixReadFromTiffStreamCl(TIFF *tif)
1484 {
1485 l_uint8 *linebuf, *data;
1486 l_uint16 spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
1487 l_uint16 *redmap, *greenmap, *bluemap;
1488 l_int32 d, wpl, bpl, comptype, i, ncolors;
1489 l_int32 xres, yres;
1490 l_uint32 w, h;
1491 l_uint32 *line, *tiffdata;
1492 PIX *pix;
1493 PIXCMAP *cmap;
1494 
1495  PROCNAME("pixReadFromTiffStream");
1496 
1497  if (!tif)
1498  return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
1499 
1500 
1501  TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
1502  TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
1503  bpp = bps * spp;
1504  if (bpp > 32)
1505  return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
1506  if (spp == 1)
1507  d = bps;
1508  else if (spp == 3 || spp == 4)
1509  d = 32;
1510  else
1511  return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
1512 
1513  TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
1514  TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
1515  tiffbpl = TIFFScanlineSize(tif);
1516 
1517  if ((pix = pixCreate(w, h, d)) == NULL)
1518  return (PIX *)ERROR_PTR("pix not made", procName, NULL);
1519  data = (l_uint8 *)pixGetData(pix);
1520  wpl = pixGetWpl(pix);
1521  bpl = 4 * wpl;
1522 
1523 
1524  if (spp == 1) {
1525  if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL)
1526  return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
1527 
1528  for (i = 0 ; i < h ; i++) {
1529  if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
1530  FREE(linebuf);
1531  pixDestroy(&pix);
1532  return (PIX *)ERROR_PTR("line read fail", procName, NULL);
1533  }
1534  memcpy((char *)data, (char *)linebuf, tiffbpl);
1535  data += bpl;
1536  }
1537  if (bps <= 8)
1538  pixEndianByteSwap(pix);
1539  else
1540  pixEndianTwoByteSwap(pix);
1541  FREE(linebuf);
1542  }
1543  else {
1544  if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
1545  pixDestroy(&pix);
1546  return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
1547  }
1548  if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
1549  ORIENTATION_TOPLEFT, 0)) {
1550  FREE(tiffdata);
1551  pixDestroy(&pix);
1552  return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
1553  }
1554  line = pixGetData(pix);
1555 
1556  //Invoke the OpenCL kernel for pixReadFromTiff
1557  l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
1558  pixSetData(pix, output_gpu);
1559 
1560  FREE(tiffdata);
1561  }
1562 
1563  if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
1564  pixSetXRes(pix, xres);
1565  pixSetYRes(pix, yres);
1566  }
1567 
1568 
1569  TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
1570  comptype = getTiffCompressedFormat(tiffcomp);
1571  pixSetInputFormat(pix, comptype);
1572 
1573  if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
1574 
1575  if ((cmap = pixcmapCreate(bps)) == NULL) {
1576  pixDestroy(&pix);
1577  return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
1578  }
1579  ncolors = 1 << bps;
1580  for (i = 0; i < ncolors; i++)
1581  pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
1582  bluemap[i] >> 8);
1583  pixSetColormap(pix, cmap);
1584  }
1585  else {
1586  if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
1587 
1588  if (tiffcomp == COMPRESSION_CCITTFAX3 ||
1589  tiffcomp == COMPRESSION_CCITTFAX4 ||
1590  tiffcomp == COMPRESSION_CCITTRLE ||
1591  tiffcomp == COMPRESSION_CCITTRLEW) {
1592  photometry = PHOTOMETRIC_MINISWHITE;
1593  }
1594  else
1595  photometry = PHOTOMETRIC_MINISBLACK;
1596  }
1597  if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
1598  (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
1599  pixInvert(pix, pix);
1600  }
1601 
1602  if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
1603  if (orientation >= 1 && orientation <= 8) {
1604  struct tiff_transform *transform =
1605  &tiff_orientation_transforms[orientation - 1];
1606  if (transform->vflip) pixFlipTB(pix, pix);
1607  if (transform->hflip) pixFlipLR(pix, pix);
1608  if (transform->rotate) {
1609  PIX *oldpix = pix;
1610  pix = pixRotate90(oldpix, transform->rotate);
1611  pixDestroy(&oldpix);
1612  }
1613  }
1614  }
1615 
1616  return pix;
1617 }
1618 
1619 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1620 cl_int
1621 pixDilateCL_55(l_int32 wpl, l_int32 h)
1622 {
1623  size_t globalThreads[2];
1624  cl_mem pixtemp;
1625  cl_int status;
1626  int gsize;
1627  size_t localThreads[2];
1628 
1629  //Horizontal pass
1630  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1631  globalThreads[0] = gsize;
1632  globalThreads[1] = GROUPSIZE_HMORY;
1633  localThreads[0] = GROUPSIZE_HMORX;
1634  localThreads[1] = GROUPSIZE_HMORY;
1635 
1636  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
1637 
1638  status = clSetKernelArg(rEnv.mpkKernel,
1639  0,
1640  sizeof(cl_mem),
1641  &pixsCLBuffer);
1642  status = clSetKernelArg(rEnv.mpkKernel,
1643  1,
1644  sizeof(cl_mem),
1645  &pixdCLBuffer);
1646  status = clSetKernelArg(rEnv.mpkKernel,
1647  2,
1648  sizeof(wpl),
1649  (const void *)&wpl);
1650  status = clSetKernelArg(rEnv.mpkKernel,
1651  3,
1652  sizeof(h),
1653  (const void *)&h);
1654 
1655  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1656  rEnv.mpkKernel,
1657  2,
1658  NULL,
1659  globalThreads,
1660  localThreads,
1661  0,
1662  NULL,
1663  NULL);
1664 
1665  //Swap source and dest buffers
1666  pixtemp = pixsCLBuffer;
1667  pixsCLBuffer = pixdCLBuffer;
1668  pixdCLBuffer = pixtemp;
1669 
1670  //Vertical
1671  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1672  globalThreads[0] = gsize;
1673  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1674  globalThreads[1] = gsize;
1675  localThreads[0] = GROUPSIZE_X;
1676  localThreads[1] = GROUPSIZE_Y;
1677 
1678  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
1679 
1680  status = clSetKernelArg(rEnv.mpkKernel,
1681  0,
1682  sizeof(cl_mem),
1683  &pixsCLBuffer);
1684  status = clSetKernelArg(rEnv.mpkKernel,
1685  1,
1686  sizeof(cl_mem),
1687  &pixdCLBuffer);
1688  status = clSetKernelArg(rEnv.mpkKernel,
1689  2,
1690  sizeof(wpl),
1691  (const void *)&wpl);
1692  status = clSetKernelArg(rEnv.mpkKernel,
1693  3,
1694  sizeof(h),
1695  (const void *)&h);
1696  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1697  rEnv.mpkKernel,
1698  2,
1699  NULL,
1700  globalThreads,
1701  localThreads,
1702  0,
1703  NULL,
1704  NULL);
1705 
1706  return status;
1707 }
1708 
1709 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1710 cl_int
1711 pixErodeCL_55(l_int32 wpl, l_int32 h)
1712 {
1713  size_t globalThreads[2];
1714  cl_mem pixtemp;
1715  cl_int status;
1716  int gsize;
1717  l_uint32 fwmask, lwmask;
1718  size_t localThreads[2];
1719 
1720  lwmask = lmask32[32 - 2];
1721  fwmask = rmask32[32 - 2];
1722 
1723  //Horizontal pass
1724  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1725  globalThreads[0] = gsize;
1726  globalThreads[1] = GROUPSIZE_HMORY;
1727  localThreads[0] = GROUPSIZE_HMORX;
1728  localThreads[1] = GROUPSIZE_HMORY;
1729 
1730  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
1731 
1732  status = clSetKernelArg(rEnv.mpkKernel,
1733  0,
1734  sizeof(cl_mem),
1735  &pixsCLBuffer);
1736  status = clSetKernelArg(rEnv.mpkKernel,
1737  1,
1738  sizeof(cl_mem),
1739  &pixdCLBuffer);
1740  status = clSetKernelArg(rEnv.mpkKernel,
1741  2,
1742  sizeof(wpl),
1743  (const void *)&wpl);
1744  status = clSetKernelArg(rEnv.mpkKernel,
1745  3,
1746  sizeof(h),
1747  (const void *)&h);
1748 
1749  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1750  rEnv.mpkKernel,
1751  2,
1752  NULL,
1753  globalThreads,
1754  localThreads,
1755  0,
1756  NULL,
1757  NULL);
1758 
1759  //Swap source and dest buffers
1760  pixtemp = pixsCLBuffer;
1761  pixsCLBuffer = pixdCLBuffer;
1762  pixdCLBuffer = pixtemp;
1763 
1764  //Vertical
1765  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1766  globalThreads[0] = gsize;
1767  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1768  globalThreads[1] = gsize;
1769  localThreads[0] = GROUPSIZE_X;
1770  localThreads[1] = GROUPSIZE_Y;
1771 
1772  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
1773 
1774  status = clSetKernelArg(rEnv.mpkKernel,
1775  0,
1776  sizeof(cl_mem),
1777  &pixsCLBuffer);
1778  status = clSetKernelArg(rEnv.mpkKernel,
1779  1,
1780  sizeof(cl_mem),
1781  &pixdCLBuffer);
1782  status = clSetKernelArg(rEnv.mpkKernel,
1783  2,
1784  sizeof(wpl),
1785  (const void *)&wpl);
1786  status = clSetKernelArg(rEnv.mpkKernel,
1787  3,
1788  sizeof(h),
1789  (const void *)&h);
1790  status = clSetKernelArg(rEnv.mpkKernel,
1791  4,
1792  sizeof(fwmask),
1793  (const void *)&fwmask);
1794  status = clSetKernelArg(rEnv.mpkKernel,
1795  5,
1796  sizeof(lwmask),
1797  (const void *)&lwmask);
1798  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1799  rEnv.mpkKernel,
1800  2,
1801  NULL,
1802  globalThreads,
1803  localThreads,
1804  0,
1805  NULL,
1806  NULL);
1807 
1808  return status;
1809 }
1810 
1811 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
1812 cl_int
1813 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1814 {
1815  l_int32 xp, yp, xn, yn;
1816  SEL* sel;
1817  size_t globalThreads[2];
1818  cl_mem pixtemp;
1819  cl_int status;
1820  int gsize;
1821  size_t localThreads[2];
1822  char isEven;
1823 
1824  OpenclDevice::SetKernelEnv( &rEnv );
1825 
1826  if (hsize == 5 && vsize == 5)
1827  {
1828  //Specific case for 5x5
1829  status = pixDilateCL_55(wpl, h);
1830  return status;
1831  }
1832 
1833  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1834 
1835  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1836 
1837  //global and local work dimensions for Horizontal pass
1838  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1839  globalThreads[0] = gsize;
1840  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1841  globalThreads[1] = gsize;
1842  localThreads[0] = GROUPSIZE_X;
1843  localThreads[1] = GROUPSIZE_Y;
1844 
1845  if (xp > 31 || xn > 31)
1846  {
1847  //Generic case.
1848  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status );
1849 
1850  status = clSetKernelArg(rEnv.mpkKernel,
1851  0,
1852  sizeof(cl_mem),
1853  &pixsCLBuffer);
1854  status = clSetKernelArg(rEnv.mpkKernel,
1855  1,
1856  sizeof(cl_mem),
1857  &pixdCLBuffer);
1858  status = clSetKernelArg(rEnv.mpkKernel,
1859  2,
1860  sizeof(xp),
1861  (const void *)&xp);
1862  status = clSetKernelArg(rEnv.mpkKernel,
1863  3,
1864  sizeof(xn),
1865  (const void *)&xn);
1866  status = clSetKernelArg(rEnv.mpkKernel,
1867  4,
1868  sizeof(wpl),
1869  (const void *)&wpl);
1870  status = clSetKernelArg(rEnv.mpkKernel,
1871  5,
1872  sizeof(h),
1873  (const void *)&h);
1874  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1875  rEnv.mpkKernel,
1876  2,
1877  NULL,
1878  globalThreads,
1879  localThreads,
1880  0,
1881  NULL,
1882  NULL);
1883 
1884  if (yp > 0 || yn > 0)
1885  {
1886  pixtemp = pixsCLBuffer;
1887  pixsCLBuffer = pixdCLBuffer;
1888  pixdCLBuffer = pixtemp;
1889  }
1890  }
1891  else if (xp > 0 || xn > 0 )
1892  {
1893  //Specfic Horizontal pass kernel for half width < 32
1894  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status );
1895  isEven = (xp != xn);
1896 
1897  status = clSetKernelArg(rEnv.mpkKernel,
1898  0,
1899  sizeof(cl_mem),
1900  &pixsCLBuffer);
1901  status = clSetKernelArg(rEnv.mpkKernel,
1902  1,
1903  sizeof(cl_mem),
1904  &pixdCLBuffer);
1905  status = clSetKernelArg(rEnv.mpkKernel,
1906  2,
1907  sizeof(xp),
1908  (const void *)&xp);
1909  status = clSetKernelArg(rEnv.mpkKernel,
1910  3,
1911  sizeof(wpl),
1912  (const void *)&wpl);
1913  status = clSetKernelArg(rEnv.mpkKernel,
1914  4,
1915  sizeof(h),
1916  (const void *)&h);
1917  status = clSetKernelArg(rEnv.mpkKernel,
1918  5,
1919  sizeof(isEven),
1920  (const void *)&isEven);
1921  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1922  rEnv.mpkKernel,
1923  2,
1924  NULL,
1925  globalThreads,
1926  localThreads,
1927  0,
1928  NULL,
1929  NULL);
1930 
1931  if (yp > 0 || yn > 0)
1932  {
1933  pixtemp = pixsCLBuffer;
1934  pixsCLBuffer = pixdCLBuffer;
1935  pixdCLBuffer = pixtemp;
1936  }
1937  }
1938 
1939  if (yp > 0 || yn > 0)
1940  {
1941  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
1942 
1943  status = clSetKernelArg(rEnv.mpkKernel,
1944  0,
1945  sizeof(cl_mem),
1946  &pixsCLBuffer);
1947  status = clSetKernelArg(rEnv.mpkKernel,
1948  1,
1949  sizeof(cl_mem),
1950  &pixdCLBuffer);
1951  status = clSetKernelArg(rEnv.mpkKernel,
1952  2,
1953  sizeof(yp),
1954  (const void *)&yp);
1955  status = clSetKernelArg(rEnv.mpkKernel,
1956  3,
1957  sizeof(wpl),
1958  (const void *)&wpl);
1959  status = clSetKernelArg(rEnv.mpkKernel,
1960  4,
1961  sizeof(h),
1962  (const void *)&h);
1963  status = clSetKernelArg(rEnv.mpkKernel,
1964  5,
1965  sizeof(yn),
1966  (const void *)&yn);
1967  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1968  rEnv.mpkKernel,
1969  2,
1970  NULL,
1971  globalThreads,
1972  localThreads,
1973  0,
1974  NULL,
1975  NULL);
1976  }
1977 
1978 
1979  return status;
1980 }
1981 
1982 //Morphology Erode operation. Invokes the relevant OpenCL kernels
1983 cl_int
1984 pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h)
1985 {
1986 
1987  l_int32 xp, yp, xn, yn;
1988  SEL* sel;
1989  size_t globalThreads[2];
1990  size_t localThreads[2];
1991  cl_mem pixtemp;
1992  cl_int status;
1993  int gsize;
1994  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1995  l_uint32 rwmask, lwmask;
1996  char isEven;
1997 
1998  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1999 
2000  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
2001 
2002  OpenclDevice::SetKernelEnv( &rEnv );
2003 
2004  if (hsize == 5 && vsize == 5 && isAsymmetric)
2005  {
2006  //Specific kernel for 5x5
2007  status = pixErodeCL_55(wpl, h);
2008  return status;
2009  }
2010 
2011  rwmask = rmask32[32 - (xp & 31)];
2012  lwmask = lmask32[32 - (xn & 31)];
2013 
2014  //global and local work dimensions for Horizontal pass
2015  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2016  globalThreads[0] = gsize;
2017  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2018  globalThreads[1] = gsize;
2019  localThreads[0] = GROUPSIZE_X;
2020  localThreads[1] = GROUPSIZE_Y;
2021 
2022  //Horizontal Pass
2023  if (xp > 31 || xn > 31 )
2024  {
2025  //Generic case.
2026  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status );
2027 
2028  status = clSetKernelArg(rEnv.mpkKernel,
2029  0,
2030  sizeof(cl_mem),
2031  &pixsCLBuffer);
2032  status = clSetKernelArg(rEnv.mpkKernel,
2033  1,
2034  sizeof(cl_mem),
2035  &pixdCLBuffer);
2036  status = clSetKernelArg(rEnv.mpkKernel,
2037  2,
2038  sizeof(xp),
2039  (const void *)&xp);
2040  status = clSetKernelArg(rEnv.mpkKernel,
2041  3,
2042  sizeof(xn),
2043  (const void *)&xn);
2044  status = clSetKernelArg(rEnv.mpkKernel,
2045  4,
2046  sizeof(wpl),
2047  (const void *)&wpl);
2048  status = clSetKernelArg(rEnv.mpkKernel,
2049  5,
2050  sizeof(h),
2051  (const void *)&h);
2052  status = clSetKernelArg(rEnv.mpkKernel,
2053  6,
2054  sizeof(isAsymmetric),
2055  (const void *)&isAsymmetric);
2056  status = clSetKernelArg(rEnv.mpkKernel,
2057  7,
2058  sizeof(rwmask),
2059  (const void *)&rwmask);
2060  status = clSetKernelArg(rEnv.mpkKernel,
2061  8,
2062  sizeof(lwmask),
2063  (const void *)&lwmask);
2064  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2065  rEnv.mpkKernel,
2066  2,
2067  NULL,
2068  globalThreads,
2069  localThreads,
2070  0,
2071  NULL,
2072  NULL);
2073 
2074  if (yp > 0 || yn > 0)
2075  {
2076  pixtemp = pixsCLBuffer;
2077  pixsCLBuffer = pixdCLBuffer;
2078  pixdCLBuffer = pixtemp;
2079  }
2080  }
2081  else if (xp > 0 || xn > 0)
2082  {
2083  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status );
2084  isEven = (xp != xn);
2085 
2086  status = clSetKernelArg(rEnv.mpkKernel,
2087  0,
2088  sizeof(cl_mem),
2089  &pixsCLBuffer);
2090  status = clSetKernelArg(rEnv.mpkKernel,
2091  1,
2092  sizeof(cl_mem),
2093  &pixdCLBuffer);
2094  status = clSetKernelArg(rEnv.mpkKernel,
2095  2,
2096  sizeof(xp),
2097  (const void *)&xp);
2098  status = clSetKernelArg(rEnv.mpkKernel,
2099  3,
2100  sizeof(wpl),
2101  (const void *)&wpl);
2102  status = clSetKernelArg(rEnv.mpkKernel,
2103  4,
2104  sizeof(h),
2105  (const void *)&h);
2106  status = clSetKernelArg(rEnv.mpkKernel,
2107  5,
2108  sizeof(isAsymmetric),
2109  (const void *)&isAsymmetric);
2110  status = clSetKernelArg(rEnv.mpkKernel,
2111  6,
2112  sizeof(rwmask),
2113  (const void *)&rwmask);
2114  status = clSetKernelArg(rEnv.mpkKernel,
2115  7,
2116  sizeof(lwmask),
2117  (const void *)&lwmask);
2118  status = clSetKernelArg(rEnv.mpkKernel,
2119  8,
2120  sizeof(isEven),
2121  (const void *)&isEven);
2122  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2123  rEnv.mpkKernel,
2124  2,
2125  NULL,
2126  globalThreads,
2127  localThreads,
2128  0,
2129  NULL,
2130  NULL);
2131 
2132  if (yp > 0 || yn > 0)
2133  {
2134  pixtemp = pixsCLBuffer;
2135  pixsCLBuffer = pixdCLBuffer;
2136  pixdCLBuffer = pixtemp;
2137  }
2138  }
2139 
2140  //Vertical Pass
2141  if (yp > 0 || yn > 0)
2142  {
2143  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status );
2144 
2145  status = clSetKernelArg(rEnv.mpkKernel,
2146  0,
2147  sizeof(cl_mem),
2148  &pixsCLBuffer);
2149  status = clSetKernelArg(rEnv.mpkKernel,
2150  1,
2151  sizeof(cl_mem),
2152  &pixdCLBuffer);
2153  status = clSetKernelArg(rEnv.mpkKernel,
2154  2,
2155  sizeof(yp),
2156  (const void *)&yp);
2157  status = clSetKernelArg(rEnv.mpkKernel,
2158  3,
2159  sizeof(wpl),
2160  (const void *)&wpl);
2161  status = clSetKernelArg(rEnv.mpkKernel,
2162  4,
2163  sizeof(h),
2164  (const void *)&h);
2165  status = clSetKernelArg(rEnv.mpkKernel,
2166  5,
2167  sizeof(isAsymmetric),
2168  (const void *)&isAsymmetric);
2169  status = clSetKernelArg(rEnv.mpkKernel,
2170  6,
2171  sizeof(yn),
2172  (const void *)&yn);
2173  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2174  rEnv.mpkKernel,
2175  2,
2176  NULL,
2177  globalThreads,
2178  localThreads,
2179  0,
2180  NULL,
2181  NULL);
2182  }
2183 
2184  return status;
2185 }
2186 
2187 // OpenCL implementation of Morphology Dilate
2188 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2189 PIX*
2190 OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false)
2191 {
2192  l_uint32 wpl, h;
2193 
2194  wpl = pixGetWpl(pixs);
2195  h = pixGetHeight(pixs);
2196 
2197  clStatus = pixDilateCL(hsize, vsize, wpl, h);
2198 
2199  if (reqDataCopy)
2200  {
2201  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false);
2202  }
2203 
2204  return pixd;
2205 }
2206 
2207 // OpenCL implementation of Morphology Erode
2208 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2209 PIX*
2210 OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false)
2211 {
2212  l_uint32 wpl, h;
2213 
2214  wpl = pixGetWpl(pixs);
2215  h = pixGetHeight(pixs);
2216 
2217  clStatus = pixErodeCL(hsize, vsize, wpl, h);
2218 
2219  if (reqDataCopy)
2220  {
2221  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2222  }
2223 
2224  return pixd;
2225 }
2226 
2227 //Morphology Open operation. Invokes the relevant OpenCL kernels
2228 cl_int
2229 pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
2230 {
2231  cl_int status;
2232  cl_mem pixtemp;
2233 
2234  //Erode followed by Dilate
2235  status = pixErodeCL(hsize, vsize, wpl, h);
2236 
2237  pixtemp = pixsCLBuffer;
2238  pixsCLBuffer = pixdCLBuffer;
2239  pixdCLBuffer = pixtemp;
2240 
2241  status = pixDilateCL(hsize, vsize, wpl, h);
2242 
2243  return status;
2244 }
2245 
2246 //Morphology Close operation. Invokes the relevant OpenCL kernels
2247 cl_int
2248 pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
2249 {
2250  cl_int status;
2251  cl_mem pixtemp;
2252 
2253  //Dilate followed by Erode
2254  status = pixDilateCL(hsize, vsize, wpl, h);
2255 
2256  pixtemp = pixsCLBuffer;
2257  pixsCLBuffer = pixdCLBuffer;
2258  pixdCLBuffer = pixtemp;
2259 
2260  status = pixErodeCL(hsize, vsize, wpl, h);
2261 
2262  return status;
2263 }
2264 
2265 // OpenCL implementation of Morphology Close
2266 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2267 PIX*
2268 OpenclDevice::pixCloseBrickCL(PIX *pixd,
2269  PIX *pixs,
2270  l_int32 hsize,
2271  l_int32 vsize,
2272  bool reqDataCopy = false)
2273 {
2274  l_uint32 wpl, h;
2275 
2276  wpl = pixGetWpl(pixs);
2277  h = pixGetHeight(pixs);
2278 
2279  clStatus = pixCloseCL(hsize, vsize, wpl, h);
2280 
2281  if (reqDataCopy)
2282  {
2283  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2284  }
2285 
2286  return pixd;
2287 }
2288 
2289 // OpenCL implementation of Morphology Open
2290 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2291 PIX*
2292 OpenclDevice::pixOpenBrickCL(PIX *pixd,
2293  PIX *pixs,
2294  l_int32 hsize,
2295  l_int32 vsize,
2296  bool reqDataCopy = false)
2297 {
2298  l_uint32 wpl, h;
2299 
2300  wpl = pixGetWpl(pixs);
2301  h = pixGetHeight(pixs);
2302 
2303  clStatus = pixOpenCL(hsize, vsize, wpl, h);
2304 
2305  if (reqDataCopy)
2306  {
2307  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2308  }
2309 
2310  return pixd;
2311 }
2312 
2313 //pix OR operation: outbuffer = buffer1 | buffer2
2314 cl_int
2315 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
2316 {
2317  cl_int status;
2318  size_t globalThreads[2];
2319  int gsize;
2320  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
2321 
2322  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2323  globalThreads[0] = gsize;
2324  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2325  globalThreads[1] = gsize;
2326 
2327  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
2328 
2329  status = clSetKernelArg(rEnv.mpkKernel,
2330  0,
2331  sizeof(cl_mem),
2332  &buffer1);
2333  status = clSetKernelArg(rEnv.mpkKernel,
2334  1,
2335  sizeof(cl_mem),
2336  &buffer2);
2337  status = clSetKernelArg(rEnv.mpkKernel,
2338  2,
2339  sizeof(cl_mem),
2340  &outbuffer);
2341  status = clSetKernelArg(rEnv.mpkKernel,
2342  3,
2343  sizeof(wpl),
2344  (const void *)&wpl);
2345  status = clSetKernelArg(rEnv.mpkKernel,
2346  4,
2347  sizeof(h),
2348  (const void *)&h);
2349  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2350  rEnv.mpkKernel,
2351  2,
2352  NULL,
2353  globalThreads,
2354  localThreads,
2355  0,
2356  NULL,
2357  NULL);
2358 
2359  return status;
2360 }
2361 
2362 //pix AND operation: outbuffer = buffer1 & buffer2
2363 cl_int
2364 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
2365 {
2366  cl_int status;
2367  size_t globalThreads[2];
2368  int gsize;
2369  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
2370 
2371  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2372  globalThreads[0] = gsize;
2373  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2374  globalThreads[1] = gsize;
2375 
2376  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
2377 
2378  // Enqueue a kernel run call.
2379  status = clSetKernelArg(rEnv.mpkKernel,
2380  0,
2381  sizeof(cl_mem),
2382  &buffer1);
2383  status = clSetKernelArg(rEnv.mpkKernel,
2384  1,
2385  sizeof(cl_mem),
2386  &buffer2);
2387  status = clSetKernelArg(rEnv.mpkKernel,
2388  2,
2389  sizeof(cl_mem),
2390  &outbuffer);
2391  status = clSetKernelArg(rEnv.mpkKernel,
2392  3,
2393  sizeof(wpl),
2394  (const void *)&wpl);
2395  status = clSetKernelArg(rEnv.mpkKernel,
2396  4,
2397  sizeof(h),
2398  (const void *)&h);
2399  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2400  rEnv.mpkKernel,
2401  2,
2402  NULL,
2403  globalThreads,
2404  localThreads,
2405  0,
2406  NULL,
2407  NULL);
2408 
2409  return status;
2410 }
2411 
2412 //output = buffer1 & ~(buffer2)
2413 cl_int
2414 pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL)
2415 {
2416  cl_int status;
2417  size_t globalThreads[2];
2418  int gsize;
2419  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
2420 
2421  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2422  globalThreads[0] = gsize;
2423  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2424  globalThreads[1] = gsize;
2425 
2426  if (outBuffer != NULL)
2427  {
2428  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status );
2429  }
2430  else
2431  {
2432  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status );
2433  }
2434 
2435  // Enqueue a kernel run call.
2436  status = clSetKernelArg(rEnv.mpkKernel,
2437  0,
2438  sizeof(cl_mem),
2439  &buffer1);
2440  status = clSetKernelArg(rEnv.mpkKernel,
2441  1,
2442  sizeof(cl_mem),
2443  &buffer2);
2444  status = clSetKernelArg(rEnv.mpkKernel,
2445  2,
2446  sizeof(wpl),
2447  (const void *)&wpl);
2448  status = clSetKernelArg(rEnv.mpkKernel,
2449  3,
2450  sizeof(h),
2451  (const void *)&h);
2452  if (outBuffer != NULL)
2453  {
2454  status = clSetKernelArg(rEnv.mpkKernel,
2455  4,
2456  sizeof(cl_mem),
2457  (const void *)&outBuffer);
2458  }
2459  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2460  rEnv.mpkKernel,
2461  2,
2462  NULL,
2463  globalThreads,
2464  localThreads,
2465  0,
2466  NULL,
2467  NULL);
2468 
2469  return status;
2470 }
2471 
2472 // OpenCL implementation of Subtract pix
2473 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2474 PIX*
2475 OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy = false)
2476 {
2477  l_uint32 wpl, h;
2478 
2479  PROCNAME("pixSubtractCL");
2480 
2481  if (!pixs1)
2482  return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
2483  if (!pixs2)
2484  return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
2485  if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
2486  return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
2487 
2488 #if EQUAL_SIZE_WARNING
2489  if (!pixSizesEqual(pixs1, pixs2))
2490  L_WARNING("pixs1 and pixs2 not equal sizes", procName);
2491 #endif /* EQUAL_SIZE_WARNING */
2492 
2493  wpl = pixGetWpl(pixs1);
2494  h = pixGetHeight(pixs1);
2495 
2496  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2497 
2498  if (reqDataCopy)
2499  {
2500  //Read back output data from OCL buffer to cpu
2501  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
2502  }
2503 
2504  return pixd;
2505 }
2506 
2507 // OpenCL implementation of Hollow pix
2508 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2509 PIX*
2510 OpenclDevice::pixHollowCL(PIX *pixd,
2511  PIX *pixs,
2512  l_int32 close_hsize,
2513  l_int32 close_vsize,
2514  l_int32 open_hsize,
2515  l_int32 open_vsize,
2516  bool reqDataCopy = false)
2517 {
2518  l_uint32 wpl, h;
2519  cl_mem pixtemp;
2520 
2521  wpl = pixGetWpl(pixs);
2522  h = pixGetHeight(pixs);
2523 
2524  //First step : Close Morph operation: Dilate followed by Erode
2525  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
2526 
2527  //Store the output of close operation in an intermediate buffer
2528  //this will be later used for pixsubtract
2529  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
2530 
2531  //Second step: Open Operation - Erode followed by Dilate
2532  pixtemp = pixsCLBuffer;
2533  pixsCLBuffer = pixdCLBuffer;
2534  pixdCLBuffer = pixtemp;
2535 
2536  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
2537 
2538  //Third step: Subtract : (Close - Open)
2539  pixtemp = pixsCLBuffer;
2540  pixsCLBuffer = pixdCLBuffer;
2541  pixdCLBuffer = pixdCLIntermediate;
2542  pixdCLIntermediate = pixtemp;
2543 
2544  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2545 
2546  if (reqDataCopy)
2547  {
2548  //Read back output data from OCL buffer to cpu
2549  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2550  }
2551  return pixd;
2552 }
2553 
2554 // OpenCL implementation of Get Lines from pix function
2555 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2556 void
2557 OpenclDevice::pixGetLinesCL(PIX *pixd,
2558  PIX *pixs,
2559  PIX** pix_vline,
2560  PIX** pix_hline,
2561  PIX** pixClosed,
2562  bool getpixClosed,
2563  l_int32 close_hsize, l_int32 close_vsize,
2564  l_int32 open_hsize, l_int32 open_vsize,
2565  l_int32 line_hsize, l_int32 line_vsize)
2566 {
2567  l_uint32 wpl, h;
2568  cl_mem pixtemp;
2569 
2570  wpl = pixGetWpl(pixs);
2571  h = pixGetHeight(pixs);
2572 
2573  //First step : Close Morph operation: Dilate followed by Erode
2574  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
2575 
2576  //Copy the Close output to CPU buffer
2577  if (getpixClosed)
2578  {
2579  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false);
2580  }
2581 
2582  //Store the output of close operation in an intermediate buffer
2583  //this will be later used for pixsubtract
2584  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
2585 
2586  //Second step: Open Operation - Erode followed by Dilate
2587  pixtemp = pixsCLBuffer;
2588  pixsCLBuffer = pixdCLBuffer;
2589  pixdCLBuffer = pixtemp;
2590 
2591  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
2592 
2593  //Third step: Subtract : (Close - Open)
2594  pixtemp = pixsCLBuffer;
2595  pixsCLBuffer = pixdCLBuffer;
2596  pixdCLBuffer = pixdCLIntermediate;
2597  pixdCLIntermediate = pixtemp;
2598 
2599  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2600 
2601  //Store the output of Hollow operation in an intermediate buffer
2602  //this will be later used
2603  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
2604 
2605  pixtemp = pixsCLBuffer;
2606  pixsCLBuffer = pixdCLBuffer;
2607  pixdCLBuffer = pixtemp;
2608 
2609  //Fourth step: Get vertical line
2610  //pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
2611  clStatus = pixOpenCL(1, line_vsize, wpl, h);
2612 
2613  //Copy the vertical line output to CPU buffer
2614  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false);
2615 
2616  pixtemp = pixsCLBuffer;
2617  pixsCLBuffer = pixdCLIntermediate;
2618  pixdCLIntermediate = pixtemp;
2619 
2620  //Fifth step: Get horizontal line
2621  //pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
2622  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
2623 
2624  //Copy the horizontal line output to CPU buffer
2625  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true);
2626 
2627  return;
2628 }
2629 
2630 
2631 /*************************************************************************
2632  * HistogramRect
2633  * Otsu Thresholding Operations
2634  * histogramAllChannels is layed out as all channel 0, then all channel 1...
2635  * only supports 1 or 4 channels (bytes_per_pixel)
2636  ************************************************************************/
2637 void OpenclDevice::HistogramRectOCL(
2638  const unsigned char* imageData,
2639  int bytes_per_pixel,
2640  int bytes_per_line,
2641  int left, // always 0
2642  int top, // always 0
2643  int width,
2644  int height,
2645  int kHistogramSize,
2646  int* histogramAllChannels)
2647 {
2648 PERF_COUNT_START("HistogramRectOCL")
2649  cl_int clStatus;
2650  KernelEnv histKern;
2651  SetKernelEnv( &histKern );
2652  KernelEnv histRedKern;
2653  SetKernelEnv( &histRedKern );
2654  /* map imagedata to device as read only */
2655  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
2656  // faster option would be to allocate initial image buffer
2657  // using a garlic bus memory type
2658  cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
2659  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2660 
2661  /* setup work group size parameters */
2662  int block_size = 256;
2663  cl_uint numCUs;
2664  clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
2665  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2666 
2667  int requestedOccupancy = 10;
2668  int numWorkGroups = numCUs * requestedOccupancy;
2669  int numThreads = block_size*numWorkGroups;
2670  size_t local_work_size[] = {static_cast<size_t>(block_size)};
2671  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
2672  size_t red_global_work_size[] = {
2673  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
2674 
2675  /* map histogramAllChannels as write only */
2676  int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
2677 
2678  cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus );
2679  CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer");
2680 
2681  /* intermediate histogram buffer */
2682  int histRed = 256;
2683  int tmpHistogramBins = kHistogramSize*bytes_per_pixel*histRed;
2684 
2685  cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus );
2686  CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer");
2687 
2688  /* atomic sync buffer */
2689  int *zeroBuffer = new int[1];
2690  zeroBuffer[0] = 0;
2691  cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
2692  CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
2693 
2694  //Create kernel objects based on bytes_per_pixel
2695  if (bytes_per_pixel == 1)
2696  {
2697  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus );
2698  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
2699 
2700  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus );
2701  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
2702  } else {
2703  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
2704  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
2705 
2706  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
2707  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
2708  }
2709 
2710  void *ptr;
2711 
2712  //Initialize tmpHistogramBuffer buffer
2713  ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus);
2714  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
2715 
2716  memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
2717  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL);
2718 
2719  /* set kernel 1 arguments */
2720  clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
2721  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2722  cl_uint numPixels = width*height;
2723  clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels );
2724  CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
2725  clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
2726  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
2727 
2728  /* set kernel 2 arguments */
2729  int n = numThreads/bytes_per_pixel;
2730  clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n );
2731  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2732  clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
2733  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
2734  clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer );
2735  CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
2736 
2737  /* launch histogram */
2738 PERF_COUNT_SUB("before")
2739  clStatus = clEnqueueNDRangeKernel(
2740  histKern.mpkCmdQueue,
2741  histKern.mpkKernel,
2742  1, NULL, global_work_size, local_work_size,
2743  0, NULL, NULL );
2744  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
2745  clFinish( histKern.mpkCmdQueue );
2746 
2747  /* launch histogram */
2748  clStatus = clEnqueueNDRangeKernel(
2749  histRedKern.mpkCmdQueue,
2750  histRedKern.mpkKernel,
2751  1, NULL, red_global_work_size, local_work_size,
2752  0, NULL, NULL );
2753  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
2754  clFinish( histRedKern.mpkCmdQueue );
2755 
2756 PERF_COUNT_SUB("redKernel")
2757 
2758  /* map results back from gpu */
2759  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
2760  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
2761 
2762  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
2763 
2764  clReleaseMemObject(histogramBuffer);
2765  clReleaseMemObject(imageBuffer);
2766 PERF_COUNT_SUB("after")
2768 
2769 }
2770 
2771 /*************************************************************************
2772  * Threshold the rectangle, taking everything except the image buffer pointer
2773  * from the class, using thresholds/hi_values to the output IMAGE.
2774  * only supports 1 or 4 channels
2775  ************************************************************************/
2776 void OpenclDevice::ThresholdRectToPixOCL(
2777  const unsigned char* imageData,
2778  int bytes_per_pixel,
2779  int bytes_per_line,
2780  const int* thresholds,
2781  const int* hi_values,
2782  Pix** pix,
2783  int height,
2784  int width,
2785  int top,
2786  int left) {
2787 PERF_COUNT_START("ThresholdRectToPixOCL")
2788 
2789  /* create pix result buffer */
2790  *pix = pixCreate(width, height, 1);
2791  uinT32* pixData = pixGetData(*pix);
2792  int wpl = pixGetWpl(*pix);
2793  int pixSize = wpl*height*sizeof(uinT32);
2794 
2795  cl_int clStatus;
2796  KernelEnv rEnv;
2797  SetKernelEnv( &rEnv );
2798 
2799  /* setup work group size parameters */
2800  int block_size = 256;
2801  cl_uint numCUs = 6;
2802  clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
2803  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2804 
2805  int requestedOccupancy = 10;
2806  int numWorkGroups = numCUs * requestedOccupancy;
2807  int numThreads = block_size*numWorkGroups;
2808  size_t local_work_size[] = {(size_t) block_size};
2809  size_t global_work_size[] = {(size_t) numThreads};
2810 
2811  /* map imagedata to device as read only */
2812  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
2813  // faster option would be to allocate initial image buffer
2814  // using a garlic bus memory type
2815  cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
2816  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2817 
2818  /* map pix as write only */
2819  pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus );
2820  CHECK_OPENCL( clStatus, "clCreateBuffer pix");
2821 
2822  /* map thresholds and hi_values */
2823  cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus );
2824  CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer");
2825  cl_mem hiValuesBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus );
2826  CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer");
2827 
2828  /* compile kernel */
2829  if (bytes_per_pixel == 4) {
2830  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus );
2831  CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix");
2832  } else {
2833  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus );
2834  CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
2835  }
2836 
2837  /* set kernel arguments */
2838  clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
2839  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2840  cl_uint numPixels = width*height;
2841  clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height );
2842  CHECK_OPENCL( clStatus, "clSetKernelArg height" );
2843  clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width );
2844  CHECK_OPENCL( clStatus, "clSetKernelArg width" );
2845  clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl );
2846  CHECK_OPENCL( clStatus, "clSetKernelArg wpl" );
2847  clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer );
2848  CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" );
2849  clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer );
2850  CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" );
2851  clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer );
2852  CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer");
2853 
2854  /* launch kernel & wait */
2855 PERF_COUNT_SUB("before")
2856  clStatus = clEnqueueNDRangeKernel(
2857  rEnv.mpkCmdQueue,
2858  rEnv.mpkKernel,
2859  1, NULL, global_work_size, local_work_size,
2860  0, NULL, NULL );
2861  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
2862  clFinish( rEnv.mpkCmdQueue );
2863 PERF_COUNT_SUB("kernel")
2864 
2865  /* map results back from gpu */
2866  void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
2867  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
2868  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL);
2869 
2870  clReleaseMemObject(imageBuffer);
2871  clReleaseMemObject(thresholdsBuffer);
2872  clReleaseMemObject(hiValuesBuffer);
2873 
2874 PERF_COUNT_SUB("after")
2876 }
2877 
2878 
2879 #if USE_DEVICE_SELECTION
2880 
2881 /******************************************************************************
2882  * Data Types for Device Selection
2883  *****************************************************************************/
2884 
2885 typedef struct _TessScoreEvaluationInputData {
2886  int height;
2887  int width;
2888  int numChannels;
2889  unsigned char *imageData;
2890  Pix *pix;
2891 } TessScoreEvaluationInputData;
2892 
2893 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
2894  srand(1);
2895  // 8.5x11 inches @ 300dpi rounded to clean multiples
2896  int height = 3328; // %256
2897  int width = 2560; // %512
2898  int numChannels = 4;
2899  input->height = height;
2900  input->width = width;
2901  input->numChannels = numChannels;
2902  unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
2903  input->imageData = (unsigned char *) &imageData4[0];
2904 
2905  // zero out image
2906  unsigned char pixelWhite[4] = { 0, 0, 0, 255};
2907  unsigned char pixelBlack[4] = {255, 255, 255, 255};
2908  for (int p = 0; p < height*width; p++) {
2909  //unsigned char tmp[4] = imageData4[0];
2910  imageData4[p][0] = pixelWhite[0];
2911  imageData4[p][1] = pixelWhite[1];
2912  imageData4[p][2] = pixelWhite[2];
2913  imageData4[p][3] = pixelWhite[3];
2914  }
2915  // random lines to be eliminated
2916  int maxLineWidth = 64; // pixels wide
2917  int numLines = 10;
2918  // vertical lines
2919  for (int i = 0; i < numLines; i++) {
2920  int lineWidth = rand()%maxLineWidth;
2921  int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
2922  //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
2923  for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2924  for (int col = 0; col < height; col++) {
2925  //imageData4[row*width+col] = pixelBlack;
2926  imageData4[row*width+col][0] = pixelBlack[0];
2927  imageData4[row*width+col][1] = pixelBlack[1];
2928  imageData4[row*width+col][2] = pixelBlack[2];
2929  imageData4[row*width+col][3] = pixelBlack[3];
2930  }
2931  }
2932  }
2933  // horizontal lines
2934  for (int i = 0; i < numLines; i++) {
2935  int lineWidth = rand()%maxLineWidth;
2936  int horLinePos = lineWidth + rand()%(height-2*lineWidth);
2937  //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
2938  for (int row = 0; row < width; row++) {
2939  for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2940  //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
2941  //imageData4[row*width+col] = pixelBlack;
2942  imageData4[row*width+col][0] = pixelBlack[0];
2943  imageData4[row*width+col][1] = pixelBlack[1];
2944  imageData4[row*width+col][2] = pixelBlack[2];
2945  imageData4[row*width+col][3] = pixelBlack[3];
2946  }
2947  }
2948  }
2949  // spots (noise, squares)
2950  float fractionBlack = 0.1; // how much of the image should be blackened
2951  int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
2952  for (int i = 0; i < numSpots; i++) {
2953 
2954  int lineWidth = rand()%maxLineWidth;
2955  int col = lineWidth + rand()%(width-2*lineWidth);
2956  int row = lineWidth + rand()%(height-2*lineWidth);
2957  //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
2958  for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
2959  for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
2960  //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2961  //imageData4[row*width+col] = pixelBlack;
2962  imageData4[r*width+c][0] = pixelBlack[0];
2963  imageData4[r*width+c][1] = pixelBlack[1];
2964  imageData4[r*width+c][2] = pixelBlack[2];
2965  imageData4[r*width+c][3] = pixelBlack[3];
2966  }
2967  }
2968  }
2969 
2970  input->pix = pixCreate(input->width, input->height, 1);
2971 }
2972 
2973 typedef struct _TessDeviceScore {
2974  float time; // small time means faster device
2975  bool clError; // were there any opencl errors
2976  bool valid; // was the correct response generated
2977 } TessDeviceScore;
2978 
2979 /******************************************************************************
2980  * Micro Benchmarks for Device Selection
2981  *****************************************************************************/
2982 
2983 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2984 
2985  double time = 0;
2986 #if ON_WINDOWS
2987  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2988  QueryPerformanceFrequency(&freq);
2989 #elif ON_APPLE
2990  mach_timespec_t time_funct_start, time_funct_end;
2991 #else
2992  TIMESPEC time_funct_start, time_funct_end;
2993 #endif
2994  // input data
2995  l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
2996 
2997  // function call
2998  if (type == DS_DEVICE_OPENCL_DEVICE) {
2999 #if ON_WINDOWS
3000  QueryPerformanceCounter(&time_funct_start);
3001 #else
3002  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3003 #endif
3004 
3005  OpenclDevice::gpuEnv = *env;
3006  int wpl = pixGetWpl(input.pix);
3007  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL);
3008 #if ON_WINDOWS
3009  QueryPerformanceCounter(&time_funct_end);
3010  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3011 #else
3012  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3013  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3014 #endif
3015 
3016  } else {
3017 #if ON_WINDOWS
3018  QueryPerformanceCounter(&time_funct_start);
3019 #else
3020  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3021 #endif
3022  Pix *pix = pixCreate(input.width, input.height, 32);
3023  l_uint32 *pixData = pixGetData(pix);
3024  int wpl = pixGetWpl(pix);
3025  //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
3026  //pixSetData(pix, output_gpu);
3027  int i, j;
3028  int idx = 0;
3029  for (i = 0; i < input.height ; i++) {
3030  for (j = 0; j < input.width; j++) {
3031 
3032  l_uint32 tiffword = tiffdata[i * input.width + j];
3033  l_int32 rval = ((tiffword) & 0xff);
3034  l_int32 gval = (((tiffword) >> 8) & 0xff);
3035  l_int32 bval = (((tiffword) >> 16) & 0xff);
3036  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
3037  pixData[idx] = value;
3038  idx++;
3039  }
3040  }
3041 #if ON_WINDOWS
3042  QueryPerformanceCounter(&time_funct_end);
3043  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3044 #else
3045  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3046  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3047 #endif
3048  pixDestroy(&pix);
3049  }
3050 
3051 
3052  // cleanup
3053 
3054  return time;
3055 }
3056 
3057 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
3058 
3059  double time;
3060 #if ON_WINDOWS
3061  LARGE_INTEGER freq, time_funct_start, time_funct_end;
3062  QueryPerformanceFrequency(&freq);
3063 #elif ON_APPLE
3064  mach_timespec_t time_funct_start, time_funct_end;
3065 #else
3066  TIMESPEC time_funct_start, time_funct_end;
3067 #endif
3068 
3069  unsigned char pixelHi = (unsigned char)255;
3070 
3071  int left = 0;
3072  int top = 0;
3073  int kHistogramSize = 256;
3074  int bytes_per_line = input.width*input.numChannels;
3075  int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
3076 
3077  // function call
3078  if (type == DS_DEVICE_OPENCL_DEVICE) {
3079 #if ON_WINDOWS
3080  QueryPerformanceCounter(&time_funct_start);
3081 #else
3082  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3083 #endif
3084 
3085  OpenclDevice::gpuEnv = *env;
3086  int wpl = pixGetWpl(input.pix);
3087  OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
3088 
3089 #if ON_WINDOWS
3090  QueryPerformanceCounter(&time_funct_end);
3091  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3092 #else
3093  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3094  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3095 #endif
3096  } else {
3097 
3098  int *histogram = new int[kHistogramSize];
3099 #if ON_WINDOWS
3100  QueryPerformanceCounter(&time_funct_start);
3101 #else
3102  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3103 #endif
3104  for (int ch = 0; ch < input.numChannels; ++ch) {
3105  tesseract::HistogramRect(input.pix, input.numChannels,
3106  left, top, input.width, input.height, histogram);
3107  }
3108 #if ON_WINDOWS
3109  QueryPerformanceCounter(&time_funct_end);
3110  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3111 #else
3112  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3113  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3114 #endif
3115  delete[] histogram;
3116  }
3117 
3118  // cleanup
3119  //delete[] imageData;
3120  delete[] histogramAllChannels;
3121  return time;
3122 }
3123 
3124 //Reproducing the ThresholdRectToPix native version
3125 void ThresholdRectToPix_Native(const unsigned char* imagedata,
3126  int bytes_per_pixel,
3127  int bytes_per_line,
3128  const int* thresholds,
3129  const int* hi_values,
3130  Pix** pix) {
3131  int top = 0;
3132  int left = 0;
3133  int width = pixGetWidth(*pix);
3134  int height = pixGetHeight(*pix);
3135 
3136  *pix = pixCreate(width, height, 1);
3137  uinT32* pixdata = pixGetData(*pix);
3138  int wpl = pixGetWpl(*pix);
3139  const unsigned char* srcdata = imagedata + top * bytes_per_line +
3140  left * bytes_per_pixel;
3141  for (int y = 0; y < height; ++y) {
3142  const uinT8* linedata = srcdata;
3143  uinT32* pixline = pixdata + y * wpl;
3144  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
3145  bool white_result = true;
3146  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
3147  if (hi_values[ch] >= 0 &&
3148  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
3149  white_result = false;
3150  break;
3151  }
3152  }
3153  if (white_result)
3154  CLEAR_DATA_BIT(pixline, x);
3155  else
3156  SET_DATA_BIT(pixline, x);
3157  }
3158  srcdata += bytes_per_line;
3159  }
3160 }
3161 
3162 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
3163 
3164  double time;
3165 #if ON_WINDOWS
3166  LARGE_INTEGER freq, time_funct_start, time_funct_end;
3167  QueryPerformanceFrequency(&freq);
3168 #elif ON_APPLE
3169  mach_timespec_t time_funct_start, time_funct_end;
3170 #else
3171  TIMESPEC time_funct_start, time_funct_end;
3172 #endif
3173 
3174  // input data
3175  unsigned char pixelHi = (unsigned char)255;
3176  int* thresholds = new int[4];
3177  thresholds[0] = pixelHi/2;
3178  thresholds[1] = pixelHi/2;
3179  thresholds[2] = pixelHi/2;
3180  thresholds[3] = pixelHi/2;
3181  int *hi_values = new int[4];
3182  thresholds[0] = pixelHi;
3183  thresholds[1] = pixelHi;
3184  thresholds[2] = pixelHi;
3185  thresholds[3] = pixelHi;
3186  //Pix* pix = pixCreate(width, height, 1);
3187  int top = 0;
3188  int left = 0;
3189  int bytes_per_line = input.width*input.numChannels;
3190 
3191  // function call
3192  if (type == DS_DEVICE_OPENCL_DEVICE) {
3193 #if ON_WINDOWS
3194  QueryPerformanceCounter(&time_funct_start);
3195 #else
3196  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3197 #endif
3198 
3199  OpenclDevice::gpuEnv = *env;
3200  int wpl = pixGetWpl(input.pix);
3201  OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
3202 
3203 #if ON_WINDOWS
3204  QueryPerformanceCounter(&time_funct_end);
3205  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3206 #else
3207  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3208  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3209 #endif
3210  } else {
3211 
3212 
3213  tesseract::ImageThresholder thresholder;
3214  thresholder.SetImage( input.pix );
3215 #if ON_WINDOWS
3216  QueryPerformanceCounter(&time_funct_start);
3217 #else
3218  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3219 #endif
3220  ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
3221  thresholds, hi_values, &input.pix );
3222 
3223 #if ON_WINDOWS
3224  QueryPerformanceCounter(&time_funct_end);
3225  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3226 #else
3227  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3228  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3229 #endif
3230  }
3231 
3232  // cleanup
3233  delete[] thresholds;
3234  delete[] hi_values;
3235  return time;
3236 }
3237 
3238 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
3239 
3240  double time = 0;
3241 #if ON_WINDOWS
3242  LARGE_INTEGER freq, time_funct_start, time_funct_end;
3243  QueryPerformanceFrequency(&freq);
3244 #elif ON_APPLE
3245  mach_timespec_t time_funct_start, time_funct_end;
3246 #else
3247  TIMESPEC time_funct_start, time_funct_end;
3248 #endif
3249 
3250  // input data
3251  int resolution = 300;
3252  int wpl = pixGetWpl(input.pix);
3253  int kThinLineFraction = 20; // tess constant
3254  int kMinLineLengthFraction = 4; // tess constant
3255  int max_line_width = resolution / kThinLineFraction;
3256  int min_line_length = resolution / kMinLineLengthFraction;
3257  int closing_brick = max_line_width / 3;
3258 
3259  // function call
3260  if (type == DS_DEVICE_OPENCL_DEVICE) {
3261 #if ON_WINDOWS
3262  QueryPerformanceCounter(&time_funct_start);
3263 #else
3264  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3265 #endif
3266  Pix *src_pix = input.pix;
3267  OpenclDevice::gpuEnv = *env;
3268  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
3269  Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
3270  OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length);
3271 
3272  OpenclDevice::releaseMorphCLBuffers();
3273 
3274 #if ON_WINDOWS
3275  QueryPerformanceCounter(&time_funct_end);
3276  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3277 #else
3278  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3279  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3280 #endif
3281  } else {
3282 #if ON_WINDOWS
3283  QueryPerformanceCounter(&time_funct_start);
3284 #else
3285  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3286 #endif
3287 
3288  // native serial code
3289  Pix *src_pix = input.pix;
3290  Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
3291  Pix *pix_solid = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
3292  Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
3293  pixDestroy(&pix_solid);
3294  Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
3295  Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
3296  pixDestroy(&pix_hollow);
3297 
3298 #if ON_WINDOWS
3299  QueryPerformanceCounter(&time_funct_end);
3300  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3301 #else
3302  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3303  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3304 #endif
3305  }
3306 
3307  return time;
3308 }
3309 
3310 
3311 
3312 /******************************************************************************
3313  * Device Selection
3314  *****************************************************************************/
3315 
3316 #include "stdlib.h"
3317 
3318 
3319 // encode score object as byte string
3320 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
3321  *serializedScoreSize = sizeof(TessDeviceScore);
3322  *serializedScore = (void *) new unsigned char[*serializedScoreSize];
3323  memcpy(*serializedScore, device->score, *serializedScoreSize);
3324  return DS_SUCCESS;
3325 }
3326 
3327 // parses byte string and stores in score object
3328 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
3329  // check that serializedScoreSize == sizeof(TessDeviceScore);
3330  device->score = new TessDeviceScore;
3331  memcpy(device->score, serializedScore, serializedScoreSize);
3332  return DS_SUCCESS;
3333 }
3334 
3335 
3336 
3337 // evaluate devices
3338 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
3339 
3340  // overwrite statuc gpuEnv w/ current device
3341  // so native opencl calls can be used; they use static gpuEnv
3342  printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
3343  GPUEnv *env = NULL;
3344  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
3345  env = new GPUEnv;
3346  //printf("[DS] populating tmp GPUEnv from device\n");
3347  populateGPUEnvFromDevice( env, device->oclDeviceID);
3348  env->mnFileCount = 0; //argc;
3349  env->mnKernelCount = 0UL;
3350  //printf("[DS] compiling kernels for tmp GPUEnv\n");
3351  OpenclDevice::gpuEnv = *env;
3352  OpenclDevice::CompileKernelFile(env, "");
3353  }
3354 
3355 
3356  TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
3357 
3358  // pixReadTiff
3359  double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
3360 
3361  // HistogramRect
3362  double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
3363 
3364  // ThresholdRectToPix
3365  double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
3366 
3367  // getLineMasks
3368  double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
3369 
3370 
3371  // weigh times (% of cpu time)
3372  // these weights should be the % execution time that the native cpu code took
3373  float composeRGBPixelWeight = 1.2f;
3374  float histogramRectWeight = 2.4f;
3375  float thresholdRectToPixWeight = 4.5f;
3376  float getLineMasksMorphWeight = 5.0f;
3377 
3378  float weightedTime =
3379  composeRGBPixelWeight * composeRGBPixelTime +
3380  histogramRectWeight * histogramRectTime +
3381  thresholdRectToPixWeight * thresholdRectToPixTime +
3382  getLineMasksMorphWeight * getLineMasksMorphTime
3383  ;
3384  device->score = (void *)new TessDeviceScore;
3385  ((TessDeviceScore *)device->score)->time = weightedTime;
3386 
3387  printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
3388  printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
3389  printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
3390  printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
3391  printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
3392  printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
3393  return DS_SUCCESS;
3394 }
3395 
3396 // initial call to select device
3397 ds_device OpenclDevice::getDeviceSelection( ) {
3398 //PERF_COUNT_START("getDeviceSelection")
3399  if (!deviceIsSelected) {
3400 PERF_COUNT_START("getDeviceSelection")
3401  // check if opencl is available at runtime
3402  if( 1 == LoadOpencl() ) {
3403  // opencl is available
3404 //PERF_COUNT_SUB("LoadOpencl")
3405  // setup devices
3406  ds_status status;
3407  ds_profile *profile;
3408  status = initDSProfile( &profile, "v0.1" );
3409 PERF_COUNT_SUB("initDSProfile")
3410  // try reading scores from file
3411  char *fileName = "tesseract_opencl_profile_devices.dat";
3412  status = readProfileFromFile( profile, deserializeScore, fileName);
3413  if (status != DS_SUCCESS) {
3414  // need to run evaluation
3415  printf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
3416 
3417  // create input data
3418  TessScoreEvaluationInputData input;
3419  populateTessScoreEvaluationInputData( &input );
3420 //PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
3421  // perform evaluations
3422  unsigned int numUpdates;
3423  status = profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates );
3424 PERF_COUNT_SUB("profileDevices")
3425  // write scores to file
3426  if ( status == DS_SUCCESS ) {
3427  status = writeProfileToFile( profile, serializeScore, fileName);
3428 PERF_COUNT_SUB("writeProfileToFile")
3429  if ( status == DS_SUCCESS ) {
3430  printf("[DS] Scores written to file (%s).\n", fileName);
3431  } else {
3432  printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName);
3433  }
3434  } else {
3435  printf("[DS] Unable to evaluate performance; scores not written to file.\n");
3436  }
3437 
3438  } else {
3439 
3440 PERF_COUNT_SUB("readProfileFromFile")
3441  printf("[DS] Profile read from file (%s).\n", fileName);
3442  }
3443 
3444  // we now have device scores either from file or evaluation
3445  // select fastest using custom Tesseract selection algorithm
3446  float bestTime = FLT_MAX; // begin search with worst possible time
3447  int bestDeviceIdx = -1;
3448  for (int d = 0; d < profile->numDevices; d++) {
3449  //((TessDeviceScore *)device->score)->time
3450  ds_device device = profile->devices[d];
3451  TessDeviceScore score = *(TessDeviceScore *)device.score;
3452 
3453  float time = score.time;
3454  printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time);
3455  if (time < bestTime) {
3456  bestTime = time;
3457  bestDeviceIdx = d;
3458  }
3459  }
3460  printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
3461  // cleanup
3462  // TODO: call destructor for profile object?
3463 
3464  bool overrided = false;
3465  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
3466  if (overrideDeviceStr != NULL) {
3467  int overrideDeviceIdx = atoi(overrideDeviceStr);
3468  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) {
3469  printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx);
3470  bestDeviceIdx = overrideDeviceIdx - 1;
3471  overrided = true;
3472  } else {
3473  printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices);
3474  }
3475 }
3476 
3477  if (overrided) {
3478  printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
3479  }
3480  selectedDevice = profile->devices[bestDeviceIdx];
3481 
3482  } else {
3483  // opencl isn't available at runtime, select native cpu device
3484  printf("[DS] OpenCL runtime not available.\n");
3485  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
3486  selectedDevice.oclDeviceName = "(null)";
3487  selectedDevice.score = NULL;
3488  selectedDevice.oclDeviceID = NULL;
3489  selectedDevice.oclDriverVersion = NULL;
3490  }
3491  deviceIsSelected = true;
3492 PERF_COUNT_SUB("select from Profile")
3494  }
3495 //PERF_COUNT_END
3496  return selectedDevice;
3497 }
3498 
3499 #endif
3500 
3501 bool OpenclDevice::selectedDeviceIsOpenCL() {
3502 #if USE_DEVICE_SELECTION
3503  ds_device device = getDeviceSelection();
3504  return (device.type == DS_DEVICE_OPENCL_DEVICE);
3505 #else
3506  return true;
3507 #endif
3508 }
3509 
3510 bool OpenclDevice::selectedDeviceIsNativeCPU() {
3511 #if USE_DEVICE_SELECTION
3512  ds_device device = getDeviceSelection();
3513  return (device.type == DS_DEVICE_NATIVE_CPU);
3514 #else
3515  return false;
3516 #endif
3517 }
3518 
3519 
3520 
3521 #endif
#define PERF_COUNT_SUB(SUB)
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
unsigned int uinT32
Definition: host.h:103
cmap_table cmap
#define PERF_COUNT_START(FUNCT_NAME)
const char * kernel_src
Definition: oclkernels.h:11
#define FALSE
Definition: capi.h:29
#define PERF_COUNT_END
#define TRUE
Definition: capi.h:28
const int kHistogramSize
Definition: otsuthr.h:27
#define NULL
Definition: host.h:144
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:157
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
unsigned char uinT8
Definition: host.h:99