13 #include <sys/types.h> 26 #include <mach/mach_time.h> 30 #define CALLOC LEPT_CALLOC 31 #define FREE LEPT_FREE 36 GPUEnv OpenclDevice::gpuEnv;
38 bool OpenclDevice::deviceIsSelected =
false;
39 ds_device OpenclDevice::selectedDevice;
41 int OpenclDevice::isInited = 0;
43 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
45 static const l_uint32 lmask32[] = {
46 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
47 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
48 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
49 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
50 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
51 0xfffffffe, 0xffffffff};
53 static const l_uint32 rmask32[] = {
54 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
55 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
56 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
57 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
58 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
59 0x7fffffff, 0xffffffff};
61 static cl_mem pixsCLBuffer, pixdCLBuffer,
63 static cl_mem pixThBuffer;
64 static cl_int clStatus;
65 static KernelEnv rEnv;
67 #define DS_TAG_VERSION "<version>" 68 #define DS_TAG_VERSION_END "</version>" 69 #define DS_TAG_DEVICE "<device>" 70 #define DS_TAG_DEVICE_END "</device>" 71 #define DS_TAG_SCORE "<score>" 72 #define DS_TAG_SCORE_END "</score>" 73 #define DS_TAG_DEVICE_TYPE "<type>" 74 #define DS_TAG_DEVICE_TYPE_END "</type>" 75 #define DS_TAG_DEVICE_NAME "<name>" 76 #define DS_TAG_DEVICE_NAME_END "</name>" 77 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>" 78 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>" 80 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu" 82 #define DS_DEVICE_NAME_LENGTH 256 84 typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type;
87 unsigned int numDevices;
94 DS_INVALID_PROFILE = 1000,
96 DS_INVALID_PERF_EVALUATOR_TYPE,
97 DS_INVALID_PERF_EVALUATOR,
98 DS_PERF_EVALUATOR_ERROR,
100 DS_UNKNOWN_DEVICE_TYPE,
101 DS_PROFILE_FILE_ERROR,
102 DS_SCORE_SERIALIZER_ERROR,
103 DS_SCORE_DESERIALIZER_ERROR
110 typedef ds_status (*ds_perf_evaluator)(ds_device *device,
void *data);
113 typedef ds_status (*ds_score_release)(
void *score);
114 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
115 ds_status status = DS_SUCCESS;
116 if (profile !=
nullptr) {
117 if (profile->devices !=
nullptr && sr !=
nullptr) {
119 for (i = 0; i < profile->numDevices; i++) {
120 free(profile->devices[i].oclDeviceName);
121 free(profile->devices[i].oclDriverVersion);
122 status = sr(profile->devices[i].score);
123 if (status != DS_SUCCESS)
break;
125 free(profile->devices);
132 static ds_status initDSProfile(ds_profile **p,
const char *version) {
134 cl_uint numPlatforms;
135 cl_platform_id *platforms =
nullptr;
136 cl_device_id *devices =
nullptr;
137 ds_status status = DS_SUCCESS;
141 if (p ==
nullptr)
return DS_INVALID_PROFILE;
143 ds_profile *profile = (ds_profile *)malloc(
sizeof(ds_profile));
144 if (profile ==
nullptr)
return DS_MEMORY_ERROR;
146 memset(profile, 0,
sizeof(ds_profile));
148 clGetPlatformIDs(0,
nullptr, &numPlatforms);
150 if (numPlatforms > 0) {
151 platforms = (cl_platform_id *)malloc(numPlatforms *
sizeof(cl_platform_id));
152 if (platforms ==
nullptr) {
153 status = DS_MEMORY_ERROR;
156 clGetPlatformIDs(numPlatforms, platforms,
nullptr);
160 for (i = 0; i < (
unsigned int)numPlatforms; i++) {
162 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0,
nullptr, &num);
166 if (numDevices > 0) {
167 devices = (cl_device_id *)malloc(numDevices *
sizeof(cl_device_id));
168 if (devices ==
nullptr) {
169 status = DS_MEMORY_ERROR;
174 profile->numDevices =
177 (ds_device *)malloc(profile->numDevices *
sizeof(ds_device));
178 if (profile->devices ==
nullptr) {
179 profile->numDevices = 0;
180 status = DS_MEMORY_ERROR;
183 memset(profile->devices, 0, profile->numDevices *
sizeof(ds_device));
186 for (i = 0; i < (
unsigned int)numPlatforms; i++) {
189 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
190 for (j = 0; j < num; j++, next++) {
191 char buffer[DS_DEVICE_NAME_LENGTH];
194 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
195 profile->devices[next].oclDeviceID = devices[j];
197 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
198 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
199 length = strlen(buffer);
200 profile->devices[next].oclDeviceName = (
char *)malloc(length + 1);
201 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
203 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
204 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
205 length = strlen(buffer);
206 profile->devices[next].oclDriverVersion = (
char *)malloc(length + 1);
207 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
210 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
211 profile->version = version;
216 if (status == DS_SUCCESS) {
220 free(profile->devices);
227 static ds_status profileDevices(ds_profile *profile,
228 const ds_evaluation_type type,
229 ds_perf_evaluator evaluator,
230 void *evaluatorData,
unsigned int *numUpdates) {
231 ds_status status = DS_SUCCESS;
233 unsigned int updates = 0;
235 if (profile ==
nullptr) {
236 return DS_INVALID_PROFILE;
238 if (evaluator ==
nullptr) {
239 return DS_INVALID_PERF_EVALUATOR;
242 for (i = 0; i < profile->numDevices; i++) {
243 ds_status evaluatorStatus;
246 case DS_EVALUATE_NEW_ONLY:
247 if (profile->devices[i].score !=
nullptr)
break;
249 case DS_EVALUATE_ALL:
250 evaluatorStatus = evaluator(profile->devices + i, evaluatorData);
251 if (evaluatorStatus != DS_SUCCESS) {
252 status = evaluatorStatus;
258 return DS_INVALID_PERF_EVALUATOR_TYPE;
262 if (numUpdates) *numUpdates = updates;
266 static const char *findString(
const char *contentStart,
const char *contentEnd,
267 const char *
string) {
269 const char *currentPosition;
270 const char *found =
nullptr;
271 stringLength = strlen(
string);
272 currentPosition = contentStart;
273 for (currentPosition = contentStart; currentPosition < contentEnd;
275 if (*currentPosition ==
string[0]) {
276 if (currentPosition + stringLength < contentEnd) {
277 if (strncmp(currentPosition,
string, stringLength) == 0) {
278 found = currentPosition;
287 static ds_status readProFile(
const char *fileName,
char **content,
288 size_t *contentSize) {
294 FILE *input = fopen(fileName,
"rb");
295 if (input ==
nullptr) {
296 return DS_FILE_ERROR;
299 fseek(input, 0L, SEEK_END);
302 char *binary = (
char *)malloc(size);
303 if (binary ==
nullptr) {
305 return DS_FILE_ERROR;
307 fread(binary,
sizeof(
char), size, input);
315 typedef ds_status (*ds_score_deserializer)(ds_device *device,
316 const unsigned char *serializedScore,
317 unsigned int serializedScoreSize);
319 static ds_status readProfileFromFile(ds_profile *profile,
320 ds_score_deserializer deserializer,
322 ds_status status = DS_SUCCESS;
323 char *contentStart =
nullptr;
324 const char *contentEnd =
nullptr;
327 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
329 status = readProFile(file, &contentStart, &contentSize);
330 if (status == DS_SUCCESS) {
331 const char *currentPosition;
332 const char *dataStart;
335 contentEnd = contentStart + contentSize;
336 currentPosition = contentStart;
339 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
340 if (dataStart ==
nullptr) {
341 status = DS_PROFILE_FILE_ERROR;
344 dataStart += strlen(DS_TAG_VERSION);
346 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
347 if (dataEnd ==
nullptr) {
348 status = DS_PROFILE_FILE_ERROR;
352 size_t versionStringLength = strlen(profile->version);
353 if (versionStringLength + dataStart != dataEnd ||
354 strncmp(profile->version, dataStart, versionStringLength) != 0) {
356 status = DS_PROFILE_FILE_ERROR;
359 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
365 const char *deviceTypeStart;
366 const char *deviceTypeEnd;
367 ds_device_type deviceType;
369 const char *deviceNameStart;
370 const char *deviceNameEnd;
372 const char *deviceScoreStart;
373 const char *deviceScoreEnd;
375 const char *deviceDriverStart;
376 const char *deviceDriverEnd;
378 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
379 if (dataStart ==
nullptr) {
383 dataStart += strlen(DS_TAG_DEVICE);
384 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
385 if (dataEnd ==
nullptr) {
386 status = DS_PROFILE_FILE_ERROR;
391 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
392 if (deviceTypeStart ==
nullptr) {
393 status = DS_PROFILE_FILE_ERROR;
396 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
398 findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
399 if (deviceTypeEnd ==
nullptr) {
400 status = DS_PROFILE_FILE_ERROR;
403 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
406 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
407 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
408 if (deviceNameStart ==
nullptr) {
409 status = DS_PROFILE_FILE_ERROR;
412 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
414 findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
415 if (deviceNameEnd ==
nullptr) {
416 status = DS_PROFILE_FILE_ERROR;
421 findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
422 if (deviceDriverStart ==
nullptr) {
423 status = DS_PROFILE_FILE_ERROR;
426 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
427 deviceDriverEnd = findString(deviceDriverStart, contentEnd,
428 DS_TAG_DEVICE_DRIVER_VERSION_END);
429 if (deviceDriverEnd ==
nullptr) {
430 status = DS_PROFILE_FILE_ERROR;
435 for (i = 0; i < profile->numDevices; i++) {
436 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
437 size_t actualDeviceNameLength;
438 size_t driverVersionLength;
440 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
441 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
442 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
443 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
444 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
445 actualDeviceNameLength) == 0 &&
446 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
447 driverVersionLength) == 0) {
449 findString(dataStart, contentEnd, DS_TAG_SCORE);
450 if (deviceNameStart ==
nullptr) {
451 status = DS_PROFILE_FILE_ERROR;
454 deviceScoreStart += strlen(DS_TAG_SCORE);
456 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
457 status = deserializer(profile->devices + i,
458 (
const unsigned char *)deviceScoreStart,
459 deviceScoreEnd - deviceScoreStart);
460 if (status != DS_SUCCESS) {
466 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
467 for (i = 0; i < profile->numDevices; i++) {
468 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
469 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
470 if (deviceScoreStart ==
nullptr) {
471 status = DS_PROFILE_FILE_ERROR;
474 deviceScoreStart += strlen(DS_TAG_SCORE);
476 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
477 status = deserializer(profile->devices + i,
478 (
const unsigned char *)deviceScoreStart,
479 deviceScoreEnd - deviceScoreStart);
480 if (status != DS_SUCCESS) {
488 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
496 typedef ds_status (*ds_score_serializer)(ds_device *device,
497 void **serializedScore,
498 unsigned int *serializedScoreSize);
499 static ds_status writeProfileToFile(ds_profile *profile,
500 ds_score_serializer serializer,
502 ds_status status = DS_SUCCESS;
504 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
506 FILE *profileFile = fopen(file,
"wb");
507 if (profileFile ==
nullptr) {
508 status = DS_FILE_ERROR;
513 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
514 fwrite(profile->version,
sizeof(
char), strlen(profile->version),
516 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END),
518 fwrite(
"\n",
sizeof(
char), 1, profileFile);
520 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
521 void *serializedScore;
522 unsigned int serializedScoreSize;
524 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
526 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE),
528 fwrite(&profile->devices[i].type,
sizeof(ds_device_type), 1, profileFile);
529 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char),
530 strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
532 switch (profile->devices[i].type) {
533 case DS_DEVICE_NATIVE_CPU: {
544 case DS_DEVICE_OPENCL_DEVICE: {
545 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME),
547 fwrite(profile->devices[i].oclDeviceName,
sizeof(
char),
548 strlen(profile->devices[i].oclDeviceName), profileFile);
549 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char),
550 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
552 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char),
553 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
554 fwrite(profile->devices[i].oclDriverVersion,
sizeof(
char),
555 strlen(profile->devices[i].oclDriverVersion), profileFile);
556 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
557 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
560 status = DS_UNKNOWN_DEVICE_TYPE;
564 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
565 status = serializer(profile->devices + i, &serializedScore,
566 &serializedScoreSize);
567 if (status == DS_SUCCESS && serializedScore !=
nullptr &&
568 serializedScoreSize > 0) {
569 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
570 free(serializedScore);
572 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END),
574 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END),
576 fwrite(
"\n",
sizeof(
char), 1, profileFile);
584 static void legalizeFileName(
char *fileName) {
586 const char *invalidChars =
589 for (
unsigned i = 0; i < strlen(invalidChars); i++) {
591 invalidStr[0] = invalidChars[i];
592 invalidStr[1] =
'\0';
598 for (
char *pos = strstr(fileName, invalidStr); pos !=
nullptr;
599 pos = strstr(pos + 1, invalidStr)) {
607 static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
610 gpuInfo->mnIsUserCreated = 1;
612 gpuInfo->mpDevID = device;
613 gpuInfo->mpArryDevsID =
new cl_device_id[1];
614 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
615 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
616 sizeof(cl_device_type), &gpuInfo->mDevType, &size);
617 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
620 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
621 sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
622 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
624 cl_context_properties props[3];
625 props[0] = CL_CONTEXT_PLATFORM;
626 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
629 clCreateContext(props, 1, &gpuInfo->mpDevID,
nullptr,
nullptr, &clStatus);
630 CHECK_OPENCL(clStatus,
"populateGPUEnv::createContext");
632 cl_command_queue_properties queueProperties = 0;
633 gpuInfo->mpCmdQueue = clCreateCommandQueue(
634 gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
635 CHECK_OPENCL(clStatus,
"populateGPUEnv::createCommandQueue");
638 int OpenclDevice::LoadOpencl()
641 HINSTANCE HOpenclDll =
nullptr;
642 void *OpenclDll =
nullptr;
644 OpenclDll =
static_cast<HINSTANCE
>(HOpenclDll);
645 OpenclDll = LoadLibrary(
"openCL.dll");
646 if (!static_cast<HINSTANCE>(OpenclDll)) {
647 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
648 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
651 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
655 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
657 envInfo->mpkContext = gpuEnv.mpContext;
658 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
659 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
664 static cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer,
665 size_t nElements, cl_mem_flags flags,
668 clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
669 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
674 static Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd,
675 Pix *pixs,
int elements, cl_mem_flags flags,
676 bool memcopy =
false,
bool sync =
true) {
677 PROCNAME(
"mapOutputCLBuffer");
680 if ((pixd = pixCreateTemplate(pixs)) ==
nullptr)
683 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
684 pixGetDepth(pixs))) ==
nullptr)
688 l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(
689 rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
690 elements *
sizeof(l_uint32), 0,
nullptr,
nullptr,
nullptr);
693 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
695 pixSetData(pixd, pValues);
698 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0,
nullptr,
702 clFinish(rEnv.mpkCmdQueue);
708 void OpenclDevice::releaseMorphCLBuffers()
710 if (pixdCLIntermediate !=
nullptr) clReleaseMemObject(pixdCLIntermediate);
711 if (pixsCLBuffer !=
nullptr) clReleaseMemObject(pixsCLBuffer);
712 if (pixdCLBuffer !=
nullptr) clReleaseMemObject(pixdCLBuffer);
713 if (pixThBuffer !=
nullptr) clReleaseMemObject(pixThBuffer);
714 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer =
nullptr;
717 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix *pixs) {
720 if (pixThBuffer !=
nullptr) {
721 pixsCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
722 CL_MEM_ALLOC_HOST_PTR, &clStatus);
726 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
727 sizeof(l_uint32) * wpl * h, 0,
nullptr,
nullptr);
731 reinterpret_cast<l_uint32 *
>(malloc(wpl * h *
sizeof(l_uint32)));
732 memcpy(srcdata, pixGetData(pixs), wpl * h *
sizeof(l_uint32));
734 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
735 CL_MEM_USE_HOST_PTR, &clStatus);
738 pixdCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
739 CL_MEM_ALLOC_HOST_PTR, &clStatus);
741 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
742 CL_MEM_ALLOC_HOST_PTR, &clStatus);
744 return (
int)clStatus;
747 int OpenclDevice::InitEnv()
754 if( 1 == LoadOpencl() )
761 InitOpenclRunEnv_DeviceSelection( 0 );
767 int OpenclDevice::ReleaseOpenclRunEnv()
769 ReleaseOpenclEnv( &gpuEnv );
775 inline int OpenclDevice::AddKernelConfig(
int kCount,
const char *kName )
778 fprintf(stderr,
"Error: ( KCount < 1 ) AddKernelConfig\n" );
779 strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
780 gpuEnv.mnKernelCount++;
783 int OpenclDevice::RegistOpenclKernel()
785 if ( !gpuEnv.mnIsUserCreated )
786 memset( &gpuEnv, 0,
sizeof(gpuEnv) );
788 gpuEnv.mnFileCount = 0;
789 gpuEnv.mnKernelCount = 0UL;
791 AddKernelConfig( 1, (
const char*)
"oclAverageSub1" );
795 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc ) {
799 ds_device bestDevice_DS = getDeviceSelection( );
801 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
803 if (selectedDeviceIsOpenCL() ) {
805 populateGPUEnvFromDevice( &gpuEnv, bestDevice );
806 gpuEnv.mnFileCount = 0;
807 gpuEnv.mnKernelCount = 0UL;
809 CompileKernelFile(&gpuEnv,
"");
821 OpenclDevice::OpenclDevice()
826 OpenclDevice::~OpenclDevice()
831 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
841 for ( i = 0; i < gpuEnv.mnFileCount; i++ )
843 if ( gpuEnv.mpArryPrograms[i] )
845 clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
846 CHECK_OPENCL( clStatus,
"clReleaseProgram" );
847 gpuEnv.mpArryPrograms[i] =
nullptr;
850 if ( gpuEnv.mpCmdQueue )
852 clReleaseCommandQueue( gpuEnv.mpCmdQueue );
853 gpuEnv.mpCmdQueue =
nullptr;
855 if ( gpuEnv.mpContext )
857 clReleaseContext( gpuEnv.mpContext );
858 gpuEnv.mpContext =
nullptr;
861 gpuInfo->mnIsUserCreated = 0;
862 delete[] gpuInfo->mpArryDevsID;
865 int OpenclDevice::BinaryGenerated(
const char * clFileName, FILE ** fhandle )
872 char fileName[256] = {0}, cl_name[128] = {0};
873 char deviceName[1024];
874 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
875 sizeof(deviceName), deviceName,
nullptr);
876 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
877 str = (
char *)strstr(clFileName, (
char *)
".cl");
878 memcpy(cl_name, clFileName, str - clFileName);
879 cl_name[str - clFileName] =
'\0';
880 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
881 legalizeFileName(fileName);
882 fd = fopen(fileName,
"rb");
883 status = (fd !=
nullptr) ? 1 : 0;
890 int OpenclDevice::CachedOfKernerPrg(
const GPUEnv *gpuEnvCached,
const char * clFileName )
893 for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
895 if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
897 if (gpuEnvCached->mpArryPrograms[i] !=
nullptr) {
905 int OpenclDevice::WriteBinaryToFile(
const char* fileName,
const char* birary,
size_t numBytes )
907 FILE *output =
nullptr;
908 output = fopen(fileName,
"wb");
909 if (output ==
nullptr) {
913 fwrite( birary,
sizeof(
char), numBytes, output );
919 int OpenclDevice::GeneratBinFromKernelSource( cl_program program,
const char * clFileName )
925 cl_device_id *mpArryDevsID;
926 char **binaries, *str =
nullptr;
928 clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
929 sizeof(numDevices), &numDevices,
nullptr);
930 CHECK_OPENCL( clStatus,
"clGetProgramInfo" );
932 mpArryDevsID = (cl_device_id*) malloc(
sizeof(cl_device_id) * numDevices );
933 if (mpArryDevsID ==
nullptr) {
937 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
938 sizeof(cl_device_id) * numDevices, mpArryDevsID,
940 CHECK_OPENCL( clStatus,
"clGetProgramInfo" );
943 binarySizes = (
size_t*) malloc(
sizeof(
size_t) * numDevices );
946 clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
947 sizeof(
size_t) * numDevices, binarySizes,
nullptr);
948 CHECK_OPENCL( clStatus,
"clGetProgramInfo" );
951 binaries = (
char**) malloc(
sizeof(
char *) * numDevices );
952 if (binaries ==
nullptr) {
956 for ( i = 0; i < numDevices; i++ )
958 if ( binarySizes[i] != 0 )
960 binaries[i] = (
char*) malloc(
sizeof(
char) * binarySizes[i] );
961 if (binaries[i] ==
nullptr) {
967 binaries[i] =
nullptr;
971 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
972 sizeof(
char *) * numDevices, binaries,
nullptr);
973 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
976 for ( i = 0; i < numDevices; i++ )
978 char fileName[256] = { 0 }, cl_name[128] = { 0 };
980 if ( binarySizes[i] != 0 )
982 char deviceName[1024];
983 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
984 sizeof(deviceName), deviceName,
nullptr);
985 CHECK_OPENCL( clStatus,
"clGetDeviceInfo" );
987 str = (
char*) strstr( clFileName, (
char*)
".cl" );
988 memcpy( cl_name, clFileName, str - clFileName );
989 cl_name[str - clFileName] =
'\0';
990 sprintf( fileName,
"%s-%s.bin", cl_name, deviceName );
991 legalizeFileName(fileName);
992 if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
994 printf(
"[OD] write binary[%s] failed\n", fileName);
997 printf(
"[OD] write binary[%s] successfully\n", fileName);
1002 for ( i = 0; i < numDevices; i++ )
1005 binaries[i] =
nullptr;
1012 binarySizes =
nullptr;
1015 mpArryDevsID =
nullptr;
1020 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo,
const char *buildOption )
1023 cl_int clStatus = 0;
1025 char *buildLog =
nullptr, *binary;
1027 size_t source_size[1];
1028 int b_error, binary_status, binaryExisted, idx;
1030 cl_device_id *mpArryDevsID;
1032 const char*
filename =
"kernel.cl";
1034 if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
1039 idx = gpuInfo->mnFileCount;
1043 source_size[0] = strlen( source );
1045 binaryExisted = BinaryGenerated( filename, &fd );
1047 if ( binaryExisted == 1 )
1049 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
1050 sizeof(numDevices), &numDevices,
nullptr);
1051 CHECK_OPENCL(clStatus,
"clGetContextInfo");
1053 mpArryDevsID = (cl_device_id *)malloc(
sizeof(cl_device_id) * numDevices);
1054 if (mpArryDevsID ==
nullptr) {
1060 b_error |= fseek( fd, 0, SEEK_END ) < 0;
1061 b_error |= ( length = ftell(fd) ) <= 0;
1062 b_error |= fseek( fd, 0, SEEK_SET ) < 0;
1068 binary = (
char*) malloc( length + 2 );
1074 memset( binary, 0, length + 2 );
1075 b_error |= fread( binary, 1, length, fd ) != length;
1082 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
1083 sizeof(cl_device_id) * numDevices,
1084 mpArryDevsID,
nullptr);
1085 CHECK_OPENCL( clStatus,
"clGetContextInfo" );
1088 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
1089 mpArryDevsID, &length, (
const unsigned char**) &binary,
1090 &binary_status, &clStatus );
1091 CHECK_OPENCL( clStatus,
"clCreateProgramWithBinary" );
1094 free( mpArryDevsID );
1095 mpArryDevsID =
nullptr;
1102 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
1103 source_size, &clStatus);
1104 CHECK_OPENCL( clStatus,
"clCreateProgramWithSource" );
1108 if (gpuInfo->mpArryPrograms[idx] == (cl_program)
nullptr) {
1116 if (!gpuInfo->mnIsUserCreated)
1119 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1120 buildOption,
nullptr,
nullptr);
1126 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1127 buildOption,
nullptr,
nullptr);
1131 if ( clStatus != CL_SUCCESS )
1133 printf (
"BuildProgram error!\n");
1134 if ( !gpuInfo->mnIsUserCreated )
1136 clStatus = clGetProgramBuildInfo(
1137 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1138 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1142 clStatus = clGetProgramBuildInfo(
1143 gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1144 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1146 if ( clStatus != CL_SUCCESS )
1148 printf(
"opencl create build log fail\n");
1151 buildLog = (
char*) malloc( length );
1152 if (buildLog == (
char *)
nullptr) {
1155 if ( !gpuInfo->mnIsUserCreated )
1157 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1158 CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1162 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1163 CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1165 if ( clStatus != CL_SUCCESS )
1167 printf(
"opencl program build info fail\n");
1171 fd1 = fopen(
"kernel-build.log",
"w+" );
1172 if (fd1 !=
nullptr) {
1173 fwrite(buildLog,
sizeof(
char), length, fd1);
1182 strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
1184 if ( binaryExisted == 0 ) {
1185 GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
1189 gpuInfo->mnFileCount += 1;
1194 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
1199 size_t globalThreads[2];
1200 size_t localThreads[2];
1206 gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1207 globalThreads[0] = gsize;
1208 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1209 globalThreads[1] = gsize;
1210 localThreads[0] = GROUPSIZE_X;
1211 localThreads[1] = GROUPSIZE_Y;
1213 SetKernelEnv( &rEnv );
1215 l_uint32 *pResult = (l_uint32 *)malloc(w*h *
sizeof(l_uint32));
1216 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram,
"composeRGBPixel", &clStatus );
1217 CHECK_OPENCL(clStatus,
"clCreateKernel composeRGBPixel");
1220 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1221 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1224 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &valuesCl);
1225 CHECK_OPENCL( clStatus,
"clSetKernelArg");
1226 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(w), &w);
1227 CHECK_OPENCL( clStatus,
"clSetKernelArg" );
1228 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(h), &h);
1229 CHECK_OPENCL( clStatus,
"clSetKernelArg" );
1230 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1231 CHECK_OPENCL( clStatus,
"clSetKernelArg" );
1232 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outputCl);
1233 CHECK_OPENCL( clStatus,
"clSetKernelArg");
1238 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1239 globalThreads, localThreads, 0,
nullptr,
nullptr);
1240 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel");
1243 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
1244 0, w * h *
sizeof(l_uint32), 0,
nullptr,
nullptr,
1246 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer outputCl");
1247 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0,
nullptr,
nullptr);
1250 clFinish(rEnv.mpkCmdQueue);
1257 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1258 size_t globalThreads[2];
1262 size_t localThreads[2];
1265 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1266 globalThreads[0] = gsize;
1267 globalThreads[1] = GROUPSIZE_HMORY;
1268 localThreads[0] = GROUPSIZE_HMORX;
1269 localThreads[1] = GROUPSIZE_HMORY;
1272 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_5x5", &status);
1273 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1275 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1276 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1277 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1278 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1281 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1282 globalThreads, localThreads, 0,
nullptr,
nullptr);
1285 pixtemp = pixsCLBuffer;
1286 pixsCLBuffer = pixdCLBuffer;
1287 pixdCLBuffer = pixtemp;
1290 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1291 globalThreads[0] = gsize;
1292 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1293 globalThreads[1] = gsize;
1294 localThreads[0] = GROUPSIZE_X;
1295 localThreads[1] = GROUPSIZE_Y;
1298 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer_5x5", &status);
1299 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1301 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1302 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1303 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1304 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1306 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1307 globalThreads, localThreads, 0,
nullptr,
nullptr);
1313 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1314 size_t globalThreads[2];
1318 l_uint32 fwmask, lwmask;
1319 size_t localThreads[2];
1321 lwmask = lmask32[31 - 2];
1322 fwmask = rmask32[31 - 2];
1325 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1326 globalThreads[0] = gsize;
1327 globalThreads[1] = GROUPSIZE_HMORY;
1328 localThreads[0] = GROUPSIZE_HMORX;
1329 localThreads[1] = GROUPSIZE_HMORY;
1332 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_5x5", &status);
1333 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1335 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1336 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1337 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1338 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1341 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1342 globalThreads, localThreads, 0,
nullptr,
nullptr);
1345 pixtemp = pixsCLBuffer;
1346 pixsCLBuffer = pixdCLBuffer;
1347 pixdCLBuffer = pixtemp;
1350 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1351 globalThreads[0] = gsize;
1352 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1353 globalThreads[1] = gsize;
1354 localThreads[0] = GROUPSIZE_X;
1355 localThreads[1] = GROUPSIZE_Y;
1358 clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer_5x5", &status);
1359 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1361 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1362 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1363 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1364 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1365 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1366 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1368 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1369 globalThreads, localThreads, 0,
nullptr,
nullptr);
1375 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1377 l_int32 xp, yp, xn, yn;
1379 size_t globalThreads[2];
1383 size_t localThreads[2];
1386 OpenclDevice::SetKernelEnv(&rEnv);
1388 if (hsize == 5 && vsize == 5) {
1390 status = pixDilateCL_55(wpl, h);
1394 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1396 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1399 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1400 globalThreads[0] = gsize;
1401 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1402 globalThreads[1] = gsize;
1403 localThreads[0] = GROUPSIZE_X;
1404 localThreads[1] = GROUPSIZE_Y;
1406 if (xp > 31 || xn > 31) {
1409 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1410 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1412 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1413 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1414 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1415 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1416 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1417 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1418 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1419 nullptr, globalThreads, localThreads, 0,
1422 if (yp > 0 || yn > 0) {
1423 pixtemp = pixsCLBuffer;
1424 pixsCLBuffer = pixdCLBuffer;
1425 pixdCLBuffer = pixtemp;
1427 }
else if (xp > 0 || xn > 0) {
1430 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1431 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1432 isEven = (xp != xn);
1434 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1435 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1436 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1437 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1438 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1439 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1440 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1441 nullptr, globalThreads, localThreads, 0,
1444 if (yp > 0 || yn > 0) {
1445 pixtemp = pixsCLBuffer;
1446 pixsCLBuffer = pixdCLBuffer;
1447 pixdCLBuffer = pixtemp;
1451 if (yp > 0 || yn > 0) {
1453 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer", &status);
1454 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1456 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1457 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1458 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1459 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1460 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1461 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1462 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1463 nullptr, globalThreads, localThreads, 0,
1471 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1473 l_int32 xp, yp, xn, yn;
1475 size_t globalThreads[2];
1476 size_t localThreads[2];
1480 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1481 l_uint32 rwmask, lwmask;
1484 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1486 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1488 OpenclDevice::SetKernelEnv(&rEnv);
1490 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1492 status = pixErodeCL_55(wpl, h);
1496 lwmask = lmask32[31 - (xn & 31)];
1497 rwmask = rmask32[31 - (xp & 31)];
1500 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1501 globalThreads[0] = gsize;
1502 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1503 globalThreads[1] = gsize;
1504 localThreads[0] = GROUPSIZE_X;
1505 localThreads[1] = GROUPSIZE_Y;
1508 if (xp > 31 || xn > 31) {
1510 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1512 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1513 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1514 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1515 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1516 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1517 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1519 clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1520 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1521 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1522 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1523 nullptr, globalThreads, localThreads, 0,
1526 if (yp > 0 || yn > 0) {
1527 pixtemp = pixsCLBuffer;
1528 pixsCLBuffer = pixdCLBuffer;
1529 pixdCLBuffer = pixtemp;
1531 }
else if (xp > 0 || xn > 0) {
1533 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1534 isEven = (xp != xn);
1536 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1537 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1538 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1539 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1540 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1542 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1543 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1544 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1545 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1546 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1547 nullptr, globalThreads, localThreads, 0,
1550 if (yp > 0 || yn > 0) {
1551 pixtemp = pixsCLBuffer;
1552 pixsCLBuffer = pixdCLBuffer;
1553 pixdCLBuffer = pixtemp;
1558 if (yp > 0 || yn > 0) {
1559 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1560 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1562 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1563 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1564 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1565 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1566 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1568 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1569 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1570 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1571 nullptr, globalThreads, localThreads, 0,
1579 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1584 status = pixErodeCL(hsize, vsize, wpl, h);
1586 pixtemp = pixsCLBuffer;
1587 pixsCLBuffer = pixdCLBuffer;
1588 pixdCLBuffer = pixtemp;
1590 status = pixDilateCL(hsize, vsize, wpl, h);
1596 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1601 status = pixDilateCL(hsize, vsize, wpl, h);
1603 pixtemp = pixsCLBuffer;
1604 pixsCLBuffer = pixdCLBuffer;
1605 pixdCLBuffer = pixtemp;
1607 status = pixErodeCL(hsize, vsize, wpl, h);
1613 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1614 cl_mem buffer2, cl_mem outBuffer =
nullptr) {
1616 size_t globalThreads[2];
1618 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1620 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1621 globalThreads[0] = gsize;
1622 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1623 globalThreads[1] = gsize;
1625 if (outBuffer !=
nullptr) {
1626 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"pixSubtract", &status);
1627 CHECK_OPENCL(status,
"clCreateKernel pixSubtract");
1630 clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1631 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1635 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1636 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1637 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1638 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1639 if (outBuffer !=
nullptr) {
1640 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outBuffer);
1643 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1644 globalThreads, localThreads, 0,
nullptr,
nullptr);
1651 void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline,
1652 Pix **pix_hline, Pix **pixClosed,
1653 bool getpixClosed, l_int32 close_hsize,
1654 l_int32 close_vsize, l_int32 open_hsize,
1655 l_int32 open_vsize, l_int32 line_hsize,
1656 l_int32 line_vsize) {
1660 wpl = pixGetWpl(pixs);
1661 h = pixGetHeight(pixs);
1664 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1668 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1669 wpl * h, CL_MAP_READ,
true,
false);
1675 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1676 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1679 pixtemp = pixsCLBuffer;
1680 pixsCLBuffer = pixdCLBuffer;
1681 pixdCLBuffer = pixtemp;
1683 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1686 pixtemp = pixsCLBuffer;
1687 pixsCLBuffer = pixdCLBuffer;
1688 pixdCLBuffer = pixdCLIntermediate;
1689 pixdCLIntermediate = pixtemp;
1691 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1696 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1697 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1699 pixtemp = pixsCLBuffer;
1700 pixsCLBuffer = pixdCLBuffer;
1701 pixdCLBuffer = pixtemp;
1705 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1708 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1709 CL_MAP_READ,
true,
false);
1711 pixtemp = pixsCLBuffer;
1712 pixsCLBuffer = pixdCLIntermediate;
1713 pixdCLIntermediate = pixtemp;
1717 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1720 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1721 CL_MAP_READ,
true,
true);
1732 int OpenclDevice::HistogramRectOCL(
unsigned char *imageData,
1733 int bytes_per_pixel,
int bytes_per_line,
1737 int *histogramAllChannels) {
1742 SetKernelEnv(&histKern);
1743 KernelEnv histRedKern;
1744 SetKernelEnv(&histRedKern);
1750 cl_mem imageBuffer = clCreateBuffer(
1751 histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1752 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1753 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1756 int block_size = 256;
1758 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1759 sizeof(numCUs), &numCUs,
nullptr);
1760 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1762 int requestedOccupancy = 10;
1763 int numWorkGroups = numCUs * requestedOccupancy;
1764 int numThreads = block_size * numWorkGroups;
1765 size_t local_work_size[] = {
static_cast<size_t>(block_size)};
1766 size_t global_work_size[] = {
static_cast<size_t>(numThreads)};
1767 size_t red_global_work_size[] = {
1768 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1772 cl_mem histogramBuffer = clCreateBuffer(
1773 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1774 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels,
1776 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1780 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1782 cl_mem tmpHistogramBuffer =
1783 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1784 tmpHistogramBins *
sizeof(cl_uint),
nullptr, &clStatus);
1785 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1788 int *zeroBuffer =
new int[1];
1790 cl_mem atomicSyncBuffer = clCreateBuffer(
1791 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1792 sizeof(cl_int), zeroBuffer, &clStatus);
1793 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1794 delete[] zeroBuffer;
1796 if (bytes_per_pixel == 1) {
1797 histKern.mpkKernel = clCreateKernel(
1798 histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1799 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1801 histRedKern.mpkKernel =
1802 clCreateKernel(histRedKern.mpkProgram,
1803 "kernel_HistogramRectOneChannelReduction", &clStatus);
1804 CHECK_OPENCL(clStatus,
1805 "clCreateKernel kernel_HistogramRectOneChannelReduction");
1807 histKern.mpkKernel = clCreateKernel( histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus );
1808 CHECK_OPENCL( clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1810 histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram,
"kernel_HistogramRectAllChannelsReduction", &clStatus );
1811 CHECK_OPENCL( clStatus,
"clCreateKernel kernel_HistogramRectAllChannelsReduction");
1817 ptr = clEnqueueMapBuffer(
1818 histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1819 tmpHistogramBins *
sizeof(cl_uint), 0,
nullptr,
nullptr, &clStatus);
1820 CHECK_OPENCL( clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1822 memset(ptr, 0, tmpHistogramBins*
sizeof(cl_uint));
1823 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1828 clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1829 CHECK_OPENCL( clStatus,
"clSetKernelArg imageBuffer");
1830 cl_uint numPixels = width*height;
1832 clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1833 CHECK_OPENCL( clStatus,
"clSetKernelArg numPixels" );
1834 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem),
1835 &tmpHistogramBuffer);
1836 CHECK_OPENCL( clStatus,
"clSetKernelArg tmpHistogramBuffer");
1839 int n = numThreads/bytes_per_pixel;
1840 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1841 CHECK_OPENCL( clStatus,
"clSetKernelArg imageBuffer");
1842 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem),
1843 &tmpHistogramBuffer);
1844 CHECK_OPENCL( clStatus,
"clSetKernelArg tmpHistogramBuffer");
1845 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem),
1847 CHECK_OPENCL( clStatus,
"clSetKernelArg histogramBuffer");
1851 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1852 nullptr, global_work_size, local_work_size, 0,
1854 CHECK_OPENCL(clStatus,
1855 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1856 clFinish(histKern.mpkCmdQueue);
1857 if (clStatus != 0) {
1861 clStatus = clEnqueueNDRangeKernel(
1862 histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1,
nullptr,
1863 red_global_work_size, local_work_size, 0,
nullptr,
nullptr);
1864 CHECK_OPENCL( clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
1865 clFinish( histRedKern.mpkCmdQueue );
1866 if (clStatus != 0) {
1872 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1875 nullptr,
nullptr, &clStatus);
1876 CHECK_OPENCL( clStatus,
"clEnqueueMapBuffer histogramBuffer");
1877 if (clStatus != 0) {
1880 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1883 clReleaseMemObject(histogramBuffer);
1884 clReleaseMemObject(imageBuffer);
1895 int OpenclDevice::ThresholdRectToPixOCL(
unsigned char *imageData,
1896 int bytes_per_pixel,
int bytes_per_line,
1897 int *thresholds,
int *hi_values,
1898 Pix **pix,
int height,
int width,
1899 int top,
int left) {
1903 *pix = pixCreate(width, height, 1);
1904 uint32_t *pixData = pixGetData(*pix);
1905 int wpl = pixGetWpl(*pix);
1906 int pixSize = wpl * height *
sizeof(uint32_t);
1910 SetKernelEnv(&rEnv);
1913 int block_size = 256;
1915 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1916 sizeof(numCUs), &numCUs,
nullptr);
1917 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1919 int requestedOccupancy = 10;
1920 int numWorkGroups = numCUs * requestedOccupancy;
1921 int numThreads = block_size * numWorkGroups;
1922 size_t local_work_size[] = {(size_t)block_size};
1923 size_t global_work_size[] = {(size_t)numThreads};
1930 cl_mem imageBuffer = clCreateBuffer(
1931 rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1932 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1933 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1937 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1938 pixSize, pixData, &clStatus);
1939 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1942 cl_mem thresholdsBuffer =
1943 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1944 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1945 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1946 cl_mem hiValuesBuffer =
1947 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1948 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1949 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1952 if (bytes_per_pixel == 4) {
1954 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1955 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1957 rEnv.mpkKernel = clCreateKernel(
1958 rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1959 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1963 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1964 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1965 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
1966 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
1967 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
1968 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
1969 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
1970 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
1972 clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
1973 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
1974 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
1975 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
1976 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
1977 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
1981 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1982 nullptr, global_work_size, local_work_size,
1983 0,
nullptr,
nullptr);
1984 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1985 clFinish(rEnv.mpkCmdQueue);
1987 if (clStatus != 0) {
1988 printf(
"Setting return value to -1\n");
1993 clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1994 pixSize, 0,
nullptr,
nullptr, &clStatus);
1995 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1996 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0,
nullptr,
1999 clReleaseMemObject(imageBuffer);
2000 clReleaseMemObject(thresholdsBuffer);
2001 clReleaseMemObject(hiValuesBuffer);
2014 typedef struct _TessScoreEvaluationInputData {
2018 unsigned char *imageData;
2020 } TessScoreEvaluationInputData;
2022 static void populateTessScoreEvaluationInputData(
2023 TessScoreEvaluationInputData *input) {
2028 int numChannels = 4;
2029 input->height = height;
2030 input->width = width;
2031 input->numChannels = numChannels;
2032 unsigned char(*imageData4)[4] = (
unsigned char(*)[4])malloc(
2033 height * width * numChannels *
2034 sizeof(
unsigned char));
2035 input->imageData = (
unsigned char *)&imageData4[0];
2038 unsigned char pixelWhite[4] = {0, 0, 0, 255};
2039 unsigned char pixelBlack[4] = {255, 255, 255, 255};
2040 for (
int p = 0; p < height * width; p++) {
2042 imageData4[p][0] = pixelWhite[0];
2043 imageData4[p][1] = pixelWhite[1];
2044 imageData4[p][2] = pixelWhite[2];
2045 imageData4[p][3] = pixelWhite[3];
2048 int maxLineWidth = 64;
2051 for (
int i = 0; i < numLines; i++) {
2052 int lineWidth = rand() % maxLineWidth;
2053 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
2055 for (
int row = vertLinePos - lineWidth / 2;
2056 row < vertLinePos + lineWidth / 2; row++) {
2057 for (
int col = 0; col < height; col++) {
2059 imageData4[row * width + col][0] = pixelBlack[0];
2060 imageData4[row * width + col][1] = pixelBlack[1];
2061 imageData4[row * width + col][2] = pixelBlack[2];
2062 imageData4[row * width + col][3] = pixelBlack[3];
2067 for (
int i = 0; i < numLines; i++) {
2068 int lineWidth = rand() % maxLineWidth;
2069 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
2071 for (
int row = 0; row < width; row++) {
2072 for (
int col = horLinePos - lineWidth / 2;
2073 col < horLinePos + lineWidth / 2;
2078 imageData4[row * width + col][0] = pixelBlack[0];
2079 imageData4[row * width + col][1] = pixelBlack[1];
2080 imageData4[row * width + col][2] = pixelBlack[2];
2081 imageData4[row * width + col][3] = pixelBlack[3];
2086 float fractionBlack = 0.1;
2088 (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
2089 for (
int i = 0; i < numSpots; i++) {
2090 int lineWidth = rand() % maxLineWidth;
2091 int col = lineWidth + rand() % (width - 2 * lineWidth);
2092 int row = lineWidth + rand() % (height - 2 * lineWidth);
2094 for (
int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
2095 for (
int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
2098 imageData4[r * width + c][0] = pixelBlack[0];
2099 imageData4[r * width + c][1] = pixelBlack[1];
2100 imageData4[r * width + c][2] = pixelBlack[2];
2101 imageData4[r * width + c][3] = pixelBlack[3];
2106 input->pix = pixCreate(input->width, input->height, 1);
2109 typedef struct _TessDeviceScore {
2119 static double composeRGBPixelMicroBench(GPUEnv *env,
2120 TessScoreEvaluationInputData input,
2121 ds_device_type type) {
2124 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2125 QueryPerformanceFrequency(&freq);
2127 mach_timebase_info_data_t info = {0, 0};
2128 mach_timebase_info(&info);
2129 long long start, stop;
2131 timespec time_funct_start, time_funct_end;
2134 l_uint32 *tiffdata = (l_uint32 *)input.imageData;
2137 if (type == DS_DEVICE_OPENCL_DEVICE) {
2139 QueryPerformanceCounter(&time_funct_start);
2141 start = mach_absolute_time();
2143 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2146 OpenclDevice::gpuEnv = *env;
2147 int wpl = pixGetWpl(input.pix);
2148 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2151 QueryPerformanceCounter(&time_funct_end);
2152 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2154 stop = mach_absolute_time();
2155 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2157 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2158 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2163 QueryPerformanceCounter(&time_funct_start);
2165 start = mach_absolute_time();
2167 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2169 Pix *pix = pixCreate(input.width, input.height, 32);
2170 l_uint32 *pixData = pixGetData(pix);
2173 for (i = 0; i < input.height ; i++) {
2174 for (j = 0; j < input.width; j++) {
2175 l_uint32 tiffword = tiffdata[i * input.width + j];
2176 l_int32 rval = ((tiffword) & 0xff);
2177 l_int32 gval = (((tiffword) >> 8) & 0xff);
2178 l_int32 bval = (((tiffword) >> 16) & 0xff);
2179 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2180 pixData[idx] = value;
2185 QueryPerformanceCounter(&time_funct_end);
2186 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2188 stop = mach_absolute_time();
2189 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2191 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2192 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2203 static double histogramRectMicroBench(GPUEnv *env,
2204 TessScoreEvaluationInputData input,
2205 ds_device_type type) {
2208 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2209 QueryPerformanceFrequency(&freq);
2211 mach_timebase_info_data_t info = {0, 0};
2212 mach_timebase_info(&info);
2213 long long start, stop;
2215 timespec time_funct_start, time_funct_end;
2221 int bytes_per_line = input.width*input.numChannels;
2222 int *histogramAllChannels =
new int[kHistogramSize*input.numChannels];
2224 if (type == DS_DEVICE_OPENCL_DEVICE) {
2226 QueryPerformanceCounter(&time_funct_start);
2228 start = mach_absolute_time();
2230 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2233 OpenclDevice::gpuEnv = *env;
2234 int retVal = OpenclDevice::HistogramRectOCL(
2235 input.imageData, input.numChannels, bytes_per_line, top, left,
2236 input.width, input.height, kHistogramSize, histogramAllChannels);
2239 QueryPerformanceCounter(&time_funct_end);
2240 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2242 stop = mach_absolute_time();
2244 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2249 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2250 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2255 QueryPerformanceCounter(&time_funct_start);
2257 start = mach_absolute_time();
2259 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2261 for (
int ch = 0; ch < input.numChannels; ++ch) {
2263 input.width, input.height, histogram);
2266 QueryPerformanceCounter(&time_funct_end);
2267 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2269 stop = mach_absolute_time();
2270 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2272 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2273 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2279 delete[] histogramAllChannels;
2284 static void ThresholdRectToPix_Native(
const unsigned char *imagedata,
2285 int bytes_per_pixel,
int bytes_per_line,
2286 const int *thresholds,
2287 const int *hi_values, Pix **pix) {
2290 int width = pixGetWidth(*pix);
2291 int height = pixGetHeight(*pix);
2293 *pix = pixCreate(width, height, 1);
2294 uint32_t *pixdata = pixGetData(*pix);
2295 int wpl = pixGetWpl(*pix);
2296 const unsigned char* srcdata = imagedata + top * bytes_per_line +
2297 left * bytes_per_pixel;
2298 for (
int y = 0; y < height; ++y) {
2299 const uint8_t *linedata = srcdata;
2300 uint32_t *pixline = pixdata + y * wpl;
2301 for (
int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2302 bool white_result =
true;
2303 for (
int ch = 0; ch < bytes_per_pixel; ++ch) {
2304 if (hi_values[ch] >= 0 &&
2305 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2306 white_result =
false;
2311 CLEAR_DATA_BIT(pixline, x);
2313 SET_DATA_BIT(pixline, x);
2315 srcdata += bytes_per_line;
2319 static double thresholdRectToPixMicroBench(GPUEnv *env,
2320 TessScoreEvaluationInputData input,
2321 ds_device_type type) {
2324 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2325 QueryPerformanceFrequency(&freq);
2327 mach_timebase_info_data_t info = {0, 0};
2328 mach_timebase_info(&info);
2329 long long start, stop;
2331 timespec time_funct_start, time_funct_end;
2335 unsigned char pixelHi = (
unsigned char)255;
2336 int* thresholds =
new int[4];
2337 thresholds[0] = pixelHi/2;
2338 thresholds[1] = pixelHi/2;
2339 thresholds[2] = pixelHi/2;
2340 thresholds[3] = pixelHi/2;
2341 int *hi_values =
new int[4];
2342 thresholds[0] = pixelHi;
2343 thresholds[1] = pixelHi;
2344 thresholds[2] = pixelHi;
2345 thresholds[3] = pixelHi;
2349 int bytes_per_line = input.width*input.numChannels;
2352 if (type == DS_DEVICE_OPENCL_DEVICE) {
2354 QueryPerformanceCounter(&time_funct_start);
2356 start = mach_absolute_time();
2358 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2361 OpenclDevice::gpuEnv = *env;
2362 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2363 input.imageData, input.numChannels, bytes_per_line, thresholds,
2364 hi_values, &input.pix, input.height, input.width, top, left);
2367 QueryPerformanceCounter(&time_funct_end);
2368 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2370 stop = mach_absolute_time();
2372 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2379 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2380 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2388 QueryPerformanceCounter(&time_funct_start);
2390 start = mach_absolute_time();
2392 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2394 ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
2395 thresholds, hi_values, &input.pix );
2398 QueryPerformanceCounter(&time_funct_end);
2399 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2401 stop = mach_absolute_time();
2402 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2404 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2405 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2410 delete[] thresholds;
2415 static double getLineMasksMorphMicroBench(GPUEnv *env,
2416 TessScoreEvaluationInputData input,
2417 ds_device_type type) {
2420 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2421 QueryPerformanceFrequency(&freq);
2423 mach_timebase_info_data_t info = {0, 0};
2424 mach_timebase_info(&info);
2425 long long start, stop;
2427 timespec time_funct_start, time_funct_end;
2431 int resolution = 300;
2432 int wpl = pixGetWpl(input.pix);
2437 int closing_brick = max_line_width / 3;
2440 if (type == DS_DEVICE_OPENCL_DEVICE) {
2442 QueryPerformanceCounter(&time_funct_start);
2444 start = mach_absolute_time();
2446 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2448 OpenclDevice::gpuEnv = *env;
2449 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2450 Pix *pix_vline =
nullptr, *pix_hline =
nullptr, *pix_closed =
nullptr;
2451 OpenclDevice::pixGetLinesCL(
2452 nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed,
true,
2453 closing_brick, closing_brick, max_line_width, max_line_width,
2454 min_line_length, min_line_length);
2456 OpenclDevice::releaseMorphCLBuffers();
2459 QueryPerformanceCounter(&time_funct_end);
2460 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2462 stop = mach_absolute_time();
2463 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2465 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2466 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2470 QueryPerformanceCounter(&time_funct_start);
2472 start = mach_absolute_time();
2474 clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2478 Pix *src_pix = input.pix;
2480 pixCloseBrick(
nullptr, src_pix, closing_brick, closing_brick);
2482 pixOpenBrick(
nullptr, pix_closed, max_line_width, max_line_width);
2483 Pix *pix_hollow = pixSubtract(
nullptr, pix_closed, pix_solid);
2484 pixDestroy(&pix_solid);
2485 Pix *pix_vline = pixOpenBrick(
nullptr, pix_hollow, 1, min_line_length);
2486 Pix *pix_hline = pixOpenBrick(
nullptr, pix_hollow, min_line_length, 1);
2487 pixDestroy(&pix_hollow);
2490 QueryPerformanceCounter(&time_funct_end);
2491 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(
double)(freq.QuadPart);
2493 stop = mach_absolute_time();
2494 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2496 clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2497 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2513 static ds_status serializeScore(ds_device *device,
void **serializedScore,
2514 unsigned int *serializedScoreSize) {
2515 *serializedScoreSize =
sizeof(TessDeviceScore);
2516 *serializedScore =
new unsigned char[*serializedScoreSize];
2517 memcpy(*serializedScore, device->score, *serializedScoreSize);
2522 static ds_status deserializeScore(ds_device *device,
2523 const unsigned char *serializedScore,
2524 unsigned int serializedScoreSize) {
2526 device->score =
new TessDeviceScore;
2527 memcpy(device->score, serializedScore, serializedScoreSize);
2531 static ds_status releaseScore(
void *score) {
2532 delete (TessDeviceScore *)score;
2537 static ds_status evaluateScoreForDevice(ds_device *device,
void *inputData) {
2540 printf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2541 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2542 GPUEnv *env =
nullptr;
2543 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2546 populateGPUEnvFromDevice(env, device->oclDeviceID);
2547 env->mnFileCount = 0;
2548 env->mnKernelCount = 0UL;
2550 OpenclDevice::gpuEnv = *env;
2551 OpenclDevice::CompileKernelFile(env,
"");
2554 TessScoreEvaluationInputData *input =
2555 static_cast<TessScoreEvaluationInputData *
>(inputData);
2558 double composeRGBPixelTime =
2559 composeRGBPixelMicroBench(env, *input, device->type);
2562 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2565 double thresholdRectToPixTime =
2566 thresholdRectToPixMicroBench(env, *input, device->type);
2569 double getLineMasksMorphTime =
2570 getLineMasksMorphMicroBench(env, *input, device->type);
2574 float composeRGBPixelWeight = 1.2f;
2575 float histogramRectWeight = 2.4f;
2576 float thresholdRectToPixWeight = 4.5f;
2577 float getLineMasksMorphWeight = 5.0f;
2579 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2580 histogramRectWeight * histogramRectTime +
2581 thresholdRectToPixWeight * thresholdRectToPixTime +
2582 getLineMasksMorphWeight * getLineMasksMorphTime;
2583 device->score =
new TessDeviceScore;
2584 ((TessDeviceScore *)device->score)->time = weightedTime;
2586 printf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2587 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2588 printf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime,
2589 composeRGBPixelWeight);
2590 printf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime,
2591 histogramRectWeight);
2592 printf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix",
2593 thresholdRectToPixTime, thresholdRectToPixWeight);
2594 printf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime,
2595 getLineMasksMorphWeight);
2596 printf(
"[DS]%25s: %f\n",
"Score",
2597 static_cast<TessDeviceScore *>(device->score)->time);
2602 ds_device OpenclDevice::getDeviceSelection( ) {
2603 if (!deviceIsSelected) {
2606 if (1 == LoadOpencl()) {
2611 ds_profile *profile;
2612 status = initDSProfile(&profile,
"v0.1");
2615 const char *fileName =
"tesseract_opencl_profile_devices.dat";
2616 status = readProfileFromFile(profile, deserializeScore, fileName);
2617 if (status != DS_SUCCESS) {
2619 printf(
"[DS] Profile file not available (%s); performing profiling.\n",
2623 TessScoreEvaluationInputData input;
2624 populateTessScoreEvaluationInputData(&input);
2627 unsigned int numUpdates;
2628 status = profileDevices(profile, DS_EVALUATE_ALL,
2629 evaluateScoreForDevice, &input, &numUpdates);
2632 if (status == DS_SUCCESS) {
2633 status = writeProfileToFile(profile, serializeScore, fileName);
2635 if (status == DS_SUCCESS) {
2636 printf(
"[DS] Scores written to file (%s).\n", fileName);
2639 "[DS] Error saving scores to file (%s); scores not written to " 2645 "[DS] Unable to evaluate performance; scores not written to " 2650 printf(
"[DS] Profile read from file (%s).\n", fileName);
2655 float bestTime = FLT_MAX;
2656 int bestDeviceIdx = -1;
2657 for (
unsigned d = 0; d < profile->numDevices; d++) {
2658 ds_device device = profile->devices[d];
2659 TessDeviceScore score = *(TessDeviceScore *)device.score;
2661 float time = score.time;
2662 printf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2663 device.oclDeviceName, time);
2664 if (time < bestTime) {
2669 printf(
"[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2670 profile->devices[bestDeviceIdx].oclDeviceName,
2671 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2677 bool overridden =
false;
2678 char *overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2679 if (overrideDeviceStr !=
nullptr) {
2680 int overrideDeviceIdx = atoi(overrideDeviceStr);
2681 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2683 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, " 2685 overrideDeviceStr, overrideDeviceIdx);
2686 bestDeviceIdx = overrideDeviceIdx - 1;
2690 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are " 2691 "valid devices).\n",
2692 overrideDeviceStr, profile->numDevices);
2697 printf(
"[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2698 profile->devices[bestDeviceIdx].oclDeviceName,
2699 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2703 selectedDevice = profile->devices[bestDeviceIdx];
2705 releaseDSProfile(profile, releaseScore);
2708 printf(
"[DS] OpenCL runtime not available.\n");
2709 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2710 selectedDevice.oclDeviceName =
"(null)";
2711 selectedDevice.score =
nullptr;
2712 selectedDevice.oclDeviceID =
nullptr;
2713 selectedDevice.oclDriverVersion =
nullptr;
2715 deviceIsSelected =
true;
2720 return selectedDevice;
2724 bool OpenclDevice::selectedDeviceIsOpenCL() {
2725 ds_device device = getDeviceSelection();
2726 return (device.type == DS_DEVICE_OPENCL_DEVICE);
#define PERF_COUNT_SUB(SUB)
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
#define PERF_COUNT_START(FUNCT_NAME)
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)