13# include "config_auto.h"
22# include <sys/types.h>
36# if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(__CYGWIN__) || \
40# elif defined(__linux__)
43# elif defined(__APPLE__)
52# include <mach/mach_time.h>
65GPUEnv OpenclDevice::gpuEnv;
67bool OpenclDevice::deviceIsSelected =
false;
68ds_device OpenclDevice::selectedDevice;
70int OpenclDevice::isInited = 0;
72static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
74static const l_uint32 lmask32[] = {
75 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
76 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
77 0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
78 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
80static const l_uint32 rmask32[] = {
81 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
82 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
83 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
84 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
86static cl_mem pixsCLBuffer, pixdCLBuffer,
88static cl_mem pixThBuffer;
89static cl_int clStatus;
92# define DS_TAG_VERSION "<version>"
93# define DS_TAG_VERSION_END "</version>"
94# define DS_TAG_DEVICE "<device>"
95# define DS_TAG_DEVICE_END "</device>"
96# define DS_TAG_SCORE "<score>"
97# define DS_TAG_SCORE_END "</score>"
98# define DS_TAG_DEVICE_TYPE "<type>"
99# define DS_TAG_DEVICE_TYPE_END "</type>"
100# define DS_TAG_DEVICE_NAME "<name>"
101# define DS_TAG_DEVICE_NAME_END "</name>"
102# define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
103# define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
105# define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
107# define DS_DEVICE_NAME_LENGTH 256
109enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
112 std::vector<ds_device> devices;
113 unsigned int numDevices;
119 DS_INVALID_PROFILE = 1000,
121 DS_INVALID_PERF_EVALUATOR_TYPE,
122 DS_INVALID_PERF_EVALUATOR,
123 DS_PERF_EVALUATOR_ERROR,
125 DS_UNKNOWN_DEVICE_TYPE,
126 DS_PROFILE_FILE_ERROR,
127 DS_SCORE_SERIALIZER_ERROR,
128 DS_SCORE_DESERIALIZER_ERROR
135typedef ds_status (*ds_perf_evaluator)(ds_device *device,
void *data);
138typedef ds_status (*ds_score_release)(TessDeviceScore *score);
140static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
141 ds_status status = DS_SUCCESS;
142 if (profile !=
nullptr) {
145 for (
i = 0;
i < profile->numDevices;
i++) {
146 free(profile->devices[
i].oclDeviceName);
147 free(profile->devices[
i].oclDriverVersion);
148 status = sr(profile->devices[
i].score);
149 if (status != DS_SUCCESS)
158static ds_status initDSProfile(ds_profile **
p,
const char *version) {
160 cl_uint numPlatforms;
161 std::vector<cl_platform_id> platforms;
162 std::vector<cl_device_id> devices;
163 ds_status status = DS_SUCCESS;
168 return DS_INVALID_PROFILE;
170 ds_profile *profile =
new ds_profile;
172 memset(profile, 0,
sizeof(ds_profile));
174 clGetPlatformIDs(0,
nullptr, &numPlatforms);
176 if (numPlatforms > 0) {
177 platforms.resize(numPlatforms);
178 clGetPlatformIDs(numPlatforms, platforms.data(),
nullptr);
182 for (
i = 0;
i < numPlatforms;
i++) {
184 clGetDeviceIDs(platforms[
i], CL_DEVICE_TYPE_ALL, 0,
nullptr, &num);
188 if (numDevices > 0) {
189 devices.resize(numDevices);
192 profile->numDevices = numDevices + 1;
193 profile->devices.resize(profile->numDevices);
196 for (
i = 0;
i < numPlatforms;
i++) {
199 clGetDeviceIDs(platforms[
i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
200 for (j = 0; j < num; j++,
next++) {
201 char buffer[DS_DEVICE_NAME_LENGTH];
204 profile->devices[
next].type = DS_DEVICE_OPENCL_DEVICE;
205 profile->devices[
next].oclDeviceID = devices[j];
207 clGetDeviceInfo(profile->devices[
next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH,
209 length = strlen(buffer);
210 profile->devices[
next].oclDeviceName = (
char *)malloc(length + 1);
211 memcpy(profile->devices[
next].oclDeviceName, buffer, length + 1);
213 clGetDeviceInfo(profile->devices[
next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH,
215 length = strlen(buffer);
216 profile->devices[
next].oclDriverVersion = (
char *)malloc(length + 1);
217 memcpy(profile->devices[
next].oclDriverVersion, buffer, length + 1);
220 profile->devices[
next].type = DS_DEVICE_NATIVE_CPU;
221 profile->version = version;
227static ds_status profileDevices(ds_profile *profile,
const ds_evaluation_type
type,
228 ds_perf_evaluator evaluator,
void *evaluatorData,
229 unsigned int *numUpdates) {
230 ds_status status = DS_SUCCESS;
232 unsigned int updates = 0;
234 if (profile ==
nullptr) {
235 return DS_INVALID_PROFILE;
237 if (evaluator ==
nullptr) {
238 return DS_INVALID_PERF_EVALUATOR;
241 for (
i = 0;
i < profile->numDevices;
i++) {
242 ds_status evaluatorStatus;
245 case DS_EVALUATE_NEW_ONLY:
246 if (profile->devices[
i].score !=
nullptr)
249 case DS_EVALUATE_ALL:
250 evaluatorStatus = evaluator(&profile->devices[
i], evaluatorData);
251 if (evaluatorStatus != DS_SUCCESS) {
252 status = evaluatorStatus;
258 return DS_INVALID_PERF_EVALUATOR_TYPE;
263 *numUpdates = updates;
267static const char *findString(
const char *contentStart,
const char *contentEnd,
268 const char *
string) {
270 const char *currentPosition;
271 const char *found =
nullptr;
272 stringLength = strlen(
string);
273 currentPosition = contentStart;
274 for (currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
275 if (*currentPosition ==
string[0]) {
276 if (currentPosition + stringLength < contentEnd) {
277 if (strncmp(currentPosition,
string, stringLength) == 0) {
278 found = currentPosition;
287static ds_status readProFile(
const char *fileName,
char **content,
size_t *contentSize) {
290 ds_status status = DS_SUCCESS;
291 FILE *input = fopen(fileName,
"rb");
292 if (input ==
nullptr) {
293 status = DS_FILE_ERROR;
295 fseek(input, 0L, SEEK_END);
296 auto pos = std::ftell(input);
300 char *binary =
new char[size];
301 if (fread(binary,
sizeof(
char), size, input) != size) {
302 status = DS_FILE_ERROR;
314typedef ds_status (*ds_score_deserializer)(ds_device *device,
const uint8_t *serializedScore,
315 unsigned int serializedScoreSize);
317static ds_status readProfileFromFile(ds_profile *profile, ds_score_deserializer deserializer,
319 ds_status status = DS_SUCCESS;
323 if (profile ==
nullptr)
324 return DS_INVALID_PROFILE;
326 status = readProFile(
file, &contentStart, &contentSize);
327 if (status == DS_SUCCESS) {
328 const char *currentPosition;
329 const char *dataStart;
332 const char *contentEnd = contentStart + contentSize;
333 currentPosition = contentStart;
336 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
337 if (dataStart ==
nullptr) {
338 status = DS_PROFILE_FILE_ERROR;
341 dataStart += strlen(DS_TAG_VERSION);
343 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
344 if (dataEnd ==
nullptr) {
345 status = DS_PROFILE_FILE_ERROR;
349 size_t versionStringLength = strlen(profile->version);
350 if (versionStringLength + dataStart != dataEnd ||
351 strncmp(profile->version, dataStart, versionStringLength) != 0) {
353 status = DS_PROFILE_FILE_ERROR;
356 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
362 const char *deviceTypeStart;
363 const char *deviceTypeEnd;
364 ds_device_type deviceType;
366 const char *deviceNameStart;
367 const char *deviceNameEnd;
369 const char *deviceScoreStart;
370 const char *deviceScoreEnd;
372 const char *deviceDriverStart;
373 const char *deviceDriverEnd;
375 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
376 if (dataStart ==
nullptr) {
380 dataStart += strlen(DS_TAG_DEVICE);
381 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
382 if (dataEnd ==
nullptr) {
383 status = DS_PROFILE_FILE_ERROR;
388 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
389 if (deviceTypeStart ==
nullptr) {
390 status = DS_PROFILE_FILE_ERROR;
393 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
394 deviceTypeEnd = findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
395 if (deviceTypeEnd ==
nullptr) {
396 status = DS_PROFILE_FILE_ERROR;
399 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
402 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
403 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
404 if (deviceNameStart ==
nullptr) {
405 status = DS_PROFILE_FILE_ERROR;
408 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
409 deviceNameEnd = findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
410 if (deviceNameEnd ==
nullptr) {
411 status = DS_PROFILE_FILE_ERROR;
415 deviceDriverStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
416 if (deviceDriverStart ==
nullptr) {
417 status = DS_PROFILE_FILE_ERROR;
420 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
422 findString(deviceDriverStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION_END);
423 if (deviceDriverEnd ==
nullptr) {
424 status = DS_PROFILE_FILE_ERROR;
429 for (
i = 0;
i < profile->numDevices;
i++) {
430 if (profile->devices[
i].type == DS_DEVICE_OPENCL_DEVICE) {
431 size_t actualDeviceNameLength;
432 size_t driverVersionLength;
434 actualDeviceNameLength = strlen(profile->devices[
i].oclDeviceName);
435 driverVersionLength = strlen(profile->devices[
i].oclDriverVersion);
436 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
437 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
438 strncmp(profile->devices[
i].oclDeviceName, deviceNameStart,
439 actualDeviceNameLength) == 0 &&
440 strncmp(profile->devices[
i].oclDriverVersion, deviceDriverStart,
441 driverVersionLength) == 0) {
442 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
443 deviceScoreStart += strlen(DS_TAG_SCORE);
444 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
445 status = deserializer(&profile->devices[
i], (
const unsigned char *)deviceScoreStart,
446 deviceScoreEnd - deviceScoreStart);
447 if (status != DS_SUCCESS) {
453 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
454 for (
i = 0;
i < profile->numDevices;
i++) {
455 if (profile->devices[
i].type == DS_DEVICE_NATIVE_CPU) {
456 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
457 if (deviceScoreStart ==
nullptr) {
458 status = DS_PROFILE_FILE_ERROR;
461 deviceScoreStart += strlen(DS_TAG_SCORE);
462 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
463 status = deserializer(&profile->devices[
i], (
const unsigned char *)deviceScoreStart,
464 deviceScoreEnd - deviceScoreStart);
465 if (status != DS_SUCCESS) {
473 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
477 delete[] contentStart;
481typedef ds_status (*ds_score_serializer)(ds_device *device, uint8_t **serializedScore,
482 unsigned int *serializedScoreSize);
483static ds_status writeProfileToFile(ds_profile *profile, ds_score_serializer serializer,
485 ds_status status = DS_SUCCESS;
487 if (profile ==
nullptr)
488 return DS_INVALID_PROFILE;
490 FILE *profileFile = fopen(
file,
"wb");
491 if (profileFile ==
nullptr) {
492 status = DS_FILE_ERROR;
497 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
498 fwrite(profile->version,
sizeof(
char), strlen(profile->version), profileFile);
499 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END), profileFile);
500 fwrite(
"\n",
sizeof(
char), 1, profileFile);
502 for (
i = 0;
i < profile->numDevices && status == DS_SUCCESS;
i++) {
503 uint8_t *serializedScore;
504 unsigned int serializedScoreSize;
506 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
508 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
509 fwrite(&profile->devices[
i].type,
sizeof(ds_device_type), 1, profileFile);
510 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
512 switch (profile->devices[
i].type) {
513 case DS_DEVICE_NATIVE_CPU: {
524 case DS_DEVICE_OPENCL_DEVICE: {
525 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME), profileFile);
526 fwrite(profile->devices[
i].oclDeviceName,
sizeof(
char),
527 strlen(profile->devices[
i].oclDeviceName), profileFile);
528 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
530 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char), strlen(DS_TAG_DEVICE_DRIVER_VERSION),
532 fwrite(profile->devices[
i].oclDriverVersion,
sizeof(
char),
533 strlen(profile->devices[
i].oclDriverVersion), profileFile);
534 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
535 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
538 status = DS_UNKNOWN_DEVICE_TYPE;
542 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
543 status = serializer(&profile->devices[
i], &serializedScore, &serializedScoreSize);
544 if (status == DS_SUCCESS && serializedScore !=
nullptr && serializedScoreSize > 0) {
545 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
546 delete[] serializedScore;
548 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END), profileFile);
549 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END), profileFile);
550 fwrite(
"\n",
sizeof(
char), 1, profileFile);
558static void legalizeFileName(
char *fileName) {
560 const char *invalidChars =
"/\?:*\"><| ";
562 for (
unsigned i = 0;
i < strlen(invalidChars);
i++) {
564 invalidStr[0] = invalidChars[
i];
565 invalidStr[1] =
'\0';
571 for (
char *pos = strstr(fileName, invalidStr); pos !=
nullptr;
572 pos = strstr(pos + 1, invalidStr)) {
580static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
583 gpuInfo->mnIsUserCreated = 1;
585 gpuInfo->mpDevID = device;
586 gpuInfo->mpArryDevsID =
new cl_device_id[1];
587 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
588 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
sizeof(cl_device_type),
589 &gpuInfo->mDevType, &size);
590 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
592 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
sizeof(cl_platform_id),
593 &gpuInfo->mpPlatformID, &size);
594 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
596 cl_context_properties props[3];
597 props[0] = CL_CONTEXT_PLATFORM;
598 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
600 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID,
nullptr,
nullptr, &clStatus);
601 CHECK_OPENCL(clStatus,
"populateGPUEnv::createContext");
603 cl_command_queue_properties queueProperties = 0;
604 gpuInfo->mpCmdQueue =
605 clCreateCommandQueue(gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
606 CHECK_OPENCL(clStatus,
"populateGPUEnv::createCommandQueue");
609int OpenclDevice::LoadOpencl() {
611 HINSTANCE HOpenclDll =
nullptr;
612 void *OpenclDll =
nullptr;
614 OpenclDll =
static_cast<HINSTANCE
>(HOpenclDll);
615 OpenclDll = LoadLibrary(
"openCL.dll");
616 if (!
static_cast<HINSTANCE
>(OpenclDll)) {
617 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
618 FreeLibrary(
static_cast<HINSTANCE
>(OpenclDll));
621 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
625int OpenclDevice::SetKernelEnv(KernelEnv *envInfo) {
626 envInfo->mpkContext = gpuEnv.mpContext;
627 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
628 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
633static cl_mem allocateZeroCopyBuffer(
const KernelEnv &rEnv, l_uint32 *hostbuffer,
size_t nElements,
634 cl_mem_flags flags, cl_int *pStatus) {
635 cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
636 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
641static Image mapOutputCLBuffer(
const KernelEnv &rEnv, cl_mem clbuffer, Image pixd, Image pixs,
642 int elements, cl_mem_flags flags,
bool memcopy =
false,
646 if ((pixd = pixCreateTemplate(pixs)) ==
nullptr)
649 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) ==
655 (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
656 elements *
sizeof(l_uint32), 0,
nullptr,
nullptr,
nullptr);
659 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
661 pixSetData(pixd, pValues);
664 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0,
nullptr,
nullptr);
667 clFinish(rEnv.mpkCmdQueue);
673void OpenclDevice::releaseMorphCLBuffers() {
674 if (pixdCLIntermediate !=
nullptr)
675 clReleaseMemObject(pixdCLIntermediate);
676 if (pixsCLBuffer !=
nullptr)
677 clReleaseMemObject(pixsCLBuffer);
678 if (pixdCLBuffer !=
nullptr)
679 clReleaseMemObject(pixdCLBuffer);
680 if (pixThBuffer !=
nullptr)
681 clReleaseMemObject(pixThBuffer);
682 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer =
nullptr;
685int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Image pixs) {
688 if (pixThBuffer !=
nullptr) {
689 pixsCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
692 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
693 sizeof(l_uint32) * wpl * h, 0,
nullptr,
nullptr);
696 l_uint32 *srcdata =
reinterpret_cast<l_uint32 *
>(malloc(wpl * h *
sizeof(l_uint32)));
697 memcpy(srcdata, pixGetData(pixs), wpl * h *
sizeof(l_uint32));
699 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, CL_MEM_USE_HOST_PTR, &clStatus);
702 pixdCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
705 allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
707 return (
int)clStatus;
710int OpenclDevice::InitEnv() {
714 if (1 == LoadOpencl())
720 InitOpenclRunEnv_DeviceSelection(0);
724int OpenclDevice::ReleaseOpenclRunEnv() {
725 ReleaseOpenclEnv(&gpuEnv);
732inline int OpenclDevice::AddKernelConfig(
int kCount,
const char *kName) {
734 ASSERT_HOST(strlen(kName) <
sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
735 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
736 gpuEnv.mnKernelCount++;
740int OpenclDevice::RegistOpenclKernel() {
741 if (!gpuEnv.mnIsUserCreated)
742 memset(&gpuEnv, 0,
sizeof(gpuEnv));
744 gpuEnv.mnFileCount = 0;
745 gpuEnv.mnKernelCount = 0UL;
747 AddKernelConfig(1,
"oclAverageSub1");
751int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc) {
754 ds_device bestDevice_DS = getDeviceSelection();
755 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
757 if (selectedDeviceIsOpenCL()) {
760 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
761 gpuEnv.mnFileCount = 0;
762 gpuEnv.mnKernelCount = 0UL;
763 CompileKernelFile(&gpuEnv,
"");
773OpenclDevice::OpenclDevice() {
777OpenclDevice::~OpenclDevice() {
781int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
789 for (
i = 0;
i < gpuEnv.mnFileCount;
i++) {
790 if (gpuEnv.mpArryPrograms[
i]) {
791 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[
i]);
792 CHECK_OPENCL(clStatus,
"clReleaseProgram");
793 gpuEnv.mpArryPrograms[
i] =
nullptr;
796 if (gpuEnv.mpCmdQueue) {
797 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
798 gpuEnv.mpCmdQueue =
nullptr;
800 if (gpuEnv.mpContext) {
801 clReleaseContext(gpuEnv.mpContext);
802 gpuEnv.mpContext =
nullptr;
805 gpuInfo->mnIsUserCreated = 0;
806 delete[] gpuInfo->mpArryDevsID;
809int OpenclDevice::BinaryGenerated(
const char *clFileName, FILE **fhandle) {
816 char deviceName[1024];
817 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[
i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName,
819 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
820 const char *str = strstr(clFileName,
".cl");
821 memcpy(cl_name, clFileName, str - clFileName);
822 cl_name[str - clFileName] =
'\0';
823 snprintf(fileName,
sizeof(fileName),
"%s-%s.bin", cl_name, deviceName);
824 legalizeFileName(fileName);
825 fd = fopen(fileName,
"rb");
826 status = (fd !=
nullptr) ? 1 : 0;
832int OpenclDevice::CachedOfKernerPrg(
const GPUEnv *gpuEnvCached,
const char *clFileName) {
834 for (
i = 0;
i < gpuEnvCached->mnFileCount;
i++) {
835 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[
i], clFileName) == 0) {
836 if (gpuEnvCached->mpArryPrograms[
i] !=
nullptr) {
844int OpenclDevice::WriteBinaryToFile(
const char *fileName,
const char *birary,
size_t numBytes) {
846 output = fopen(fileName,
"wb");
851 fwrite(birary,
sizeof(
char), numBytes,
output);
857int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
const char *clFileName) {
863 clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
sizeof(numDevices), &numDevices,
nullptr);
864 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
866 std::vector<cl_device_id> mpArryDevsID(numDevices);
869 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
sizeof(cl_device_id) * numDevices,
870 &mpArryDevsID[0],
nullptr);
871 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
874 std::vector<size_t> binarySizes(numDevices);
876 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(
size_t) * numDevices,
877 &binarySizes[0],
nullptr);
878 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
881 std::vector<char *> binaries(numDevices);
883 for (
i = 0;
i < numDevices;
i++) {
884 if (binarySizes[
i] != 0) {
885 binaries[
i] =
new char[binarySizes[
i]];
887 binaries[
i] =
nullptr;
891 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
sizeof(
char *) * numDevices,
892 &binaries[0],
nullptr);
893 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
896 for (
i = 0;
i < numDevices;
i++) {
897 if (binarySizes[
i] != 0) {
900 char deviceName[1024];
902 clGetDeviceInfo(mpArryDevsID[
i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName,
nullptr);
903 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
905 const char *str = strstr(clFileName,
".cl");
906 memcpy(cl_name, clFileName, str - clFileName);
907 cl_name[str - clFileName] =
'\0';
908 snprintf(fileName,
sizeof(fileName),
"%s-%s.bin", cl_name, deviceName);
909 legalizeFileName(fileName);
910 if (!WriteBinaryToFile(fileName, binaries[
i], binarySizes[
i])) {
911 tprintf(
"[OD] write binary[%s] failed\n", fileName);
914 tprintf(
"[OD] write binary[%s] successfully\n", fileName);
919 for (
i = 0;
i < numDevices;
i++) {
920 delete[] binaries[
i];
926int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo,
const char *buildOption) {
929 size_t source_size[1];
930 int binary_status, binaryExisted, idx;
933 const char *filename =
"kernel.cl";
935 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
939 idx = gpuInfo->mnFileCount;
943 source_size[0] = strlen(source);
945 binaryExisted = BinaryGenerated(filename, &fd);
946 if (binaryExisted == 1) {
947 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
sizeof(numDevices),
948 &numDevices,
nullptr);
949 CHECK_OPENCL(clStatus,
"clGetContextInfo");
951 std::vector<cl_device_id> mpArryDevsID(numDevices);
952 bool b_error = fseek(fd, 0, SEEK_END) < 0;
953 auto pos = std::ftell(fd);
954 b_error |= (pos <= 0);
956 b_error |= fseek(fd, 0, SEEK_SET) < 0;
962 std::vector<uint8_t> binary(length + 2);
964 memset(&binary[0], 0, length + 2);
965 b_error |= fread(&binary[0], 1, length, fd) != length;
970 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
971 sizeof(cl_device_id) * numDevices, &mpArryDevsID[0],
nullptr);
972 CHECK_OPENCL(clStatus,
"clGetContextInfo");
974 const uint8_t *c_binary = &binary[0];
975 gpuInfo->mpArryPrograms[idx] =
976 clCreateProgramWithBinary(gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length,
977 &c_binary, &binary_status, &clStatus);
978 CHECK_OPENCL(clStatus,
"clCreateProgramWithBinary");
982 gpuInfo->mpArryPrograms[idx] =
983 clCreateProgramWithSource(gpuInfo->mpContext, 1, &source, source_size, &clStatus);
984 CHECK_OPENCL(clStatus,
"clCreateProgramWithSource");
987 if (gpuInfo->mpArryPrograms[idx] == (cl_program)
nullptr) {
994 if (!gpuInfo->mnIsUserCreated) {
995 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption,
998 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption,
1001 if (clStatus != CL_SUCCESS) {
1002 tprintf(
"BuildProgram error!\n");
1004 if (!gpuInfo->mnIsUserCreated) {
1005 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1006 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1008 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1009 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1011 if (clStatus != CL_SUCCESS) {
1012 tprintf(
"opencl create build log fail\n");
1015 std::vector<char> buildLog(length);
1016 if (!gpuInfo->mnIsUserCreated) {
1017 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1018 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1020 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1021 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1023 if (clStatus != CL_SUCCESS) {
1024 tprintf(
"opencl program build info fail\n");
1028 fd1 = fopen(
"kernel-build.log",
"w+");
1029 if (fd1 !=
nullptr) {
1030 fwrite(&buildLog[0],
sizeof(
char), length, fd1);
1037 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1038 if (binaryExisted == 0) {
1039 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1042 gpuInfo->mnFileCount += 1;
1046l_uint32 *OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata, l_int32 w, l_int32 h, l_int32 wpl,
1050 size_t globalThreads[2];
1051 size_t localThreads[2];
1057 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1058 globalThreads[0] = gsize;
1059 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1060 globalThreads[1] = gsize;
1061 localThreads[0] = GROUPSIZE_X;
1062 localThreads[1] = GROUPSIZE_Y;
1064 SetKernelEnv(&rEnv);
1066 l_uint32 *pResult = (l_uint32 *)malloc(w * h *
sizeof(l_uint32));
1067 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"composeRGBPixel", &clStatus);
1068 CHECK_OPENCL(clStatus,
"clCreateKernel composeRGBPixel");
1071 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1073 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
1077 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &valuesCl);
1078 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1079 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(w), &w);
1080 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1081 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(h), &h);
1082 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1083 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1084 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1085 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outputCl);
1086 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1089 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1090 localThreads, 0,
nullptr,
nullptr);
1091 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel");
1094 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0,
1095 w * h *
sizeof(l_uint32), 0,
nullptr,
nullptr, &clStatus);
1096 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer outputCl");
1097 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0,
nullptr,
nullptr);
1100 clFinish(rEnv.mpkCmdQueue);
1106static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1107 size_t globalThreads[2];
1111 size_t localThreads[2];
1114 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1115 globalThreads[0] = gsize;
1116 globalThreads[1] = GROUPSIZE_HMORY;
1117 localThreads[0] = GROUPSIZE_HMORX;
1118 localThreads[1] = GROUPSIZE_HMORY;
1120 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_5x5", &status);
1121 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1123 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1124 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1125 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1126 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1128 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1129 localThreads, 0,
nullptr,
nullptr);
1132 pixtemp = pixsCLBuffer;
1133 pixsCLBuffer = pixdCLBuffer;
1134 pixdCLBuffer = pixtemp;
1137 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1138 globalThreads[0] = gsize;
1139 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1140 globalThreads[1] = gsize;
1141 localThreads[0] = GROUPSIZE_X;
1142 localThreads[1] = GROUPSIZE_Y;
1144 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer_5x5", &status);
1145 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1147 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1148 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1149 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1150 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1151 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1152 localThreads, 0,
nullptr,
nullptr);
1159static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1160 size_t globalThreads[2];
1164 l_uint32 fwmask, lwmask;
1165 size_t localThreads[2];
1167 lwmask = lmask32[31 - 2];
1168 fwmask = rmask32[31 - 2];
1171 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1172 globalThreads[0] = gsize;
1173 globalThreads[1] = GROUPSIZE_HMORY;
1174 localThreads[0] = GROUPSIZE_HMORX;
1175 localThreads[1] = GROUPSIZE_HMORY;
1177 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_5x5", &status);
1178 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1180 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1181 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1182 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1183 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1185 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1186 localThreads, 0,
nullptr,
nullptr);
1189 pixtemp = pixsCLBuffer;
1190 pixsCLBuffer = pixdCLBuffer;
1191 pixdCLBuffer = pixtemp;
1194 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1195 globalThreads[0] = gsize;
1196 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1197 globalThreads[1] = gsize;
1198 localThreads[0] = GROUPSIZE_X;
1199 localThreads[1] = GROUPSIZE_Y;
1201 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer_5x5", &status);
1202 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1204 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1205 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1206 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1207 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1208 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1209 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1210 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1211 localThreads, 0,
nullptr,
nullptr);
1217static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1218 l_int32 xp, yp, xn, yn;
1220 size_t globalThreads[2];
1224 size_t localThreads[2];
1227 OpenclDevice::SetKernelEnv(&rEnv);
1229 if (hsize == 5 && vsize == 5) {
1231 status = pixDilateCL_55(wpl, h);
1235 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1237 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1240 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1241 globalThreads[0] = gsize;
1242 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1243 globalThreads[1] = gsize;
1244 localThreads[0] = GROUPSIZE_X;
1245 localThreads[1] = GROUPSIZE_Y;
1247 if (xp > 31 || xn > 31) {
1249 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1250 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1252 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1253 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1254 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1255 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1256 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1257 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1258 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1259 localThreads, 0,
nullptr,
nullptr);
1261 if (yp > 0 || yn > 0) {
1262 pixtemp = pixsCLBuffer;
1263 pixsCLBuffer = pixdCLBuffer;
1264 pixdCLBuffer = pixtemp;
1266 }
else if (xp > 0 || xn > 0) {
1268 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1269 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1270 isEven = (xp != xn);
1272 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1273 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1274 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1275 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1276 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1277 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1278 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1279 localThreads, 0,
nullptr,
nullptr);
1281 if (yp > 0 || yn > 0) {
1282 pixtemp = pixsCLBuffer;
1283 pixsCLBuffer = pixdCLBuffer;
1284 pixdCLBuffer = pixtemp;
1288 if (yp > 0 || yn > 0) {
1289 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer", &status);
1290 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1292 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1293 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1294 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1295 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1296 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1297 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1298 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1299 localThreads, 0,
nullptr,
nullptr);
1306static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1307 l_int32 xp, yp, xn, yn;
1309 size_t globalThreads[2];
1310 size_t localThreads[2];
1314 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1315 l_uint32 rwmask, lwmask;
1318 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1320 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1322 OpenclDevice::SetKernelEnv(&rEnv);
1324 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1326 status = pixErodeCL_55(wpl, h);
1330 lwmask = lmask32[31 - (xn & 31)];
1331 rwmask = rmask32[31 - (xp & 31)];
1334 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1335 globalThreads[0] = gsize;
1336 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1337 globalThreads[1] = gsize;
1338 localThreads[0] = GROUPSIZE_X;
1339 localThreads[1] = GROUPSIZE_Y;
1342 if (xp > 31 || xn > 31) {
1344 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1346 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1347 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1348 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1349 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1350 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1351 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1352 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1353 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1354 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1355 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1356 localThreads, 0,
nullptr,
nullptr);
1358 if (yp > 0 || yn > 0) {
1359 pixtemp = pixsCLBuffer;
1360 pixsCLBuffer = pixdCLBuffer;
1361 pixdCLBuffer = pixtemp;
1363 }
else if (xp > 0 || xn > 0) {
1364 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1365 isEven = (xp != xn);
1367 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1368 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1369 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1370 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1371 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1372 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1373 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1374 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1375 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1376 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1377 localThreads, 0,
nullptr,
nullptr);
1379 if (yp > 0 || yn > 0) {
1380 pixtemp = pixsCLBuffer;
1381 pixsCLBuffer = pixdCLBuffer;
1382 pixdCLBuffer = pixtemp;
1387 if (yp > 0 || yn > 0) {
1388 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1389 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1391 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1392 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1393 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1394 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1395 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1396 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1397 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1398 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1399 localThreads, 0,
nullptr,
nullptr);
1406static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1411 status = pixErodeCL(hsize, vsize, wpl, h);
1413 pixtemp = pixsCLBuffer;
1414 pixsCLBuffer = pixdCLBuffer;
1415 pixdCLBuffer = pixtemp;
1417 status = pixDilateCL(hsize, vsize, wpl, h);
1423static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1428 status = pixDilateCL(hsize, vsize, wpl, h);
1430 pixtemp = pixsCLBuffer;
1431 pixsCLBuffer = pixdCLBuffer;
1432 pixdCLBuffer = pixtemp;
1434 status = pixErodeCL(hsize, vsize, wpl, h);
1440static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2) {
1442 size_t globalThreads[2];
1444 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1446 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1447 globalThreads[0] = gsize;
1448 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1449 globalThreads[1] = gsize;
1451 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1452 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1455 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1456 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1457 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1458 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1459 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr, globalThreads,
1460 localThreads, 0,
nullptr,
nullptr);
1468void OpenclDevice::pixGetLinesCL(Image pixd, Image pixs, Image *pix_vline, Image *pix_hline,
1469 Image *pixClosed,
bool getpixClosed, l_int32 close_hsize,
1470 l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize,
1471 l_int32 line_hsize, l_int32 line_vsize) {
1475 wpl = pixGetWpl(pixs);
1476 h = pixGetHeight(pixs);
1479 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1484 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl * h, CL_MAP_READ,
true,
false);
1489 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1490 sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1493 pixtemp = pixsCLBuffer;
1494 pixsCLBuffer = pixdCLBuffer;
1495 pixdCLBuffer = pixtemp;
1497 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1500 pixtemp = pixsCLBuffer;
1501 pixsCLBuffer = pixdCLBuffer;
1502 pixdCLBuffer = pixdCLIntermediate;
1503 pixdCLIntermediate = pixtemp;
1505 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1509 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1510 sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1512 pixtemp = pixsCLBuffer;
1513 pixsCLBuffer = pixdCLBuffer;
1514 pixdCLBuffer = pixtemp;
1518 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1522 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, CL_MAP_READ,
true,
false);
1524 pixtemp = pixsCLBuffer;
1525 pixsCLBuffer = pixdCLIntermediate;
1526 pixdCLIntermediate = pixtemp;
1530 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1534 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, CL_MAP_READ,
true,
true);
1545int OpenclDevice::HistogramRectOCL(
void *imageData,
int bytes_per_pixel,
int bytes_per_line,
1549 int *histogramAllChannels) {
1553 SetKernelEnv(&histKern);
1554 KernelEnv histRedKern;
1555 SetKernelEnv(&histRedKern);
1561 cl_mem imageBuffer =
1562 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1563 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1564 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1567 int block_size = 256;
1569 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(numCUs), &numCUs,
1571 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1573 int requestedOccupancy = 10;
1574 int numWorkGroups = numCUs * requestedOccupancy;
1575 int numThreads = block_size * numWorkGroups;
1576 size_t local_work_size[] = {
static_cast<size_t>(block_size)};
1577 size_t global_work_size[] = {
static_cast<size_t>(numThreads)};
1578 size_t red_global_work_size[] = {
1579 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1583 cl_mem histogramBuffer = clCreateBuffer(
1584 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1585 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels, &clStatus);
1586 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1590 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1592 cl_mem tmpHistogramBuffer =
1593 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins *
sizeof(cl_uint),
1594 nullptr, &clStatus);
1595 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1598 int *zeroBuffer =
new int[1];
1600 cl_mem atomicSyncBuffer =
1601 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(cl_int),
1602 zeroBuffer, &clStatus);
1603 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1604 delete[] zeroBuffer;
1606 if (bytes_per_pixel == 1) {
1607 histKern.mpkKernel =
1608 clCreateKernel(histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1609 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1611 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1612 "kernel_HistogramRectOneChannelReduction", &clStatus);
1613 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannelReduction");
1615 histKern.mpkKernel =
1616 clCreateKernel(histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus);
1617 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1619 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1620 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1621 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannelsReduction");
1627 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1628 tmpHistogramBins *
sizeof(cl_uint), 0,
nullptr,
nullptr, &clStatus);
1629 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1631 memset(ptr, 0, tmpHistogramBins *
sizeof(cl_uint));
1632 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
nullptr,
nullptr);
1635 clStatus = clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1636 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1637 cl_uint numPixels = width * height;
1638 clStatus = clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1639 CHECK_OPENCL(clStatus,
"clSetKernelArg numPixels");
1640 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem), &tmpHistogramBuffer);
1641 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1644 int n = numThreads / bytes_per_pixel;
1645 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1646 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1647 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem), &tmpHistogramBuffer);
1648 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1649 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem), &histogramBuffer);
1650 CHECK_OPENCL(clStatus,
"clSetKernelArg histogramBuffer");
1653 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
nullptr,
1654 global_work_size, local_work_size, 0,
nullptr,
nullptr);
1655 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1656 clFinish(histKern.mpkCmdQueue);
1657 if (clStatus != 0) {
1661 clStatus = clEnqueueNDRangeKernel(histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1,
nullptr,
1662 red_global_work_size, local_work_size, 0,
nullptr,
nullptr);
1663 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1664 clFinish(histRedKern.mpkCmdQueue);
1665 if (clStatus != 0) {
1670 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0,
1671 kHistogramSize * bytes_per_pixel *
sizeof(
int), 0,
nullptr,
nullptr,
1673 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1674 if (clStatus != 0) {
1677 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
nullptr,
nullptr);
1679 clReleaseMemObject(histogramBuffer);
1680 clReleaseMemObject(imageBuffer);
1689int OpenclDevice::ThresholdRectToPixOCL(
unsigned char *imageData,
int bytes_per_pixel,
1690 int bytes_per_line,
int *thresholds,
int *hi_values,
1691 Image *pix,
int height,
int width,
int top,
int left) {
1694 *pix = pixCreate(width, height, 1);
1695 uint32_t *pixData = pixGetData(*pix);
1696 int wpl = pixGetWpl(*pix);
1697 int pixSize = wpl * height *
sizeof(uint32_t);
1701 SetKernelEnv(&rEnv);
1704 int block_size = 256;
1706 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(numCUs), &numCUs,
1708 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1710 int requestedOccupancy = 10;
1711 int numWorkGroups = numCUs * requestedOccupancy;
1712 int numThreads = block_size * numWorkGroups;
1713 size_t local_work_size[] = {(size_t)block_size};
1714 size_t global_work_size[] = {(size_t)numThreads};
1721 cl_mem imageBuffer =
1722 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1723 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1724 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1727 pixThBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize,
1728 pixData, &clStatus);
1729 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1732 cl_mem thresholdsBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1733 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1734 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1735 cl_mem hiValuesBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1736 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1737 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1740 if (bytes_per_pixel == 4) {
1741 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1742 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1745 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1746 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1750 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1751 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1752 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
1753 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
1754 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
1755 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
1756 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
1757 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
1758 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
1759 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
1760 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
1761 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
1762 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
1763 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
1766 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
nullptr, global_work_size,
1767 local_work_size, 0,
nullptr,
nullptr);
1768 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1769 clFinish(rEnv.mpkCmdQueue);
1770 if (clStatus != 0) {
1771 tprintf(
"Setting return value to -1\n");
1775 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0,
1776 nullptr,
nullptr, &clStatus);
1777 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1778 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0,
nullptr,
nullptr);
1780 clReleaseMemObject(imageBuffer);
1781 clReleaseMemObject(thresholdsBuffer);
1782 clReleaseMemObject(hiValuesBuffer);
1791struct TessScoreEvaluationInputData {
1795 unsigned char *imageData;
1799static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
1804 int numChannels = 4;
1805 input->height = height;
1806 input->width = width;
1807 input->numChannels = numChannels;
1808 unsigned char(*imageData4)[4] = (
unsigned char(*)[4])malloc(
1809 height * width * numChannels *
sizeof(
unsigned char));
1810 input->imageData = (
unsigned char *)&imageData4[0];
1813 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1814 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1815 for (
int p = 0;
p < height * width;
p++) {
1817 imageData4[
p][0] = pixelWhite[0];
1818 imageData4[
p][1] = pixelWhite[1];
1819 imageData4[
p][2] = pixelWhite[2];
1820 imageData4[
p][3] = pixelWhite[3];
1823 int maxLineWidth = 64;
1826 for (
int i = 0;
i < numLines;
i++) {
1827 int lineWidth = rand() % maxLineWidth;
1828 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1830 for (
int row = vertLinePos - lineWidth / 2; row < vertLinePos + lineWidth / 2; row++) {
1831 for (
int col = 0; col < height; col++) {
1833 imageData4[row * width + col][0] = pixelBlack[0];
1834 imageData4[row * width + col][1] = pixelBlack[1];
1835 imageData4[row * width + col][2] = pixelBlack[2];
1836 imageData4[row * width + col][3] = pixelBlack[3];
1841 for (
int i = 0;
i < numLines;
i++) {
1842 int lineWidth = rand() % maxLineWidth;
1843 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1845 for (
int row = 0; row < width; row++) {
1846 for (
int col = horLinePos - lineWidth / 2; col < horLinePos + lineWidth / 2;
1851 imageData4[row * width + col][0] = pixelBlack[0];
1852 imageData4[row * width + col][1] = pixelBlack[1];
1853 imageData4[row * width + col][2] = pixelBlack[2];
1854 imageData4[row * width + col][3] = pixelBlack[3];
1859 float fractionBlack = 0.1;
1860 int numSpots = (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1861 for (
int i = 0;
i < numSpots;
i++) {
1862 int lineWidth = rand() % maxLineWidth;
1863 int col = lineWidth + rand() % (width - 2 * lineWidth);
1864 int row = lineWidth + rand() % (height - 2 * lineWidth);
1866 for (
int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1867 for (
int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1870 imageData4[r * width + c][0] = pixelBlack[0];
1871 imageData4[r * width + c][1] = pixelBlack[1];
1872 imageData4[r * width + c][2] = pixelBlack[2];
1873 imageData4[r * width + c][3] = pixelBlack[3];
1878 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1881struct TessDeviceScore {
1891static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1892 ds_device_type
type) {
1895 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1896 QueryPerformanceFrequency(&freq);
1898 mach_timebase_info_data_t info = {0, 0};
1899 mach_timebase_info(&info);
1900 long long start, stop;
1902 timespec time_funct_start, time_funct_end;
1905 l_uint32 *tiffdata = (l_uint32 *)input.imageData;
1909 if (
type == DS_DEVICE_OPENCL_DEVICE) {
1911 QueryPerformanceCounter(&time_funct_start);
1913 start = mach_absolute_time();
1915 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1918 OpenclDevice::gpuEnv = *env;
1919 int wpl = pixGetWpl(input.pix);
1920 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl,
nullptr);
1922 QueryPerformanceCounter(&time_funct_end);
1923 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
1925 stop = mach_absolute_time();
1926 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
1928 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1929 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1930 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1935 QueryPerformanceCounter(&time_funct_start);
1937 start = mach_absolute_time();
1939 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1941 Image pix = pixCreate(input.width, input.height, 32);
1942 l_uint32 *pixData = pixGetData(pix);
1945 for (
i = 0;
i < input.height;
i++) {
1946 for (j = 0; j < input.width; j++) {
1947 l_uint32 tiffword = tiffdata[
i * input.width + j];
1948 l_int32 rval = ((tiffword)&0xff);
1949 l_int32 gval = (((tiffword) >> 8) & 0xff);
1950 l_int32 bval = (((tiffword) >> 16) & 0xff);
1951 l_uint32
value = (rval << 24) | (gval << 16) | (bval << 8);
1952 pixData[idx] =
value;
1957 QueryPerformanceCounter(&time_funct_end);
1958 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
1960 stop = mach_absolute_time();
1961 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
1963 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1964 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1965 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1973static double histogramRectMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1974 ds_device_type
type) {
1977 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1978 QueryPerformanceFrequency(&freq);
1980 mach_timebase_info_data_t info = {0, 0};
1981 mach_timebase_info(&info);
1982 long long start, stop;
1984 timespec time_funct_start, time_funct_end;
1990 int bytes_per_line = input.width * input.numChannels;
1991 int *histogramAllChannels =
new int[
kHistogramSize * input.numChannels];
1993 if (
type == DS_DEVICE_OPENCL_DEVICE) {
1995 QueryPerformanceCounter(&time_funct_start);
1997 start = mach_absolute_time();
1999 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2002 OpenclDevice::gpuEnv = *env;
2003 int retVal = OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line,
2004 left, top, input.width, input.height,
2008 QueryPerformanceCounter(&time_funct_end);
2009 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2011 stop = mach_absolute_time();
2013 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2018 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2019 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2020 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2025 QueryPerformanceCounter(&time_funct_start);
2027 start = mach_absolute_time();
2029 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2031 for (
int ch = 0;
ch < input.numChannels; ++
ch) {
2036 QueryPerformanceCounter(&time_funct_end);
2037 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2039 stop = mach_absolute_time();
2040 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2042 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2043 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2044 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2050 delete[] histogramAllChannels;
2055static void ThresholdRectToPix_Native(
const unsigned char *imagedata,
int bytes_per_pixel,
2056 int bytes_per_line,
const int *thresholds,
2057 const int *hi_values, Image *pix) {
2060 int width = pixGetWidth(*pix);
2061 int height = pixGetHeight(*pix);
2063 *pix = pixCreate(width, height, 1);
2064 uint32_t *pixdata = pixGetData(*pix);
2065 int wpl = pixGetWpl(*pix);
2066 const unsigned char *srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel;
2067 for (
int y = 0;
y < height; ++
y) {
2068 const uint8_t *linedata = srcdata;
2069 uint32_t *pixline = pixdata +
y * wpl;
2070 for (
int x = 0;
x < width; ++
x, linedata += bytes_per_pixel) {
2071 bool white_result =
true;
2072 for (
int ch = 0;
ch < bytes_per_pixel; ++
ch) {
2073 if (hi_values[
ch] >= 0 && (linedata[
ch] > thresholds[
ch]) == (hi_values[
ch] == 0)) {
2074 white_result =
false;
2079 CLEAR_DATA_BIT(pixline,
x);
2081 SET_DATA_BIT(pixline,
x);
2083 srcdata += bytes_per_line;
2087static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2088 ds_device_type
type) {
2091 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2092 QueryPerformanceFrequency(&freq);
2094 mach_timebase_info_data_t info = {0, 0};
2095 mach_timebase_info(&info);
2096 long long start, stop;
2098 timespec time_funct_start, time_funct_end;
2102 unsigned char pixelHi = (
unsigned char)255;
2103 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2108 int bytes_per_line = input.width * input.numChannels;
2111 if (
type == DS_DEVICE_OPENCL_DEVICE) {
2113 QueryPerformanceCounter(&time_funct_start);
2115 start = mach_absolute_time();
2117 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2120 OpenclDevice::gpuEnv = *env;
2122 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2123 input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix,
2124 input.height, input.width, top, left);
2127 QueryPerformanceCounter(&time_funct_end);
2128 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2130 stop = mach_absolute_time();
2132 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2138 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2139 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2140 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2146 QueryPerformanceCounter(&time_funct_start);
2148 start = mach_absolute_time();
2150 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2152 int hi_values[4] = {};
2153 ThresholdRectToPix_Native(input.imageData, input.numChannels, bytes_per_line, thresholds,
2154 hi_values, &input.pix);
2157 QueryPerformanceCounter(&time_funct_end);
2158 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2160 stop = mach_absolute_time();
2161 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2163 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2164 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2165 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2172static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2173 ds_device_type
type) {
2176 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2177 QueryPerformanceFrequency(&freq);
2179 mach_timebase_info_data_t info = {0, 0};
2180 mach_timebase_info(&info);
2181 long long start, stop;
2183 timespec time_funct_start, time_funct_end;
2187 int resolution = 300;
2188 int wpl = pixGetWpl(input.pix);
2193 int closing_brick = max_line_width / 3;
2196 if (
type == DS_DEVICE_OPENCL_DEVICE) {
2198 QueryPerformanceCounter(&time_funct_start);
2200 start = mach_absolute_time();
2202 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2204 OpenclDevice::gpuEnv = *env;
2205 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2206 Image pix_vline =
nullptr, pix_hline =
nullptr, pix_closed =
nullptr;
2207 OpenclDevice::pixGetLinesCL(
nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed,
true,
2208 closing_brick, closing_brick, max_line_width, max_line_width,
2209 min_line_length, min_line_length);
2211 OpenclDevice::releaseMorphCLBuffers();
2214 QueryPerformanceCounter(&time_funct_end);
2215 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2217 stop = mach_absolute_time();
2218 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2220 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2221 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2222 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2226 QueryPerformanceCounter(&time_funct_start);
2228 start = mach_absolute_time();
2230 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2234 Image src_pix = input.pix;
2235 Image pix_closed = pixCloseBrick(
nullptr, src_pix, closing_brick, closing_brick);
2236 Image pix_solid = pixOpenBrick(
nullptr, pix_closed, max_line_width, max_line_width);
2237 Image pix_hollow = pixSubtract(
nullptr, pix_closed, pix_solid);
2238 pix_solid.destroy();
2239 Image pix_vline = pixOpenBrick(
nullptr, pix_hollow, 1, min_line_length);
2240 Image pix_hline = pixOpenBrick(
nullptr, pix_hollow, min_line_length, 1);
2241 pix_hline.destroy();
2242 pix_vline.destroy();
2243 pix_hollow.destroy();
2246 QueryPerformanceCounter(&time_funct_end);
2247 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (
double)(freq.QuadPart);
2249 stop = mach_absolute_time();
2250 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2252 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2253 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2254 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2266static ds_status serializeScore(ds_device *device, uint8_t **serializedScore,
2267 unsigned int *serializedScoreSize) {
2268 *serializedScoreSize =
sizeof(TessDeviceScore);
2269 *serializedScore =
new uint8_t[*serializedScoreSize];
2270 memcpy(*serializedScore, device->score, *serializedScoreSize);
2275static ds_status deserializeScore(ds_device *device,
const uint8_t *serializedScore,
2276 unsigned int serializedScoreSize) {
2278 device->score =
new TessDeviceScore;
2279 memcpy(device->score, serializedScore, serializedScoreSize);
2283static ds_status releaseScore(TessDeviceScore *score) {
2289static ds_status evaluateScoreForDevice(ds_device *device,
void *inputData) {
2292 tprintf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2293 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2294 GPUEnv *env =
nullptr;
2295 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2296 env = &OpenclDevice::gpuEnv;
2297 memset(env, 0,
sizeof(*env));
2299 populateGPUEnvFromDevice(env, device->oclDeviceID);
2300 env->mnFileCount = 0;
2301 env->mnKernelCount = 0UL;
2303 OpenclDevice::CompileKernelFile(env,
"");
2306 TessScoreEvaluationInputData *input =
static_cast<TessScoreEvaluationInputData *
>(inputData);
2309 double composeRGBPixelTime = composeRGBPixelMicroBench(env, *input, device->type);
2312 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2315 double thresholdRectToPixTime = thresholdRectToPixMicroBench(env, *input, device->type);
2318 double getLineMasksMorphTime = getLineMasksMorphMicroBench(env, *input, device->type);
2322 float composeRGBPixelWeight = 1.2f;
2323 float histogramRectWeight = 2.4f;
2324 float thresholdRectToPixWeight = 4.5f;
2325 float getLineMasksMorphWeight = 5.0f;
2327 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2328 histogramRectWeight * histogramRectTime +
2329 thresholdRectToPixWeight * thresholdRectToPixTime +
2330 getLineMasksMorphWeight * getLineMasksMorphTime;
2331 device->score =
new TessDeviceScore;
2332 device->score->time = weightedTime;
2334 tprintf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2335 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2336 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight);
2337 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime, histogramRectWeight);
2338 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix", thresholdRectToPixTime,
2339 thresholdRectToPixWeight);
2340 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime,
2341 getLineMasksMorphWeight);
2342 tprintf(
"[DS]%25s: %f\n",
"Score", device->score->time);
2347ds_device OpenclDevice::getDeviceSelection() {
2348 if (!deviceIsSelected) {
2350 if (1 == LoadOpencl()) {
2354 ds_profile *profile;
2355 status = initDSProfile(&profile,
"v0.1");
2357 const char *fileName =
"tesseract_opencl_profile_devices.dat";
2358 status = readProfileFromFile(profile, deserializeScore, fileName);
2359 if (status != DS_SUCCESS) {
2361 tprintf(
"[DS] Profile file not available (%s); performing profiling.\n", fileName);
2364 TessScoreEvaluationInputData input;
2365 populateTessScoreEvaluationInputData(&input);
2367 unsigned int numUpdates;
2369 profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, &input, &numUpdates);
2371 if (status == DS_SUCCESS) {
2372 status = writeProfileToFile(profile, serializeScore, fileName);
2373 if (status == DS_SUCCESS) {
2374 tprintf(
"[DS] Scores written to file (%s).\n", fileName);
2377 "[DS] Error saving scores to file (%s); scores not written to "
2383 "[DS] Unable to evaluate performance; scores not written to "
2387 tprintf(
"[DS] Profile read from file (%s).\n", fileName);
2392 float bestTime = FLT_MAX;
2393 int bestDeviceIdx = -1;
2394 for (
unsigned d = 0; d < profile->numDevices; d++) {
2395 ds_device device = profile->devices[d];
2396 if (device.score ==
nullptr)
2398 TessDeviceScore score = *device.score;
2400 float time = score.time;
2401 tprintf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, device.oclDeviceName,
2403 if (time < bestTime) {
2408 if (bestDeviceIdx >= 0) {
2410 "[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2411 profile->devices[bestDeviceIdx].oclDeviceName,
2412 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2417 bool overridden =
false;
2418 char *overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2419 if (overrideDeviceStr !=
nullptr) {
2420 int overrideDeviceIdx = atoi(overrideDeviceStr);
2421 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2423 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2425 overrideDeviceStr, overrideDeviceIdx);
2426 bestDeviceIdx = overrideDeviceIdx - 1;
2430 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2431 "valid devices).\n",
2432 overrideDeviceStr, profile->numDevices);
2438 "[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2439 profile->devices[bestDeviceIdx].oclDeviceName,
2440 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2442 selectedDevice = profile->devices[bestDeviceIdx];
2444 releaseDSProfile(profile, releaseScore);
2447 tprintf(
"[DS] OpenCL runtime not available.\n");
2448 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2449 selectedDevice.oclDeviceName =
"(null)";
2450 selectedDevice.score =
nullptr;
2451 selectedDevice.oclDeviceID =
nullptr;
2452 selectedDevice.oclDriverVersion =
nullptr;
2454 deviceIsSelected =
true;
2456 return selectedDevice;
2459bool OpenclDevice::selectedDeviceIsOpenCL() {
2460 ds_device device = getDeviceSelection();
2461 return (device.type == DS_DEVICE_OPENCL_DEVICE);
void tprintf(const char *format,...)
void HistogramRect(Image src_pix, int channel, int left, int top, int width, int height, int *histogram)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)