11#ifndef TESSERACT_OPENCL_OCLKERNELS_H_
12# define TESSERACT_OPENCL_OCLKERNELS_H_
14# ifndef USE_EXTERNAL_KERNEL
15# define KERNEL(...) # __VA_ARGS__ "\n"
21static const char *kernel_src =
KERNEL(
22\n #ifdef KHR_DP_EXTENSION\n
23\n #pragma OPENCL EXTENSION cl_khr_fp64
25\n #elif AMD_DP_EXTENSION\n
26\n #pragma OPENCL EXTENSION cl_amd_fp64
30 __kernel
void composeRGBPixel(__global
uint *tiffdata,
int w,
int h,
int wpl, __global
uint *
output) {
31 int i = get_global_id(1);
32 int j = get_global_id(0);
33 int tiffword, rval, gval, bval;
36 if ((
i >= h) || (j >= w))
39 tiffword = tiffdata[
i * w + j];
40 rval = ((tiffword)&0xff);
41 gval = (((tiffword) >> 8) & 0xff);
42 bval = (((tiffword) >> 16) & 0xff);
43 output[
i * wpl + j] = (rval << (8 * (
sizeof(
uint) - 1 - 0))) | (gval << (8 * (
sizeof(
uint) - 1 - 1))) | (bval << (8 * (
sizeof(
uint) - 1 - 2)));
47\n__kernel
void pixSubtract_inplace(__global
int *dword, __global
int *sword,
const int wpl,
const int h) {
48 const unsigned int row = get_global_id(1);
49 const unsigned int col = get_global_id(0);
50 const unsigned int pos = row * wpl + col;
53 if (row >= h || col >= wpl)
56 *(dword + pos) &= ~(*(sword + pos));
60\n__kernel
void morphoDilateHor_5x5(__global
int *sword, __global
int *dword,
const int wpl,
const int h) {
61 const unsigned int pos = get_global_id(0);
62 unsigned int prevword, nextword, currword, tempword;
63 unsigned int destword;
64 const int col = pos % wpl;
70 currword = *(sword + pos);
77 prevword = *(sword + pos - 1);
82 nextword = *(sword + pos + 1);
88 tempword = (prevword << (31)) | ((currword >> 1));
91 tempword = (currword << 1) | (nextword >> (31));
96 tempword = (prevword << (30)) | ((currword >> 2));
99 tempword = (currword << 2) | (nextword >> (30));
100 destword |= tempword;
102 *(dword + pos) = destword;
106\n__kernel
void morphoDilateVer_5x5(__global
int *sword, __global
int *dword,
const int wpl,
const int h) {
107 const int col = get_global_id(0);
108 const int row = get_global_id(1);
109 const unsigned int pos = row * wpl + col;
110 unsigned int tempword;
111 unsigned int destword;
115 if (row >= h || col >= wpl)
118 destword = *(sword + pos);
121 i = (row - 2) < 0 ? row : (row - 2);
122 tempword = *(sword +
i * wpl + col);
123 destword |= tempword;
126 i = (row - 1) < 0 ? row : (row - 1);
127 tempword = *(sword +
i * wpl + col);
128 destword |= tempword;
131 i = (row >= (h - 1)) ? row : (row + 1);
132 tempword = *(sword +
i * wpl + col);
133 destword |= tempword;
136 i = (row >= (h - 2)) ? row : (row + 2);
137 tempword = *(sword +
i * wpl + col);
138 destword |= tempword;
140 *(dword + pos) = destword;
144\n__kernel
void morphoDilateHor(__global
int *sword, __global
int *dword,
const int xp,
const int xn,
const int wpl,
const int h) {
145 const int col = get_global_id(0);
146 const int row = get_global_id(1);
147 const unsigned int pos = row * wpl + col;
148 unsigned int parbitsxp, parbitsxn, nwords;
149 unsigned int destword, tempword, lastword, currword;
150 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
151 int i, j, siter, eiter;
154 if (pos >= (wpl * h) || (xn < 1 && xp < 1))
157 currword = *(sword + pos);
169 siter = (col - nwords);
170 eiter = (col + nwords);
176 firstword = *(sword + pos - 1);
179 if (col == (wpl - 1))
182 secondword = *(sword + pos + 1);
185 for (
i = 1;
i <= parbitsxp;
i++) {
187 tempword = ((
i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32 -
i)) | ((currword >>
i));
189 destword |= tempword;
192 tempword = (currword <<
i) | (secondword >> (32 -
i));
193 destword |= tempword;
199 destword |= firstword;
202 destword |= secondword;
205 *(dword + pos) = destword;
212 firstword = *(sword + row * wpl + siter);
217 lastword = *(sword + row * wpl + eiter);
219 for (
i = 1;
i < nwords;
i++) {
224 secondword = *(sword + row * wpl + siter +
i);
226 lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
228 firstword = secondword;
230 if ((siter +
i + 1) < 0)
233 secondword = *(sword + row * wpl + siter +
i + 1);
235 lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
238 if ((eiter -
i) >= wpl)
241 firstword = *(sword + row * wpl + eiter -
i);
243 rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
245 lastword = firstword;
246 if ((eiter -
i - 1) >= wpl)
249 firstword = *(sword + row * wpl + eiter -
i - 1);
251 rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
253 for (j = 1; j < 32; j++) {
255 tempword = (lprevword << j) | (lnextword >> (32 - j));
256 destword |= tempword;
259 tempword = (rprevword << j) | (rnextword >> (32 - j));
260 destword |= tempword;
263 destword |= lprevword;
264 destword |= lnextword;
265 destword |= rprevword;
266 destword |= rnextword;
268 lastword = firstword;
269 firstword = secondword;
272 *(dword + pos) = destword;
276\n__kernel
void morphoDilateHor_32word(__global
int *sword, __global
int *dword,
const int halfwidth,
const int wpl,
const int h,
const char isEven) {
277 const int col = get_global_id(0);
278 const int row = get_global_id(1);
279 const unsigned int pos = row * wpl + col;
280 unsigned int prevword, nextword, currword, tempword;
281 unsigned int destword;
285 if (pos >= (wpl * h))
288 currword = *(sword + pos);
295 prevword = *(sword + pos - 1);
297 if (col == (wpl - 1))
300 nextword = *(sword + pos + 1);
302 for (
i = 1;
i <= halfwidth;
i++) {
304 if (
i == halfwidth && isEven) {
307 tempword = (prevword << (32 -
i)) | ((currword >>
i));
310 destword |= tempword;
313 tempword = (currword <<
i) | (nextword >> (32 -
i));
315 destword |= tempword;
318 *(dword + pos) = destword;
322\n__kernel
void morphoDilateVer(__global
int *sword, __global
int *dword,
const int yp,
const int wpl,
const int h,
const int yn) {
323 const int col = get_global_id(0);
324 const int row = get_global_id(1);
325 const unsigned int pos = row * wpl + col;
326 unsigned int tempword;
327 unsigned int destword;
331 if (row >= h || col >= wpl)
334 destword = *(sword + pos);
337 siter = (row - yn) < 0 ? 0 : (row - yn);
338 eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
340 for (
i = siter;
i <= eiter;
i++) {
341 tempword = *(sword +
i * wpl + col);
343 destword |= tempword;
346 *(dword + pos) = destword;
350\n__kernel
void morphoErodeHor_5x5(__global
int *sword, __global
int *dword,
const int wpl,
const int h) {
351 const unsigned int pos = get_global_id(0);
352 unsigned int prevword, nextword, currword, tempword;
353 unsigned int destword;
354 const int col = pos % wpl;
357 if (pos >= (wpl * h))
360 currword = *(sword + pos);
365 prevword = 0xffffffff;
367 prevword = *(sword + pos - 1);
369 if (col == (wpl - 1))
370 nextword = 0xffffffff;
372 nextword = *(sword + pos + 1);
378 tempword = (prevword << (31)) | ((currword >> 1));
379 destword &= tempword;
381 tempword = (currword << 1) | (nextword >> (31));
382 destword &= tempword;
386 tempword = (prevword << (30)) | ((currword >> 2));
387 destword &= tempword;
389 tempword = (currword << 2) | (nextword >> (30));
390 destword &= tempword;
392 *(dword + pos) = destword;
396\n__kernel
void morphoErodeVer_5x5(__global
int *sword, __global
int *dword,
const int wpl,
const int h,
const int fwmask,
const int lwmask) {
397 const int col = get_global_id(0);
398 const int row = get_global_id(1);
399 const unsigned int pos = row * wpl + col;
400 unsigned int tempword;
401 unsigned int destword;
405 if (row >= h || col >= wpl)
408 destword = *(sword + pos);
410 if (row < 2 || row >= (h - 2)) {
416 tempword = *(sword +
i * wpl + col);
417 destword &= tempword;
422 tempword = *(sword +
i * wpl + col);
423 destword &= tempword;
428 tempword = *(sword +
i * wpl + col);
429 destword &= tempword;
434 tempword = *(sword +
i * wpl + col);
435 destword &= tempword;
440 if (col == (wpl - 1)) {
445 *(dword + pos) = destword;
449\n__kernel
void morphoErodeHor(__global
int *sword, __global
int *dword,
const int xp,
const int xn,
const int wpl,
const int h,
const char isAsymmetric,
const int rwmask,
const int lwmask) {
450 const int col = get_global_id(0);
451 const int row = get_global_id(1);
452 const unsigned int pos = row * wpl + col;
453 unsigned int parbitsxp, parbitsxn, nwords;
454 unsigned int destword, tempword, lastword, currword;
455 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
456 int i, j, siter, eiter;
459 if (pos >= (wpl * h) || (xn < 1 && xp < 1))
462 currword = *(sword + pos);
474 siter = (col - nwords);
475 eiter = (col + nwords);
479 firstword = 0xffffffff;
481 firstword = *(sword + pos - 1);
484 if (col == (wpl - 1))
485 secondword = 0xffffffff;
487 secondword = *(sword + pos + 1);
490 for (
i = 1;
i <= parbitsxp;
i++) {
492 tempword = (firstword << (32 -
i)) | ((currword >>
i));
493 destword &= tempword;
496 tempword = ((
i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword <<
i) | (secondword >> (32 -
i));
499 destword &= tempword;
505 destword &= firstword;
508 destword &= secondword;
515 if (col == (wpl - 1))
519 *(dword + pos) = destword;
524 firstword = 0xffffffff;
526 firstword = *(sword + row * wpl + siter);
529 lastword = 0xffffffff;
531 lastword = *(sword + row * wpl + eiter);
533 for (
i = 1;
i < nwords;
i++) {
536 secondword = 0xffffffff;
538 secondword = *(sword + row * wpl + siter +
i);
540 lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
542 firstword = secondword;
544 if ((siter +
i + 1) < 0)
545 secondword = 0xffffffff;
547 secondword = *(sword + row * wpl + siter +
i + 1);
549 lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
552 if ((eiter -
i) >= wpl)
553 firstword = 0xffffffff;
555 firstword = *(sword + row * wpl + eiter -
i);
557 rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
559 lastword = firstword;
560 if ((eiter -
i - 1) >= wpl)
561 firstword = 0xffffffff;
563 firstword = *(sword + row * wpl + eiter -
i - 1);
565 rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
567 for (j = 0; j < 32; j++) {
569 tempword = (lprevword << j) | (lnextword >> (32 - j));
570 destword &= tempword;
573 tempword = (rprevword << j) | (rnextword >> (32 - j));
574 destword &= tempword;
577 destword &= lprevword;
578 destword &= lnextword;
579 destword &= rprevword;
580 destword &= rnextword;
582 lastword = firstword;
583 firstword = secondword;
588 if (col < (nwords - 1))
590 else if (col == (nwords - 1))
592 else if (col > (wpl - nwords))
594 else if (col == (wpl - nwords))
598 *(dword + pos) = destword;
602\n__kernel
void morphoErodeHor_32word(__global
int *sword, __global
int *dword,
const int halfwidth,
const int wpl,
const int h,
const char clearBoundPixH,
const int rwmask,
const int lwmask,
const char isEven) {
603 const int col = get_global_id(0);
604 const int row = get_global_id(1);
605 const unsigned int pos = row * wpl + col;
606 unsigned int prevword, nextword, currword, tempword, destword;
610 if (pos >= (wpl * h))
613 currword = *(sword + pos);
618 prevword = 0xffffffff;
620 prevword = *(sword + pos - 1);
622 if (col == (wpl - 1))
623 nextword = 0xffffffff;
625 nextword = *(sword + pos + 1);
627 for (
i = 1;
i <= halfwidth;
i++) {
629 tempword = (prevword << (32 -
i)) | ((currword >>
i));
631 destword &= tempword;
634 if (
i == halfwidth && isEven) {
635 tempword = 0xffffffff;
637 tempword = (currword <<
i) | (nextword >> (32 -
i));
640 destword &= tempword;
643 if (clearBoundPixH) {
646 }
else if (col == (wpl - 1)) {
651 *(dword + pos) = destword;
655\n__kernel
void morphoErodeVer(__global
int *sword, __global
int *dword,
const int yp,
const int wpl,
const int h,
const char clearBoundPixV,
const int yn) {
656 const int col = get_global_id(0);
657 const int row = get_global_id(1);
658 const unsigned int pos = row * wpl + col;
659 unsigned int tempword, destword;
663 if (row >= h || col >= wpl)
666 destword = *(sword + pos);
669 siter = (row - yp) < 0 ? 0 : (row - yp);
670 eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
672 for (
i = siter;
i <= eiter;
i++) {
673 tempword = *(sword +
i * wpl + col);
675 destword &= tempword;
679 if (clearBoundPixV && ((row < yp) || ((h - row) <= yn))) {
683 *(dword + pos) = destword;
694\n #define HIST_REDUNDANCY 256\n
695\n #define GROUP_SIZE 256\n
696\n #define HIST_SIZE 256\n
697\n #define NUM_CHANNELS 4\n
698\n #define HR_UNROLL_SIZE 8 \n
699\n #define HR_UNROLL_TYPE uchar8 \n
701 __attribute__((reqd_work_group_size(256, 1, 1))) __kernel
void kernel_HistogramRectAllChannels(__global
const uchar8 *data,
uint numPixels, __global
uint *histBuffer) {
704 int threadOffset = get_global_id(0) % HIST_REDUNDANCY;
707 for (
uint pc = get_global_id(0); pc < numPixels * NUM_CHANNELS / HR_UNROLL_SIZE; pc += get_global_size(0)) {
710 atomic_inc(&histBuffer[0 * HIST_SIZE * HIST_REDUNDANCY + pixels.s0 * HIST_REDUNDANCY + threadOffset]);
711 atomic_inc(&histBuffer[0 * HIST_SIZE * HIST_REDUNDANCY + pixels.s4 * HIST_REDUNDANCY + threadOffset]);
712 atomic_inc(&histBuffer[1 * HIST_SIZE * HIST_REDUNDANCY + pixels.s1 * HIST_REDUNDANCY + threadOffset]);
713 atomic_inc(&histBuffer[1 * HIST_SIZE * HIST_REDUNDANCY + pixels.s5 * HIST_REDUNDANCY + threadOffset]);
714 atomic_inc(&histBuffer[2 * HIST_SIZE * HIST_REDUNDANCY + pixels.s2 * HIST_REDUNDANCY + threadOffset]);
715 atomic_inc(&histBuffer[2 * HIST_SIZE * HIST_REDUNDANCY + pixels.s6 * HIST_REDUNDANCY + threadOffset]);
716 atomic_inc(&histBuffer[3 * HIST_SIZE * HIST_REDUNDANCY + pixels.s3 * HIST_REDUNDANCY + threadOffset]);
717 atomic_inc(&histBuffer[3 * HIST_SIZE * HIST_REDUNDANCY + pixels.s7 * HIST_REDUNDANCY + threadOffset]);
723 __attribute__((reqd_work_group_size(256, 1, 1))) __kernel
void kernel_HistogramRectOneChannel(__global
const uchar8 *data,
uint numPixels, __global
uint *histBuffer) {
726 int threadOffset = get_global_id(0) % HIST_REDUNDANCY;
729 for (
uint pc = get_global_id(0); pc < numPixels / HR_UNROLL_SIZE; pc += get_global_size(0)) {
732 atomic_inc(&histBuffer[pixels.s0 * HIST_REDUNDANCY + threadOffset]);
733 atomic_inc(&histBuffer[pixels.s1 * HIST_REDUNDANCY + threadOffset]);
734 atomic_inc(&histBuffer[pixels.s2 * HIST_REDUNDANCY + threadOffset]);
735 atomic_inc(&histBuffer[pixels.s3 * HIST_REDUNDANCY + threadOffset]);
736 atomic_inc(&histBuffer[pixels.s4 * HIST_REDUNDANCY + threadOffset]);
737 atomic_inc(&histBuffer[pixels.s5 * HIST_REDUNDANCY + threadOffset]);
738 atomic_inc(&histBuffer[pixels.s6 * HIST_REDUNDANCY + threadOffset]);
739 atomic_inc(&histBuffer[pixels.s7 * HIST_REDUNDANCY + threadOffset]);
746 KERNEL(
__attribute__((reqd_work_group_size(256, 1, 1))) __kernel
void kernel_HistogramRectAllChannelsReduction(
int n,
747 __global
uint *histBuffer, __global
int *histResult) {
749 int channel = get_group_id(0) / HIST_SIZE;
750 int bin = get_group_id(0) % HIST_SIZE;
754 for (
uint i = get_local_id(0);
i < HIST_REDUNDANCY;
i += GROUP_SIZE) {
755 value += histBuffer[channel * HIST_SIZE * HIST_REDUNDANCY + bin * HIST_REDUNDANCY +
i];
759 __local
int localHist[GROUP_SIZE];
760 localHist[get_local_id(0)] =
value;
761 barrier(CLK_LOCAL_MEM_FENCE);
762 for (
int stride = GROUP_SIZE / 2; stride >= 1; stride /= 2) {
763 if (get_local_id(0) < stride) {
764 value = localHist[get_local_id(0) + stride];
766 barrier(CLK_LOCAL_MEM_FENCE);
767 if (get_local_id(0) < stride) {
768 localHist[get_local_id(0)] +=
value;
770 barrier(CLK_LOCAL_MEM_FENCE);
774 if (get_local_id(0) == 0) {
775 histResult[get_group_id(0)] = localHist[0];
782 __attribute__((reqd_work_group_size(256, 1, 1))) __kernel
void kernel_HistogramRectOneChannelReduction(
int n,
783 __global
uint *histBuffer, __global
int *histResult) {
786 int bin = get_group_id(0) % HIST_SIZE;
790 for (
int i = get_local_id(0);
i < HIST_REDUNDANCY;
i += GROUP_SIZE) {
791 value += histBuffer[bin * HIST_REDUNDANCY +
i];
795 __local
int localHist[GROUP_SIZE];
796 localHist[get_local_id(0)] =
value;
797 barrier(CLK_LOCAL_MEM_FENCE);
798 for (
int stride = GROUP_SIZE / 2; stride >= 1; stride /= 2) {
799 if (get_local_id(0) < stride) {
800 value = localHist[get_local_id(0) + stride];
802 barrier(CLK_LOCAL_MEM_FENCE);
803 if (get_local_id(0) < stride) {
804 localHist[get_local_id(0)] +=
value;
806 barrier(CLK_LOCAL_MEM_FENCE);
810 if (get_local_id(0) == 0) {
811 histResult[get_group_id(0)] = localHist[0];
821\n #define CHAR_VEC_WIDTH 4 \n
822\n #define PIXELS_PER_WORD 32 \n
823\n #define PIXELS_PER_BURST 8 \n
824\n #define BURSTS_PER_WORD (PIXELS_PER_WORD)/(PIXELS_PER_BURST) \n
826 uchar s[PIXELS_PER_BURST * NUM_CHANNELS];
827 uchar4 v[(PIXELS_PER_BURST * NUM_CHANNELS) / CHAR_VEC_WIDTH];
830 __attribute__((reqd_work_group_size(256, 1, 1))) __kernel
void kernel_ThresholdRectToPix(__global
const uchar4 *imageData,
int height,
int width,
832 __global
int *thresholds, __global
int *hi_values, __global
int *pix) {
834 int pThresholds[NUM_CHANNELS];
835 int pHi_Values[NUM_CHANNELS];
836 for (
int i = 0;
i < NUM_CHANNELS;
i++) {
837 pThresholds[
i] = thresholds[
i];
838 pHi_Values[
i] = hi_values[
i];
842 for (
uint w = get_global_id(0); w < wpl * height; w += get_global_size(0)) {
843 unsigned int word = 0;
845 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
848 int offset = (w / wpl) * width;
849 offset += (w % wpl) * PIXELS_PER_WORD;
850 offset += b * PIXELS_PER_BURST;
852 for (
int i = 0;
i < PIXELS_PER_BURST; ++
i)
853 pixels.v[
i] = imageData[offset +
i];
856 for (
int p = 0;
p < PIXELS_PER_BURST;
p++) {
857 for (
int c = 0; c < NUM_CHANNELS; c++) {
858 unsigned char pixChan = pixels.s[
p * NUM_CHANNELS + c];
859 if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
860 const uint kTopBit = 0x80000000;
861 word |= (kTopBit >> ((b * PIXELS_PER_BURST +
p) & 31));
870\n #define CHAR_VEC_WIDTH 8 \n
871\n #define PIXELS_PER_WORD 32 \n
872\n #define PIXELS_PER_BURST 8 \n
873\n #define BURSTS_PER_WORD (PIXELS_PER_WORD) / (PIXELS_PER_BURST) \n
875 uchar s[PIXELS_PER_BURST * 1];
876 uchar8 v[(PIXELS_PER_BURST * 1) / CHAR_VEC_WIDTH];
879 __attribute__((reqd_work_group_size(256, 1, 1))) __kernel
void kernel_ThresholdRectToPix_OneChan(__global
const uchar8 *imageData,
int height,
int width,
881 __global
int *thresholds, __global
int *hi_values, __global
int *pix) {
885 for (
int i = 0;
i < 1;
i++) {
886 pThresholds[
i] = thresholds[
i];
887 pHi_Values[
i] = hi_values[
i];
891 for (
uint w = get_global_id(0); w < wpl * height; w += get_global_size(0)) {
892 unsigned int word = 0;
895 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
899 pixels.v[0] = imageData[w * BURSTS_PER_WORD + b + 0];
902 for (
int p = 0;
p < PIXELS_PER_BURST;
p++) {
906\n #ifdef __ENDIAN_LITTLE__\n
911 \n #endif \n
unsigned char pixChan = pixels.s[idx];
912 if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
913 const uint kTopBit = 0x80000000;
914 word |= (kTopBit >> ((b * PIXELS_PER_BURST +
p) & 31));
#define __attribute__(attr)