tesseract v5.3.3.20231005
networkio.cpp
1
2// File: networkio.cpp
3// Description: Network input/output data, allowing float/int implementations.
4// Author: Ray Smith
5//
6// (C) Copyright 2014, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18#include "networkio.h"
19#include <cfloat> // for FLT_MAX
20#include <cmath>
21
22#include <allheaders.h>
23#include "functions.h"
24#include "statistc.h"
25#include "tprintf.h"
26
27namespace tesseract {
28
29// Minimum value to output for certainty.
30const float kMinCertainty = -20.0f;
31// Probability corresponding to kMinCertainty.
32const float kMinProb = std::exp(kMinCertainty);
33
34// Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim.
35void NetworkIO::Resize2d(bool int_mode, int width, int num_features) {
36 stride_map_ = StrideMap();
37 int_mode_ = int_mode;
38 if (int_mode_) {
39 i_.ResizeNoInit(width, num_features, GetPadding(num_features));
40 } else {
41 f_.ResizeNoInit(width, num_features);
42 }
43}
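Usage sketch (not part of networkio.cpp; names are illustrative): the same NetworkIO object can be resized as either a float or an int8 scratch buffer, and in int mode the row is padded for SIMD via GetPadding() near the end of this file.

#include "networkio.h"

// Minimal sketch: allocate the same 2-d scratch buffer in both modes.
void DemoResize2d() {
  tesseract::NetworkIO scratch;
  // Float mode: 100 timesteps x 64 features.
  scratch.Resize2d(/*int_mode=*/false, /*width=*/100, /*num_features=*/64);
  // Int mode: same logical size, backed by int8_t plus SIMD padding columns.
  scratch.Resize2d(/*int_mode=*/true, /*width=*/100, /*num_features=*/64);
}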
44
45// Resizes to a specific stride_map.
46void NetworkIO::ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features) {
47 // If this method crashes with this == nullptr,
48 // it most likely got here through an uninitialized scratch element,
49 // i.e. call NetworkScratch::IO::Resizexxx(), not NetworkIO::Resizexxx()!
50 stride_map_ = stride_map;
51 int_mode_ = int_mode;
52 if (int_mode_) {
53 i_.ResizeNoInit(stride_map.Width(), num_features, GetPadding(num_features));
54 } else {
55 f_.ResizeNoInit(stride_map.Width(), num_features);
56 }
57 ZeroInvalidElements();
58}
59
60// Shrinks the image size by x_scale, y_scale, and uses the given number of features.
61void NetworkIO::ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features) {
62 StrideMap stride_map = src.stride_map_;
63 stride_map.ScaleXY(x_scale, y_scale);
64 ResizeToMap(src.int_mode_, stride_map, num_features);
65}
66
67// Resizes to just 1 x-coord, whatever the input.
68void NetworkIO::ResizeXTo1(const NetworkIO &src, int num_features) {
69 StrideMap stride_map = src.stride_map_;
70 stride_map.ReduceWidthTo1();
71 ResizeToMap(src.int_mode_, stride_map, num_features);
72}
73
74// Initializes all of the array to zero.
75void NetworkIO::Zero() {
76 int width = Width();
77 // Zero out everything, column by column, in case it is aligned.
78 for (int t = 0; t < width; ++t) {
79 ZeroTimeStep(t);
80 }
81}
82
83// Initializes to zero all elements of the array that do not correspond to
84// valid image positions. (If a batch of different-sized images is packed
85// together, there will be padding pixels.)
86void NetworkIO::ZeroInvalidElements() {
87 int num_features = NumFeatures();
88 int full_width = stride_map_.Size(FD_WIDTH);
89 int full_height = stride_map_.Size(FD_HEIGHT);
90 StrideMap::Index b_index(stride_map_);
91 do {
92 int end_x = b_index.MaxIndexOfDim(FD_WIDTH) + 1;
93 if (end_x < full_width) {
94 // The width is small, so fill for every valid y.
95 StrideMap::Index y_index(b_index);
96 int fill_size = num_features * (full_width - end_x);
97 do {
98 StrideMap::Index z_index(y_index);
99 z_index.AddOffset(end_x, FD_WIDTH);
100 if (int_mode_) {
101 ZeroVector(fill_size, i_[z_index.t()]);
102 } else {
103 ZeroVector(fill_size, f_[z_index.t()]);
104 }
105 } while (y_index.AddOffset(1, FD_HEIGHT));
106 }
107 int end_y = b_index.MaxIndexOfDim(FD_HEIGHT) + 1;
108 if (end_y < full_height) {
109 // The height is small, so fill in the space in one go.
110 StrideMap::Index y_index(b_index);
111 y_index.AddOffset(end_y, FD_HEIGHT);
112 int fill_size = num_features * full_width * (full_height - end_y);
113 if (int_mode_) {
114 ZeroVector(fill_size, i_[y_index.t()]);
115 } else {
116 ZeroVector(fill_size, f_[y_index.t()]);
117 }
118 }
119 } while (b_index.AddOffset(1, FD_BATCH));
120}
121
122// Helper computes a black point and white point to contrast-enhance an image.
123// The computation is based on the assumption that the image is of a single line
124// of text, so a horizontal line through the middle of the image passes through
126// at least some of it, making local minima and maxima a good proxy for black
126// and white pixel samples.
127static void ComputeBlackWhite(Image pix, float *black, float *white) {
128 int width = pixGetWidth(pix);
129 int height = pixGetHeight(pix);
130 STATS mins(0, 255), maxes(0, 255);
131 if (width >= 3) {
132 int y = height / 2;
133 l_uint32 *line = pixGetData(pix) + pixGetWpl(pix) * y;
134 int prev = GET_DATA_BYTE(line, 0);
135 int curr = GET_DATA_BYTE(line, 1);
136 for (int x = 1; x + 1 < width; ++x) {
137 int next = GET_DATA_BYTE(line, x + 1);
138 if ((curr < prev && curr <= next) || (curr <= prev && curr < next)) {
139 // Local minimum.
140 mins.add(curr, 1);
141 }
142 if ((curr > prev && curr >= next) || (curr >= prev && curr > next)) {
143 // Local maximum.
144 maxes.add(curr, 1);
145 }
146 prev = curr;
147 curr = next;
148 }
149 }
150 if (mins.get_total() == 0) {
151 mins.add(0, 1);
152 }
153 if (maxes.get_total() == 0) {
154 maxes.add(255, 1);
155 }
156 *black = mins.ile(0.25);
157 *white = maxes.ile(0.75);
158}
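The same local-extrema idea can be illustrated without Leptonica. The sketch below is an editorial stand-in, not part of this file: it scans one row of grey values, treats local minima as black samples and local maxima as white samples, and takes conservative quantiles of each, mirroring the ile(0.25)/ile(0.75) calls above.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Illustrative stand-in for ComputeBlackWhite on a raw grey scanline.
std::pair<float, float> EstimateBlackWhite(const std::vector<uint8_t> &row) {
  std::vector<int> mins, maxes;
  for (size_t x = 1; x + 1 < row.size(); ++x) {
    const int prev = row[x - 1], curr = row[x], next = row[x + 1];
    if ((curr < prev && curr <= next) || (curr <= prev && curr < next)) {
      mins.push_back(curr);   // Local minimum: candidate black sample.
    }
    if ((curr > prev && curr >= next) || (curr >= prev && curr > next)) {
      maxes.push_back(curr);  // Local maximum: candidate white sample.
    }
  }
  if (mins.empty()) mins.push_back(0);
  if (maxes.empty()) maxes.push_back(255);
  std::sort(mins.begin(), mins.end());
  std::sort(maxes.begin(), maxes.end());
  // 25th percentile of the minima, 75th percentile of the maxes: a rough
  // analogue of STATS::ile(0.25) and STATS::ile(0.75).
  const float black = mins[mins.size() / 4];
  const float white = maxes[(maxes.size() * 3) / 4];
  return {black, white};
}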
159
160// Sets up the array from the given image, using the currently set int_mode_.
161// If the image width doesn't match the shape, the image is truncated or padded
162// with noise to match.
163void NetworkIO::FromPix(const StaticShape &shape, const Image pix, TRand *randomizer) {
164 std::vector<Image> pixes(1, pix);
165 FromPixes(shape, pixes, randomizer);
166}
167
168// Sets up the array from the given set of images, using the currently set
169// int_mode_. If the image width doesn't match the shape, the images are
170// truncated or padded with noise to match.
171void NetworkIO::FromPixes(const StaticShape &shape, const std::vector<Image> &pixes,
172 TRand *randomizer) {
173 int target_height = shape.height();
174 int target_width = shape.width();
175 std::vector<std::pair<int, int>> h_w_pairs;
176 for (auto &&pix : pixes) {
177 Image var_pix = pix;
178 int width = pixGetWidth(var_pix);
179 if (target_width != 0) {
180 width = target_width;
181 }
182 int height = pixGetHeight(var_pix);
183 if (target_height != 0) {
184 height = target_height;
185 }
186 h_w_pairs.emplace_back(height, width);
187 }
188 stride_map_.SetStride(h_w_pairs);
189 ResizeToMap(int_mode(), stride_map_, shape.depth());
190 // Iterate over the images again to copy the data.
191 for (size_t b = 0; b < pixes.size(); ++b) {
192 Image pix = pixes[b];
193 float black = 0.0f, white = 255.0f;
194 if (shape.depth() != 3) {
195 ComputeBlackWhite(pix, &black, &white);
196 }
197 float contrast = (white - black) / 2.0f;
198 if (contrast <= 0.0f) {
199 contrast = 1.0f;
200 }
201 if (shape.height() == 1) {
202 Copy1DGreyImage(b, pix, black, contrast, randomizer);
203 } else {
204 Copy2DImage(b, pix, black, contrast, randomizer);
205 }
206 }
207}
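A typical call path, shown here only as a hedged sketch: read an image with Leptonica, describe the expected input shape, and let FromPix build the buffer. StaticShape::SetShape(batch, height, width, depth) and the implicit Pix* to Image conversion are assumed from the surrounding headers; the shape values are placeholders.

#include <allheaders.h>  // pixRead, pixDestroy
#include "helpers.h"     // TRand
#include "networkio.h"

// Loads a single text-line image into a NetworkIO input buffer.
bool LoadLineImage(const char *filename, tesseract::NetworkIO *inputs) {
  Pix *pix = pixRead(filename);
  if (pix == nullptr) {
    return false;
  }
  tesseract::StaticShape shape;
  // Placeholder shape: batch of 1, height 48, variable width, 1 grey channel.
  shape.SetShape(/*batch=*/1, /*height=*/48, /*width=*/0, /*depth=*/1);
  tesseract::TRand randomizer;  // Pads with noise if a fixed width is requested.
  inputs->FromPix(shape, pix, &randomizer);
  pixDestroy(&pix);
  return true;
}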
208
209// Copies the given pix to *this at the given batch index, stretching and
210// clipping the pixel values so that [black, black + 2*contrast] maps to the
211// dynamic range of *this, i.e. [-1,1] for float and [-127,127] for int.
212// This is a 2-d operation in the sense that the output depth is the number
213// of input channels, the height is the height of the image, and the width
214// is the width of the image, or truncated/padded with noise if the width
215// is a fixed size.
216void NetworkIO::Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer) {
217 int width = pixGetWidth(pix);
218 int height = pixGetHeight(pix);
219 int wpl = pixGetWpl(pix);
220 StrideMap::Index index(stride_map_);
221 index.AddOffset(batch, FD_BATCH);
222 int t = index.t();
223 int target_height = stride_map_.Size(FD_HEIGHT);
224 int target_width = stride_map_.Size(FD_WIDTH);
225 int num_features = NumFeatures();
226 bool color = num_features == 3;
227 if (width > target_width) {
228 width = target_width;
229 }
230 uint32_t *line = pixGetData(pix);
231 for (int y = 0; y < target_height; ++y, line += wpl) {
232 int x = 0;
233 if (y < height) {
234 for (x = 0; x < width; ++x, ++t) {
235 if (color) {
236 int f = 0;
237 for (int c = COLOR_RED; c <= COLOR_BLUE; ++c) {
238 int pixel = GET_DATA_BYTE(line + x, c);
239 SetPixel(t, f++, pixel, black, contrast);
240 }
241 } else {
242 int pixel = GET_DATA_BYTE(line, x);
243 SetPixel(t, 0, pixel, black, contrast);
244 }
245 }
246 }
247 for (; x < target_width; ++x) {
248 Randomize(t++, 0, num_features, randomizer);
249 }
250 }
251}
252
253// Copies the given pix to *this at the given batch index, as Copy2DImage
254// above, except that the output depth is the height of the input image, the
255// output height is 1, and the output width is as for Copy2DImage.
256// The image is thus treated as a 1-d set of vertical pixel strips.
257void NetworkIO::Copy1DGreyImage(int batch, Image pix, float black, float contrast,
258 TRand *randomizer) {
259 int width = pixGetWidth(pix);
260 int height = pixGetHeight(pix);
261 ASSERT_HOST(height == NumFeatures());
262 int wpl = pixGetWpl(pix);
263 StrideMap::Index index(stride_map_);
264 index.AddOffset(batch, FD_BATCH);
265 int t = index.t();
266 int target_width = stride_map_.Size(FD_WIDTH);
267 if (width > target_width) {
268 width = target_width;
269 }
270 int x;
271 for (x = 0; x < width; ++x, ++t) {
272 for (int y = 0; y < height; ++y) {
273 uint32_t *line = pixGetData(pix) + wpl * y;
274 int pixel = GET_DATA_BYTE(line, x);
275 SetPixel(t, y, pixel, black, contrast);
276 }
277 }
278 for (; x < target_width; ++x) {
279 Randomize(t++, 0, height, randomizer);
280 }
281}
282
283// Helper stores the pixel value in i_ or f_ according to int_mode_.
284// t: is the index from the StrideMap corresponding to the current
285// [batch,y,x] position
286// f: is the index into the depth/channel
287// pixel: the value of the pixel from the image (in one channel)
288// black: the pixel value to map to the lowest of the range of *this
289// contrast: the range of pixel values to stretch to half the range of *this.
290void NetworkIO::SetPixel(int t, int f, int pixel, float black, float contrast) {
291 float float_pixel = (pixel - black) / contrast - 1.0f;
292 if (int_mode_) {
293 i_[t][f] = ClipToRange<int>(IntCastRounded((INT8_MAX + 1) * float_pixel), -INT8_MAX, INT8_MAX);
294 } else {
295 f_[t][f] = float_pixel;
296 }
297}
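Numerically, SetPixel maps pixel == black to -1, pixel == black + contrast to 0, and pixel == black + 2*contrast to +1; in int mode the result is scaled by 128, rounded, and clipped to ±127. A self-contained check of that mapping (editorial sketch; the helper names are hypothetical):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Mirrors the arithmetic in NetworkIO::SetPixel for one channel.
float ToFloatPixel(int pixel, float black, float contrast) {
  return (pixel - black) / contrast - 1.0f;
}

int8_t ToIntPixel(int pixel, float black, float contrast) {
  const int v =
      static_cast<int>(std::lround(128.0f * ToFloatPixel(pixel, black, contrast)));
  return static_cast<int8_t>(std::clamp(v, -127, 127));
}

int main() {
  const float black = 40.0f, white = 200.0f;
  const float contrast = (white - black) / 2.0f;  // 80
  assert(ToFloatPixel(40, black, contrast) == -1.0f);
  assert(ToFloatPixel(120, black, contrast) == 0.0f);
  assert(ToFloatPixel(200, black, contrast) == 1.0f);
  assert(ToIntPixel(255, black, contrast) == 127);  // Clipped, not wrapped.
  return 0;
}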
298
299// Converts the array to a Pix. Must be pixDestroyed after use.
300Image NetworkIO::ToPix() const {
301 // Count the width of the image, and find the max multiplication factor.
302 int im_width = stride_map_.Size(FD_WIDTH);
303 int im_height = stride_map_.Size(FD_HEIGHT);
304 int num_features = NumFeatures();
305 int feature_factor = 1;
306 if (num_features == 3) {
307 // Special hack for color.
308 num_features = 1;
309 feature_factor = 3;
310 }
311 Image pix = pixCreate(im_width, im_height * num_features, 32);
312 StrideMap::Index index(stride_map_);
313 do {
314 int im_x = index.index(FD_WIDTH);
315 int top_im_y = index.index(FD_HEIGHT);
316 int im_y = top_im_y;
317 int t = index.t();
318 if (int_mode_) {
319 const int8_t *features = i_[t];
320 for (int y = 0; y < num_features; ++y, im_y += im_height) {
321 int pixel = features[y * feature_factor];
322 // 1 or 2 features use greyscale.
323 int red = ClipToRange<int>(pixel + 128, 0, 255);
324 int green = red, blue = red;
325 if (feature_factor == 3) {
326 // With 3 features assume RGB color.
327 green = ClipToRange<int>(features[y * feature_factor + 1] + 128, 0, 255);
328 blue = ClipToRange<int>(features[y * feature_factor + 2] + 128, 0, 255);
329 } else if (num_features > 3) {
330 // More than 3 features use false yellow/blue color, assuming a signed
331 // input in the range [-1,1].
332 red = abs(pixel) * 2;
333 if (pixel >= 0) {
334 green = red;
335 blue = 0;
336 } else {
337 blue = red;
338 green = red = 0;
339 }
340 }
341 pixSetPixel(pix, im_x, im_y,
342 (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT));
343 }
344 } else {
345 const float *features = f_[t];
346 for (int y = 0; y < num_features; ++y, im_y += im_height) {
347 float pixel = features[y * feature_factor];
348 // 1 or 2 features use greyscale.
349 int red = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);
350 int green = red, blue = red;
351 if (feature_factor == 3) {
352 // With 3 features assume RGB color.
353 pixel = features[y * feature_factor + 1];
354 green = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);
355 pixel = features[y * feature_factor + 2];
356 blue = ClipToRange<int>(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255);
357 } else if (num_features > 3) {
358 // More than 3 features use false yellow/blue color, assuming a signed
359 // input in the range [-1,1].
360 red = ClipToRange<int>(IntCastRounded(std::fabs(pixel) * 255), 0, 255);
361 if (pixel >= 0) {
362 green = red;
363 blue = 0;
364 } else {
365 blue = red;
366 green = red = 0;
367 }
368 }
369 pixSetPixel(pix, im_x, im_y,
370 (red << L_RED_SHIFT) | (green << L_GREEN_SHIFT) | (blue << L_BLUE_SHIFT));
371 }
372 }
373 } while (index.Increment());
374 return pix;
375}
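ToPix is mostly useful as a debugging aid. A minimal sketch of dumping the activations to disk (pixWrite and IFF_PNG are standard Leptonica; Image::destroy() is assumed to release the underlying Pix, as the comment before ToPix requires):

#include <allheaders.h>
#include "networkio.h"

// Writes the activations of a NetworkIO to a PNG for visual inspection.
void DumpActivations(const tesseract::NetworkIO &nio, const char *filename) {
  tesseract::Image pix = nio.ToPix();
  pixWrite(filename, pix, IFF_PNG);
  pix.destroy();  // The Pix must be destroyed after use.
}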
376
377// Prints the first and last num timesteps of the array for each feature.
378void NetworkIO::Print(int num) const {
379 int num_features = NumFeatures();
380 for (int y = 0; y < num_features; ++y) {
381 for (int t = 0; t < Width(); ++t) {
382 if (num == 0 || t < num || t + num >= Width()) {
383 if (int_mode_) {
384 tprintf(" %g", static_cast<float>(i_[t][y]) / INT8_MAX);
385 } else {
386 tprintf(" %g", f_[t][y]);
387 }
388 }
389 }
390 tprintf("\n");
391 }
392}
393
394// Copies a single time step from src.
395void NetworkIO::CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t) {
396 ASSERT_HOST(int_mode_ == src.int_mode_);
397 if (int_mode_) {
398 memcpy(i_[dest_t], src.i_[src_t], i_.dim2() * sizeof(i_[0][0]));
399 } else {
400 memcpy(f_[dest_t], src.f_[src_t], f_.dim2() * sizeof(f_[0][0]));
401 }
402}
403
404// Copies a part of single time step from src.
405void NetworkIO::CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features,
406 const NetworkIO &src, int src_t, int src_offset) {
407 ASSERT_HOST(int_mode_ == src.int_mode_);
408 if (int_mode_) {
409 memcpy(i_[dest_t] + dest_offset, src.i_[src_t] + src_offset, num_features * sizeof(i_[0][0]));
410 } else {
411 memcpy(f_[dest_t] + dest_offset, src.f_[src_t] + src_offset, num_features * sizeof(f_[0][0]));
412 }
413}
414
415// Sets the given range to random values.
416void NetworkIO::Randomize(int t, int offset, int num_features, TRand *randomizer) {
417 if (int_mode_) {
418 int8_t *line = i_[t] + offset;
419 for (int i = 0; i < num_features; ++i) {
420 line[i] = IntCastRounded(randomizer->SignedRand(INT8_MAX));
421 }
422 } else {
423 // float mode.
424 float *line = f_[t] + offset;
425 for (int i = 0; i < num_features; ++i) {
426 line[i] = randomizer->SignedRand(1.0);
427 }
428 }
429}
430
431// Helper returns the label and score of the best choice over a range.
432int NetworkIO::BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating,
433 float *certainty) const {
434 if (t_end <= t_start) {
435 return -1;
436 }
437 int max_char = -1;
438 float min_score = 0.0f;
439 for (int c = 0; c < NumFeatures(); ++c) {
440 if (c == not_this || c == null_ch) {
441 continue;
442 }
443 ScoresOverRange(t_start, t_end, c, null_ch, rating, certainty);
444 if (max_char < 0 || *rating < min_score) {
445 min_score = *rating;
446 max_char = c;
447 }
448 }
449 ScoresOverRange(t_start, t_end, max_char, null_ch, rating, certainty);
450 return max_char;
451}
452
453// Helper returns the rating and certainty of the choice over a range in output.
454void NetworkIO::ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating,
455 float *certainty) const {
456 ASSERT_HOST(!int_mode_);
457 *rating = 0.0f;
458 *certainty = 0.0f;
459 if (t_end <= t_start || t_end <= 0) {
460 return;
461 }
462 float ratings[3] = {0.0f, 0.0f, 0.0f};
463 float certs[3] = {0.0f, 0.0f, 0.0f};
464 for (int t = t_start; t < t_end; ++t) {
465 const float *line = f_[t];
466 float score = ProbToCertainty(line[choice]);
467 float zero = ProbToCertainty(line[null_ch]);
468 if (t == t_start) {
469 ratings[2] = FLT_MAX;
470 ratings[1] = -score;
471 certs[1] = score;
472 } else {
473 for (int i = 2; i >= 1; --i) {
474 if (ratings[i] > ratings[i - 1]) {
475 ratings[i] = ratings[i - 1];
476 certs[i] = certs[i - 1];
477 }
478 }
479 ratings[2] -= zero;
480 if (zero < certs[2]) {
481 certs[2] = zero;
482 }
483 ratings[1] -= score;
484 if (score < certs[1]) {
485 certs[1] = score;
486 }
487 }
488 ratings[0] -= zero;
489 if (zero < certs[0]) {
490 certs[0] = zero;
491 }
492 }
493 int best_i = ratings[2] < ratings[1] ? 2 : 1;
494 *rating = ratings[best_i] + t_end - t_start;
495 *certainty = certs[best_i];
496}
497
498// Returns the index (label) of the best value at the given timestep,
499// excluding not_this and not_that, and if not null, sets the score to the
500// log of the corresponding value.
501int NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const {
502 ASSERT_HOST(!int_mode_);
503 int best_index = -1;
504 float best_score = -FLT_MAX;
505 const float *line = f_[t];
506 for (int i = 0; i < f_.dim2(); ++i) {
507 if (line[i] > best_score && i != not_this && i != not_that) {
508 best_score = line[i];
509 best_index = i;
510 }
511 }
512 if (score != nullptr) {
513 *score = ProbToCertainty(best_score);
514 }
515 return best_index;
516}
517
518// Returns the best start position out of [start, end) (into which all labels
519// must fit) to obtain the highest cumulative score for the given labels.
520int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const {
521 int length = labels.size();
522 int last_start = end - length;
523 int best_start = -1;
524 TFloat best_score = 0;
525 for (int s = start; s <= last_start; ++s) {
526 TFloat score = ScoreOfLabels(labels, s);
527 if (score > best_score || best_start < 0) {
528 best_score = score;
529 best_start = s;
530 }
531 }
532 return best_start;
533}
534
535// Returns the cumulative score of the given labels starting at start, and
536// using one label per time-step.
537TFloat NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {
538 int length = labels.size();
539 TFloat score = 0;
540 for (int i = 0; i < length; ++i) {
541 score += f_(start + i, labels[i]);
542 }
543 return score;
544}
545
546// Helper function sets all the outputs for a single timestep, such that
547// label has value ok_score, and the other labels share 1 - ok_score.
548void NetworkIO::SetActivations(int t, int label, float ok_score) {
549 ASSERT_HOST(!int_mode_);
550 int num_classes = NumFeatures();
551 float bad_score = (1.0f - ok_score) / (num_classes - 1);
552 float *targets = f_[t];
553 for (int i = 0; i < num_classes; ++i) {
554 targets[i] = bad_score;
555 }
556 targets[label] = ok_score;
557}
558
559// Modifies the values, only if needed, so that the given label is
560// the winner at the given time step t.
561void NetworkIO::EnsureBestLabel(int t, int label) {
562 ASSERT_HOST(!int_mode_);
563 if (BestLabel(t, nullptr) != label) {
564 // Output value needs enhancing. Scale all the other elements to a third
565 // and give the remainder to the target label.
566 int num_classes = NumFeatures();
567 float *targets = f_[t];
568 for (int c = 0; c < num_classes; ++c) {
569 if (c == label) {
570 targets[c] += (1.0 - targets[c]) * (2 / 3.0);
571 } else {
572 targets[c] /= 3.0;
573 }
574 }
575 }
576}
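The adjustment preserves a probability distribution: scaling the other classes by 1/3 removes 2/3 of (1 - p_label) of the mass, which is exactly what the target label gains. A quick standalone check (editorial sketch with made-up numbers):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical softmax row that currently prefers class 0 over class 2.
  std::vector<float> targets = {0.6f, 0.1f, 0.25f, 0.05f};
  const int label = 2;  // The label that should win.
  for (int c = 0; c < static_cast<int>(targets.size()); ++c) {
    if (c == label) {
      targets[c] += (1.0f - targets[c]) * (2.0f / 3.0f);
    } else {
      targets[c] /= 3.0f;
    }
  }
  float sum = 0.0f;
  for (float v : targets) {
    sum += v;
  }
  // targets[2] is now 0.75, it beats every other entry, and the row still
  // sums to 1 (up to float rounding).
  std::printf("label=%g sum=%g\n", targets[label], sum);
  return 0;
}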
577
578// Helper function converts prob to certainty taking the minimum into account.
579/* static */
580float NetworkIO::ProbToCertainty(float prob) {
581 return prob > kMinProb ? std::log(prob) : kMinCertainty;
582}
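Certainty is the natural log of the probability, floored so that a zero or underflowing probability maps to kMinCertainty = -20 rather than -inf. An equivalent standalone sketch (names suffixed Demo to avoid clashing with the real constants):

#include <cmath>

constexpr float kMinCertaintyDemo = -20.0f;
const float kMinProbDemo = std::exp(kMinCertaintyDemo);  // ~2.06e-9

// Same shape as NetworkIO::ProbToCertainty: log-probability with a hard floor.
float ProbToCertaintyDemo(float prob) {
  return prob > kMinProbDemo ? std::log(prob) : kMinCertaintyDemo;
}
// ProbToCertaintyDemo(1.0f) == 0.0f; ProbToCertaintyDemo(0.0f) == -20.0f.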
583
584// Returns true if there is any bad value that is suspiciously like a GT
585// error. Assuming that *this is the difference(gradient) between target
586// and forward output, returns true if there is a large negative value
587// (correcting a very confident output) for which there is no corresponding
588// positive value in an adjacent timestep for the same feature index. This
589// allows the box-truthed samples to make fine adjustments to position while
590// stopping other disagreements of confident output with ground truth.
591bool NetworkIO::AnySuspiciousTruth(float confidence_thr) const {
592 int num_features = NumFeatures();
593 for (int t = 0; t < Width(); ++t) {
594 const float *features = f_[t];
595 for (int y = 0; y < num_features; ++y) {
596 float grad = features[y];
597 if (grad < -confidence_thr) {
598 // Correcting strong output. Check for movement.
599 if ((t == 0 || f_[t - 1][y] < confidence_thr / 2) &&
600 (t + 1 == Width() || f_[t + 1][y] < confidence_thr / 2)) {
601 return true; // No strong positive on either side.
602 }
603 }
604 }
605 }
606 return false;
607}
608
609// Reads a single timestep to floats in the range [-1, 1].
610void NetworkIO::ReadTimeStep(int t, TFloat *output) const {
611 if (int_mode_) {
612 const int8_t *line = i_[t];
613 for (int i = 0; i < i_.dim2(); ++i) {
614 output[i] = static_cast<TFloat>(line[i]) / INT8_MAX;
615 }
616 } else {
617 const float *line = f_[t];
618 for (int i = 0; i < f_.dim2(); ++i) {
619 output[i] = static_cast<TFloat>(line[i]);
620 }
621 }
622}
623
624// Adds a single timestep to floats.
625void NetworkIO::AddTimeStep(int t, TFloat *inout) const {
626 int num_features = NumFeatures();
627 if (int_mode_) {
628 const int8_t *line = i_[t];
629 for (int i = 0; i < num_features; ++i) {
630 inout[i] += static_cast<TFloat>(line[i]) / INT8_MAX;
631 }
632 } else {
633 const float *line = f_[t];
634 for (int i = 0; i < num_features; ++i) {
635 inout[i] += line[i];
636 }
637 }
638}
639
640// Adds part of a single timestep to floats.
641void NetworkIO::AddTimeStepPart(int t, int offset, int num_features, float *inout) const {
642 if (int_mode_) {
643 const int8_t *line = i_[t] + offset;
644 for (int i = 0; i < num_features; ++i) {
645 inout[i] += static_cast<float>(line[i]) / INT8_MAX;
646 }
647 } else {
648 const float *line = f_[t] + offset;
649 for (int i = 0; i < num_features; ++i) {
650 inout[i] += line[i];
651 }
652 }
653}
654
655// Writes a single timestep from floats in the range [-1, 1].
656void NetworkIO::WriteTimeStep(int t, const TFloat *input) {
657 WriteTimeStepPart(t, 0, NumFeatures(), input);
658}
659
660// Writes a single timestep from floats in the range [-1, 1] writing only
661// num_features elements of input to (*this)[t], starting at offset.
662void NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input) {
663 if (int_mode_) {
664 int8_t *line = i_[t] + offset;
665 for (int i = 0; i < num_features; ++i) {
666 line[i] = ClipToRange<int>(IntCastRounded(input[i] * INT8_MAX), -INT8_MAX, INT8_MAX);
667 }
668 } else {
669 float *line = f_[t] + offset;
670 for (int i = 0; i < num_features; ++i) {
671 line[i] = static_cast<float>(input[i]);
672 }
673 }
674}
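In int mode a write/read round trip quantizes each value to steps of 1/127, so inputs already inside [-1, 1] come back within about 0.004. A usage sketch (not part of this file, assuming networkio.h is on the include path):

#include <cmath>
#include <vector>
#include "networkio.h"

// Round-trips one timestep through an int8 NetworkIO buffer.
void DemoIntRoundTrip() {
  constexpr int kFeatures = 8;
  tesseract::NetworkIO buffer;
  buffer.Resize2d(/*int_mode=*/true, /*width=*/1, kFeatures);
  std::vector<tesseract::TFloat> input(kFeatures), output(kFeatures);
  for (int i = 0; i < kFeatures; ++i) {
    input[i] = -1.0 + 2.0 * i / (kFeatures - 1);  // Spread across [-1, 1].
  }
  buffer.WriteTimeStep(0, input.data());
  buffer.ReadTimeStep(0, output.data());
  for (int i = 0; i < kFeatures; ++i) {
    const double err = std::fabs(static_cast<double>(output[i]) - input[i]);
    (void)err;  // err <= 0.5 / 127, roughly 0.004, for each feature.
  }
}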
675
676// Maxpools a single time step from src.
677void NetworkIO::MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line) {
678 ASSERT_HOST(int_mode_ == src.int_mode_);
679 if (int_mode_) {
680 int dim = i_.dim2();
681 int8_t *dest_line = i_[dest_t];
682 const int8_t *src_line = src.i_[src_t];
683 for (int i = 0; i < dim; ++i) {
684 if (dest_line[i] < src_line[i]) {
685 dest_line[i] = src_line[i];
686 max_line[i] = src_t;
687 }
688 }
689 } else {
690 int dim = f_.dim2();
691 float *dest_line = f_[dest_t];
692 const float *src_line = src.f_[src_t];
693 for (int i = 0; i < dim; ++i) {
694 if (dest_line[i] < src_line[i]) {
695 dest_line[i] = src_line[i];
696 max_line[i] = src_t;
697 }
698 }
699 }
700}
701
702// Runs maxpool backward, using maxes to index timesteps in *this.
703void NetworkIO::MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes) {
704 ASSERT_HOST(!int_mode_);
705 Zero();
706 StrideMap::Index index(fwd.stride_map_);
707 do {
708 int t = index.t();
709 const int *max_line = maxes[t];
710 const float *fwd_line = fwd.f_[t];
711 int num_features = fwd.f_.dim2();
712 for (int i = 0; i < num_features; ++i) {
713 f_[max_line[i]][i] = fwd_line[i];
714 }
715 } while (index.Increment());
716}
717
718// Returns the min over time of the maxes over features of the outputs.
719float NetworkIO::MinOfMaxes() const {
720 float min_max = 0.0f;
721 int width = Width();
722 int num_features = NumFeatures();
723 for (int t = 0; t < width; ++t) {
724 float max_value = -FLT_MAX;
725 if (int_mode_) {
726 const int8_t *column = i_[t];
727 for (int i = 0; i < num_features; ++i) {
728 if (column[i] > max_value) {
729 max_value = column[i];
730 }
731 }
732 } else {
733 const float *column = f_[t];
734 for (int i = 0; i < num_features; ++i) {
735 if (column[i] > max_value) {
736 max_value = column[i];
737 }
738 }
739 }
740 if (t == 0 || max_value < min_max) {
741 min_max = max_value;
742 }
743 }
744 return min_max;
745}
746
747// Computes combined results for a combiner that chooses between an existing
748// input and itself, with an additional output to indicate the choice.
749void NetworkIO::CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output) {
750 int no = base_output.NumFeatures();
751 ASSERT_HOST(combiner_output.NumFeatures() == no + 1);
752 Resize(base_output, no);
753 int width = Width();
754 if (int_mode_) {
755 // Number of outputs from base and final result.
756 for (int t = 0; t < width; ++t) {
757 int8_t *out_line = i_[t];
758 const int8_t *base_line = base_output.i_[t];
759 const int8_t *comb_line = combiner_output.i_[t];
760 float base_weight = static_cast<float>(comb_line[no]) / INT8_MAX;
761 float boost_weight = 1.0f - base_weight;
762 for (int i = 0; i < no; ++i) {
763 out_line[i] = IntCastRounded(base_line[i] * base_weight + comb_line[i] * boost_weight);
764 }
765 }
766 } else {
767 for (int t = 0; t < width; ++t) {
768 float *out_line = f_[t];
769 const float *base_line = base_output.f_[t];
770 const float *comb_line = combiner_output.f_[t];
771 float base_weight = comb_line[no];
772 float boost_weight = 1.0f - base_weight;
773 for (int i = 0; i < no; ++i) {
774 out_line[i] = base_line[i] * base_weight + comb_line[i] * boost_weight;
775 }
776 }
777 }
778}
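Per feature this is a convex blend: output = w*base + (1 - w)*combiner, where w is the extra combiner output comb_line[no]. A one-liner restating the float path (editorial sketch):

// Blends one feature the way CombineOutputs does in float mode.
float Blend(float base, float combiner, float base_weight) {
  return base * base_weight + combiner * (1.0f - base_weight);
}
// Blend(x, y, 1.0f) == x: trust the base network entirely.
// Blend(x, y, 0.0f) == y: trust the combiner entirely.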
779
780// Computes deltas for a combiner that chooses between 2 sets of inputs.
781void NetworkIO::ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output) {
782 ASSERT_HOST(!int_mode_);
783 // Compute the deltas for the combiner.
784 int width = Width();
785 int no = NumFeatures() - 1;
786 ASSERT_HOST(fwd_deltas.NumFeatures() == no);
787 ASSERT_HOST(base_output.NumFeatures() == no);
788 // Number of outputs from base and final result.
789 for (int t = 0; t < width; ++t) {
790 const float *delta_line = fwd_deltas.f_[t];
791 const float *base_line = base_output.f_[t];
792 float *comb_line = f_[t];
793 float base_weight = comb_line[no];
794 float boost_weight = 1.0f - base_weight;
795 float max_base_delta = 0.0;
796 for (int i = 0; i < no; ++i) {
797 // What did the combiner actually produce?
798 float output = base_line[i] * base_weight + comb_line[i] * boost_weight;
799 // Reconstruct the target from the delta.
800 float comb_target = delta_line[i] + output;
801 comb_line[i] = comb_target - comb_line[i];
802 float base_delta = std::fabs(comb_target - base_line[i]);
803 if (base_delta > max_base_delta) {
804 max_base_delta = base_delta;
805 }
806 }
807 if (max_base_delta >= 0.5) {
808 // The base network got it wrong. The combiner should output the right
809 // answer and 0 for the base network.
810 comb_line[no] = 0.0 - base_weight;
811 } else {
812 // The base network was right. The combiner should flag that.
813 for (int i = 0; i < no; ++i) {
814 // All other targets are 0.
815 if (comb_line[i] > 0.0) {
816 comb_line[i] -= 1.0;
817 }
818 }
819 comb_line[no] = 1.0 - base_weight;
820 }
821 }
822}
823
824// Copies the array checking that the types match.
825void NetworkIO::CopyAll(const NetworkIO &src) {
826 ASSERT_HOST(src.int_mode_ == int_mode_);
827 f_ = src.f_;
828}
829
830// Checks that both are floats and adds the src array to *this.
831void NetworkIO::AddAllToFloat(const NetworkIO &src) {
832 ASSERT_HOST(!int_mode_);
833 ASSERT_HOST(!src.int_mode_);
834 f_ += src.f_;
835}
836
837// Subtracts the array from a float array. src must also be float.
838void NetworkIO::SubtractAllFromFloat(const NetworkIO &src) {
839 ASSERT_HOST(!int_mode_);
840 ASSERT_HOST(!src.int_mode_);
841 f_ -= src.f_;
842}
843
844// Copies src to *this, with maxabs normalization to match scale.
845void NetworkIO::CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale) {
846 ASSERT_HOST(!int_mode_);
847 ASSERT_HOST(!src.int_mode_);
848 ASSERT_HOST(!scale.int_mode_);
849 float src_max = src.f_.MaxAbs();
850 ASSERT_HOST(std::isfinite(src_max));
851 float scale_max = scale.f_.MaxAbs();
852 ASSERT_HOST(std::isfinite(scale_max));
853 if (src_max > 0.0f) {
854 float factor = scale_max / src_max;
855 for (int t = 0; t < src.Width(); ++t) {
856 const float *src_ptr = src.f_[t];
857 float *dest_ptr = f_[t];
858 for (int i = 0; i < src.f_.dim2(); ++i) {
859 dest_ptr[i] = src_ptr[i] * factor;
860 }
861 }
862 } else {
863 f_.Clear();
864 }
865}
866
867// Copies src to *this with independent reversal of the y dimension.
868void NetworkIO::CopyWithYReversal(const NetworkIO &src) {
869 int num_features = src.NumFeatures();
870 Resize(src, num_features);
871 StrideMap::Index b_index(src.stride_map_);
872 do {
873 int width = b_index.MaxIndexOfDim(FD_WIDTH) + 1;
874 StrideMap::Index fwd_index(b_index);
875 StrideMap::Index rev_index(b_index);
876 rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_HEIGHT), FD_HEIGHT);
877 do {
878 int fwd_t = fwd_index.t();
879 int rev_t = rev_index.t();
880 for (int x = 0; x < width; ++x) {
881 CopyTimeStepFrom(rev_t++, src, fwd_t++);
882 }
883 } while (fwd_index.AddOffset(1, FD_HEIGHT) && rev_index.AddOffset(-1, FD_HEIGHT));
884 } while (b_index.AddOffset(1, FD_BATCH));
885}
886
887// Copies src to *this with independent reversal of the x dimension.
888void NetworkIO::CopyWithXReversal(const NetworkIO &src) {
889 int num_features = src.NumFeatures();
890 Resize(src, num_features);
891 StrideMap::Index b_index(src.stride_map_);
892 do {
893 StrideMap::Index y_index(b_index);
894 do {
895 StrideMap::Index fwd_index(y_index);
896 StrideMap::Index rev_index(y_index);
897 rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_WIDTH), FD_WIDTH);
898 do {
899 CopyTimeStepFrom(rev_index.t(), src, fwd_index.t());
900 } while (fwd_index.AddOffset(1, FD_WIDTH) && rev_index.AddOffset(-1, FD_WIDTH));
901 } while (y_index.AddOffset(1, FD_HEIGHT));
902 } while (b_index.AddOffset(1, FD_BATCH));
903}
904
905// Copies src to *this with independent transpose of the x and y dimensions.
906void NetworkIO::CopyWithXYTranspose(const NetworkIO &src) {
907 int num_features = src.NumFeatures();
908 stride_map_ = src.stride_map_;
909 stride_map_.TransposeXY();
910 ResizeToMap(src.int_mode(), stride_map_, num_features);
911 StrideMap::Index src_b_index(src.stride_map_);
912 StrideMap::Index dest_b_index(stride_map_);
913 do {
914 StrideMap::Index src_y_index(src_b_index);
915 StrideMap::Index dest_x_index(dest_b_index);
916 do {
917 StrideMap::Index src_x_index(src_y_index);
918 StrideMap::Index dest_y_index(dest_x_index);
919 do {
920 CopyTimeStepFrom(dest_y_index.t(), src, src_x_index.t());
921 } while (src_x_index.AddOffset(1, FD_WIDTH) && dest_y_index.AddOffset(1, FD_HEIGHT));
922 } while (src_y_index.AddOffset(1, FD_HEIGHT) && dest_x_index.AddOffset(1, FD_WIDTH));
923 } while (src_b_index.AddOffset(1, FD_BATCH) && dest_b_index.AddOffset(1, FD_BATCH));
924}
925
926// Copies src to *this, at the given feature_offset, returning the total
927// feature offset after the copy. Multiple calls will stack outputs from
928// multiple sources in feature space.
929int NetworkIO::CopyPacking(const NetworkIO &src, int feature_offset) {
930 ASSERT_HOST(int_mode_ == src.int_mode_);
931 int width = src.Width();
932 ASSERT_HOST(width <= Width());
933 int num_features = src.NumFeatures();
934 ASSERT_HOST(num_features + feature_offset <= NumFeatures());
935 if (int_mode_) {
936 for (int t = 0; t < width; ++t) {
937 memcpy(i_[t] + feature_offset, src.i_[t], num_features * sizeof(i_[t][0]));
938 }
939 for (int t = width; t < i_.dim1(); ++t) {
940 memset(i_[t], 0, num_features * sizeof(i_[t][0]));
941 }
942 } else {
943 for (int t = 0; t < width; ++t) {
944 memcpy(f_[t] + feature_offset, src.f_[t], num_features * sizeof(f_[t][0]));
945 }
946 for (int t = width; t < f_.dim1(); ++t) {
947 memset(f_[t], 0, num_features * sizeof(f_[t][0]));
948 }
949 }
950 return num_features + feature_offset;
951}
952
953// Opposite of CopyPacking, fills *this with a part of src, starting at
954// feature_offset, and picking num_features.
955void NetworkIO::CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features) {
956 Resize(src, num_features);
957 int width = src.Width();
958 ASSERT_HOST(num_features + feature_offset <= src.NumFeatures());
959 if (int_mode_) {
960 for (int t = 0; t < width; ++t) {
961 memcpy(i_[t], src.i_[t] + feature_offset, num_features * sizeof(i_[t][0]));
962 }
963 } else {
964 for (int t = 0; t < width; ++t) {
965 memcpy(f_[t], src.f_[t] + feature_offset, num_features * sizeof(f_[t][0]));
966 }
967 }
968}
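CopyPacking and CopyUnpacking are inverses along the feature axis: packing stacks sources side by side and returns the next free offset, unpacking slices the same span back out. A hedged usage sketch, assuming a and b are float-mode buffers that share the same stride map:

#include "networkio.h"

// Packs two float sources into one buffer, then slices the second back out.
void DemoPackUnpack(const tesseract::NetworkIO &a, const tesseract::NetworkIO &b,
                    tesseract::NetworkIO *packed, tesseract::NetworkIO *b_again) {
  packed->ResizeToMap(/*int_mode=*/false, a.stride_map(),
                      a.NumFeatures() + b.NumFeatures());
  int offset = packed->CopyPacking(a, 0);  // a occupies features [0, a.NumFeatures()).
  packed->CopyPacking(b, offset);          // b is stacked immediately after a.
  b_again->CopyUnpacking(*packed, a.NumFeatures(), b.NumFeatures());
}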
969
970// Transposes the float part of *this into dest.
971void NetworkIO::Transpose(TransposedArray *dest) const {
972 int width = Width();
973 dest->ResizeNoInit(NumFeatures(), width);
974 for (int t = 0; t < width; ++t) {
975 dest->WriteStrided(t, f_[t]);
976 }
977}
978
979// Clips the content of a single time-step to +/-range.
980void NetworkIO::ClipVector(int t, float range) {
981 ASSERT_HOST(!int_mode_);
982 float *v = f_[t];
983 int dim = f_.dim2();
984 for (int i = 0; i < dim; ++i) {
985 v[i] = ClipToRange<float>(v[i], -range, range);
986 }
987}
988
989// Returns the padding required for the given number of features in order
990// for the SIMD operations to be safe.
991/* static */
992int NetworkIO::GetPadding(int num_features) {
993 int padding = 0;
994 if (IntSimdMatrix::intSimdMatrix != nullptr) {
995 padding = IntSimdMatrix::intSimdMatrix->RoundInputs(num_features) - num_features;
996 }
997 return padding;
998}
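The padding rounds num_features up to whatever multiple the active integer SIMD kernel needs, so that whole registers can be loaded. Illustration only: the real multiple comes from IntSimdMatrix::RoundInputs and depends on the detected instruction set; 8 is an assumed value here.

// Illustrative only: assumes a kernel that consumes inputs 8 at a time.
constexpr int RoundUpTo(int n, int multiple) {
  return ((n + multiple - 1) / multiple) * multiple;
}
constexpr int PaddingFor(int num_features, int multiple = 8) {
  return RoundUpTo(num_features, multiple) - num_features;
}
static_assert(PaddingFor(20) == 4, "20 inputs padded to 24");
static_assert(PaddingFor(24) == 0, "already a multiple of 8");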
999
1000} // namespace tesseract.
Definition: stridemap.cpp:46