tesseract v5.3.3.20231005
networkio.h
Go to the documentation of this file.
1
2// File: networkio.h
3// Description: Network input/output data, allowing float/int implementations.
4// Author: Ray Smith
5//
6// (C) Copyright 2014, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18#ifndef TESSERACT_LSTM_NETWORKIO_H_
19#define TESSERACT_LSTM_NETWORKIO_H_
20
21#include "helpers.h"
22#include "image.h"
23#include "static_shape.h"
24#include "stridemap.h"
25#include "weightmatrix.h"
26
27#include <cmath>
28#include <cstdio>
29#include <vector>
30
31struct Pix;
32
33namespace tesseract {
34
35// Class to contain all the input/output of a network, allowing for fixed or
36// variable-strided 2d to 1d mapping, and float or int8_t values. Provides
37// enough calculating functions to hide the detail of the implementation.
39public:
40 NetworkIO() : int_mode_(false) {}
41 // Resizes the array (and stride), avoiding realloc if possible, to the given
42 // size from various size specs:
43 // Same stride size, but given number of features.
44 void Resize(const NetworkIO &src, int num_features) {
45 ResizeToMap(src.int_mode(), src.stride_map(), num_features);
46 }
47 // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim.
48 void Resize2d(bool int_mode, int width, int num_features);
49 // Resizes forcing a float representation with the stridemap of src and the
50 // given number of features.
51 void ResizeFloat(const NetworkIO &src, int num_features) {
52 ResizeToMap(false, src.stride_map(), num_features);
53 }
54 // Resizes to a specific stride_map.
55 void ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features);
56 // Shrinks image size by x_scale,y_scale, and use given number of features.
57 void ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features);
58 // Resizes to just 1 x-coord, whatever the input.
59 void ResizeXTo1(const NetworkIO &src, int num_features);
60 // Initialize all the array to zero.
61 void Zero();
62 // Initializes to zero all elements of the array that do not correspond to
63 // valid image positions. (If a batch of different-sized images are packed
64 // together, then there will be padding pixels.)
65 void ZeroInvalidElements();
66 // Sets up the array from the given image, using the currently set int_mode_.
67 // If the image width doesn't match the shape, the image is truncated or
68 // padded with noise to match.
69 void FromPix(const StaticShape &shape, const Image pix, TRand *randomizer);
70 // Sets up the array from the given set of images, using the currently set
71 // int_mode_. If the image width doesn't match the shape, the images are
72 // truncated or padded with noise to match.
73 void FromPixes(const StaticShape &shape, const std::vector<Image> &pixes,
74 TRand *randomizer);
75 // Copies the given pix to *this at the given batch index, stretching and
76 // clipping the pixel values so that [black, black + 2*contrast] maps to the
77 // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int.
78 // This is a 2-d operation in the sense that the output depth is the number
79 // of input channels, the height is the height of the image, and the width
80 // is the width of the image, or truncated/padded with noise if the width
81 // is a fixed size.
82 void Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer);
83 // Copies the given pix to *this at the given batch index, as Copy2DImage
84 // above, except that the output depth is the height of the input image, the
85 // output height is 1, and the output width as for Copy2DImage.
86 // The image is thus treated as a 1-d set of vertical pixel strips.
87 void Copy1DGreyImage(int batch, Image pix, float black, float contrast, TRand *randomizer);
88 // Helper stores the pixel value in i_ or f_ according to int_mode_.
89 // t: is the index from the StrideMap corresponding to the current
90 // [batch,y,x] position
91 // f: is the index into the depth/channel
92 // pixel: the value of the pixel from the image (in one channel)
93 // black: the pixel value to map to the lowest of the range of *this
94 // contrast: the range of pixel values to stretch to half the range of *this.
95 void SetPixel(int t, int f, int pixel, float black, float contrast);
96 // Converts the array to a Pix. Must be pixDestroyed after use.
97 Image ToPix() const;
98 // Prints the first and last num timesteps of the array for each feature.
99 void Print(int num) const;
100
101 // Returns the timestep width.
102 int Width() const {
103 return int_mode_ ? i_.dim1() : f_.dim1();
104 }
105 // Returns the number of features.
106 int NumFeatures() const {
107 return int_mode_ ? i_.dim2() : f_.dim2();
108 }
109 // Accessor to a timestep of the float matrix.
110 float *f(int t) {
111 ASSERT_HOST(!int_mode_);
112 return f_[t];
113 }
114 const float *f(int t) const {
115 ASSERT_HOST(!int_mode_);
116 return f_[t];
117 }
118 const int8_t *i(int t) const {
119 ASSERT_HOST(int_mode_);
120 return i_[t];
121 }
122 bool int_mode() const {
123 return int_mode_;
124 }
125 void set_int_mode(bool is_quantized) {
126 int_mode_ = is_quantized;
127 }
128 const StrideMap &stride_map() const {
129 return stride_map_;
130 }
131 void set_stride_map(const StrideMap &map) {
132 stride_map_ = map;
133 }
135 return f_;
136 }
138 return &f_;
139 }
140
141 // Copies a single time step from src.
142 void CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t);
143 // Copies a part of single time step from src.
144 void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO &src,
145 int src_t, int src_offset);
146 // Zeroes a single time step.
147 void ZeroTimeStep(int t) {
148 if (int_mode_) {
149 memset(i_[t], 0, sizeof(*i_[t]) * NumFeatures());
150 } else {
151 memset(f_[t], 0, sizeof(*f_[t]) * NumFeatures());
152 }
153 }
154 // Sets the given range to random values.
155 void Randomize(int t, int offset, int num_features, TRand *randomizer);
156
157 // Helper returns the label and score of the best choice over a range.
158 int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating,
159 float *certainty) const;
160 // Helper returns the rating and certainty of the choice over a range in t.
161 void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating,
162 float *certainty) const;
163 // Returns the index (label) of the best value at the given timestep,
164 // and if not null, sets the score to the log of the corresponding value.
165 int BestLabel(int t, float *score) const {
166 return BestLabel(t, -1, -1, score);
167 }
168 // Returns the index (label) of the best value at the given timestep,
169 // excluding not_this and not_that, and if not null, sets the score to the
170 // log of the corresponding value.
171 int BestLabel(int t, int not_this, int not_that, float *score) const;
172 // Returns the best start position out of range (into which both start and end
173 // must fit) to obtain the highest cumulative score for the given labels.
174 int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const;
175 // Returns the cumulative score of the given labels starting at start, and
176 // using one label per time-step.
177 TFloat ScoreOfLabels(const std::vector<int> &labels, int start) const;
178 // Helper function sets all the outputs for a single timestep, such that
179 // label has value ok_score, and the other labels share 1 - ok_score.
180 // Assumes float mode.
181 void SetActivations(int t, int label, float ok_score);
182 // Modifies the values, only if needed, so that the given label is
183 // the winner at the given time step t.
184 // Assumes float mode.
185 void EnsureBestLabel(int t, int label);
186 // Helper function converts prob to certainty taking the minimum into account.
187 static float ProbToCertainty(float prob);
188 // Returns true if there is any bad value that is suspiciously like a GT
189 // error. Assuming that *this is the difference(gradient) between target
190 // and forward output, returns true if there is a large negative value
191 // (correcting a very confident output) for which there is no corresponding
192 // positive value in an adjacent timestep for the same feature index. This
193 // allows the box-truthed samples to make fine adjustments to position while
194 // stopping other disagreements of confident output with ground truth.
195 bool AnySuspiciousTruth(float confidence_thr) const;
196
197 // Reads a single timestep to floats in the range [-1, 1].
198 void ReadTimeStep(int t, TFloat *output) const;
199 // Adds a single timestep to floats.
200 void AddTimeStep(int t, TFloat *inout) const;
201 // Adds part of a single timestep to floats.
202 void AddTimeStepPart(int t, int offset, int num_features, float *inout) const;
203 // Writes a single timestep from floats in the range [-1, 1].
204 void WriteTimeStep(int t, const TFloat *input);
205 // Writes a single timestep from floats in the range [-1, 1] writing only
206 // num_features elements of input to (*this)[t], starting at offset.
207 void WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input);
208 // Maxpools a single time step from src.
209 void MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line);
210 // Runs maxpool backward, using maxes to index timesteps in *this.
211 void MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes);
212 // Returns the min over time of the maxes over features of the outputs.
213 float MinOfMaxes() const;
214 // Returns the min over time.
215 float Max() const {
216 return int_mode_ ? i_.Max() : f_.Max();
217 }
218 // Computes combined results for a combiner that chooses between an existing
219 // input and itself, with an additional output to indicate the choice.
220 void CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output);
221 // Computes deltas for a combiner that chooses between 2 sets of inputs.
222 void ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output);
223
224 // Copies the array checking that the types match.
225 void CopyAll(const NetworkIO &src);
226 // Adds the array to a float array, with scaling to [-1, 1] if the src is int.
227 void AddAllToFloat(const NetworkIO &src);
228 // Subtracts the array from a float array. src must also be float.
229 void SubtractAllFromFloat(const NetworkIO &src);
230
231 // Copies src to *this, with maxabs normalization to match scale.
232 void CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale);
233 // Multiplies the float data by the given factor.
234 void ScaleFloatBy(float factor) {
235 f_ *= factor;
236 }
237 // Copies src to *this with independent reversal of the y dimension.
238 void CopyWithYReversal(const NetworkIO &src);
239 // Copies src to *this with independent reversal of the x dimension.
240 void CopyWithXReversal(const NetworkIO &src);
241 // Copies src to *this with independent transpose of the x and y dimensions.
242 void CopyWithXYTranspose(const NetworkIO &src);
243 // Copies src to *this, at the given feature_offset, returning the total
244 // feature offset after the copy. Multiple calls will stack outputs from
245 // multiple sources in feature space.
246 int CopyPacking(const NetworkIO &src, int feature_offset);
247 // Opposite of CopyPacking, fills *this with a part of src, starting at
248 // feature_offset, and picking num_features. Resizes *this to match.
249 void CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features);
250 // Transposes the float part of *this into dest.
251 void Transpose(TransposedArray *dest) const;
252
253 // Clips the content of a single time-step to +/-range.
254 void ClipVector(int t, float range);
255
256 // Applies Func to timestep t of *this (u) and multiplies the result by v
257 // component-wise, putting the product in *product.
258 // *this and v may be int or float, but must match. The outputs are TFloat.
259 template <class Func>
260 void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product) {
261 Func f;
262 ASSERT_HOST(!int_mode_);
263 ASSERT_HOST(!v_io.int_mode_);
264 int dim = f_.dim2();
265 if (int_mode_) {
266 const int8_t *u = i_[t];
267 const int8_t *v = v_io.i_[t];
268 for (int i = 0; i < dim; ++i) {
269 product[i] = f(u[i] / static_cast<TFloat>(INT8_MAX)) * v[i] / INT8_MAX;
270 }
271 } else {
272 const float *u = f_[t];
273 const float *v = v_io.f_[t];
274 for (int i = 0; i < dim; ++i) {
275 product[i] = f(u[i]) * v[i];
276 }
277 }
278 }
279 // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,
280 // component-wise, putting the product in *product.
281 // All NetworkIOs are assumed to be float.
282 template <class Func>
283 void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w,
284 TFloat *product) const {
285 ASSERT_HOST(!int_mode_);
286 ASSERT_HOST(!v_io.int_mode_);
287 Func f;
288 const float *u = f_[u_t];
289 const float *v = v_io.f_[v_t];
290 int dim = f_.dim2();
291 for (int i = 0; i < dim; ++i) {
292 product[i] = f(u[i]) * v[i] * w[i];
293 }
294 }
295 // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,
296 // component-wise, adding the product to *product.
297 // All NetworkIOs are assumed to be float.
298 template <class Func>
299 void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const {
300 ASSERT_HOST(!int_mode_);
301 ASSERT_HOST(!v_io.int_mode_);
302 Func f;
303 const float *u = f_[t];
304 const float *v = v_io.f_[t];
305 int dim = f_.dim2();
306 for (int i = 0; i < dim; ++i) {
307 product[i] += f(u[i]) * v[i] * w[i];
308 }
309 }
310 // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w,
311 // component-wise, putting the product in product, all at timestep t, except
312 // w, which is a simple array. All NetworkIOs are assumed to be float.
313 template <class Func1, class Func2>
314 void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const {
315 ASSERT_HOST(!int_mode_);
316 ASSERT_HOST(!v_io.int_mode_);
317 Func1 f;
318 Func2 g;
319 const float *u = f_[t];
320 const float *v = v_io.f_[t];
321 int dim = f_.dim2();
322 for (int i = 0; i < dim; ++i) {
323 product[i] = f(u[i]) * g(v[i]) * w[i];
324 }
325 }
326
327private:
328 // Returns the padding required for the given number of features in order
329 // for the SIMD operations to be safe.
330 static int GetPadding(int num_features);
331
332 // Choice of float vs 8 bit int for data.
335 // Which of f_ and i_ are we actually using.
336 bool int_mode_;
337 // Stride for 2d input data.
338 StrideMap stride_map_;
339};
340
341} // namespace tesseract.
342
343#endif // TESSERACT_LSTM_NETWORKIO_H_
#define ASSERT_HOST(x)
Definition: errcode.h:54
double TFloat
Definition: tesstypes.h:39
void ClipVector(int n, T lower, T upper, T *vec)
Definition: functions.h:251
dest
Definition: upload.py:409
std::string Print(const T &value)
void Resize(const NetworkIO &src, int num_features)
Definition: networkio.h:44
void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product)
Definition: networkio.h:260
bool int_mode() const
Definition: networkio.h:122
void ResizeFloat(const NetworkIO &src, int num_features)
Definition: networkio.h:51
float Max() const
Definition: networkio.h:215
void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const
Definition: networkio.h:299
void set_stride_map(const StrideMap &map)
Definition: networkio.h:131
void ScaleFloatBy(float factor)
Definition: networkio.h:234
const float * f(int t) const
Definition: networkio.h:114
float * f(int t)
Definition: networkio.h:110
int Width() const
Definition: networkio.h:102
void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w, TFloat *product) const
Definition: networkio.h:283
GENERIC_2D_ARRAY< float > * mutable_float_array()
Definition: networkio.h:137
void ZeroTimeStep(int t)
Definition: networkio.h:147
void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const
Definition: networkio.h:314
void set_int_mode(bool is_quantized)
Definition: networkio.h:125
const StrideMap & stride_map() const
Definition: networkio.h:128
const int8_t * i(int t) const
Definition: networkio.h:118
const GENERIC_2D_ARRAY< float > & float_array() const
Definition: networkio.h:134
int NumFeatures() const
Definition: networkio.h:106
int BestLabel(int t, float *score) const
Definition: networkio.h:165
#define TESS_API
Definition: export.h:32