tesseract v5.3.3.20231005
network.cpp
Go to the documentation of this file.
1
2// File: network.cpp
3// Description: Base class for neural network implementations.
4// Author: Ray Smith
5//
6// (C) Copyright 2013, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18// Include automatically generated configuration file if running autoconf.
19#ifdef HAVE_CONFIG_H
20# include "config_auto.h"
21#endif
22
23#include "network.h"
24
25#include <cstdlib>
26
27// This base class needs to know about all its sub-classes because of the
28// factory deserializing method: CreateFromFile.
29#include <allheaders.h>
30#include "convolve.h"
31#include "fullyconnected.h"
32#include "input.h"
33#include "lstm.h"
34#include "maxpool.h"
35#include "parallel.h"
36#include "reconfig.h"
37#include "reversed.h"
38#include "scrollview.h"
39#include "series.h"
40#include "statistc.h"
41#ifdef INCLUDE_TENSORFLOW
42# include "tfnetwork.h"
43#endif
44#include "tprintf.h"
45
46namespace tesseract {
47
48#ifndef GRAPHICS_DISABLED
49
// Lower and upper bounds applied to debug window dimensions in ClearWindow.
constexpr int kMinWinSize = 500;
constexpr int kMaxWinSize = 2000;
// Amounts added to the content width/height so that the content fits inside
// the window frame.
constexpr int kXWinFrameSize = 30;
constexpr int kYWinFrameSize = 80;

56
57#endif // !GRAPHICS_DISABLED
58
59// String names corresponding to the NetworkType enum.
60// Keep in sync with NetworkType.
61// Names used in Serialization to allow re-ordering/addition/deletion of
62// layer types in NetworkType without invalidating existing network files.
63static char const *const kTypeNames[NT_COUNT] = {
64 "Invalid", "Input",
65 "Convolve", "Maxpool",
66 "Parallel", "Replicated",
67 "ParBidiLSTM", "DepParUDLSTM",
68 "Par2dLSTM", "Series",
69 "Reconfig", "RTLReversed",
70 "TTBReversed", "XYTranspose",
71 "LSTM", "SummLSTM",
72 "Logistic", "LinLogistic",
73 "LinTanh", "Tanh",
74 "Relu", "Linear",
75 "Softmax", "SoftmaxNoCTC",
76 "LSTMSoftmax", "LSTMBinarySoftmax",
77 "TensorFlow",
78};
79
81 : type_(NT_NONE)
82 , training_(TS_ENABLED)
83 , needs_to_backprop_(true)
84 , network_flags_(0)
85 , ni_(0)
86 , no_(0)
87 , num_weights_(0)
88 , forward_win_(nullptr)
89 , backward_win_(nullptr)
90 , randomizer_(nullptr) {}
91Network::Network(NetworkType type, const std::string &name, int ni, int no)
92 : type_(type)
93 , training_(TS_ENABLED)
94 , needs_to_backprop_(true)
95 , network_flags_(0)
96 , ni_(ni)
97 , no_(no)
98 , num_weights_(0)
99 , name_(name)
100 , forward_win_(nullptr)
101 , backward_win_(nullptr)
102 , randomizer_(nullptr) {}
103
104// Suspends/Enables/Permanently disables training by setting the training_
105// flag. Serialize and DeSerialize only operate on the run-time data if state
106// is TS_DISABLED or TS_TEMP_DISABLE. Specifying TS_TEMP_DISABLE will
107// temporarily disable layers in state TS_ENABLED, allowing a trainer to
108// serialize as if it were a recognizer.
109// TS_RE_ENABLE will re-enable layers that were previously in any disabled
110// state. If in TS_TEMP_DISABLE then the flag is just changed, but if in
111// TS_DISABLED, the deltas in the weight matrices are reinitialized so that a
112// recognizer can be converted back to a trainer.
114 if (state == TS_RE_ENABLE) {
115 // Enable only from temp disabled.
116 if (training_ == TS_TEMP_DISABLE) {
118 }
119 } else if (state == TS_TEMP_DISABLE) {
120 // Temp disable only from enabled.
121 if (training_ == TS_ENABLED) {
122 training_ = state;
123 }
124 } else {
125 training_ = state;
126 }
127}
128
129// Sets flags that control the action of the network. See NetworkFlags enum
130// for bit values.
131void Network::SetNetworkFlags(uint32_t flags) {
132 network_flags_ = flags;
133}
134
135// Sets up the network for training. Initializes weights using weights of
136// scale `range` picked according to the random number generator `randomizer`.
137int Network::InitWeights([[maybe_unused]] float range, TRand *randomizer) {
138 randomizer_ = randomizer;
139 return 0;
140}
141
142// Provides a pointer to a TRand for any networks that care to use it.
143// Note that randomizer is a borrowed pointer that should outlive the network
144// and should not be deleted by any of the networks.
145void Network::SetRandomizer(TRand *randomizer) {
146 randomizer_ = randomizer;
147}
148
149// Sets needs_to_backprop_ to needs_backprop and returns true if
150// needs_backprop || any weights in this network so the next layer forward
151// can be told to produce backprop for this layer if needed.
152bool Network::SetupNeedsBackprop(bool needs_backprop) {
153 needs_to_backprop_ = needs_backprop;
154 return needs_backprop || num_weights_ > 0;
155}
156
157// Writes to the given file. Returns false in case of error.
158bool Network::Serialize(TFile *fp) const {
159 int8_t data = NT_NONE;
160 if (!fp->Serialize(&data)) {
161 return false;
162 }
163 std::string type_name = kTypeNames[type_];
164 if (!fp->Serialize(type_name)) {
165 return false;
166 }
167 data = training_;
168 if (!fp->Serialize(&data)) {
169 return false;
170 }
171 data = needs_to_backprop_;
172 if (!fp->Serialize(&data)) {
173 return false;
174 }
175 if (!fp->Serialize(&network_flags_)) {
176 return false;
177 }
178 if (!fp->Serialize(&ni_)) {
179 return false;
180 }
181 if (!fp->Serialize(&no_)) {
182 return false;
183 }
184 if (!fp->Serialize(&num_weights_)) {
185 return false;
186 }
187 uint32_t length = name_.length();
188 if (!fp->Serialize(&length)) {
189 return false;
190 }
191 return fp->Serialize(name_.c_str(), length);
192}
193
194static NetworkType getNetworkType(TFile *fp) {
195 int8_t data;
196 if (!fp->DeSerialize(&data)) {
197 return NT_NONE;
198 }
199 if (data == NT_NONE) {
200 std::string type_name;
201 if (!fp->DeSerialize(type_name)) {
202 return NT_NONE;
203 }
204 for (data = 0; data < NT_COUNT && type_name != kTypeNames[data]; ++data) {
205 }
206 if (data == NT_COUNT) {
207 tprintf("Invalid network layer type:%s\n", type_name.c_str());
208 return NT_NONE;
209 }
210 }
211 return static_cast<NetworkType>(data);
212}
213
214// Reads from the given file. Returns nullptr in case of error.
215// Determines the type of the serialized class and calls its DeSerialize
216// on a new object of the appropriate type, which is returned.
218 NetworkType type; // Type of the derived network class.
219 TrainingState training; // Are we currently training?
220 bool needs_to_backprop; // This network needs to output back_deltas.
221 int32_t network_flags; // Behavior control flags in NetworkFlags.
222 int32_t ni; // Number of input values.
223 int32_t no; // Number of output values.
224 int32_t num_weights; // Number of weights in this and sub-network.
225 std::string name; // A unique name for this layer.
226 int8_t data;
227 Network *network = nullptr;
228 type = getNetworkType(fp);
229 if (!fp->DeSerialize(&data)) {
230 return nullptr;
231 }
232 training = data == TS_ENABLED ? TS_ENABLED : TS_DISABLED;
233 if (!fp->DeSerialize(&data)) {
234 return nullptr;
235 }
236 needs_to_backprop = data != 0;
237 if (!fp->DeSerialize(&network_flags)) {
238 return nullptr;
239 }
240 if (!fp->DeSerialize(&ni)) {
241 return nullptr;
242 }
243 if (!fp->DeSerialize(&no)) {
244 return nullptr;
245 }
246 if (!fp->DeSerialize(&num_weights)) {
247 return nullptr;
248 }
249 if (!fp->DeSerialize(name)) {
250 return nullptr;
251 }
252
253 switch (type) {
254 case NT_CONVOLVE:
255 network = new Convolve(name.c_str(), ni, 0, 0);
256 break;
257 case NT_INPUT:
258 network = new Input(name.c_str(), ni, no);
259 break;
260 case NT_LSTM:
261 case NT_LSTM_SOFTMAX:
263 case NT_LSTM_SUMMARY:
264 network = new LSTM(name.c_str(), ni, no, no, false, type);
265 break;
266 case NT_MAXPOOL:
267 network = new Maxpool(name.c_str(), ni, 0, 0);
268 break;
269 // All variants of Parallel.
270 case NT_PARALLEL:
271 case NT_REPLICATED:
272 case NT_PAR_RL_LSTM:
273 case NT_PAR_UD_LSTM:
274 case NT_PAR_2D_LSTM:
275 network = new Parallel(name.c_str(), type);
276 break;
277 case NT_RECONFIG:
278 network = new Reconfig(name.c_str(), ni, 0, 0);
279 break;
280 // All variants of reversed.
281 case NT_XREVERSED:
282 case NT_YREVERSED:
283 case NT_XYTRANSPOSE:
284 network = new Reversed(name.c_str(), type);
285 break;
286 case NT_SERIES:
287 network = new Series(name.c_str());
288 break;
289 case NT_TENSORFLOW:
290#ifdef INCLUDE_TENSORFLOW
291 network = new TFNetwork(name.c_str());
292#else
293 tprintf("TensorFlow not compiled in! -DINCLUDE_TENSORFLOW\n");
294#endif
295 break;
296 // All variants of FullyConnected.
297 case NT_SOFTMAX:
299 case NT_RELU:
300 case NT_TANH:
301 case NT_LINEAR:
302 case NT_LOGISTIC:
303 case NT_POSCLIP:
304 case NT_SYMCLIP:
305 network = new FullyConnected(name.c_str(), ni, no, type);
306 break;
307 default:
308 break;
309 }
310 if (network) {
311 network->training_ = training;
313 network->network_flags_ = network_flags;
314 network->num_weights_ = num_weights;
315 if (!network->DeSerialize(fp)) {
316 delete network;
317 network = nullptr;
318 }
319 }
320 return network;
321}
322
323// Returns a random number in [-range, range].
325 ASSERT_HOST(randomizer_ != nullptr);
326 return randomizer_->SignedRand(range);
327}
328
329#ifndef GRAPHICS_DISABLED
330
331// === Debug image display methods. ===
332// Displays the image of the matrix to the forward window.
334 Image image = matrix.ToPix();
335 ClearWindow(false, name_.c_str(), pixGetWidth(image), pixGetHeight(image), &forward_win_);
338}
339
340// Displays the image of the matrix to the backward window.
342 Image image = matrix.ToPix();
343 std::string window_name = name_ + "-back";
344 ClearWindow(false, window_name.c_str(), pixGetWidth(image), pixGetHeight(image), &backward_win_);
347}
348
349// Creates the window if needed, otherwise clears it.
350void Network::ClearWindow(bool tess_coords, const char *window_name, int width, int height,
351 ScrollView **window) {
352 if (*window == nullptr) {
353 int min_size = std::min(width, height);
354 if (min_size < kMinWinSize) {
355 if (min_size < 1) {
356 min_size = 1;
357 }
358 width = width * kMinWinSize / min_size;
359 height = height * kMinWinSize / min_size;
360 }
361 width += kXWinFrameSize;
362 height += kYWinFrameSize;
363 if (width > kMaxWinSize) {
364 width = kMaxWinSize;
365 }
366 if (height > kMaxWinSize) {
367 height = kMaxWinSize;
368 }
369 *window = new ScrollView(window_name, 80, 100, width, height, width, height, tess_coords);
370 tprintf("Created window %s of size %d, %d\n", window_name, width, height);
371 } else {
372 (*window)->Clear();
373 }
374}
375
376// Displays the pix in the given window. and returns the height of the pix.
377// The pix is pixDestroyed.
379 int height = pixGetHeight(pix);
380 window->Draw(pix, 0, 0);
381 pix.destroy();
382 return height;
383}
384#endif // !GRAPHICS_DISABLED
385
386} // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kXWinFrameSize
Definition: network.cpp:54
const int kYWinFrameSize
Definition: network.cpp:55
const int kMinWinSize
Definition: network.cpp:51
TrainingState
Definition: network.h:90
@ TS_TEMP_DISABLE
Definition: network.h:95
@ TS_ENABLED
Definition: network.h:93
@ TS_DISABLED
Definition: network.h:92
@ TS_RE_ENABLE
Definition: network.h:97
NetworkType
Definition: network.h:41
@ NT_LINEAR
Definition: network.h:65
@ NT_MAXPOOL
Definition: network.h:46
@ NT_RELU
Definition: network.h:64
@ NT_XREVERSED
Definition: network.h:54
@ NT_LSTM
Definition: network.h:58
@ NT_CONVOLVE
Definition: network.h:45
@ NT_SOFTMAX
Definition: network.h:66
@ NT_NONE
Definition: network.h:42
@ NT_LOGISTIC
Definition: network.h:60
@ NT_PAR_UD_LSTM
Definition: network.h:50
@ NT_LSTM_SOFTMAX_ENCODED
Definition: network.h:74
@ NT_PARALLEL
Definition: network.h:47
@ NT_SYMCLIP
Definition: network.h:62
@ NT_PAR_2D_LSTM
Definition: network.h:51
@ NT_LSTM_SUMMARY
Definition: network.h:59
@ NT_YREVERSED
Definition: network.h:55
@ NT_RECONFIG
Definition: network.h:53
@ NT_INPUT
Definition: network.h:43
@ NT_TENSORFLOW
Definition: network.h:76
@ NT_POSCLIP
Definition: network.h:61
@ NT_LSTM_SOFTMAX
Definition: network.h:73
@ NT_XYTRANSPOSE
Definition: network.h:56
@ NT_SERIES
Definition: network.h:52
@ NT_SOFTMAX_NO_CTC
Definition: network.h:67
@ NT_TANH
Definition: network.h:63
@ NT_PAR_RL_LSTM
Definition: network.h:49
@ NT_COUNT
Definition: network.h:78
@ NT_REPLICATED
Definition: network.h:48
double TFloat
Definition: tesstypes.h:39
const int kMaxWinSize
Definition: network.cpp:52
type
Definition: upload.py:458
void destroy()
Definition: image.cpp:32
double SignedRand(double range)
Definition: helpers.h:78
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
bool Serialize(const std::string &data)
Definition: serialis.cpp:107
int32_t network_flags_
Definition: network.h:303
NetworkType type_
Definition: network.h:300
const std::string & name() const
Definition: network.h:140
bool needs_to_backprop_
Definition: network.h:302
int num_weights() const
Definition: network.h:119
virtual bool SetupNeedsBackprop(bool needs_backprop)
Definition: network.cpp:152
static void ClearWindow(bool tess_coords, const char *window_name, int width, int height, ScrollView **window)
Definition: network.cpp:350
void DisplayForward(const NetworkIO &matrix)
Definition: network.cpp:333
std::string name_
Definition: network.h:307
virtual bool DeSerialize(TFile *fp)=0
void DisplayBackward(const NetworkIO &matrix)
Definition: network.cpp:341
virtual void SetEnableTraining(TrainingState state)
Definition: network.cpp:113
bool needs_to_backprop() const
Definition: network.h:116
ScrollView * forward_win_
Definition: network.h:310
static Network * CreateFromFile(TFile *fp)
Definition: network.cpp:217
virtual bool Serialize(TFile *fp) const
Definition: network.cpp:158
ScrollView * backward_win_
Definition: network.h:311
static int DisplayImage(Image pix, ScrollView *window)
Definition: network.cpp:378
TFloat Random(TFloat range)
Definition: network.cpp:324
int32_t num_weights_
Definition: network.h:306
virtual int InitWeights(float range, TRand *randomizer)
Definition: network.cpp:137
TrainingState training_
Definition: network.h:301
virtual void SetNetworkFlags(uint32_t flags)
Definition: network.cpp:131
NetworkType type() const
Definition: network.h:110
TRand * randomizer_
Definition: network.h:312
virtual void SetRandomizer(TRand *randomizer)
Definition: network.cpp:145
Image ToPix() const
Definition: networkio.cpp:300
void Draw(Image image, int x_pos, int y_pos)
Definition: scrollview.cpp:750
static void Update()
Definition: scrollview.cpp:700