tesseract v5.3.3.20231005
networkbuilder.h
///////////////////////////////////////////////////////////////////////
// File:        networkbuilder.h
// Description: Class to parse the network description language and
//              build a corresponding network.
// Author:      Ray Smith
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_LSTM_NETWORKBUILDER_H_
#define TESSERACT_LSTM_NETWORKBUILDER_H_

#include "export.h"
#include "static_shape.h"
#include "stridemap.h"

class UNICHARSET;

namespace tesseract {

class Input;
class Network;
class Parallel;
class TRand;

class TESS_COMMON_TRAINING_API NetworkBuilder {
public:
  explicit NetworkBuilder(int num_softmax_outputs) : num_softmax_outputs_(num_softmax_outputs) {}

  // Builds a network with a network_spec in the network description
  // language, to recognize a character set of num_outputs size.
  // If append_index is non-negative, then *network must be non-null and the
  // given network_spec will be appended to *network AFTER append_index, with
  // the top of the input *network discarded.
  // Note that network_spec is passed by value to allow a non-const char* pointer
  // into the string for BuildFromString.
  // net_flags controls network behavior according to the NetworkFlags enum.
  // The resulting network is returned via *network.
  // Returns false if something failed.
  static bool InitNetwork(int num_outputs, const char *network_spec, int append_index,
                          int net_flags, float weight_range, TRand *randomizer, Network **network);

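  // Illustrative usage sketch (an assumption, not from the original header);
  // the spec string and num_outputs value are example placeholders only:
  //   TRand randomizer;
  //   Network *network = nullptr;
  //   const char *spec = "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 O1c1]";
  //   if (!NetworkBuilder::InitNetwork(/*num_outputs=*/111, spec, /*append_index=*/-1,
  //                                    /*net_flags=*/0, /*weight_range=*/0.1f,
  //                                    &randomizer, &network)) {
  //     // InitNetwork returned false: the spec failed to parse or build.
  //   }
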
  // Parses the given string and returns a network according to the following
  // language:
  // ============ Syntax of description below: ============
  // <d> represents a number.
  // <net> represents any single network element, including (recursively) a
  //   [...] series or (...) parallel construct.
  // (s|t|r|l|m) (regex notation) represents a single required letter.
  // NOTE THAT THROUGHOUT, x and y are REVERSED from conventional mathematics,
  // to use the same convention as TensorFlow. The reason TF adopts this
  // convention is to eliminate the need to transpose images on input, since
  // adjacent memory locations in images increase x and then y, while adjacent
  // memory locations in tensors in TF, and in NetworkIO in tesseract, increase
  // the rightmost index first, then the next to the left, and so on, like C
  // arrays.
  // ============ INPUTS ============
  // <b>,<h>,<w>,<d> A batch of b images with height h, width w, and depth d.
  //   b, h and/or w may be zero, to indicate variable size. Some network layer
  //   (a summarizing LSTM) must be used to make a variable h known.
  //   d may be 1 for greyscale, 3 for color.
  //   NOTE that throughout the constructed network, the inputs/outputs all have
  //   the same [batch, height, width, depth] form, even if of a different size.
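  //   Example (illustrative): 1,36,0,1 specifies a batch of one greyscale
  //   image of fixed height 36 and variable width.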
  // ============ PLUMBING ============
  // [...] Execute ... networks in series (layers).
  // (...) Execute ... networks in parallel, with their output depths added.
  // R<d><net> Execute d replicas of net in parallel, with their output depths
  //   added.
  // Rx<net> Execute <net> with x-dimension reversal.
  // Ry<net> Execute <net> with y-dimension reversal.
  // S<y>,<x> Rescale 2-D input by shrink factor x,y, rearranging the data by
  //   increasing the depth of the input by factor xy.
  // Mp<y>,<x> Maxpool the input, reducing the size by an (x,y) rectangle.
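  //   Example (illustrative): S2,2 halves both height and width while
  //   multiplying the depth by 4; Mp3,3 max-pools over 3x3 rectangles.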
  // ============ FUNCTIONAL UNITS ============
  // C(s|t|r|l|m)<y>,<x>,<d> Convolves using an (x,y) window, with no shrinkage,
  //   random infill, producing d outputs, then applies a non-linearity:
  //   s: Sigmoid, t: Tanh, r: Relu, l: Linear, m: Softmax.
  // F(s|t|r|l|m)<d> Truly fully-connected with s|t|r|l|m non-linearity and d
  //   outputs. Connects to every x,y,depth position of the input, reducing
  //   height and width to 1, producing a single <d> vector as the output.
  //   Input height and width must be constant.
  //   For a sliding-window linear or non-linear map that connects just to the
  //   input depth, and leaves the input image size as-is, use a 1x1 convolution,
  //   e.g. Cr1,1,64 instead of Fr64.
  // L(f|r|b)(x|y)[s]<n> LSTM cell with n states/outputs.
  //   The LSTM must have one of:
  //     f runs the LSTM forward only.
  //     r runs the LSTM reversed only.
  //     b runs the LSTM bidirectionally.
  //   It will operate on either the x- or y-dimension, treating the other
  //   dimension independently (as if part of the batch).
  //   s (optional) summarizes the output in the requested dimension,
  //     outputting only the final step, collapsing the dimension to a
  //     single element.
  // LS<n> Forward-only LSTM cell in the x-direction, with built-in Softmax.
  // LE<n> Forward-only LSTM cell in the x-direction, with built-in softmax,
  //   with binary Encoding.
  // L2xy<n> Full 2-d LSTM operating in quad-directions (bidi in x and y) with
  //   all the output depths added.
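  //   Example (illustrative): Lfx96 is a forward-only LSTM run along x with 96
  //   outputs; Lfys48 runs forward along y and summarizes it to a single step,
  //   which is how a variable input height is made known.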
  // ============ OUTPUTS ============
  // The network description must finish with an output specification:
  // O(2|1|0)(l|s|c)<n> output layer with n classes.
  //   2 (heatmap) Output is a 2-d vector map of the input (possibly at a
  //     different scale).
  //   1 (sequence) Output is a 1-d sequence of vector values.
  //   0 (category) Output is a 0-d single vector value.
  //   l uses a logistic non-linearity on the output, allowing multiple
  //     hot elements in any output vector value.
  //   s uses a softmax non-linearity, with one-hot output in each value.
  //   c uses a softmax with CTC. Can only be used with 1 (sequence).
  //   NOTE1: Only O1s and O1c are currently supported.
  //   NOTE2: n is totally ignored, and is for compatibility purposes only. The
  //     actual number of output classes is obtained automatically from the
  //     unicharset.
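  // A complete example spec in this language (illustrative; resembles specs
  // used for tesseract LSTM training, but is not taken from this header):
  //   [1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c1]
  // i.e. a height-36 greyscale input, a 3x3 tanh convolution with 16 outputs,
  // 3x3 max-pooling, a summarizing y-LSTM, forward and reversed x-LSTMs,
  // another forward x-LSTM, and a 1-d softmax+CTC output.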
  Network *BuildFromString(const StaticShape &input_shape, const char **str);

private:
  // Parses an input specification and returns the result, which may include a
  // series.
  Network *ParseInput(const char **str);
  // Parses a sequential series of networks, defined by [<net><net>...].
  Network *ParseSeries(const StaticShape &input_shape, Input *input_layer, const char **str);
  // Parses a parallel set of networks, defined by (<net><net>...).
  Network *ParseParallel(const StaticShape &input_shape, const char **str);
  // Parses a network that begins with 'R'.
  Network *ParseR(const StaticShape &input_shape, const char **str);
  // Parses a network that begins with 'S'.
  Network *ParseS(const StaticShape &input_shape, const char **str);
  // Parses a network that begins with 'C'.
  Network *ParseC(const StaticShape &input_shape, const char **str);
  // Parses a network that begins with 'M'.
  Network *ParseM(const StaticShape &input_shape, const char **str);
  // Parses an LSTM network, either individual, bi- or quad-directional.
  Network *ParseLSTM(const StaticShape &input_shape, const char **str);
  // Builds a set of 4 LSTMs with x and y reversal, running in true parallel.
  static Network *BuildLSTMXYQuad(int num_inputs, int num_states);
  // Parses a fully-connected network.
  Network *ParseFullyConnected(const StaticShape &input_shape, const char **str);
  // Parses an Output spec.
  Network *ParseOutput(const StaticShape &input_shape, const char **str);

private:
  int num_softmax_outputs_;
};

} // namespace tesseract.

#endif // TESSERACT_LSTM_NETWORKBUILDER_H_