tesseract
v5.3.3.20231005
networkbuilder.h
Go to the documentation of this file.
1
2
// File: networkbuilder.h
3
// Description: Class to parse the network description language and
4
// build a corresponding network.
5
// Author: Ray Smith
6
//
7
// (C) Copyright 2014, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
18
19
#ifndef TESSERACT_LSTM_NETWORKBUILDER_H_
20
#define TESSERACT_LSTM_NETWORKBUILDER_H_
21
22
#include "
export.h
"
23
#include "
static_shape.h
"
24
#include "
stridemap.h
"
25
26
class
UNICHARSET;
27
28
namespace
tesseract
{
29
30
class
Input;
31
class
Network;
32
class
Parallel;
33
class
TRand;
34
35
class
TESS_COMMON_TRAINING_API
NetworkBuilder
{
36
public
:
37
explicit
NetworkBuilder
(
int
num_softmax_outputs) : num_softmax_outputs_(num_softmax_outputs) {}
38
39
// Builds a network with a network_spec in the network description
40
// language, to recognize a character set of num_outputs size.
41
// If append_index is non-negative, then *network must be non-null and the
42
// given network_spec will be appended to *network AFTER append_index, with
43
// the top of the input *network discarded.
44
// Note that network_spec is call by value to allow a non-const char* pointer
45
// into the string for BuildFromString.
46
// net_flags control network behavior according to the NetworkFlags enum.
47
// The resulting network is returned via **network.
48
// Returns false if something failed.
49
static
bool
InitNetwork(
int
num_outputs,
const
char
*network_spec,
int
append_index,
50
int
net_flags,
float
weight_range,
TRand
*randomizer,
Network
**network);
51
52
// Parses the given string and returns a network according to the following
53
// language:
54
// ============ Syntax of description below: ============
55
// <d> represents a number.
56
// <net> represents any single network element, including (recursively) a
57
// [...] series or (...) parallel construct.
58
// (s|t|r|l|m) (regex notation) represents a single required letter.
59
// NOTE THAT THROUGHOUT, x and y are REVERSED from conventional mathematics,
60
// to use the same convention as Tensor Flow. The reason TF adopts this
61
// convention is to eliminate the need to transpose images on input, since
62
// adjacent memory locations in images increase x and then y, while adjacent
63
// memory locations in tensors in TF, and NetworkIO in tesseract increase the
64
// rightmost index first, then the next-left and so-on, like C arrays.
65
// ============ INPUTS ============
66
// <b>,<h>,<w>,<d> A batch of b images with height h, width w, and depth d.
67
// b, h and/or w may be zero, to indicate variable size. Some network layer
68
// (summarizing LSTM) must be used to make a variable h known.
69
// d may be 1 for greyscale, 3 for color.
70
// NOTE that throughout the constructed network, the inputs/outputs are all of
71
// the same [batch,height,width,depth] dimensions, even if a different size.
72
// ============ PLUMBING ============
73
// [...] Execute ... networks in series (layers).
74
// (...) Execute ... networks in parallel, with their output depths added.
75
// R<d><net> Execute d replicas of net in parallel, with their output depths
76
// added.
77
// Rx<net> Execute <net> with x-dimension reversal.
78
// Ry<net> Execute <net> with y-dimension reversal.
79
// S<y>,<x> Rescale 2-D input by shrink factor x,y, rearranging the data by
80
// increasing the depth of the input by factor xy.
81
// Mp<y>,<x> Maxpool the input, reducing the size by an (x,y) rectangle.
82
// ============ FUNCTIONAL UNITS ============
83
// C(s|t|r|l|m)<y>,<x>,<d> Convolves using a (x,y) window, with no shrinkage,
84
// random infill, producing d outputs, then applies a non-linearity:
85
// s: Sigmoid, t: Tanh, r: Relu, l: Linear, m: Softmax.
86
// F(s|t|r|l|m)<d> Truly fully-connected with s|t|r|l|m non-linearity and d
87
// outputs. Connects to every x,y,depth position of the input, reducing
88
// height, width to 1, producing a single <d> vector as the output.
89
// Input height and width must be constant.
90
// For a sliding-window linear or non-linear map that connects just to the
91
// input depth, and leaves the input image size as-is, use a 1x1 convolution
92
// eg. Cr1,1,64 instead of Fr64.
93
// L(f|r|b)(x|y)[s]<n> LSTM cell with n states/outputs.
94
// The LSTM must have one of:
95
// f runs the LSTM forward only.
96
// r runs the LSTM reversed only.
97
// b runs the LSTM bidirectionally.
98
// It will operate on either the x- or y-dimension, treating the other
99
// dimension independently (as if part of the batch).
100
// s (optional) summarizes the output in the requested dimension,
101
// outputting only the final step, collapsing the dimension to a
102
// single element.
103
// LS<n> Forward-only LSTM cell in the x-direction, with built-in Softmax.
104
// LE<n> Forward-only LSTM cell in the x-direction, with built-in softmax,
105
// with binary Encoding.
106
// L2xy<n> Full 2-d LSTM operating in quad-directions (bidi in x and y) and
107
// all the output depths added.
108
// ============ OUTPUTS ============
109
// The network description must finish with an output specification:
110
// O(2|1|0)(l|s|c)<n> output layer with n classes
111
// 2 (heatmap) Output is a 2-d vector map of the input (possibly at
112
// different scale).
113
// 1 (sequence) Output is a 1-d sequence of vector values.
114
// 0 (category) Output is a 0-d single vector value.
115
// l uses a logistic non-linearity on the output, allowing multiple
116
// hot elements in any output vector value.
117
// s uses a softmax non-linearity, with one-hot output in each value.
118
// c uses a softmax with CTC. Can only be used with s (sequence).
119
// NOTE1: Only O1s and O1c are currently supported.
120
// NOTE2: n is totally ignored, and for compatibility purposes only. The
121
// output number of classes is obtained automatically from the
122
// unicharset.
123
Network
*BuildFromString(
const
StaticShape
&input_shape,
const
char
**str);
124
125
private
:
126
// Parses an input specification and returns the result, which may include a
127
// series.
128
Network
*ParseInput(
const
char
**str);
129
// Parses a sequential series of networks, defined by [<net><net>...].
130
Network
*ParseSeries(
const
StaticShape
&input_shape,
Input
*input_layer,
const
char
**str);
131
// Parses a parallel set of networks, defined by (<net><net>...).
132
Network
*ParseParallel(
const
StaticShape
&input_shape,
const
char
**str);
133
// Parses a network that begins with 'R'.
134
Network
*ParseR(
const
StaticShape
&input_shape,
const
char
**str);
135
// Parses a network that begins with 'S'.
136
Network
*ParseS(
const
StaticShape
&input_shape,
const
char
**str);
137
// Parses a network that begins with 'C'.
138
Network
*ParseC(
const
StaticShape
&input_shape,
const
char
**str);
139
// Parses a network that begins with 'M'.
140
Network
*ParseM(
const
StaticShape
&input_shape,
const
char
**str);
141
// Parses an LSTM network, either individual, bi- or quad-directional.
142
Network
*ParseLSTM(
const
StaticShape
&input_shape,
const
char
**str);
143
// Builds a set of 4 lstms with t and y reversal, running in true parallel.
144
static
Network
*BuildLSTMXYQuad(
int
num_inputs,
int
num_states);
145
// Parses a Fully connected network.
146
Network
*ParseFullyConnected(
const
StaticShape
&input_shape,
const
char
**str);
147
// Parses an Output spec.
148
Network
*ParseOutput(
const
StaticShape
&input_shape,
const
char
**str);
149
150
private
:
151
int
num_softmax_outputs_;
152
};
153
154
}
// namespace tesseract.
155
156
#endif
// TESSERACT_LSTM_NETWORKBUILDER_H_
static_shape.h
stridemap.h
tesseract
Definition:
baseapi.h:39
tesseract::TRand
Definition:
helpers.h:61
tesseract::Input
Definition:
input.h:27
tesseract::Network
Definition:
network.h:103
tesseract::StaticShape
Definition:
static_shape.h:38
tesseract::NetworkBuilder
Definition:
networkbuilder.h:35
tesseract::NetworkBuilder::NetworkBuilder
NetworkBuilder(int num_softmax_outputs)
Definition:
networkbuilder.h:37
export.h
src
training
common
networkbuilder.h
Generated on Thu Oct 5 2023 22:10:27 for tesseract by
1.9.4