tesseract v5.3.3.20231005
sampleiterator.h
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
15
16#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
17#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
18
19namespace tesseract {
20
21class IndexMapBiDi;
22class IntFeatureMap;
23class ShapeTable;
24class TrainingSample;
25class TrainingSampleSet;
26struct UnicharAndFonts;
27
28// Iterator class to encapsulate the complex iteration involved in getting
29// all samples of all shapes needed for a classification problem.
30//
31// =====INPUTS TO Init FUNCTION=====
32// The charset_map defines a subset of the sample_set classes (when shape_table
33// is nullptr) or of the shape_table classes (when it is not nullptr).
34//
35// The shape_table (if not nullptr) defines the mapping from shapes to
36// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
37//
38// The sample_set holds the samples and provides indexed access to samples
39// of font_id/class_id pairs.
40//
41// If randomize is true, the samples are perturbed slightly, but the
42// perturbation is guaranteed to be the same for multiple identical
43// iterations.
44//
45// =====DIFFERENT COMBINATIONS OF INPUTS=====
46// nullptr shape_table:
47// Without a shape_table, everything works in UNICHAR_IDs.
48//
49// nullptr shape_table, nullptr charset_map:
50// Iterations simply run over the samples in the order the samples occur in the
51// input files.
52// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
53//
54// nullptr shape_table, non-nullptr charset_map:
55// When shape_table is nullptr, the charset_map indexes unichar_ids directly,
56// and an iteration returns all samples of all chars in the charset_map, which
57// is a subset of the full unicharset.
58// The iteration will be in groups of the same unichar_id, in the order
59// defined by the charset_map.
60// GetCompactClassID returns the charset_map index of a sample, and
61// GetSparseClassID returns the sample UNICHAR_ID.
62//
63// Non-nullptr shape_table:
64// With a shape_table, samples are grouped according to the shape_table, so
65// multiple UNICHAR_IDs and fonts may be grouped together, and everything
66// works in shape_ids.
67//
68// Non-nullptr shape_table, nullptr charset_map:
69// Iterations simply run over the samples in the order of shape_id.
70// GetCompactClassID and GetSparseClassID both return the shape_id.
71// (If you want the unichar_id or font_id, the sample still has them.)
72//
73// Non-nullptr shape_table, non-nullptr charset_map:
74// When shape_table is not nullptr, the charset_map indexes and subsets shapes
75// in the shape_table, and iterations will be in shape_table order, not
76// charset_map order.
77// GetCompactClassID returns the charset_map index of a shape, and
78// GetSparseClassID returns the shape_id.
79//
80// =====What is SampleIterator good for?=====
81// Inside a classifier training module, the SampleIterator has abstracted away
82// all the different modes above.
83// Use the following iteration to train your classifier:
84// for (it.Begin(); !it.AtEnd(); it.Next()) {
85// const TrainingSample& sample = it.GetSample();
86// int class_id = it.GetCompactClassID();
87// Your classifier may or may not be dealing with a shape_table, and may be
88// dealing with some subset of the character/shape set. It doesn't need to
89// know and shouldn't care. It is just learning shapes with compact class ids
90// in the range [0, it.CompactCharsetSize()).
92public:
95
96 void Clear();
97
98 // See class comment for arguments.
99 void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize,
101
102 // Iterator functions designed for use with a simple for loop:
103 // for (it.Begin(); !it.AtEnd(); it.Next()) {
104 // const TrainingSample& sample = it.GetSample();
105 // int class_id = it.GetCompactClassID();
106 // ...
107 // }
108 void Begin();
109 bool AtEnd() const;
110 const TrainingSample &GetSample() const;
112 // Returns the total index (from the original set of samples) of the current
113 // sample.
114 int GlobalSampleIndex() const;
115 // Returns the index of the current sample in compact charset space, so
116 // in a 2-class problem between x and y, the returned indices will all be
117 // 0 or 1, and have nothing to do with the unichar_ids.
118 // If the charset_map_ is nullptr, then this is equal to GetSparseClassID().
119 int GetCompactClassID() const;
120 // Returns the index of the current sample in sparse charset space, so
121 // in a 2-class problem between x and y, the returned indices will all be
122 // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
123 // with a shape_table_.
124 int GetSparseClassID() const;
125 // Moves on to the next indexable sample. If the end is reached, leaves
126 // the state such that AtEnd() is true.
127 void Next();
128
129 // Returns the size of the compact charset space.
130 int CompactCharsetSize() const;
131 // Returns the size of the sparse charset space.
132 int SparseCharsetSize() const;
133
134 const IndexMapBiDi &charset_map() const {
135 return *charset_map_;
136 }
137 const ShapeTable *shape_table() const {
138 return shape_table_;
139 }
140 // Sample set operations.
142 return sample_set_;
143 }
144
145 // A set of functions that do something to all the samples accessed by the
146 // iterator, as it is currently set up.
147
148 // Apply the supplied feature_space/feature_map transform to all samples
149 // accessed by this iterator.
150 void MapSampleFeatures(const IntFeatureMap &feature_map);
151
152 // Adjust the weights of all the samples to be uniform in the given charset.
153 // Returns the number of samples in the iterator.
154 int UniformSamples();
155
156 // Normalize the weights of all the samples defined by the iterator so they
157 // sum to 1. Returns the minimum assigned sample weight.
158 double NormalizeSamples();
159
160private:
161 // Helper returns the current UnicharAndFonts shape entry.
162 const UnicharAndFonts *GetShapeEntry() const;
163
164 // Map to subset the actual charset space.
165 const IndexMapBiDi *charset_map_;
166 // Shape table to recombine character classes into shapes.
167 const ShapeTable *shape_table_;
168 // The samples to iterate over.
169 TrainingSampleSet *sample_set_;
170 // Flag to control randomizing the sample features.
171 bool randomize_;
172 // Shape table owned by this iterator, used to iterate over character classes.
173 ShapeTable *owned_shape_table_;
174
175 // Top-level iteration. Shape index in sparse charset_map space.
176 int shape_index_;
177 int num_shapes_;
178 // Index to the character class within a shape.
179 int shape_char_index_;
180 int num_shape_chars_;
181 // Index to the font within a shape/class pair.
182 int shape_font_index_;
183 int num_shape_fonts_;
184 // The lowest level iteration. sample_index_/num_samples_ counts samples
185 // in the current shape/class/font combination.
186 int sample_index_;
187 int num_samples_;
188};
189
190} // namespace tesseract.
191
192#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
const TrainingSample & GetSample() const
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
const TrainingSampleSet * sample_set() const
void MapSampleFeatures(const IntFeatureMap &feature_map)
const IndexMapBiDi & charset_map() const
TrainingSample * MutableSample() const
const ShapeTable * shape_table() const