tesseract v5.3.3.20231005
validator.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: validator.h
3 * Description: Base class for various text validators. Intended mainly for
4 * scripts that use a virama character.
5 * Author: Ray Smith
6 *
7 * (C) Copyright 2017, Google Inc.
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 **********************************************************************/
19
20#ifndef TESSERACT_TRAINING_VALIDATOR_H_
21#define TESSERACT_TRAINING_VALIDATOR_H_
22
23#include "export.h"
24
25#include <tesseract/unichar.h>
26
27#include <memory>
28#include <vector>
29
30namespace tesseract {
31
32// Different kinds of grapheme normalization - not just for Indic!
33// A grapheme is a syllable unit in Indic and can be several unicodes.
34// In other scripts, a grapheme is a base character and accent/diacritic
35// combination, as not all accented characters have a single composed form.
36enum class GraphemeNormMode {
 // Selects how finely validated text is segmented on output.
 // NOTE(review): listing lines 38, 40, 46 and 48 are absent from this copy;
 // presumably each held the enumerator described by the comment just above
 // it. Restore them from the original header before compiling.
37 // Validation result is a single string, even if input is multi-word.
39 // Standard unicode graphemes are validated and output as grapheme units.
41 // Graphemes are validated and sub-divided. For virama-using scripts, units
42 // that correspond to repeatable glyphs are generated. (Mostly single unicodes,
43 // but viramas and joiners are paired with the most sensible neighbor.)
44 // For non-virama scripts, this means that base/accent pairs are separated,
45 // i.e. the output is individual unicodes.
47 // The output is always single unicodes, regardless of the script.
49};
50
51// An enum representing the scripts that use a virama character. It is
52// guaranteed that the value of any element, (except kNonVirama) can be cast
53// to a unicode (char32) value that represents the start of the unicode range
54// of the corresponding script.
55enum class ViramaScript : char32 {
 // Sentinel meaning "no virama-using script"; the only member whose value is
 // not the start of a script's unicode range.
56 kNonVirama = 0,
 // Every member below has the value of the first unicode in its script's
 // range, so it can be cast directly to a char32.
57 kDevanagari = 0x900,
58 kBengali = 0x980,
59 kGurmukhi = 0xa00,
60 kGujarati = 0xa80,
61 kOriya = 0xb00,
62 kTamil = 0xb80,
63 kTelugu = 0xc00,
64 kKannada = 0xc80,
65 kMalayalam = 0xd00,
66 kSinhala = 0xd80,
67 kMyanmar = 0x1000,
68 kKhmer = 0x1780,
69 kJavanese = 0xa980,
70};
71
72// Base class offers a validation API and protected methods to allow subclasses
73// to easily build the validated/segmented output.
74class TESS_UNICHARSET_TRAINING_API Validator {
75public:
76 // Validates and cleans the src vector of unicodes to the *dest, according to
77 // g_mode. In the case of kSingleString, a single vector containing the whole
78 // result is added to *dest. With kCombined, multiple vectors are added to
79 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
80 // added to *dest with a smaller unit representing a glyph in each.
81 // In case of validation error, returns false and outputs as much as possible
82 // of the input, without discarding invalid text.
83 static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,
84 const std::vector<char32> &src,
85 std::vector<std::vector<char32>> *dest);
86
87 // Returns true if the unicode ch is a non-printing zero-width mark of no
88 // significance to OCR training or evaluation. Only the zero-width space, the
 // two directional marks and kInvalid are tested; the zero-width joiner and
 // non-joiner are not treated as marks by this function.
89 static bool IsZeroWidthMark(char32 ch) {
90 return ch == kZeroWidthSpace || ch == kLeftToRightMark || ch == kRightToLeftMark ||
91 ch == kInvalid;
92 }
93 virtual ~Validator();
94
95 // Some specific but universally useful unicodes.
 // NOTE(review): listing lines 97-100 are absent from this copy; presumably
 // they declare kZeroWidthNonJoiner, kZeroWidthJoiner, kLeftToRightMark and
 // kRightToLeftMark, two of which IsZeroWidthMark above relies on.
96 static const char32 kZeroWidthSpace;
101 static const char32 kInvalid;
102
103protected:
104 // These are more or less the character class identifiers in the ISCII
105 // standard, section 8. They have been augmented with the Unicode meta
106 // characters Zero Width Joiner and Zero Width Non Joiner, and the
107 // Unicode Vedic Marks.
108 // The best sources of information on Unicode and Indic scripts are:
109 // http://varamozhi.sourceforge.net/iscii91.pdf
110 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
111 // http://unicode.org/faq/indic.html
112 // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
113 enum class CharClass {
114 // NOTE: The values of the enum members are meaningless and arbitrary, i.e.
115 // they are not used for sorting, or any other risky application.
116 // They were chosen because each is a single-character abbreviation
117 // that can be used in a regexp/BNF definition of a grammar,
118 // IN A COMMENT, and still not relied upon in the code.
119 kConsonant = 'C',
120 kVowel = 'V',
121 kVirama = 'H', // (aka Halant)
122 kMatra = 'M', // (aka Dependent Vowel)
123 kMatraPiece = 'P', // Unicode provides pieces of Matras.
124 kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
125 kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
126 kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
127 kVedicMark = 'v', // Modifiers that can modify any Indic syllable.
128 kNukta = 'N', // Occurs only immediately after consonants.
129 kRobat = 'R', // Khmer only.
130 kOther = 'O', // (digits, measures, non-Indic, etc)
131 // Additional classes used only by ValidateGrapheme.
132 kWhitespace = ' ',
133 kCombiner = 'c', // Combiners other than virama.
134 };
 // A unicode paired with the character class assigned to it (class first).
135 using IndicPair = std::pair<CharClass, char32>;
136
 // Stores the script and error-reporting flag, and starts with nothing
 // consumed from codes_ and nothing moved from output_ to parts_.
137 Validator(ViramaScript script, bool report_errors)
138 : script_(script), codes_used_(0), output_used_(0), report_errors_(report_errors) {}
139
140 // Factory method that understands how to map script to the right subclass.
141 static std::unique_ptr<Validator> ScriptValidator(ViramaScript script, bool report_errors);
142
143 // Internal version of the public static ValidateCleanAndSegment.
144 // Validates and cleans the src vector of unicodes to the *dest, according to
145 // its type and the given g_mode.
146 // In case of validation error, returns false and returns as much as possible
147 // of the input, without discarding invalid text.
148 bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector<char32> &src,
149 std::vector<std::vector<char32>> *dest);
150 // Moves the results from parts_ or output_ to dest according to g_mode.
151 void MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest);
152
153 // Computes and returns the ViramaScript corresponding to the most frequent
154 // virama-using script in the input, or kNonVirama if none are present.
155 static ViramaScript MostFrequentViramaScript(const std::vector<char32> &utf32);
156 // Returns true if the given UTF-32 unicode is a "virama" character.
157 static bool IsVirama(char32 unicode);
158 // Returns true if the given UTF-32 unicode is a vedic accent.
159 static bool IsVedicAccent(char32 unicode);
160 // Returns true if the script is one that uses subscripts for conjuncts.
161 bool IsSubscriptScript() const;
162
163 // Helper function appends the next element of codes_ only to output_,
164 // without touching parts_.
165 // Returns true at the end of codes_.
 // NOTE(review): listing line 166, which presumably holds this helper's
 // signature and opening brace, is absent from this copy.
167 output_.push_back(codes_[codes_used_].second);
168 return ++codes_used_ == codes_.size();
169 }
170
171 // Helper function adds a length-element vector to parts_ from the last length
172 // elements of output_. If there are more than length unused elements in
173 // output_, adds unicodes as single-element vectors to parts_ to catch
174 // output_used_ up to output_.size() - length before adding the length-element
175 // vector.
 // NOTE(review): indexes output_[output_used_] unconditionally, so this
 // assumes output_ has at least one unused element — confirm for all callers.
176 void MultiCodePart(unsigned length) {
 // Emit each unused unicode that precedes the final group as its own part.
177 while (output_used_ + length < output_.size()) {
178 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_++]});
179 }
 // Start the final part with the first remaining unicode...
180 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
 // ...then append every later element of output_ to that same part.
181 while (++output_used_ < output_.size()) {
182 parts_.back().push_back(output_[output_used_]);
183 }
184 }
185
186 // Helper function appends the next element of codes_ to output_, and then
187 // calls MultiCodePart to add the appropriate components to parts_.
188 // Returns true at the end of codes_.
189 bool UseMultiCode(unsigned length) {
190 output_.push_back(codes_[codes_used_].second);
191 MultiCodePart(length);
192 return ++codes_used_ == codes_.size();
193 }
194
195 // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
196 // parts_ and output_. Returns true if a valid Grapheme was consumed,
197 // otherwise does not increment codes_used_.
198 virtual bool ConsumeGraphemeIfValid() = 0;
199 // Sets codes_ to the class codes for the given unicode text.
200 void ComputeClassCodes(const std::vector<char32> &text);
201 // Returns the CharClass corresponding to the given Unicode ch.
 // NOTE(review): listing line 202, presumably the declaration this comment
 // documents, is absent from this copy.
203 // Resets to the initial state.
204 void Clear();
205
206 // Number of unicodes in each Indic codepage.
207 static const int kIndicCodePageSize = 128;
208 // Lowest unicode value of any Indic script. (Devanagari).
209 static const char32 kMinIndicUnicode = 0x900;
210 // Highest unicode value of any consistent (ISCII-based) Indic script.
211 static const char32 kMaxSinhalaUnicode = 0xdff;
212 // Highest unicode value of any virama-using script. (Khmer).
213 static const char32 kMaxViramaScriptUnicode = 0x17ff;
214 // Some special unicodes.
215 static const char32 kSinhalaVirama = 0xdca;
216 static const char32 kMyanmarVirama = 0x1039;
217 static const char32 kKhmerVirama = 0x17d2;
218 // Javanese Script - aksarajawa
219 static const char32 kJavaneseVirama = 0xa9c0;
220 static const char32 kMaxJavaneseUnicode = 0xa9df;
221
222 // Script we are operating on.
 // NOTE(review): listing line 223, presumably the script_ member that the
 // constructor initializes, is absent from this copy.
224 // Input unicodes with assigned CharClass is the data to be validated.
225 std::vector<IndicPair> codes_;
226 // Glyph-like components of the input.
227 std::vector<std::vector<char32>> parts_;
228 // Copied validated unicodes from codes_ that are OK to output.
229 std::vector<char32> output_;
230 // The number of elements of codes_ that have been processed so far.
231 unsigned codes_used_;
232 // The number of elements of output_ that have already been added to parts_.
233 unsigned output_used_;
234 // Log error messages for reasons why text is invalid.
 // NOTE(review): listing line 235, presumably the report_errors_ member that
 // the constructor initializes, is absent from this copy.
236};
237
238} // namespace tesseract
239
240#endif // TESSERACT_TRAINING_VALIDATOR_H_
GraphemeNormMode
Definition: validator.h:36
signed int char32
Definition: unichar.h:49
dest
Definition: upload.py:409
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
virtual CharClass UnicodeToCharClass(char32 ch) const =0
std::vector< char32 > output_
Definition: validator.h:229
static const char32 kInvalid
Definition: validator.h:101
static const char32 kRightToLeftMark
Definition: validator.h:100
unsigned output_used_
Definition: validator.h:233
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
void MultiCodePart(unsigned length)
Definition: validator.h:176
static const char32 kLeftToRightMark
Definition: validator.h:99
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:135
static const char32 kZeroWidthSpace
Definition: validator.h:96
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:89
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:137
std::vector< std::vector< char32 > > parts_
Definition: validator.h:227
virtual bool ConsumeGraphemeIfValid()=0