tesseract v5.3.3.20231005
tesseract::Validator Class Reference (abstract)

#include <validator.h>

Inheritance diagram for tesseract::Validator:
tesseract::TestableValidator tesseract::ValidateGrapheme tesseract::ValidateIndic tesseract::ValidateJavanese tesseract::ValidateKhmer tesseract::ValidateMyanmar

Public Member Functions

virtual ~Validator ()
 

Static Public Member Functions

static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
static bool IsZeroWidthMark (char32 ch)
 

Static Public Attributes

static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 

Protected Types

enum class  CharClass {
  kConsonant = 'C' , kVowel = 'V' , kVirama = 'H' , kMatra = 'M' ,
  kMatraPiece = 'P' , kVowelModifier = 'D' , kZeroWidthNonJoiner = 'z' , kZeroWidthJoiner = 'Z' ,
  kVedicMark = 'v' , kNukta = 'N' , kRobat = 'R' , kOther = 'O' ,
  kWhitespace = ' ' , kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 

Protected Member Functions

 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
virtual bool ConsumeGraphemeIfValid ()=0
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
virtual CharClass UnicodeToCharClass (char32 ch) const =0
 
void Clear ()
 

Static Protected Member Functions

static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 

Protected Attributes

ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 

Static Protected Attributes

static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 74 of file validator.h.

Member Typedef Documentation

◆ IndicPair

using tesseract::Validator::IndicPair = std::pair<CharClass, char32>
protected

Definition at line 135 of file validator.h.

Member Enumeration Documentation

◆ CharClass

enum class tesseract::Validator::CharClass
strong protected
Enumerator
kConsonant 
kVowel 
kVirama 
kMatra 
kMatraPiece 
kVowelModifier 
kZeroWidthNonJoiner 
kZeroWidthJoiner 
kVedicMark 
kNukta 
kRobat 
kOther 
kWhitespace 
kCombiner 

Definition at line 113 of file validator.h.

113 {
114 // NOTE: The values of the enum members are meaningless and arbitrary, ie
115 // they are not used for sorting, or any other risky application.
116 // The reason they are what they are is they are a single character
117 // abbreviation that can be used in a regexp/BNF definition of a grammar,
118 // IN A COMMENT, and still not relied upon in the code.
119 kConsonant = 'C',
120 kVowel = 'V',
121 kVirama = 'H', // (aka Halant)
122 kMatra = 'M', // (aka Dependent Vowel)
123 kMatraPiece = 'P', // unicode provides pieces of Matras.
124 kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
125 kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
126 kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
127 kVedicMark = 'v', // Modifiers can come modify any indic syllable.
128 kNukta = 'N', // Occurs only immediately after consonants.
129 kRobat = 'R', // Khmer only.
130 kOther = 'O', // (digits, measures, non-Indic, etc)
131 // Additional classes used only by ValidateGrapheme.
132 kWhitespace = ' ',
133 kCombiner = 'c', // Combiners other than virama.
134 };
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
static const char32 kZeroWidthJoiner
Definition: validator.h:98

Constructor & Destructor Documentation

◆ ~Validator()

tesseract::Validator::~Validator ( )
virtual default

◆ Validator()

tesseract::Validator::Validator ( ViramaScript  script,
bool  report_errors 
)
inline protected

Definition at line 137 of file validator.h.

138 : script_(script), codes_used_(0), output_used_(0), report_errors_(report_errors) {}
ViramaScript script_
Definition: validator.h:223
unsigned output_used_
Definition: validator.h:233
unsigned codes_used_
Definition: validator.h:231

Member Function Documentation

◆ Clear()

void tesseract::Validator::Clear ( )
protected

Definition at line 198 of file validator.cpp.

198 {
199 codes_.clear();
200 parts_.clear();
201 output_.clear();
202 codes_used_ = 0;
203 output_used_ = 0;
204}
std::vector< char32 > output_
Definition: validator.h:229
std::vector< IndicPair > codes_
Definition: validator.h:225
std::vector< std::vector< char32 > > parts_
Definition: validator.h:227

◆ CodeOnlyToOutput()

bool tesseract::Validator::CodeOnlyToOutput ( )
inline protected

Definition at line 166 of file validator.h.

166 {
167 output_.push_back(codes_[codes_used_].second);
168 return ++codes_used_ == codes_.size();
169 }

◆ ComputeClassCodes()

void tesseract::Validator::ComputeClassCodes ( const std::vector< char32 > &  text)
protected

Definition at line 190 of file validator.cpp.

190 {
191 codes_.reserve(text.size());
192 for (char32 c : text) {
193 codes_.emplace_back(UnicodeToCharClass(c), c);
194 }
195}
signed int char32
virtual CharClass UnicodeToCharClass(char32 ch) const =0

◆ ConsumeGraphemeIfValid()

virtual bool tesseract::Validator::ConsumeGraphemeIfValid ( )
protected pure virtual

◆ IsSubscriptScript()

bool tesseract::Validator::IsSubscriptScript ( ) const
protected

◆ IsVedicAccent()

bool tesseract::Validator::IsVedicAccent ( char32  unicode)
static protected

Definition at line 178 of file validator.cpp.

178 {
179 return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
180 (0x951 <= unicode && unicode <= 0x954);
181}

◆ IsVirama()

bool tesseract::Validator::IsVirama ( char32  unicode)
static protected

Definition at line 169 of file validator.cpp.

169 {
170 return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
171 (unicode & 0x7f) == 0x4d) ||
172 unicode == kSinhalaVirama || unicode == kJavaneseVirama || unicode == kMyanmarVirama ||
173 unicode == kKhmerVirama;
174}
static const char32 kSinhalaVirama
Definition: validator.h:215
static const char32 kKhmerVirama
Definition: validator.h:217
static const char32 kJavaneseVirama
Definition: validator.h:219
static const char32 kMaxSinhalaUnicode
Definition: validator.h:211
static const char32 kMyanmarVirama
Definition: validator.h:216
static const char32 kMinIndicUnicode
Definition: validator.h:209

◆ IsZeroWidthMark()

static bool tesseract::Validator::IsZeroWidthMark ( char32  ch)
inline static

Definition at line 89 of file validator.h.

89 {
90 return ch == kZeroWidthSpace || ch == kLeftToRightMark || ch == kRightToLeftMark ||
91 ch == kInvalid;
92 }
static const char32 kInvalid
Definition: validator.h:101
static const char32 kRightToLeftMark
Definition: validator.h:100
static const char32 kLeftToRightMark
Definition: validator.h:99
static const char32 kZeroWidthSpace
Definition: validator.h:96

◆ MostFrequentViramaScript()

ViramaScript tesseract::Validator::MostFrequentViramaScript ( const std::vector< char32 > &  utf32)
static protected

Definition at line 135 of file validator.cpp.

135 {
136 std::unordered_map<int, int> histogram;
137 for (char32 ch : utf32) {
138 // Determine the codepage base. For the Indic scripts, Khmer and Javanese,
139 // it is sufficient to divide by kIndicCodePageSize but Myanmar is all over
140 // the unicode code space, so use its script id.
141 int base = ch / kIndicCodePageSize;
142 IcuErrorCode err;
143 UScriptCode script_code = uscript_getScript(ch, err);
144 if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) ||
145 script_code == USCRIPT_MYANMAR) {
146 if (script_code == USCRIPT_MYANMAR) {
147 base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
148 }
149 ++histogram[base];
150 }
151 }
152 if (!histogram.empty()) {
153 int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;
154 auto codebase = static_cast<char32>(base * kIndicCodePageSize);
155 // Check for validity.
156 if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
157 codebase == static_cast<char32>(ViramaScript::kJavanese) ||
158 codebase == static_cast<char32>(ViramaScript::kKhmer) ||
159 (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
160 codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
161 return static_cast<ViramaScript>(codebase);
162 }
163 }
164 return ViramaScript::kNonVirama;
165}
static const int kIndicCodePageSize
Definition: validator.h:207
static const char32 kMaxJavaneseUnicode
Definition: validator.h:220

◆ MoveResultsToDest()

void tesseract::Validator::MoveResultsToDest ( GraphemeNormMode  g_mode,
std::vector< std::vector< char32 > > *  dest 
)
protected

Definition at line 106 of file validator.cpp.

106 {
107 if (g_mode == GraphemeNormMode::kIndividualUnicodes) {
108 // Append each element of the combined output_ that we made as a new vector
109 // in dest.
110 dest->reserve(dest->size() + output_.size());
111 for (char32 ch : output_) {
112 dest->push_back({ch});
113 }
114 } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
115 // Append all the parts_ that we made onto dest.
116 std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
117 } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
118 // Append the combined output_ that we made onto dest as one new vector.
119 dest->push_back(std::vector<char32>());
120 output_.swap(dest->back());
121 } else { // kNone.
122 // Append the combined output_ that we made onto the last existing element
123 // of dest.
124 dest->back().insert(dest->back().end(), output_.begin(), output_.end());
125 }
126}
dest
Definition: upload.py:409

◆ MultiCodePart()

void tesseract::Validator::MultiCodePart ( unsigned  length)
inline protected

Definition at line 176 of file validator.h.

176 {
177 while (output_used_ + length < output_.size()) {
178 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_++]});
179 }
180 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
181 while (++output_used_ < output_.size()) {
182 parts_.back().push_back(output_[output_used_]);
183 }
184 }

◆ ScriptValidator()

std::unique_ptr< Validator > tesseract::Validator::ScriptValidator ( ViramaScript  script,
bool  report_errors 
)
static protected

Definition at line 71 of file validator.cpp.

71 {
72 switch (script) {
73#define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)
74 CASE(kNonVirama, ValidateGrapheme);
75 CASE(kJavanese, ValidateJavanese);
76 CASE(kMyanmar, ValidateMyanmar);
77 CASE(kKhmer, ValidateKhmer);
78#undef CASE
79 default:
80 return std::make_unique<ValidateIndic>(script, report_errors);
81 }
82}
#define CASE(e, T)

◆ UnicodeToCharClass()

virtual CharClass tesseract::Validator::UnicodeToCharClass ( char32  ch) const
protected pure virtual

◆ UseMultiCode()

bool tesseract::Validator::UseMultiCode ( unsigned  length)
inline protected

Definition at line 189 of file validator.h.

189 {
190 output_.push_back(codes_[codes_used_].second);
191 MultiCodePart(length);
192 return ++codes_used_ == codes_.size();
193 }
void MultiCodePart(unsigned length)
Definition: validator.h:176

◆ ValidateCleanAndSegment()

bool tesseract::Validator::ValidateCleanAndSegment ( GraphemeNormMode  g_mode,
bool  report_errors,
const std::vector< char32 > &  src,
std::vector< std::vector< char32 > > *  dest 
)
static

Definition at line 40 of file validator.cpp.

42 {
43 ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
44 std::vector<std::vector<char32>> graphemes;
45 ViramaScript script = MostFrequentViramaScript(src);
46 bool success = true;
47 if (script == ViramaScript::kNonVirama) {
48 // The grapheme segmenter's maximum segmentation is the grapheme unit, so
49 // up the mode by 1 to get the desired effect.
50 if (g_mode == GraphemeNormMode::kCombined) {
51 g_mode = GraphemeNormMode::kGlyphSplit;
52 } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
53 g_mode = GraphemeNormMode::kIndividualUnicodes;
54 }
55 // Just do grapheme segmentation.
56 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
57 } else {
58 success =
59 g_validator.ValidateCleanAndSegmentInternal(GraphemeNormMode::kGlyphSplit, src, &graphemes);
60 std::unique_ptr<Validator> validator(ScriptValidator(script, report_errors));
61 for (const auto &grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
63 success = false;
64 }
65 }
66 }
67 return success;
68}
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:135
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:71

◆ ValidateCleanAndSegmentInternal()

bool tesseract::Validator::ValidateCleanAndSegmentInternal ( GraphemeNormMode  g_mode,
const std::vector< char32 > &  src,
std::vector< std::vector< char32 > > *  dest 
)
protected

Definition at line 89 of file validator.cpp.

91 {
92 Clear();
93 ComputeClassCodes(src);
94 bool success = true;
95 for (codes_used_ = 0; codes_used_ < codes_.size();) {
96 if (!ConsumeGraphemeIfValid()) {
97 success = false;
98 ++codes_used_;
99 }
100 }
101 MoveResultsToDest(g_mode, dest);
102 return success;
103}
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:106
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:190
virtual bool ConsumeGraphemeIfValid()=0

Member Data Documentation

◆ codes_

std::vector<IndicPair> tesseract::Validator::codes_
protected

Definition at line 225 of file validator.h.

◆ codes_used_

unsigned tesseract::Validator::codes_used_
protected

Definition at line 231 of file validator.h.

◆ kIndicCodePageSize

const int tesseract::Validator::kIndicCodePageSize = 128
static protected

Definition at line 207 of file validator.h.

◆ kInvalid

const char32 tesseract::Validator::kInvalid = 0xfffd
static

Definition at line 101 of file validator.h.

◆ kJavaneseVirama

const char32 tesseract::Validator::kJavaneseVirama = 0xa9c0
static protected

Definition at line 219 of file validator.h.

◆ kKhmerVirama

const char32 tesseract::Validator::kKhmerVirama = 0x17d2
static protected

Definition at line 217 of file validator.h.

◆ kLeftToRightMark

const char32 tesseract::Validator::kLeftToRightMark = 0x200E
static

Definition at line 99 of file validator.h.

◆ kMaxJavaneseUnicode

const char32 tesseract::Validator::kMaxJavaneseUnicode = 0xa9df
static protected

Definition at line 220 of file validator.h.

◆ kMaxSinhalaUnicode

const char32 tesseract::Validator::kMaxSinhalaUnicode = 0xdff
static protected

Definition at line 211 of file validator.h.

◆ kMaxViramaScriptUnicode

const char32 tesseract::Validator::kMaxViramaScriptUnicode = 0x17ff
static protected

Definition at line 213 of file validator.h.

◆ kMinIndicUnicode

const char32 tesseract::Validator::kMinIndicUnicode = 0x900
static protected

Definition at line 209 of file validator.h.

◆ kMyanmarVirama

const char32 tesseract::Validator::kMyanmarVirama = 0x1039
static protected

Definition at line 216 of file validator.h.

◆ kRightToLeftMark

const char32 tesseract::Validator::kRightToLeftMark = 0x200F
static

Definition at line 100 of file validator.h.

◆ kSinhalaVirama

const char32 tesseract::Validator::kSinhalaVirama = 0xdca
static protected

Definition at line 215 of file validator.h.

◆ kZeroWidthJoiner

const char32 tesseract::Validator::kZeroWidthJoiner = 0x200D
static

Definition at line 98 of file validator.h.

◆ kZeroWidthNonJoiner

const char32 tesseract::Validator::kZeroWidthNonJoiner = 0x200C
static

Definition at line 97 of file validator.h.

◆ kZeroWidthSpace

const char32 tesseract::Validator::kZeroWidthSpace = 0x200B
static

Definition at line 96 of file validator.h.

◆ output_

std::vector<char32> tesseract::Validator::output_
protected

Definition at line 229 of file validator.h.

◆ output_used_

unsigned tesseract::Validator::output_used_
protected

Definition at line 233 of file validator.h.

◆ parts_

std::vector<std::vector<char32> > tesseract::Validator::parts_
protected

Definition at line 227 of file validator.h.

◆ report_errors_

bool tesseract::Validator::report_errors_
protected

Definition at line 235 of file validator.h.

◆ script_

ViramaScript tesseract::Validator::script_
protected

Definition at line 223 of file validator.h.


The documentation for this class was generated from the following files: