20#ifndef TESSERACT_TRAINING_VALIDATOR_H_
21#define TESSERACT_TRAINING_VALIDATOR_H_
// Static entry point returning bool. Takes a grapheme normalization mode, a
// report_errors flag, the input code points (src, read-only) and an output
// pointer (dest) to a vector of code-point vectors.
// NOTE(review): presumably validates/cleans src and writes the segmented
// pieces to *dest, returning false on invalid input -- confirm against the
// implementation; only the signature is visible here.
// NOTE(review): the leading number on the next lines looks like a
// line-number artifact from a source-listing extraction, not C++.
83 static bool ValidateCleanAndSegment(
GraphemeNormMode g_mode,
bool report_errors,
84 const std::vector<char32> &src,
85 std::vector<std::vector<char32>> *
dest);
90 return ch == kZeroWidthSpace ||
ch == kLeftToRightMark ||
ch == kRightToLeftMark ||
124 kVowelModifier =
'D',
125 kZeroWidthNonJoiner =
'z',
126 kZeroWidthJoiner =
'Z',
138 : script_(script), codes_used_(0), output_used_(0), report_errors_(report_errors) {}
// Static function returning a std::unique_ptr<Validator>, so the caller
// receives ownership of the returned object. Takes the script and a
// report_errors flag.
// NOTE(review): presumably a factory that picks a script-specific Validator
// subclass -- confirm against the implementation.
141 static std::unique_ptr<Validator> ScriptValidator(
ViramaScript script,
bool report_errors);
// Non-static member returning bool. Takes a grapheme normalization mode, the
// input code points (src, read-only) and an output pointer (dest) to a vector
// of code-point vectors. Unlike the static ValidateCleanAndSegment overload
// declared in this header, it has no report_errors parameter.
// NOTE(review): presumably the per-instance worker behind the static entry
// point -- confirm against the implementation.
148 bool ValidateCleanAndSegmentInternal(
GraphemeNormMode g_mode,
const std::vector<char32> &src,
149 std::vector<std::vector<char32>> *
dest);
// Static function mapping a sequence of UTF-32 code points to a ViramaScript
// value.
// NOTE(review): by its name this presumably returns the script that occurs
// most often in utf32; the counting and tie-breaking rules are not visible
// here -- confirm against the implementation.
155 static ViramaScript MostFrequentViramaScript(
const std::vector<char32> &utf32);
// Static predicate on a single code point.
// NOTE(review): presumably true when unicode is a virama character of one of
// the supported scripts -- the exact set is defined in the implementation.
157 static bool IsVirama(
char32 unicode);
// Static predicate on a single code point.
// NOTE(review): presumably true when unicode is a Vedic accent mark -- the
// exact code-point ranges are defined in the implementation.
159 static bool IsVedicAccent(
char32 unicode);
// Const member predicate taking no arguments.
// NOTE(review): presumably reports whether this validator's script writes
// post-virama consonants as subscripts -- confirm against the implementation.
161 bool IsSubscriptScript()
const;
167 output_.push_back(codes_[codes_used_].second);
168 return ++codes_used_ == codes_.size();
177 while (output_used_ + length < output_.size()) {
178 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_++]});
180 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
181 while (++output_used_ < output_.size()) {
182 parts_.back().push_back(output_[output_used_]);
190 output_.push_back(codes_[codes_used_].second);
191 MultiCodePart(length);
192 return ++codes_used_ == codes_.size();
// Non-const member with no return value; reads the given code points.
// NOTE(review): presumably classifies each code point of text and stores the
// (class, code point) pairs in this object -- confirm against the
// implementation.
200 void ComputeClassCodes(
const std::vector<char32> &text);
// Code-point and size constants. Block ranges quoted below are from the
// Unicode code charts; how each constant is used is not visible in this view.
// NOTE(review): the leading numbers on these lines look like line-number
// artifacts from a source-listing extraction, not C++.
// NOTE(review): presumably the number of code points per Indic script block.
207 static const int kIndicCodePageSize = 128;
// U+0900 is the first code point of the Devanagari block.
209 static const char32 kMinIndicUnicode = 0x900;
// U+0DFF is the last code point of the Sinhala block.
211 static const char32 kMaxSinhalaUnicode = 0xdff;
// U+17FF is the last code point of the Khmer block.
213 static const char32 kMaxViramaScriptUnicode = 0x17ff;
// U+0DCA SINHALA SIGN AL-LAKUNA.
215 static const char32 kSinhalaVirama = 0xdca;
// U+1039 MYANMAR SIGN VIRAMA.
216 static const char32 kMyanmarVirama = 0x1039;
// U+17D2 KHMER SIGN COENG.
217 static const char32 kKhmerVirama = 0x17d2;
// U+A9C0 JAVANESE PANGKON.
219 static const char32 kJavaneseVirama = 0xa9c0;
// U+A9DF is the last code point of the Javanese block.
220 static const char32 kMaxJavaneseUnicode = 0xa9df;
static const char32 kZeroWidthNonJoiner
virtual CharClass UnicodeToCharClass(char32 ch) const =0
std::vector< char32 > output_
static const char32 kInvalid
static const char32 kRightToLeftMark
bool UseMultiCode(unsigned length)
void MultiCodePart(unsigned length)
static const char32 kLeftToRightMark
std::pair< CharClass, char32 > IndicPair
static const char32 kZeroWidthSpace
static bool IsZeroWidthMark(char32 ch)
std::vector< IndicPair > codes_
static const char32 kZeroWidthJoiner
Validator(ViramaScript script, bool report_errors)
std::vector< std::vector< char32 > > parts_
virtual bool ConsumeGraphemeIfValid()=0