21 #ifndef TESSERACT_TRAINING_VALIDATOR_H_ 22 #define TESSERACT_TRAINING_VALIDATOR_H_ 82 const std::vector<char32>& src,
83 std::vector<std::vector<char32>>* dest);
88 return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
89 ch == kRightToLeftMark || ch == kInvalid;
122 kVowelModifier =
'D',
123 kZeroWidthNonJoiner =
'z',
124 kZeroWidthJoiner =
'Z',
139 report_errors_(report_errors) {}
142 static std::unique_ptr<Validator> ScriptValidator(
ViramaScript script,
151 const std::vector<char32>& src,
152 std::vector<std::vector<char32>>* dest);
155 std::vector<std::vector<char32>>* dest);
160 const std::vector<char32>& utf32);
// Static predicate over a single char32 code point; returns bool.
// NOTE(review): going by the name, this reports whether `unicode` is a virama
// character. The definition is not part of this listing, so the exact set of
// code points accepted should be confirmed against the implementation.
162 static bool IsVirama(
char32 unicode);
// Static predicate over a single char32 code point; returns bool.
// NOTE(review): going by the name, this reports whether `unicode` is a Vedic
// accent mark. The definition is not part of this listing, so the accepted
// code-point range should be confirmed against the implementation.
164 static bool IsVedicAccent(
char32 unicode);
// Const member query taking no arguments; returns bool and, being const,
// does not modify the validator.
// NOTE(review): presumably answers whether this validator's script is one of
// the "subscript" scripts; which scripts qualify is decided in the definition,
// which is not visible here — confirm.
166 bool IsSubscriptScript()
const;
172 output_.push_back(codes_[codes_used_].second);
173 return ++codes_used_ == codes_.size();
182 while (output_used_ + length < output_.size()) {
184 std::initializer_list<char32>{output_[output_used_++]});
186 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
187 while (++output_used_ < output_.size()) {
188 parts_.back().push_back(output_[output_used_]);
196 output_.push_back(codes_[codes_used_].second);
197 MultiCodePart(length);
198 return ++codes_used_ == codes_.size();
// Pure virtual hook: every concrete subclass must supply its own
// implementation. Takes no arguments and returns bool.
// NOTE(review): by its name, it consumes the next grapheme when it is valid
// and reports the outcome through the return value; the exact meaning of
// true/false is defined by the overrides, not shown here — confirm.
204 virtual bool ConsumeGraphemeIfValid() = 0;
// Takes the input text as a read-only vector of char32 code points and
// returns nothing, so any result is stored as object state.
// NOTE(review): presumably fills the codes_ member with a (CharClass, char32)
// pair per input character; the definition is not in this listing — confirm.
206 void ComputeClassCodes(
const std::vector<char32>& text);
// Number of code points in one Indic code page (128).
// NOTE(review): how this is used to map between scripts is not visible in
// this listing.
213 static const int kIndicCodePageSize = 128;
// Lowest code point of the Indic range handled here (U+0900).
215 static const char32 kMinIndicUnicode = 0x900;
// Upper bound of the range that runs through Sinhala (U+0DFF).
// NOTE(review): whether callers treat this bound as inclusive is not shown.
217 static const char32 kMaxSinhalaUnicode = 0xdff;
// Upper bound of the code points belonging to any virama-using script
// (U+17FF). NOTE(review): inclusive/exclusive use is not shown here.
219 static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Script-specific virama code points: Sinhala (U+0DCA), Myanmar (U+1039)
// and Khmer (U+17D2).
221 static const char32 kSinhalaVirama = 0xdca;
222 static const char32 kMyanmarVirama = 0x1039;
223 static const char32 kKhmerVirama = 0x17d2;
243 #endif // TESSERACT_TRAINING_VALIDATOR_H_ static const char32 kInvalid
static const char32 kLeftToRightMark
std::pair< CharClass, char32 > IndicPair
static const char32 kZeroWidthSpace
std::vector< IndicPair > codes_
Validator(ViramaScript script, bool report_errors)
static bool IsZeroWidthMark(char32 ch)
void MultiCodePart(int length)
std::vector< std::vector< char32 > > parts_
static const char32 kRightToLeftMark
std::vector< char32 > output_
bool UseMultiCode(int length)
static const char32 kZeroWidthJoiner
static const char32 kZeroWidthNonJoiner