tesseract v5.3.3.20231005
tesseract::ValidateKhmer Class Reference

#include <validate_khmer.h>

Inheritance diagram for tesseract::ValidateKhmer:
tesseract::Validator

Public Member Functions

 ValidateKhmer (ViramaScript script, bool report_errors)
 
 ~ValidateKhmer () override=default
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
virtual bool ConsumeGraphemeIfValid ()=0
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
virtual CharClass UnicodeToCharClass (char32 ch) const =0
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum class  CharClass {
  kConsonant = 'C' , kVowel = 'V' , kVirama = 'H' , kMatra = 'M' ,
  kMatraPiece = 'P' , kVowelModifier = 'D' , kZeroWidthNonJoiner = 'z' , kZeroWidthJoiner = 'Z' ,
  kVedicMark = 'v' , kNukta = 'N' , kRobat = 'R' , kOther = 'O' ,
  kWhitespace = ' ' , kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 9 of file validate_khmer.h.

Constructor & Destructor Documentation

◆ ValidateKhmer()

tesseract::ValidateKhmer::ValidateKhmer ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 11 of file validate_khmer.h.

11: Validator(script, report_errors) {}
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:137

◆ ~ValidateKhmer()

tesseract::ValidateKhmer::~ValidateKhmer ( )
override default

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateKhmer::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 20 of file validate_khmer.cpp.

20 {
21 const unsigned num_codes = codes_.size();
22 if (codes_used_ == num_codes) {
23 return false;
24 }
25 if (codes_[codes_used_].first == CharClass::kOther) {
26 UseMultiCode(1);
27 return true;
28 }
30 if (report_errors_) {
31 tprintf("Invalid start of Khmer syllable:0x%x\n", codes_[codes_used_].second);
32 }
33 return false;
34 }
35 if (UseMultiCode(1)) {
36 return true;
37 }
38 if (codes_[codes_used_].first == CharClass::kRobat ||
40 if (UseMultiCode(1)) {
41 return true;
42 }
43 }
44 while (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&
47 if (UseMultiCode(2)) {
48 return true;
49 }
50 if (codes_[codes_used_].first == CharClass::kRobat) {
51 if (UseMultiCode(1)) {
52 return true;
53 }
54 }
55 }
56 unsigned num_matra_parts = 0;
57 if (codes_[codes_used_].second == kZeroWidthJoiner ||
59 if (CodeOnlyToOutput()) {
60 if (report_errors_) {
61 tprintf("Unterminated joiner: 0x%x\n", output_.back());
62 }
63 return false;
64 }
65 ++num_matra_parts;
66 }
67 // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
68 // own or as an addition to other matras.
69 if (codes_[codes_used_].first == CharClass::kMatra ||
71 ++num_matra_parts;
72 if (UseMultiCode(num_matra_parts)) {
73 return true;
74 }
75 } else if (num_matra_parts) {
76 if (report_errors_) {
77 tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", output_.back(),
78 codes_[codes_used_].second);
79 }
80 return false;
81 }
84 if (UseMultiCode(1)) {
85 return true;
86 }
87 }
89 if (UseMultiCode(1)) {
90 return true;
91 }
92 }
93 if (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&
96 if (UseMultiCode(2)) {
97 return true;
98 }
99 }
100 return true;
101}
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
std::vector< char32 > output_
Definition: validator.h:229
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateKhmer::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 103 of file validate_khmer.cpp.

103 {
104 if (IsVedicAccent(ch)) {
106 }
107 if (ch == kZeroWidthNonJoiner) {
109 }
110 if (ch == kZeroWidthJoiner) {
112 }
113 // Offset from the start of the relevant unicode code block aka code page.
114 int off = ch - static_cast<char32>(script_);
115 // Anything in another code block is other.
116 if (off < 0 || off >= kIndicCodePageSize) {
117 return CharClass::kOther;
118 }
119 if (off <= 0x33) {
121 }
122 if (off <= 0x45) {
123 return CharClass::kMatra;
124 }
125 if (off == 0x46) {
127 }
128 if (off == 0x4c) {
129 return CharClass::kRobat;
130 }
131 if (off == 0x49 || off == 0x4a) {
132 return CharClass::kNukta;
133 }
134 if (off <= 0x51) {
136 }
137 if (off == 0x52) {
138 return CharClass::kVirama;
139 }
140 return CharClass::kOther;
141}
signed int char32
ViramaScript script_
Definition: validator.h:223
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:178
static const int kIndicCodePageSize
Definition: validator.h:207

The documentation for this class was generated from the following files: