tesseract v5.3.3.20231005
validate_khmer.cpp
Go to the documentation of this file.
1#include "validate_khmer.h"
2#include "errcode.h"
3#include "tprintf.h"
4
5namespace tesseract {
6
7// Returns whether the codes match the pattern for a Khmer grapheme.
8// Taken from the Unicode standard:
9// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
10// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
11// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
12// Translated to the codes used by the CharClass enum:
13// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
14// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
15// Also the Consonant class here includes independent vowels, as they are
16// treated the same anyway.
17// In the split grapheme mode, the only characters that get grouped are the
18// HC and the {Z|z}M. The Unicode chapter on Khmer only mentions the joiners in
19// the BNF syntax, without explaining what they are supposed to do.
21 const unsigned num_codes = codes_.size();
22 if (codes_used_ == num_codes) {
23 return false;
24 }
25 if (codes_[codes_used_].first == CharClass::kOther) {
26 UseMultiCode(1);
27 return true;
28 }
30 if (report_errors_) {
31 tprintf("Invalid start of Khmer syllable:0x%x\n", codes_[codes_used_].second);
32 }
33 return false;
34 }
35 if (UseMultiCode(1)) {
36 return true;
37 }
38 if (codes_[codes_used_].first == CharClass::kRobat ||
40 if (UseMultiCode(1)) {
41 return true;
42 }
43 }
44 while (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&
47 if (UseMultiCode(2)) {
48 return true;
49 }
50 if (codes_[codes_used_].first == CharClass::kRobat) {
51 if (UseMultiCode(1)) {
52 return true;
53 }
54 }
55 }
56 unsigned num_matra_parts = 0;
57 if (codes_[codes_used_].second == kZeroWidthJoiner ||
59 if (CodeOnlyToOutput()) {
60 if (report_errors_) {
61 tprintf("Unterminated joiner: 0x%x\n", output_.back());
62 }
63 return false;
64 }
65 ++num_matra_parts;
66 }
67 // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
68 // own or as an addition to other matras.
69 if (codes_[codes_used_].first == CharClass::kMatra ||
71 ++num_matra_parts;
72 if (UseMultiCode(num_matra_parts)) {
73 return true;
74 }
75 } else if (num_matra_parts) {
76 if (report_errors_) {
77 tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", output_.back(),
78 codes_[codes_used_].second);
79 }
80 return false;
81 }
84 if (UseMultiCode(1)) {
85 return true;
86 }
87 }
89 if (UseMultiCode(1)) {
90 return true;
91 }
92 }
93 if (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&
96 if (UseMultiCode(2)) {
97 return true;
98 }
99 }
100 return true;
101}
102
104 if (IsVedicAccent(ch)) {
106 }
107 if (ch == kZeroWidthNonJoiner) {
109 }
110 if (ch == kZeroWidthJoiner) {
112 }
113 // Offset from the start of the relevant unicode code block aka code page.
114 int off = ch - static_cast<char32>(script_);
115 // Anything in another code block is other.
116 if (off < 0 || off >= kIndicCodePageSize) {
117 return CharClass::kOther;
118 }
119 if (off <= 0x33) {
121 }
122 if (off <= 0x45) {
123 return CharClass::kMatra;
124 }
125 if (off == 0x46) {
127 }
128 if (off == 0x4c) {
129 return CharClass::kRobat;
130 }
131 if (off == 0x49 || off == 0x4a) {
132 return CharClass::kNukta;
133 }
134 if (off <= 0x51) {
136 }
137 if (off == 0x52) {
138 return CharClass::kVirama;
139 }
140 return CharClass::kOther;
141}
142
143} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:49
bool ConsumeGraphemeIfValid() override
CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
std::vector< char32 > output_
Definition: validator.h:229
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:178
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
static const int kIndicCodePageSize
Definition: validator.h:207
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98