tesseract v5.3.3.20231005
validate_javanese.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: validate_javanese.cpp
3 * Description: Text validator for Javanese Script - aksara jawa.
4 * Author: Shree Devi Kumar
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 **********************************************************************/
17
18#include "validate_javanese.h"
19#include "errcode.h"
20#include "tprintf.h"
21
22namespace tesseract {
23
24// Returns whether codes matches the pattern for a Javanese Grapheme.
25// Taken from unicode standard:
26// http://www.unicode.org/charts/PDF/UA980.pdf
27// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
28// The Consonant class here includes independent vowels.
29// The order of components in an orthographic syllable as expressed in BNF is:
30// {C F} C {{R}Y} {V{A}} {Z}
31// Translated to the codes used by the CharClass enum:
32// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
33// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
34// Validation rules copied from validate_indic.cpp and modified for Javanese.
35// Indic - for reference
36// + vowel Grapheme: V[D](v)*
37// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
38
// NOTE(review): this is a documentation-page extract. The function signature
// (listing line 39) and the `case` labels (listing lines 41, 43-44, 46-47, 54)
// plus listing line 52 are absent. The cross-reference list at the end of the
// page suggests this body belongs to ValidateJavanese::ConsumeGraphemeIfValid()
// -- confirm against the real file before relying on it.
// Dispatches on the character class of the next unconsumed code,
// codes_[codes_used_].first, and returns whether a grapheme could be consumed.
40 switch (codes_[codes_used_].first) {
// The tail is only attempted when the head succeeded (&& short-circuits).
42 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
45 return ConsumeVowelIfValid();
48 // Apart from within an aksara, joiners are silently dropped.
// The drop is only reported when report_errors_ is set; it is not a failure
// (this branch returns true).
49 if (report_errors_) {
50 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
51 }
53 return true;
// A single code is passed through via UseMultiCode(1); its return value is
// deliberately ignored here and the branch always succeeds.
55 UseMultiCode(1);
56 return true;
57 default:
// Any other class cannot start a grapheme: optionally log the class (printed
// with %c, so presumably the enum values are character codes -- TODO confirm)
// and the code point, then reject.
58 if (report_errors_) {
59 tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
60 static_cast<int>(codes_[codes_used_].first),
61 codes_[codes_used_].second);
62 }
63 return false;
64 }
65}
66
67// Helper consumes/copies a virama and any associated post-virama joiners.
68// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
69// no joiner at all) must be followed by a consonant.
70// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
71// consonant, space, or character from a different script. We clean up the
72// representation to make it consistent by adding a ZWNJ if missing from a
73// non-linking virama. Returns false with an invalid sequence.
// Parameters, as used by the callers visible in this file:
//   joiner     - the pre-virama joiner already seen by the caller, or a pair
//                whose class is CharClass::kOther when there was none.
//   post_matra - true when called from ConsumeConsonantTailIfValid, false when
//                called from ConsumeConsonantHeadIfValid.
// Returns false (after optionally logging via tprintf when report_errors_ is
// set) for a rejected sequence, true otherwise.
// NOTE(review): this is a documentation-page extract; listing lines 77, 91,
// 116, 118 and 132 are absent, so the statements at those positions are not
// described here.
74bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
75 const unsigned num_codes = codes_.size();
// No pre-virama joiner was supplied by the caller.
76 if (joiner.first == CharClass::kOther) {
// Case: the next code is a ZWJ.
78 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {
79 // Post-matra viramas must be explicit, so no joiners allowed here.
80 if (post_matra) {
81 if (report_errors_) {
82 tprintf("ZWJ after a post-matra virama!!\n");
83 }
84 return false;
85 }
// Looks one code past the ZWJ (ZWNJ, kPengkal or kCakra) and two codes back
// (must not be kCakra).
// NOTE(review): codes_used_ is unsigned, so codes_used_ - 2 relies on at least
// two codes having been consumed already -- confirm the callers guarantee it.
86 if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kCakra &&
87 (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
88 codes_[codes_used_ + 1].second == kPengkal ||
89 codes_[codes_used_ + 1].second == kCakra)) {
90 // This combination will be picked up later.
92 } else {
93 // Half-form with optional Nukta.
// len = everything pending in output_ beyond output_used_, plus one more code.
94 unsigned len = output_.size() + 1 - output_used_;
95 if (UseMultiCode(len)) {
96 return true;
97 }
98 }
// A ZWNJ at this point is only accepted when the first pending output code is
// kCakra; otherwise the sequence is rejected.
// NOTE(review): the "Sinhala" wording in the message and comment below appears
// inherited from validate_indic.cpp; the code here tests kCakra.
99 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {
100 if (output_used_ == output_.size() || output_[output_used_] != kCakra) {
101 if (report_errors_) {
102 tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_));
103 }
104 return false;
105 }
106 // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
107 if (UseMultiCode(4)) {
108 return true;
109 }
110 }
// Case: input exhausted, next code is not a consonant, or post_matra is set.
111 } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||
112 post_matra) {
113 if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {
114 // It is valid to have an unterminated virama at the end of a word, but
115 // for consistency, we will always add ZWNJ if not present.
117 } else {
119 }
120 // Explicit virama [H z]
121 MultiCodePart(2);
122 }
// A pre-virama joiner was supplied by the caller.
123 } else {
124 // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true result from UseMultiCode(2) is treated as an error here: there is no
// second consonant to join to.
125 if (UseMultiCode(2)) {
126 if (report_errors_) {
127 tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
128 }
129 return false;
130 }
// Rejects a further joiner after the virama (joiner-virama-joiner); the second
// half of this condition is on the missing listing line 132.
131 if (codes_[codes_used_].second == kZeroWidthJoiner ||
133 if (report_errors_) {
134 tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
135 codes_[codes_used_].second);
136 }
137 return false;
138 }
139 }
140 // It is good so far as it goes.
141 return true;
142}
143
144// Helper consumes/copies a series of consonants separated by viramas while
145// valid, but not any vowel or other modifiers.
// Returns false only when ConsumeViramaIfValid rejects a virama; every other
// exit returns true.
// NOTE(review): this is a documentation-page extract; listing lines 150, 161
// and 172-173 are absent, so the statements at those positions are not
// described here.
146bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
147 const unsigned num_codes = codes_.size();
148 // Consonant aksara
// Each iteration handles one consonant plus an optional joiner/virama; the
// loop continues while the next code is another consonant.
149 do {
151 // Special Sinhala case of [H Z Yayana/Rayana].
// NOTE(review): the comment above appears inherited from validate_indic.cpp;
// the check below is for [virama, ZWJ, kPengkal|kCakra] at the end of output_.
// index may be negative for a short output_, but the size test is evaluated
// first and short-circuits before output_[index] is read.
152 int index = output_.size() - 3;
153 if (output_used_ + 3 <= output_.size() &&
154 (output_.back() == kPengkal || output_.back() == kCakra) && IsVirama(output_[index]) &&
155 output_[index + 1] == kZeroWidthJoiner) {
156 MultiCodePart(3);
157 }
// have_nukta is also used as 0/1 in the arithmetic below.
158 bool have_nukta = false;
159 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {
160 have_nukta = true;
162 }
163 // Test for subscript conjunct.
164 index = output_.size() - 2 - have_nukta;
165 if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
166 IsVirama(output_[index])) {
167 // Output previous virama, consonant + optional nukta.
168 MultiCodePart(2 + have_nukta);
169 }
// Default: "no joiner", signalled by class kOther.
170 IndicPair joiner(CharClass::kOther, 0);
// The condition continues on the missing listing lines 172-173.
171 if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||
174 joiner = codes_[codes_used_];
// A joiner that ends the input is consumed and skipped (not an error).
175 if (++codes_used_ == num_codes) {
176 if (report_errors_) {
177 tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second);
178 }
179 return true;
180 }
// The joiner is only kept in the output when a virama follows it; otherwise it
// is dropped and joiner is reset to the "no joiner" value.
181 if (codes_[codes_used_].first == CharClass::kVirama) {
182 output_.push_back(joiner.second);
183 } else {
184 if (report_errors_) {
185 tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second,
186 codes_[codes_used_].second);
187 }
188 joiner = std::make_pair(CharClass::kOther, 0);
189 }
190 }
// post_matra is false here: this virama is in the consonant head.
191 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {
192 if (!ConsumeViramaIfValid(joiner, false)) {
193 return false;
194 }
195 } else {
196 break; // No virama, so the run of consonants is over.
197 }
198 } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);
// Hands off whatever is still pending in output_ beyond output_used_.
199 if (output_used_ < output_.size()) {
200 MultiCodePart(1);
201 }
202 return true;
203}
204
205// Helper consumes/copies a tail part of a consonant, comprising optional
206// matra/piece, vowel modifier, vedic mark, terminating virama.
// Returns false only when ConsumeViramaIfValid rejects a trailing virama;
// every other exit returns true.
// Each `if (UseMultiCode(1)) return true;` below finishes early; the code that
// follows then reads codes_[codes_used_] without a bounds check, so a true
// result presumably means the input is exhausted -- TODO confirm in validator.h.
// NOTE(review): this is a documentation-page extract; listing lines 217 and
// 230 (which open a nested block and a loop, judging by the unmatched closing
// braces) are absent, so those statements are not described here.
207bool ValidateJavanese::ConsumeConsonantTailIfValid() {
// Nothing left after the consonant head: the grapheme is already complete.
208 if (codes_used_ == codes_.size()) {
209 return true;
210 }
211 // No virama: Finish the grapheme.
212 // Are multiple matras allowed?
213 if (codes_[codes_used_].first == CharClass::kMatra) {
214 if (UseMultiCode(1)) {
215 return true;
216 }
218 if (UseMultiCode(1)) {
219 return true;
220 }
221 }
222 }
223 // Tarung also used for long versions of u and o vowels and vocalic r
224 // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ
225 while (codes_[codes_used_].first == CharClass::kMatraPiece) {
226 if (UseMultiCode(1)) {
227 return true;
228 }
229 }
231 if (UseMultiCode(1)) {
232 return true;
233 }
234 // Only Malayalam allows only repeated 0xd02.
// For any other script_ value this breaks out after the first pass.
235 if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) {
236 break;
237 }
238 }
239 while (codes_[codes_used_].first == CharClass::kVedicMark) {
240 if (UseMultiCode(1)) {
241 return true;
242 }
243 }
// A virama in the tail is validated with no pre-virama joiner (class kOther)
// and post_matra = true.
244 if (codes_[codes_used_].first == CharClass::kVirama) {
245 if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
246 return false;
247 }
248 }
249 // What we have consumed so far is a valid consonant cluster.
// Hands off whatever is still pending in output_ beyond output_used_.
250 if (output_used_ < output_.size()) {
251 MultiCodePart(1);
252 }
253
254 return true;
255}
256
257// Helper consumes/copies a vowel and optional modifiers.
// Every visible exit returns true; no path in this extract returns false.
// Each `if (UseMultiCode(1)) return true;` below finishes early; a true result
// presumably means the input is exhausted -- TODO confirm in validator.h.
// NOTE(review): this is a documentation-page extract; listing lines 262 and
// 267 (a loop header and the condition guarding the `break`, judging by the
// unmatched closing braces) are absent, so they are not described here.
258bool ValidateJavanese::ConsumeVowelIfValid() {
// Consumes the leading vowel itself.
259 if (UseMultiCode(1)) {
260 return true;
261 }
263 if (UseMultiCode(1)) {
264 return true;
265 }
266 // Only Malayalam allows repeated modifiers?
268 break;
269 }
270 }
// Any number of trailing vedic marks are accepted.
271 while (codes_[codes_used_].first == CharClass::kVedicMark) {
272 if (UseMultiCode(1)) {
273 return true;
274 }
275 }
276 // What we have consumed so far is a valid vowel cluster.
277 return true;
278}
279
// NOTE(review): this is a documentation-page extract. The function signature
// (listing line 280) and the return statements on listing lines 282, 285 and
// 294 are absent. The cross-reference list at the end of the page suggests
// this body belongs to ValidateJavanese::UnicodeToCharClass(char32 ch) const
// -- confirm against the real file.
// Maps a code point to a CharClass: the two joiners are tested first, then
// the offset of ch from the script's block start is classified by range.
281 if (ch == kZeroWidthNonJoiner) {
283 }
284 if (ch == kZeroWidthJoiner) {
286 }
287 // Offset from the start of the relevant unicode code block aka code page.
288 int off = ch - static_cast<char32>(script_);
289 // Anything in another code block is other.
290 if (off < 0 || off >= kIndicCodePageSize) {
291 return CharClass::kOther;
292 }
// The tests below are cumulative: each one only sees offsets that were not
// returned by an earlier test.
// Offsets 0x00-0x03: the class returned is on the missing listing line 294.
293 if (off < 0x4) {
295 }
// Offsets 0x04-0x32.
296 if (off <= 0x32) {
297 return CharClass::kConsonant; // includes independent vowels
298 }
299 if (off == 0x33) {
300 return CharClass::kNukta; // A9B3 CECAK TELU
301 }
302 if (off == 0x34) {
303 return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
304 }
// Offsets 0x35-0x39.
305 if (off <= 0x39) {
306 return CharClass::kMatra;
307 }
// Effectively off == 0x3a, since smaller offsets have already returned.
308 if (off <= 0x3a) {
309 return CharClass::kConsonant; // A9BA TALING - pre base vowel
310 }
// Offsets 0x3b-0x3d.
311 if (off <= 0x3d) {
312 return CharClass::kMatra;
313 }
// Offsets 0x3e-0x3f.
314 if (off <= 0x3f) {
315 return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
316 }
317 if (off == 0x40) {
318 return CharClass::kVirama; // A9C0 PANGKON
319 }
// Remaining offsets inside the code page (above 0x40) are not classified.
320 return CharClass::kOther;
321}
322
323} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:49
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
std::vector< char32 > output_
Definition: validator.h:229
unsigned output_used_
Definition: validator.h:233
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
void MultiCodePart(unsigned length)
Definition: validator.h:176
static bool IsVirama(char32 unicode)
Definition: validator.cpp:169
static const int kIndicCodePageSize
Definition: validator.h:207
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:135
bool IsSubscriptScript() const
Definition: validator.cpp:184
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98