tesseract  4.00.00dev
validate_indic.cpp
Go to the documentation of this file.
1 #include "validate_indic.h"
2 #include "errcode.h"
3 #include "tprintf.h"
4 
5 namespace tesseract {
6 
7 // Returns whether codes matches the pattern for an Indic Grapheme.
8 // The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
9 // has a BNF for valid syllables (Graphemes) which is modified slightly
10 // for Unicode. Notably U+200C (ZWNJ) and U+200D (ZWJ) are used before/after
11 // the virama to express explicit or soft viramas.
12 // Also the unicode v.9 Malayalam entry states that CZHC can be used in several
13 // Indic languages to request traditional ligatures, and CzHC is Malayalam-
14 // specific for requesting open conjuncts.
15 //
16 // + vowel Grapheme: V[D](v)*
17 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
19  switch (codes_[codes_used_].first) {
21  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
22  case CharClass::kVowel:
23  return ConsumeVowelIfValid();
26  // Apart from within an aksara, joiners are silently dropped.
27  if (report_errors_)
28  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
29  ++codes_used_;
30  return true;
31  case CharClass::kOther:
32  UseMultiCode(1);
33  return true;
34  default:
35  if (report_errors_) {
36  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
37  codes_[codes_used_].first, codes_[codes_used_].second);
38  }
39  return false;
40  }
41 }
42 
44  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
47  // Offset from the start of the relevant unicode code block aka code page.
48  int base = static_cast<char32>(script_);
49  int off = ch - base;
50  // Anything in another code block is other.
51  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
52  // Exception for Tamil. The aytham character is considered a letter.
53  if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
54  if (off < 0x4) return CharClass::kVowelModifier;
56  // Sinhala is an exception.
57  if (off <= 0x19) return CharClass::kVowel;
58  if (off <= 0x49) return CharClass::kConsonant;
59  if (off == 0x4a) return CharClass::kVirama;
60  if (off <= 0x5f) return CharClass::kMatra;
61  } else {
62  if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
63  if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
64  return CharClass::kConsonant;
65  // Sinhala doesn't have Nukta or Avagraha.
66  if (off == 0x3c) return CharClass::kNukta;
67  if (off == 0x3d) return CharClass::kVowel;
68  if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
69  if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
70  if (off == 0x4d) return CharClass::kVirama;
71  }
72  if (off == 0x60 || off == 0x61) return CharClass::kVowel;
73  if (off == 0x62 || off == 0x63) return CharClass::kMatra;
74  // Danda and digits up to 6f are OK as other.
75  // 70-7f are script-specific.
76  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
77  return CharClass::kConsonant;
78  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
79  return CharClass::kConsonant;
80  if (script_ == ViramaScript::kSinhala && off == 0x70)
81  return CharClass::kConsonant;
82  if (script_ == ViramaScript::kDevanagari && off == 0x70)
83  return CharClass::kOther;
84  if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
85  // Non Indic, Digits, Measures, danda, etc.
86  return CharClass::kOther;
87 }
88 
89 // Helper consumes/copies a virama and any associated post-virama joiners.
90 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
91 // no joiner at all) must be followed by a consonant.
92 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
93 // consonant, space, or character from a different script. We clean up the
94 // representation to make it consistent by adding a ZWNJ if missing from a
95 // non-linking virama. Returns false with an invalid sequence.
96 bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
97  int num_codes = codes_.size();
98  if (joiner.first == CharClass::kOther) {
100  if (codes_used_ < num_codes &&
101  codes_[codes_used_].second == kZeroWidthJoiner) {
102  // Post-matra viramas must be explicit, so no joiners allowed here.
103  if (post_matra) {
104  if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
105  return false;
106  }
107  if (codes_used_ + 1 < num_codes &&
108  codes_[codes_used_ - 2].second != kRayana &&
109  (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
110  codes_[codes_used_ + 1].second == kYayana ||
111  codes_[codes_used_ + 1].second == kRayana)) {
112  // This combination will be picked up later.
114  } else {
115  // Half-form with optional Nukta.
116  int len = output_.size() + 1 - output_used_;
117  if (UseMultiCode(len)) return true;
118  }
119  if (codes_used_ < num_codes &&
121  if (output_used_ == output_.size() ||
122  output_[output_used_] != kRayana) {
123  if (report_errors_) {
124  tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
125  static_cast<int>(script_));
126  }
127  return false;
128  }
129  // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
130  if (UseMultiCode(4)) return true;
131  }
132  } else if (codes_used_ == num_codes ||
134  post_matra) {
135  if (codes_used_ == num_codes ||
137  // It is valid to have an unterminated virama at the end of a word, but
138  // for consistency, we will always add ZWNJ if not present.
139  output_.push_back(kZeroWidthNonJoiner);
140  } else {
142  }
143  // Explicit virama [H z]
144  MultiCodePart(2);
145  }
146  } else {
147  // Pre-virama joiner [{Z|z} H] requests specific conjunct.
148  if (UseMultiCode(2)) {
149  if (report_errors_)
150  tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
151  return false;
152  }
153  if (codes_[codes_used_].second == kZeroWidthJoiner ||
155  if (report_errors_) {
156  tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
157  codes_[codes_used_].second);
158  }
159  return false;
160  }
161  }
162  // It is good so far as it goes.
163  return true;
164 }
165 
166 // Helper consumes/copies a series of consonants separated by viramas while
167 // valid, but not any vowel or other modifiers.
168 bool ValidateIndic::ConsumeConsonantHeadIfValid() {
169  const int num_codes = codes_.size();
170  // Consonant aksara
171  do {
173  // Special Sinhala case of [H Z Yayana/Rayana].
174  int index = output_.size() - 3;
175  if (output_used_ <= index &&
176  (output_.back() == kYayana || output_.back() == kRayana) &&
177  IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
178  MultiCodePart(3);
179  }
180  bool have_nukta = false;
181  if (codes_used_ < num_codes &&
183  have_nukta = true;
185  }
186  // Test for subscript conjunct.
187  index = output_.size() - 2 - have_nukta;
188  if (output_used_ <= index && IsSubscriptScript() &&
189  IsVirama(output_[index])) {
190  // Output previous virama, consonant + optional nukta.
191  MultiCodePart(2 + have_nukta);
192  }
193  IndicPair joiner(CharClass::kOther, 0);
194  if (codes_used_ < num_codes &&
195  (codes_[codes_used_].second == kZeroWidthJoiner ||
196  (codes_[codes_used_].second == kZeroWidthNonJoiner &&
198  joiner = codes_[codes_used_];
199  if (++codes_used_ == num_codes) {
200  if (report_errors_) {
201  tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
202  joiner.second);
203  }
204  return true;
205  }
206  if (codes_[codes_used_].first == CharClass::kVirama) {
207  output_.push_back(joiner.second);
208  } else {
209  if (report_errors_) {
210  tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
211  output_.back(), joiner.second, codes_[codes_used_].second);
212  }
213  joiner = std::make_pair(CharClass::kOther, 0);
214  }
215  }
216  if (codes_used_ < num_codes &&
218  if (!ConsumeViramaIfValid(joiner, false)) return false;
219  } else {
220  break; // No virama, so the run of consonants is over.
221  }
222  } while (codes_used_ < num_codes &&
224  if (output_used_ < output_.size()) MultiCodePart(1);
225  return true;
226 }
227 
228 // Helper consumes/copies a tail part of a consonant, comprising optional
229 // matra/piece, vowel modifier, vedic mark, terminating virama.
230 bool ValidateIndic::ConsumeConsonantTailIfValid() {
231  if (codes_used_ == codes_.size()) return true;
232  // No virama: Finish the grapheme.
233  // Are multiple matras allowed?
234  if (codes_[codes_used_].first == CharClass::kMatra) {
235  if (UseMultiCode(1)) return true;
236  if (codes_[codes_used_].first == CharClass::kMatraPiece) {
237  if (UseMultiCode(1)) return true;
238  }
239  }
240  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
241  if (UseMultiCode(1)) return true;
242  // Only Malayalam allows only repeated 0xd02.
243  if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
244  }
245  while (codes_[codes_used_].first == CharClass::kVedicMark) {
246  if (UseMultiCode(1)) return true;
247  }
248  if (codes_[codes_used_].first == CharClass::kVirama) {
249  if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
250  return false;
251  }
252  }
253  // What we have consumed so far is a valid consonant cluster.
254  if (output_used_ < output_.size()) MultiCodePart(1);
255 
256  return true;
257 }
258 
259 // Helper consumes/copies a vowel and optional modifiers.
260 bool ValidateIndic::ConsumeVowelIfValid() {
261  if (UseMultiCode(1)) return true;
262  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
263  if (UseMultiCode(1)) return true;
264  // Only Malayalam allows repeated modifiers?
265  if (script_ != ViramaScript::kMalayalam) break;
266  }
267  while (codes_[codes_used_].first == CharClass::kVedicMark) {
268  if (UseMultiCode(1)) return true;
269  }
270  // What we have consumed so far is a valid vowel cluster.
271  return true;
272 }
273 
274 } // namespace tesseract
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:133
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
#define tprintf(...)
Definition: tprintf.h:31
ViramaScript script_
Definition: validator.h:226
signed int char32
Definition: unichar.h:52
std::vector< IndicPair > codes_
Definition: validator.h:228
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:179
static const int kIndicCodePageSize
Definition: validator.h:213
bool IsSubscriptScript() const
Definition: validator.cpp:184
#define ASSERT_HOST(x)
Definition: errcode.h:84
void MultiCodePart(int length)
Definition: validator.h:181
static bool IsVirama(char32 unicode)
Definition: validator.cpp:170
std::vector< char32 > output_
Definition: validator.h:232
bool UseMultiCode(int length)
Definition: validator.h:195
static const char32 kZeroWidthJoiner
Definition: validator.h:96
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95