tesseract  4.00.00dev
validator.cpp
Go to the documentation of this file.
1 #include "validator.h"
2 
3 #include <algorithm>
4 #include <unordered_map>
5 #include <vector>
6 #include <iterator>
7 
8 #include "icuerrorcode.h"
9 #include "unicode/uchar.h" // From libicu
10 #include "unicode/uscript.h" // From libicu
11 #include "validate_grapheme.h"
12 #include "validate_indic.h"
13 #include "validate_khmer.h"
14 #include "validate_myanmar.h"
15 
16 namespace tesseract {
17 
18 // Some specific but universally useful unicodes.
// Out-of-class definitions of the static char32 constants of Validator.
19 const char32 Validator::kZeroWidthSpace = 0x200B;
// NOTE(review): the listing jumps from line 19 to line 21 here. The index at
// the end of this listing names a kZeroWidthNonJoiner constant, so its
// definition presumably sat on the missing line 20 - confirm against the
// original file.
21 const char32 Validator::kZeroWidthJoiner = 0x200D;
22 const char32 Validator::kLeftToRightMark = 0x200E;
23 const char32 Validator::kRightToLeftMark = 0x200F;
24 const char32 Validator::kInvalid = 0xfffd;
25 
26 // Validates and cleans the src vector of unicodes to the *dest, according to
27 // g_mode. In the case of kSingleString, a single vector containing the whole
28 // result is added to *dest. With kCombined, multiple vectors are added to
29 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
30 // added to *dest with a smaller unit representing a glyph in each.
31 // In case of validation error, returns false and as much as possible of the
32 // input, without discarding invalid text.
33 /* static */
// NOTE(review): listing line 34, which should open this definition, is missing
// from this extract. The index at the end of the listing records it as
// "static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
// bool report_errors, const std::vector<char32>& src,
// std::vector<std::vector<char32>>* dest)".
35  GraphemeNormMode g_mode, bool report_errors, const std::vector<char32>& src,
36  std::vector<std::vector<char32>>* dest) {
// Grapheme-level validator constructed for kNonVirama; both branches below
// call it for the first (or only) pass over src.
37  ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
// Scratch output of the first pass in the virama-script branch.
38  std::vector<std::vector<char32>> graphemes;
// NOTE(review): listing line 39 is missing. `script`, which is tested below,
// is not declared anywhere in the visible lines, so it is presumably declared
// on that line - confirm against the original file.
40  bool success = true;
41  if (script == ViramaScript::kNonVirama) {
42  // The grapheme segmenter's maximum segmentation is the grapheme unit, so
43  // up the mode by 1 to get the desired effect.
44  if (g_mode == GraphemeNormMode::kCombined)
// NOTE(review): listing line 45 (the statement controlled by the `if` above)
// is missing from this extract.
46  else if (g_mode == GraphemeNormMode::kGlyphSplit)
// NOTE(review): listing line 47 (the statement controlled by the `else if`
// above) is missing from this extract.
48  // Just do grapheme segmentation.
49  success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
50  } else {
// Two passes: first split src into pieces with the grapheme validator in
// kGlyphSplit mode, collecting them in `graphemes` rather than in *dest.
51  success = g_validator.ValidateCleanAndSegmentInternal(
52  GraphemeNormMode::kGlyphSplit, src, &graphemes);
// Then obtain the validator for `script` from the factory and run it over each
// piece with the caller's g_mode, appending its results to *dest.
53  std::unique_ptr<Validator> validator(
54  ScriptValidator(script, report_errors));
55  for (const auto& grapheme : graphemes) {
// A failure on one piece is recorded but does not stop the loop, so the
// remaining pieces are still processed.
56  if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
57  success = false;
58  }
59  }
60  }
61  return success;
62 }
63 
64 // Factory method that understands how to map script to the right subclass.
// Builds a heap-allocated Validator subclass instance for `script`, passing
// script and report_errors through to its constructor, and returns it wrapped
// in a std::unique_ptr<Validator>.
65 std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
66  bool report_errors) {
67  switch (script) {
// NOTE(review): listing lines 68, 71 and 74 are missing from this extract.
// They presumably held the `case` labels that select the three returns below
// (ValidateGrapheme, ValidateMyanmar, ValidateKhmer); the label values are not
// visible here - confirm against the original file.
69  return std::unique_ptr<Validator>(
70  new ValidateGrapheme(script, report_errors));
72  return std::unique_ptr<Validator>(
73  new ValidateMyanmar(script, report_errors));
75  return std::unique_ptr<Validator>(
76  new ValidateKhmer(script, report_errors));
// Every script value without its own label falls through to ValidateIndic.
77  default:
78  return std::unique_ptr<Validator>(
79  new ValidateIndic(script, report_errors));
80  }
81 }
82 
83 // Internal version of the public static ValidateCleanAndSegment.
84 // Validates and cleans the src vector of unicodes to the *dest, according to
85 // its type and the given g_mode.
86 // In case of validation error, returns false and returns as much as possible
87 // of the input, without discarding invalid text.
// NOTE(review): listing line 88, which should open this definition, is missing
// from this extract. The index at the end of the listing records it as
// "bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
// const std::vector<char32>& src, std::vector<std::vector<char32>>* dest)".
89  GraphemeNormMode g_mode, const std::vector<char32>& src,
90  std::vector<std::vector<char32>>* dest) {
// Drop any state left from a previous call, then classify every code in src
// into codes_.
91  Clear();
92  ComputeClassCodes(src);
93  bool success = true;
// The for statement has no increment expression: codes_used_ is advanced
// explicitly on failure below and presumably by ConsumeGraphemeIfValid() on
// success (that function is pure virtual and not visible here - confirm).
94  for (codes_used_ = 0; codes_used_ < codes_.size();) {
95  if (!ConsumeGraphemeIfValid()) {
// Record the failure and step past one code so the loop keeps making progress.
96  success = false;
97  ++codes_used_;
98  }
99  }
// Hand over whatever was produced, even when validation failed.
100  MoveResultsToDest(g_mode, dest);
101  return success;
102 }
103 
104 // Moves the results from parts_ or output_ to dest according to g_mode.
// NOTE(review): listing line 105, which should open this definition, is
// missing from this extract. The index at the end of the listing records it as
// "void MoveResultsToDest(GraphemeNormMode g_mode,
// std::vector<std::vector<char32>>* dest)".
106  std::vector<std::vector<char32>>* dest) {
// NOTE(review): listing line 107 is missing. It presumably held the `if`
// condition that opens the branch below; which g_mode value it tests is not
// visible here - confirm against the original file.
108  // Append each element of the combined output_ that we made as a new vector
109  // in dest.
110  dest->reserve(dest->size() + output_.size());
// Each code in output_ becomes its own single-element vector in *dest.
111  for (char32 ch : output_) dest->push_back({ch});
112  } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
113  // Append all the parts_ that we made onto dest.
// The elements are moved, not copied, so the vectors left in parts_ are in a
// moved-from state afterwards.
114  std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
// The `dest->empty()` alternative also routes every other mode here when *dest
// has no element yet, so the final branch can rely on dest->back() existing.
115  } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
116  // Append the combined output_ that we made onto dest as one new vector.
117  dest->push_back(std::vector<char32>());
// Swapping with the freshly pushed empty vector transfers the contents and
// leaves output_ empty.
118  output_.swap(dest->back());
119  } else { // kNone.
120  // Append the combined output_ that we made onto the last existing element
121  // of dest.
122  dest->back().insert(dest->back().end(), output_.begin(), output_.end());
123  }
124 }
125 
// Comparison function for std::pair<int, int>: returns true when p1.second is
// strictly less than p2.second. The first members are ignored. Passed to
// std::max_element in this file to pick the histogram entry with the largest
// count.
126 bool CmpPairSecond(const std::pair<int, int>& p1,
127  const std::pair<int, int>& p2) {
128  return p1.second < p2.second;
129 }
130 
131 // Computes and returns the ViramaScript corresponding to the most frequent
132 // virama-using script in the input, or kNonVirama if none are present.
133 /* static */
// NOTE(review): listing line 134, which should open this definition, is
// missing from this extract. The index at the end of the listing records it as
// "static ViramaScript MostFrequentViramaScript(const std::vector<char32>&
// utf32)".
135  const std::vector<char32>& utf32) {
// Maps a code page base (code point / kIndicCodePageSize) to the number of
// counted code points that fell on it.
136  std::unordered_map<int, int> histogram;
137  for (char32 ch : utf32) {
138  // Determine the codepage base. For the Indic scripts, and Khmer, it is
139  // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
140  // unicode code space, so use its script id.
141  int base = ch / kIndicCodePageSize;
142  IcuErrorCode err;
143  UScriptCode script_code = uscript_getScript(ch, err);
// Count the code point if it lies in [kMinIndicUnicode,
// kMaxViramaScriptUnicode] and ICU does not report it as USCRIPT_COMMON, or if
// ICU reports it as Myanmar regardless of its position.
144  if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
145  script_code != USCRIPT_COMMON) ||
146  script_code == USCRIPT_MYANMAR) {
// All Myanmar code points are folded onto the single base derived from
// ViramaScript::kMyanmar.
147  if (script_code == USCRIPT_MYANMAR)
148  base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
149  ++histogram[base];
150  }
151  }
152  if (!histogram.empty()) {
// Take the base with the highest count and turn it back into a code point.
153  int base =
154  std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
155  ->first;
156  char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
157  // Check for validity.
// Only kMyanmar, kKhmer, or a value in the range kDevanagari..kSinhala is
// converted to a ViramaScript and returned.
158  if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
159  codebase == static_cast<char32>(ViramaScript::kKhmer) ||
160  (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
161  codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
162  return static_cast<ViramaScript>(codebase);
163  }
164  }
// NOTE(review): listing line 165 is missing from this extract. It presumably
// held the fallback return for the no-match case (kNonVirama, going by the
// comment above this function) - confirm against the original file.
166 }
167 
168 // Returns true if the given UTF-32 unicode is a "virama" character.
169 /* static */
// NOTE(review): listing line 170, which should open this definition, is
// missing from this extract. The index at the end of the listing records it as
// "static bool IsVirama(char32 unicode)".
// The first clause accepts any code point in [kMinIndicUnicode,
// kMaxSinhalaUnicode] whose low seven bits equal 0x4d; the remaining clauses
// accept the three explicitly named virama constants.
171  return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
172  (unicode & 0x7f) == 0x4d) ||
173  unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
174  unicode == kKhmerVirama;
175 }
176 
177 // Returns true if the given UTF-32 unicode is a vedic accent.
178 /* static */
// NOTE(review): listing line 179, which should open this definition, is
// missing from this extract. The index at the end of the listing records it as
// "static bool IsVedicAccent(char32 unicode)".
// True for code points in the half-open range [0x1cd0, 0x1d00).
180  return 0x1cd0 <= unicode && unicode < 0x1d00;
181 }
182 
183 // Returns true if the script is one that uses subscripts for conjuncts.
// NOTE(review): listing line 184, which should open this definition, is
// missing from this extract. The index at the end of the listing records it as
// "bool IsSubscriptScript() const".
185  return script_ == ViramaScript::kTelugu ||
// NOTE(review): listing lines 186 and 187 are missing from this extract. The
// expression above ends in `||`, so they presumably compared script_ against
// further ViramaScript values; which ones is not visible here - confirm
// against the original file.
188 }
189 
// Appends one (UnicodeToCharClass(c), c) pair to codes_ for every code c in
// text, in input order. codes_ is not cleared here, so the caller decides
// whether to reset it first.
190 void Validator::ComputeClassCodes(const std::vector<char32>& text) {
// NOTE(review): reserve() receives text.size() alone, so it only pre-sizes
// fully when codes_ starts out empty.
191  codes_.reserve(text.size());
192  for (char32 c : text) {
193  codes_.push_back(std::make_pair(UnicodeToCharClass(c), c));
194  }
195 }
196 
197 // Resets to the initial state.
// NOTE(review): listing line 198, which should open this definition, is
// missing from this extract; its signature is not recorded in the index at the
// end of the listing. ValidateCleanAndSegmentInternal calls it as `Clear()`.
// Empties the three working containers and rewinds both progress counters.
199  codes_.clear();
200  parts_.clear();
201  output_.clear();
202  codes_used_ = 0;
203  output_used_ = 0;
204 }
205 
206 } // namespace tesseract
static const char32 kInvalid
Definition: validator.h:99
virtual bool ConsumeGraphemeIfValid()=0
static const char32 kKhmerVirama
Definition: validator.h:223
static const char32 kLeftToRightMark
Definition: validator.h:97
bool CmpPairSecond(const std::pair< int, int > &p1, const std::pair< int, int > &p2)
Definition: validator.cpp:126
static const char32 kZeroWidthSpace
Definition: validator.h:94
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:134
static const char32 kMaxViramaScriptUnicode
Definition: validator.h:219
virtual CharClass UnicodeToCharClass(char32 ch) const =0
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:190
signed int char32
ViramaScript script_
Definition: validator.h:226
GraphemeNormMode
Definition: validator.h:34
signed int char32
Definition: unichar.h:52
std::vector< IndicPair > codes_
Definition: validator.h:228
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:179
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:105
static const char32 kMyanmarVirama
Definition: validator.h:222
static const char32 kMaxSinhalaUnicode
Definition: validator.h:217
static const char32 kSinhalaVirama
Definition: validator.h:221
static const int kIndicCodePageSize
Definition: validator.h:213
bool IsSubscriptScript() const
Definition: validator.cpp:184
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:34
std::vector< std::vector< char32 > > parts_
Definition: validator.h:230
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:88
static bool IsVirama(char32 unicode)
Definition: validator.cpp:170
static const char32 kRightToLeftMark
Definition: validator.h:98
std::vector< char32 > output_
Definition: validator.h:232
static const char32 kZeroWidthJoiner
Definition: validator.h:96
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:65
static const char32 kMinIndicUnicode
Definition: validator.h:215
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95