tesseract  4.00.00dev
tesseract::Validator Class Reference (abstract)

#include <validator.h>

Inheritance diagram for tesseract::Validator:
tesseract::ValidateGrapheme tesseract::ValidateIndic tesseract::ValidateKhmer tesseract::ValidateMyanmar

Public Member Functions

virtual ~Validator ()
 

Static Public Member Functions

static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
static bool IsZeroWidthMark (char32 ch)
 

Static Public Attributes

static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 

Protected Types

enum  CharClass {
  CharClass::kConsonant = 'C', CharClass::kVowel = 'V', CharClass::kVirama = 'H', CharClass::kMatra = 'M',
  CharClass::kMatraPiece = 'P', CharClass::kVowelModifier = 'D', CharClass::kZeroWidthNonJoiner = 'z', CharClass::kZeroWidthJoiner = 'Z',
  CharClass::kVedicMark = 'v', CharClass::kNukta = 'N', CharClass::kRobat = 'R', CharClass::kOther = 'O',
  CharClass::kWhitespace = ' ', CharClass::kCombiner = 'c'
}
 
typedef std::pair< CharClass, char32 > IndicPair
 

Protected Member Functions

 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (int length)
 
bool UseMultiCode (int length)
 
virtual bool ConsumeGraphemeIfValid ()=0
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
virtual CharClass UnicodeToCharClass (char32 ch) const =0
 
void Clear ()
 

Static Protected Member Functions

static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 

Protected Attributes

ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
int codes_used_
 
int output_used_
 
bool report_errors_
 

Static Protected Attributes

static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 

Detailed Description

Definition at line 71 of file validator.h.

Member Typedef Documentation

◆ IndicPair

typedef std::pair<CharClass, char32> tesseract::Validator::IndicPair
protected

Definition at line 133 of file validator.h.

Member Enumeration Documentation

◆ CharClass

enum tesseract::Validator::CharClass
strong, protected
Enumerator
kConsonant 
kVowel 
kVirama 
kMatra 
kMatraPiece 
kVowelModifier 
kZeroWidthNonJoiner 
kZeroWidthJoiner 
kVedicMark 
kNukta 
kRobat 
kOther 
kWhitespace 
kCombiner 

Definition at line 111 of file validator.h.

111  {
112  // NOTE: The values of the enum members are meaningless and arbitrary, ie
113  // they are not used for sorting, or any other risky application.
114  // The reason they are what they are is they are a single character
115  // abbreviation that can be used in a regexp/BNF definition of a grammar,
116  // IN A COMMENT, and still not relied upon in the code.
117  kConsonant = 'C',
118  kVowel = 'V',
119  kVirama = 'H', // (aka Halant)
120  kMatra = 'M', // (aka Dependent Vowel)
121  kMatraPiece = 'P', // unicode provides pieces of Matras.
122  kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
123  kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
124  kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
125  kVedicMark = 'v', // Modifiers can come modify any indic syllable.
126  kNukta = 'N', // Occurs only immediately after consonants.
127  kRobat = 'R', // Khmer only.
128  kOther = 'O', // (digits, measures, non-Indic, etc)
129  // Additional classes used only by ValidateGrapheme.
130  kWhitespace = ' ',
131  kCombiner = 'c', // Combiners other than virama.
132  };
static const char32 kZeroWidthJoiner
Definition: validator.h:96
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95

Constructor & Destructor Documentation

◆ ~Validator()

virtual tesseract::Validator::~Validator ( )
inline, virtual

Definition at line 91 of file validator.h.

91 {}

◆ Validator()

tesseract::Validator::Validator ( ViramaScript  script,
bool  report_errors 
)
inline, protected

Definition at line 135 of file validator.h.

136  : script_(script),
137  codes_used_(0),
138  output_used_(0),
139  report_errors_(report_errors) {}
ViramaScript script_
Definition: validator.h:226

Member Function Documentation

◆ Clear()

void tesseract::Validator::Clear ( )
protected

Definition at line 198 of file validator.cpp.

198  {
199  codes_.clear();
200  parts_.clear();
201  output_.clear();
202  codes_used_ = 0;
203  output_used_ = 0;
204 }
std::vector< IndicPair > codes_
Definition: validator.h:228
std::vector< std::vector< char32 > > parts_
Definition: validator.h:230
std::vector< char32 > output_
Definition: validator.h:232

◆ CodeOnlyToOutput()

bool tesseract::Validator::CodeOnlyToOutput ( )
inline, protected

Definition at line 171 of file validator.h.

171  {
172  output_.push_back(codes_[codes_used_].second);
173  return ++codes_used_ == codes_.size();
174  }
std::vector< IndicPair > codes_
Definition: validator.h:228
std::vector< char32 > output_
Definition: validator.h:232

◆ ComputeClassCodes()

void tesseract::Validator::ComputeClassCodes ( const std::vector< char32 > &  text)
protected

Definition at line 190 of file validator.cpp.

190  {
191  codes_.reserve(text.size());
192  for (char32 c : text) {
193  codes_.push_back(std::make_pair(UnicodeToCharClass(c), c));
194  }
195 }
virtual CharClass UnicodeToCharClass(char32 ch) const =0
signed int char32
std::vector< IndicPair > codes_
Definition: validator.h:228

◆ ConsumeGraphemeIfValid()

virtual bool tesseract::Validator::ConsumeGraphemeIfValid ( )
protected, pure virtual

◆ IsSubscriptScript()

bool tesseract::Validator::IsSubscriptScript ( ) const
protected

◆ IsVedicAccent()

bool tesseract::Validator::IsVedicAccent ( char32  unicode)
static, protected

Definition at line 179 of file validator.cpp.

179  {
180  return 0x1cd0 <= unicode && unicode < 0x1d00;
181 }

◆ IsVirama()

bool tesseract::Validator::IsVirama ( char32  unicode)
static, protected

Definition at line 170 of file validator.cpp.

170  {
171  return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
172  (unicode & 0x7f) == 0x4d) ||
173  unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
174  unicode == kKhmerVirama;
175 }
static const char32 kKhmerVirama
Definition: validator.h:223
static const char32 kMyanmarVirama
Definition: validator.h:222
static const char32 kMaxSinhalaUnicode
Definition: validator.h:217
static const char32 kSinhalaVirama
Definition: validator.h:221
static const char32 kMinIndicUnicode
Definition: validator.h:215

◆ IsZeroWidthMark()

static bool tesseract::Validator::IsZeroWidthMark ( char32  ch)
inline, static

Definition at line 87 of file validator.h.

87  {
88  return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
89  ch == kRightToLeftMark || ch == kInvalid;
90  }
static const char32 kInvalid
Definition: validator.h:99
static const char32 kLeftToRightMark
Definition: validator.h:97
static const char32 kZeroWidthSpace
Definition: validator.h:94
static const char32 kRightToLeftMark
Definition: validator.h:98

◆ MostFrequentViramaScript()

ViramaScript tesseract::Validator::MostFrequentViramaScript ( const std::vector< char32 > &  utf32)
static, protected

Definition at line 134 of file validator.cpp.

135  {
136  std::unordered_map<int, int> histogram;
137  for (char32 ch : utf32) {
138  // Determine the codepage base. For the Indic scripts, and Khmer, it is
139  // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
140  // unicode code space, so use its script id.
141  int base = ch / kIndicCodePageSize;
142  IcuErrorCode err;
143  UScriptCode script_code = uscript_getScript(ch, err);
144  if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
145  script_code != USCRIPT_COMMON) ||
146  script_code == USCRIPT_MYANMAR) {
147  if (script_code == USCRIPT_MYANMAR)
148  base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
149  ++histogram[base];
150  }
151  }
152  if (!histogram.empty()) {
153  int base =
154  std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
155  ->first;
156  char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
157  // Check for validity.
158  if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
159  codebase == static_cast<char32>(ViramaScript::kKhmer) ||
160  (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
161  codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
162  return static_cast<ViramaScript>(codebase);
163  }
164  }
165  return ViramaScript::kNonVirama;
166 }
bool CmpPairSecond(const std::pair< int, int > &p1, const std::pair< int, int > &p2)
Definition: validator.cpp:126
static const char32 kMaxViramaScriptUnicode
Definition: validator.h:219
signed int char32
static const int kIndicCodePageSize
Definition: validator.h:213
static const char32 kMinIndicUnicode
Definition: validator.h:215

◆ MoveResultsToDest()

void tesseract::Validator::MoveResultsToDest ( GraphemeNormMode  g_mode,
std::vector< std::vector< char32 >> *  dest 
)
protected

Definition at line 105 of file validator.cpp.

106  {
107  if (g_mode == GraphemeNormMode::kIndividualUnicodes) {
108  // Append each element of the combined output_ that we made as a new vector
109  // in dest.
110  dest->reserve(dest->size() + output_.size());
111  for (char32 ch : output_) dest->push_back({ch});
112  } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
113  // Append all the parts_ that we made onto dest.
114  std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
115  } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
116  // Append the combined output_ that we made onto dest as one new vector.
117  dest->push_back(std::vector<char32>());
118  output_.swap(dest->back());
119  } else { // kNone.
120  // Append the combined output_ that we made onto the last existing element
121  // of dest.
122  dest->back().insert(dest->back().end(), output_.begin(), output_.end());
123  }
124 }
signed int char32
std::vector< std::vector< char32 > > parts_
Definition: validator.h:230
std::vector< char32 > output_
Definition: validator.h:232

◆ MultiCodePart()

void tesseract::Validator::MultiCodePart ( int  length)
inline, protected

Definition at line 181 of file validator.h.

181  {
182  while (output_used_ + length < output_.size()) {
183  parts_.emplace_back(
184  std::initializer_list<char32>{output_[output_used_++]});
185  }
186  parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
187  while (++output_used_ < output_.size()) {
188  parts_.back().push_back(output_[output_used_]);
189  }
190  }
std::vector< std::vector< char32 > > parts_
Definition: validator.h:230
std::vector< char32 > output_
Definition: validator.h:232

◆ ScriptValidator()

std::unique_ptr< Validator > tesseract::Validator::ScriptValidator ( ViramaScript  script,
bool  report_errors 
)
static, protected

Definition at line 65 of file validator.cpp.

66  {
67  switch (script) {
68  case ViramaScript::kNonVirama:
69  return std::unique_ptr<Validator>(
70  new ValidateGrapheme(script, report_errors));
71  case ViramaScript::kMyanmar:
72  return std::unique_ptr<Validator>(
73  new ValidateMyanmar(script, report_errors));
74  case ViramaScript::kKhmer:
75  return std::unique_ptr<Validator>(
76  new ValidateKhmer(script, report_errors));
77  default:
78  return std::unique_ptr<Validator>(
79  new ValidateIndic(script, report_errors));
80  }
81 }

◆ UnicodeToCharClass()

virtual CharClass tesseract::Validator::UnicodeToCharClass ( char32  ch) const
protected, pure virtual

◆ UseMultiCode()

bool tesseract::Validator::UseMultiCode ( int  length)
inline, protected

Definition at line 195 of file validator.h.

195  {
196  output_.push_back(codes_[codes_used_].second);
197  MultiCodePart(length);
198  return ++codes_used_ == codes_.size();
199  }
std::vector< IndicPair > codes_
Definition: validator.h:228
void MultiCodePart(int length)
Definition: validator.h:181
std::vector< char32 > output_
Definition: validator.h:232

◆ ValidateCleanAndSegment()

bool tesseract::Validator::ValidateCleanAndSegment ( GraphemeNormMode  g_mode,
bool  report_errors,
const std::vector< char32 > &  src,
std::vector< std::vector< char32 >> *  dest 
)
static

Definition at line 34 of file validator.cpp.

36  {
37  ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
38  std::vector<std::vector<char32>> graphemes;
39  ViramaScript script = MostFrequentViramaScript(src);
40  bool success = true;
41  if (script == ViramaScript::kNonVirama) {
42  // The grapheme segmenter's maximum segmentation is the grapheme unit, so
43  // up the mode by 1 to get the desired effect.
44  if (g_mode == GraphemeNormMode::kCombined)
45  g_mode = GraphemeNormMode::kGlyphSplit;
46  else if (g_mode == GraphemeNormMode::kGlyphSplit)
47  g_mode = GraphemeNormMode::kIndividualUnicodes;
48  // Just do grapheme segmentation.
49  success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
50  } else {
51  success = g_validator.ValidateCleanAndSegmentInternal(
52  GraphemeNormMode::kGlyphSplit, src, &graphemes);
53  std::unique_ptr<Validator> validator(
54  ScriptValidator(script, report_errors));
55  for (const auto& grapheme : graphemes) {
56  if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
57  success = false;
58  }
59  }
60  }
61  return success;
62 }
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:134
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:65

◆ ValidateCleanAndSegmentInternal()

bool tesseract::Validator::ValidateCleanAndSegmentInternal ( GraphemeNormMode  g_mode,
const std::vector< char32 > &  src,
std::vector< std::vector< char32 >> *  dest 
)
protected

Definition at line 88 of file validator.cpp.

90  {
91  Clear();
92  ComputeClassCodes(src);
93  bool success = true;
94  for (codes_used_ = 0; codes_used_ < codes_.size();) {
95  if (!ConsumeGraphemeIfValid()) {
96  success = false;
97  ++codes_used_;
98  }
99  }
100  MoveResultsToDest(g_mode, dest);
101  return success;
102 }
virtual bool ConsumeGraphemeIfValid()=0
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:190
std::vector< IndicPair > codes_
Definition: validator.h:228
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:105

Member Data Documentation

◆ codes_

std::vector<IndicPair> tesseract::Validator::codes_
protected

Definition at line 228 of file validator.h.

◆ codes_used_

int tesseract::Validator::codes_used_
protected

Definition at line 234 of file validator.h.

◆ kIndicCodePageSize

const int tesseract::Validator::kIndicCodePageSize = 128
static, protected

Definition at line 213 of file validator.h.

◆ kInvalid

const char32 tesseract::Validator::kInvalid = 0xfffd
static

Definition at line 99 of file validator.h.

◆ kKhmerVirama

const char32 tesseract::Validator::kKhmerVirama = 0x17d2
static, protected

Definition at line 223 of file validator.h.

◆ kLeftToRightMark

const char32 tesseract::Validator::kLeftToRightMark = 0x200E
static

Definition at line 97 of file validator.h.

◆ kMaxSinhalaUnicode

const char32 tesseract::Validator::kMaxSinhalaUnicode = 0xdff
static, protected

Definition at line 217 of file validator.h.

◆ kMaxViramaScriptUnicode

const char32 tesseract::Validator::kMaxViramaScriptUnicode = 0x17ff
static, protected

Definition at line 219 of file validator.h.

◆ kMinIndicUnicode

const char32 tesseract::Validator::kMinIndicUnicode = 0x900
static, protected

Definition at line 215 of file validator.h.

◆ kMyanmarVirama

const char32 tesseract::Validator::kMyanmarVirama = 0x1039
static, protected

Definition at line 222 of file validator.h.

◆ kRightToLeftMark

const char32 tesseract::Validator::kRightToLeftMark = 0x200F
static

Definition at line 98 of file validator.h.

◆ kSinhalaVirama

const char32 tesseract::Validator::kSinhalaVirama = 0xdca
static, protected

Definition at line 221 of file validator.h.

◆ kZeroWidthJoiner

const char32 tesseract::Validator::kZeroWidthJoiner = 0x200D
static

Definition at line 96 of file validator.h.

◆ kZeroWidthNonJoiner

const char32 tesseract::Validator::kZeroWidthNonJoiner = 0x200C
static

Definition at line 95 of file validator.h.

◆ kZeroWidthSpace

const char32 tesseract::Validator::kZeroWidthSpace = 0x200B
static

Definition at line 94 of file validator.h.

◆ output_

std::vector<char32> tesseract::Validator::output_
protected

Definition at line 232 of file validator.h.

◆ output_used_

int tesseract::Validator::output_used_
protected

Definition at line 236 of file validator.h.

◆ parts_

std::vector<std::vector<char32> > tesseract::Validator::parts_
protected

Definition at line 230 of file validator.h.

◆ report_errors_

bool tesseract::Validator::report_errors_
protected

Definition at line 238 of file validator.h.

◆ script_

ViramaScript tesseract::Validator::script_
protected

Definition at line 226 of file validator.h.


The documentation for this class was generated from the following files: