tesseract v5.3.3.20231005
tesseract::ValidateIndic Class Reference

#include <validate_indic.h>

Inheritance diagram for tesseract::ValidateIndic:
tesseract::Validator

Public Member Functions

 ValidateIndic (ViramaScript script, bool report_errors)
 
 ~ValidateIndic () override=default
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
Validator::CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
virtual bool ConsumeGraphemeIfValid ()=0
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
virtual CharClass UnicodeToCharClass (char32 ch) const =0
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum class  CharClass {
  kConsonant = 'C' , kVowel = 'V' , kVirama = 'H' , kMatra = 'M' ,
  kMatraPiece = 'P' , kVowelModifier = 'D' , kZeroWidthNonJoiner = 'z' , kZeroWidthJoiner = 'Z' ,
  kVedicMark = 'v' , kNukta = 'N' , kRobat = 'R' , kOther = 'O' ,
  kWhitespace = ' ' , kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 10 of file validate_indic.h.

Constructor & Destructor Documentation

◆ ValidateIndic()

tesseract::ValidateIndic::ValidateIndic ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 12 of file validate_indic.h.

12 : Validator(script, report_errors) {}
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:137

◆ ~ValidateIndic()

tesseract::ValidateIndic::~ValidateIndic ( )
override default

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateIndic::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 18 of file validate_indic.cpp.

18 {
19 switch (codes_[codes_used_].first) {
21 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
24 return ConsumeVowelIfValid();
27 // Apart from within an aksara, joiners are silently dropped.
28 if (report_errors_) {
29 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
30 }
32 return true;
34 UseMultiCode(1);
35 return true;
36 default:
37 if (report_errors_) {
38 tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
39 static_cast<int>(codes_[codes_used_].first),
40 codes_[codes_used_].second);
41 }
42 return false;
43 }
44 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
std::vector< IndicPair > codes_
Definition: validator.h:225

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateIndic::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 46 of file validate_indic.cpp.

46 {
47 if (IsVedicAccent(ch)) {
49 }
50 if (ch == kZeroWidthNonJoiner) {
52 }
53 if (ch == kZeroWidthJoiner) {
55 }
56 // Offset from the start of the relevant unicode code block aka code page.
57 int base = static_cast<char32>(script_);
58 int off = ch - base;
59 // Anything in another code block is other.
60 if (off < 0 || off >= kIndicCodePageSize) {
61 return CharClass::kOther;
62 }
63 // Exception for Tamil. The aytham character is considered a letter.
64 if (script_ == ViramaScript::kTamil && off == 0x03) {
65 return CharClass::kVowel;
66 }
67 if (off < 0x4) {
69 }
71 // Sinhala is an exception.
72 if (off <= 0x19) {
73 return CharClass::kVowel;
74 }
75 if (off <= 0x49) {
77 }
78 if (off == 0x4a) {
79 return CharClass::kVirama;
80 }
81 if (off <= 0x5f) {
82 return CharClass::kMatra;
83 }
84 } else {
85 if (off <= 0x14 || off == 0x50) {
86 return CharClass::kVowel;
87 }
88 if (off <= 0x3b || (0x58 <= off && off <= 0x5f)) {
90 }
91 // Sinhala doesn't have Nukta or Avagraha.
92 if (off == 0x3c) {
93 return CharClass::kNukta;
94 }
95 if (off == 0x3d) {
96 return CharClass::kVowel; // avagraha
97 }
98 if (off <= 0x4c || (0x51 <= off && off <= 0x54)) {
99 return CharClass::kMatra;
100 }
101 if (0x55 <= off && off <= 0x57) {
103 }
104 if (off == 0x4d) {
105 return CharClass::kVirama;
106 }
107 }
108 if (off == 0x60 || off == 0x61) {
109 return CharClass::kVowel;
110 }
111 if (off == 0x62 || off == 0x63) {
112 return CharClass::kMatra;
113 }
114 // Danda and digits up to 6f are OK as other.
115 // 70-7f are script-specific.
116 // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
117 if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72)) {
118 return CharClass::kOther;
119 }
120 // 0BF3-0BFA are other Tamil symbols.
121 if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A)) {
122 return CharClass::kOther;
123 }
124 if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71)) {
126 }
127 if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73)) {
129 }
130 if (script_ == ViramaScript::kSinhala && off == 0x70) {
132 }
133 if (script_ == ViramaScript::kDevanagari && off == 0x70) {
134 return CharClass::kOther;
135 }
136 if (0x70 <= off && off <= 0x73) {
138 }
139 // Non Indic, Digits, Measures, danda, etc.
140 return CharClass::kOther;
141 }
signed int char32
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:178
static const int kIndicCodePageSize
Definition: validator.h:207
static const char32 kZeroWidthJoiner
Definition: validator.h:98

The documentation for this class was generated from the following files: