#include <validator.h>
|
enum class | CharClass {
kConsonant = 'C'
, kVowel = 'V'
, kVirama = 'H'
, kMatra = 'M'
,
kMatraPiece = 'P'
, kVowelModifier = 'D'
, kZeroWidthNonJoiner = 'z'
, kZeroWidthJoiner = 'Z'
,
kVedicMark = 'v'
, kNukta = 'N'
, kRobat = 'R'
, kOther = 'O'
,
kWhitespace = ' '
, kCombiner = 'c'
} |
|
using | IndicPair = std::pair< CharClass, char32 > |
|
Definition at line 74 of file validator.h.
◆ IndicPair
◆ CharClass
Enumerator |
---|
kConsonant | |
kVowel | |
kVirama | |
kMatra | |
kMatraPiece | |
kVowelModifier | |
kZeroWidthNonJoiner | |
kZeroWidthJoiner | |
kVedicMark | |
kNukta | |
kRobat | |
kOther | |
kWhitespace | |
kCombiner | |
Definition at line 113 of file validator.h.
113 {
114
115
116
117
118
119 kConsonant = 'C',
120 kVowel = 'V',
121 kVirama = 'H',
122 kMatra = 'M',
123 kMatraPiece = 'P',
124 kVowelModifier = 'D',
127 kVedicMark = 'v',
128 kNukta = 'N',
129 kRobat = 'R',
131
132 kWhitespace = ' ',
133 kCombiner = 'c',
134 };
static const char32 kZeroWidthNonJoiner
static const char32 kZeroWidthJoiner
◆ ~Validator()
tesseract::Validator::~Validator |
( |
| ) |
|
|
virtual default |
◆ Validator()
tesseract::Validator::Validator |
( |
ViramaScript |
script, |
|
|
bool |
report_errors |
|
) |
| |
|
inline protected |
◆ Clear()
void tesseract::Validator::Clear |
( |
| ) |
|
|
protected |
Definition at line 198 of file validator.cpp.
198 {
204}
std::vector< char32 > output_
std::vector< IndicPair > codes_
std::vector< std::vector< char32 > > parts_
◆ CodeOnlyToOutput()
bool tesseract::Validator::CodeOnlyToOutput |
( |
| ) |
|
|
inline protected |
◆ ComputeClassCodes()
void tesseract::Validator::ComputeClassCodes |
( |
const std::vector< char32 > & |
text | ) |
|
|
protected |
Definition at line 190 of file validator.cpp.
190 {
191 codes_.reserve(text.size());
194 }
195}
virtual CharClass UnicodeToCharClass(char32 ch) const =0
◆ ConsumeGraphemeIfValid()
virtual bool tesseract::Validator::ConsumeGraphemeIfValid |
( |
| ) |
|
|
protected pure virtual |
◆ IsSubscriptScript()
bool tesseract::Validator::IsSubscriptScript |
( |
| ) |
const |
|
protected |
◆ IsVedicAccent()
bool tesseract::Validator::IsVedicAccent |
( |
char32 |
unicode | ) |
|
|
static protected |
Definition at line 178 of file validator.cpp.
178 {
179 return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
180 (0x951 <= unicode && unicode <= 0x954);
181}
◆ IsVirama()
bool tesseract::Validator::IsVirama |
( |
char32 |
unicode | ) |
|
|
static protected |
Definition at line 169 of file validator.cpp.
169 {
171 (unicode & 0x7f) == 0x4d) ||
174}
static const char32 kSinhalaVirama
static const char32 kKhmerVirama
static const char32 kJavaneseVirama
static const char32 kMaxSinhalaUnicode
static const char32 kMyanmarVirama
static const char32 kMinIndicUnicode
◆ IsZeroWidthMark()
static bool tesseract::Validator::IsZeroWidthMark |
( |
char32 |
ch | ) |
|
|
inline static |
Definition at line 89 of file validator.h.
89 {
92 }
static const char32 kInvalid
static const char32 kRightToLeftMark
static const char32 kLeftToRightMark
static const char32 kZeroWidthSpace
◆ MostFrequentViramaScript()
ViramaScript tesseract::Validator::MostFrequentViramaScript |
( |
const std::vector< char32 > & |
utf32 | ) |
|
|
static protected |
Definition at line 135 of file validator.cpp.
135 {
136 std::unordered_map<int, int> histogram;
138
139
140
142 IcuErrorCode err;
143 UScriptCode script_code = uscript_getScript(
ch, err);
145 script_code == USCRIPT_MYANMAR) {
146 if (script_code == USCRIPT_MYANMAR) {
148 }
149 ++histogram[base];
150 }
151 }
152 if (!histogram.empty()) {
153 int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;
155
162 }
163 }
165}
static const int kIndicCodePageSize
static const char32 kMaxJavaneseUnicode
◆ MoveResultsToDest()
void tesseract::Validator::MoveResultsToDest |
( |
GraphemeNormMode |
g_mode, |
|
|
std::vector< std::vector< char32 > > * |
dest |
|
) |
| |
|
protected |
Definition at line 106 of file validator.cpp.
106 {
108
109
113 }
115
118
119 dest->push_back(std::vector<char32>());
121 } else {
122
123
125 }
126}
◆ MultiCodePart()
void tesseract::Validator::MultiCodePart |
( |
unsigned |
length | ) |
|
|
inline protected |
◆ ScriptValidator()
std::unique_ptr< Validator > tesseract::Validator::ScriptValidator |
( |
ViramaScript |
script, |
|
|
bool |
report_errors |
|
) |
| |
|
static protected |
Definition at line 71 of file validator.cpp.
71 {
72 switch (script) {
73#define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)
74 CASE(kNonVirama, ValidateGrapheme);
75 CASE(kJavanese, ValidateJavanese);
76 CASE(kMyanmar, ValidateMyanmar);
77 CASE(kKhmer, ValidateKhmer);
78#undef CASE
79 default:
80 return std::make_unique<ValidateIndic>(script, report_errors);
81 }
82}
◆ UnicodeToCharClass()
virtual CharClass tesseract::Validator::UnicodeToCharClass |
( |
char32 |
ch | ) |
const |
|
protected pure virtual |
◆ UseMultiCode()
bool tesseract::Validator::UseMultiCode |
( |
unsigned |
length | ) |
|
|
inline protected |
Definition at line 189 of file validator.h.
189 {
193 }
void MultiCodePart(unsigned length)
◆ ValidateCleanAndSegment()
bool tesseract::Validator::ValidateCleanAndSegment |
( |
GraphemeNormMode |
g_mode, |
|
|
bool |
report_errors, |
|
|
const std::vector< char32 > & |
src, |
|
|
std::vector< std::vector< char32 > > * |
dest |
|
) |
| |
|
static |
Definition at line 40 of file validator.cpp.
42 {
44 std::vector<std::vector<char32>> graphemes;
46 bool success = true;
48
49
54 }
55
56 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src,
dest);
57 } else {
58 success =
60 std::unique_ptr<Validator> validator(
ScriptValidator(script, report_errors));
61 for (const auto &grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme,
dest)) {
63 success = false;
64 }
65 }
66 }
67 return success;
68}
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
◆ ValidateCleanAndSegmentInternal()
bool tesseract::Validator::ValidateCleanAndSegmentInternal |
( |
GraphemeNormMode |
g_mode, |
|
|
const std::vector< char32 > & |
src, |
|
|
std::vector< std::vector< char32 > > * |
dest |
|
) |
| |
|
protected |
Definition at line 89 of file validator.cpp.
91 {
94 bool success = true;
97 success = false;
99 }
100 }
102 return success;
103}
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
void ComputeClassCodes(const std::vector< char32 > &text)
virtual bool ConsumeGraphemeIfValid()=0
◆ codes_
std::vector<IndicPair> tesseract::Validator::codes_ |
|
protected |
◆ codes_used_
unsigned tesseract::Validator::codes_used_ |
|
protected |
◆ kIndicCodePageSize
const int tesseract::Validator::kIndicCodePageSize = 128 |
|
static protected |
◆ kInvalid
const char32 tesseract::Validator::kInvalid = 0xfffd |
|
static |
◆ kJavaneseVirama
const char32 tesseract::Validator::kJavaneseVirama = 0xa9c0 |
|
static protected |
◆ kKhmerVirama
const char32 tesseract::Validator::kKhmerVirama = 0x17d2 |
|
static protected |
◆ kLeftToRightMark
const char32 tesseract::Validator::kLeftToRightMark = 0x200E |
|
static |
◆ kMaxJavaneseUnicode
const char32 tesseract::Validator::kMaxJavaneseUnicode = 0xa9df |
|
static protected |
◆ kMaxSinhalaUnicode
const char32 tesseract::Validator::kMaxSinhalaUnicode = 0xdff |
|
static protected |
◆ kMaxViramaScriptUnicode
const char32 tesseract::Validator::kMaxViramaScriptUnicode = 0x17ff |
|
static protected |
◆ kMinIndicUnicode
const char32 tesseract::Validator::kMinIndicUnicode = 0x900 |
|
static protected |
◆ kMyanmarVirama
const char32 tesseract::Validator::kMyanmarVirama = 0x1039 |
|
static protected |
◆ kRightToLeftMark
const char32 tesseract::Validator::kRightToLeftMark = 0x200F |
|
static |
◆ kSinhalaVirama
const char32 tesseract::Validator::kSinhalaVirama = 0xdca |
|
static protected |
◆ kZeroWidthJoiner
const char32 tesseract::Validator::kZeroWidthJoiner = 0x200D |
|
static |
◆ kZeroWidthNonJoiner
const char32 tesseract::Validator::kZeroWidthNonJoiner = 0x200C |
|
static |
◆ kZeroWidthSpace
const char32 tesseract::Validator::kZeroWidthSpace = 0x200B |
|
static |
◆ output_
std::vector<char32> tesseract::Validator::output_ |
|
protected |
◆ output_used_
unsigned tesseract::Validator::output_used_ |
|
protected |
◆ parts_
std::vector<std::vector<char32> > tesseract::Validator::parts_ |
|
protected |
◆ report_errors_
bool tesseract::Validator::report_errors_ |
|
protected |
◆ script_
The documentation for this class was generated from the following files:
- /media/home/debian/src/github/tesseract-ocr/tesseract/src/training/unicharset/validator.h
- /media/home/debian/src/github/tesseract-ocr/tesseract/src/training/unicharset/validator.cpp