tesseract v5.3.3.20231005
tesseract::UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0 , U_RIGHT_TO_LEFT = 1 , U_EUROPEAN_NUMBER = 2 , U_EUROPEAN_NUMBER_SEPARATOR = 3 ,
  U_EUROPEAN_NUMBER_TERMINATOR = 4 , U_ARABIC_NUMBER = 5 , U_COMMON_NUMBER_SEPARATOR = 6 , U_BLOCK_SEPARATOR = 7 ,
  U_SEGMENT_SEPARATOR = 8 , U_WHITE_SPACE_NEUTRAL = 9 , U_OTHER_NEUTRAL = 10 , U_LEFT_TO_RIGHT_EMBEDDING = 11 ,
  U_LEFT_TO_RIGHT_OVERRIDE = 12 , U_RIGHT_TO_LEFT_ARABIC = 13 , U_RIGHT_TO_LEFT_EMBEDDING = 14 , U_RIGHT_TO_LEFT_OVERRIDE = 15 ,
  U_POP_DIRECTIONAL_FORMAT = 16 , U_DIR_NON_SPACING_MARK = 17 , U_BOUNDARY_NEUTRAL = 18 , U_FIRST_STRONG_ISOLATE = 19 ,
  U_LEFT_TO_RIGHT_ISOLATE = 20 , U_RIGHT_TO_LEFT_ISOLATE = 21 , U_POP_DIRECTIONAL_ISOLATE = 22 , U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, unsigned *first_bad_position) const
 
bool encode_string (const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
 
const char * id_to_unichar (UNICHAR_ID id) const
 
const char * id_to_unichar_ext (UNICHAR_ID id) const
 
std::string debug_str (UNICHAR_ID id) const
 
std::string debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style)
 
void unichar_insert (const char *const unichar_repr)
 
void unichar_insert_backwards_compatible (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
size_t size () const
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool save_to_file (tesseract::TFile *file) const
 
bool save_to_string (std::string &str) const
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
void set_normed_ids (UNICHAR_ID unichar_id)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void CopyFrom (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
 
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
 
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
 
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
 
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
 
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
 
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
 
bool IsSpaceDelimited (UNICHAR_ID unichar_id) const
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
bool has_special_codes () const
 
bool AnyRepeatedUnicodes () const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
const std::vector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int thai_sid () const
 
int hangul_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static std::string debug_utf8_str (const char *str)
 
static std::string CleanupString (const char *utf8_str)
 
static std::string CleanupString (const char *utf8_str, size_t length)
 

Static Public Attributes

static const char * kCustomLigatures [][2]
 
static const char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]
 

Detailed Description

Definition at line 164 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_FIRST_STRONG_ISOLATE 
U_LEFT_TO_RIGHT_ISOLATE 
U_RIGHT_TO_LEFT_ISOLATE 
U_POP_DIRECTIONAL_ISOLATE 
U_CHAR_DIRECTION_COUNT 

Definition at line 175 of file unicharset.h.

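175 {
176 U_LEFT_TO_RIGHT = 0,
177 U_RIGHT_TO_LEFT = 1,
178 U_EUROPEAN_NUMBER = 2,
179 U_EUROPEAN_NUMBER_SEPARATOR = 3,
180 U_EUROPEAN_NUMBER_TERMINATOR = 4,
181 U_ARABIC_NUMBER = 5,
182 U_COMMON_NUMBER_SEPARATOR = 6,
183 U_BLOCK_SEPARATOR = 7,
184 U_SEGMENT_SEPARATOR = 8,
185 U_WHITE_SPACE_NEUTRAL = 9,
186 U_OTHER_NEUTRAL = 10,
187 U_LEFT_TO_RIGHT_EMBEDDING = 11,
188 U_LEFT_TO_RIGHT_OVERRIDE = 12,
189 U_RIGHT_TO_LEFT_ARABIC = 13,
190 U_RIGHT_TO_LEFT_EMBEDDING = 14,
191 U_RIGHT_TO_LEFT_OVERRIDE = 15,
192 U_POP_DIRECTIONAL_FORMAT = 16,
193 U_DIR_NON_SPACING_MARK = 17,
194 U_BOUNDARY_NEUTRAL = 18,
195 U_FIRST_STRONG_ISOLATE = 19,
196 U_LEFT_TO_RIGHT_ISOLATE = 20,
197 U_RIGHT_TO_LEFT_ISOLATE = 21,
198 U_POP_DIRECTIONAL_ISOLATE = 22,
199#ifndef U_HIDE_DEPRECATED_API
200 U_CHAR_DIRECTION_COUNT
201#endif // U_HIDE_DEPRECATED_API
202 };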

Constructor & Destructor Documentation

◆ UNICHARSET()

tesseract::UNICHARSET::UNICHARSET ( )

Definition at line 170 of file unicharset.cpp.

171 : ids(), script_table(nullptr), script_table_size_used(0) {
172 clear();
173 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
174 unichar_insert(kSpecialUnicharCodes[i]);
175 if (i == UNICHAR_JOINED) {
176 set_isngram(i, true);
177 }
178 }
179}
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
@ UNICHAR_JOINED
Definition: unicharset.h:37
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:172
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:462

◆ ~UNICHARSET()

tesseract::UNICHARSET::~UNICHARSET ( )

Definition at line 181 of file unicharset.cpp.

181 {
182 clear();
183}

Member Function Documentation

◆ add_script()

int tesseract::UNICHARSET::add_script ( const char *  script)

Definition at line 1063 of file unicharset.cpp.

1063 {
1064 for (int i = 0; i < script_table_size_used; ++i) {
1065 if (strcmp(script, script_table[i]) == 0) {
1066 return i;
1067 }
1068 }
1069 if (script_table_size_reserved == 0) {
1070 script_table_size_reserved = 8;
1071 script_table = new char *[script_table_size_reserved];
1072 } else if (script_table_size_used >= script_table_size_reserved) {
1073 assert(script_table_size_used == script_table_size_reserved);
1074 script_table_size_reserved += script_table_size_reserved;
1075 char **new_script_table = new char *[script_table_size_reserved];
1076 memcpy(new_script_table, script_table,
1077 script_table_size_used * sizeof(char *));
1078 delete[] script_table;
1079 script_table = new_script_table;
1080 }
1081 script_table[script_table_size_used] = new char[strlen(script) + 1];
1082 strcpy(script_table[script_table_size_used], script);
1083 return script_table_size_used++;
1084}

◆ AnyRepeatedUnicodes()

bool tesseract::UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 1046 of file unicharset.cpp.

1046 {
1047 int start_id = 0;
1048 if (has_special_codes()) {
1049 start_id = SPECIAL_UNICHAR_CODES_COUNT;
1050 }
1051 for (unsigned id = start_id; id < unichars.size(); ++id) {
1052 // Convert to unicodes.
1053 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1054 for (size_t u = 1; u < unicodes.size(); ++u) {
1055 if (unicodes[u - 1] == unicodes[u]) {
1056 return true;
1057 }
1058 }
1059 }
1060 return false;
1061}
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
bool has_special_codes() const
Definition: unicharset.h:756
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:859

◆ AppendOtherUnicharset()

void tesseract::UNICHARSET::AppendOtherUnicharset ( const UNICHARSET & src)

Definition at line 454 of file unicharset.cpp.

454 {
455 int initial_used = unichars.size();
456 for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
457 const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
458 const char *utf8 = src.id_to_unichar(ch);
459 int id = unichars.size();
460 if (contains_unichar(utf8)) {
461 id = unichar_to_id(utf8);
462 // Just expand current ranges.
463 unichars[id].properties.ExpandRangesFrom(src_props);
464 } else {
465 unichar_insert_backwards_compatible(utf8);
466 unichars[id].properties.SetRangesEmpty();
467 }
468 }
469 // Set properties, including mirror and other_case, WITHOUT reordering
470 // the unicharset.
471 PartialSetPropertiesFromOther(initial_used, src);
472}
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:288
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:395

◆ CleanupString() [1/2]

static std::string tesseract::UNICHARSET::CleanupString ( const char *  utf8_str)
inline static

Definition at line 265 of file unicharset.h.

265 {
266 return CleanupString(utf8_str, strlen(utf8_str));
267 }
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265

◆ CleanupString() [2/2]

std::string tesseract::UNICHARSET::CleanupString ( const char *  utf8_str,
size_t  length 
)
static

Definition at line 1158 of file unicharset.cpp.

1158 {
1159 std::string result;
1160 result.reserve(length);
1161 char ch;
1162 while ((ch = *utf8_str) != '\0' && length-- > 0) {
1163 int key_index = 0;
1164 const char *key;
1165 while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1166 int match = 0;
1167 while (key[match] != '\0' && key[match] == utf8_str[match]) {
1168 ++match;
1169 }
1170 if (key[match] == '\0') {
1171 utf8_str += match;
1172 break;
1173 }
1174 ++key_index;
1175 }
1176 if (key == nullptr) {
1177 result.push_back(ch);
1178 ++utf8_str;
1179 } else {
1180 result.append(kCleanupMaps[key_index][1]);
1181 }
1182 }
1183 return result;
1184}

◆ clear()

void tesseract::UNICHARSET::clear ( )
inline

Definition at line 324 of file unicharset.h.

324 {
325 if (script_table != nullptr) {
326 for (int i = 0; i < script_table_size_used; ++i) {
327 delete[] script_table[i];
328 }
329 delete[] script_table;
330 script_table = nullptr;
331 script_table_size_used = 0;
332 }
333 script_table_size_reserved = 0;
334 delete_pointers_in_unichars();
335 unichars.clear();
336 ids.clear();
337 top_bottom_set_ = false;
338 script_has_upper_lower_ = false;
339 script_has_xheight_ = false;
340 old_style_included_ = false;
341 null_sid_ = 0;
342 common_sid_ = 0;
343 latin_sid_ = 0;
344 cyrillic_sid_ = 0;
345 greek_sid_ = 0;
346 han_sid_ = 0;
347 hiragana_sid_ = 0;
348 katakana_sid_ = 0;
349 thai_sid_ = 0;
350 hangul_sid_ = 0;
351 default_sid_ = 0;
352 }
void delete_pointers_in_unichars()
Definition: unicharset.h:316

◆ common_sid()

int tesseract::UNICHARSET::common_sid ( ) const
inline

Definition at line 919 of file unicharset.h.

919 {
920 return common_sid_;
921 }

◆ contains_unichar() [1/2]

bool tesseract::UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 695 of file unicharset.cpp.

695 {
696 std::string cleaned =
697 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
698 return ids.contains(cleaned.data(), cleaned.size());
699}
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:83

◆ contains_unichar() [2/2]

bool tesseract::UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 701 of file unicharset.cpp.

702 {
703 if (length == 0) {
704 return false;
705 }
706 std::string cleaned(unichar_repr, length);
707 if (!old_style_included_) {
708 cleaned = CleanupString(unichar_repr, length);
709 }
710 return ids.contains(cleaned.data(), cleaned.size());
711}

◆ contains_unichar_id()

bool tesseract::UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 303 of file unicharset.h.

303 {
304 return static_cast<size_t>(unichar_id) < unichars.size();
305 }

◆ CopyFrom()

void tesseract::UNICHARSET::CopyFrom ( const UNICHARSET & src)

Definition at line 438 of file unicharset.cpp.

438 {
439 clear();
440 for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
441 const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
442 const char *utf8 = src.id_to_unichar(ch);
443 unichar_insert_backwards_compatible(utf8);
444 unichars[ch].properties.ExpandRangesFrom(src_props);
445 }
446 // Set properties, including mirror and other_case, WITHOUT reordering
447 // the unicharset.
448 PartialSetPropertiesFromOther(0, src);
449}

◆ cyrillic_sid()

int tesseract::UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 925 of file unicharset.h.

925 {
926 return cyrillic_sid_;
927 }

◆ debug_str() [1/2]

std::string tesseract::UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 273 of file unicharset.h.

273 {
274 return debug_str(unichar_to_id(unichar_repr));
275 }
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331

◆ debug_str() [2/2]

std::string tesseract::UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 331 of file unicharset.cpp.

331 {
332 if (id == INVALID_UNICHAR_ID) {
333 return std::string(id_to_unichar(id));
334 }
335 const CHAR_FRAGMENT *fragment = this->get_fragment(id);
336 if (fragment) {
337 return fragment->to_string();
338 }
339 const char *str = id_to_unichar(id);
340 std::string result = debug_utf8_str(str);
341 // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
342 if (get_isalpha(id)) {
343 if (get_islower(id)) {
344 result += "a";
345 } else if (get_isupper(id)) {
346 result += "A";
347 } else {
348 result += "x";
349 }
350 }
351 // Append 0 if a digit.
352 if (get_isdigit(id)) {
353 result += "0";
354 }
355 // Append p is a punctuation symbol.
356 if (get_ispunctuation(id)) {
357 result += "p";
358 }
359 return result;
360}
static std::string debug_utf8_str(const char *str)
Definition: unicharset.cpp:307
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533

◆ debug_utf8_str()

std::string tesseract::UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 307 of file unicharset.cpp.

307 {
308 std::string result = str;
309 result += " [";
310 int step = 1;
311 // Chop into unicodes and code each as hex.
312 for (int i = 0; str[i] != '\0'; i += step) {
313 char hex[sizeof(int) * 2 + 1];
314 step = UNICHAR::utf8_step(str + i);
315 if (step == 0) {
316 step = 1;
317 snprintf(hex, sizeof(hex), "%x", str[i]);
318 } else {
319 UNICHAR ch(str + i, step);
320 snprintf(hex, sizeof(hex), "%x", ch.first_uni());
321 }
322 result += hex;
323 result += " ";
324 }
325 result += "]";
326 return result;
327}
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143
int step(const char *str) const
Definition: unicharset.cpp:211

◆ default_sid()

int tesseract::UNICHARSET::default_sid ( ) const
inline

Definition at line 946 of file unicharset.h.

946 {
947 return default_sid_;
948 }

◆ delete_pointers_in_unichars()

void tesseract::UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 316 of file unicharset.h.

316 {
317 for (auto &unichar : unichars) {
318 delete unichar.properties.fragment;
319 unichar.properties.fragment = nullptr;
320 }
321 }

◆ encodable_string()

bool tesseract::UNICHARSET::encodable_string ( const char *  str,
unsigned *  first_bad_position 
) const

Definition at line 224 of file unicharset.cpp.

225 {
226 std::vector<UNICHAR_ID> encoding;
227 return encode_string(str, true, &encoding, nullptr, first_bad_position);
228}
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ encode_string()

bool tesseract::UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
std::vector< UNICHAR_ID > *  encoding,
std::vector< char > *  lengths,
unsigned *  encoded_length 
) const

Definition at line 239 of file unicharset.cpp.

242 {
243 std::vector<UNICHAR_ID> working_encoding;
244 std::vector<char> working_lengths;
245 std::vector<char> best_lengths;
246 encoding->clear(); // Just in case str is empty.
247 auto str_length = strlen(str);
248 unsigned str_pos = 0;
249 bool perfect = true;
250 while (str_pos < str_length) {
251 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
252 &str_pos, encoding, &best_lengths);
253 if (str_pos < str_length) {
254 // This is a non-match. Skip one utf-8 character.
255 perfect = false;
256 if (give_up_on_failure) {
257 break;
258 }
259 int step = UNICHAR::utf8_step(str + str_pos);
260 if (step == 0) {
261 step = 1;
262 }
263 encoding->push_back(INVALID_UNICHAR_ID);
264 best_lengths.push_back(step);
265 str_pos += step;
266 working_encoding = *encoding;
267 working_lengths = best_lengths;
268 }
269 }
270 if (lengths != nullptr) {
271 *lengths = best_lengths;
272 }
273 if (encoded_length != nullptr) {
274 *encoded_length = str_pos;
275 }
276 return perfect;
277}

◆ eq()

bool tesseract::UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 713 of file unicharset.cpp.

714 {
715 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
716}

◆ ExpandRangesFromOther()

void tesseract::UNICHARSET::ExpandRangesFromOther ( const UNICHARSET & src)

Definition at line 425 of file unicharset.cpp.

425 {
426 for (unsigned ch = 0; ch < unichars.size(); ++ch) {
427 const char *utf8 = id_to_unichar(ch);
428 UNICHAR_PROPERTIES properties;
429 if (src.GetStrProperties(utf8, &properties)) {
430 // Expand just the ranges from properties.
431 unichars[ch].properties.ExpandRangesFrom(properties);
432 }
433 }
434}

◆ get_advance_stats()

void tesseract::UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const
inline

Definition at line 646 of file unicharset.h.

647 {
648 if (INVALID_UNICHAR_ID == unichar_id) {
649 *advance = *advance_sd = 0;
650 return;
651 }
652 ASSERT_HOST(contains_unichar_id(unichar_id));
653 *advance = unichars[unichar_id].properties.advance;
654 *advance_sd = unichars[unichar_id].properties.advance_sd;
655 }
#define ASSERT_HOST(x)
Definition: errcode.h:54
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303

◆ get_bearing_stats()

void tesseract::UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const
inline

Definition at line 629 of file unicharset.h.

630 {
631 if (INVALID_UNICHAR_ID == unichar_id) {
632 *bearing = *bearing_sd = 0.0f;
633 return;
634 }
635 ASSERT_HOST(contains_unichar_id(unichar_id));
636 *bearing = unichars[unichar_id].properties.bearing;
637 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
638 }

◆ get_chartype() [1/2]

char tesseract::UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 807 of file unicharset.h.

807 {
808 return get_chartype(unichar_to_id(unichar_repr));
809 }
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:635

◆ get_chartype() [2/2]

char tesseract::UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 635 of file unicharset.cpp.

635 {
636 if (this->get_isupper(id)) {
637 return 'A';
638 }
639 if (this->get_islower(id)) {
640 return 'a';
641 }
642 if (this->get_isalpha(id)) {
643 return 'x';
644 }
645 if (this->get_isdigit(id)) {
646 return '0';
647 }
648 if (this->get_ispunctuation(id)) {
649 return 'p';
650 }
651 return 0;
652}

◆ get_direction()

Direction tesseract::UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 712 of file unicharset.h.

712 {
713 if (INVALID_UNICHAR_ID == unichar_id) {
714 return UNICHARSET::U_OTHER_NEUTRAL;
715 }
716 ASSERT_HOST(contains_unichar_id(unichar_id));
717 return unichars[unichar_id].properties.direction;
718 }

◆ get_enabled()

bool tesseract::UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 911 of file unicharset.h.

911 {
912 ASSERT_HOST(contains_unichar_id(unichar_id));
913 return unichars[unichar_id].properties.enabled;
914 }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT * tesseract::UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 820 of file unicharset.h.

820 {
821 if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
822 !ids.contains(unichar_repr, false)) {
823 return nullptr;
824 }
825 return get_fragment(unichar_to_id(unichar_repr));
826 }

◆ get_fragment() [2/2]

const CHAR_FRAGMENT * tesseract::UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 768 of file unicharset.h.

768 {
769 if (INVALID_UNICHAR_ID == unichar_id) {
770 return nullptr;
771 }
772 ASSERT_HOST(contains_unichar_id(unichar_id));
773 return unichars[unichar_id].properties.fragment;
774 }

◆ get_isalpha() [1/3]

bool tesseract::UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 777 of file unicharset.h.

777 {
778 return get_isalpha(unichar_to_id(unichar_repr));
779 }

◆ get_isalpha() [2/3]

bool tesseract::UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 830 of file unicharset.h.

830 {
831 return get_isalpha(unichar_to_id(unichar_repr, length));
832 }

◆ get_isalpha() [3/3]

bool tesseract::UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 497 of file unicharset.h.

497 {
498 if (INVALID_UNICHAR_ID == unichar_id) {
499 return false;
500 }
501 ASSERT_HOST(contains_unichar_id(unichar_id));
502 return unichars[unichar_id].properties.isalpha;
503 }

◆ get_isdigit() [1/3]

bool tesseract::UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 792 of file unicharset.h.

792 {
793 return get_isdigit(unichar_to_id(unichar_repr));
794 }

◆ get_isdigit() [2/3]

bool tesseract::UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 848 of file unicharset.h.

848 {
849 return get_isdigit(unichar_to_id(unichar_repr, length));
850 }

◆ get_isdigit() [3/3]

bool tesseract::UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 524 of file unicharset.h.

524 {
525 if (INVALID_UNICHAR_ID == unichar_id) {
526 return false;
527 }
528 ASSERT_HOST(contains_unichar_id(unichar_id));
529 return unichars[unichar_id].properties.isdigit;
530 }

◆ get_islower() [1/3]

bool tesseract::UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 782 of file unicharset.h.

782 {
783 return get_islower(unichar_to_id(unichar_repr));
784 }

◆ get_islower() [2/3]

bool tesseract::UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 836 of file unicharset.h.

836 {
837 return get_islower(unichar_to_id(unichar_repr, length));
838 }

◆ get_islower() [3/3]

bool tesseract::UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 506 of file unicharset.h.

506 {
507 if (INVALID_UNICHAR_ID == unichar_id) {
508 return false;
509 }
510 ASSERT_HOST(contains_unichar_id(unichar_id));
511 return unichars[unichar_id].properties.islower;
512 }

◆ get_isngram()

bool tesseract::UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 542 of file unicharset.h.

542 {
543 if (INVALID_UNICHAR_ID == unichar_id) {
544 return false;
545 }
546 ASSERT_HOST(contains_unichar_id(unichar_id));
547 return unichars[unichar_id].properties.isngram;
548 }

◆ get_isprivate()

bool tesseract::UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 379 of file unicharset.cpp.

379 {
380 UNICHAR uc(id_to_unichar(unichar_id), -1);
381 int uni = uc.first_uni();
382 return (uni >= 0xE000 && uni <= 0xF8FF);
383}

◆ get_ispunctuation() [1/3]

bool tesseract::UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 797 of file unicharset.h.

797 {
798 return get_ispunctuation(unichar_to_id(unichar_repr));
799 }

◆ get_ispunctuation() [2/3]

bool tesseract::UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 854 of file unicharset.h.

854 {
855 return get_ispunctuation(unichar_to_id(unichar_repr, length));
856 }

◆ get_ispunctuation() [3/3]

bool tesseract::UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 533 of file unicharset.h.

533 {
534 if (INVALID_UNICHAR_ID == unichar_id) {
535 return false;
536 }
537 ASSERT_HOST(contains_unichar_id(unichar_id));
538 return unichars[unichar_id].properties.ispunctuation;
539 }

◆ get_isupper() [1/3]

bool tesseract::UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 787 of file unicharset.h.

787 {
788 return get_isupper(unichar_to_id(unichar_repr));
789 }

◆ get_isupper() [2/3]

bool tesseract::UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 842 of file unicharset.h.

842 {
843 return get_isupper(unichar_to_id(unichar_repr, length));
844 }

◆ get_isupper() [3/3]

bool tesseract::UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 515 of file unicharset.h.

515 {
516 if (INVALID_UNICHAR_ID == unichar_id) {
517 return false;
518 }
519 ASSERT_HOST(contains_unichar_id(unichar_id));
520 return unichars[unichar_id].properties.isupper;
521 }

◆ get_mirror()

UNICHAR_ID tesseract::UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 721 of file unicharset.h.

721 {
722 if (INVALID_UNICHAR_ID == unichar_id) {
723 return INVALID_UNICHAR_ID;
724 }
725 ASSERT_HOST(contains_unichar_id(unichar_id));
726 return unichars[unichar_id].properties.mirror;
727 }

◆ get_normed_unichar()

const char * tesseract::UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 859 of file unicharset.h.

859 {
860 if (unichar_id == UNICHAR_SPACE) {
861 return " ";
862 }
863 return unichars[unichar_id].properties.normed.c_str();
864 }
@ UNICHAR_SPACE
Definition: unicharset.h:36

◆ get_other_case()

UNICHAR_ID tesseract::UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 703 of file unicharset.h.

703 {
704 if (INVALID_UNICHAR_ID == unichar_id) {
705 return INVALID_UNICHAR_ID;
706 }
707 ASSERT_HOST(contains_unichar_id(unichar_id));
708 return unichars[unichar_id].properties.other_case;
709 }

◆ get_properties() [1/2]

unsigned int tesseract::UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 803 of file unicharset.h.

803 {
804 return get_properties(unichar_to_id(unichar_repr));
805 }
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:615

◆ get_properties() [2/2]

unsigned int tesseract::UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 615 of file unicharset.cpp.

615 {
616 unsigned int properties = 0;
617 if (this->get_isalpha(id)) {
618 properties |= ISALPHA_MASK;
619 }
620 if (this->get_islower(id)) {
621 properties |= ISLOWER_MASK;
622 }
623 if (this->get_isupper(id)) {
624 properties |= ISUPPER_MASK;
625 }
626 if (this->get_isdigit(id)) {
627 properties |= ISDIGIT_MASK;
628 }
629 if (this->get_ispunctuation(id)) {
630 properties |= ISPUNCTUATION_MASK;
631 }
632 return properties;
633}

◆ get_script() [1/3]

int tesseract::UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 814 of file unicharset.h.

814 {
815 return get_script(unichar_to_id(unichar_repr));
816 }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:681

◆ get_script() [2/3]

int tesseract::UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 876 of file unicharset.h.

876 {
877 return get_script(unichar_to_id(unichar_repr, length));
878 }

◆ get_script() [3/3]

int tesseract::UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 681 of file unicharset.h.

681 {
682 if (INVALID_UNICHAR_ID == unichar_id) {
683 return null_sid_;
684 }
685 ASSERT_HOST(contains_unichar_id(unichar_id));
686 return unichars[unichar_id].properties.script_id;
687 }

◆ get_script_from_script_id()

const char * tesseract::UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 886 of file unicharset.h.

886 {
887 if (id >= script_table_size_used || id < 0) {
888 return null_script;
889 }
890 return script_table[id];
891 }

◆ get_script_id_from_name()

int tesseract::UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1146 of file unicharset.cpp.

1146 {
1147 for (int i = 0; i < script_table_size_used; ++i) {
1148 if (strcmp(script_name, script_table[i]) == 0) {
1149 return i;
1150 }
1151 }
1152 return 0; // 0 is always the null_script
1153}

◆ get_script_table_size()

int tesseract::UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 881 of file unicharset.h.

881 {
882 return script_table_size_used;
883 }

◆ get_top_bottom()

void tesseract::UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 586 of file unicharset.h.

587 {
588 if (INVALID_UNICHAR_ID == unichar_id) {
589 *min_bottom = *min_top = 0;
590 *max_bottom = *max_top = 256; // kBlnCellHeight
591 return;
592 }
593 ASSERT_HOST(contains_unichar_id(unichar_id));
594 *min_bottom = unichars[unichar_id].properties.min_bottom;
595 *max_bottom = unichars[unichar_id].properties.max_bottom;
596 *min_top = unichars[unichar_id].properties.min_top;
597 *max_top = unichars[unichar_id].properties.max_top;
598 }

◆ get_width_stats()

void tesseract::UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const
inline

Definition at line 612 of file unicharset.h.

613 {
614 if (INVALID_UNICHAR_ID == unichar_id) {
615 *width = 0.0f;
616 *width_sd = 0.0f;
617 return;
618 }
619 ASSERT_HOST(contains_unichar_id(unichar_id));
620 *width = unichars[unichar_id].properties.width;
621 *width_sd = unichars[unichar_id].properties.width_sd;
622 }

◆ greek_sid()

int tesseract::UNICHARSET::greek_sid ( ) const
inline

Definition at line 928 of file unicharset.h.

928 {
929 return greek_sid_;
930 }

◆ han_sid()

int tesseract::UNICHARSET::han_sid ( ) const
inline

Definition at line 931 of file unicharset.h.

931 {
932 return han_sid_;
933 }

◆ hangul_sid()

int tesseract::UNICHARSET::hangul_sid ( ) const
inline

Definition at line 943 of file unicharset.h.

943 {
944 return hangul_sid_;
945 }

◆ has_special_codes()

bool tesseract::UNICHARSET::has_special_codes ( ) const
inline

Definition at line 756 of file unicharset.h.

756 {
757 return get_fragment(UNICHAR_BROKEN) != nullptr &&
758 strcmp(id_to_unichar(UNICHAR_BROKEN),
759 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
760 }
@ UNICHAR_BROKEN
Definition: unicharset.h:38

◆ hiragana_sid()

int tesseract::UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 934 of file unicharset.h.

934 {
935 return hiragana_sid_;
936 }

◆ id_to_unichar()

const char * tesseract::UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 279 of file unicharset.cpp.

279 {
280 if (id == INVALID_UNICHAR_ID) {
281 return INVALID_UNICHAR;
282 }
283 ASSERT_HOST(static_cast<unsigned>(id) < this->size());
284 return unichars[id].representation;
285}
size_t size() const
Definition: unicharset.h:355

◆ id_to_unichar_ext()

const char * tesseract::UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 287 of file unicharset.cpp.

287 {
288 if (id == INVALID_UNICHAR_ID) {
289 return INVALID_UNICHAR;
290 }
291 ASSERT_HOST(static_cast<unsigned>(id) < this->size());
292 // Resolve from the kCustomLigatures table if this is a private encoding.
293 if (get_isprivate(id)) {
294 const char *ch = id_to_unichar(id);
295 for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
296 if (!strcmp(ch, kCustomLigatures[i][1])) {
297 return kCustomLigatures[i][0];
298 }
299 }
300 }
301 // Otherwise return the stored representation.
302 return unichars[id].representation;
303}
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:379
static const char * kCustomLigatures[][2]
Definition: unicharset.h:169

◆ is_null_script()

bool tesseract::UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 901 of file unicharset.h.

901 {
902 return script == null_script;
903 }

◆ IsSpaceDelimited()

bool tesseract::UNICHARSET::IsSpaceDelimited ( UNICHAR_ID  unichar_id) const
inline

Definition at line 668 of file unicharset.h.

668 {
669 if (INVALID_UNICHAR_ID == unichar_id) {
670 return true;
671 }
672 int script_id = get_script(unichar_id);
673 return script_id != han_sid_ && script_id != thai_sid_ &&
674 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
675 script_id != katakana_sid_;
676 }

◆ katakana_sid()

int tesseract::UNICHARSET::katakana_sid ( ) const
inline

Definition at line 937 of file unicharset.h.

937 {
938 return katakana_sid_;
939 }

◆ latin_sid()

int tesseract::UNICHARSET::latin_sid ( ) const
inline

Definition at line 922 of file unicharset.h.

922 {
923 return latin_sid_;
924 }

◆ load_from_file() [1/5]

bool tesseract::UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 401 of file unicharset.h.

401 {
402 return load_from_file(filename, false);
403 }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391

◆ load_from_file() [2/5]

bool tesseract::UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 391 of file unicharset.h.

391 {
392 FILE *file = fopen(filename, "rb");
393 if (file == nullptr) {
394 return false;
395 }
396 bool result = load_from_file(file, skip_fragments);
397 fclose(file);
398 return result;
399 }

◆ load_from_file() [3/5]

bool tesseract::UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 408 of file unicharset.h.

408 {
409 return load_from_file(file, false);
410 }

◆ load_from_file() [4/5]

bool tesseract::UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 767 of file unicharset.cpp.

767 {
768 LocalFilePointer lfp(file);
769 using namespace std::placeholders; // for _1, _2
770 std::function<char *(char *, int)> fgets_cb =
771 std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
772 bool success = load_via_fgets(fgets_cb, skip_fragments);
773 return success;
774}
char * fgets(char *dst, int size)
Definition: unicharset.cpp:759

◆ load_from_file() [5/5]

bool tesseract::UNICHARSET::load_from_file ( tesseract::TFile file,
bool  skip_fragments 
)

Definition at line 776 of file unicharset.cpp.

776 {
777 using namespace std::placeholders; // for _1, _2
778 std::function<char *(char *, int)> fgets_cb =
779 std::bind(&tesseract::TFile::FGets, file, _1, _2);
780 bool success = load_via_fgets(fgets_cb, skip_fragments);
781 return success;
782}
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195

◆ major_right_to_left()

bool tesseract::UNICHARSET::major_right_to_left ( ) const

Definition at line 983 of file unicharset.cpp.

983 {
984 int ltr_count = 0;
985 int rtl_count = 0;
986 for (unsigned id = 0; id < unichars.size(); ++id) {
987 int dir = get_direction(id);
988 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {
989 ltr_count++;
990 }
991 if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
992 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
993 dir == UNICHARSET::U_ARABIC_NUMBER) {
994 rtl_count++;
995 }
996 }
997 return rtl_count > ltr_count;
998}
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:712

◆ normed_ids()

const std::vector< UNICHAR_ID > & tesseract::UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const
inline

Definition at line 868 of file unicharset.h.

868 {
869 return unichars[unichar_id].properties.normed_ids;
870 }

◆ null_sid()

int tesseract::UNICHARSET::null_sid ( ) const
inline

Definition at line 916 of file unicharset.h.

916 {
917 return null_sid_;
918 }

◆ PartialSetPropertiesFromOther()

void tesseract::UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET src 
)

Definition at line 395 of file unicharset.cpp.

396 {
397 for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
398 const char *utf8 = id_to_unichar(ch);
399 UNICHAR_PROPERTIES properties;
400 if (src.GetStrProperties(utf8, &properties)) {
401 // Setup the script_id, other_case, and mirror properly.
402 const char *script = src.get_script_from_script_id(properties.script_id);
403 properties.script_id = add_script(script);
404 const char *other_case = src.id_to_unichar(properties.other_case);
405 if (contains_unichar(other_case)) {
406 properties.other_case = unichar_to_id(other_case);
407 } else {
408 properties.other_case = ch;
409 }
410 const char *mirror_str = src.id_to_unichar(properties.mirror);
411 if (contains_unichar(mirror_str)) {
412 properties.mirror = unichar_to_id(mirror_str);
413 } else {
414 properties.mirror = ch;
415 }
416 unichars[ch].properties.CopyFrom(properties);
417 set_normed_ids(ch);
418 }
419 }
420}
int add_script(const char *script)
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:364

◆ post_load_setup()

void tesseract::UNICHARSET::post_load_setup ( )

Definition at line 912 of file unicharset.cpp.

912 {
913 // Number of alpha chars with the case property minus those without,
914 // in order to determine that half the alpha chars have case.
915 int net_case_alphas = 0;
916 int x_height_alphas = 0;
917 int cap_height_alphas = 0;
918 top_bottom_set_ = false;
919 for (unsigned id = 0; id < unichars.size(); ++id) {
920 int min_bottom = 0;
921 int max_bottom = UINT8_MAX;
922 int min_top = 0;
923 int max_top = UINT8_MAX;
924 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
925 if (min_top > 0) {
926 top_bottom_set_ = true;
927 }
928 if (get_isalpha(id)) {
929 if (get_islower(id) || get_isupper(id)) {
930 ++net_case_alphas;
931 } else {
932 --net_case_alphas;
933 }
934 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
935 ++x_height_alphas;
936 } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
937 ++cap_height_alphas;
938 }
939 }
940 set_normed_ids(id);
941 }
942
943 script_has_upper_lower_ = net_case_alphas > 0;
944 script_has_xheight_ =
945 script_has_upper_lower_ ||
946 (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
947 cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
948
949 null_sid_ = get_script_id_from_name(null_script);
950 ASSERT_HOST(null_sid_ == 0);
951 common_sid_ = get_script_id_from_name("Common");
952 latin_sid_ = get_script_id_from_name("Latin");
953 cyrillic_sid_ = get_script_id_from_name("Cyrillic");
954 greek_sid_ = get_script_id_from_name("Greek");
955 han_sid_ = get_script_id_from_name("Han");
956 hiragana_sid_ = get_script_id_from_name("Hiragana");
957 katakana_sid_ = get_script_id_from_name("Katakana");
958 thai_sid_ = get_script_id_from_name("Thai");
959 hangul_sid_ = get_script_id_from_name("Hangul");
960
961 // Compute default script. Use the highest-counting alpha script, that is
962 // not the common script, as that still contains some "alphas".
963 int *script_counts = new int[script_table_size_used];
964 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
965 for (unsigned id = 0; id < unichars.size(); ++id) {
966 if (get_isalpha(id)) {
967 ++script_counts[get_script(id)];
968 }
969 }
970 default_sid_ = 0;
971 for (int s = 1; s < script_table_size_used; ++s) {
972 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
973 default_sid_ = s;
974 }
975 }
976 delete[] script_counts;
977}
const double kMinCapHeightFraction
Definition: unicharset.cpp:58
const double kMinXHeightFraction
Definition: unicharset.cpp:57
int get_script_id_from_name(const char *script_name) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

◆ PropertiesIncomplete()

bool tesseract::UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const
inline

Definition at line 662 of file unicharset.h.

662 {
663 return unichars[unichar_id].properties.AnyRangeEmpty();
664 }

◆ save_to_file() [1/3]

bool tesseract::UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 361 of file unicharset.h.

361 {
362 FILE *file = fopen(filename, "w+b");
363 if (file == nullptr) {
364 return false;
365 }
366 bool result = save_to_file(file);
367 fclose(file);
368 return result;
369 }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361

◆ save_to_file() [2/3]

bool tesseract::UNICHARSET::save_to_file ( FILE *  file) const
inline

Definition at line 373 of file unicharset.h.

373 {
374 std::string str;
375 return save_to_string(str) &&
376 tesseract::Serialize(file, &str[0], str.length());
377 }
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236
bool save_to_string(std::string &str) const
Definition: unicharset.cpp:718

◆ save_to_file() [3/3]

bool tesseract::UNICHARSET::save_to_file ( tesseract::TFile file) const
inline

Definition at line 379 of file unicharset.h.

379 {
380 std::string str;
381 return save_to_string(str) && file->Serialize(&str[0], str.length());
382 }

◆ save_to_string()

bool tesseract::UNICHARSET::save_to_string ( std::string &  str) const

Definition at line 718 of file unicharset.cpp.

718 {
719 const int kFileBufSize = 1024;
720 char buffer[kFileBufSize + 1];
721 snprintf(buffer, kFileBufSize, "%zu\n", this->size());
722 str = buffer;
723 for (unsigned id = 0; id < this->size(); ++id) {
724 int min_bottom, max_bottom, min_top, max_top;
725 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
726 float width, width_sd;
727 get_width_stats(id, &width, &width_sd);
728 float bearing, bearing_sd;
729 get_bearing_stats(id, &bearing, &bearing_sd);
730 float advance, advance_sd;
731 get_advance_stats(id, &advance, &advance_sd);
732 unsigned int properties = this->get_properties(id);
733 if (strcmp(this->id_to_unichar(id), " ") == 0) {
734 snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
735 this->get_script_from_script_id(this->get_script(id)),
736 this->get_other_case(id));
737 str += buffer;
738 } else {
739 std::ostringstream stream;
740 stream.imbue(std::locale::classic());
741 stream << this->id_to_unichar(id) << ' ' << properties << ' '
742 << min_bottom << ',' << max_bottom << ',' << min_top << ','
743 << max_top << ',' << width << ',' << width_sd << ',' << bearing
744 << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '
745 << this->get_script_from_script_id(this->get_script(id)) << ' '
746 << this->get_other_case(id) << ' ' << this->get_direction(id)
747 << ' ' << this->get_mirror(id) << ' '
748 << this->get_normed_unichar(id) << "\t# "
749 << this->debug_str(id).c_str() << '\n';
750 str += stream.str().c_str();
751 }
752 }
753 return true;
754}
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:886
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:646
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:629
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:703
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:721
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:612

◆ script_has_upper_lower()

bool tesseract::UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 951 of file unicharset.h.

951 {
952 return script_has_upper_lower_;
953 }

◆ script_has_xheight()

bool tesseract::UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 958 of file unicharset.h.

958 {
959 return script_has_xheight_;
960 }

◆ set_advance_stats()

void tesseract::UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
)
inline

Definition at line 656 of file unicharset.h.

657 {
658 unichars[unichar_id].properties.advance = advance;
659 unichars[unichar_id].properties.advance_sd = advance_sd;
660 }

◆ set_bearing_stats()

void tesseract::UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
)
inline

Definition at line 639 of file unicharset.h.

640 {
641 unichars[unichar_id].properties.bearing = bearing;
642 unichars[unichar_id].properties.bearing_sd = bearing_sd;
643 }

◆ set_black_and_whitelist()

void tesseract::UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 1004 of file unicharset.cpp.

1006 {
1007 bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1008 // Set everything to default
1009 for (auto &uc : unichars) {
1010 uc.properties.enabled = def_enabled;
1011 }
1012 if (!def_enabled) {
1013 // Enable the whitelist.
1014 std::vector<UNICHAR_ID> encoding;
1015 encode_string(whitelist, false, &encoding, nullptr, nullptr);
1016 for (auto it : encoding) {
1017 if (it != INVALID_UNICHAR_ID) {
1018 unichars[it].properties.enabled = true;
1019 }
1020 }
1021 }
1022 if (blacklist != nullptr && blacklist[0] != '\0') {
1023 // Disable the blacklist.
1024 std::vector<UNICHAR_ID> encoding;
1025 encode_string(blacklist, false, &encoding, nullptr, nullptr);
1026 for (auto it : encoding) {
1027 if (it != INVALID_UNICHAR_ID) {
1028 unichars[it].properties.enabled = false;
1029 }
1030 }
1031 }
1032 if (unblacklist != nullptr && unblacklist[0] != '\0') {
1033 // Re-enable the unblacklist.
1034 std::vector<UNICHAR_ID> encoding;
1035 encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1036 for (auto it : encoding) {
1037 if (it != INVALID_UNICHAR_ID) {
1038 unichars[it].properties.enabled = true;
1039 }
1040 }
1041 }
1042}

◆ set_direction()

void tesseract::UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 478 of file unicharset.h.

478 {
479 unichars[unichar_id].properties.direction = value;
480 }
int value

◆ set_isalpha()

void tesseract::UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 437 of file unicharset.h.

437 {
438 unichars[unichar_id].properties.isalpha = value;
439 }

◆ set_isdigit()

void tesseract::UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 452 of file unicharset.h.

452 {
453 unichars[unichar_id].properties.isdigit = value;
454 }

◆ set_islower()

void tesseract::UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 442 of file unicharset.h.

442 {
443 unichars[unichar_id].properties.islower = value;
444 }

◆ set_isngram()

void tesseract::UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 462 of file unicharset.h.

462 {
463 unichars[unichar_id].properties.isngram = value;
464 }

◆ set_ispunctuation()

void tesseract::UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 457 of file unicharset.h.

457 {
458 unichars[unichar_id].properties.ispunctuation = value;
459 }

◆ set_isupper()

void tesseract::UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 447 of file unicharset.h.

447 {
448 unichars[unichar_id].properties.isupper = value;
449 }

◆ set_mirror()

void tesseract::UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 483 of file unicharset.h.

483 {
484 unichars[unichar_id].properties.mirror = mirror;
485 }

◆ set_normed()

void tesseract::UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 488 of file unicharset.h.

488 {
489 unichars[unichar_id].properties.normed = normed;
490 unichars[unichar_id].properties.normed_ids.clear();
491 }

◆ set_normed_ids()

void tesseract::UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 364 of file unicharset.cpp.

364 {
365 unichars[unichar_id].properties.normed_ids.clear();
366 if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
367 unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
368 } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
369 true, &unichars[unichar_id].properties.normed_ids,
370 nullptr, nullptr)) {
371 unichars[unichar_id].properties.normed_ids.clear();
372 unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
373 }
374}

◆ set_other_case()

void tesseract::UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 473 of file unicharset.h.

473 {
474 unichars[unichar_id].properties.other_case = other_case;
475 }

◆ set_ranges_empty()

void tesseract::UNICHARSET::set_ranges_empty ( )

Definition at line 386 of file unicharset.cpp.

386 {
387 for (auto &uc : unichars) {
388 uc.properties.SetRangesEmpty();
389 }
390}

◆ set_script()

void tesseract::UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 468 of file unicharset.h.

468 {
469 unichars[unichar_id].properties.script_id = add_script(value);
470 }

◆ set_top_bottom()

void tesseract::UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 599 of file unicharset.h.

600 {
601 unichars[unichar_id].properties.min_bottom =
602 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
603 unichars[unichar_id].properties.max_bottom =
604 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
605 unichars[unichar_id].properties.min_top =
606 ClipToRange<int>(min_top, 0, UINT8_MAX);
607 unichars[unichar_id].properties.max_top =
608 ClipToRange<int>(max_top, 0, UINT8_MAX);
609 }

◆ set_width_stats()

void tesseract::UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
)
inline

Definition at line 623 of file unicharset.h.

623 {
624 unichars[unichar_id].properties.width = width;
625 unichars[unichar_id].properties.width_sd = width_sd;
626 }

◆ SetPropertiesFromOther()

void tesseract::UNICHARSET::SetPropertiesFromOther ( const UNICHARSET src)
inline

Definition at line 563 of file unicharset.h.

563 {
564 PartialSetPropertiesFromOther(0, src);
565 }

◆ size()

size_t tesseract::UNICHARSET::size ( ) const
inline

Definition at line 355 of file unicharset.h.

355 {
356 return unichars.size();
357 }

◆ SizesDistinct()

bool tesseract::UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 476 of file unicharset.cpp.

476 {
477 int overlap = std::min(unichars[id1].properties.max_top,
478 unichars[id2].properties.max_top) -
479 std::max(unichars[id1].properties.min_top,
480 unichars[id2].properties.min_top);
481 return overlap <= 0;
482}

◆ step()

int tesseract::UNICHARSET::step ( const char *  str) const

Definition at line 211 of file unicharset.cpp.

211 {
212 std::vector<UNICHAR_ID> encoding;
213 std::vector<char> lengths;
214 encode_string(str, true, &encoding, &lengths, nullptr);
215 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) {
216 return 0;
217 }
218 return lengths[0];
219}

◆ thai_sid()

int tesseract::UNICHARSET::thai_sid ( ) const
inline

Definition at line 940 of file unicharset.h.

940 {
941 return thai_sid_;
942 }

◆ to_lower()

UNICHAR_ID tesseract::UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 730 of file unicharset.h.

730 {
731 if (INVALID_UNICHAR_ID == unichar_id) {
732 return INVALID_UNICHAR_ID;
733 }
734 ASSERT_HOST(contains_unichar_id(unichar_id));
735 if (unichars[unichar_id].properties.islower) {
736 return unichar_id;
737 }
738 return unichars[unichar_id].properties.other_case;
739 }

◆ to_upper()

UNICHAR_ID tesseract::UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 742 of file unicharset.h.

742 {
743 if (INVALID_UNICHAR_ID == unichar_id) {
744 return INVALID_UNICHAR_ID;
745 }
746 ASSERT_HOST(contains_unichar_id(unichar_id));
747 if (unichars[unichar_id].properties.isupper) {
748 return unichar_id;
749 }
750 return unichars[unichar_id].properties.other_case;
751 }

◆ top_bottom_useful()

bool tesseract::UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 555 of file unicharset.h.

555 {
556 return top_bottom_set_;
557 }

◆ unichar_insert() [1/2]

void tesseract::UNICHARSET::unichar_insert ( const char *const  unichar_repr)
inline

Definition at line 283 of file unicharset.h.

283 {
284 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
285 }

◆ unichar_insert() [2/2]

void tesseract::UNICHARSET::unichar_insert ( const char *const  unichar_repr,
OldUncleanUnichars  old_style 
)

Definition at line 654 of file unicharset.cpp.

655 {
656 if (old_style == OldUncleanUnichars::kTrue) {
657 old_style_included_ = true;
658 }
659 std::string cleaned =
660 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
661 if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
662 const char *str = cleaned.c_str();
663 std::vector<int> encoding;
664 if (!old_style_included_ &&
665 encode_string(str, true, &encoding, nullptr, nullptr)) {
666 return;
667 }
668 unichars.emplace_back();
669 auto &u = unichars.back();
670 int index = 0;
671 do {
672 if (index >= UNICHAR_LEN) {
673 fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
674 unichar_repr);
675 return;
676 }
677 u.representation[index++] = *str++;
678 } while (*str != '\0');
679 u.representation[index] = '\0';
680 this->set_script(unichars.size() - 1, null_script);
681 // If the given unichar_repr represents a fragmented character, set
682 // fragment property to a pointer to CHAR_FRAGMENT class instance with
683 // information parsed from the unichar representation. Use the script
684 // of the base unichar for the fragmented character if possible.
685 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
686 u.properties.fragment = frag;
687 if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
688 u.properties.script_id = this->get_script(frag->get_unichar());
689 }
690 u.properties.enabled = true;
691 ids.insert(u.representation, unichars.size() - 1);
692 }
693}
#define UNICHAR_LEN
Definition: unichar.h:31
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468

◆ unichar_insert_backwards_compatible()

void tesseract::UNICHARSET::unichar_insert_backwards_compatible ( const char *const  unichar_repr)
inline

Definition at line 288 of file unicharset.h.

288 {
289 std::string cleaned = CleanupString(unichar_repr);
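290 if (cleaned != unichar_repr) {
291 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
292 } else {
293 auto old_size = size();
294 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
295 if (size() == old_size) {
296 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
297 }
298 }
299 }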

◆ unichar_to_id() [1/2]

UNICHAR_ID tesseract::UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 186 of file unicharset.cpp.

186 {
187 std::string cleaned =
188 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
189 return ids.contains(cleaned.data(), cleaned.size())
190 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
191 : INVALID_UNICHAR_ID;
192}
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:36

◆ unichar_to_id() [2/2]

UNICHAR_ID tesseract::UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 194 of file unicharset.cpp.

195 {
196 assert(length > 0 && length <= UNICHAR_LEN);
197 std::string cleaned(unichar_repr, length);
198 if (!old_style_included_) {
199 cleaned = CleanupString(unichar_repr, length);
200 }
201 return ids.contains(cleaned.data(), cleaned.size())
202 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
203 : INVALID_UNICHAR_ID;
204}

Member Data Documentation

◆ kCustomLigatures

const char * tesseract::UNICHARSET::kCustomLigatures
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{nullptr, nullptr}}

Definition at line 169 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * tesseract::UNICHARSET::kSpecialUnicharCodes
static
Initial value:
= {
" ", "Joined", "|Broken|0|1"}

Definition at line 172 of file unicharset.h.


The documentation for this class was generated from the following files: