tesseract v5.3.3.20231005
|
#include <unicharset.h>
Public Types | |
enum | Direction { U_LEFT_TO_RIGHT = 0 , U_RIGHT_TO_LEFT = 1 , U_EUROPEAN_NUMBER = 2 , U_EUROPEAN_NUMBER_SEPARATOR = 3 , U_EUROPEAN_NUMBER_TERMINATOR = 4 , U_ARABIC_NUMBER = 5 , U_COMMON_NUMBER_SEPARATOR = 6 , U_BLOCK_SEPARATOR = 7 , U_SEGMENT_SEPARATOR = 8 , U_WHITE_SPACE_NEUTRAL = 9 , U_OTHER_NEUTRAL = 10 , U_LEFT_TO_RIGHT_EMBEDDING = 11 , U_LEFT_TO_RIGHT_OVERRIDE = 12 , U_RIGHT_TO_LEFT_ARABIC = 13 , U_RIGHT_TO_LEFT_EMBEDDING = 14 , U_RIGHT_TO_LEFT_OVERRIDE = 15 , U_POP_DIRECTIONAL_FORMAT = 16 , U_DIR_NON_SPACING_MARK = 17 , U_BOUNDARY_NEUTRAL = 18 , U_FIRST_STRONG_ISOLATE = 19 , U_LEFT_TO_RIGHT_ISOLATE = 20 , U_RIGHT_TO_LEFT_ISOLATE = 21 , U_POP_DIRECTIONAL_ISOLATE = 22 , U_CHAR_DIRECTION_COUNT } |
Public Member Functions | |
UNICHARSET () | |
~UNICHARSET () | |
UNICHAR_ID | unichar_to_id (const char *const unichar_repr) const |
UNICHAR_ID | unichar_to_id (const char *const unichar_repr, int length) const |
int | step (const char *str) const |
bool | encodable_string (const char *str, unsigned *first_bad_position) const |
bool | encode_string (const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const |
const char * | id_to_unichar (UNICHAR_ID id) const |
const char * | id_to_unichar_ext (UNICHAR_ID id) const |
std::string | debug_str (UNICHAR_ID id) const |
std::string | debug_str (const char *unichar_repr) const |
void | unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style) |
void | unichar_insert (const char *const unichar_repr) |
void | unichar_insert_backwards_compatible (const char *const unichar_repr) |
bool | contains_unichar_id (UNICHAR_ID unichar_id) const |
bool | contains_unichar (const char *const unichar_repr) const |
bool | contains_unichar (const char *const unichar_repr, int length) const |
bool | eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const |
void | delete_pointers_in_unichars () |
void | clear () |
size_t | size () const |
bool | save_to_file (const char *const filename) const |
bool | save_to_file (FILE *file) const |
bool | save_to_file (tesseract::TFile *file) const |
bool | save_to_string (std::string &str) const |
bool | load_from_file (const char *const filename, bool skip_fragments) |
bool | load_from_file (const char *const filename) |
bool | load_from_file (FILE *file, bool skip_fragments) |
bool | load_from_file (FILE *file) |
bool | load_from_file (tesseract::TFile *file, bool skip_fragments) |
void | post_load_setup () |
bool | major_right_to_left () const |
void | set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist) |
void | set_isalpha (UNICHAR_ID unichar_id, bool value) |
void | set_islower (UNICHAR_ID unichar_id, bool value) |
void | set_isupper (UNICHAR_ID unichar_id, bool value) |
void | set_isdigit (UNICHAR_ID unichar_id, bool value) |
void | set_ispunctuation (UNICHAR_ID unichar_id, bool value) |
void | set_isngram (UNICHAR_ID unichar_id, bool value) |
void | set_script (UNICHAR_ID unichar_id, const char *value) |
void | set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case) |
void | set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value) |
void | set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror) |
void | set_normed (UNICHAR_ID unichar_id, const char *normed) |
void | set_normed_ids (UNICHAR_ID unichar_id) |
bool | get_isalpha (UNICHAR_ID unichar_id) const |
bool | get_islower (UNICHAR_ID unichar_id) const |
bool | get_isupper (UNICHAR_ID unichar_id) const |
bool | get_isdigit (UNICHAR_ID unichar_id) const |
bool | get_ispunctuation (UNICHAR_ID unichar_id) const |
bool | get_isngram (UNICHAR_ID unichar_id) const |
bool | get_isprivate (UNICHAR_ID unichar_id) const |
bool | top_bottom_useful () const |
void | set_ranges_empty () |
void | SetPropertiesFromOther (const UNICHARSET &src) |
void | PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src) |
void | ExpandRangesFromOther (const UNICHARSET &src) |
void | CopyFrom (const UNICHARSET &src) |
void | AppendOtherUnicharset (const UNICHARSET &src) |
bool | SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const |
void | get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const |
void | set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top) |
void | get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const |
void | set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd) |
void | get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const |
void | set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd) |
void | get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const |
void | set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd) |
bool | PropertiesIncomplete (UNICHAR_ID unichar_id) const |
bool | IsSpaceDelimited (UNICHAR_ID unichar_id) const |
int | get_script (UNICHAR_ID unichar_id) const |
unsigned int | get_properties (UNICHAR_ID unichar_id) const |
char | get_chartype (UNICHAR_ID unichar_id) const |
UNICHAR_ID | get_other_case (UNICHAR_ID unichar_id) const |
Direction | get_direction (UNICHAR_ID unichar_id) const |
UNICHAR_ID | get_mirror (UNICHAR_ID unichar_id) const |
UNICHAR_ID | to_lower (UNICHAR_ID unichar_id) const |
UNICHAR_ID | to_upper (UNICHAR_ID unichar_id) const |
bool | has_special_codes () const |
bool | AnyRepeatedUnicodes () const |
const CHAR_FRAGMENT * | get_fragment (UNICHAR_ID unichar_id) const |
bool | get_isalpha (const char *const unichar_repr) const |
bool | get_islower (const char *const unichar_repr) const |
bool | get_isupper (const char *const unichar_repr) const |
bool | get_isdigit (const char *const unichar_repr) const |
bool | get_ispunctuation (const char *const unichar_repr) const |
unsigned int | get_properties (const char *const unichar_repr) const |
char | get_chartype (const char *const unichar_repr) const |
int | get_script (const char *const unichar_repr) const |
const CHAR_FRAGMENT * | get_fragment (const char *const unichar_repr) const |
bool | get_isalpha (const char *const unichar_repr, int length) const |
bool | get_islower (const char *const unichar_repr, int length) const |
bool | get_isupper (const char *const unichar_repr, int length) const |
bool | get_isdigit (const char *const unichar_repr, int length) const |
bool | get_ispunctuation (const char *const unichar_repr, int length) const |
const char * | get_normed_unichar (UNICHAR_ID unichar_id) const |
const std::vector< UNICHAR_ID > & | normed_ids (UNICHAR_ID unichar_id) const |
int | get_script (const char *const unichar_repr, int length) const |
int | get_script_table_size () const |
const char * | get_script_from_script_id (int id) const |
int | get_script_id_from_name (const char *script_name) const |
bool | is_null_script (const char *script) const |
int | add_script (const char *script) |
bool | get_enabled (UNICHAR_ID unichar_id) const |
int | null_sid () const |
int | common_sid () const |
int | latin_sid () const |
int | cyrillic_sid () const |
int | greek_sid () const |
int | han_sid () const |
int | hiragana_sid () const |
int | katakana_sid () const |
int | thai_sid () const |
int | hangul_sid () const |
int | default_sid () const |
bool | script_has_upper_lower () const |
bool | script_has_xheight () const |
Static Public Member Functions | |
static std::string | debug_utf8_str (const char *str) |
static std::string | CleanupString (const char *utf8_str) |
static std::string | CleanupString (const char *utf8_str, size_t length) |
Static Public Attributes | |
static const char * | kCustomLigatures [][2] |
static const char * | kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT] |
UNICHARSET class: Definition at line 164 of file unicharset.h.
enum Direction: Definition at line 175 of file unicharset.h.
tesseract::UNICHARSET::UNICHARSET | ( | ) |
Definition at line 170 of file unicharset.cpp.
tesseract::UNICHARSET::~UNICHARSET | ( | ) |
Definition at line 181 of file unicharset.cpp.
int tesseract::UNICHARSET::add_script | ( | const char * | script | ) |
Definition at line 1063 of file unicharset.cpp.
bool tesseract::UNICHARSET::AnyRepeatedUnicodes | ( | ) | const |
Definition at line 1046 of file unicharset.cpp.
void tesseract::UNICHARSET::AppendOtherUnicharset | ( | const UNICHARSET & | src | ) |
Definition at line 454 of file unicharset.cpp.
std::string tesseract::UNICHARSET::CleanupString | ( | const char * | utf8_str | ) |
inline static |
Definition at line 265 of file unicharset.h.
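std::string tesseract::UNICHARSET::CleanupString | ( | const char * | utf8_str, |
size_t | length | ||
) |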
static |
Definition at line 1158 of file unicharset.cpp.
|
inline |
Definition at line 324 of file unicharset.h.
|
inline |
Definition at line 919 of file unicharset.h.
bool tesseract::UNICHARSET::contains_unichar | ( | const char *const | unichar_repr | ) | const |
Definition at line 695 of file unicharset.cpp.
bool tesseract::UNICHARSET::contains_unichar | ( | const char *const | unichar_repr, |
int | length | ||
) | const |
Definition at line 701 of file unicharset.cpp.
|
inline |
Definition at line 303 of file unicharset.h.
void tesseract::UNICHARSET::CopyFrom | ( | const UNICHARSET & | src | ) |
Definition at line 438 of file unicharset.cpp.
|
inline |
Definition at line 925 of file unicharset.h.
|
inline |
Definition at line 273 of file unicharset.h.
std::string tesseract::UNICHARSET::debug_str | ( | UNICHAR_ID | id | ) | const |
Definition at line 331 of file unicharset.cpp.
std::string tesseract::UNICHARSET::debug_utf8_str | ( | const char * | str | ) |
static |
Definition at line 307 of file unicharset.cpp.
|
inline |
Definition at line 946 of file unicharset.h.
|
inline |
Definition at line 316 of file unicharset.h.
bool tesseract::UNICHARSET::encodable_string | ( | const char * | str, |
unsigned * | first_bad_position | ||
) | const |
Definition at line 224 of file unicharset.cpp.
bool tesseract::UNICHARSET::encode_string | ( | const char * | str, |
bool | give_up_on_failure, | ||
std::vector< UNICHAR_ID > * | encoding, | ||
std::vector< char > * | lengths, | ||
unsigned * | encoded_length | ||
) | const |
Definition at line 239 of file unicharset.cpp.
bool tesseract::UNICHARSET::eq | ( | UNICHAR_ID | unichar_id, |
const char *const | unichar_repr | ||
) | const |
Definition at line 713 of file unicharset.cpp.
void tesseract::UNICHARSET::ExpandRangesFromOther | ( | const UNICHARSET & | src | ) |
Definition at line 425 of file unicharset.cpp.
|
inline |
Definition at line 646 of file unicharset.h.
|
inline |
Definition at line 629 of file unicharset.h.
|
inline |
Definition at line 807 of file unicharset.h.
char tesseract::UNICHARSET::get_chartype | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 635 of file unicharset.cpp.
|
inline |
Definition at line 712 of file unicharset.h.
|
inline |
Definition at line 911 of file unicharset.h.
|
inline |
Definition at line 820 of file unicharset.h.
|
inline |
Definition at line 768 of file unicharset.h.
|
inline |
Definition at line 777 of file unicharset.h.
|
inline |
Definition at line 830 of file unicharset.h.
|
inline |
Definition at line 497 of file unicharset.h.
|
inline |
Definition at line 792 of file unicharset.h.
|
inline |
Definition at line 848 of file unicharset.h.
|
inline |
Definition at line 524 of file unicharset.h.
|
inline |
Definition at line 782 of file unicharset.h.
|
inline |
Definition at line 836 of file unicharset.h.
|
inline |
Definition at line 506 of file unicharset.h.
|
inline |
Definition at line 542 of file unicharset.h.
bool tesseract::UNICHARSET::get_isprivate | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 379 of file unicharset.cpp.
|
inline |
Definition at line 797 of file unicharset.h.
|
inline |
Definition at line 854 of file unicharset.h.
|
inline |
Definition at line 533 of file unicharset.h.
|
inline |
Definition at line 787 of file unicharset.h.
|
inline |
Definition at line 842 of file unicharset.h.
|
inline |
Definition at line 515 of file unicharset.h.
|
inline |
Definition at line 721 of file unicharset.h.
|
inline |
Definition at line 859 of file unicharset.h.
|
inline |
Definition at line 703 of file unicharset.h.
|
inline |
Definition at line 803 of file unicharset.h.
unsigned int tesseract::UNICHARSET::get_properties | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 615 of file unicharset.cpp.
|
inline |
Definition at line 814 of file unicharset.h.
|
inline |
Definition at line 876 of file unicharset.h.
|
inline |
Definition at line 681 of file unicharset.h.
|
inline |
Definition at line 886 of file unicharset.h.
int tesseract::UNICHARSET::get_script_id_from_name | ( | const char * | script_name | ) | const |
|
inline |
Definition at line 881 of file unicharset.h.
|
inline |
Definition at line 586 of file unicharset.h.
|
inline |
Definition at line 612 of file unicharset.h.
|
inline |
Definition at line 928 of file unicharset.h.
|
inline |
Definition at line 931 of file unicharset.h.
|
inline |
Definition at line 943 of file unicharset.h.
|
inline |
Definition at line 756 of file unicharset.h.
|
inline |
Definition at line 934 of file unicharset.h.
const char * tesseract::UNICHARSET::id_to_unichar | ( | UNICHAR_ID | id | ) | const |
Definition at line 279 of file unicharset.cpp.
const char * tesseract::UNICHARSET::id_to_unichar_ext | ( | UNICHAR_ID | id | ) | const |
Definition at line 287 of file unicharset.cpp.
|
inline |
Definition at line 901 of file unicharset.h.
|
inline |
Definition at line 668 of file unicharset.h.
|
inline |
Definition at line 937 of file unicharset.h.
|
inline |
Definition at line 922 of file unicharset.h.
|
inline |
Definition at line 401 of file unicharset.h.
|
inline |
Definition at line 391 of file unicharset.h.
|
inline |
Definition at line 408 of file unicharset.h.
bool tesseract::UNICHARSET::load_from_file | ( | FILE * | file, |
bool | skip_fragments | ||
) |
Definition at line 767 of file unicharset.cpp.
bool tesseract::UNICHARSET::load_from_file | ( | tesseract::TFile * | file, |
bool | skip_fragments | ||
) |
Definition at line 776 of file unicharset.cpp.
bool tesseract::UNICHARSET::major_right_to_left | ( | ) | const |
Definition at line 983 of file unicharset.cpp.
|
inline |
Definition at line 868 of file unicharset.h.
|
inline |
Definition at line 916 of file unicharset.h.
void tesseract::UNICHARSET::PartialSetPropertiesFromOther | ( | int | start_index, |
const UNICHARSET & | src | ||
) |
Definition at line 395 of file unicharset.cpp.
void tesseract::UNICHARSET::post_load_setup | ( | ) |
Definition at line 912 of file unicharset.cpp.
|
inline |
Definition at line 662 of file unicharset.h.
|
inline |
Definition at line 361 of file unicharset.h.
|
inline |
Definition at line 373 of file unicharset.h.
|
inline |
Definition at line 379 of file unicharset.h.
bool tesseract::UNICHARSET::save_to_string | ( | std::string & | str | ) | const |
Definition at line 718 of file unicharset.cpp.
|
inline |
Definition at line 951 of file unicharset.h.
|
inline |
Definition at line 958 of file unicharset.h.
|
inline |
Definition at line 656 of file unicharset.h.
|
inline |
Definition at line 639 of file unicharset.h.
void tesseract::UNICHARSET::set_black_and_whitelist | ( | const char * | blacklist, |
const char * | whitelist, | ||
const char * | unblacklist | ||
) |
Definition at line 1004 of file unicharset.cpp.
|
inline |
Definition at line 478 of file unicharset.h.
|
inline |
Definition at line 437 of file unicharset.h.
|
inline |
Definition at line 452 of file unicharset.h.
|
inline |
Definition at line 442 of file unicharset.h.
|
inline |
Definition at line 462 of file unicharset.h.
|
inline |
Definition at line 457 of file unicharset.h.
|
inline |
Definition at line 447 of file unicharset.h.
|
inline |
Definition at line 483 of file unicharset.h.
|
inline |
Definition at line 488 of file unicharset.h.
void tesseract::UNICHARSET::set_normed_ids | ( | UNICHAR_ID | unichar_id | ) |
Definition at line 364 of file unicharset.cpp.
|
inline |
Definition at line 473 of file unicharset.h.
void tesseract::UNICHARSET::set_ranges_empty | ( | ) |
Definition at line 386 of file unicharset.cpp.
|
inline |
Definition at line 468 of file unicharset.h.
|
inline |
Definition at line 599 of file unicharset.h.
|
inline |
Definition at line 623 of file unicharset.h.
|
inline |
Definition at line 563 of file unicharset.h.
|
inline |
Definition at line 355 of file unicharset.h.
bool tesseract::UNICHARSET::SizesDistinct | ( | UNICHAR_ID | id1, |
UNICHAR_ID | id2 | ||
) | const |
Definition at line 476 of file unicharset.cpp.
int tesseract::UNICHARSET::step | ( | const char * | str | ) | const |
Definition at line 211 of file unicharset.cpp.
|
inline |
Definition at line 940 of file unicharset.h.
|
inline |
Definition at line 730 of file unicharset.h.
|
inline |
Definition at line 742 of file unicharset.h.
|
inline |
Definition at line 555 of file unicharset.h.
|
inline |
Definition at line 283 of file unicharset.h.
void tesseract::UNICHARSET::unichar_insert | ( | const char *const | unichar_repr, |
OldUncleanUnichars | old_style | ||
) |
Definition at line 654 of file unicharset.cpp.
|
inline |
Definition at line 288 of file unicharset.h.
UNICHAR_ID tesseract::UNICHARSET::unichar_to_id | ( | const char *const | unichar_repr | ) | const |
Definition at line 186 of file unicharset.cpp.
UNICHAR_ID tesseract::UNICHARSET::unichar_to_id | ( | const char *const | unichar_repr, |
int | length | ||
) | const |
Definition at line 194 of file unicharset.cpp.
|
static |
Definition at line 169 of file unicharset.h.
|
static |
Definition at line 172 of file unicharset.h.