19#ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20#define TESSERACT_CCUTIL_UNICHARSET_H_
53 static const int kMinLen = 6;
57 static const int kMaxChunks = 5;
60 inline void set_all(
const char *unichar,
int pos,
int total,
bool natural) {
67 strncpy(this->unichar, uch,
sizeof(this->unichar));
88 static std::string to_string(
const char *unichar,
int pos,
int total,
92 return to_string(unichar, pos, total, natural);
97 inline bool equals(
const char *other_unichar,
int other_pos,
98 int other_total)
const {
99 return (strcmp(this->unichar, other_unichar) == 0 &&
100 this->pos == other_pos && this->total == other_total);
110 return (strcmp(this->unichar, fragment->
get_unichar()) == 0 &&
112 this->pos == fragment->
get_pos() + 1);
117 return this->pos == 0;
122 return this->pos == this->total - 1;
169 static const char *kCustomLigatures[][2];
178 U_EUROPEAN_NUMBER = 2,
179 U_EUROPEAN_NUMBER_SEPARATOR = 3,
180 U_EUROPEAN_NUMBER_TERMINATOR = 4,
182 U_COMMON_NUMBER_SEPARATOR = 6,
183 U_BLOCK_SEPARATOR = 7,
184 U_SEGMENT_SEPARATOR = 8,
185 U_WHITE_SPACE_NEUTRAL = 9,
186 U_OTHER_NEUTRAL = 10,
187 U_LEFT_TO_RIGHT_EMBEDDING = 11,
188 U_LEFT_TO_RIGHT_OVERRIDE = 12,
189 U_RIGHT_TO_LEFT_ARABIC = 13,
190 U_RIGHT_TO_LEFT_EMBEDDING = 14,
191 U_RIGHT_TO_LEFT_OVERRIDE = 15,
192 U_POP_DIRECTIONAL_FORMAT = 16,
193 U_DIR_NON_SPACING_MARK = 17,
194 U_BOUNDARY_NEUTRAL = 18,
195 U_FIRST_STRONG_ISOLATE = 19,
196 U_LEFT_TO_RIGHT_ISOLATE = 20,
197 U_RIGHT_TO_LEFT_ISOLATE = 21,
198 U_POP_DIRECTIONAL_ISOLATE = 22,
199#ifndef U_HIDE_DEPRECATED_API
200 U_CHAR_DIRECTION_COUNT
211 UNICHAR_ID unichar_to_id(
const char *
const unichar_repr)
const;
215 UNICHAR_ID unichar_to_id(
const char *
const unichar_repr,
int length)
const;
222 int step(
const char *str)
const;
227 bool encodable_string(
const char *str,
unsigned *first_bad_position)
const;
244 bool encode_string(
const char *str,
bool give_up_on_failure,
245 std::vector<UNICHAR_ID> *encoding,
246 std::vector<char> *lengths,
247 unsigned *encoded_length)
const;
251 const char *id_to_unichar(
UNICHAR_ID id)
const;
257 const char *id_to_unichar_ext(
UNICHAR_ID id)
const;
261 static std::string debug_utf8_str(
const char *str);
266 return CleanupString(utf8_str, strlen(utf8_str));
268 static std::string CleanupString(
const char *utf8_str,
size_t length);
274 return debug_str(unichar_to_id(unichar_repr));
281 void unichar_insert(
const char *
const unichar_repr,
289 std::string cleaned = CleanupString(unichar_repr);
290 if (cleaned != unichar_repr) {
293 auto old_size = size();
295 if (size() == old_size) {
304 return static_cast<size_t>(unichar_id) < unichars.size();
308 bool contains_unichar(
const char *
const unichar_repr)
const;
309 bool contains_unichar(
const char *
const unichar_repr,
int length)
const;
313 bool eq(
UNICHAR_ID unichar_id,
const char *
const unichar_repr)
const;
317 for (
auto &unichar : unichars) {
318 delete unichar.properties.fragment;
319 unichar.properties.fragment =
nullptr;
325 if (script_table !=
nullptr) {
326 for (
int i = 0;
i < script_table_size_used; ++
i) {
327 delete[] script_table[
i];
329 delete[] script_table;
330 script_table =
nullptr;
331 script_table_size_used = 0;
333 script_table_size_reserved = 0;
334 delete_pointers_in_unichars();
337 top_bottom_set_ =
false;
338 script_has_upper_lower_ =
false;
339 script_has_xheight_ =
false;
340 old_style_included_ =
false;
356 return unichars.size();
362 FILE *
file = fopen(filename,
"w+b");
363 if (
file ==
nullptr) {
366 bool result = save_to_file(
file);
375 return save_to_string(str) &&
381 return save_to_string(str) &&
file->Serialize(&str[0], str.length());
386 bool save_to_string(std::string &str)
const;
392 FILE *
file = fopen(filename,
"rb");
393 if (
file ==
nullptr) {
396 bool result = load_from_file(
file, skip_fragments);
402 return load_from_file(filename,
false);
407 bool load_from_file(FILE *
file,
bool skip_fragments);
409 return load_from_file(
file,
false);
416 void post_load_setup();
422 bool major_right_to_left()
const;
433 void set_black_and_whitelist(
const char *blacklist,
const char *whitelist,
434 const char *unblacklist);
438 unichars[unichar_id].properties.isalpha =
value;
443 unichars[unichar_id].properties.islower =
value;
448 unichars[unichar_id].properties.isupper =
value;
453 unichars[unichar_id].properties.isdigit =
value;
458 unichars[unichar_id].properties.ispunctuation =
value;
463 unichars[unichar_id].properties.isngram =
value;
469 unichars[unichar_id].properties.script_id = add_script(
value);
474 unichars[unichar_id].properties.other_case = other_case;
479 unichars[unichar_id].properties.direction =
value;
484 unichars[unichar_id].properties.mirror = mirror;
489 unichars[unichar_id].properties.normed = normed;
490 unichars[unichar_id].properties.normed_ids.clear();
498 if (INVALID_UNICHAR_ID == unichar_id) {
502 return unichars[unichar_id].properties.isalpha;
507 if (INVALID_UNICHAR_ID == unichar_id) {
511 return unichars[unichar_id].properties.islower;
516 if (INVALID_UNICHAR_ID == unichar_id) {
520 return unichars[unichar_id].properties.isupper;
525 if (INVALID_UNICHAR_ID == unichar_id) {
529 return unichars[unichar_id].properties.isdigit;
534 if (INVALID_UNICHAR_ID == unichar_id) {
538 return unichars[unichar_id].properties.ispunctuation;
543 if (INVALID_UNICHAR_ID == unichar_id) {
547 return unichars[unichar_id].properties.isngram;
552 bool get_isprivate(
UNICHAR_ID unichar_id)
const;
556 return top_bottom_set_;
559 void set_ranges_empty();
564 PartialSetPropertiesFromOther(0, src);
567 void PartialSetPropertiesFromOther(
int start_index,
const UNICHARSET &src);
571 void ExpandRangesFromOther(
const UNICHARSET &src);
578 void AppendOtherUnicharset(
const UNICHARSET &src);
587 int *min_top,
int *max_top)
const {
588 if (INVALID_UNICHAR_ID == unichar_id) {
589 *min_bottom = *min_top = 0;
590 *max_bottom = *max_top = 256;
594 *min_bottom = unichars[unichar_id].properties.min_bottom;
595 *max_bottom = unichars[unichar_id].properties.max_bottom;
596 *min_top = unichars[unichar_id].properties.min_top;
597 *max_top = unichars[unichar_id].properties.max_top;
600 int min_top,
int max_top) {
601 unichars[unichar_id].properties.min_bottom =
602 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
603 unichars[unichar_id].properties.max_bottom =
604 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
605 unichars[unichar_id].properties.min_top =
606 ClipToRange<int>(min_top, 0, UINT8_MAX);
607 unichars[unichar_id].properties.max_top =
608 ClipToRange<int>(max_top, 0, UINT8_MAX);
613 float *width_sd)
const {
614 if (INVALID_UNICHAR_ID == unichar_id) {
620 *width = unichars[unichar_id].properties.width;
621 *width_sd = unichars[unichar_id].properties.width_sd;
624 unichars[unichar_id].properties.width = width;
625 unichars[unichar_id].properties.width_sd = width_sd;
630 float *bearing_sd)
const {
631 if (INVALID_UNICHAR_ID == unichar_id) {
632 *bearing = *bearing_sd = 0.0f;
636 *bearing = unichars[unichar_id].properties.bearing;
637 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
641 unichars[unichar_id].properties.bearing = bearing;
642 unichars[unichar_id].properties.bearing_sd = bearing_sd;
647 float *advance_sd)
const {
648 if (INVALID_UNICHAR_ID == unichar_id) {
649 *advance = *advance_sd = 0;
653 *advance = unichars[unichar_id].properties.advance;
654 *advance_sd = unichars[unichar_id].properties.advance_sd;
658 unichars[unichar_id].properties.advance = advance;
659 unichars[unichar_id].properties.advance_sd = advance_sd;
663 return unichars[unichar_id].properties.AnyRangeEmpty();
669 if (INVALID_UNICHAR_ID == unichar_id) {
672 int script_id = get_script(unichar_id);
673 return script_id != han_sid_ && script_id != thai_sid_ &&
674 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
675 script_id != katakana_sid_;
682 if (INVALID_UNICHAR_ID == unichar_id) {
686 return unichars[unichar_id].properties.script_id;
691 unsigned int get_properties(
UNICHAR_ID unichar_id)
const;
700 char get_chartype(
UNICHAR_ID unichar_id)
const;
704 if (INVALID_UNICHAR_ID == unichar_id) {
705 return INVALID_UNICHAR_ID;
708 return unichars[unichar_id].properties.other_case;
713 if (INVALID_UNICHAR_ID == unichar_id) {
717 return unichars[unichar_id].properties.direction;
722 if (INVALID_UNICHAR_ID == unichar_id) {
723 return INVALID_UNICHAR_ID;
726 return unichars[unichar_id].properties.mirror;
731 if (INVALID_UNICHAR_ID == unichar_id) {
732 return INVALID_UNICHAR_ID;
735 if (unichars[unichar_id].properties.islower) {
738 return unichars[unichar_id].properties.other_case;
743 if (INVALID_UNICHAR_ID == unichar_id) {
744 return INVALID_UNICHAR_ID;
747 if (unichars[unichar_id].properties.isupper) {
750 return unichars[unichar_id].properties.other_case;
764 bool AnyRepeatedUnicodes()
const;
769 if (INVALID_UNICHAR_ID == unichar_id) {
773 return unichars[unichar_id].properties.fragment;
778 return get_isalpha(unichar_to_id(unichar_repr));
783 return get_islower(unichar_to_id(unichar_repr));
788 return get_isupper(unichar_to_id(unichar_repr));
793 return get_isdigit(unichar_to_id(unichar_repr));
798 return get_ispunctuation(unichar_to_id(unichar_repr));
804 return get_properties(unichar_to_id(unichar_repr));
808 return get_chartype(unichar_to_id(unichar_repr));
815 return get_script(unichar_to_id(unichar_repr));
821 if (unichar_repr ==
nullptr || unichar_repr[0] ==
'\0' ||
822 !ids.contains(unichar_repr,
false)) {
825 return get_fragment(unichar_to_id(unichar_repr));
830 bool get_isalpha(
const char *
const unichar_repr,
int length)
const {
831 return get_isalpha(unichar_to_id(unichar_repr, length));
836 bool get_islower(
const char *
const unichar_repr,
int length)
const {
837 return get_islower(unichar_to_id(unichar_repr, length));
842 bool get_isupper(
const char *
const unichar_repr,
int length)
const {
843 return get_isupper(unichar_to_id(unichar_repr, length));
848 bool get_isdigit(
const char *
const unichar_repr,
int length)
const {
849 return get_isdigit(unichar_to_id(unichar_repr, length));
855 return get_ispunctuation(unichar_to_id(unichar_repr, length));
863 return unichars[unichar_id].properties.normed.c_str();
869 return unichars[unichar_id].properties.normed_ids;
876 int get_script(
const char *
const unichar_repr,
int length)
const {
877 return get_script(unichar_to_id(unichar_repr, length));
882 return script_table_size_used;
887 if (
id >= script_table_size_used ||
id < 0) {
890 return script_table[id];
898 int get_script_id_from_name(
const char *script_name)
const;
902 return script == null_script;
908 int add_script(
const char *script);
913 return unichars[unichar_id].properties.enabled;
926 return cyrillic_sid_;
935 return hiragana_sid_;
938 return katakana_sid_;
952 return script_has_upper_lower_;
959 return script_has_xheight_;
963 struct TESS_API UNICHAR_PROPERTIES {
964 UNICHAR_PROPERTIES();
969 void SetRangesOpen();
971 void SetRangesEmpty();
974 bool AnyRangeEmpty()
const;
976 void ExpandRangesFrom(
const UNICHAR_PROPERTIES &src);
978 void CopyFrom(
const UNICHAR_PROPERTIES &src);
1005 Direction direction;
1016 std::vector<UNICHAR_ID> normed_ids;
1025 struct UNICHAR_SLOT {
1027 UNICHAR_PROPERTIES properties;
1041 void encode_string(
const char *str,
int str_index,
int str_length,
1042 std::vector<UNICHAR_ID> *encoding,
1043 std::vector<char> *lengths,
unsigned *best_total_length,
1044 std::vector<UNICHAR_ID> *best_encoding,
1045 std::vector<char> *best_lengths)
const;
1052 bool GetStrProperties(
const char *utf8_str, UNICHAR_PROPERTIES *props)
const;
1057 bool load_via_fgets(
const std::function<
char *(
char *,
int)> &fgets_cb,
1058 bool skip_fragments);
1063 static const char *kCleanupMaps[][2];
1064 static const char *null_script;
1066 std::vector<UNICHAR_SLOT> unichars;
1068 char **script_table;
1069 int script_table_size_used;
1070 int script_table_size_reserved;
1072 bool top_bottom_set_;
1074 bool script_has_upper_lower_;
1077 bool script_has_xheight_;
1079 bool old_style_included_;
bool Serialize(FILE *fp, const std::vector< T > &data)
@ SPECIAL_UNICHAR_CODES_COUNT
bool equals(const char *other_unichar, int other_pos, int other_total) const
void set_unichar(const char *uch)
void set_all(const char *unichar, int pos, int total, bool natural)
void set_natural(bool value)
std::string to_string() const
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
const char * get_unichar() const
bool is_beginning() const
bool equals(const CHAR_FRAGMENT *other) const
bool get_islower(const char *const unichar_repr) const
bool get_isalpha(const char *const unichar_repr) const
void unichar_insert(const char *const unichar_repr)
std::string debug_str(const char *unichar_repr) const
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
void set_script(UNICHAR_ID unichar_id, const char *value)
void delete_pointers_in_unichars()
const char * get_script_from_script_id(int id) const
int get_script(UNICHAR_ID unichar_id) const
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
bool script_has_xheight() const
int get_script(const char *const unichar_repr, int length) const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
int get_script_table_size() const
bool get_isupper(const char *const unichar_repr, int length) const
bool get_isalpha(UNICHAR_ID unichar_id) const
bool has_special_codes() const
Direction get_direction(UNICHAR_ID unichar_id) const
void set_isupper(UNICHAR_ID unichar_id, bool value)
bool script_has_upper_lower() const
void set_normed(UNICHAR_ID unichar_id, const char *normed)
bool is_null_script(const char *script) const
int get_script(const char *const unichar_repr) const
bool get_ispunctuation(const char *const unichar_repr) const
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
bool get_isdigit(const char *const unichar_repr) const
bool get_islower(UNICHAR_ID unichar_id) const
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
bool get_isdigit(const char *const unichar_repr, int length) const
bool load_from_file(FILE *file)
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
bool load_from_file(const char *const filename, bool skip_fragments)
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
bool get_ispunctuation(const char *const unichar_repr, int length) const
unsigned int get_properties(const char *const unichar_repr) const
bool get_isupper(const char *const unichar_repr) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
bool contains_unichar_id(UNICHAR_ID unichar_id) const
bool get_isngram(UNICHAR_ID unichar_id) const
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
bool top_bottom_useful() const
bool save_to_file(const char *const filename) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isalpha(const char *const unichar_repr, int length) const
void unichar_insert_backwards_compatible(const char *const unichar_repr)
bool save_to_file(FILE *file) const
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
bool get_isdigit(UNICHAR_ID unichar_id) const
bool load_from_file(const char *const filename)
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
bool get_islower(const char *const unichar_repr, int length) const
bool get_enabled(UNICHAR_ID unichar_id) const
void set_islower(UNICHAR_ID unichar_id, bool value)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
char get_chartype(const char *const unichar_repr) const
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
void SetPropertiesFromOther(const UNICHARSET &src)
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
void set_isdigit(UNICHAR_ID unichar_id, bool value)
static std::string CleanupString(const char *utf8_str)
bool save_to_file(tesseract::TFile *file) const
void set_isngram(UNICHAR_ID unichar_id, bool value)
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const