// Include guard for this header, followed by two static const int constants:
// kMinLen (6) and kMaxChunks (5).
// NOTE(review): the leading integers on these lines (20, 21, 52, 56) look
// like line numbers left behind by an extraction step, and the scope that
// encloses the two constants is not visible here — confirm both against the
// original header before relying on this text.
20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_ 21 #define TESSERACT_CCUTIL_UNICHARSET_H_ 52 static const int kMinLen = 6;
56 static const int kMaxChunks = 5;
59 inline void set_all(
const char *unichar,
int pos,
int total,
bool natural) {
// Setter: stores p in this->pos. No validation is performed on p.
69 inline void set_pos(
int p) { this->pos = p; }
// Getter: returns the stored this->unichar pointer as-is (no copy is made).
71 inline const char*
get_unichar()
const {
return this->unichar; }
// Getter: returns the stored this->pos value.
72 inline int get_pos()
const {
return this->pos; }
// Getter: returns the stored this->total value.
73 inline int get_total()
const {
return this->total; }
77 static STRING to_string(
const char *unichar,
int pos,
int total,
81 return to_string(unichar, pos, total, natural);
86 inline bool equals(
const char *other_unichar,
87 int other_pos,
int other_total)
const {
88 return (strcmp(this->unichar, other_unichar) == 0 &&
89 this->pos == other_pos && this->total == other_total);
100 return (strcmp(this->unichar, fragment->
get_unichar()) == 0 &&
102 this->pos == fragment->
get_pos() + 1);
// Returns true exactly when this->pos equals this->total - 1.
// NOTE(review): presumably pos is a zero-based index among `total` pieces, so
// this identifies the last piece — confirm against the class documentation.
109 inline bool is_ending()
const {
return this->pos == this->total-1; }
160 U_EUROPEAN_NUMBER = 2,
161 U_EUROPEAN_NUMBER_SEPARATOR = 3,
162 U_EUROPEAN_NUMBER_TERMINATOR = 4,
164 U_COMMON_NUMBER_SEPARATOR = 6,
165 U_BLOCK_SEPARATOR = 7,
166 U_SEGMENT_SEPARATOR = 8,
167 U_WHITE_SPACE_NEUTRAL = 9,
168 U_OTHER_NEUTRAL = 10,
169 U_LEFT_TO_RIGHT_EMBEDDING = 11,
170 U_LEFT_TO_RIGHT_OVERRIDE = 12,
171 U_RIGHT_TO_LEFT_ARABIC = 13,
172 U_RIGHT_TO_LEFT_EMBEDDING = 14,
173 U_RIGHT_TO_LEFT_OVERRIDE = 15,
174 U_POP_DIRECTIONAL_FORMAT = 16,
175 U_DIR_NON_SPACING_MARK = 17,
176 U_BOUNDARY_NEUTRAL = 18,
177 U_CHAR_DIRECTION_COUNT
// Declaration only; the body is not in this view. Const member that takes a
// unichar representation string and returns a UNICHAR_ID.
// Other members visible in this header pass its result straight to the
// per-id accessors, e.g. get_isalpha(unichar_to_id(unichar_repr)).
// NOTE(review): presumably yields INVALID_UNICHAR_ID for an unknown string
// (the accessors special-case that value) — confirm in the implementation.
187 UNICHAR_ID unichar_to_id(
const char*
const unichar_repr)
const;
// Declaration only; the body is not in this view. Overload of unichar_to_id
// that additionally takes an int length. The length-taking property
// accessors in this header forward to it, e.g.
// get_isalpha(unichar_to_id(unichar_repr, length)).
// NOTE(review): presumably `length` is the number of bytes of unichar_repr
// to consider — confirm in the implementation.
191 UNICHAR_ID unichar_to_id(
const char*
const unichar_repr,
int length)
const;
// Declaration only; the body is not in this view. Const member taking a
// string and returning an int.
// NOTE(review): the meaning of the returned int is not visible here —
// presumably a byte count to advance through str; confirm in the
// implementation.
198 int step(
const char* str)
const;
// Declaration only; the body is not in this view. Const member returning a
// bool for the given string; first_bad_position is an int pointer and so can
// serve as an output.
// NOTE(review): presumably receives the offset of the first part of str that
// cannot be encoded; whether NULL is accepted is not visible here — confirm.
203 bool encodable_string(
const char *str,
int *first_bad_position)
const;
220 bool encode_string(
const char* str,
bool give_up_on_failure,
223 int* encoded_length)
const;
// Declaration only; the body is not in this view. Const member mapping a
// UNICHAR_ID to a const char*.
// NOTE(review): the lifetime and ownership of the returned pointer are not
// visible here — presumably it refers to storage owned by this object;
// confirm before holding on to it.
227 const char* id_to_unichar(
UNICHAR_ID id)
const;
// Declaration only; the body is not in this view. Const member with the same
// signature shape as id_to_unichar (UNICHAR_ID in, const char* out).
// NOTE(review): how its result differs from id_to_unichar is not visible
// here — confirm in the implementation.
233 const char* id_to_unichar_ext(
UNICHAR_ID id)
const;
// Declaration only; the body is not in this view. Static member that takes a
// string and returns a STRING by value.
// NOTE(review): judging by the name, the result is a debugging rendering of
// a UTF-8 string; the exact format is not visible here — confirm.
237 static STRING debug_utf8_str(
const char* str);
242 return CleanupString(utf8_str, strlen(utf8_str));
// Declaration only; the body is not in this view. Static member taking a
// string plus an explicit int length and returning a `string` by value.
// A caller visible in this header forwards to it as
// CleanupString(utf8_str, strlen(utf8_str)), and unichar_insert compares the
// cleaned result against its input.
// NOTE(review): the return type is `string`, not the `STRING` used by other
// members — presumably std::string; confirm.
244 static string CleanupString(
const char* utf8_str,
int length);
250 return debug_str(unichar_to_id(unichar_repr));
257 void unichar_insert(
const char*
const unichar_repr,
265 string cleaned = CleanupString(unichar_repr);
266 if (cleaned != unichar_repr) {
269 int old_size = size();
271 if (size() == old_size) {
280 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
// Declaration only; the body is not in this view. Const member taking a
// unichar representation string and returning bool.
// NOTE(review): presumably reports whether the string is present in this
// set — confirm in the implementation.
285 bool contains_unichar(
const char*
const unichar_repr)
const;
// Declaration only; the body is not in this view. Overload of
// contains_unichar that additionally takes an int length.
// NOTE(review): presumably `length` is the number of bytes of unichar_repr
// to consider — confirm in the implementation.
286 bool contains_unichar(
const char*
const unichar_repr,
int length)
const;
// Declaration only; the body is not in this view. Const member taking a
// UNICHAR_ID and a unichar representation string, returning bool.
// NOTE(review): presumably true when the string stored for unichar_id equals
// unichar_repr — confirm in the implementation.
290 bool eq(
UNICHAR_ID unichar_id,
const char*
const unichar_repr)
const;
294 for (
int i = 0; i < size_used; ++i) {
295 if (unichars[i].properties.fragment != NULL) {
296 delete unichars[i].properties.fragment;
297 unichars[i].properties.fragment = NULL;
304 if (script_table != NULL) {
305 for (
int i = 0; i < script_table_size_used; ++i)
306 delete[] script_table[i];
307 delete[] script_table;
309 script_table_size_used = 0;
311 if (unichars != NULL) {
312 delete_pointers_in_unichars();
316 script_table_size_reserved = 0;
320 top_bottom_set_ =
false;
321 script_has_upper_lower_ =
false;
322 script_has_xheight_ =
false;
323 old_style_included_ =
false;
// Declaration only; the body is not in this view. Non-const member taking an
// int count.
// NOTE(review): presumably grows internal storage to hold at least
// unichars_number entries — confirm in the implementation.
343 void reserve(
int unichars_number);
348 FILE* file = fopen(filename,
"w+b");
349 if (file == NULL)
return false;
350 bool result = save_to_file(file);
359 if (!save_to_string(&str))
return false;
360 if (fwrite(&str[0], str.
length(), 1, file) != 1)
return false;
365 if (!save_to_string(&str))
return false;
366 if (file->
FWrite(&str[0], str.
length(), 1) != 1)
return false;
// Declaration only; the body is not in this view. Const member that fills
// *str and returns bool.
// The save_to_file bodies visible in this header return false when this
// returns false, and otherwise write str->length() bytes starting at
// &str[0], so a true result is treated as "str holds the bytes to write".
372 bool save_to_string(
STRING *str)
const;
// Declaration only; the body is not in this view. Takes a memory buffer, its
// size as an int, and a skip_fragments flag; returns bool.
// A two-argument overload visible in this header forwards here with
// skip_fragments = false.
// NOTE(review): what skip_fragments changes during loading is not visible
// here — confirm in the implementation.
377 bool load_from_inmemory_file(
const char*
const memory,
int mem_size,
378 bool skip_fragments);
381 return load_from_inmemory_file(memory, mem_size,
false);
388 FILE* file = fopen(filename,
"rb");
389 if (file == NULL)
return false;
390 bool result = load_from_file(file, skip_fragments);
396 return load_from_file(filename,
false);
// Declaration only; the body is not in this view. Takes an open FILE* and a
// skip_fragments flag; returns bool.
// The filename-based overload visible in this header opens the file with
// mode "rb", returns false if fopen fails, and otherwise forwards the handle
// and skip_fragments here.
401 bool load_from_file(FILE *file,
bool skip_fragments);
// Declaration only; the body is not in this view. Non-const member with no
// arguments and no return value.
// NOTE(review): presumably meant to run once after loading to derive cached
// state — confirm in the implementation and its callers.
409 void post_load_setup();
// Declaration only; the body is not in this view. Const member with no
// arguments, returning bool.
// NOTE(review): presumably true when right-to-left characters dominate this
// set; the criterion is not visible here — confirm in the implementation.
415 bool major_right_to_left()
const;
// Declaration only; the body is not in this view. Non-const member taking
// three strings: blacklist, whitelist and unblacklist.
// NOTE(review): presumably updates the per-unichar `enabled` property read
// by get_enabled; precedence between the three lists and whether NULL is
// accepted are not visible here — confirm in the implementation.
426 void set_black_and_whitelist(
const char* blacklist,
const char* whitelist,
427 const char* unblacklist);
431 unichars[unichar_id].properties.isalpha = value;
436 unichars[unichar_id].properties.islower = value;
441 unichars[unichar_id].properties.isupper = value;
446 unichars[unichar_id].properties.isdigit = value;
451 unichars[unichar_id].properties.ispunctuation = value;
456 unichars[unichar_id].properties.isngram = value;
462 unichars[unichar_id].properties.script_id = add_script(value);
467 unichars[unichar_id].properties.other_case = other_case;
472 unichars[unichar_id].properties.direction = value;
477 unichars[unichar_id].properties.mirror = mirror;
482 unichars[unichar_id].properties.normed = normed;
483 unichars[unichar_id].properties.normed_ids.truncate(0);
491 if (INVALID_UNICHAR_ID == unichar_id)
return false;
493 return unichars[unichar_id].properties.isalpha;
498 if (INVALID_UNICHAR_ID == unichar_id)
return false;
500 return unichars[unichar_id].properties.islower;
505 if (INVALID_UNICHAR_ID == unichar_id)
return false;
507 return unichars[unichar_id].properties.isupper;
512 if (INVALID_UNICHAR_ID == unichar_id)
return false;
514 return unichars[unichar_id].properties.isdigit;
519 if (INVALID_UNICHAR_ID == unichar_id)
return false;
521 return unichars[unichar_id].properties.ispunctuation;
526 if (INVALID_UNICHAR_ID == unichar_id)
return false;
528 return unichars[unichar_id].properties.isngram;
// Declaration only; the body is not in this view. Const member taking a
// UNICHAR_ID and returning bool. Unlike the neighbouring get_is* accessors,
// it is not defined inline in this header.
// NOTE(review): what counts as "private" is not visible here — confirm in
// the implementation.
533 bool get_isprivate(
UNICHAR_ID unichar_id)
const;
537 return top_bottom_set_;
// Declaration only; the body is not in this view. Non-const member with no
// arguments.
// NOTE(review): presumably applies UNICHAR_PROPERTIES::SetRangesEmpty to
// every entry — confirm in the implementation.
540 void set_ranges_empty();
545 PartialSetPropertiesFromOther(0, src);
// Declaration only; the body is not in this view. Takes a start index and a
// source UNICHARSET by const reference.
// A caller visible in this header invokes it as
// PartialSetPropertiesFromOther(0, src).
// NOTE(review): presumably copies properties from src for entries at
// start_index and beyond — confirm in the implementation.
548 void PartialSetPropertiesFromOther(
int start_index,
const UNICHARSET& src);
// Declaration only; the body is not in this view. Non-const member taking a
// source UNICHARSET by const reference.
// NOTE(review): presumably widens this set's stored ranges using src (cf.
// UNICHAR_PROPERTIES::ExpandRangesFrom) — confirm in the implementation.
552 void ExpandRangesFromOther(
const UNICHARSET& src);
// Declaration only; the body is not in this view. Non-const member taking a
// source UNICHARSET by const reference.
// NOTE(review): presumably adds src's entries to this set; how duplicates
// are handled is not visible here — confirm in the implementation.
559 void AppendOtherUnicharset(
const UNICHARSET& src);
568 int* min_bottom,
int* max_bottom,
569 int* min_top,
int* max_top)
const {
570 if (INVALID_UNICHAR_ID == unichar_id) {
571 *min_bottom = *min_top = 0;
572 *max_bottom = *max_top = 256;
576 *min_bottom = unichars[unichar_id].properties.min_bottom;
577 *max_bottom = unichars[unichar_id].properties.max_bottom;
578 *min_top = unichars[unichar_id].properties.min_top;
579 *max_top = unichars[unichar_id].properties.max_top;
582 int min_bottom,
int max_bottom,
583 int min_top,
int max_top) {
584 unichars[unichar_id].properties.min_bottom =
586 unichars[unichar_id].properties.max_bottom =
588 unichars[unichar_id].properties.min_top =
590 unichars[unichar_id].properties.max_top =
596 float* width,
float* width_sd)
const {
597 if (INVALID_UNICHAR_ID == unichar_id) {
603 *width = unichars[unichar_id].properties.width;
604 *width_sd = unichars[unichar_id].properties.width_sd;
607 unichars[unichar_id].properties.width = width;
608 unichars[unichar_id].properties.width_sd = width_sd;
613 float* bearing,
float* bearing_sd)
const {
614 if (INVALID_UNICHAR_ID == unichar_id) {
615 *bearing = *bearing_sd = 0.0f;
619 *bearing = unichars[unichar_id].properties.bearing;
620 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
623 float bearing,
float bearing_sd) {
624 unichars[unichar_id].properties.bearing = bearing;
625 unichars[unichar_id].properties.bearing_sd = bearing_sd;
630 float* advance,
float* advance_sd)
const {
631 if (INVALID_UNICHAR_ID == unichar_id) {
632 *advance = *advance_sd = 0;
636 *advance = unichars[unichar_id].properties.advance;
637 *advance_sd = unichars[unichar_id].properties.advance_sd;
640 float advance,
float advance_sd) {
641 unichars[unichar_id].properties.advance = advance;
642 unichars[unichar_id].properties.advance_sd = advance_sd;
646 return unichars[unichar_id].properties.AnyRangeEmpty();
652 if (INVALID_UNICHAR_ID == unichar_id)
return true;
653 int script_id = get_script(unichar_id);
654 return script_id != han_sid_ && script_id != thai_sid_ &&
655 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
656 script_id != katakana_sid_;
663 if (INVALID_UNICHAR_ID == unichar_id)
return null_sid_;
665 return unichars[unichar_id].properties.script_id;
// Declaration only; the body is not in this view. Const member taking a
// UNICHAR_ID and returning an unsigned int. A string-based overload visible
// in this header forwards here via unichar_to_id(unichar_repr).
// NOTE(review): the layout of the returned value (presumably a bit mask of
// the boolean properties) is not visible here — confirm.
670 unsigned int get_properties(
UNICHAR_ID unichar_id)
const;
// Declaration only; the body is not in this view. Const member taking a
// UNICHAR_ID and returning a single char. A string-based overload visible in
// this header forwards here via unichar_to_id(unichar_repr).
// NOTE(review): the set of characters that can be returned and their
// meanings are not visible here — confirm in the implementation.
679 char get_chartype(
UNICHAR_ID unichar_id)
const;
683 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
685 return unichars[unichar_id].properties.other_case;
692 return unichars[unichar_id].properties.direction;
697 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
699 return unichars[unichar_id].properties.mirror;
704 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
706 if (unichars[unichar_id].properties.islower)
return unichar_id;
707 return unichars[unichar_id].properties.other_case;
712 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
714 if (unichars[unichar_id].properties.isupper)
return unichar_id;
715 return unichars[unichar_id].properties.other_case;
// Declaration only; the body is not in this view. Const member with no
// arguments, returning bool.
// NOTE(review): presumably true when two entries share the same unicode
// sequence — confirm in the implementation.
729 bool AnyRepeatedUnicodes()
const;
734 if (INVALID_UNICHAR_ID == unichar_id)
return NULL;
736 return unichars[unichar_id].properties.fragment;
741 return get_isalpha(unichar_to_id(unichar_repr));
746 return get_islower(unichar_to_id(unichar_repr));
751 return get_isupper(unichar_to_id(unichar_repr));
756 return get_isdigit(unichar_to_id(unichar_repr));
761 return get_ispunctuation(unichar_to_id(unichar_repr));
767 return get_properties(unichar_to_id(unichar_repr));
771 return get_chartype(unichar_to_id(unichar_repr));
778 return get_script(unichar_to_id(unichar_repr));
784 if (unichar_repr == NULL || unichar_repr[0] ==
'\0' ||
785 !ids.contains(unichar_repr,
false)) {
788 return get_fragment(unichar_to_id(unichar_repr));
795 return get_isalpha(unichar_to_id(unichar_repr, length));
802 return get_islower(unichar_to_id(unichar_repr, length));
809 return get_isupper(unichar_to_id(unichar_repr, length));
816 return get_isdigit(unichar_to_id(unichar_repr, length));
823 return get_ispunctuation(unichar_to_id(unichar_repr, length));
829 return unichars[unichar_id].properties.normed.string();
835 return unichars[unichar_id].properties.normed_ids;
844 return get_script(unichar_to_id(unichar_repr, length));
849 return script_table_size_used;
854 if (
id >= script_table_size_used ||
id < 0)
856 return script_table[id];
// Declaration only; the body is not in this view. Const member taking a
// script name string and returning an int.
// NOTE(review): presumably the index of script_name in script_table; the
// value returned for an unknown name is not visible here — confirm.
864 int get_script_id_from_name(
const char* script_name)
const;
868 return script == null_script;
// Declaration only; the body is not in this view. Non-const member taking a
// script name and returning an int.
// A setter body visible in this header stores the return value directly in
// properties.script_id, so the result is used as a script id.
// NOTE(review): whether an already-known script returns its existing id is
// not visible here — confirm in the implementation.
874 int add_script(
const char* script);
878 return unichars[unichar_id].properties.enabled;
896 return script_has_upper_lower_;
903 return script_has_xheight_;
908 struct UNICHAR_PROPERTIES {
909 UNICHAR_PROPERTIES();
914 void SetRangesOpen();
916 void SetRangesEmpty();
919 bool AnyRangeEmpty()
const;
921 void ExpandRangesFrom(
const UNICHAR_PROPERTIES& src);
923 void CopyFrom(
const UNICHAR_PROPERTIES& src);
970 struct UNICHAR_SLOT {
972 UNICHAR_PROPERTIES properties;
986 void encode_string(
const char* str,
int str_index,
int str_length,
989 int* best_total_length,
998 bool GetStrProperties(
const char* utf8_str,
999 UNICHAR_PROPERTIES* props)
const;
1005 bool skip_fragments);
// Data members. NOTE(review): intervening members of the original class
// (e.g. size_used, referenced elsewhere in this header) are not in this view.

// Static table of string pairs (second dimension is 2).
// NOTE(review): presumably {from, to} replacements used by CleanupString —
// confirm in the implementation.
1010 static const char* kCleanupMaps[][2];
// Array of slots indexed by UNICHAR_ID; accessors read and write
// unichars[unichar_id].properties.
1012 UNICHAR_SLOT* unichars;
// Array of heap-allocated strings: the cleanup code in this header calls
// delete[] on each of the first script_table_size_used entries and then on
// the table itself. get_script_from_script_id returns script_table[id].
1016 char** script_table;
// Number of entries of script_table in use; reset to 0 by the cleanup code.
1017 int script_table_size_used;
// Reset to 0 by the cleanup code.
// NOTE(review): presumably the allocated capacity of script_table — confirm.
1018 int script_table_size_reserved;
// Compared by pointer (script == null_script) in is_null_script, not by
// string contents.
1019 const char* null_script;
// Flag returned by an accessor in this header; reset to false by the
// cleanup code.
1021 bool top_bottom_set_;
// Flag returned by script_has_upper_lower(); reset to false by the cleanup
// code.
1023 bool script_has_upper_lower_;
// Flag returned by script_has_xheight(); reset to false by the cleanup code.
1026 bool script_has_xheight_;
// Reset to false by the cleanup code.
// NOTE(review): where this flag is set to true is not visible here.
1028 bool old_style_included_;
1047 #endif // TESSERACT_CCUTIL_UNICHARSET_H_
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
unsigned int get_properties(const char *const unichar_repr) const
int FWrite(const void *buffer, int size, int count)
void delete_pointers_in_unichars()
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
void set_islower(UNICHAR_ID unichar_id, bool value)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
void set_normed(UNICHAR_ID unichar_id, const char *normed)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
void SetPropertiesFromOther(const UNICHARSET &src)
void set_isdigit(UNICHAR_ID unichar_id, bool value)
STRING debug_str(const char *unichar_repr) const
bool script_has_upper_lower() const
bool get_isdigit(UNICHAR_ID unichar_id) const
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
bool get_isalpha(UNICHAR_ID unichar_id) const
bool load_from_inmemory_file(const char *const memory, int mem_size)
bool get_isalpha(const char *const unichar_repr) const
const char * get_script_from_script_id(int id) const
char get_chartype(const char *const unichar_repr) const
int get_script(const char *const unichar_repr, int length) const
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
bool is_null_script(const char *script) const
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
bool get_islower(const char *const unichar_repr, int length) const
bool save_to_file(tesseract::TFile *file) const
bool get_isalpha(const char *const unichar_repr, int length) const
static string CleanupString(const char *utf8_str)
void set_isngram(UNICHAR_ID unichar_id, bool value)
bool script_has_xheight() const
int direction(EDGEPT *point)
bool get_enabled(UNICHAR_ID unichar_id) const
bool get_ispunctuation(const char *const unichar_repr, int length) const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
bool save_to_file(const char *const filename) const
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
int get_script(const char *const unichar_repr) const
bool get_isupper(const char *const unichar_repr) const
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
bool has_special_codes() const
void set_isupper(UNICHAR_ID unichar_id, bool value)
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Direction get_direction(UNICHAR_ID unichar_id) const
bool get_islower(UNICHAR_ID unichar_id) const
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
void set_all(const char *unichar, int pos, int total, bool natural)
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
void unichar_insert_backwards_compatible(const char *const unichar_repr)
bool get_ispunctuation(const char *const unichar_repr) const
bool get_isdigit(const char *const unichar_repr, int length) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isngram(UNICHAR_ID unichar_id) const
bool save_to_file(FILE *file) const
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
bool top_bottom_useful() const
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
const char * get_unichar() const
bool load_from_file(const char *const filename, bool skip_fragments)
void set_script(UNICHAR_ID unichar_id, const char *value)
bool get_islower(const char *const unichar_repr) const
bool equals(const CHAR_FRAGMENT *other) const
void set_natural(bool value)
bool get_isdigit(const char *const unichar_repr) const
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
bool load_from_file(FILE *file)
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
int get_script_table_size() const
bool is_beginning() const
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
void unichar_insert(const char *const unichar_repr)
bool load_from_file(const char *const filename)
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
bool get_isupper(const char *const unichar_repr, int length) const
int get_script(UNICHAR_ID unichar_id) const
void set_unichar(const char *uch)
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
bool equals(const char *other_unichar, int other_pos, int other_total) const