tesseract  4.00.00dev
UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, int *first_bad_position) const
 
bool encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
 
const char * id_to_unichar (UNICHAR_ID id) const
 
const char * id_to_unichar_ext (UNICHAR_ID id) const
 
STRING debug_str (UNICHAR_ID id) const
 
STRING debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style)
 
void unichar_insert (const char *const unichar_repr)
 
void unichar_insert_backwards_compatible (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
int size () const
 
void reserve (int unichars_number)
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool save_to_file (tesseract::TFile *file) const
 
bool save_to_string (STRING *str) const
 
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
 
bool load_from_inmemory_file (const char *const memory, int mem_size)
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
void set_normed_ids (UNICHAR_ID unichar_id)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void CopyFrom (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
 
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
 
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
 
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
 
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
 
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
 
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
 
bool IsSpaceDelimited (UNICHAR_ID unichar_id) const
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
bool has_special_codes () const
 
bool AnyRepeatedUnicodes () const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
const GenericVector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int thai_sid () const
 
int hangul_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static STRING debug_utf8_str (const char *str)
 
static string CleanupString (const char *utf8_str)
 
static string CleanupString (const char *utf8_str, int length)
 

Static Public Attributes

static TESS_API const char * kCustomLigatures [][2]
 
static TESS_API const char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]
 

Detailed Description

Definition at line 146 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_CHAR_DIRECTION_COUNT 

Definition at line 157 of file unicharset.h.

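157  {
158  U_LEFT_TO_RIGHT = 0,
159  U_RIGHT_TO_LEFT = 1,
160  U_EUROPEAN_NUMBER = 2,
161  U_EUROPEAN_NUMBER_SEPARATOR = 3,
162  U_EUROPEAN_NUMBER_TERMINATOR = 4,
163  U_ARABIC_NUMBER = 5,
164  U_COMMON_NUMBER_SEPARATOR = 6,
165  U_BLOCK_SEPARATOR = 7,
166  U_SEGMENT_SEPARATOR = 8,
167  U_WHITE_SPACE_NEUTRAL = 9,
168  U_OTHER_NEUTRAL = 10,
169  U_LEFT_TO_RIGHT_EMBEDDING = 11,
170  U_LEFT_TO_RIGHT_OVERRIDE = 12,
171  U_RIGHT_TO_LEFT_ARABIC = 13,
172  U_RIGHT_TO_LEFT_EMBEDDING = 14,
173  U_RIGHT_TO_LEFT_OVERRIDE = 15,
174  U_POP_DIRECTIONAL_FORMAT = 16,
175  U_DIR_NON_SPACING_MARK = 17,
176  U_BOUNDARY_NEUTRAL = 18,
177  U_CHAR_DIRECTION_COUNT
178  };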

Constructor & Destructor Documentation

◆ UNICHARSET()

UNICHARSET::UNICHARSET ( )

Definition at line 172 of file unicharset.cpp.

172  :
173  unichars(NULL),
174  ids(),
175  size_used(0),
176  size_reserved(0),
177  script_table(NULL),
178  script_table_size_used(0),
179  null_script("NULL") {
180  clear();
181  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
182  unichar_insert(kSpecialUnicharCodes[i]);
183  if (i == UNICHAR_JOINED)
184  set_isngram(i, true);
185  }
186 }
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:154
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:455
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623
void clear()
Definition: unicharset.h:303

◆ ~UNICHARSET()

UNICHARSET::~UNICHARSET ( )

Definition at line 188 of file unicharset.cpp.

188  {
189  clear();
190 }
void clear()
Definition: unicharset.h:303

Member Function Documentation

◆ add_script()

int UNICHARSET::add_script ( const char *  script)

Definition at line 1028 of file unicharset.cpp.

1028  {
1029  for (int i = 0; i < script_table_size_used; ++i) {
1030  if (strcmp(script, script_table[i]) == 0)
1031  return i;
1032  }
1033  if (script_table_size_reserved == 0) {
1034  script_table_size_reserved = 8;
1035  script_table = new char*[script_table_size_reserved];
1036  } else if (script_table_size_used >= script_table_size_reserved) {
1037  assert(script_table_size_used == script_table_size_reserved);
1038  script_table_size_reserved += script_table_size_reserved;
1039  char** new_script_table = new char*[script_table_size_reserved];
1040  memcpy(new_script_table, script_table,
1041  script_table_size_used * sizeof(char*));
1042  delete[] script_table;
1043  script_table = new_script_table;
1044  }
1045  script_table[script_table_size_used] = new char[strlen(script) + 1];
1046  strcpy(script_table[script_table_size_used], script);
1047  return script_table_size_used++;
1048 }

◆ AnyRepeatedUnicodes()

bool UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 1015 of file unicharset.cpp.

1015  {
1016  int start_id = 0;
1017  if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
1018  for (int id = start_id; id < size_used; ++id) {
1019  // Convert to unicodes.
1020  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1021  for (int u = 1; u < unicodes.size(); ++u) {
1022  if (unicodes[u - 1] == unicodes[u]) return true;
1023  }
1024  }
1025  return false;
1026 }
bool has_special_codes() const
Definition: unicharset.h:721
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:827

◆ AppendOtherUnicharset()

void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 461 of file unicharset.cpp.

461  {
462  int initial_used = size_used;
463  for (int ch = 0; ch < src.size_used; ++ch) {
464  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
465  const char* utf8 = src.id_to_unichar(ch);
466  int id = size_used;
467  if (contains_unichar(utf8)) {
468  id = unichar_to_id(utf8);
469  // Just expand current ranges.
470  unichars[id].properties.ExpandRangesFrom(src_props);
471  } else {
472  unichar_insert_backwards_compatible(utf8);
473  unichars[id].properties.SetRangesEmpty();
474  }
475  }
476  // Set properties, including mirror and other_case, WITHOUT reordering
477  // the unicharset.
478  PartialSetPropertiesFromOther(initial_used, src);
479 }
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:264
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:402
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ CleanupString() [1/2]

static string UNICHARSET::CleanupString ( const char *  utf8_str)
inline static

Definition at line 241 of file unicharset.h.

241  {
242  return CleanupString(utf8_str, strlen(utf8_str));
243  }
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241

◆ CleanupString() [2/2]

string UNICHARSET::CleanupString ( const char *  utf8_str,
int  length 
)
static

Definition at line 1118 of file unicharset.cpp.

1118  {
1119  string result;
1120  result.reserve(length);
1121  char ch;
1122  while ((ch = *utf8_str) != '\0' && --length >= 0) {
1123  int key_index = 0;
1124  const char* key;
1125  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1126  int match = 0;
1127  while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1128  if (key[match] == '\0') {
1129  utf8_str += match;
1130  break;
1131  }
1132  ++key_index;
1133  }
1134  if (key == nullptr) {
1135  result.push_back(ch);
1136  ++utf8_str;
1137  } else {
1138  result.append(kCleanupMaps[key_index][1]);
1139  }
1140  }
1141  return result;
1142 }

◆ clear()

void UNICHARSET::clear ( )
inline

Definition at line 303 of file unicharset.h.

303  {
304  if (script_table != NULL) {
305  for (int i = 0; i < script_table_size_used; ++i)
306  delete[] script_table[i];
307  delete[] script_table;
308  script_table = NULL;
309  script_table_size_used = 0;
310  }
311  if (unichars != NULL) {
312  delete_pointers_in_unichars();
313  delete[] unichars;
314  unichars = NULL;
315  }
316  script_table_size_reserved = 0;
317  size_reserved = 0;
318  size_used = 0;
319  ids.clear();
320  top_bottom_set_ = false;
321  script_has_upper_lower_ = false;
322  script_has_xheight_ = false;
323  old_style_included_ = false;
324  null_sid_ = 0;
325  common_sid_ = 0;
326  latin_sid_ = 0;
327  cyrillic_sid_ = 0;
328  greek_sid_ = 0;
329  han_sid_ = 0;
330  hiragana_sid_ = 0;
331  katakana_sid_ = 0;
332  thai_sid_ = 0;
333  hangul_sid_ = 0;
334  default_sid_ = 0;
335  }
void delete_pointers_in_unichars()
Definition: unicharset.h:293
void clear()
Definition: unicharmap.cpp:118

◆ common_sid()

int UNICHARSET::common_sid ( ) const
inline

Definition at line 883 of file unicharset.h.

883 { return common_sid_; }

◆ contains_unichar() [1/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 668 of file unicharset.cpp.

668  {
669  string cleaned =
670  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
671  return ids.contains(cleaned.data(), cleaned.size());
672 }
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82

◆ contains_unichar() [2/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 674 of file unicharset.cpp.

675  {
676  if (length == 0) {
677  return false;
678  }
679  string cleaned(unichar_repr, length);
680  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
681  return ids.contains(cleaned.data(), cleaned.size());
682 }
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82

◆ contains_unichar_id()

bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 279 of file unicharset.h.

279  {
280  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
281  unichar_id >= 0;
282  }

◆ CopyFrom()

void UNICHARSET::CopyFrom ( const UNICHARSET &  src)

Definition at line 445 of file unicharset.cpp.

445  {
446  clear();
447  for (int ch = 0; ch < src.size_used; ++ch) {
448  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
449  const char* utf8 = src.id_to_unichar(ch);
450  unichar_insert_backwards_compatible(utf8);
451  unichars[ch].properties.ExpandRangesFrom(src_props);
452  }
453  // Set properties, including mirror and other_case, WITHOUT reordering
454  // the unicharset.
455  PartialSetPropertiesFromOther(0, src);
456 }
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:264
void clear()
Definition: unicharset.h:303
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:402
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ cyrillic_sid()

int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 885 of file unicharset.h.

885 { return cyrillic_sid_; }

◆ debug_str() [1/2]

STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 340 of file unicharset.cpp.

340  {
341  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
342  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
343  if (fragment) {
344  return fragment->to_string();
345  }
346  const char* str = id_to_unichar(id);
347  STRING result = debug_utf8_str(str);
348  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
349  if (get_isalpha(id)) {
350  if (get_islower(id))
351  result += "a";
352  else if (get_isupper(id))
353  result += "A";
354  else
355  result += "x";
356  }
357  // Append 0 if a digit.
358  if (get_isdigit(id)) {
359  result += "0";
360  }
361  // Append p is a punctuation symbol.
362  if (get_ispunctuation(id)) {
363  result += "p";
364  }
365  return result;
366 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
static STRING to_string(const char *unichar, int pos, int total, bool natural)
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
STRING
Definition: strngs.h:45
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:316
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ debug_str() [2/2]

STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 249 of file unicharset.h.

249  {
250  return debug_str(unichar_to_id(unichar_repr));
251  }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:340

◆ debug_utf8_str()

STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 316 of file unicharset.cpp.

316  {
317  STRING result = str;
318  result += " [";
319  int step = 1;
320  // Chop into unicodes and code each as hex.
321  for (int i = 0; str[i] != '\0'; i += step) {
322  char hex[sizeof(int) * 2 + 1];
323  step = UNICHAR::utf8_step(str + i);
324  if (step == 0) {
325  step = 1;
326  sprintf(hex, "%x", str[i]);
327  } else {
328  UNICHAR ch(str + i, step);
329  sprintf(hex, "%x", ch.first_uni());
330  }
331  result += hex;
332  result += " ";
333  }
334  result += "]";
335  return result;
336 }
STRING
Definition: strngs.h:45
int step(const char *str) const
Definition: unicharset.cpp:230

◆ default_sid()

int UNICHARSET::default_sid ( ) const
inline

Definition at line 892 of file unicharset.h.

892 { return default_sid_; }

◆ delete_pointers_in_unichars()

void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 293 of file unicharset.h.

293  {
294  for (int i = 0; i < size_used; ++i) {
295  if (unichars[i].properties.fragment != NULL) {
296  delete unichars[i].properties.fragment;
297  unichars[i].properties.fragment = NULL;
298  }
299  }
300  }

◆ encodable_string()

bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 241 of file unicharset.cpp.

242  {
243  GenericVector<UNICHAR_ID> encoding;
244  return encode_string(str, true, &encoding, NULL, first_bad_position);
245 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256

◆ encode_string()

bool UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
GenericVector< UNICHAR_ID > *  encoding,
GenericVector< char > *  lengths,
int *  encoded_length 
) const

Definition at line 256 of file unicharset.cpp.

259  {
260  GenericVector<UNICHAR_ID> working_encoding;
261  GenericVector<char> working_lengths;
262  GenericVector<char> best_lengths;
263  encoding->truncate(0); // Just in case str is empty.
264  int str_length = strlen(str);
265  int str_pos = 0;
266  bool perfect = true;
267  while (str_pos < str_length) {
268  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
269  &str_pos, encoding, &best_lengths);
270  if (str_pos < str_length) {
271  // This is a non-match. Skip one utf-8 character.
272  perfect = false;
273  if (give_up_on_failure) break;
274  int step = UNICHAR::utf8_step(str + str_pos);
275  if (step == 0) step = 1;
276  encoding->push_back(INVALID_UNICHAR_ID);
277  best_lengths.push_back(step);
278  str_pos += step;
279  working_encoding = *encoding;
280  working_lengths = best_lengths;
281  }
282  }
283  if (lengths != NULL) *lengths = best_lengths;
284  if (encoded_length != NULL) *encoded_length = str_pos;
285  return perfect;
286 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
void truncate(int size)
int push_back(T object)
int step(const char *str) const
Definition: unicharset.cpp:230

◆ eq()

bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 684 of file unicharset.cpp.

685  {
686  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
687 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ ExpandRangesFromOther()

void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 432 of file unicharset.cpp.

432  {
433  for (int ch = 0; ch < size_used; ++ch) {
434  const char* utf8 = id_to_unichar(ch);
435  UNICHAR_PROPERTIES properties;
436  if (src.GetStrProperties(utf8, &properties)) {
437  // Expand just the ranges from properties.
438  unichars[ch].properties.ExpandRangesFrom(properties);
439  }
440  }
441 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ get_advance_stats()

void UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const
inline

Definition at line 629 of file unicharset.h.

630  {
631  if (INVALID_UNICHAR_ID == unichar_id) {
632  *advance = *advance_sd = 0;
633  return;
634  }
635  ASSERT_HOST(contains_unichar_id(unichar_id));
636  *advance = unichars[unichar_id].properties.advance;
637  *advance_sd = unichars[unichar_id].properties.advance_sd;
638  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_bearing_stats()

void UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const
inline

Definition at line 612 of file unicharset.h.

613  {
614  if (INVALID_UNICHAR_ID == unichar_id) {
615  *bearing = *bearing_sd = 0.0f;
616  return;
617  }
618  ASSERT_HOST(contains_unichar_id(unichar_id));
619  *bearing = unichars[unichar_id].properties.bearing;
620  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
621  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_chartype() [1/2]

char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 614 of file unicharset.cpp.

614  {
615  if (this->get_isupper(id)) return 'A';
616  if (this->get_islower(id)) return 'a';
617  if (this->get_isalpha(id)) return 'x';
618  if (this->get_isdigit(id)) return '0';
619  if (this->get_ispunctuation(id)) return 'p';
620  return 0;
621 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504

◆ get_chartype() [2/2]

char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 770 of file unicharset.h.

770  {
771  return get_chartype(unichar_to_id(unichar_repr));
772  }
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:614
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_direction()

Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 689 of file unicharset.h.

689  {
690  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
691  ASSERT_HOST(contains_unichar_id(unichar_id));
692  return unichars[unichar_id].properties.direction;
693  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_enabled()

bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 877 of file unicharset.h.

877  {
878  return unichars[unichar_id].properties.enabled;
879  }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 733 of file unicharset.h.

733  {
734  if (INVALID_UNICHAR_ID == unichar_id) return NULL;
735  ASSERT_HOST(contains_unichar_id(unichar_id));
736  return unichars[unichar_id].properties.fragment;
737  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_fragment() [2/2]

const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 783 of file unicharset.h.

783  {
784  if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
785  !ids.contains(unichar_repr, false)) {
786  return NULL;
787  }
788  return get_fragment(unichar_to_id(unichar_repr));
789  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82

◆ get_isalpha() [1/3]

bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 490 of file unicharset.h.

490  {
491  if (INVALID_UNICHAR_ID == unichar_id) return false;
492  ASSERT_HOST(contains_unichar_id(unichar_id));
493  return unichars[unichar_id].properties.isalpha;
494  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_isalpha() [2/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 740 of file unicharset.h.

740  {
741  return get_isalpha(unichar_to_id(unichar_repr));
742  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_isalpha() [3/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 793 of file unicharset.h.

794  {
795  return get_isalpha(unichar_to_id(unichar_repr, length));
796  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_isdigit() [1/3]

bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 511 of file unicharset.h.

511  {
512  if (INVALID_UNICHAR_ID == unichar_id) return false;
513  ASSERT_HOST(contains_unichar_id(unichar_id));
514  return unichars[unichar_id].properties.isdigit;
515  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_isdigit() [2/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 755 of file unicharset.h.

755  {
756  return get_isdigit(unichar_to_id(unichar_repr));
757  }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_isdigit() [3/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 814 of file unicharset.h.

815  {
816  return get_isdigit(unichar_to_id(unichar_repr, length));
817  }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_islower() [1/3]

bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 497 of file unicharset.h.

497  {
498  if (INVALID_UNICHAR_ID == unichar_id) return false;
499  ASSERT_HOST(contains_unichar_id(unichar_id));
500  return unichars[unichar_id].properties.islower;
501  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_islower() [2/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 745 of file unicharset.h.

745  {
746  return get_islower(unichar_to_id(unichar_repr));
747  }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_islower() [3/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 800 of file unicharset.h.

801  {
802  return get_islower(unichar_to_id(unichar_repr, length));
803  }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_isngram()

bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 525 of file unicharset.h.

525  {
526  if (INVALID_UNICHAR_ID == unichar_id) return false;
527  ASSERT_HOST(contains_unichar_id(unichar_id));
528  return unichars[unichar_id].properties.isngram;
529  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_isprivate()

bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 385 of file unicharset.cpp.

385  {
386  UNICHAR uc(id_to_unichar(unichar_id), -1);
387  int uni = uc.first_uni();
388  return (uni >= 0xE000 && uni <= 0xF8FF);
389 }
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ get_ispunctuation() [1/3]

bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 518 of file unicharset.h.

518  {
519  if (INVALID_UNICHAR_ID == unichar_id) return false;
520  ASSERT_HOST(contains_unichar_id(unichar_id));
521  return unichars[unichar_id].properties.ispunctuation;
522  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_ispunctuation() [2/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 760 of file unicharset.h.

760  {
761  return get_ispunctuation(unichar_to_id(unichar_repr));
762  }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_ispunctuation() [3/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 821 of file unicharset.h.

822  {
823  return get_ispunctuation(unichar_to_id(unichar_repr, length));
824  }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_isupper() [1/3]

bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 504 of file unicharset.h.

504  {
505  if (INVALID_UNICHAR_ID == unichar_id) return false;
506  ASSERT_HOST(contains_unichar_id(unichar_id));
507  return unichars[unichar_id].properties.isupper;
508  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_isupper() [2/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 750 of file unicharset.h.

750  {
751  return get_isupper(unichar_to_id(unichar_repr));
752  }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504

◆ get_isupper() [3/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 807 of file unicharset.h.

808  {
809  return get_isupper(unichar_to_id(unichar_repr, length));
810  }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504

◆ get_mirror()

UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 696 of file unicharset.h.

696  {
697  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
698  ASSERT_HOST(contains_unichar_id(unichar_id));
699  return unichars[unichar_id].properties.mirror;
700  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_normed_unichar()

const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 827 of file unicharset.h.

827  {
828  if (unichar_id == UNICHAR_SPACE) return " ";
829  return unichars[unichar_id].properties.normed.string();
830  }

◆ get_other_case()

UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 682 of file unicharset.h.

682  {
683  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
684  ASSERT_HOST(contains_unichar_id(unichar_id));
685  return unichars[unichar_id].properties.other_case;
686  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_properties() [1/2]

unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 599 of file unicharset.cpp.

599  {
600  unsigned int properties = 0;
601  if (this->get_isalpha(id))
602  properties |= ISALPHA_MASK;
603  if (this->get_islower(id))
604  properties |= ISLOWER_MASK;
605  if (this->get_isupper(id))
606  properties |= ISUPPER_MASK;
607  if (this->get_isdigit(id))
608  properties |= ISDIGIT_MASK;
609  if (this->get_ispunctuation(id))
610  properties |= ISPUNCTUATION_MASK;
611  return properties;
612 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504

◆ get_properties() [2/2]

unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 766 of file unicharset.h.

766  {
767  return get_properties(unichar_to_id(unichar_repr));
768  }
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:599
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207

◆ get_script() [1/3]

int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 662 of file unicharset.h.

662  {
663  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
664  ASSERT_HOST(contains_unichar_id(unichar_id));
665  return unichars[unichar_id].properties.script_id;
666  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_script() [2/3]

int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 777 of file unicharset.h.

777  {
778  return get_script(unichar_to_id(unichar_repr));
779  }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662

◆ get_script() [3/3]

int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 842 of file unicharset.h.

843  {
844  return get_script(unichar_to_id(unichar_repr, length));
845  }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662

◆ get_script_from_script_id()

const char* UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 853 of file unicharset.h.

853  {
854  if (id >= script_table_size_used || id < 0)
855  return null_script;
856  return script_table[id];
857  }

◆ get_script_id_from_name()

int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1107 of file unicharset.cpp.

1107  {
1108  for (int i = 0; i < script_table_size_used; ++i) {
1109  if (strcmp(script_name, script_table[i]) == 0)
1110  return i;
1111  }
1112  return 0; // 0 is always the null_script
1113 }

◆ get_script_table_size()

int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 848 of file unicharset.h.

848  {
849  return script_table_size_used;
850  }

◆ get_top_bottom()

void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 567 of file unicharset.h.

569  {
570  if (INVALID_UNICHAR_ID == unichar_id) {
571  *min_bottom = *min_top = 0;
572  *max_bottom = *max_top = 256; // kBlnCellHeight
573  return;
574  }
575  ASSERT_HOST(contains_unichar_id(unichar_id));
576  *min_bottom = unichars[unichar_id].properties.min_bottom;
577  *max_bottom = unichars[unichar_id].properties.max_bottom;
578  *min_top = unichars[unichar_id].properties.min_top;
579  *max_top = unichars[unichar_id].properties.max_top;
580  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ get_width_stats()

void UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const
inline

Definition at line 595 of file unicharset.h.

596  {
597  if (INVALID_UNICHAR_ID == unichar_id) {
598  *width = 0.0f;
599  *width_sd = 0.0f;;
600  return;
601  }
602  ASSERT_HOST(contains_unichar_id(unichar_id));
603  *width = unichars[unichar_id].properties.width;
604  *width_sd = unichars[unichar_id].properties.width_sd;
605  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ greek_sid()

int UNICHARSET::greek_sid ( ) const
inline

Definition at line 886 of file unicharset.h.

886 { return greek_sid_; }

◆ han_sid()

int UNICHARSET::han_sid ( ) const
inline

Definition at line 887 of file unicharset.h.

887 { return han_sid_; }

◆ hangul_sid()

int UNICHARSET::hangul_sid ( ) const
inline

Definition at line 891 of file unicharset.h.

891 { return hangul_sid_; }

◆ has_special_codes()

bool UNICHARSET::has_special_codes ( ) const
inline

Definition at line 721 of file unicharset.h.

721  {
722  return get_fragment(UNICHAR_BROKEN) != NULL &&
723  strcmp(id_to_unichar(UNICHAR_BROKEN),
724  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
725  }
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:154
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ hiragana_sid()

int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 888 of file unicharset.h.

888 { return hiragana_sid_; }

◆ id_to_unichar()

const char * UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 288 of file unicharset.cpp.

288  {
289  if (id == INVALID_UNICHAR_ID) {
290  return INVALID_UNICHAR;
291  }
292  ASSERT_HOST(id < this->size());
293  return unichars[id].representation;
294 }
int size() const
Definition: unicharset.h:338
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ id_to_unichar_ext()

const char * UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 296 of file unicharset.cpp.

296  {
297  if (id == INVALID_UNICHAR_ID) {
298  return INVALID_UNICHAR;
299  }
300  ASSERT_HOST(id < this->size());
301  // Resolve from the kCustomLigatures table if this is a private encoding.
302  if (get_isprivate(id)) {
303  const char* ch = id_to_unichar(id);
304  for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
305  if (!strcmp(ch, kCustomLigatures[i][1])) {
306  return kCustomLigatures[i][0];
307  }
308  }
309  }
310  // Otherwise return the stored representation.
311  return unichars[id].representation;
312 }
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:385
int size() const
Definition: unicharset.h:338
#define ASSERT_HOST(x)
Definition: errcode.h:84
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:151
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ is_null_script()

bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 867 of file unicharset.h.

867  {
868  return script == null_script;
869  }

◆ IsSpaceDelimited()

bool UNICHARSET::IsSpaceDelimited ( UNICHAR_ID  unichar_id) const
inline

Definition at line 651 of file unicharset.h.

651  {
652  if (INVALID_UNICHAR_ID == unichar_id) return true;
653  int script_id = get_script(unichar_id);
654  return script_id != han_sid_ && script_id != thai_sid_ &&
655  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
656  script_id != katakana_sid_;
657  }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662

◆ katakana_sid()

int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 889 of file unicharset.h.

889 { return katakana_sid_; }

◆ latin_sid()

int UNICHARSET::latin_sid ( ) const
inline

Definition at line 884 of file unicharset.h.

884 { return latin_sid_; }

◆ load_from_file() [1/5]

bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 387 of file unicharset.h.

387  {
388  FILE* file = fopen(filename, "rb");
389  if (file == NULL) return false;
390  bool result = load_from_file(file, skip_fragments);
391  fclose(file);
392  return result;
393  }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:387

◆ load_from_file() [2/5]

bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 395 of file unicharset.h.

395  {
396  return load_from_file(filename, false);
397  }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:387

◆ load_from_file() [3/5]

bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 772 of file unicharset.cpp.

772  {
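773  LocalFilePointer lfp(file);
774  TessResultCallback2<char *, char *, int> *fgets_cb =
775  NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
776  bool success = load_via_fgets(fgets_cb, skip_fragments);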
777  delete fgets_cb;
778  return success;
779 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
char * fgets(char *dst, int size)
Definition: unicharset.cpp:765

◆ load_from_file() [4/5]

bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 402 of file unicharset.h.

402 { return load_from_file(file, false); }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:387

◆ load_from_file() [5/5]

bool UNICHARSET::load_from_file ( tesseract::TFile file,
bool  skip_fragments 
)

Definition at line 781 of file unicharset.cpp.

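781  {
782  TessResultCallback2<char *, char *, int> *fgets_cb =
783  NewPermanentTessCallback(file, &tesseract::TFile::FGets);
784  bool success = load_via_fgets(fgets_cb, skip_fragments);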
785  delete fgets_cb;
786  return success;
787 }
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:86
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116

◆ load_from_inmemory_file() [1/2]

bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 752 of file unicharset.cpp.

753  {
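754  InMemoryFilePointer mem_fp(memory, mem_size);
755  TessResultCallback2<char *, char *, int> *fgets_cb =
756  NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
757  bool success = load_via_fgets(fgets_cb, skip_fragments);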
758  delete fgets_cb;
759  return success;
760 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:730

◆ load_from_inmemory_file() [2/2]

bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
)
inline

Definition at line 380 of file unicharset.h.

380  {
381  return load_from_inmemory_file(memory, mem_size, false);
382  }
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:752

◆ major_right_to_left()

bool UNICHARSET::major_right_to_left ( ) const

Definition at line 960 of file unicharset.cpp.

960  {
961  int ltr_count = 0;
962  int rtl_count = 0;
963  for (int id = 0; id < size_used; ++id) {
964  int dir = get_direction(id);
965  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
966  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
967  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
968  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
969  }
970  return rtl_count > ltr_count;
971 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:689

◆ normed_ids()

const GenericVector<UNICHAR_ID>& UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const
inline

Definition at line 834 of file unicharset.h.

834  {
835  return unichars[unichar_id].properties.normed_ids;
836  }

◆ null_sid()

int UNICHARSET::null_sid ( ) const
inline

Definition at line 882 of file unicharset.h.

882 { return null_sid_; }

◆ PartialSetPropertiesFromOther()

void UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET src 
)

Definition at line 402 of file unicharset.cpp.

403  {
404  for (int ch = start_index; ch < size_used; ++ch) {
405  const char* utf8 = id_to_unichar(ch);
406  UNICHAR_PROPERTIES properties;
407  if (src.GetStrProperties(utf8, &properties)) {
408  // Setup the script_id, other_case, and mirror properly.
409  const char* script = src.get_script_from_script_id(properties.script_id);
410  properties.script_id = add_script(script);
411  const char* other_case = src.id_to_unichar(properties.other_case);
412  if (contains_unichar(other_case)) {
413  properties.other_case = unichar_to_id(other_case);
414  } else {
415  properties.other_case = ch;
416  }
417  const char* mirror_str = src.id_to_unichar(properties.mirror);
418  if (contains_unichar(mirror_str)) {
419  properties.mirror = unichar_to_id(mirror_str);
420  } else {
421  properties.mirror = ch;
422  }
423  unichars[ch].properties.CopyFrom(properties);
424  set_normed_ids(ch);
425  }
426  }
427 }
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:853
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
int add_script(const char *script)
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:370
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ post_load_setup()

void UNICHARSET::post_load_setup ( )

Definition at line 894 of file unicharset.cpp.

894  {
895  // Number of alpha chars with the case property minus those without,
896  // in order to determine that half the alpha chars have case.
897  int net_case_alphas = 0;
898  int x_height_alphas = 0;
899  int cap_height_alphas = 0;
900  top_bottom_set_ = false;
901  for (UNICHAR_ID id = 0; id < size_used; ++id) {
902  int min_bottom = 0;
903  int max_bottom = MAX_UINT8;
904  int min_top = 0;
905  int max_top = MAX_UINT8;
906  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
907  if (min_top > 0)
908  top_bottom_set_ = true;
909  if (get_isalpha(id)) {
910  if (get_islower(id) || get_isupper(id))
911  ++net_case_alphas;
912  else
913  --net_case_alphas;
914  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
915  ++x_height_alphas;
916  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
917  ++cap_height_alphas;
918  }
919  set_normed_ids(id);
920  }
921 
922  script_has_upper_lower_ = net_case_alphas > 0;
923  script_has_xheight_ = script_has_upper_lower_ ||
924  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
925  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
926 
927  null_sid_ = get_script_id_from_name(null_script);
928  ASSERT_HOST(null_sid_ == 0);
929  common_sid_ = get_script_id_from_name("Common");
930  latin_sid_ = get_script_id_from_name("Latin");
931  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
932  greek_sid_ = get_script_id_from_name("Greek");
933  han_sid_ = get_script_id_from_name("Han");
934  hiragana_sid_ = get_script_id_from_name("Hiragana");
935  katakana_sid_ = get_script_id_from_name("Katakana");
936  thai_sid_ = get_script_id_from_name("Thai");
937  hangul_sid_ = get_script_id_from_name("Hangul");
938 
939  // Compute default script. Use the highest-counting alpha script, that is
940  // not the common script, as that still contains some "alphas".
941  int* script_counts = new int[script_table_size_used];
942  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
943  for (int id = 0; id < size_used; ++id) {
944  if (get_isalpha(id)) {
945  ++script_counts[get_script(id)];
946  }
947  }
948  default_sid_ = 0;
949  for (int s = 1; s < script_table_size_used; ++s) {
950  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
951  default_sid_ = s;
952  }
953  delete [] script_counts;
954 }
int get_script_id_from_name(const char *script_name) const
const double kMinCapHeightFraction
Definition: unicharset.cpp:58
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:567
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
const double kMinXHeightFraction
Definition: unicharset.cpp:57
#define MAX_UINT8
Definition: host.h:63
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:370
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
#define ASSERT_HOST(x)
Definition: errcode.h:84
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
int UNICHAR_ID
Definition: unichar.h:35

◆ PropertiesIncomplete()

bool UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const
inline

Definition at line 645 of file unicharset.h.

645  {
646  return unichars[unichar_id].properties.AnyRangeEmpty();
647  }

◆ reserve()

void UNICHARSET::reserve ( int  unichars_number)

Definition at line 192 of file unicharset.cpp.

192  {
193  if (unichars_number > size_reserved) {
194  UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
195  for (int i = 0; i < size_used; ++i)
196  unichars_new[i] = unichars[i];
197  for (int j = size_used; j < unichars_number; ++j) {
198  unichars_new[j].properties.script_id = add_script(null_script);
199  }
200  delete[] unichars;
201  unichars = unichars_new;
202  size_reserved = unichars_number;
203  }
204 }
int add_script(const char *script)

◆ save_to_file() [1/3]

bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 347 of file unicharset.h.

347  {
348  FILE* file = fopen(filename, "w+b");
349  if (file == NULL) return false;
350  bool result = save_to_file(file);
351  fclose(file);
352  return result;
353  }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:347

◆ save_to_file() [2/3]

bool UNICHARSET::save_to_file ( FILE *  file) const
inline

Definition at line 357 of file unicharset.h.

357  {
358  STRING str;
359  if (!save_to_string(&str)) return false;
360  if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
361  return true;
362  }
STRING
Definition: strngs.h:45
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:689
inT32 length() const
Definition: strngs.cpp:193

◆ save_to_file() [3/3]

bool UNICHARSET::save_to_file ( tesseract::TFile file) const
inline

Definition at line 363 of file unicharset.h.

363  {
364  STRING str;
365  if (!save_to_string(&str)) return false;
366  if (file->FWrite(&str[0], str.length(), 1) != 1) return false;
367  return true;
368  }
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:148
STRING
Definition: strngs.h:45
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:689
inT32 length() const
Definition: strngs.cpp:193

◆ save_to_string()

bool UNICHARSET::save_to_string ( STRING str) const

Definition at line 689 of file unicharset.cpp.

689  {
690  const int kFileBufSize = 1024;
691  char buffer[kFileBufSize + 1];
692  snprintf(buffer, kFileBufSize, "%d\n", this->size());
693  *str = buffer;
694  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
695  int min_bottom, max_bottom, min_top, max_top;
696  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
697  float width, width_sd;
698  get_width_stats(id, &width, &width_sd);
699  float bearing, bearing_sd;
700  get_bearing_stats(id, &bearing, &bearing_sd);
701  float advance, advance_sd;
702  get_advance_stats(id, &advance, &advance_sd);
703  unsigned int properties = this->get_properties(id);
704  if (strcmp(this->id_to_unichar(id), " ") == 0) {
705  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
706  this->get_script_from_script_id(this->get_script(id)),
707  this->get_other_case(id));
708  } else {
709  snprintf(buffer, kFileBufSize,
710  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
711  this->id_to_unichar(id), properties,
712  min_bottom, max_bottom, min_top, max_top, width, width_sd,
713  bearing, bearing_sd, advance, advance_sd,
714  this->get_script_from_script_id(this->get_script(id)),
715  this->get_other_case(id), this->get_direction(id),
716  this->get_mirror(id), this->get_normed_unichar(id),
717  this->debug_str(id).string());
718  }
719  *str += buffer;
720  }
721  return true;
722 }
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:567
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:599
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:853
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:629
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int size() const
Definition: unicharset.h:338
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:827
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:689
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:595
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:340
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:696
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:612
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35

◆ script_has_upper_lower()

bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 895 of file unicharset.h.

895  {
896  return script_has_upper_lower_;
897  }

◆ script_has_xheight()

bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 902 of file unicharset.h.

902  {
903  return script_has_xheight_;
904  }

◆ set_advance_stats()

void UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
)
inline

Definition at line 639 of file unicharset.h.

640  {
641  unichars[unichar_id].properties.advance = advance;
642  unichars[unichar_id].properties.advance_sd = advance_sd;
643  }

◆ set_bearing_stats()

void UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
)
inline

Definition at line 622 of file unicharset.h.

623  {
624  unichars[unichar_id].properties.bearing = bearing;
625  unichars[unichar_id].properties.bearing_sd = bearing_sd;
626  }

◆ set_black_and_whitelist()

void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 977 of file unicharset.cpp.

979  {
980  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
981  // Set everything to default
982  for (int ch = 0; ch < size_used; ++ch)
983  unichars[ch].properties.enabled = def_enabled;
984  if (!def_enabled) {
985  // Enable the whitelist.
986  GenericVector<UNICHAR_ID> encoding;
987  encode_string(whitelist, false, &encoding, NULL, NULL);
988  for (int i = 0; i < encoding.size(); ++i) {
989  if (encoding[i] != INVALID_UNICHAR_ID)
990  unichars[encoding[i]].properties.enabled = true;
991  }
992  }
993  if (blacklist != NULL && blacklist[0] != '\0') {
994  // Disable the blacklist.
995  GenericVector<UNICHAR_ID> encoding;
996  encode_string(blacklist, false, &encoding, NULL, NULL);
997  for (int i = 0; i < encoding.size(); ++i) {
998  if (encoding[i] != INVALID_UNICHAR_ID)
999  unichars[encoding[i]].properties.enabled = false;
1000  }
1001  }
1002  if (unblacklist != NULL && unblacklist[0] != '\0') {
1003  // Re-enable the unblacklist.
1004  GenericVector<UNICHAR_ID> encoding;
1005  encode_string(unblacklist, false, &encoding, NULL, NULL);
1006  for (int i = 0; i < encoding.size(); ++i) {
1007  if (encoding[i] != INVALID_UNICHAR_ID)
1008  unichars[encoding[i]].properties.enabled = true;
1009  }
1010  }
1011 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
int size() const
Definition: genericvector.h:72

◆ set_direction()

void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 471 of file unicharset.h.

471  {
472  unichars[unichar_id].properties.direction = value;
473  }

◆ set_isalpha()

void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 430 of file unicharset.h.

430  {
431  unichars[unichar_id].properties.isalpha = value;
432  }

◆ set_isdigit()

void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 445 of file unicharset.h.

445  {
446  unichars[unichar_id].properties.isdigit = value;
447  }

◆ set_islower()

void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 435 of file unicharset.h.

435  {
436  unichars[unichar_id].properties.islower = value;
437  }

◆ set_isngram()

void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 455 of file unicharset.h.

455  {
456  unichars[unichar_id].properties.isngram = value;
457  }

◆ set_ispunctuation()

void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 450 of file unicharset.h.

450  {
451  unichars[unichar_id].properties.ispunctuation = value;
452  }

◆ set_isupper()

void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 440 of file unicharset.h.

440  {
441  unichars[unichar_id].properties.isupper = value;
442  }

◆ set_mirror()

void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 476 of file unicharset.h.

476  {
477  unichars[unichar_id].properties.mirror = mirror;
478  }

◆ set_normed()

void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 481 of file unicharset.h.

481  {
482  unichars[unichar_id].properties.normed = normed;
483  unichars[unichar_id].properties.normed_ids.truncate(0);
484  }

◆ set_normed_ids()

void UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 370 of file unicharset.cpp.

370  {
371  unichars[unichar_id].properties.normed_ids.truncate(0);
372  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
373  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
374  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
375  true, &unichars[unichar_id].properties.normed_ids,
376  NULL, NULL)) {
377  unichars[unichar_id].properties.normed_ids.truncate(0);
378  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
379  }
380 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ set_other_case()

void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 466 of file unicharset.h.

466  {
467  unichars[unichar_id].properties.other_case = other_case;
468  }

◆ set_ranges_empty()

void UNICHARSET::set_ranges_empty ( )

Definition at line 393 of file unicharset.cpp.

393  {
394  for (int id = 0; id < size_used; ++id) {
395  unichars[id].properties.SetRangesEmpty();
396  }
397 }

◆ set_script()

void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 461 of file unicharset.h.

461  {
462  unichars[unichar_id].properties.script_id = add_script(value);
463  }
int add_script(const char *script)

◆ set_top_bottom()

void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 581 of file unicharset.h.

583  {
584  unichars[unichar_id].properties.min_bottom =
585  static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
586  unichars[unichar_id].properties.max_bottom =
587  static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
588  unichars[unichar_id].properties.min_top =
589  static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
590  unichars[unichar_id].properties.max_top =
591  static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
592  }
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
#define MAX_UINT8
Definition: host.h:63
uint8_t uinT8
Definition: host.h:35

◆ set_width_stats()

void UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
)
inline

Definition at line 606 of file unicharset.h.

606  {
607  unichars[unichar_id].properties.width = width;
608  unichars[unichar_id].properties.width_sd = width_sd;
609  }

◆ SetPropertiesFromOther()

void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET &  src)
inline

Definition at line 544 of file unicharset.h.

544  {
545  PartialSetPropertiesFromOther(0, src);
546  }
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:402

◆ size()

int UNICHARSET::size ( ) const
inline

Definition at line 338 of file unicharset.h.

338  {
339  return size_used;
340  }

◆ SizesDistinct()

bool UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 483 of file unicharset.cpp.

483  {
484  int overlap = MIN(unichars[id1].properties.max_top,
485  unichars[id2].properties.max_top) -
486  MAX(unichars[id1].properties.min_top,
487  unichars[id2].properties.min_top);
488  return overlap <= 0;
489 }
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24

◆ step()

int UNICHARSET::step ( const char *  str) const

Definition at line 230 of file unicharset.cpp.

230  {
231  GenericVector<UNICHAR_ID> encoding;
232  GenericVector<char> lengths;
233  encode_string(str, true, &encoding, &lengths, NULL);
234  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
235  return lengths[0];
236 }
bool empty() const
Definition: genericvector.h:91
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256

◆ thai_sid()

int UNICHARSET::thai_sid ( ) const
inline

Definition at line 890 of file unicharset.h.

890 { return thai_sid_; }

◆ to_lower()

UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 703 of file unicharset.h.

703  {
704  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
705  ASSERT_HOST(contains_unichar_id(unichar_id));
706  if (unichars[unichar_id].properties.islower) return unichar_id;
707  return unichars[unichar_id].properties.other_case;
708  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ to_upper()

UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 711 of file unicharset.h.

711  {
712  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
713  ASSERT_HOST(contains_unichar_id(unichar_id));
714  if (unichars[unichar_id].properties.isupper) return unichar_id;
715  return unichars[unichar_id].properties.other_case;
716  }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279

◆ top_bottom_useful()

bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 536 of file unicharset.h.

536  {
537  return top_bottom_set_;
538  }

◆ unichar_insert() [1/2]

void UNICHARSET::unichar_insert ( const char *const  unichar_repr,
OldUncleanUnichars  old_style 
)

Definition at line 623 of file unicharset.cpp.

624  {
625  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
626  string cleaned =
627  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
628  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
629  const char* str = cleaned.c_str();
630  GenericVector<int> encoding;
631  if (!old_style_included_ &&
632  encode_string(str, true, &encoding, nullptr, nullptr))
633  return;
634  if (size_used == size_reserved) {
635  if (size_used == 0)
636  reserve(8);
637  else
638  reserve(2 * size_used);
639  }
640  int index = 0;
641  do {
642  if (index > UNICHAR_LEN) {
643  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
644  unichar_repr);
645  return;
646  }
647  unichars[size_used].representation[index++] = *str++;
648  } while (*str != '\0');
649  unichars[size_used].representation[index] = '\0';
650  this->set_script(size_used, null_script);
651  // If the given unichar_repr represents a fragmented character, set
652  // fragment property to a pointer to CHAR_FRAGMENT class instance with
653  // information parsed from the unichar representation. Use the script
654  // of the base unichar for the fragmented character if possible.
655  CHAR_FRAGMENT* frag =
656  CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
657  this->unichars[size_used].properties.fragment = frag;
658  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
659  this->unichars[size_used].properties.script_id =
660  this->get_script(frag->get_unichar());
661  }
662  this->unichars[size_used].properties.enabled = true;
663  ids.insert(unichars[size_used].representation, size_used);
664  ++size_used;
665  }
666 }
static CHAR_FRAGMENT * parse_from_string(const char *str)
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
#define UNICHAR_LEN
Definition: unichar.h:31
const char * get_unichar() const
Definition: unicharset.h:71
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:461
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82
void reserve(int unichars_number)
Definition: unicharset.cpp:192
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662

◆ unichar_insert() [2/2]

void UNICHARSET::unichar_insert ( const char *const  unichar_repr)
inline

Definition at line 259 of file unicharset.h.

259  {
260  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
261  }
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623

◆ unichar_insert_backwards_compatible()

void UNICHARSET::unichar_insert_backwards_compatible ( const char *const  unichar_repr)
inline

Definition at line 264 of file unicharset.h.

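264  {
265  string cleaned = CleanupString(unichar_repr);
266  if (cleaned != unichar_repr) {
267  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
268  } else {
269  int old_size = size();
270  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
271  if (size() == old_size) {
272  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
273  }
274  }
275  }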
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
int size() const
Definition: unicharset.h:338
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623

◆ unichar_to_id() [1/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 207 of file unicharset.cpp.

207  {
208  string cleaned =
209  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
210  return ids.contains(cleaned.data(), cleaned.size())
211  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
212  : INVALID_UNICHAR_ID;
213 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:37
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82

◆ unichar_to_id() [2/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 215 of file unicharset.cpp.

216  {
217  assert(length > 0 && length <= UNICHAR_LEN);
218  string cleaned(unichar_repr, length);
219  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
220  return ids.contains(cleaned.data(), cleaned.size())
221  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
222  : INVALID_UNICHAR_ID;
223 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:37
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
#define UNICHAR_LEN
Definition: unichar.h:31
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82

Member Data Documentation

◆ kCustomLigatures

const char * UNICHARSET::kCustomLigatures
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{NULL, NULL}
}

Definition at line 151 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * UNICHARSET::kSpecialUnicharCodes
static
Initial value:
= {
" ",
"Joined",
"|Broken|0|1"
}

Definition at line 154 of file unicharset.h.


The documentation for this class was generated from the following files: