#include <unicharset.h>

Public Types
enum	Direction { U_LEFT_TO_RIGHT = 0 , U_RIGHT_TO_LEFT = 1 , U_EUROPEAN_NUMBER = 2 , U_EUROPEAN_NUMBER_SEPARATOR = 3 , U_EUROPEAN_NUMBER_TERMINATOR = 4 , U_ARABIC_NUMBER = 5 , U_COMMON_NUMBER_SEPARATOR = 6 , U_BLOCK_SEPARATOR = 7 , U_SEGMENT_SEPARATOR = 8 , U_WHITE_SPACE_NEUTRAL = 9 , U_OTHER_NEUTRAL = 10 , U_LEFT_TO_RIGHT_EMBEDDING = 11 , U_LEFT_TO_RIGHT_OVERRIDE = 12 , U_RIGHT_TO_LEFT_ARABIC = 13 , U_RIGHT_TO_LEFT_EMBEDDING = 14 , U_RIGHT_TO_LEFT_OVERRIDE = 15 , U_POP_DIRECTIONAL_FORMAT = 16 , U_DIR_NON_SPACING_MARK = 17 , U_BOUNDARY_NEUTRAL = 18 , U_FIRST_STRONG_ISOLATE = 19 , U_LEFT_TO_RIGHT_ISOLATE = 20 , U_RIGHT_TO_LEFT_ISOLATE = 21 , U_POP_DIRECTIONAL_ISOLATE = 22 , U_CHAR_DIRECTION_COUNT }

Public Member Functions
	UNICHARSET ()

	~UNICHARSET ()

UNICHAR_ID	unichar_to_id (const char *const unichar_repr) const

UNICHAR_ID	unichar_to_id (const char *const unichar_repr, int length) const

int	step (const char *str) const

bool	encodable_string (const char *str, unsigned *first_bad_position) const

bool	encode_string (const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const

const char *	id_to_unichar (UNICHAR_ID id) const

const char *	id_to_unichar_ext (UNICHAR_ID id) const

std::string	debug_str (UNICHAR_ID id) const

std::string	debug_str (const char *unichar_repr) const

void	unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style)

void	unichar_insert (const char *const unichar_repr)

void	unichar_insert_backwards_compatible (const char *const unichar_repr)

bool	contains_unichar_id (UNICHAR_ID unichar_id) const

bool	contains_unichar (const char *const unichar_repr) const

bool	contains_unichar (const char *const unichar_repr, int length) const

bool	eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const

void	delete_pointers_in_unichars ()

void	clear ()

size_t	size () const

bool	save_to_file (const char *const filename) const

bool	save_to_file (FILE *file) const

bool	save_to_file (tesseract::TFile *file) const

bool	save_to_string (std::string &str) const

bool	load_from_file (const char *const filename, bool skip_fragments)

bool	load_from_file (const char *const filename)

bool	load_from_file (FILE *file, bool skip_fragments)

bool	load_from_file (FILE *file)

bool	load_from_file (tesseract::TFile *file, bool skip_fragments)

void	post_load_setup ()

bool	major_right_to_left () const

void	set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)

void	set_isalpha (UNICHAR_ID unichar_id, bool value)

void	set_islower (UNICHAR_ID unichar_id, bool value)

void	set_isupper (UNICHAR_ID unichar_id, bool value)

void	set_isdigit (UNICHAR_ID unichar_id, bool value)

void	set_ispunctuation (UNICHAR_ID unichar_id, bool value)

void	set_isngram (UNICHAR_ID unichar_id, bool value)

void	set_script (UNICHAR_ID unichar_id, const char *value)

void	set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)

void	set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)

void	set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)

void	set_normed (UNICHAR_ID unichar_id, const char *normed)

void	set_normed_ids (UNICHAR_ID unichar_id)

bool	get_isalpha (UNICHAR_ID unichar_id) const

bool	get_islower (UNICHAR_ID unichar_id) const

bool	get_isupper (UNICHAR_ID unichar_id) const

bool	get_isdigit (UNICHAR_ID unichar_id) const

bool	get_ispunctuation (UNICHAR_ID unichar_id) const

bool	get_isngram (UNICHAR_ID unichar_id) const

bool	get_isprivate (UNICHAR_ID unichar_id) const

bool	top_bottom_useful () const

void	set_ranges_empty ()

void	SetPropertiesFromOther (const UNICHARSET &src)

void	PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)

void	ExpandRangesFromOther (const UNICHARSET &src)

void	CopyFrom (const UNICHARSET &src)

void	AppendOtherUnicharset (const UNICHARSET &src)

bool	SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const

void	get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const

void	set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)

void	get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const

void	set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)

void	get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const

void	set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)

void	get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const

void	set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)

bool	PropertiesIncomplete (UNICHAR_ID unichar_id) const

bool	IsSpaceDelimited (UNICHAR_ID unichar_id) const

int	get_script (UNICHAR_ID unichar_id) const

unsigned int	get_properties (UNICHAR_ID unichar_id) const

char	get_chartype (UNICHAR_ID unichar_id) const

UNICHAR_ID	get_other_case (UNICHAR_ID unichar_id) const

Direction	get_direction (UNICHAR_ID unichar_id) const

UNICHAR_ID	get_mirror (UNICHAR_ID unichar_id) const

UNICHAR_ID	to_lower (UNICHAR_ID unichar_id) const

UNICHAR_ID	to_upper (UNICHAR_ID unichar_id) const

bool	has_special_codes () const

bool	AnyRepeatedUnicodes () const

const CHAR_FRAGMENT *	get_fragment (UNICHAR_ID unichar_id) const

bool	get_isalpha (const char *const unichar_repr) const

bool	get_islower (const char *const unichar_repr) const

bool	get_isupper (const char *const unichar_repr) const

bool	get_isdigit (const char *const unichar_repr) const

bool	get_ispunctuation (const char *const unichar_repr) const

unsigned int	get_properties (const char *const unichar_repr) const

char	get_chartype (const char *const unichar_repr) const

int	get_script (const char *const unichar_repr) const

const CHAR_FRAGMENT *	get_fragment (const char *const unichar_repr) const

bool	get_isalpha (const char *const unichar_repr, int length) const

bool	get_islower (const char *const unichar_repr, int length) const

bool	get_isupper (const char *const unichar_repr, int length) const

bool	get_isdigit (const char *const unichar_repr, int length) const

bool	get_ispunctuation (const char *const unichar_repr, int length) const

const char *	get_normed_unichar (UNICHAR_ID unichar_id) const

const std::vector< UNICHAR_ID > &	normed_ids (UNICHAR_ID unichar_id) const

int	get_script (const char *const unichar_repr, int length) const

int	get_script_table_size () const

const char *	get_script_from_script_id (int id) const

int	get_script_id_from_name (const char *script_name) const

bool	is_null_script (const char *script) const

int	add_script (const char *script)

bool	get_enabled (UNICHAR_ID unichar_id) const

int	null_sid () const

int	common_sid () const

int	latin_sid () const

int	cyrillic_sid () const

int	greek_sid () const

int	han_sid () const

int	hiragana_sid () const

int	katakana_sid () const

int	thai_sid () const

int	hangul_sid () const

int	default_sid () const

bool	script_has_upper_lower () const

bool	script_has_xheight () const

Static Public Member Functions
static std::string	debug_utf8_str (const char *str)

static std::string	CleanupString (const char *utf8_str)

static std::string	CleanupString (const char *utf8_str, size_t length)

Static Public Attributes
static const char *	kCustomLigatures [][2]

static const char *	kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]

Detailed Description

Definition at line 164 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

enum tesseract::UNICHARSET::Direction

Enumerator
U_LEFT_TO_RIGHT
U_RIGHT_TO_LEFT
U_EUROPEAN_NUMBER
U_EUROPEAN_NUMBER_SEPARATOR
U_EUROPEAN_NUMBER_TERMINATOR
U_ARABIC_NUMBER
U_COMMON_NUMBER_SEPARATOR
U_BLOCK_SEPARATOR
U_SEGMENT_SEPARATOR
U_WHITE_SPACE_NEUTRAL
U_OTHER_NEUTRAL
U_LEFT_TO_RIGHT_EMBEDDING
U_LEFT_TO_RIGHT_OVERRIDE
U_RIGHT_TO_LEFT_ARABIC
U_RIGHT_TO_LEFT_EMBEDDING
U_RIGHT_TO_LEFT_OVERRIDE
U_POP_DIRECTIONAL_FORMAT
U_DIR_NON_SPACING_MARK
U_BOUNDARY_NEUTRAL
U_FIRST_STRONG_ISOLATE
U_LEFT_TO_RIGHT_ISOLATE
U_RIGHT_TO_LEFT_ISOLATE
U_POP_DIRECTIONAL_ISOLATE
U_CHAR_DIRECTION_COUNT

Definition at line 175 of file unicharset.h.

                 {
    // Character direction classes. Every value except the final count is
    // assigned explicitly, so the numbering does not depend on declaration
    // order.
    // NOTE(review): the names and the U_HIDE_DEPRECATED_API guard look like
    // they mirror ICU's UCharDirection enum -- confirm before renumbering.
    U_LEFT_TO_RIGHT = 0,
    U_RIGHT_TO_LEFT = 1,
    U_EUROPEAN_NUMBER = 2,
    U_EUROPEAN_NUMBER_SEPARATOR = 3,
    U_EUROPEAN_NUMBER_TERMINATOR = 4,
    U_ARABIC_NUMBER = 5,
    U_COMMON_NUMBER_SEPARATOR = 6,
    U_BLOCK_SEPARATOR = 7,
    U_SEGMENT_SEPARATOR = 8,
    U_WHITE_SPACE_NEUTRAL = 9,
    U_OTHER_NEUTRAL = 10,
    U_LEFT_TO_RIGHT_EMBEDDING = 11,
    U_LEFT_TO_RIGHT_OVERRIDE = 12,
    U_RIGHT_TO_LEFT_ARABIC = 13,
    U_RIGHT_TO_LEFT_EMBEDDING = 14,
    U_RIGHT_TO_LEFT_OVERRIDE = 15,
    U_POP_DIRECTIONAL_FORMAT = 16,
    U_DIR_NON_SPACING_MARK = 17,
    U_BOUNDARY_NEUTRAL = 18,
    U_FIRST_STRONG_ISOLATE = 19,
    U_LEFT_TO_RIGHT_ISOLATE = 20,
    U_RIGHT_TO_LEFT_ISOLATE = 21,
    U_POP_DIRECTIONAL_ISOLATE = 22,
#ifndef U_HIDE_DEPRECATED_API
    // Implicitly one more than the last explicit value (23); only present
    // when U_HIDE_DEPRECATED_API is not defined.
    U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
  };

Constructor & Destructor Documentation

◆ UNICHARSET()

tesseract::UNICHARSET::UNICHARSET ( )

Definition at line 170 of file unicharset.cpp.

    : ids(), script_table(nullptr), script_table_size_used(0) {
  // script_table must already be null here: clear() frees any non-null table.
  // clear() then resets the remaining members, including
  // script_table_size_reserved.
  clear();
  // Insert every special unichar code, in table order.
  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
    unichar_insert(kSpecialUnicharCodes[i]);
    // The joined code is additionally flagged as an n-gram.
    // NOTE(review): this uses the loop index as the unichar id, which assumes
    // insertion into the freshly cleared set assigns ids 0, 1, 2, ... --
    // confirm.
    if (i == UNICHAR_JOINED) {
      set_isngram(i, true);
    }
  }
}

◆ ~UNICHARSET()

tesseract::UNICHARSET::~UNICHARSET ( )

Definition at line 181 of file unicharset.cpp.

                        {
  // clear() releases the script table and the per-unichar fragment objects.
  clear();
}

Member Function Documentation

◆ add_script()

int tesseract::UNICHARSET::add_script ( const char * script )

Definition at line 1063 of file unicharset.cpp.

                                             {
  // A script that is already registered keeps its existing id.
  int existing = 0;
  while (existing < script_table_size_used) {
    if (strcmp(script, script_table[existing]) == 0) {
      return existing;
    }
    ++existing;
  }

  // Ensure there is room for one more entry.
  if (script_table_size_reserved == 0) {
    // First allocation: start with space for 8 scripts.
    script_table_size_reserved = 8;
    script_table = new char *[script_table_size_reserved];
  } else if (script_table_size_used >= script_table_size_reserved) {
    assert(script_table_size_used == script_table_size_reserved);
    // Table is full: double the capacity and move the existing pointers over.
    script_table_size_reserved *= 2;
    char **grown_table = new char *[script_table_size_reserved];
    memcpy(grown_table, script_table,
           script_table_size_used * sizeof(char *));
    delete[] script_table;
    script_table = grown_table;
  }

  // Store a private copy of the name in the next free slot; its index is
  // the new script id.
  const int new_id = script_table_size_used;
  script_table[new_id] = new char[strlen(script) + 1];
  strcpy(script_table[new_id], script);
  ++script_table_size_used;
  return new_id;
}

◆ AnyRepeatedUnicodes()

bool tesseract::UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 1046 of file unicharset.cpp.

                                           {
  // When the set carries special codes, skip the first
  // SPECIAL_UNICHAR_CODES_COUNT ids.
  const int first_id = has_special_codes() ? SPECIAL_UNICHAR_CODES_COUNT : 0;
  for (unsigned id = first_id; id < unichars.size(); ++id) {
    // Decode the normalized form into individual code points.
    const std::vector<char32> codepoints =
        UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
    // Report the first pair of equal, adjacent code points.
    for (size_t next = 1; next < codepoints.size(); ++next) {
      const bool same_as_previous = codepoints[next] == codepoints[next - 1];
      if (same_as_previous) {
        return true;
      }
    }
  }
  return false;
}

◆ AppendOtherUnicharset()

void tesseract::UNICHARSET::AppendOtherUnicharset ( const UNICHARSET & src )

Definition at line 454 of file unicharset.cpp.

                                                            {
  // Merges the characters of src into this set: characters already present
  // have their ranges expanded, new ones are inserted with empty ranges.
  // Size before appending; passed below as the start index for the
  // property copy.
  int initial_used = unichars.size();
  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
    const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
    const char *utf8 = src.id_to_unichar(ch);
    // Default to the index of the next free slot, i.e. where a new insertion
    // is expected to land.
    int id = unichars.size();
    if (contains_unichar(utf8)) {
      // Already present: use the existing id instead.
      id = unichar_to_id(utf8);
      // Just expand current ranges.
      unichars[id].properties.ExpandRangesFrom(src_props);
    } else {
      unichar_insert_backwards_compatible(utf8);
      // NOTE(review): assumes the insertion appended exactly one entry at
      // index `id` -- confirm against unichar_insert.
      unichars[id].properties.SetRangesEmpty();
    }
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(initial_used, src);
}

◆ CleanupString() [1/2]

static std::string tesseract::UNICHARSET::CleanupString ( const char * utf8_str )

inline static

Definition at line 265 of file unicharset.h.

                                                       {
    // Convenience overload for NUL-terminated input: uses strlen() for the
    // length and forwards to the (utf8_str, length) overload.
    return CleanupString(utf8_str, strlen(utf8_str));
  }

◆ CleanupString() [2/2]

std::string tesseract::UNICHARSET::CleanupString	(	const char *	utf8_str,
		size_t	length
	)

static

Definition at line 1158 of file unicharset.cpp.

                                                                       {
  // Returns a copy of the first `length` bytes of utf8_str (stopping early at
  // a NUL byte) in which every occurrence of a kCleanupMaps key is replaced
  // by the paired replacement string. kCleanupMaps is scanned as a list of
  // {key, replacement} pairs terminated by a null key.
  //
  // `length` is tracked in bytes: a matched multi-byte key consumes its full
  // byte count from the budget, and no byte at or beyond utf8_str + length is
  // ever read.
  std::string result;
  result.reserve(length);
  // Check the remaining budget before dereferencing, so the input does not
  // have to be NUL-terminated.
  while (length > 0 && *utf8_str != '\0') {
    int key_index = 0;
    const char *key;
    size_t match = 0;
    while ((key = kCleanupMaps[key_index][0]) != nullptr) {
      // Compare the key with the input, never looking past the remaining
      // budget.
      match = 0;
      while (match < length && key[match] != '\0' &&
             key[match] == utf8_str[match]) {
        ++match;
      }
      // A full, non-empty key match ends the search. Empty keys are ignored
      // because they would consume no input.
      if (match > 0 && key[match] == '\0') {
        break;
      }
      ++key_index;
    }
    if (key == nullptr) {
      // No key matches here: copy one byte through unchanged.
      result.push_back(*utf8_str);
      ++utf8_str;
      --length;
    } else {
      // Substitute the replacement and skip the whole matched key.
      result.append(kCleanupMaps[key_index][1]);
      utf8_str += match;
      length -= match;
    }
  }
  return result;
}

◆ clear()

void tesseract::UNICHARSET::clear ( )

inline

Definition at line 324 of file unicharset.h.

               {
    // Returns the set to an empty state: frees the script table and the
    // fragment objects, empties the containers, and zeroes the cached flags
    // and script ids.
    if (script_table != nullptr) {
      // Each entry is a separately allocated string (delete[]), as is the
      // table itself.
      for (int i = 0; i < script_table_size_used; ++i) {
        delete[] script_table[i];
      }
      delete[] script_table;
      script_table = nullptr;
      script_table_size_used = 0;
    }
    // Reset unconditionally; this is the only place visible here that gives
    // script_table_size_reserved its initial value.
    script_table_size_reserved = 0;
    // Fragments must be deleted before the unichars that point to them are
    // dropped.
    delete_pointers_in_unichars();
    unichars.clear();
    ids.clear();
    top_bottom_set_ = false;
    script_has_upper_lower_ = false;
    script_has_xheight_ = false;
    old_style_included_ = false;
    null_sid_ = 0;
    common_sid_ = 0;
    latin_sid_ = 0;
    cyrillic_sid_ = 0;
    greek_sid_ = 0;
    han_sid_ = 0;
    hiragana_sid_ = 0;
    katakana_sid_ = 0;
    thai_sid_ = 0;
    hangul_sid_ = 0;
    default_sid_ = 0;
  }

◆ common_sid()

int tesseract::UNICHARSET::common_sid ( ) const

inline

Definition at line 919 of file unicharset.h.

                         {
    // Returns the cached value of common_sid_ (reset to 0 by clear()).
    return common_sid_;
  }

◆ contains_unichar() [1/2]

bool tesseract::UNICHARSET::contains_unichar ( const char *const unichar_repr ) const

Definition at line 695 of file unicharset.cpp.

                                                                      {
  // Returns true if the NUL-terminated unichar_repr is present in ids.
  // Unless old-style unichars are included, the input is first passed
  // through CleanupString(), so the lookup uses the cleaned form.
  std::string cleaned =
      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
  return ids.contains(cleaned.data(), cleaned.size());
}

◆ contains_unichar() [2/2]

bool tesseract::UNICHARSET::contains_unichar	(	const char *const	unichar_repr,
		int	length
	)		const

Definition at line 701 of file unicharset.cpp.

                                                    {
  // Returns true if the first `length` bytes of unichar_repr are present in
  // ids. A zero length never matches.
  if (length == 0) {
    return false;
  }
  // Start from the raw bytes; replace them with the cleaned form unless
  // old-style unichars are included.
  std::string cleaned(unichar_repr, length);
  if (!old_style_included_) {
    cleaned = CleanupString(unichar_repr, length);
  }
  return ids.contains(cleaned.data(), cleaned.size());
}

◆ contains_unichar_id()

bool tesseract::UNICHARSET::contains_unichar_id ( UNICHAR_ID unichar_id ) const

inline

Definition at line 303 of file unicharset.h.

                                                        {
    // True when unichar_id indexes an existing entry of unichars.
    // The cast to size_t lets one comparison also reject negative ids
    // (assuming UNICHAR_ID is a signed integer type -- TODO confirm), since
    // they convert to very large unsigned values.
    return static_cast<size_t>(unichar_id) < unichars.size();
  }

◆ CopyFrom()

void tesseract::UNICHARSET::CopyFrom ( const UNICHARSET & src )

Definition at line 438 of file unicharset.cpp.

                                               {
  // Replaces the contents of this set with the characters of src, in the
  // same order.
  clear();
  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
    const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
    const char *utf8 = src.id_to_unichar(ch);
    unichar_insert_backwards_compatible(utf8);
    // NOTE(review): indexes with ch, which assumes each insertion lands at
    // the same id it had in src -- confirm.
    unichars[ch].properties.ExpandRangesFrom(src_props);
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(0, src);
}

◆ cyrillic_sid()

int tesseract::UNICHARSET::cyrillic_sid ( ) const

inline

Definition at line 925 of file unicharset.h.

                           {
    // Returns the cached value of cyrillic_sid_ (reset to 0 by clear()).
    return cyrillic_sid_;
  }

◆ debug_str() [1/2]

std::string tesseract::UNICHARSET::debug_str ( const char * unichar_repr ) const

inline

Definition at line 273 of file unicharset.h.

                                                      {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return debug_str(unichar_to_id(unichar_repr));
  }

◆ debug_str() [2/2]

std::string tesseract::UNICHARSET::debug_str ( UNICHAR_ID id ) const

Definition at line 331 of file unicharset.cpp.

                                                   {
  // The invalid id has no properties to describe: return its plain text.
  if (id == INVALID_UNICHAR_ID) {
    return std::string(id_to_unichar(id));
  }
  // Fragments supply their own description.
  if (const CHAR_FRAGMENT *frag = this->get_fragment(id)) {
    return frag->to_string();
  }
  // Start from the text plus its hex code points, then append one marker
  // per character class.
  std::string description = debug_utf8_str(id_to_unichar(id));
  // Alphabetic: "a" for lower case, "A" for upper case, "x" for neither.
  if (get_isalpha(id)) {
    const char *case_marker = "x";
    if (get_islower(id)) {
      case_marker = "a";
    } else if (get_isupper(id)) {
      case_marker = "A";
    }
    description += case_marker;
  }
  // Digit: "0".
  if (get_isdigit(id)) {
    description += "0";
  }
  // Punctuation: "p".
  if (get_ispunctuation(id)) {
    description += "p";
  }
  return description;
}

◆ debug_utf8_str()

std::string tesseract::UNICHARSET::debug_utf8_str ( const char * str )

static

Definition at line 307 of file unicharset.cpp.

                                                    {
  // Returns str followed by " [" + the hex value of each code point, each
  // followed by a space, + "]". Bytes that do not start a valid UTF-8
  // sequence are printed individually as their unsigned byte value.
  std::string result = str;
  result += " [";
  int advance = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += advance) {
    char hex[sizeof(int) * 2 + 1];
    advance = UNICHAR::utf8_step(str + i);
    // A multi-byte sequence cut short by the terminator is handled like an
    // invalid byte, so that i += advance can never jump past the end of str.
    for (int k = 1; k < advance; ++k) {
      if (str[i + k] == '\0') {
        advance = 0;
        break;
      }
    }
    if (advance == 0) {
      advance = 1;
      // Convert through unsigned char: a plain char >= 0x80 may be negative
      // and would otherwise print sign-extended (e.g. "ffffff80").
      snprintf(hex, sizeof(hex), "%x",
               static_cast<unsigned>(static_cast<unsigned char>(str[i])));
    } else {
      UNICHAR ch(str + i, advance);
      // %x expects an unsigned int argument.
      snprintf(hex, sizeof(hex), "%x", static_cast<unsigned>(ch.first_uni()));
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}

◆ default_sid()

int tesseract::UNICHARSET::default_sid ( ) const

inline

Definition at line 946 of file unicharset.h.

                          {
    // Returns the cached value of default_sid_ (reset to 0 by clear()).
    return default_sid_;
  }

◆ delete_pointers_in_unichars()

void tesseract::UNICHARSET::delete_pointers_in_unichars ( )

inline

Definition at line 316 of file unicharset.h.

                                     {
    // Deletes the fragment object of every unichar and nulls the pointer,
    // so that calling this again (or clear() afterwards) is harmless.
    for (auto &unichar : unichars) {
      delete unichar.properties.fragment;
      unichar.properties.fragment = nullptr;
    }
  }

◆ encodable_string()

bool tesseract::UNICHARSET::encodable_string	(	const char *	str,
		unsigned *	first_bad_position
	)		const

Definition at line 224 of file unicharset.cpp.

                                                                      {
  // Returns true if str can be encoded completely. The encoding itself is
  // discarded. encode_string() is told to give up at the first failure and
  // stores the position it reached in *first_bad_position when that pointer
  // is non-null.
  std::vector<UNICHAR_ID> encoding;
  return encode_string(str, true, &encoding, nullptr, first_bad_position);
}

◆ encode_string()

bool tesseract::UNICHARSET::encode_string	(	const char *	str,
		bool	give_up_on_failure,
		std::vector< UNICHAR_ID > *	encoding,
		std::vector< char > *	lengths,
		unsigned *	encoded_length
	)		const

Definition at line 239 of file unicharset.cpp.

                                                               {
  // Encodes str into *encoding and returns true only if the whole string was
  // encoded without skipping anything.
  // encoding must be non-null (it is dereferenced unconditionally); lengths
  // and encoded_length are optional outputs and may be null.
  std::vector<UNICHAR_ID> working_encoding;
  std::vector<char> working_lengths;
  std::vector<char> best_lengths;
  encoding->clear(); // Just in case str is empty.
  auto str_length = strlen(str);
  unsigned str_pos = 0;
  bool perfect = true;
  while (str_pos < str_length) {
    // Delegates to the multi-argument overload (not shown here).
    // NOTE(review): it appears to advance str_pos and to update *encoding
    // and best_lengths with the best encoding found -- confirm against that
    // overload.
    encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
                  &str_pos, encoding, &best_lengths);
    if (str_pos < str_length) {
      // This is a non-match. Skip one utf-8 character.
      perfect = false;
      if (give_up_on_failure) {
        break;
      }
      int step = UNICHAR::utf8_step(str + str_pos);
      // A step of 0 is replaced by 1 so the loop always makes progress.
      if (step == 0) {
        step = 1;
      }
      // Record the skipped character as INVALID_UNICHAR_ID with its byte
      // length.
      encoding->push_back(INVALID_UNICHAR_ID);
      best_lengths.push_back(step);
      str_pos += step;
      // Restart the working state from the result so far.
      working_encoding = *encoding;
      working_lengths = best_lengths;
    }
  }
  if (lengths != nullptr) {
    *lengths = best_lengths;
  }
  // Position reached in str: equals the string length on full success, or
  // the failure position when giving up early.
  if (encoded_length != nullptr) {
    *encoded_length = str_pos;
  }
  return perfect;
}

◆ eq()

bool tesseract::UNICHARSET::eq	(	UNICHAR_ID	unichar_id,
		const char *const	unichar_repr
	)		const

Definition at line 713 of file unicharset.cpp.

                                                          {
  // True when the text stored for unichar_id is byte-for-byte equal
  // (strcmp) to unichar_repr.
  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
}

◆ ExpandRangesFromOther()

void tesseract::UNICHARSET::ExpandRangesFromOther ( const UNICHARSET & src )

Definition at line 425 of file unicharset.cpp.

                                                            {
  // For every character in this set, looks up the same text in src and, if
  // src has properties for it, expands this character's ranges from them.
  // Characters for which GetStrProperties() returns false are left
  // untouched.
  for (unsigned ch = 0; ch < unichars.size(); ++ch) {
    const char *utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Expand just the ranges from properties.
      unichars[ch].properties.ExpandRangesFrom(properties);
    }
  }
}

◆ get_advance_stats()

void tesseract::UNICHARSET::get_advance_stats	(	UNICHAR_ID	unichar_id,
		float *	advance,
		float *	advance_sd
	)		const

inline

Definition at line 646 of file unicharset.h.

                                                  {
    // Writes the stored advance and advance_sd properties of unichar_id to
    // the two output pointers, which must be non-null.
    // INVALID_UNICHAR_ID yields zeros for both.
    if (INVALID_UNICHAR_ID == unichar_id) {
      *advance = *advance_sd = 0;
      return;
    }
    // Any other id must be in range.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    *advance = unichars[unichar_id].properties.advance;
    *advance_sd = unichars[unichar_id].properties.advance_sd;
  }

◆ get_bearing_stats()

void tesseract::UNICHARSET::get_bearing_stats	(	UNICHAR_ID	unichar_id,
		float *	bearing,
		float *	bearing_sd
	)		const

inline

Definition at line 629 of file unicharset.h.

                                                  {
    // Writes the stored bearing and bearing_sd properties of unichar_id to
    // the two output pointers, which must be non-null.
    // INVALID_UNICHAR_ID yields zeros for both.
    if (INVALID_UNICHAR_ID == unichar_id) {
      *bearing = *bearing_sd = 0.0f;
      return;
    }
    // Any other id must be in range.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    *bearing = unichars[unichar_id].properties.bearing;
    *bearing_sd = unichars[unichar_id].properties.bearing_sd;
  }

◆ get_chartype() [1/2]

char tesseract::UNICHARSET::get_chartype ( const char *const unichar_repr ) const

inline

Definition at line 807 of file unicharset.h.

                                                          {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_chartype(unichar_to_id(unichar_repr));
  }

◆ get_chartype() [2/2]

char tesseract::UNICHARSET::get_chartype ( UNICHAR_ID unichar_id ) const

Definition at line 635 of file unicharset.cpp.

                                                 {
  // Returns a one-character class code; the first matching class wins:
  //   'A' upper case, 'a' lower case, 'x' other alphabetic,
  //   '0' digit, 'p' punctuation, and 0 when none applies.
  return get_isupper(id)         ? 'A'
         : get_islower(id)       ? 'a'
         : get_isalpha(id)       ? 'x'
         : get_isdigit(id)       ? '0'
         : get_ispunctuation(id) ? 'p'
                                 : '\0';
}

◆ get_direction()

Direction tesseract::UNICHARSET::get_direction ( UNICHAR_ID unichar_id ) const

inline

Definition at line 712 of file unicharset.h.

                                                       {
    // Returns the stored direction property of unichar_id.
    // INVALID_UNICHAR_ID is reported as U_OTHER_NEUTRAL; any other id must
    // be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return UNICHARSET::U_OTHER_NEUTRAL;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.direction;
  }

◆ get_enabled()

bool tesseract::UNICHARSET::get_enabled ( UNICHAR_ID unichar_id ) const

inline

Definition at line 911 of file unicharset.h.

                                                {
    // Returns the stored enabled property of unichar_id.
    // Unlike most sibling getters there is no special case for
    // INVALID_UNICHAR_ID: the id must be in range.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.enabled;
  }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT * tesseract::UNICHARSET::get_fragment ( const char *const unichar_repr ) const

inline

Definition at line 820 of file unicharset.h.

                                                                          {
    // Returns the fragment for unichar_repr, or nullptr when the pointer is
    // null, the string is empty, or ids.contains() reports it as absent.
    // NOTE(review): the second argument to ids.contains() is `false`, while
    // the other call sites on this page pass a byte length
    // (cleaned.size()). If that parameter is a length, `false` converts to
    // 0 -- confirm this lookup behaves as intended.
    if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
        !ids.contains(unichar_repr, false)) {
      return nullptr;
    }
    return get_fragment(unichar_to_id(unichar_repr));
  }

◆ get_fragment() [2/2]

const CHAR_FRAGMENT * tesseract::UNICHARSET::get_fragment ( UNICHAR_ID unichar_id ) const

inline

Definition at line 768 of file unicharset.h.

                                                                 {
    // Returns the stored fragment pointer of unichar_id; the pointer is
    // nulled by delete_pointers_in_unichars().
    // INVALID_UNICHAR_ID yields nullptr; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return nullptr;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.fragment;
  }

◆ get_isalpha() [1/3]

bool tesseract::UNICHARSET::get_isalpha ( const char *const unichar_repr ) const

inline

Definition at line 777 of file unicharset.h.

                                                         {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_isalpha(unichar_to_id(unichar_repr));
  }

◆ get_isalpha() [2/3]

bool tesseract::UNICHARSET::get_isalpha	(	const char *const	unichar_repr,
		int	length
	)		const

inline

Definition at line 830 of file unicharset.h.

                                                                     {
    // Resolves (unichar_repr, length) to an id and forwards to the
    // UNICHAR_ID overload.
    return get_isalpha(unichar_to_id(unichar_repr, length));
  }

◆ get_isalpha() [3/3]

bool tesseract::UNICHARSET::get_isalpha ( UNICHAR_ID unichar_id ) const

inline

Definition at line 497 of file unicharset.h.

                                                {
    // Returns the stored isalpha property of unichar_id.
    // INVALID_UNICHAR_ID yields false; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return false;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isalpha;
  }

◆ get_isdigit() [1/3]

bool tesseract::UNICHARSET::get_isdigit ( const char *const unichar_repr ) const

inline

Definition at line 792 of file unicharset.h.

                                                         {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_isdigit(unichar_to_id(unichar_repr));
  }

◆ get_isdigit() [2/3]

bool tesseract::UNICHARSET::get_isdigit	(	const char *const	unichar_repr,
		int	length
	)		const

inline

Definition at line 848 of file unicharset.h.

                                                                     {
    // Resolves (unichar_repr, length) to an id and forwards to the
    // UNICHAR_ID overload.
    return get_isdigit(unichar_to_id(unichar_repr, length));
  }

◆ get_isdigit() [3/3]

bool tesseract::UNICHARSET::get_isdigit ( UNICHAR_ID unichar_id ) const

inline

Definition at line 524 of file unicharset.h.

                                                {
    // Returns the stored isdigit property of unichar_id.
    // INVALID_UNICHAR_ID yields false; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return false;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isdigit;
  }

◆ get_islower() [1/3]

bool tesseract::UNICHARSET::get_islower ( const char *const unichar_repr ) const

inline

Definition at line 782 of file unicharset.h.

                                                         {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_islower(unichar_to_id(unichar_repr));
  }

◆ get_islower() [2/3]

bool tesseract::UNICHARSET::get_islower	(	const char *const	unichar_repr,
		int	length
	)		const

inline

Definition at line 836 of file unicharset.h.

                                                                     {
    // Resolves (unichar_repr, length) to an id and forwards to the
    // UNICHAR_ID overload.
    return get_islower(unichar_to_id(unichar_repr, length));
  }

◆ get_islower() [3/3]

bool tesseract::UNICHARSET::get_islower ( UNICHAR_ID unichar_id ) const

inline

Definition at line 506 of file unicharset.h.

                                                {
    // Returns the stored islower property of unichar_id.
    // INVALID_UNICHAR_ID yields false; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return false;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.islower;
  }

◆ get_isngram()

bool tesseract::UNICHARSET::get_isngram ( UNICHAR_ID unichar_id ) const

inline

Definition at line 542 of file unicharset.h.

                                                {
    // Returns the stored isngram property of unichar_id.
    // INVALID_UNICHAR_ID yields false; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return false;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isngram;
  }

◆ get_isprivate()

bool tesseract::UNICHARSET::get_isprivate ( UNICHAR_ID unichar_id ) const

Definition at line 379 of file unicharset.cpp.

                                                          {
  // True when the first code point of the unichar's text lies in
  // U+E000..U+F8FF, the Unicode Private Use Area of the Basic Multilingual
  // Plane.
  // NOTE(review): -1 is passed as the length to the UNICHAR constructor;
  // presumably it means "use the whole string" -- confirm.
  UNICHAR uc(id_to_unichar(unichar_id), -1);
  int uni = uc.first_uni();
  return (uni >= 0xE000 && uni <= 0xF8FF);
}

◆ get_ispunctuation() [1/3]

bool tesseract::UNICHARSET::get_ispunctuation ( const char *const unichar_repr ) const

inline

Definition at line 797 of file unicharset.h.

                                                               {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_ispunctuation(unichar_to_id(unichar_repr));
  }

◆ get_ispunctuation() [2/3]

bool tesseract::UNICHARSET::get_ispunctuation	(	const char *const	unichar_repr,
		int	length
	)		const

inline

Definition at line 854 of file unicharset.h.

                                                                           {
    // Resolves (unichar_repr, length) to an id and forwards to the
    // UNICHAR_ID overload.
    return get_ispunctuation(unichar_to_id(unichar_repr, length));
  }

◆ get_ispunctuation() [3/3]

bool tesseract::UNICHARSET::get_ispunctuation ( UNICHAR_ID unichar_id ) const

inline

Definition at line 533 of file unicharset.h.

                                                      {
    // Returns the stored ispunctuation property of unichar_id.
    // INVALID_UNICHAR_ID yields false; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return false;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.ispunctuation;
  }

◆ get_isupper() [1/3]

bool tesseract::UNICHARSET::get_isupper ( const char *const unichar_repr ) const

inline

Definition at line 787 of file unicharset.h.

                                                         {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_isupper(unichar_to_id(unichar_repr));
  }

◆ get_isupper() [2/3]

bool tesseract::UNICHARSET::get_isupper	(	const char *const	unichar_repr,
		int	length
	)		const

inline

Definition at line 842 of file unicharset.h.

                                                                     {
    // Resolves (unichar_repr, length) to an id and forwards to the
    // UNICHAR_ID overload.
    return get_isupper(unichar_to_id(unichar_repr, length));
  }

◆ get_isupper() [3/3]

bool tesseract::UNICHARSET::get_isupper ( UNICHAR_ID unichar_id ) const

inline

Definition at line 515 of file unicharset.h.

                                                {
    // Returns the stored isupper property of unichar_id.
    // INVALID_UNICHAR_ID yields false; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return false;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isupper;
  }

◆ get_mirror()

UNICHAR_ID tesseract::UNICHARSET::get_mirror ( UNICHAR_ID unichar_id ) const

inline

Definition at line 721 of file unicharset.h.

                                                     {
    // Returns the stored mirror property (a unichar id) of unichar_id.
    // INVALID_UNICHAR_ID maps to itself; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.mirror;
  }

◆ get_normed_unichar()

const char * tesseract::UNICHARSET::get_normed_unichar ( UNICHAR_ID unichar_id ) const

inline

Definition at line 859 of file unicharset.h.

                                                              {
    // Returns the stored normalized text of unichar_id. The pointer refers
    // to the string held inside unichars, except for UNICHAR_SPACE, which
    // always yields the literal " ".
    if (unichar_id == UNICHAR_SPACE) {
      return " ";
    }
    // NOTE(review): unlike the sibling getters there is no
    // INVALID_UNICHAR_ID case and no ASSERT_HOST range check before
    // indexing -- callers must pass a valid id.
    return unichars[unichar_id].properties.normed.c_str();
  }

◆ get_other_case()

UNICHAR_ID tesseract::UNICHARSET::get_other_case ( UNICHAR_ID unichar_id ) const

inline

Definition at line 703 of file unicharset.h.

                                                         {
    // Returns the stored other_case property (a unichar id) of unichar_id.
    // INVALID_UNICHAR_ID maps to itself; any other id must be in range.
    if (INVALID_UNICHAR_ID == unichar_id) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.other_case;
  }

◆ get_properties() [1/2]

unsigned int tesseract::UNICHARSET::get_properties ( const char *const unichar_repr ) const

inline

Definition at line 803 of file unicharset.h.

                                                                    {
    // Resolves unichar_repr to its id and forwards to the UNICHAR_ID
    // overload.
    return get_properties(unichar_to_id(unichar_repr));
  }

◆ get_properties() [2/2]

unsigned int tesseract::UNICHARSET::get_properties ( UNICHAR_ID unichar_id ) const

Definition at line 615 of file unicharset.cpp.

                                                           {
  // Query the five character classes first, in the fixed order
  // alpha, lower, upper, digit, punctuation.
  const bool alpha = get_isalpha(id);
  const bool lower = get_islower(id);
  const bool upper = get_isupper(id);
  const bool digit = get_isdigit(id);
  const bool punctuation = get_ispunctuation(id);

  // Fold each class that applies into the bitmask using its *_MASK flag.
  unsigned int mask = 0;
  if (alpha) {
    mask |= ISALPHA_MASK;
  }
  if (lower) {
    mask |= ISLOWER_MASK;
  }
  if (upper) {
    mask |= ISUPPER_MASK;
  }
  if (digit) {
    mask |= ISDIGIT_MASK;
  }
  if (punctuation) {
    mask |= ISPUNCTUATION_MASK;
  }
  return mask;
}

◆ get_script() [1/3]

int tesseract::UNICHARSET::get_script ( const char *const unichar_repr ) const

inline

Definition at line 814 of file unicharset.h.

                                                       {
    // Map the string to an id first; the id overload does the real lookup.
    const UNICHAR_ID resolved_id = unichar_to_id(unichar_repr);
    return get_script(resolved_id);
  }

◆ get_script() [2/3]

int tesseract::UNICHARSET::get_script	(	const char *const	unichar_repr,
		int	length
	)		const

inline

Definition at line 876 of file unicharset.h.

                                                                   {
    // Only the first `length` bytes of unichar_repr are used for the lookup.
    const UNICHAR_ID resolved_id = unichar_to_id(unichar_repr, length);
    return get_script(resolved_id);
  }

◆ get_script() [3/3]

int tesseract::UNICHARSET::get_script ( UNICHAR_ID unichar_id ) const

inline

Definition at line 681 of file unicharset.h.

                                              {
    // The invalid id has no properties, so it reports the null script id.
    if (INVALID_UNICHAR_ID != unichar_id) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      return unichars[unichar_id].properties.script_id;
    }
    return null_sid_;
  }

◆ get_script_from_script_id()

const char * tesseract::UNICHARSET::get_script_from_script_id ( int id ) const

inline

Definition at line 886 of file unicharset.h.

                                                      {
    // Ids outside [0, script_table_size_used) map to the null script.
    const bool in_range = id >= 0 && id < script_table_size_used;
    if (!in_range) {
      return null_script;
    }
    return script_table[id];
  }

◆ get_script_id_from_name()

int tesseract::UNICHARSET::get_script_id_from_name ( const char * script_name ) const

Definition at line 1146 of file unicharset.cpp.

                                                                     {
  // Linear scan of the used part of the script table for an exact match.
  int index = 0;
  while (index < script_table_size_used) {
    if (!strcmp(script_name, script_table[index])) {
      return index;
    }
    ++index;
  }
  return 0; // 0 is always the null_script
}

◆ get_script_table_size()

int tesseract::UNICHARSET::get_script_table_size ( ) const

inline

Definition at line 881 of file unicharset.h.

                                    {
    // Number of script_table entries currently in use (the same bound the
    // script-name lookup iterates up to).
    return script_table_size_used;
  }

◆ get_top_bottom()

void tesseract::UNICHARSET::get_top_bottom	(	UNICHAR_ID	unichar_id,
		int *	min_bottom,
		int *	max_bottom,
		int *	min_top,
		int *	max_top
	)		const

inline

Definition at line 586 of file unicharset.h.

                                                        {
    if (INVALID_UNICHAR_ID == unichar_id) {
      // Nothing is stored for the invalid id: report the full range.
      *min_top = 0;
      *min_bottom = 0;
      *max_top = 256;    // kBlnCellHeight
      *max_bottom = 256; // kBlnCellHeight
      return;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const auto &props = unichars[unichar_id].properties;
    *min_bottom = props.min_bottom;
    *max_bottom = props.max_bottom;
    *min_top = props.min_top;
    *max_top = props.max_top;
  }

◆ get_width_stats()

void tesseract::UNICHARSET::get_width_stats	(	UNICHAR_ID	unichar_id,
		float *	width,
		float *	width_sd
	)		const

inline

Definition at line 612 of file unicharset.h.

                                              {
    if (INVALID_UNICHAR_ID != unichar_id) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *width = props.width;
      *width_sd = props.width_sd;
      return;
    }
    // The invalid id reports zero width and zero deviation.
    *width = 0.0f;
    *width_sd = 0.0f;
  }

◆ greek_sid()

int tesseract::UNICHARSET::greek_sid ( ) const

inline

Definition at line 928 of file unicharset.h.

                        {
    // Script id assigned in post_load_setup() from the lookup of "Greek";
    // that lookup yields 0 (the null script) when the name is not found.
    return greek_sid_;
  }

◆ han_sid()

int tesseract::UNICHARSET::han_sid ( ) const

inline

Definition at line 931 of file unicharset.h.

                      {
    // Script id assigned in post_load_setup() from the lookup of "Han";
    // that lookup yields 0 (the null script) when the name is not found.
    return han_sid_;
  }

◆ hangul_sid()

int tesseract::UNICHARSET::hangul_sid ( ) const

inline

Definition at line 943 of file unicharset.h.

                         {
    // Script id assigned in post_load_setup() from the lookup of "Hangul";
    // that lookup yields 0 (the null script) when the name is not found.
    return hangul_sid_;
  }

◆ has_special_codes()

bool tesseract::UNICHARSET::has_special_codes ( ) const

inline

Definition at line 756 of file unicharset.h.

                                 {
    // Both conditions must hold: the UNICHAR_BROKEN slot carries fragment
    // data, and its text equals the expected special code for that slot.
    if (get_fragment(UNICHAR_BROKEN) == nullptr) {
      return false;
    }
    const char *broken_repr = id_to_unichar(UNICHAR_BROKEN);
    return strcmp(broken_repr, kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
  }

◆ hiragana_sid()

int tesseract::UNICHARSET::hiragana_sid ( ) const

inline

Definition at line 934 of file unicharset.h.

                           {
    // Script id assigned in post_load_setup() from the lookup of "Hiragana";
    // that lookup yields 0 (the null script) when the name is not found.
    return hiragana_sid_;
  }

◆ id_to_unichar()

const char * tesseract::UNICHARSET::id_to_unichar ( UNICHAR_ID id ) const

Definition at line 279 of file unicharset.cpp.

                                                         {
  // A real id must index into the set; the invalid id maps to the
  // INVALID_UNICHAR constant instead.
  if (id != INVALID_UNICHAR_ID) {
    ASSERT_HOST(static_cast<unsigned>(id) < this->size());
    return unichars[id].representation;
  }
  return INVALID_UNICHAR;
}

◆ id_to_unichar_ext()

const char * tesseract::UNICHARSET::id_to_unichar_ext ( UNICHAR_ID id ) const

Definition at line 287 of file unicharset.cpp.

                                                             {
  if (id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR;
  }
  ASSERT_HOST(static_cast<unsigned>(id) < this->size());
  if (!get_isprivate(id)) {
    // Not a private encoding: the stored representation is the answer.
    return unichars[id].representation;
  }
  // Private encoding: look for it in column 1 of kCustomLigatures and, on a
  // match, return the corresponding column 0 string. The table ends at the
  // first row whose column 0 is nullptr.
  const char *stored = id_to_unichar(id);
  for (int row = 0; kCustomLigatures[row][0] != nullptr; ++row) {
    if (strcmp(stored, kCustomLigatures[row][1]) == 0) {
      return kCustomLigatures[row][0];
    }
  }
  // No table entry matched, so fall back to the stored representation.
  return unichars[id].representation;
}

◆ is_null_script()

bool tesseract::UNICHARSET::is_null_script ( const char * script ) const

inline

Definition at line 901 of file unicharset.h.

                                                {
    // Compares `script` against null_script with ==.
    // NOTE(review): if null_script is a plain C string this is a pointer
    // identity test, not a content comparison - confirm callers only pass
    // pointers obtained from this class.
    return script == null_script;
  }

◆ IsSpaceDelimited()

bool tesseract::UNICHARSET::IsSpaceDelimited ( UNICHAR_ID unichar_id ) const

inline

Definition at line 668 of file unicharset.h.

                                                     {
    if (INVALID_UNICHAR_ID == unichar_id) {
      return true;
    }
    // Characters belonging to any of these five scripts are reported as not
    // space-delimited; everything else is.
    const int sid = get_script(unichar_id);
    const bool unspaced_script = sid == han_sid_ || sid == thai_sid_ ||
                                 sid == hangul_sid_ || sid == hiragana_sid_ ||
                                 sid == katakana_sid_;
    return !unspaced_script;
  }

◆ katakana_sid()

int tesseract::UNICHARSET::katakana_sid ( ) const

inline

Definition at line 937 of file unicharset.h.

                           {
    // Script id assigned in post_load_setup() from the lookup of "Katakana";
    // that lookup yields 0 (the null script) when the name is not found.
    return katakana_sid_;
  }

◆ latin_sid()

int tesseract::UNICHARSET::latin_sid ( ) const

inline

Definition at line 922 of file unicharset.h.

                        {
    // Script id assigned in post_load_setup() from the lookup of "Latin";
    // that lookup yields 0 (the null script) when the name is not found.
    return latin_sid_;
  }

◆ load_from_file() [1/5]

bool tesseract::UNICHARSET::load_from_file ( const char *const filename )

inline

Definition at line 401 of file unicharset.h.

                                                  {
    // Convenience overload: load by filename without skipping fragments.
    const bool skip_fragments = false;
    return load_from_file(filename, skip_fragments);
  }

◆ load_from_file() [2/5]

bool tesseract::UNICHARSET::load_from_file	(	const char *const	filename,
		bool	skip_fragments
	)

inline

Definition at line 391 of file unicharset.h.

                                                                       {
    // Open in binary mode; a file that cannot be opened is a load failure.
    FILE *fp = fopen(filename, "rb");
    if (fp != nullptr) {
      const bool loaded = load_from_file(fp, skip_fragments);
      fclose(fp);
      return loaded;
    }
    return false;
  }

◆ load_from_file() [3/5]

bool tesseract::UNICHARSET::load_from_file ( FILE * file )

inline

Definition at line 408 of file unicharset.h.

                                  {
    // Convenience overload: load from an open FILE without skipping fragments.
    const bool skip_fragments = false;
    return load_from_file(file, skip_fragments);
  }

◆ load_from_file() [4/5]

bool tesseract::UNICHARSET::load_from_file	(	FILE *	file,
		bool	skip_fragments
	)

Definition at line 767 of file unicharset.cpp.

                                                               {
  // Wrap the FILE* so it can be read through the line-reader callback that
  // load_via_fgets consumes. The wrapper outlives the callback's use because
  // load_via_fgets is called before this function returns.
  LocalFilePointer lfp(file);
  std::function<char *(char *, int)> read_line = [&lfp](char *dst, int size) {
    return lfp.fgets(dst, size);
  };
  return load_via_fgets(read_line, skip_fragments);
}

◆ load_from_file() [5/5]

bool tesseract::UNICHARSET::load_from_file	(	tesseract::TFile *	file,
		bool	skip_fragments
	)

Definition at line 776 of file unicharset.cpp.

                                                                         {
  // Expose TFile::FGets on `file` as the line-reader callback that
  // load_via_fgets consumes.
  std::function<char *(char *, int)> read_line = [file](char *dst, int size) {
    return file->FGets(dst, size);
  };
  return load_via_fgets(read_line, skip_fragments);
}

◆ major_right_to_left()

bool tesseract::UNICHARSET::major_right_to_left ( ) const

Definition at line 983 of file unicharset.cpp.

                                           {
  // Tally unichars by direction class and report whether the right-to-left
  // kinds outnumber the plain left-to-right ones. All other direction
  // classes are ignored.
  int ltr_total = 0;
  int rtl_total = 0;
  for (unsigned id = 0; id < unichars.size(); ++id) {
    const int dir = get_direction(id);
    switch (dir) {
      case UNICHARSET::U_LEFT_TO_RIGHT:
        ++ltr_total;
        break;
      case UNICHARSET::U_RIGHT_TO_LEFT:
      case UNICHARSET::U_RIGHT_TO_LEFT_ARABIC:
      case UNICHARSET::U_ARABIC_NUMBER:
        ++rtl_total;
        break;
      default:
        break;
    }
  }
  return rtl_total > ltr_total;
}

◆ normed_ids()

const std::vector< UNICHAR_ID > & tesseract::UNICHARSET::normed_ids ( UNICHAR_ID unichar_id ) const

inline

Definition at line 868 of file unicharset.h.

                                                                       {
    // Returns a reference to the stored normed_ids vector of unichar_id.
    // The id indexes unichars directly; no validity check is done here.
    return unichars[unichar_id].properties.normed_ids;
  }

◆ null_sid()

int tesseract::UNICHARSET::null_sid ( ) const

inline

Definition at line 916 of file unicharset.h.

                       {
    // Script id of the null script. post_load_setup() assigns it from the
    // lookup of null_script and asserts that the result is 0.
    return null_sid_;
  }

◆ PartialSetPropertiesFromOther()

void tesseract::UNICHARSET::PartialSetPropertiesFromOther	(	int	start_index,
		const UNICHARSET &	src
	)

Definition at line 395 of file unicharset.cpp.

                                                                      {
  // For every unichar from start_index onwards, copy the properties that src
  // holds for the same string, re-mapping the id-valued fields into this set.
  for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
    const char *utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES props;
    if (!src.GetStrProperties(utf8, &props)) {
      continue; // src has nothing for this string; leave the entry as is.
    }
    // script_id, other_case and mirror are ids in src's numbering, so each
    // one is translated through its string form into this set's numbering.
    const char *script_name = src.get_script_from_script_id(props.script_id);
    props.script_id = add_script(script_name);
    // When the counterpart string is not present here, point at ch itself.
    const char *other_case_str = src.id_to_unichar(props.other_case);
    props.other_case = contains_unichar(other_case_str)
                           ? unichar_to_id(other_case_str)
                           : static_cast<UNICHAR_ID>(ch);
    const char *mirror_repr = src.id_to_unichar(props.mirror);
    props.mirror = contains_unichar(mirror_repr)
                       ? unichar_to_id(mirror_repr)
                       : static_cast<UNICHAR_ID>(ch);
    unichars[ch].properties.CopyFrom(props);
    set_normed_ids(ch);
  }
}

◆ post_load_setup()

void tesseract::UNICHARSET::post_load_setup ( )

Definition at line 912 of file unicharset.cpp.

                                 {
  // Recomputes the derived, set-wide state after the unichars have been
  // loaded: top_bottom_set_, script_has_upper_lower_, script_has_xheight_,
  // the normed ids of every unichar, the well-known script ids and the
  // default script id.

  // Number of alpha chars with the case property minus those without,
  // in order to determine that half the alpha chars have case.
  int net_case_alphas = 0;
  int x_height_alphas = 0;
  int cap_height_alphas = 0;
  top_bottom_set_ = false;
  for (unsigned id = 0; id < unichars.size(); ++id) {
    int min_bottom = 0;
    int max_bottom = UINT8_MAX;
    int min_top = 0;
    int max_top = UINT8_MAX;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    // Any non-zero min_top marks the top/bottom data as populated.
    if (min_top > 0) {
      top_bottom_set_ = true;
    }
    if (get_isalpha(id)) {
      if (get_islower(id) || get_isupper(id)) {
        ++net_case_alphas;
      } else {
        --net_case_alphas;
      }
      // Classify by whether the whole top range lies below or above the
      // meanline threshold; ranges that straddle it count for neither.
      if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
        ++x_height_alphas;
      } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
        ++cap_height_alphas;
      }
    }
    set_normed_ids(id);
  }

  script_has_upper_lower_ = net_case_alphas > 0;
  script_has_xheight_ =
      script_has_upper_lower_ ||
      (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
       cap_height_alphas > x_height_alphas * kMinCapHeightFraction);

  // Look up the well-known scripts by name. A missing name resolves to 0,
  // which is the null script.
  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
  hiragana_sid_ = get_script_id_from_name("Hiragana");
  katakana_sid_ = get_script_id_from_name("Katakana");
  thai_sid_ = get_script_id_from_name("Thai");
  hangul_sid_ = get_script_id_from_name("Hangul");

  // Compute default script. Use the highest-counting alpha script, that is
  // not the common script, as that still contains some "alphas".
  // The per-script counters live in a std::vector: it zero-initializes the
  // elements and frees the storage on every exit path, so no manual
  // new[]/memset/delete[] bookkeeping is needed.
  std::vector<int> script_counts(static_cast<size_t>(script_table_size_used),
                                 0);
  for (unsigned id = 0; id < unichars.size(); ++id) {
    if (get_isalpha(id)) {
      ++script_counts[get_script(id)];
    }
  }
  default_sid_ = 0;
  for (int s = 1; s < script_table_size_used; ++s) {
    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
      default_sid_ = s;
    }
  }
}

◆ PropertiesIncomplete()

bool tesseract::UNICHARSET::PropertiesIncomplete ( UNICHAR_ID unichar_id ) const

inline

Definition at line 662 of file unicharset.h.

                                                         {
    // Forwards to AnyRangeEmpty() on the properties of unichar_id. The id
    // indexes unichars directly; no validity check is done here.
    return unichars[unichar_id].properties.AnyRangeEmpty();
  }

◆ save_to_file() [1/3]

bool tesseract::UNICHARSET::save_to_file ( const char *const filename ) const

inline

Definition at line 361 of file unicharset.h.

                                                      {
    // Create/truncate the file in binary mode; failure to open is reported
    // as a save failure.
    FILE *fp = fopen(filename, "w+b");
    if (fp != nullptr) {
      const bool saved = save_to_file(fp);
      fclose(fp);
      return saved;
    }
    return false;
  }

◆ save_to_file() [2/3]

bool tesseract::UNICHARSET::save_to_file ( FILE * file ) const

inline

Definition at line 373 of file unicharset.h.

                                      {
    // Render to text first; only write when rendering succeeded.
    std::string text;
    if (!save_to_string(text)) {
      return false;
    }
    return tesseract::Serialize(file, &text[0], text.length());
  }

◆ save_to_file() [3/3]

bool tesseract::UNICHARSET::save_to_file ( tesseract::TFile * file ) const

inline

Definition at line 379 of file unicharset.h.

                                                {
    // Render to text first; only write when rendering succeeded.
    std::string text;
    if (!save_to_string(text)) {
      return false;
    }
    return file->Serialize(&text[0], text.length());
  }

◆ save_to_string()

bool tesseract::UNICHARSET::save_to_string ( std::string & str ) const

Definition at line 718 of file unicharset.cpp.

                                                    {
  // Renders the whole set into `str`: first a line holding the entry count,
  // then one line per unichar id. Always returns true.
  const int kFileBufSize = 1024;
  char buffer[kFileBufSize + 1];
  snprintf(buffer, kFileBufSize, "%zu\n", this->size());
  str = buffer;
  for (unsigned id = 0; id < this->size(); ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    float width, width_sd;
    get_width_stats(id, &width, &width_sd);
    float bearing, bearing_sd;
    get_bearing_stats(id, &bearing, &bearing_sd);
    float advance, advance_sd;
    get_advance_stats(id, &advance, &advance_sd);
    const unsigned int properties = this->get_properties(id);
    const bool is_space = strcmp(this->id_to_unichar(id), " ") == 0;
    if (!is_space) {
      // Full record. The classic locale keeps number formatting independent
      // of the global locale.
      // NOTE(review): `properties` is streamed in decimal here, while the
      // space record below prints it with %x - confirm which radix the
      // loader expects.
      std::ostringstream line;
      line.imbue(std::locale::classic());
      line << this->id_to_unichar(id) << ' ' << properties << ' '
           << min_bottom << ',' << max_bottom << ',' << min_top << ','
           << max_top << ',' << width << ',' << width_sd << ',' << bearing
           << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '
           << this->get_script_from_script_id(this->get_script(id)) << ' '
           << this->get_other_case(id) << ' ' << this->get_direction(id)
           << ' ' << this->get_mirror(id) << ' '
           << this->get_normed_unichar(id) << "\t# "
           << this->debug_str(id).c_str() << '\n';
      str += line.str().c_str();
      continue;
    }
    // A lone space is written under the name "NULL" in a short record:
    // name, properties, script and other-case only.
    snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
             this->get_script_from_script_id(this->get_script(id)),
             this->get_other_case(id));
    str += buffer;
  }
  return true;
}

◆ script_has_upper_lower()

bool tesseract::UNICHARSET::script_has_upper_lower ( ) const

inline

Definition at line 951 of file unicharset.h.

                                      {
    // Flag computed in post_load_setup(): true when alpha chars that are
    // lower or upper case outnumber alpha chars that are neither.
    return script_has_upper_lower_;
  }

◆ script_has_xheight()

bool tesseract::UNICHARSET::script_has_xheight ( ) const

inline

Definition at line 958 of file unicharset.h.

                                  {
    // Flag computed in post_load_setup(): true when the script has
    // upper/lower case, or when its x-height and cap-height alpha counts
    // each exceed the configured fraction of the other.
    return script_has_xheight_;
  }

◆ set_advance_stats()

void tesseract::UNICHARSET::set_advance_stats	(	UNICHAR_ID	unichar_id,
		float	advance,
		float	advance_sd
	)

inline

Definition at line 656 of file unicharset.h.

                                           {
    // Store both advance statistics on the entry for unichar_id.
    auto &props = unichars[unichar_id].properties;
    props.advance = advance;
    props.advance_sd = advance_sd;
  }

◆ set_bearing_stats()

void tesseract::UNICHARSET::set_bearing_stats	(	UNICHAR_ID	unichar_id,
		float	bearing,
		float	bearing_sd
	)

inline

Definition at line 639 of file unicharset.h.

                                           {
    // Store both bearing statistics on the entry for unichar_id.
    auto &props = unichars[unichar_id].properties;
    props.bearing = bearing;
    props.bearing_sd = bearing_sd;
  }

◆ set_black_and_whitelist()

void tesseract::UNICHARSET::set_black_and_whitelist	(	const char *	blacklist,
		const char *	whitelist,
		const char *	unblacklist
	)

Definition at line 1004 of file unicharset.cpp.

                                                                  {
  // Encodes `list` with this set and assigns `state` to the enabled flag of
  // every unichar that was successfully encoded.
  auto set_enabled_for = [this](const char *list, bool state) {
    std::vector<UNICHAR_ID> encoding;
    encode_string(list, false, &encoding, nullptr, nullptr);
    for (UNICHAR_ID id : encoding) {
      if (id != INVALID_UNICHAR_ID) {
        unichars[id].properties.enabled = state;
      }
    }
  };
  auto has_text = [](const char *list) {
    return list != nullptr && list[0] != '\0';
  };

  // With no whitelist everything starts enabled; with one, everything starts
  // disabled and only the whitelisted unichars are switched on.
  const bool def_enabled = !has_text(whitelist);
  for (auto &slot : unichars) {
    slot.properties.enabled = def_enabled;
  }
  if (!def_enabled) {
    set_enabled_for(whitelist, true);
  }
  // The blacklist is applied after the whitelist, and the unblacklist last,
  // so later lists override earlier ones.
  if (has_text(blacklist)) {
    set_enabled_for(blacklist, false);
  }
  if (has_text(unblacklist)) {
    set_enabled_for(unblacklist, true);
  }
}

◆ set_direction()

void tesseract::UNICHARSET::set_direction	(	UNICHAR_ID	unichar_id,
		UNICHARSET::Direction	value
	)

inline

Definition at line 478 of file unicharset.h.

                                                                       {
    // Stores `value` as the direction property of unichar_id. The id indexes
    // unichars directly; no validity check is done here.
    unichars[unichar_id].properties.direction = value;
  }

◆ set_isalpha()

void tesseract::UNICHARSET::set_isalpha	(	UNICHAR_ID	unichar_id,
		bool	value
	)

inline

Definition at line 437 of file unicharset.h.

                                                      {
    // Stores `value` as the isalpha property of unichar_id. The id indexes
    // unichars directly; no validity check is done here.
    unichars[unichar_id].properties.isalpha = value;
  }

◆ set_isdigit()

void tesseract::UNICHARSET::set_isdigit	(	UNICHAR_ID	unichar_id,
		bool	value
	)

inline

Definition at line 452 of file unicharset.h.

                                                      {
    // Stores `value` as the isdigit property of unichar_id. The id indexes
    // unichars directly; no validity check is done here.
    unichars[unichar_id].properties.isdigit = value;
  }

◆ set_islower()

void tesseract::UNICHARSET::set_islower	(	UNICHAR_ID	unichar_id,
		bool	value
	)

inline

Definition at line 442 of file unicharset.h.

                                                      {
    // Stores `value` as the islower property of unichar_id. The id indexes
    // unichars directly; no validity check is done here.
    unichars[unichar_id].properties.islower = value;
  }

◆ set_isngram()

void tesseract::UNICHARSET::set_isngram	(	UNICHAR_ID	unichar_id,
		bool	value
	)

inline

Definition at line 462 of file unicharset.h.

                                                      {
    // Stores `value` as the isngram property of unichar_id. The id indexes
    // unichars directly; no validity check is done here.
    unichars[unichar_id].properties.isngram = value;
  }

◆ set_ispunctuation()

void tesseract::UNICHARSET::set_ispunctuation	(	UNICHAR_ID	unichar_id,
		bool	value
	)

inline

Definition at line 457 of file unicharset.h.

                                                            {
    // Stores `value` as the ispunctuation property of unichar_id. The id
    // indexes unichars directly; no validity check is done here.
    unichars[unichar_id].properties.ispunctuation = value;
  }

◆ set_isupper()

void tesseract::UNICHARSET::set_isupper	(	UNICHAR_ID	unichar_id,
		bool	value
	)

inline

Definition at line 447 of file unicharset.h.

                                                      {
    // Stores `value` as the isupper property of unichar_id. The id indexes
    // unichars directly; no validity check is done here.
    unichars[unichar_id].properties.isupper = value;
  }

◆ set_mirror()

void tesseract::UNICHARSET::set_mirror	(	UNICHAR_ID	unichar_id,
		UNICHAR_ID	mirror
	)

inline

Definition at line 483 of file unicharset.h.

                                                            {
    // Records `mirror` as the mirror id of unichar_id. Neither id is
    // validated here.
    unichars[unichar_id].properties.mirror = mirror;
  }

◆ set_normed()

void tesseract::UNICHARSET::set_normed	(	UNICHAR_ID	unichar_id,
		const char *	normed
	)

inline

Definition at line 488 of file unicharset.h.

                                                             {
    // Replace the normalized string and drop the ids derived from the old
    // one; they are not recomputed here.
    auto &props = unichars[unichar_id].properties;
    props.normed = normed;
    props.normed_ids.clear();
  }

◆ set_normed_ids()

void tesseract::UNICHARSET::set_normed_ids ( UNICHAR_ID unichar_id )

Definition at line 364 of file unicharset.cpp.

                                                     {
  // Rebuilds normed_ids for unichar_id from its normalized string.
  auto &props = unichars[unichar_id].properties;
  props.normed_ids.clear();
  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
    // The space slot, when it really holds a space, normalizes to itself.
    props.normed_ids.push_back(UNICHAR_SPACE);
    return;
  }
  const bool encoded = encode_string(props.normed.c_str(), true,
                                     &props.normed_ids, nullptr, nullptr);
  if (!encoded) {
    // The normalized string cannot be encoded with this set: discard any
    // partial result and fall back to the unichar's own id.
    props.normed_ids.clear();
    props.normed_ids.push_back(unichar_id);
  }
}

◆ set_other_case()

void tesseract::UNICHARSET::set_other_case	(	UNICHAR_ID	unichar_id,
		UNICHAR_ID	other_case
	)

inline

Definition at line 473 of file unicharset.h.

                                                                    {
    // Records `other_case` as the other-case id of unichar_id. Neither id is
    // validated here.
    unichars[unichar_id].properties.other_case = other_case;
  }

◆ set_ranges_empty()

void tesseract::UNICHARSET::set_ranges_empty ( )

Definition at line 386 of file unicharset.cpp.

                                  {
  // Apply SetRangesEmpty() to the properties of every unichar in the set.
  for (auto it = unichars.begin(); it != unichars.end(); ++it) {
    it->properties.SetRangesEmpty();
  }
}

◆ set_script()

void tesseract::UNICHARSET::set_script	(	UNICHAR_ID	unichar_id,
		const char *	value
	)

inline

Definition at line 468 of file unicharset.h.

                                                            {
    // Obtain the id for the script name via add_script(), then store it on
    // the entry for unichar_id.
    const int script_id = add_script(value);
    unichars[unichar_id].properties.script_id = script_id;
  }

◆ set_top_bottom()

void tesseract::UNICHARSET::set_top_bottom	(	UNICHAR_ID	unichar_id,
		int	min_bottom,
		int	max_bottom,
		int	min_top,
		int	max_top
	)

inline

Definition at line 599 of file unicharset.h.

                                                {
    // Each bound is clipped into [0, UINT8_MAX] before it is stored.
    auto &props = unichars[unichar_id].properties;
    props.min_bottom = ClipToRange<int>(min_bottom, 0, UINT8_MAX);
    props.max_bottom = ClipToRange<int>(max_bottom, 0, UINT8_MAX);
    props.min_top = ClipToRange<int>(min_top, 0, UINT8_MAX);
    props.max_top = ClipToRange<int>(max_top, 0, UINT8_MAX);
  }

◆ set_width_stats()

void tesseract::UNICHARSET::set_width_stats	(	UNICHAR_ID	unichar_id,
		float	width,
		float	width_sd
	)

inline

Definition at line 623 of file unicharset.h.

                                                                           {
    // Store both width statistics on the entry for unichar_id.
    auto &props = unichars[unichar_id].properties;
    props.width = width;
    props.width_sd = width_sd;
  }

◆ SetPropertiesFromOther()

void tesseract::UNICHARSET::SetPropertiesFromOther ( const UNICHARSET & src )

inline

Definition at line 563 of file unicharset.h.

                                                     {
    // Start index 0 makes the partial copy cover every unichar in this set.
    PartialSetPropertiesFromOther(0, src);
  }

◆ size()

size_t tesseract::UNICHARSET::size ( ) const

inline

Definition at line 355 of file unicharset.h.

                      {
    // Number of entries currently held in unichars.
    return unichars.size();
  }

◆ SizesDistinct()

bool tesseract::UNICHARSET::SizesDistinct	(	UNICHAR_ID	id1,
		UNICHAR_ID	id2
	)		const

Definition at line 476 of file unicharset.cpp.

                                                                   {
  // The two [min_top, max_top] ranges overlap by (smaller max - larger min);
  // the sizes are distinct when that overlap is zero or negative.
  const auto &first = unichars[id1].properties;
  const auto &second = unichars[id2].properties;
  const int lowest_max_top = std::min(first.max_top, second.max_top);
  const int highest_min_top = std::max(first.min_top, second.min_top);
  return lowest_max_top - highest_min_top <= 0;
}

◆ step()

int tesseract::UNICHARSET::step ( const char * str ) const

Definition at line 211 of file unicharset.cpp.

                                          {
  // Encode the string and report the byte length of its first unichar, or 0
  // when nothing was encoded or the first id is invalid.
  std::vector<UNICHAR_ID> encoded_ids;
  std::vector<char> byte_lengths;
  encode_string(str, true, &encoded_ids, &byte_lengths, nullptr);
  const bool first_is_valid =
      !encoded_ids.empty() && encoded_ids[0] != INVALID_UNICHAR_ID;
  return first_is_valid ? byte_lengths[0] : 0;
}

◆ thai_sid()

int tesseract::UNICHARSET::thai_sid ( ) const

inline

Definition at line 940 of file unicharset.h.

                       {
    // Script id assigned in post_load_setup() from the lookup of "Thai";
    // that lookup yields 0 (the null script) when the name is not found.
    return thai_sid_;
  }

◆ to_lower()

UNICHAR_ID tesseract::UNICHARSET::to_lower ( UNICHAR_ID unichar_id ) const

inline

Definition at line 730 of file unicharset.h.

                                                   {
    if (INVALID_UNICHAR_ID == unichar_id) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    // Already lower case: keep the id; otherwise use the recorded other case.
    const auto &props = unichars[unichar_id].properties;
    return props.islower ? unichar_id : props.other_case;
  }

◆ to_upper()

UNICHAR_ID tesseract::UNICHARSET::to_upper ( UNICHAR_ID unichar_id ) const

inline

Definition at line 742 of file unicharset.h.

                                                   {
    if (INVALID_UNICHAR_ID == unichar_id) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    // Already upper case: keep the id; otherwise use the recorded other case.
    const auto &props = unichars[unichar_id].properties;
    return props.isupper ? unichar_id : props.other_case;
  }

◆ top_bottom_useful()

bool tesseract::UNICHARSET::top_bottom_useful ( ) const

inline

Definition at line 555 of file unicharset.h.

                                 {
    // Flag computed in post_load_setup(): true when at least one unichar
    // reported a min_top greater than 0.
    return top_bottom_set_;
  }

◆ unichar_insert() [1/2]

void tesseract::UNICHARSET::unichar_insert ( const char *const unichar_repr )

inline

Definition at line 283 of file unicharset.h.

                                                      {
    // Convenience overload: insert without requesting old-style handling.
    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
  }

◆ unichar_insert() [2/2]

void tesseract::UNICHARSET::unichar_insert	(	const char *const	unichar_repr,
		OldUncleanUnichars	old_style
	)

Definition at line 654 of file unicharset.cpp.

                                                              {
  // Adds unichar_repr to the set unless it is empty, already present, or
  // (in clean mode) already encodable from existing entries. Requesting
  // old-style once switches the set to old-style for all later inserts.
  if (old_style == OldUncleanUnichars::kTrue) {
    old_style_included_ = true;
  }
  std::string cleaned =
      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
    const char *str = cleaned.c_str();
    std::vector<int> encoding;
    if (!old_style_included_ &&
        encode_string(str, true, &encoding, nullptr, nullptr)) {
      return;
    }
    // Validate the length BEFORE growing unichars. Rejecting after the slot
    // was appended would leave a half-filled entry that is not registered in
    // ids and would still increase size(), which callers use to detect
    // whether an insert succeeded.
    const size_t length = strlen(str);
    if (length > static_cast<size_t>(UNICHAR_LEN)) {
      fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
              unichar_repr);
      return;
    }
    unichars.emplace_back();
    auto &u = unichars.back();
    memcpy(u.representation, str, length);
    u.representation[length] = '\0';
    this->set_script(unichars.size() - 1, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
    u.properties.fragment = frag;
    if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
      u.properties.script_id = this->get_script(frag->get_unichar());
    }
    u.properties.enabled = true;
    ids.insert(u.representation, unichars.size() - 1);
  }
}

◆ unichar_insert_backwards_compatible()

void tesseract::UNICHARSET::unichar_insert_backwards_compatible ( const char *const unichar_repr )

inline

Definition at line 288 of file unicharset.h.

                                                                           {
    // A string that cleanup would alter can only be stored old-style.
    if (CleanupString(unichar_repr) != unichar_repr) {
      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
      return;
    }
    // Otherwise try the clean insert first and fall back to old-style only
    // when the clean insert did not add an entry.
    const auto size_before = size();
    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
    if (size() == size_before) {
      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
    }
  }

◆ unichar_to_id() [1/2]

UNICHAR_ID tesseract::UNICHARSET::unichar_to_id ( const char *const unichar_repr ) const

Definition at line 186 of file unicharset.cpp.

                                                              {
  // In old-style mode the string is looked up verbatim; otherwise it is
  // cleaned first.
  const std::string key = old_style_included_ ? std::string(unichar_repr)
                                              : CleanupString(unichar_repr);
  if (!ids.contains(key.data(), key.size())) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(key.data(), key.size());
}

◆ unichar_to_id() [2/2]

UNICHAR_ID tesseract::UNICHARSET::unichar_to_id	(	const char *const	unichar_repr,
		int	length
	)		const

Definition at line 194 of file unicharset.cpp.

                                                       {
  assert(length > 0 && length <= UNICHAR_LEN);
  // In old-style mode the first `length` bytes are looked up verbatim;
  // otherwise they are cleaned first.
  const std::string key = old_style_included_
                              ? std::string(unichar_repr, length)
                              : CleanupString(unichar_repr, length);
  if (!ids.contains(key.data(), key.size())) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(key.data(), key.size());
}

Member Data Documentation

◆ kCustomLigatures

const char * tesseract::UNICHARSET::kCustomLigatures

static

Initial value:

= {
    // Each row pairs a replacement string (column 0) with the private-use
    // code point that stands for it (column 1). id_to_unichar_ext() matches
    // a stored unichar against column 1 and returns column 0. The row of
    // nullptrs terminates the table.
    {"ct", "\uE003"}, 
    {"ſh", "\uE006"}, 
    {"ſi", "\uE007"}, 
    {"ſl", "\uE008"}, 
    {"ſſ", "\uE009"}, 
    {nullptr, nullptr}}

Definition at line 169 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * tesseract::UNICHARSET::kSpecialUnicharCodes

static

Initial value:

= {

// Three string entries.
// NOTE(review): presumably one per special unichar code, in enum order —
// confirm against the declaration in unicharset.h.
" ", "Joined", "|Broken|0|1"}

Definition at line 172 of file unicharset.h.

The documentation for this class was generated from the following files:

/media/home/debian/src/github/tesseract-ocr/tesseract/src/ccutil/unicharset.h
/media/home/debian/src/github/tesseract-ocr/tesseract/src/ccutil/unicharset.cpp

Public Types

Public Member Functions

Static Public Member Functions

Static Public Attributes

Detailed Description

Member Enumeration Documentation

◆ Direction

Constructor & Destructor Documentation

◆ UNICHARSET()

◆ ~UNICHARSET()

Member Function Documentation

◆ add_script()

◆ AnyRepeatedUnicodes()

◆ AppendOtherUnicharset()

◆ CleanupString() [1/2]

◆ CleanupString() [2/2]

◆ clear()

◆ common_sid()

◆ contains_unichar() [1/2]

◆ contains_unichar() [2/2]

◆ contains_unichar_id()

◆ CopyFrom()

◆ cyrillic_sid()

◆ debug_str() [1/2]

◆ debug_str() [2/2]

◆ debug_utf8_str()

◆ default_sid()

◆ delete_pointers_in_unichars()

◆ encodable_string()

◆ encode_string()

◆ eq()

◆ ExpandRangesFromOther()

◆ get_advance_stats()

◆ get_bearing_stats()

◆ get_chartype() [1/2]

◆ get_chartype() [2/2]

◆ get_direction()

◆ get_enabled()

◆ get_fragment() [1/2]

◆ get_fragment() [2/2]

◆ get_isalpha() [1/3]

◆ get_isalpha() [2/3]

◆ get_isalpha() [3/3]

◆ get_isdigit() [1/3]

◆ get_isdigit() [2/3]

◆ get_isdigit() [3/3]

◆ get_islower() [1/3]

◆ get_islower() [2/3]

◆ get_islower() [3/3]

◆ get_isngram()

◆ get_isprivate()

◆ get_ispunctuation() [1/3]

◆ get_ispunctuation() [2/3]

◆ get_ispunctuation() [3/3]

◆ get_isupper() [1/3]

◆ get_isupper() [2/3]

◆ get_isupper() [3/3]

◆ get_mirror()

◆ get_normed_unichar()

◆ get_other_case()

◆ get_properties() [1/2]

◆ get_properties() [2/2]

◆ get_script() [1/3]

◆ get_script() [2/3]

◆ get_script() [3/3]

◆ get_script_from_script_id()

◆ get_script_id_from_name()

◆ get_script_table_size()

◆ get_top_bottom()

◆ get_width_stats()

◆ greek_sid()

◆ han_sid()

◆ hangul_sid()

◆ has_special_codes()

◆ hiragana_sid()

◆ id_to_unichar()

◆ id_to_unichar_ext()

◆ is_null_script()

◆ IsSpaceDelimited()

◆ katakana_sid()