// Source listing: tesseract-ocr.github.io/5.3.3/a00185_source.html

// File:        unicharset.h

// Description: Unicode character/ligature set class.

// Author:      Thomas Kielbus

//

// (C) Copyright 2006, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//


#ifndef TESSERACT_CCUTIL_UNICHARSET_H_

#define TESSERACT_CCUTIL_UNICHARSET_H_


#include "errcode.h"

#include "unicharmap.h"


#include <tesseract/unichar.h>

#include "helpers.h"

#include "serialis.h"


#include <functional> // for std::function


namespace tesseract {


// Special values of unichar_id. Every unicharset has these.
// Warning! Keep in sync with kSpecialUnicharCodes.
enum SpecialUnicharCodes {
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Number of special codes; must remain the last enumerator.
  SPECIAL_UNICHAR_CODES_COUNT = 3
};


// Two-state flag passed to unichar_insert. The naming is a bit of a double
// negative so that the default value (kFalse) is the zero value.
enum class OldUncleanUnichars {
  kFalse = 0,
  kTrue = 1,
};


// Describes one piece ("fragment") of a character that is represented as
// several chunks: the character's unichar string, the position of this piece,
// the total number of pieces, and whether the piece was naturally separate.
class TESS_API CHAR_FRAGMENT {
public:
  // Minimum number of characters used for fragment representation.
  static const int kMinLen = 6;
  // Maximum number of characters used for fragment representation.
  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
  // Maximum number of fragments per character.
  static const int kMaxChunks = 5;

  // Setters and Getters.

  // Sets every field of the fragment in one call.
  inline void set_all(const char *unichar, int pos, int total, bool natural) {
    set_unichar(unichar);
    set_pos(pos);
    set_total(total);
    set_natural(natural);
  }
  // Copies uch into the fragment. The final byte of the buffer is always
  // overwritten with NUL, so at most UNICHAR_LEN bytes of uch are kept and
  // the stored string is always terminated.
  inline void set_unichar(const char *uch) {
    strncpy(this->unichar, uch, sizeof(this->unichar));
    this->unichar[UNICHAR_LEN] = '\0';
  }
  // Sets the position of this fragment within its character.
  inline void set_pos(int p) {
    this->pos = p;
  }
  // Sets the total number of fragments of the character.
  inline void set_total(int t) {
    this->total = t;
  }
  // Returns the unichar string this fragment belongs to.
  inline const char *get_unichar() const {
    return this->unichar;
  }
  // Returns the position of this fragment within its character.
  inline int get_pos() const {
    return this->pos;
  }
  // Returns the total number of fragments of the character.
  inline int get_total() const {
    return this->total;
  }

  // Returns the string that represents a fragment
  // with the given unichar, pos and total.
  static std::string to_string(const char *unichar, int pos, int total,
                               bool natural);
  // Returns the string that represents this fragment.
  std::string to_string() const {
    return to_string(unichar, pos, total, natural);
  }

  // Checks whether a fragment has the same unichar,
  // position and total as the given inputs.
  // Note that the natural flag takes no part in the comparison.
  inline bool equals(const char *other_unichar, int other_pos,
                     int other_total) const {
    return (strcmp(this->unichar, other_unichar) == 0 &&
            this->pos == other_pos && this->total == other_total);
  }
  // Same comparison against another fragment, which must not be nullptr
  // (it is dereferenced unconditionally).
  inline bool equals(const CHAR_FRAGMENT *other) const {
    return this->equals(other->get_unichar(), other->get_pos(),
                        other->get_total());
  }

  // Checks whether a given fragment is a continuation of this fragment.
  // Assumes that the given fragment pointer is not nullptr.
  // As written, the test is true when both fragments share unichar and total
  // and this fragment's position is exactly one more than the given one's.
  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
    return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
            this->total == fragment->get_total() &&
            this->pos == fragment->get_pos() + 1);
  }

  // Returns true if this fragment is a beginning fragment (position 0).
  inline bool is_beginning() const {
    return this->pos == 0;
  }

  // Returns true if this fragment is an ending fragment (position total - 1).
  inline bool is_ending() const {
    return this->pos == this->total - 1;
  }

  // Returns true if the fragment was a separate component to begin with,
  // ie did not need chopping to be isolated, but may have been separated
  // out from a multi-outline blob.
  inline bool is_natural() const {
    return natural;
  }
  // Sets the natural flag (see is_natural).
  void set_natural(bool value) {
    natural = value;
  }

  // Parses the string to see whether it represents a character fragment
  // (rather than a regular character). If so, allocates memory for a new
  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
  // information. Fragments are of the form:
  // |m|1|2, meaning chunk 1 of 2 of character m, or
  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
  // to divide the parts, as they were already separate connected components.
  //
  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
  // instance, otherwise (if the string does not represent a fragment or it
  // looks like it does, but parsing it as a fragment fails) returns nullptr.
  //
  // Note: The caller is responsible for deallocating memory
  // associated with the returned pointer.
  static CHAR_FRAGMENT *parse_from_string(const char *str);

private:
  // All members carry default initializers so that a default-constructed
  // fragment holds an empty string and zeroed fields instead of
  // indeterminate values.
  char unichar[UNICHAR_LEN + 1] = {};
  // True if the fragment was a separate component to begin with,
  // ie did not need chopping to be isolated, but may have been separated
  // out from a multi-outline blob.
  bool natural = false;
  int16_t pos = 0;   // fragment position in the character
  int16_t total = 0; // total number of fragments in the character
};


// The UNICHARSET class is a utility class for Tesseract that holds the

// set of characters that are used by the engine. Each character is identified

// by a unique number, from 0 to (size - 1).

class TESS_API UNICHARSET {

public:

  // Custom list of characters and their ligature forms (UTF8)

  // These map to unicode values in the private use area (PUC) and are supported

  // by only a few font families (eg. Wyld, Adobe Caslon Pro).

  static const char *kCustomLigatures[][2];


  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.

  static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];


  // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
  // Every direction value is written out explicitly, so the numbering does
  // not depend on the order of declaration.
  enum Direction {
    U_LEFT_TO_RIGHT = 0,
    U_RIGHT_TO_LEFT = 1,
    U_EUROPEAN_NUMBER = 2,
    U_EUROPEAN_NUMBER_SEPARATOR = 3,
    U_EUROPEAN_NUMBER_TERMINATOR = 4,
    U_ARABIC_NUMBER = 5,
    U_COMMON_NUMBER_SEPARATOR = 6,
    U_BLOCK_SEPARATOR = 7,
    U_SEGMENT_SEPARATOR = 8,
    U_WHITE_SPACE_NEUTRAL = 9,
    U_OTHER_NEUTRAL = 10,
    U_LEFT_TO_RIGHT_EMBEDDING = 11,
    U_LEFT_TO_RIGHT_OVERRIDE = 12,
    U_RIGHT_TO_LEFT_ARABIC = 13,
    U_RIGHT_TO_LEFT_EMBEDDING = 14,
    U_RIGHT_TO_LEFT_OVERRIDE = 15,
    U_POP_DIRECTIONAL_FORMAT = 16,
    U_DIR_NON_SPACING_MARK = 17,
    U_BOUNDARY_NEUTRAL = 18,
    U_FIRST_STRONG_ISOLATE = 19,
    U_LEFT_TO_RIGHT_ISOLATE = 20,
    U_RIGHT_TO_LEFT_ISOLATE = 21,
    U_POP_DIRECTIONAL_ISOLATE = 22,
#ifndef U_HIDE_DEPRECATED_API
    // Only declared when U_HIDE_DEPRECATED_API is not defined. It has no
    // explicit value, so it is one more than the preceding enumerator (23).
    U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
  };


  // Create an empty UNICHARSET

  UNICHARSET();


  ~UNICHARSET();


  // Return the UNICHAR_ID of a given unichar representation within the

  // UNICHARSET.

  UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;


  // Return the UNICHAR_ID of a given unichar representation within the

  // UNICHARSET. Only the first length characters from unichar_repr are used.

  UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;


  // Return the minimum number of bytes that matches a legal UNICHAR_ID,

  // while leaving the rest of the string encodable. Returns 0 if the

  // beginning of the string is not encodable.

  // WARNING: this function now encodes the whole string for precision.

  // Use encode_string in preference to repeatedly calling step.

  int step(const char *str) const;


  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.

  // If not encodable, write the first byte offset which cannot be converted

  // into the second (return) argument.

  bool encodable_string(const char *str, unsigned *first_bad_position) const;


  // Encodes the given UTF-8 string with this UNICHARSET.

  // Any part of the string that cannot be encoded (because the utf8 can't

  // be broken up into pieces that are in the unicharset) then:

  // if give_up_on_failure, stops and returns a partial encoding,

  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.

  // Returns true if the encoding succeeds completely, false if there is at

  // least one failure.

  // If lengths is not nullptr, then it is filled with the corresponding

  // byte length of each encoded UNICHAR_ID.

  // If encoded_length is not nullptr then on return it contains the length of

  // str that was encoded. (if give_up_on_failure the location of the first

  // failure, otherwise strlen(str).)

  // WARNING: Caller must guarantee that str has already been cleaned of codes

  // that do not belong in the unicharset, or encoding may fail.

  // Use CleanupString to perform the cleaning.

  bool encode_string(const char *str, bool give_up_on_failure,

                     std::vector<UNICHAR_ID> *encoding,

                     std::vector<char> *lengths,

                     unsigned *encoded_length) const;


  // Return the unichar representation corresponding to the given UNICHAR_ID

  // within the UNICHARSET.

  const char *id_to_unichar(UNICHAR_ID id) const;


  // Return the UTF8 representation corresponding to the given UNICHAR_ID after

  // resolving any private encodings internal to Tesseract. This method is

  // preferable to id_to_unichar for outputting text that will be visible to

  // external applications.

  const char *id_to_unichar_ext(UNICHAR_ID id) const;


  // Return a string that reformats the utf8 str into the str followed

  // by its hex unicodes.

  static std::string debug_utf8_str(const char *str);


  // Removes/replaces content that belongs in rendered text, but not in the

  // unicharset.

  static std::string CleanupString(const char *utf8_str) {
    // Measure the NUL-terminated input and defer to the length-taking overload.
    const size_t utf8_len = strlen(utf8_str);
    return CleanupString(utf8_str, utf8_len);
  }

  static std::string CleanupString(const char *utf8_str, size_t length);


  // Return a string containing debug information on the unichar, including

  // the id_to_unichar, its hex unicodes and the properties.

  std::string debug_str(UNICHAR_ID id) const;

  std::string debug_str(const char *unichar_repr) const {

    return debug_str(unichar_to_id(unichar_repr));

  }


  // Adds a unichar representation to the set. If old_style is true, then

  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL

  // characters are ignored/skipped as if they don't exist and n-grams that

  // can already be encoded are not added.

  void unichar_insert(const char *const unichar_repr,

                      OldUncleanUnichars old_style);

  void unichar_insert(const char *const unichar_repr) {

    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);

  }

  // Adds a unichar representation to the set. Avoids setting old_style to true,

  // unless it is necessary to make the new unichar get added.

  void unichar_insert_backwards_compatible(const char *const unichar_repr) {
    // If CleanupString would alter the representation, insert it old-style
    // straight away.
    if (CleanupString(unichar_repr) != unichar_repr) {
      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
      return;
    }
    // Otherwise try the new-style insert first. If the set did not grow,
    // nothing was added, so repeat the insert old-style.
    const auto size_before = size();
    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
    if (size() != size_before) {
      return;
    }
    unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
  }


  // Return true if the given unichar id exists within the set.

  // Relies on the fact that unichar ids are contiguous in the unicharset.

  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
    // Compare as size_t. Presumably UNICHAR_ID is signed, in which case a
    // negative id converts to a very large value and is reported as absent.
    const auto index = static_cast<size_t>(unichar_id);
    return index < unichars.size();
  }


  // Return true if the given unichar representation exists within the set.

  bool contains_unichar(const char *const unichar_repr) const;

  bool contains_unichar(const char *const unichar_repr, int length) const;


  // Return true if the given unichar representation corresponds to the given

  // UNICHAR_ID within the set.

  bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;


  // Delete CHAR_FRAGMENTs stored in properties of unichars array.

  void delete_pointers_in_unichars() {

    for (auto &unichar : unichars) {

      delete unichar.properties.fragment;

      unichar.properties.fragment = nullptr;

    }

  }


  // Clear the UNICHARSET (all the previous data is lost).

  void clear() {
    // Free each used script_table entry, then the table itself.
    if (script_table != nullptr) {
      int index = 0;
      while (index < script_table_size_used) {
        delete[] script_table[index];
        ++index;
      }
      delete[] script_table;
      script_table = nullptr;
      script_table_size_used = 0;
    }
    script_table_size_reserved = 0;

    // Drop the unichars (freeing their fragments first) and the id map.
    delete_pointers_in_unichars();
    unichars.clear();
    ids.clear();

    // Reset the set-wide flags.
    top_bottom_set_ = false;
    script_has_upper_lower_ = false;
    script_has_xheight_ = false;
    old_style_included_ = false;

    // Reset every script id to 0.
    null_sid_ = 0;
    common_sid_ = 0;
    latin_sid_ = 0;
    cyrillic_sid_ = 0;
    greek_sid_ = 0;
    han_sid_ = 0;
    hiragana_sid_ = 0;
    katakana_sid_ = 0;
    thai_sid_ = 0;
    hangul_sid_ = 0;
    default_sid_ = 0;
  }


  // Return the size of the set (the number of different UNICHAR it holds).

  size_t size() const {
    // The set holds exactly one slot per unichar.
    const size_t count = unichars.size();
    return count;
  }


  // Opens the file indicated by filename (mode "w+b", so it is created or
  // truncated) and saves the unicharset to that file.
  // Returns true only if the file could be opened, the unicharset was
  // written, and the file was closed without error.
  bool save_to_file(const char *const filename) const {
    FILE *file = fopen(filename, "w+b");
    if (file == nullptr) {
      return false;
    }
    const bool saved = save_to_file(file);
    // Buffered output may only reach the file when it is closed, so a failing
    // fclose means the data was possibly not stored and must count as failure.
    const bool closed = fclose(file) == 0;
    return saved && closed;
  }


  // Saves the content of the UNICHARSET to the given file.

  // Returns true if the operation is successful.

  bool save_to_file(FILE *file) const {
    // Render the unicharset into a string first; only write it out if that
    // step succeeded.
    std::string str;
    if (!save_to_string(str)) {
      return false;
    }
    return tesseract::Serialize(file, &str[0], str.length());
  }


  bool save_to_file(tesseract::TFile *file) const {
    // Same two steps as the FILE* overload, writing through the TFile.
    std::string str;
    if (!save_to_string(str)) {
      return false;
    }
    return file->Serialize(&str[0], str.length());
  }


  // Saves the content of the UNICHARSET to the given string.

  // Returns true if the operation is successful.

  bool save_to_string(std::string &str) const;


  // Opens the file indicated by filename and loads the UNICHARSET

  // from the given file. The previous data is lost.

  // Returns true if the operation is successful.

  bool load_from_file(const char *const filename, bool skip_fragments) {
    FILE *file = fopen(filename, "rb");
    if (file != nullptr) {
      // Delegate the parsing to the FILE* overload and close the file
      // whatever the outcome.
      const bool loaded = load_from_file(file, skip_fragments);
      fclose(file);
      return loaded;
    }
    // The file could not be opened.
    return false;
  }

  // Returns true if the operation is successful.

  bool load_from_file(const char *const filename) {
    // Convenience overload: skip_fragments defaults to false.
    const bool skip_fragments = false;
    return load_from_file(filename, skip_fragments);
  }


  // Loads the UNICHARSET from the given file. The previous data is lost.

  // Returns true if the operation is successful.

  bool load_from_file(FILE *file, bool skip_fragments);

  bool load_from_file(FILE *file) {
    // Convenience overload: skip_fragments defaults to false.
    const bool skip_fragments = false;
    return load_from_file(file, skip_fragments);
  }

  bool load_from_file(tesseract::TFile *file, bool skip_fragments);


  // Sets up internal data after loading the file, based on the char

  // properties. Called from load_from_file, but also needs to be run

  // during set_unicharset_properties.

  void post_load_setup();


  // Returns true if right_to_left scripts are significant in the unicharset,

  // but without being so sensitive that "universal" unicharsets containing

  // characters from many scripts, like orientation and script detection,

  // look like they are right_to_left.

  bool major_right_to_left() const;


  // Set a whitelist and/or blacklist of characters to recognize.

  // An empty or nullptr whitelist enables everything (minus any blacklist).

  // An empty or nullptr blacklist disables nothing.

  // An empty or nullptr unblacklist has no effect.

  // The blacklist overrides the whitelist.

  // The unblacklist overrides the blacklist.

  // Each list is a string of utf8 character strings. Boundaries between

  // unicharset units are worked out automatically, and characters not in

  // the unicharset are silently ignored.

  void set_black_and_whitelist(const char *blacklist, const char *whitelist,

                               const char *unblacklist);


  // Set the isalpha property of the given unichar to the given value.

  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.isalpha = value;
  }


  // Set the islower property of the given unichar to the given value.

  void set_islower(UNICHAR_ID unichar_id, bool value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.islower = value;
  }


  // Set the isupper property of the given unichar to the given value.

  void set_isupper(UNICHAR_ID unichar_id, bool value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.isupper = value;
  }


  // Set the isdigit property of the given unichar to the given value.

  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.isdigit = value;
  }


  // Set the ispunctuation property of the given unichar to the given value.

  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.ispunctuation = value;
  }


  // Set the isngram property of the given unichar to the given value.

  void set_isngram(UNICHAR_ID unichar_id, bool value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.isngram = value;
  }


  // Set the script name of the given unichar to the given value.

  // Value is copied and thus can be a temporary;

  void set_script(UNICHAR_ID unichar_id, const char *value) {
    // add_script yields the id that is stored for this script name.
    const auto script_id = add_script(value);
    unichars[unichar_id].properties.script_id = script_id;
  }


  // Set other_case unichar id in the properties for the given unichar id.

  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.other_case = other_case;
  }


  // Set the direction property of the given unichar to the given value.

  void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.direction = value;
  }


  // Set mirror unichar id in the properties for the given unichar id.

  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
    // No range check is made here; unichar_id is used directly as an index.
    auto &props = unichars[unichar_id].properties;
    props.mirror = mirror;
  }


  // Record normalized version of unichar with the given unichar_id.

  void set_normed(UNICHAR_ID unichar_id, const char *normed) {
    auto &props = unichars[unichar_id].properties;
    props.normed = normed;
    // normed_ids is emptied whenever the normed string is replaced;
    // set_normed_ids can fill it again.
    props.normed_ids.clear();
  }

  // Sets the normed_ids vector from the normed string. normed_ids is not

  // stored in the file, and needs to be set when the UNICHARSET is loaded.

  void set_normed_ids(UNICHAR_ID unichar_id);


  // Return the isalpha property of the given unichar.

  bool get_isalpha(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.isalpha;
    }
    // The invalid id has no properties.
    return false;
  }


  // Return the islower property of the given unichar.

  bool get_islower(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.islower;
    }
    // The invalid id has no properties.
    return false;
  }


  // Return the isupper property of the given unichar.

  bool get_isupper(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.isupper;
    }
    // The invalid id has no properties.
    return false;
  }


  // Return the isdigit property of the given unichar.

  bool get_isdigit(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.isdigit;
    }
    // The invalid id has no properties.
    return false;
  }


  // Return the ispunctuation property of the given unichar.

  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.ispunctuation;
    }
    // The invalid id has no properties.
    return false;
  }


  // Return the isngram property of the given unichar.

  bool get_isngram(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      // Any other id must belong to the set.
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.isngram;
    }
    // The invalid id has no properties.
    return false;
  }


  // Returns whether the unichar id represents a unicode value in the private

  // use area.

  bool get_isprivate(UNICHAR_ID unichar_id) const;


  // Returns true if the ids have useful min/max top/bottom values.

  bool top_bottom_useful() const {
    // Simply reports the stored flag.
    const bool useful = top_bottom_set_;
    return useful;
  }

  // Sets all ranges to empty, so they can be expanded to set the values.

  void set_ranges_empty();

  // Sets all the properties for this unicharset given a src_unicharset with

  // everything set. The unicharsets don't have to be the same, and graphemes

  // are correctly accounted for.

  void SetPropertiesFromOther(const UNICHARSET &src) {
    // Covering the whole set is the partial update started at index 0.
    const int start_index = 0;
    PartialSetPropertiesFromOther(start_index, src);
  }

  // Sets properties from Other, starting only at the given index.

  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);

  // Expands the tops and bottoms and widths for this unicharset given a

  // src_unicharset with ranges in it. The unicharsets don't have to be the

  // same, and graphemes are correctly accounted for.

  void ExpandRangesFromOther(const UNICHARSET &src);

  // Makes this a copy of src. Clears this completely first, so the automatic

  // ids will not be present in this if not in src.

  void CopyFrom(const UNICHARSET &src);

  // For each id in src, if it does not occur in this, add it, as in

  // SetPropertiesFromOther, otherwise expand the ranges, as in

  // ExpandRangesFromOther.

  void AppendOtherUnicharset(const UNICHARSET &src);

  // Returns true if the acceptable ranges of the tops of the characters do

  // not overlap, making their x-height calculations distinct.

  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;

  // Returns the min and max bottom and top of the given unichar in

  // baseline-normalized coordinates, ie, where the baseline is

  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight

  // (See normalis.h for the definitions).

  void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,
                      int *min_top, int *max_top) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *min_bottom = props.min_bottom;
      *max_bottom = props.max_bottom;
      *min_top = props.min_top;
      *max_top = props.max_top;
      return;
    }
    // The invalid id gets the range 0..256 for both top and bottom.
    *min_top = 0;
    *min_bottom = 0;
    *max_top = 256; // kBlnCellHeight
    *max_bottom = 256;
  }

  void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,
                      int min_top, int max_top) {
    // Every value is clipped to 0..UINT8_MAX before it is stored.
    auto &props = unichars[unichar_id].properties;
    props.min_bottom = ClipToRange<int>(min_bottom, 0, UINT8_MAX);
    props.max_bottom = ClipToRange<int>(max_bottom, 0, UINT8_MAX);
    props.min_top = ClipToRange<int>(min_top, 0, UINT8_MAX);
    props.max_top = ClipToRange<int>(max_top, 0, UINT8_MAX);
  }

  // Returns the width stats (as mean, sd) of the given unichar relative to the

  // median advance of all characters in the character set.

  void get_width_stats(UNICHAR_ID unichar_id, float *width,
                       float *width_sd) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *width = props.width;
      *width_sd = props.width_sd;
      return;
    }
    // The invalid id reports zero for both values.
    *width = 0.0f;
    *width_sd = 0.0f;
  }

  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
    // Stores both values unchanged; no range check on unichar_id.
    auto &props = unichars[unichar_id].properties;
    props.width = width;
    props.width_sd = width_sd;
  }

  // Returns the stats of the x-bearing (as mean, sd) of the given unichar

  // relative to the median advance of all characters in the character set.

  void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,
                         float *bearing_sd) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *bearing = props.bearing;
      *bearing_sd = props.bearing_sd;
      return;
    }
    // The invalid id reports zero for both values.
    *bearing_sd = 0.0f;
    *bearing = 0.0f;
  }

  void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,
                         float bearing_sd) {
    // Stores both values unchanged; no range check on unichar_id.
    auto &props = unichars[unichar_id].properties;
    props.bearing = bearing;
    props.bearing_sd = bearing_sd;
  }

  // Returns the stats of the x-advance of the given unichar (as mean, sd)

  // relative to the median advance of all characters in the character set.

  void get_advance_stats(UNICHAR_ID unichar_id, float *advance,
                         float *advance_sd) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      *advance = props.advance;
      *advance_sd = props.advance_sd;
      return;
    }
    // The invalid id reports zero for both values.
    *advance_sd = 0;
    *advance = 0;
  }

  void set_advance_stats(UNICHAR_ID unichar_id, float advance,
                         float advance_sd) {
    // Stores both values unchanged; no range check on unichar_id.
    auto &props = unichars[unichar_id].properties;
    props.advance = advance;
    props.advance_sd = advance_sd;
  }

  // Returns true if the font metrics properties are empty.

  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
    // Unlike the getters, this neither special-cases INVALID_UNICHAR_ID nor
    // asserts that the id is in the set.
    const auto &props = unichars[unichar_id].properties;
    return props.AnyRangeEmpty();
  }


  // Returns true if the script of the given id is space delimited.
  // Returns false when the script id equals han_sid_, thai_sid_, hangul_sid_,
  // hiragana_sid_ or katakana_sid_. INVALID_UNICHAR_ID counts as space
  // delimited.
  bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
    if (unichar_id == INVALID_UNICHAR_ID) {
      return true;
    }
    const int script_id = get_script(unichar_id);
    // The scripts listed here are the ones treated as not space delimited.
    const bool not_delimited =
        script_id == han_sid_ || script_id == thai_sid_ ||
        script_id == hangul_sid_ || script_id == hiragana_sid_ ||
        script_id == katakana_sid_;
    return !not_delimited;
  }


  // Return the script id (an int, not a name string) stored in the
  // properties of the given unichar. INVALID_UNICHAR_ID yields null_sid_.
  // Any other id must be contained in the set.
  int get_script(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.script_id;
    }
    return null_sid_;
  }


  // Return the character properties, eg. alpha/upper/lower/digit/punct,

  // as a bit field of unsigned int.

  unsigned int get_properties(UNICHAR_ID unichar_id) const;


  // Return the character property as a single char.  If a character has

  // multiple attributes, the main property is defined by the following order:

  //   upper_case : 'A'

  //   lower_case : 'a'

  //   alpha      : 'x'

  //   digit      : '0'

  //   punctuation: 'p'

  char get_chartype(UNICHAR_ID unichar_id) const;


  // Get other_case unichar id in the properties for the given unichar id.

  UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.other_case;
    }
    // The invalid id maps to itself.
    return INVALID_UNICHAR_ID;
  }


  // Returns the direction property of the given unichar.

  Direction get_direction(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.direction;
    }
    // The invalid id is reported as U_OTHER_NEUTRAL.
    return UNICHARSET::U_OTHER_NEUTRAL;
  }


  // Get mirror unichar id in the properties for the given unichar id.

  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.mirror;
    }
    // The invalid id maps to itself.
    return INVALID_UNICHAR_ID;
  }


  // Returns UNICHAR_ID of the corresponding lower-case unichar.

  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
    if (unichar_id == INVALID_UNICHAR_ID) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const auto &props = unichars[unichar_id].properties;
    // An id already flagged lower-case is returned as is; anything else is
    // answered with its other_case entry.
    return props.islower ? unichar_id : props.other_case;
  }


  // Returns UNICHAR_ID of the corresponding upper-case unichar.

  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
    if (unichar_id == INVALID_UNICHAR_ID) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const auto &props = unichars[unichar_id].properties;
    // An id already flagged upper-case is returned as is; anything else is
    // answered with its other_case entry.
    return props.isupper ? unichar_id : props.other_case;
  }


  // Returns true if this UNICHARSET has the special codes in

  // SpecialUnicharCodes available. If false then there are normal unichars

  // at these codes and they should not be used.

  bool has_special_codes() const {

    return get_fragment(UNICHAR_BROKEN) != nullptr &&

           strcmp(id_to_unichar(UNICHAR_BROKEN),

                  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;

  }


  // Returns true if there are any repeated unicodes in the normalized

  // text of any unichar-id in the unicharset.

  bool AnyRepeatedUnicodes() const;


  // Return a pointer to the CHAR_FRAGMENT class if the given

  // unichar id represents a character fragment.

  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
    if (unichar_id != INVALID_UNICHAR_ID) {
      ASSERT_HOST(contains_unichar_id(unichar_id));
      const auto &props = unichars[unichar_id].properties;
      return props.fragment;
    }
    // The invalid id has no fragment.
    return nullptr;
  }


  // Return the isalpha property of the given unichar representation.

  bool get_isalpha(const char *const unichar_repr) const {

    return get_isalpha(unichar_to_id(unichar_repr));

  }


  // Return the islower property of the given unichar representation.

  bool get_islower(const char *const unichar_repr) const {

    return get_islower(unichar_to_id(unichar_repr));

  }


  // Return the isupper property of the given unichar representation.

  bool get_isupper(const char *const unichar_repr) const {

    return get_isupper(unichar_to_id(unichar_repr));

  }


  // Return the isdigit property of the given unichar representation.

  bool get_isdigit(const char *const unichar_repr) const {

    return get_isdigit(unichar_to_id(unichar_repr));

  }


  // Return the ispunctuation property of the given unichar representation.

  bool get_ispunctuation(const char *const unichar_repr) const {

    return get_ispunctuation(unichar_to_id(unichar_repr));

  }


  // Return the character properties, eg. alpha/upper/lower/digit/punct,

  // of the given unichar representation

  unsigned int get_properties(const char *const unichar_repr) const {

    return get_properties(unichar_to_id(unichar_repr));

  }


  char get_chartype(const char *const unichar_repr) const {
    // String front end for get_chartype(UNICHAR_ID): look the id up, then
    // forward to the id-based overload.
    const auto id = unichar_to_id(unichar_repr);
    return get_chartype(id);
  }


  // Return the script id of the given unichar representation.

  // The script name string for that id (see get_script_from_script_id) is

  // managed by unicharset and thus MUST NOT be deleted

  int get_script(const char *const unichar_repr) const {
    // The result is an int, not a string: resolve the id and forward to the
    // id-based overload.
    const auto id = unichar_to_id(unichar_repr);
    return get_script(id);
  }


  // Return a pointer to the CHAR_FRAGMENT class if the given

  // unichar representation represents a character fragment.

  const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {
    // Null and empty strings are rejected before the map is consulted; the
    // map lookup only runs for non-empty text.
    const bool has_text = unichar_repr != nullptr && unichar_repr[0] != '\0';
    if (has_text && ids.contains(unichar_repr, false)) {
      return get_fragment(unichar_to_id(unichar_repr));
    }
    return nullptr;
  }


  // Return the isalpha property of the given unichar representation.

  // Only the first length characters from unichar_repr are used.

  bool get_isalpha(const char *const unichar_repr, int length) const {
    // Resolve the length-limited prefix to an id, then use the id overload.
    const auto id = unichar_to_id(unichar_repr, length);
    return get_isalpha(id);
  }


  // Return the islower property of the given unichar representation.

  // Only the first length characters from unichar_repr are used.

  bool get_islower(const char *const unichar_repr, int length) const {
    // Resolve the length-limited prefix to an id, then use the id overload.
    const auto id = unichar_to_id(unichar_repr, length);
    return get_islower(id);
  }


  // Return the isupper property of the given unichar representation.

  // Only the first length characters from unichar_repr are used.

  bool get_isupper(const char *const unichar_repr, int length) const {
    // Resolve the length-limited prefix to an id, then use the id overload.
    const auto id = unichar_to_id(unichar_repr, length);
    return get_isupper(id);
  }


  // Return the isdigit property of the given unichar representation.

  // Only the first length characters from unichar_repr are used.

  bool get_isdigit(const char *const unichar_repr, int length) const {
    // Resolve the length-limited prefix to an id, then use the id overload.
    const auto id = unichar_to_id(unichar_repr, length);
    return get_isdigit(id);
  }


  // Return the ispunctuation property of the given unichar representation.

  // Only the first length characters from unichar_repr are used.

  bool get_ispunctuation(const char *const unichar_repr, int length) const {
    // Resolve the length-limited prefix to an id, then use the id overload.
    const auto id = unichar_to_id(unichar_repr, length);
    return get_ispunctuation(id);
  }


  // Returns normalized version of unichar with the given unichar_id.

  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
    // UNICHAR_SPACE is answered with a literal " " without consulting the
    // table, so it works even before that slot has been populated.
    if (unichar_id == UNICHAR_SPACE) {
      return " ";
    }
    // Guard the lookup the same way get_fragment() and get_enabled() do: an
    // id outside the set (including INVALID_UNICHAR_ID) would otherwise index
    // past the vector, which is undefined behavior.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    // The returned pointer refers to the std::string stored in the slot; it
    // is not a copy and must not be freed by the caller.
    return unichars[unichar_id].properties.normed.c_str();
  }

  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized

  // version of the given id. There may be more than one UNICHAR_ID in the

  // vector if unichar_id represents a ligature.

  const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {
    // Guard the lookup the same way get_fragment() and get_enabled() do: an
    // id outside the set (including INVALID_UNICHAR_ID) would otherwise index
    // past the vector, which is undefined behavior.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    // The reference refers to the vector stored in the slot, not to a copy.
    return unichars[unichar_id].properties.normed_ids;
  }


  // Return the script id of the given unichar representation.

  // Only the first length characters from unichar_repr are used.

  // The script name string for that id (see get_script_from_script_id) is

  // managed by unicharset and thus MUST NOT be deleted

  int get_script(const char *const unichar_repr, int length) const {
    // Resolve the length-limited prefix to an id, then use the id overload.
    const auto id = unichar_to_id(unichar_repr, length);
    return get_script(id);
  }


  // Return the (current) number of scripts in the script table

  int get_script_table_size() const {

    // Number of entries in use, as opposed to script_table_size_reserved.
    return script_table_size_used;

  }


  // Return the script string from its id

  const char *get_script_from_script_id(int id) const {
    // Ids outside [0, script_table_size_used) map to the null_script
    // sentinel instead of touching the table.
    const bool in_range = id >= 0 && id < script_table_size_used;
    return in_range ? script_table[id] : null_script;
  }


  // Returns the id from the name of the script, or 0 if script is not found.

  // Note that this is an expensive operation since it involves iteratively

  // comparing strings in the script table.  To avoid dependency on STL, we

  // won't use a hash.  Instead, the calling function can use this to lookup

  // and save the ID for relevant scripts for fast comparisons later.

  // Read-only lookup taking the script name as a C string; declared here
  // without an inline body.
  int get_script_id_from_name(const char *script_name) const;


  // Return true if the given script is the null script

  bool is_null_script(const char *script) const {

    // Compares pointer values, not string contents: only the null_script
    // pointer itself (as returned by get_script_from_script_id for an
    // out-of-range id) matches.
    return script == null_script;

  }


  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,

  // then the returned script id will be the same.

  // The script parameter is copied and thus can be a temporary.

  // Non-const and returns an int (not a pointer); declared here without an
  // inline body.
  int add_script(const char *script);


  // Return the enabled property of the given unichar.

  bool get_enabled(UNICHAR_ID unichar_id) const {
    // Unlike get_fragment(), there is no special case for
    // INVALID_UNICHAR_ID here: every id must be a member of the set.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES &props = unichars[unichar_id].properties;
    return props.enabled;
  }


  // Returns null_sid_, one of the cached script name-to-id shortcuts
  // (presumably the id of the null script, going by the name).
  int null_sid() const {

    return null_sid_;

  }

  // Returns common_sid_, the cached script id for the "common" script
  // (script name inferred from the member name).
  int common_sid() const {

    return common_sid_;

  }

  // Returns latin_sid_, the cached script id for the Latin script
  // (script name inferred from the member name).
  int latin_sid() const {

    return latin_sid_;

  }

  // Returns cyrillic_sid_, the cached script id for the Cyrillic script
  // (script name inferred from the member name).
  int cyrillic_sid() const {

    return cyrillic_sid_;

  }

  // Returns greek_sid_, the cached script id for the Greek script
  // (script name inferred from the member name).
  int greek_sid() const {

    return greek_sid_;

  }

  // Returns han_sid_, the cached script id for the Han script
  // (script name inferred from the member name).
  int han_sid() const {

    return han_sid_;

  }

  // Returns hiragana_sid_, the cached script id for the Hiragana script
  // (script name inferred from the member name).
  int hiragana_sid() const {

    return hiragana_sid_;

  }

  // Returns katakana_sid_, the cached script id for the Katakana script
  // (script name inferred from the member name).
  int katakana_sid() const {

    return katakana_sid_;

  }

  // Returns thai_sid_, the cached script id for the Thai script
  // (script name inferred from the member name).
  int thai_sid() const {

    return thai_sid_;

  }

  // Returns hangul_sid_, the cached script id for the Hangul script
  // (script name inferred from the member name).
  int hangul_sid() const {

    return hangul_sid_;

  }

  // Returns default_sid_, which the member's own comment describes as the
  // most frequently occurring script in the charset.
  int default_sid() const {

    return default_sid_;

  }


  // Returns true if the unicharset has the concept of upper/lower case.

  bool script_has_upper_lower() const {

    // Plain read of the stored flag; nothing is recomputed here.
    return script_has_upper_lower_;

  }

  // Returns true if the unicharset has the concept of x-height.

  // script_has_xheight can be true even if script_has_upper_lower is not,

  // when the script has a sufficiently predominant top line with ascenders,

  // such as Devanagari and Thai.

  bool script_has_xheight() const {

    // Plain read of the stored flag; nothing is recomputed here.
    return script_has_xheight_;

  }


private:

  // Per-unichar record of everything the set stores about one entry:
  // character-class flags, bounding-box ranges and width/bearing/advance
  // statistics, script, case and mirror links, the normalized form, and
  // optional fragment information. One instance lives in each UNICHAR_SLOT.
  struct TESS_API UNICHAR_PROPERTIES {

    UNICHAR_PROPERTIES();

    // Initializes all properties to sensible default values.

    void Init();

    // Sets all ranges wide open. Initialization default in case there are

    // no useful values available.

    void SetRangesOpen();

    // Sets all ranges to empty. Used before expanding with font-based data.

    void SetRangesEmpty();

    // Returns true if any of the top/bottom/width/bearing/advance ranges/stats

    // is empty.

    bool AnyRangeEmpty() const;

    // Expands the ranges with the ranges from the src properties.

    void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);

    // Copies the properties from src into this.

    void CopyFrom(const UNICHAR_PROPERTIES &src);


    // Character-class flags.
    bool isalpha;

    bool islower;

    bool isupper;

    bool isdigit;

    bool ispunctuation;

    bool isngram;

    // Value returned by UNICHARSET::get_enabled().
    bool enabled;

    // Possible limits of the top and bottom of the bounding box in

    // baseline-normalized coordinates, ie, where the baseline is

    // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight

    // (See normalis.h for the definitions).

    uint8_t min_bottom;

    uint8_t max_bottom;

    uint8_t min_top;

    uint8_t max_top;

    // Statistics of the widths of bounding box, relative to the median advance.

    float width;

    float width_sd;

    // Stats of the x-bearing and advance, also relative to the median advance.

    float bearing;

    float bearing_sd;

    float advance;

    float advance_sd;

    // Script of this unichar as an int id (presumably an index into the
    // script table; see get_script_from_script_id -- confirm).
    int script_id;

    UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar

    Direction direction;   // direction of this unichar

    // Mirror property is useful for reverse DAWG lookup for words in

    // right-to-left languages (e.g. "(word)" would be in

    // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.

    // However, what we want in our DAWG is

    // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not

    // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.

    UNICHAR_ID mirror;

    // A string of unichar_ids that represent the corresponding normed string.

    // For awkward characters like em-dash, this gives hyphen.

    // For ligatures, this gives the string of normal unichars.

    std::vector<UNICHAR_ID> normed_ids;

    std::string normed; // normalized version of this unichar

    // Contains meta information about the fragment if a unichar represents

    // a fragment of a character, otherwise should be set to nullptr.

    // It is assumed that character fragments are added to the unicharset

    // after the corresponding 'base' characters.

    // NOTE(review): raw pointer, handed out as const by get_fragment();
    // who frees it is not visible in this struct -- confirm before copying
    // instances around.
    CHAR_FRAGMENT *fragment;

  };


  // One entry of the unichars vector: the unichar's text plus its properties.
  struct UNICHAR_SLOT {

    // Fixed-size buffer of UNICHAR_LEN + 1 chars (presumably the extra char
    // is for a terminating NUL -- confirm).
    char representation[UNICHAR_LEN + 1];

    // Everything else known about this unichar.
    UNICHAR_PROPERTIES properties;

  };


  // Internal recursive version of encode_string above.

  // str is the start of the whole string.

  // str_index is the current position in str.

  // str_length is the length of str.

  // encoding is a working encoding of str.

  // lengths is a working set of lengths of each element of encoding.

  // best_total_length is the longest length of str that has been successfully

  // encoded so far.

  // On return:

  // best_encoding contains the encoding that used the longest part of str.

  // best_lengths (may be null) contains the lengths of best_encoding.

  // Const member returning void: results are reported only through the
  // pointer arguments. Note that best_total_length is unsigned while
  // str_index and str_length are int.
  void encode_string(const char *str, int str_index, int str_length,

                     std::vector<UNICHAR_ID> *encoding,

                     std::vector<char> *lengths, unsigned *best_total_length,

                     std::vector<UNICHAR_ID> *best_encoding,

                     std::vector<char> *best_lengths) const;


  // Gets the properties for a grapheme string, combining properties for

  // multiple characters in a meaningful way where possible.

  // Returns false if no valid match was found in the unicharset.

  // NOTE that script_id, mirror, and other_case refer to this unicharset on

  // return and will need redirecting if the target unicharset is different.

  // utf8_str is read-only input; props is the output record that is filled
  // in. Declared here without an inline body.
  bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;


  // Load ourselves from a "file" where our only interface to the file is

  // an implementation of fgets().  This is the parsing primitive accessed by

  // the public routines load_from_file().

  // fgets_cb has the shape char *(char *, int), i.e. fgets() without the
  // stream argument (presumably buffer and buffer size -- confirm).
  bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,

                      bool skip_fragments);


  // List of mappings to make when ingesting strings from the outside.

  // The substitutions clean up text that should exist for rendering of

  // synthetic data, but not in the recognition set.

  // Each row holds two C strings (presumably {from, to} -- confirm against
  // the definition).
  static const char *kCleanupMaps[][2];

  // Sentinel script string: returned by get_script_from_script_id() for
  // out-of-range ids and compared by address in is_null_script().
  static const char *null_script;


  // One slot per unichar, indexed directly by UNICHAR_ID.
  std::vector<UNICHAR_SLOT> unichars;

  // Lookup structure keyed by unichar string (queried via ids.contains()).
  UNICHARMAP ids;

  // Array of script name strings indexed by script id; entries
  // [0, script_table_size_used) are valid.
  char **script_table;

  int script_table_size_used;

  // Presumably the allocated capacity of script_table -- confirm.
  int script_table_size_reserved;

  // True if the unichars have their tops/bottoms set.

  bool top_bottom_set_;

  // True if the unicharset has significant upper/lower case chars.

  bool script_has_upper_lower_;

  // True if the unicharset has a significant mean-line with significant

  // ascenders above that.

  bool script_has_xheight_;

  // True if the set contains chars that would be changed by the cleanup.

  bool old_style_included_;


  // A few convenient script name-to-id mapping without using hash.

  // These are initialized when unicharset file is loaded.  Anything

  // missing from this list can be looked up using get_script_id_from_name.

  int null_sid_;

  int common_sid_;

  int latin_sid_;

  int cyrillic_sid_;

  int greek_sid_;

  int han_sid_;

  int hiragana_sid_;

  int katakana_sid_;

  int thai_sid_;

  int hangul_sid_;

  // The most frequently occurring script in the charset.

  int default_sid_;

};


} // namespace tesseract


#endif // TESSERACT_CCUTIL_UNICHARSET_H_

unichar.h

UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:31

unicharmap.h

errcode.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

serialis.h

helpers.h

value
int value
Definition: gmock-matchers_test.cc:664

p
const char * p
Definition: gmock-matchers_test.cc:4030

i
int i
Definition: gmock-matchers_test.cc:718

tesseract
Definition: baseapi.h:39

tesseract::Serialize
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236

tesseract::OldUncleanUnichars
OldUncleanUnichars
Definition: unicharset.h:45

tesseract::OldUncleanUnichars::kTrue
@ kTrue

tesseract::OldUncleanUnichars::kFalse
@ kFalse

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::SpecialUnicharCodes
SpecialUnicharCodes
Definition: unicharset.h:35

tesseract::UNICHAR_SPACE
@ UNICHAR_SPACE
Definition: unicharset.h:36

tesseract::UNICHAR_BROKEN
@ UNICHAR_BROKEN
Definition: unicharset.h:38

tesseract::SPECIAL_UNICHAR_CODES_COUNT
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40

tesseract::UNICHAR_JOINED
@ UNICHAR_JOINED
Definition: unicharset.h:37

tesseract::TFile
Definition: serialis.h:61

tesseract::CHAR_FRAGMENT
Definition: unicharset.h:50

tesseract::CHAR_FRAGMENT::set_pos
void set_pos(int p)
Definition: unicharset.h:70

tesseract::CHAR_FRAGMENT::equals
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:97

tesseract::CHAR_FRAGMENT::set_unichar
void set_unichar(const char *uch)
Definition: unicharset.h:66

tesseract::CHAR_FRAGMENT::set_all
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:60

tesseract::CHAR_FRAGMENT::is_ending
bool is_ending() const
Definition: unicharset.h:121

tesseract::CHAR_FRAGMENT::set_natural
void set_natural(bool value)
Definition: unicharset.h:131

tesseract::CHAR_FRAGMENT::to_string
std::string to_string() const
Definition: unicharset.h:91

tesseract::CHAR_FRAGMENT::is_continuation_of
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:109

tesseract::CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:76

tesseract::CHAR_FRAGMENT::set_total
void set_total(int t)
Definition: unicharset.h:73

tesseract::CHAR_FRAGMENT::is_natural
bool is_natural() const
Definition: unicharset.h:128

tesseract::CHAR_FRAGMENT::get_pos
int get_pos() const
Definition: unicharset.h:79

tesseract::CHAR_FRAGMENT::get_total
int get_total() const
Definition: unicharset.h:82

tesseract::CHAR_FRAGMENT::is_beginning
bool is_beginning() const
Definition: unicharset.h:116

tesseract::CHAR_FRAGMENT::equals
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:102

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::get_islower
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:782

tesseract::UNICHARSET::get_isalpha
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:777

tesseract::UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:283

tesseract::UNICHARSET::debug_str
std::string debug_str(const char *unichar_repr) const
Definition: unicharset.h:273

tesseract::UNICHARSET::greek_sid
int greek_sid() const
Definition: unicharset.h:928

tesseract::UNICHARSET::set_mirror
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:483

tesseract::UNICHARSET::set_script
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468

tesseract::UNICHARSET::delete_pointers_in_unichars
void delete_pointers_in_unichars()
Definition: unicharset.h:316

tesseract::UNICHARSET::default_sid
int default_sid() const
Definition: unicharset.h:946

tesseract::UNICHARSET::get_script_from_script_id
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:886

tesseract::UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:681

tesseract::UNICHARSET::normed_ids
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868

tesseract::UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:958

tesseract::UNICHARSET::get_script
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:876

tesseract::UNICHARSET::common_sid
int common_sid() const
Definition: unicharset.h:919

tesseract::UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:931

tesseract::UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768

tesseract::UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:881

tesseract::UNICHARSET::get_isupper
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:842

tesseract::UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497

tesseract::UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:756

tesseract::UNICHARSET::cyrillic_sid
int cyrillic_sid() const
Definition: unicharset.h:925

tesseract::UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:934

tesseract::UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:712

tesseract::UNICHARSET::set_isupper
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:447

tesseract::UNICHARSET::script_has_upper_lower
bool script_has_upper_lower() const
Definition: unicharset.h:951

tesseract::UNICHARSET::set_normed
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:488

tesseract::UNICHARSET::is_null_script
bool is_null_script(const char *script) const
Definition: unicharset.h:901

tesseract::UNICHARSET::get_script
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:814

tesseract::UNICHARSET::get_ispunctuation
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:797

tesseract::UNICHARSET::get_advance_stats
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:646

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:792

tesseract::UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506

tesseract::UNICHARSET::set_width_stats
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:623

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:848

tesseract::UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:916

tesseract::UNICHARSET::hangul_sid
int hangul_sid() const
Definition: unicharset.h:943

tesseract::UNICHARSET::load_from_file
bool load_from_file(FILE *file)
Definition: unicharset.h:408

tesseract::UNICHARSET::set_top_bottom
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:599

tesseract::UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391

tesseract::UNICHARSET::set_direction
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:478

tesseract::UNICHARSET::set_ispunctuation
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:457

tesseract::UNICHARSET::clear
void clear()
Definition: unicharset.h:324

tesseract::UNICHARSET::get_ispunctuation
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:854

tesseract::UNICHARSET::get_properties
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:803

tesseract::UNICHARSET::get_isupper
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:787

tesseract::UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

tesseract::UNICHARSET::latin_sid
int latin_sid() const
Definition: unicharset.h:922

tesseract::UNICHARSET::to_upper
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:742

tesseract::UNICHARSET::get_bearing_stats
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:629

tesseract::UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:937

tesseract::UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:703

tesseract::UNICHARSET::set_isalpha
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:437

tesseract::UNICHARSET::Direction
Direction
Definition: unicharset.h:175

tesseract::UNICHARSET::U_OTHER_NEUTRAL
@ U_OTHER_NEUTRAL
Definition: unicharset.h:186

tesseract::UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303

tesseract::UNICHARSET::get_isngram
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542

tesseract::UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:721

tesseract::UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:555

tesseract::UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361

tesseract::UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515

tesseract::UNICHARSET::get_isalpha
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:830

tesseract::UNICHARSET::unichar_insert_backwards_compatible
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:288

tesseract::UNICHARSET::save_to_file
bool save_to_file(FILE *file) const
Definition: unicharset.h:373

tesseract::UNICHARSET::set_other_case
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:473

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

tesseract::UNICHARSET::load_from_file
bool load_from_file(const char *const filename)
Definition: unicharset.h:401

tesseract::UNICHARSET::get_normed_unichar
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:859

tesseract::UNICHARSET::get_islower
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:836

tesseract::UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:911

tesseract::UNICHARSET::set_islower
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:442

tesseract::UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533

tesseract::UNICHARSET::size
size_t size() const
Definition: unicharset.h:355

tesseract::UNICHARSET::IsSpaceDelimited
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:668

tesseract::UNICHARSET::get_chartype
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:807

tesseract::UNICHARSET::thai_sid
int thai_sid() const
Definition: unicharset.h:940

tesseract::UNICHARSET::PropertiesIncomplete
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662

tesseract::UNICHARSET::SetPropertiesFromOther
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:563

tesseract::UNICHARSET::set_advance_stats
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:656

tesseract::UNICHARSET::get_width_stats
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:612

tesseract::UNICHARSET::set_isdigit
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:452

tesseract::UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265

tesseract::UNICHARSET::save_to_file
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:379

tesseract::UNICHARSET::set_isngram
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:462

tesseract::UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:820

tesseract::UNICHARSET::set_bearing_stats
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:639

tesseract::UNICHARSET::to_lower
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:730

file
Definition: include_gunit.h:36

TESS_API
#define TESS_API
Definition: export.h:32