// Source: tesseract-ocr.github.io/3.x/a00888_source.html

 // File:        unicharset.h

 // Description: Unicode character/ligature set class.

 // Author:      Thomas Kielbus

 // Created:     Wed Jun 28 17:05:01 PDT 2006

 //

 // (C) Copyright 2006, Google Inc.

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 // http://www.apache.org/licenses/LICENSE-2.0

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.

 //


 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__

 #define TESSERACT_CCUTIL_UNICHARSET_H__


 #include "errcode.h"

 #include "genericvector.h"

 #include "helpers.h"

 #include "serialis.h"

 #include "strngs.h"

 #include "tesscallback.h"

 #include "unichar.h"

 #include "unicharmap.h"


 // Enum holding special values of unichar_id. Every unicharset has these.

 // Warning! Keep in sync with kSpecialUnicharCodes.

 enum SpecialUnicharCodes {
   // The special codes occupy the lowest ids, in declaration order.
   UNICHAR_SPACE = 0,
   UNICHAR_JOINED = 1,
   UNICHAR_BROKEN = 2,

   // One past the last special code; keep this entry last.
   SPECIAL_UNICHAR_CODES_COUNT
 };


 class CHAR_FRAGMENT {

  public:

   // Minimum number of characters used for fragment representation.

   static const int kMinLen = 6;

   // Maximum number of characters used for fragment representation.

   static const int kMaxLen = 3 + UNICHAR_LEN + 2;

   // Maximum number of fragments per character.

   static const int kMaxChunks = 5;


   // Setters and Getters.

   inline void set_all(const char *unichar, int pos, int total, bool natural) {

     set_unichar(unichar);

     set_pos(pos);

     set_total(total);

     set_natural(natural);

   }

   inline void set_unichar(const char *uch) {

     strncpy(this->unichar, uch, UNICHAR_LEN);

     this->unichar[UNICHAR_LEN] = '\0';

   }

   inline void set_pos(int p) { this->pos = p; }

   inline void set_total(int t) { this->total = t; }

   inline const char* get_unichar() const { return this->unichar; }

   inline int get_pos() const { return this->pos; }

   inline int get_total() const { return this->total; }


   // Returns the string that represents a fragment

   // with the given unichar, pos and total.

   static STRING to_string(const char *unichar, int pos, int total,

                           bool natural);

   // Returns the string that represents this fragment.

   STRING to_string() const {

     return to_string(unichar, pos, total, natural);

   }


   // Checks whether a fragment has the same unichar,

   // position and total as the given inputs.

   inline bool equals(const char *other_unichar,

                      int other_pos, int other_total) const {

     return (strcmp(this->unichar, other_unichar) == 0 &&

             this->pos == other_pos && this->total == other_total);

   }

   inline bool equals(const CHAR_FRAGMENT *other) const {

     return this->equals(other->get_unichar(),

                         other->get_pos(),

                         other->get_total());

   }


   // Checks whether a given fragment is a continuation of this fragment.

   // Assumes that the given fragment pointer is not NULL.

   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {

     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&

             this->total == fragment->get_total() &&

             this->pos == fragment->get_pos() + 1);

   }


   // Returns true if this fragment is a beginning fragment.

   inline bool is_beginning() const { return this->pos == 0; }


   // Returns true if this fragment is an ending fragment.

   inline bool is_ending() const { return this->pos == this->total-1; }


   // Returns true if the fragment was a separate component to begin with,

   // ie did not need chopping to be isolated, but may have been separated

   // out from a multi-outline blob.

   inline bool is_natural() const { return natural; }

   void set_natural(bool value) { natural = value; }


   // Parses the string to see whether it represents a character fragment

   // (rather than a regular character). If so, allocates memory for a new

   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment

   // information. Fragments are of the form:

   // |m|1|2, meaning chunk 1 of 2 of character m, or

   // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed

   // to divide the parts, as they were already separate connected components.

   //

   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT

   // instance, otherwise (if the string does not represent a fragment or it

   // looks like it does, but parsing it as a fragment fails) returns NULL.

   //

   // Note: The caller is responsible for deallocating memory

   // associated with the returned pointer.

   static CHAR_FRAGMENT *parse_from_string(const char *str);


  private:

   char unichar[UNICHAR_LEN + 1];

   // True if the fragment was a separate component to begin with,

   // ie did not need chopping to be isolated, but may have been separated

   // out from a multi-outline blob.

   bool natural;

   inT16 pos;    // fragment position in the character

   inT16 total;  // total number of fragments in the character

 };


 // The UNICHARSET class is a utility class for Tesseract that holds the

 // set of characters that are used by the engine. Each character is identified

 // by a unique number, from 0 to (size - 1).

 class UNICHARSET {

  public:

   // Custom list of characters and their ligature forms (UTF8)

   // These map to unicode values in the private use area (PUC) and are supported

   // by only few font families (eg. Wyld, Adobe Caslon Pro).

   static const char* kCustomLigatures[][2];


   // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.

   static const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];


   // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)

   enum Direction {
     // Trailing notes give the Unicode bidi class abbreviation that the ICU
     // UCharDirection documentation lists for each value.
     U_LEFT_TO_RIGHT = 0,               // L
     U_RIGHT_TO_LEFT = 1,               // R
     U_EUROPEAN_NUMBER = 2,             // EN
     U_EUROPEAN_NUMBER_SEPARATOR = 3,   // ES
     U_EUROPEAN_NUMBER_TERMINATOR = 4,  // ET
     U_ARABIC_NUMBER = 5,               // AN
     U_COMMON_NUMBER_SEPARATOR = 6,     // CS
     U_BLOCK_SEPARATOR = 7,             // B
     U_SEGMENT_SEPARATOR = 8,           // S
     U_WHITE_SPACE_NEUTRAL = 9,         // WS
     U_OTHER_NEUTRAL = 10,              // ON
     U_LEFT_TO_RIGHT_EMBEDDING = 11,    // LRE
     U_LEFT_TO_RIGHT_OVERRIDE = 12,     // LRO
     U_RIGHT_TO_LEFT_ARABIC = 13,       // AL
     U_RIGHT_TO_LEFT_EMBEDDING = 14,    // RLE
     U_RIGHT_TO_LEFT_OVERRIDE = 15,     // RLO
     U_POP_DIRECTIONAL_FORMAT = 16,     // PDF
     U_DIR_NON_SPACING_MARK = 17,       // NSM
     U_BOUNDARY_NEUTRAL = 18,           // BN
     // Number of direction values; keep this entry last.
     U_CHAR_DIRECTION_COUNT
   };


   // Create an empty UNICHARSET

   UNICHARSET();


   ~UNICHARSET();


   // Return the UNICHAR_ID of a given unichar representation within the

   // UNICHARSET.

   const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;


   // Return the UNICHAR_ID of a given unichar representation within the

   // UNICHARSET. Only the first length characters from unichar_repr are used.

   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,

                                  int length) const;


   // Return the minimum number of bytes that matches a legal UNICHAR_ID,

   // while leaving the rest of the string encodable. Returns 0 if the

   // beginning of the string is not encodable.

   // WARNING: this function now encodes the whole string for precision.

   // Use encode_string in preference to repeatedly calling step.

   int step(const char* str) const;


   // Return whether the given UTF-8 string is encodable with this UNICHARSET.

   // If not encodable, writes the byte offset of the first part that cannot be
   // converted into *first_bad_position.

   bool encodable_string(const char *str, int *first_bad_position) const;


   // Encodes the given UTF-8 string with this UNICHARSET.

   // Any part of the string that cannot be encoded (because the utf8 can't

   // be broken up into pieces that are in the unicharset) then:

   // if give_up_on_failure, stops and returns a partial encoding,

   // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.

   // Returns true if the encoding succeeds completely, false if there is at

   // least one failure.

   // If lengths is not NULL, then it is filled with the corresponding

   // byte length of each encoded UNICHAR_ID.

   // If encoded_length is not NULL then on return it contains the length of

   // str that was encoded. (if give_up_on_failure the location of the first

   // failure, otherwise strlen(str).)

   bool encode_string(const char* str, bool give_up_on_failure,

                      GenericVector<UNICHAR_ID>* encoding,

                      GenericVector<char>* lengths,

                      int* encoded_length) const;


   // Return the unichar representation corresponding to the given UNICHAR_ID

   // within the UNICHARSET.

   const char* const id_to_unichar(UNICHAR_ID id) const;


   // Return the UTF8 representation corresponding to the given UNICHAR_ID after

   // resolving any private encodings internal to Tesseract. This method is

   // preferable to id_to_unichar for outputting text that will be visible to

   // external applications.

   const char* const id_to_unichar_ext(UNICHAR_ID id) const;


   // Return a STRING that reformats the utf8 str into the str followed

   // by its hex unicodes.

   static STRING debug_utf8_str(const char* str);


   // Return a STRING containing debug information on the unichar, including

   // the id_to_unichar, its hex unicodes and the properties.

   STRING debug_str(UNICHAR_ID id) const;

   STRING debug_str(const char * unichar_repr) const {
     // Resolve the representation to an id, then reuse the id-based overload.
     const UNICHAR_ID id = unichar_to_id(unichar_repr);
     return debug_str(id);
   }


   // Add a unichar representation to the set.

   void unichar_insert(const char* const unichar_repr);


   // Return true if the given unichar id exists within the set.

   // Relies on the fact that unichar ids are contiguous in the unicharset.

   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
     if (unichar_id == INVALID_UNICHAR_ID) return false;
     // Ids are contiguous, so membership reduces to a range test.
     return 0 <= unichar_id && unichar_id < size_used;
   }


   // Return true if the given unichar representation exists within the set.

   bool contains_unichar(const char* const unichar_repr) const;

   bool contains_unichar(const char* const unichar_repr, int length) const;


   // Return true if the given unichar representation corresponds to the given

   // UNICHAR_ID within the set.

   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;


   // Delete CHAR_FRAGMENTs stored in properties of unichars array.

   void delete_pointers_in_unichars() {
     for (int slot = 0; slot < size_used; ++slot) {
       // Slots without a fragment have nothing to release.
       if (unichars[slot].properties.fragment == NULL) continue;
       delete unichars[slot].properties.fragment;
       // Null the pointer so the fragment cannot be freed twice.
       unichars[slot].properties.fragment = NULL;
     }
   }


   // Clear the UNICHARSET (all the previous data is lost).

   void clear() {
     // Release the script table and each string it holds, if allocated.
     if (script_table != NULL) {
       for (int s = 0; s < script_table_size_used; ++s) {
         delete[] script_table[s];
       }
       delete[] script_table;
       script_table = NULL;
       script_table_size_used = 0;
     }
     // Release the unichar slots, freeing their fragments first.
     if (unichars != NULL) {
       delete_pointers_in_unichars();
       delete[] unichars;
       unichars = NULL;
     }
     // Reset capacities and counts, and empty the ids lookup.
     script_table_size_reserved = 0;
     size_reserved = 0;
     size_used = 0;
     ids.clear();
     // Reset the cached flags.
     top_bottom_set_ = false;
     script_has_upper_lower_ = false;
     script_has_xheight_ = false;
     // Reset every cached script id.
     null_sid_ = common_sid_ = latin_sid_ = cyrillic_sid_ = 0;
     greek_sid_ = han_sid_ = hiragana_sid_ = katakana_sid_ = 0;
   }


   // Return the size of the set (the number of different UNICHAR it holds).

   int size() const { return size_used; }  // count of unichars in use


   // Reserve enough memory space for the given number of UNICHARS

   void reserve(int unichars_number);


   // Opens the file indicated by filename and saves unicharset to that file.

   // Returns true if the operation is successful.

   bool save_to_file(const char * const filename) const {
     // "w+b": create or truncate the file and write it in binary mode.
     FILE* file = fopen(filename, "w+b");
     if (file == NULL) return false;
     const bool written = save_to_file(file);
     // fclose flushes any buffered output. If it fails, the data may not have
     // reached the file, so the save must be reported as failed.
     const bool closed = fclose(file) == 0;
     return written && closed;
   }


   // Saves the content of the UNICHARSET to the given file.

   // Returns true if the operation is successful.

   bool save_to_file(FILE *file) const {

     STRING str;

     if (!save_to_string(&str)) return false;

     if (fwrite(&str[0], str.length(), 1, file) != 1) return false;

     return true;

   }

   bool save_to_file(tesseract::TFile *file) const {
     STRING serialized;
     if (!save_to_string(&serialized)) return false;
     // The text is written as one item, so success means one item written.
     return file->FWrite(&serialized[0], serialized.length(), 1) == 1;
   }


   // Saves the content of the UNICHARSET to the given STRING.

   // Returns true if the operation is successful.

   bool save_to_string(STRING *str) const;


   // Load a unicharset from a unicharset file that has been loaded into

   // the given memory buffer.

   // Returns true if the operation is successful.

   bool load_from_inmemory_file(const char* const memory, int mem_size,

                                bool skip_fragments);

   // Returns true if the operation is successful.

   bool load_from_inmemory_file(const char* const memory, int mem_size) {
     // Convenience overload: passes false for skip_fragments.
     const bool skip_fragments = false;
     return load_from_inmemory_file(memory, mem_size, skip_fragments);
   }


   // Opens the file indicated by filename and loads the UNICHARSET

   // from the given file. The previous data is lost.

   // Returns true if the operation is successful.

   bool load_from_file(const char* const filename, bool skip_fragments) {
     FILE* stream = fopen(filename, "rb");
     if (stream == NULL) return false;
     // Delegate to the FILE* overload, then close the stream we opened.
     const bool loaded = load_from_file(stream, skip_fragments);
     fclose(stream);
     return loaded;
   }

   // returns true if the operation is successful.

   bool load_from_file(const char* const filename) {
     // Convenience overload: passes false for skip_fragments.
     const bool skip_fragments = false;
     return load_from_file(filename, skip_fragments);
   }


   // Loads the UNICHARSET from the given file. The previous data is lost.

   // Returns true if the operation is successful.

   bool load_from_file(FILE *file, bool skip_fragments);

   bool load_from_file(FILE *file) {
     // Convenience overload: passes false for skip_fragments.
     const bool skip_fragments = false;
     return load_from_file(file, skip_fragments);
   }

   bool load_from_file(tesseract::TFile *file, bool skip_fragments);


   // Sets up internal data after loading the file, based on the char

   // properties. Called from load_from_file, but also needs to be run

   // during set_unicharset_properties.

   void post_load_setup();


   // Returns true if right_to_left scripts are significant in the unicharset,

   // but without being so sensitive that "universal" unicharsets containing

   // characters from many scripts, like orientation and script detection,

   // look like they are right_to_left.

   bool major_right_to_left() const;


   // Set a whitelist and/or blacklist of characters to recognize.

   // An empty or NULL whitelist enables everything (minus any blacklist).

   // An empty or NULL blacklist disables nothing.

   // An empty or NULL unblacklist has no effect.

   // The blacklist overrides the whitelist.

   // The unblacklist overrides the blacklist.

   // Each list is a string of utf8 character strings. Boundaries between

   // unicharset units are worked out automatically, and characters not in

   // the unicharset are silently ignored.

   void set_black_and_whitelist(const char* blacklist, const char* whitelist,

                                const char* unblacklist);


   // Set the isalpha property of the given unichar to the given value.

   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.isalpha = value;
   }


   // Set the islower property of the given unichar to the given value.

   void set_islower(UNICHAR_ID unichar_id, bool value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.islower = value;
   }


   // Set the isupper property of the given unichar to the given value.

   void set_isupper(UNICHAR_ID unichar_id, bool value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.isupper = value;
   }


   // Set the isdigit property of the given unichar to the given value.

   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.isdigit = value;
   }


   // Set the ispunctuation property of the given unichar to the given value.

   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.ispunctuation = value;
   }


   // Set the isngram property of the given unichar to the given value.

   void set_isngram(UNICHAR_ID unichar_id, bool value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.isngram = value;
   }


   // Set the script name of the given unichar to the given value.

   // Value is copied and thus can be a temporary;

   void set_script(UNICHAR_ID unichar_id, const char* value) {
     // Stores the id returned by add_script(value), not the string itself.
     // unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.script_id = add_script(value);
   }


   // Set other_case unichar id in the properties for the given unichar id.

   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
     // Direct store; neither id is validated here.
     unichars[unichar_id].properties.other_case = other_case;
   }


   // Set the direction property of the given unichar to the given value.

   void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.direction = value;
   }


   // Set mirror unichar id in the properties for the given unichar id.

   void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
     // Direct store; neither id is validated here.
     unichars[unichar_id].properties.mirror = mirror;
   }


   // Record normalized version of unichar with the given unichar_id.

   void set_normed(UNICHAR_ID unichar_id, const char* normed) {
     // Direct store; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.normed = normed;
     // Empty normed_ids so it no longer reflects the previous normed string.
     unichars[unichar_id].properties.normed_ids.truncate(0);
   }

   // Sets the normed_ids vector from the normed string. normed_ids is not

   // stored in the file, and needs to be set when the UNICHARSET is loaded.

   void set_normed_ids(UNICHAR_ID unichar_id);


   // Return the isalpha property of the given unichar.

   bool get_isalpha(UNICHAR_ID unichar_id) const {
     // The invalid id is tolerated and reports false.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return false;
     }
     // Every other id must belong to the set.
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.isalpha;
   }


   // Return the islower property of the given unichar.

   bool get_islower(UNICHAR_ID unichar_id) const {
     // The invalid id is tolerated and reports false.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return false;
     }
     // Every other id must belong to the set.
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.islower;
   }


   // Return the isupper property of the given unichar.

   bool get_isupper(UNICHAR_ID unichar_id) const {
     // The invalid id is tolerated and reports false.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return false;
     }
     // Every other id must belong to the set.
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.isupper;
   }


   // Return the isdigit property of the given unichar.

   bool get_isdigit(UNICHAR_ID unichar_id) const {
     // The invalid id is tolerated and reports false.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return false;
     }
     // Every other id must belong to the set.
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.isdigit;
   }


   // Return the ispunctuation property of the given unichar.

   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
     // The invalid id is tolerated and reports false.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return false;
     }
     // Every other id must belong to the set.
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.ispunctuation;
   }


   // Return the isngram property of the given unichar.

   bool get_isngram(UNICHAR_ID unichar_id) const {
     // The invalid id is tolerated and reports false.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return false;
     }
     // Every other id must belong to the set.
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.isngram;
   }


   // Returns whether the unichar id represents a unicode value in the private

   // use area.

   bool get_isprivate(UNICHAR_ID unichar_id) const;


   // Returns true if the ids have useful min/max top/bottom values.

   bool top_bottom_useful() const { return top_bottom_set_; }

   // Sets all ranges to empty, so they can be expanded to set the values.

   void set_ranges_empty();

   // Sets all the properties for this unicharset given a src_unicharset with

   // everything set. The unicharsets don't have to be the same, and graphemes

   // are correctly accounted for.

   void SetPropertiesFromOther(const UNICHARSET& src) {
     // Same as the partial form, starting from index 0.
     const int start_index = 0;
     PartialSetPropertiesFromOther(start_index, src);
   }

   // Sets properties from Other, starting only at the given index.

   void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);

   // Expands the tops and bottoms and widths for this unicharset given a

   // src_unicharset with ranges in it. The unicharsets don't have to be the

   // same, and graphemes are correctly accounted for.

   void ExpandRangesFromOther(const UNICHARSET& src);

   // Makes this a copy of src. Clears this completely first, so the automatic

   // ids will not be present in this if not in src.

   void CopyFrom(const UNICHARSET& src);

   // For each id in src, if it does not occur in this, add it, as in

   // SetPropertiesFromOther, otherwise expand the ranges, as in

   // ExpandRangesFromOther.

   void AppendOtherUnicharset(const UNICHARSET& src);

   // Returns true if the acceptable ranges of the tops of the characters do

   // not overlap, making their x-height calculations distinct.

   bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;

   // Returns the min and max bottom and top of the given unichar in

   // baseline-normalized coordinates, ie, where the baseline is

   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight

   // (See normalis.h for the definitions).

   void get_top_bottom(UNICHAR_ID unichar_id,
                       int* min_bottom, int* max_bottom,
                       int* min_top, int* max_top) const {
     if (unichar_id == INVALID_UNICHAR_ID) {
       // No data for the invalid id: report the range 0..256 for both.
       *min_top = 0;
       *min_bottom = 0;
       *max_top = 256;     // kBlnCellHeight
       *max_bottom = 256;  // kBlnCellHeight
       return;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     // Copy the stored limits out through the output pointers.
     *min_bottom = unichars[unichar_id].properties.min_bottom;
     *max_bottom = unichars[unichar_id].properties.max_bottom;
     *min_top = unichars[unichar_id].properties.min_top;
     *max_top = unichars[unichar_id].properties.max_top;
   }

   void set_top_bottom(UNICHAR_ID unichar_id,
                       int min_bottom, int max_bottom,
                       int min_top, int max_top) {
     // Clamp each bound into 0..MAX_UINT8 before narrowing it to uinT8.
     const uinT8 bottom_lo =
         static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
     const uinT8 bottom_hi =
         static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
     const uinT8 top_lo =
         static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
     const uinT8 top_hi =
         static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
     unichars[unichar_id].properties.min_bottom = bottom_lo;
     unichars[unichar_id].properties.max_bottom = bottom_hi;
     unichars[unichar_id].properties.min_top = top_lo;
     unichars[unichar_id].properties.max_top = top_hi;
   }

   // Returns the width stats (as mean, sd) of the given unichar relative to the

   // median advance of all characters in the character set.

   void get_width_stats(UNICHAR_ID unichar_id,
                        float* width, float* width_sd) const {
     if (unichar_id == INVALID_UNICHAR_ID) {
       // No data for the invalid id: report zero for both outputs.
       *width = 0.0f;
       *width_sd = 0.0f;
       return;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     *width = unichars[unichar_id].properties.width;
     *width_sd = unichars[unichar_id].properties.width_sd;
   }

   void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
     // Direct stores; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.width = width;
     unichars[unichar_id].properties.width_sd = width_sd;
   }

   // Returns the stats of the x-bearing (as mean, sd) of the given unichar

   // relative to the median advance of all characters in the character set.

   void get_bearing_stats(UNICHAR_ID unichar_id,
                          float* bearing, float* bearing_sd) const {
     if (unichar_id == INVALID_UNICHAR_ID) {
       // No data for the invalid id: report zero for both outputs.
       *bearing_sd = 0.0f;
       *bearing = 0.0f;
       return;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     *bearing = unichars[unichar_id].properties.bearing;
     *bearing_sd = unichars[unichar_id].properties.bearing_sd;
   }

   void set_bearing_stats(UNICHAR_ID unichar_id,
                          float bearing, float bearing_sd) {
     // Direct stores; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.bearing = bearing;
     unichars[unichar_id].properties.bearing_sd = bearing_sd;
   }

   // Returns the stats of the x-advance of the given unichar (as mean, sd)

   // relative to the median advance of all characters in the character set.

   void get_advance_stats(UNICHAR_ID unichar_id,
                          float* advance, float* advance_sd) const {
     if (unichar_id == INVALID_UNICHAR_ID) {
       // No data for the invalid id: report zero for both outputs.
       *advance_sd = 0.0f;
       *advance = 0.0f;
       return;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     *advance = unichars[unichar_id].properties.advance;
     *advance_sd = unichars[unichar_id].properties.advance_sd;
   }

   void set_advance_stats(UNICHAR_ID unichar_id,
                          float advance, float advance_sd) {
     // Direct stores; unichar_id is not range-checked and must index unichars.
     unichars[unichar_id].properties.advance = advance;
     unichars[unichar_id].properties.advance_sd = advance_sd;
   }

   // Returns true if the font metrics properties are empty.

   bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
     // Defers to the slot's own AnyRangeEmpty(); the id is not range-checked.
     const bool any_empty = unichars[unichar_id].properties.AnyRangeEmpty();
     return any_empty;
   }


   // Return the script id of the given unichar, or null_sid_ for
   // INVALID_UNICHAR_ID.
   // Note: the value returned is an integer script id, not a pointer to the
   // script name, so there is nothing for the caller to delete.

   int get_script(UNICHAR_ID unichar_id) const {
     // The invalid id maps to the null script id.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return null_sid_;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.script_id;
   }


   // Return the character properties, eg. alpha/upper/lower/digit/punct,

   // as a bit field of unsigned int.

   unsigned int get_properties(UNICHAR_ID unichar_id) const;


   // Return the character property as a single char.  If a character has

   // multiple attributes, the main property is defined by the following order:

   //   upper_case : 'A'

   //   lower_case : 'a'

   //   alpha      : 'x'

   //   digit      : '0'

   //   punctuation: 'p'

   char get_chartype(UNICHAR_ID unichar_id) const;


   // Get other_case unichar id in the properties for the given unichar id.

   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
     // The invalid id has no other case; hand it straight back.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return INVALID_UNICHAR_ID;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.other_case;
   }


   // Returns the direction property of the given unichar.

   Direction get_direction(UNICHAR_ID unichar_id) const {
     // The invalid id is reported as U_OTHER_NEUTRAL.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return UNICHARSET::U_OTHER_NEUTRAL;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.direction;
   }


   // Get mirror unichar id in the properties for the given unichar id.

   UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
     // The invalid id has no mirror; hand it straight back.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return INVALID_UNICHAR_ID;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.mirror;
   }


   // Returns UNICHAR_ID of the corresponding lower-case unichar.

   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
     if (unichar_id == INVALID_UNICHAR_ID) {
       return INVALID_UNICHAR_ID;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     // An id flagged islower maps to itself; any other id maps to other_case.
     return unichars[unichar_id].properties.islower
         ? unichar_id
         : unichars[unichar_id].properties.other_case;
   }


   // Returns UNICHAR_ID of the corresponding upper-case unichar.

   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
     if (unichar_id == INVALID_UNICHAR_ID) {
       return INVALID_UNICHAR_ID;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     // An id flagged isupper maps to itself; any other id maps to other_case.
     return unichars[unichar_id].properties.isupper
         ? unichar_id
         : unichars[unichar_id].properties.other_case;
   }


   // Returns true if this UNICHARSET has the special codes in

   // SpecialUnicharCodes available. If false then there are normal unichars

   // at these codes and they should not be used.

   bool has_special_codes() const {

     return get_fragment(UNICHAR_BROKEN) != NULL &&

         strcmp(id_to_unichar(UNICHAR_BROKEN),

                kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;

   }


   // Returns true if there are any repeated unicodes in the normalized

   // text of any unichar-id in the unicharset.

   bool AnyRepeatedUnicodes() const;


   // Return a pointer to the CHAR_FRAGMENT class if the given

   // unichar id represents a character fragment.

   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
     // The invalid id never denotes a fragment.
     if (unichar_id == INVALID_UNICHAR_ID) {
       return NULL;
     }
     ASSERT_HOST(contains_unichar_id(unichar_id));
     // The stored pointer is returned as is; it may be NULL.
     return unichars[unichar_id].properties.fragment;
   }


   // Return the isalpha property of the given unichar representation.

   bool get_isalpha(const char* const unichar_repr) const {

     return get_isalpha(unichar_to_id(unichar_repr));

   }


   // Return the islower property of the given unichar representation.

   bool get_islower(const char* const unichar_repr) const {

     return get_islower(unichar_to_id(unichar_repr));

   }


   // Return the isupper property of the given unichar representation.

   bool get_isupper(const char* const unichar_repr) const {

     return get_isupper(unichar_to_id(unichar_repr));

   }


   // Return the isdigit property of the given unichar representation.

   bool get_isdigit(const char* const unichar_repr) const {

     return get_isdigit(unichar_to_id(unichar_repr));

   }


   // Return the ispunctuation property of the given unichar representation.

   bool get_ispunctuation(const char* const unichar_repr) const {

     return get_ispunctuation(unichar_to_id(unichar_repr));

   }


   // Return the character properties, eg. alpha/upper/lower/digit/punct,

   // of the given unichar representation

   unsigned int get_properties(const char* const unichar_repr) const {

     return get_properties(unichar_to_id(unichar_repr));

   }


   char get_chartype(const char* const unichar_repr) const {

     return get_chartype(unichar_to_id(unichar_repr));

   }


   // Return the script id of the given unichar representation.
   // Note: the value returned is an integer script id, not a pointer to the
   // script name, so there is nothing for the caller to delete.

   int get_script(const char* const unichar_repr) const {

     return get_script(unichar_to_id(unichar_repr));

   }


   // Return a pointer to the CHAR_FRAGMENT class struct if the given

   // unichar representation represents a character fragment.

   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
     // A NULL or empty representation can never name a fragment.
     const bool is_blank = unichar_repr == NULL || unichar_repr[0] == '\0';
     // Representations that are not in the set are rejected as well.
     if (is_blank || !ids.contains(unichar_repr)) return NULL;
     const UNICHAR_ID id = unichar_to_id(unichar_repr);
     return get_fragment(id);
   }


   // Return the isalpha property of the given unichar representation.

   // Only the first length characters from unichar_repr are used.

   bool get_isalpha(const char* const unichar_repr,

                int length) const {

     return get_isalpha(unichar_to_id(unichar_repr, length));

   }


   // Return the islower property of the given unichar representation.

   // Only the first length characters from unichar_repr are used.

   bool get_islower(const char* const unichar_repr,

                int length) const {

     return get_islower(unichar_to_id(unichar_repr, length));

   }


   // Return the isupper property of the given unichar representation.

   // Only the first length characters from unichar_repr are used.

   bool get_isupper(const char* const unichar_repr,

                int length) const {

     return get_isupper(unichar_to_id(unichar_repr, length));

   }


   // Return the isdigit property of the given unichar representation.

   // Only the first length characters from unichar_repr are used.

   bool get_isdigit(const char* const unichar_repr,

                int length) const {

     return get_isdigit(unichar_to_id(unichar_repr, length));

   }


   // Return the ispunctuation property of the given unichar representation.

   // Only the first length characters from unichar_repr are used.

   bool get_ispunctuation(const char* const unichar_repr,

                           int length) const {

     return get_ispunctuation(unichar_to_id(unichar_repr, length));

   }


   // Returns normalized version of unichar with the given unichar_id.

   const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
     // When this set carries the special codes, the space code normalizes to
     // a literal single space instead of its stored normed string.
     const bool is_special_space =
         unichar_id == UNICHAR_SPACE && has_special_codes();
     if (is_special_space) {
       return " ";
     }
     // No range check is done here: unichar_id indexes unichars directly.
     return unichars[unichar_id].properties.normed.string();
   }

   // Returns a vector of UNICHAR_IDs that represent the ids of the normalized

   // version of the given id. There may be more than one UNICHAR_ID in the

   // vector if unichar_id represents a ligature.

   const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
     // Returns a reference to the stored vector (no copy); unichar_id indexes
     // unichars directly without a range check.
     const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
     return props.normed_ids;
   }


   // Return the script id of the given unichar representation.
   // Only the first length characters from unichar_repr are used.
   // The result is an int id, not a pointer, so there is nothing for the
   // caller to delete. The id is presumably usable with
   // get_script_from_script_id() to obtain the script name.

   int get_script(const char* const unichar_repr,

                  int length) const {

     return get_script(unichar_to_id(unichar_repr, length));

   }


   // Return the (current) number of scripts in the script table

   int get_script_table_size() const {
     // Reports the in-use count (script_table_size_used), which is tracked
     // separately from script_table_size_reserved.
     const int used = script_table_size_used;
     return used;
   }


   // Return the script string from its id

   const char* get_script_from_script_id(int id) const {
     // Ids outside [0, script_table_size_used) map to the null script rather
     // than indexing past the used part of the table.
     const bool in_table = id >= 0 && id < script_table_size_used;
     return in_table ? script_table[id] : null_script;
   }


   // Returns the id of the script with the given name, or 0 if the script is
   // not found.
   // Note that this is an expensive operation since it involves iteratively
   // comparing strings in the script table.  To avoid a dependency on STL, no
   // hash is used.  Instead, the calling function can use this to look up
   // and save the ID for relevant scripts for fast comparisons later.
   // NOTE(review): a return of 0 is ambiguous if id 0 is itself a real table
   // entry -- confirm against the implementation in unicharset.cpp.
   int get_script_id_from_name(const char* script_name) const;


   // Return true if the given script is the null script

   bool is_null_script(const char* script) const {
     // This compares pointer identity against null_script, not the string
     // contents.
     const bool same_pointer = (script == null_script);
     return same_pointer;
   }


   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
   // then the returned value will be the same. The return type is int, so the
   // value is presumably a script id rather than a pointer -- confirm against
   // unicharset.cpp.
   // The script parameter is copied and thus can be a temporary.
   int add_script(const char* script);


   // Return the enabled property of the given unichar.

   bool get_enabled(UNICHAR_ID unichar_id) const {
     // unichar_id indexes unichars directly; no range check is done here.
     const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
     return props.enabled;
   }


   // Read-only accessors for the cached script ids held in the *_sid_ members.
   int null_sid() const {
     return null_sid_;
   }
   int common_sid() const {
     return common_sid_;
   }
   int latin_sid() const {
     return latin_sid_;
   }
   int cyrillic_sid() const {
     return cyrillic_sid_;
   }
   int greek_sid() const {
     return greek_sid_;
   }
   int han_sid() const {
     return han_sid_;
   }
   int hiragana_sid() const {
     return hiragana_sid_;
   }
   int katakana_sid() const {
     return katakana_sid_;
   }
   int default_sid() const {
     return default_sid_;
   }


   // Returns true if the unicharset has the concept of upper/lower case.

   bool script_has_upper_lower() const {
     // Plain read of the stored flag; nothing is recomputed here.
     const bool has_case = script_has_upper_lower_;
     return has_case;
   }

   // Returns true if the unicharset has the concept of x-height.

   // script_has_xheight can be true even if script_has_upper_lower is not,

   // when the script has a sufficiently predominant top line with ascenders,

   // such as Devanagari and Thai.

   bool script_has_xheight() const {
     // Plain read of the stored flag; nothing is recomputed here.
     const bool has_xheight = script_has_xheight_;
     return has_xheight;
   }


  private:


   // Per-unichar property record: character-class flags, bounding-box ranges,
   // width/bearing/advance statistics, script id, case and mirror links,
   // normalized forms and optional fragment information.
   struct UNICHAR_PROPERTIES {
     UNICHAR_PROPERTIES();
     // Initializes all properties to sensible default values.
     void Init();
     // Sets all ranges wide open. Initialization default in case there are
     // no useful values available.
     void SetRangesOpen();
     // Sets all ranges to empty. Used before expanding with font-based data.
     void SetRangesEmpty();
     // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
     // is empty.
     bool AnyRangeEmpty() const;
     // Expands the ranges with the ranges from the src properties.
     void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
     // Copies the properties from src into this.
     void CopyFrom(const UNICHAR_PROPERTIES& src);

     // Character-class flags, exposed through the UNICHARSET get_is*() and
     // get_enabled() accessors.
     bool  isalpha;
     bool  islower;
     bool  isupper;
     bool  isdigit;
     bool  ispunctuation;
     bool  isngram;
     bool  enabled;
     // Possible limits of the top and bottom of the bounding box in
     // baseline-normalized coordinates, ie, where the baseline is
     // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
     // (See normalis.h for the definitions).
     uinT8 min_bottom;
     uinT8 max_bottom;
     uinT8 min_top;
     uinT8 max_top;
     // Statistics of the widths of bounding box, relative to the median advance.
     float width;
     float width_sd;
     // Stats of the x-bearing and advance, also relative to the median advance.
     float bearing;
     float bearing_sd;
     float advance;
     float advance_sd;
     // Script of this unichar, stored as an int id (presumably an index into
     // the owning UNICHARSET's script table -- confirm).
     int   script_id;
     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
     Direction direction;  // direction of this unichar
     // Mirror property is useful for reverse DAWG lookup for words in
     // right-to-left languages (e.g. "(word)" would be in
     // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string).
     // However, what we want in our DAWG is
     // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
     // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
     UNICHAR_ID mirror;
     // A string of unichar_ids that represent the corresponding normed string.
     // For awkward characters like em-dash, this gives hyphen.
     // For ligatures, this gives the string of normal unichars.
     GenericVector<UNICHAR_ID> normed_ids;
     STRING normed;  // normalized version of this unichar
     // Contains meta information about the fragment if a unichar represents
     // a fragment of a character, otherwise should be set to NULL.
     // It is assumed that character fragments are added to the unicharset
     // after the corresponding 'base' characters.
     CHAR_FRAGMENT *fragment;
   };


   // One entry of the unichars array: the string form of a unichar together
   // with its properties.
   struct UNICHAR_SLOT {
     // Holds up to UNICHAR_LEN chars plus one extra byte (presumably for a
     // terminating NUL -- confirm against unichar_insert()).
     char representation[UNICHAR_LEN + 1];
     UNICHAR_PROPERTIES properties;
   };


   // Internal recursive version of the public encode_string overload.
   // Inputs:
   // str is the start of the whole string.
   // str_index is the current position in str.
   // str_length is the length of str.
   // Working state:
   // encoding is a working encoding of str.
   // lengths is a working set of lengths of each element of encoding.
   // best_total_length is the longest length of str that has been successfully
   // encoded so far.
   // On return:
   // best_encoding contains the encoding that used the longest part of str.
   // best_lengths (may be null) contains the lengths of best_encoding.
   void encode_string(const char* str, int str_index, int str_length,
                      GenericVector<UNICHAR_ID>* encoding,
                      GenericVector<char>* lengths,
                      int* best_total_length,
                      GenericVector<UNICHAR_ID>* best_encoding,
                      GenericVector<char>* best_lengths) const;


   // Gets the properties for a grapheme string, combining properties for
   // multiple characters in a meaningful way where possible.
   // utf8_str is the grapheme string to look up; props receives the result.
   // Returns false if no valid match was found in the unicharset.
   // NOTE that script_id, mirror, and other_case refer to this unicharset on
   // return and will need redirecting if the target unicharset is different.
   bool GetStrProperties(const char* utf8_str,
                         UNICHAR_PROPERTIES* props) const;


   // Load ourselves from a "file" where our only interface to the file is
   // an implementation of fgets().  This is the parsing primitive accessed by
   // the public routines load_from_file() and load_from_inmemory_file().
   // fgets_cb is the fgets()-like callback used to read the data.
   // skip_fragments: presumably causes fragment entries to be ignored while
   // loading -- confirm against unicharset.cpp.
   bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
                       bool skip_fragments);


   // Array of per-unichar slots, indexed by UNICHAR_ID.
   UNICHAR_SLOT* unichars;
   // Lookup structure queried with unichar strings (see ids.contains()).
   UNICHARMAP ids;
   // Presumably the number of slots of unichars in use and the number
   // allocated, respectively -- confirm against reserve().
   int size_used;
   int size_reserved;
   // Table of script name strings, indexed by script id.
   char** script_table;
   // Number of script_table entries in use; ids at or above this are mapped
   // to null_script by get_script_from_script_id().
   int script_table_size_used;
   int script_table_size_reserved;
   // Returned for out-of-range script ids; is_null_script() compares against
   // this pointer.
   const char* null_script;
   // True if the unichars have their tops/bottoms set.
   bool top_bottom_set_;
   // True if the unicharset has significant upper/lower case chars.
   bool script_has_upper_lower_;
   // True if the unicharset has a significant mean-line with significant
   // ascenders above that.
   bool script_has_xheight_;

   // A few convenient script name-to-id mappings without using a hash.
   // These are initialized when the unicharset file is loaded.  Anything
   // missing from this list can be looked up using get_script_id_from_name.
   int null_sid_;
   int common_sid_;
   int latin_sid_;
   int cyrillic_sid_;
   int greek_sid_;
   int han_sid_;
   int hiragana_sid_;
   int katakana_sid_;
   // The most frequently occurring script in the charset.
   int default_sid_;

 };


 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__

UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:838

UNICHARSET::set_isupper
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:399

UNICHARSET::get_script
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:791

UNICHARSET::load_from_file
bool load_from_file(const char *const filename)
Definition: unicharset.h:354

UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318

UNICHARSET::major_right_to_left
bool major_right_to_left() const
Definition: unicharset.cpp:931

UNICHARSET::SizesDistinct
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472

UNICHARSET::unichar_to_id
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

UNICHARSET::U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:157

UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306

UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:837

UNICHARSET::U_WHITE_SPACE_NEUTRAL
Definition: unicharset.h:160

UNICHARSET::set_islower
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:394

CHAR_FRAGMENT::is_continuation_of
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:92

UNICHAR_JOINED
Definition: unicharset.h:36

UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656

CHAR_FRAGMENT
Definition: unicharset.h:42

UNICHARSET::get_isdigit
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:763

UNICHARSET::U_SEGMENT_SEPARATOR
Definition: unicharset.h:159

UNICHARSET::get_ispunctuation
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:770

UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:155

UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:831

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

direction
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43

UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:836

UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:154

UNICHARSET::to_lower
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:652

UNICHARSET::get_isupper
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:756

UNICHARSET::get_isupper
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:699

UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346

CHAR_FRAGMENT::parse_from_string
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:1038

UNICHARSET::load_from_inmemory_file
bool load_from_inmemory_file(const char *const memory, int mem_size)
Definition: unicharset.h:339

CHAR_FRAGMENT::set_pos
void set_pos(int p)
Definition: unicharset.h:62

UNICHARSET::U_EUROPEAN_NUMBER
Definition: unicharset.h:153

UNICHARSET::to_upper
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:660

STRING::length
inT32 length() const
Definition: strngs.cpp:188

UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:797

UNICHARSET::greek_sid
int greek_sid() const
Definition: unicharset.h:835

UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783

UNICHARSET::get_islower
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:749

unichar.h

CHAR_FRAGMENT::get_total
int get_total() const
Definition: unicharset.h:66

UNICHARSET::set_normed
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:440

SpecialUnicharCodes
SpecialUnicharCodes
Definition: unicharset.h:34

UNICHARSET::set_ranges_empty
void set_ranges_empty()
Definition: unicharset.cpp:371

UNICHARSET::ExpandRangesFromOther
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:410

UNICHARSET::U_OTHER_NEUTRAL
Definition: unicharset.h:161

UNICHARSET::U_BLOCK_SEPARATOR
Definition: unicharset.h:158

tesseract::TFile
Definition: serialis.h:51

UNICHAR_SPACE
Definition: unicharset.h:35

UNICHARSET::U_POP_DIRECTIONAL_FORMAT
Definition: unicharset.h:167

ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115

CHAR_FRAGMENT::kMaxChunks
static const int kMaxChunks
Definition: unicharset.h:49

tesseract-c_api-demo.filename
string filename
Definition: tesseract-c_api-demo.py:29

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:645

UNICHAR_BROKEN
Definition: unicharset.h:37

UNICHARSET::AppendOtherUnicharset
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439

UNICHARSET::get_chartype
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:603

UNICHARSET::get_script_id_from_name
int get_script_id_from_name(const char *script_name) const
Definition: unicharset.cpp:1080

UNICHARSET::load_from_file
bool load_from_file(FILE *file)
Definition: unicharset.h:361

UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682

UNICHARSET::set_width_stats
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:565

SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:39

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

UNICHARSET::U_ARABIC_NUMBER
Definition: unicharset.h:156

UNICHARSET::debug_str
STRING debug_str(const char *unichar_repr) const
Definition: unicharset.h:233

UNICHARSET::get_isalpha
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:742

CHAR_FRAGMENT::to_string
STRING to_string() const
Definition: unicharset.h:73

UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611

UNICHARSET::get_script
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:726

UNICHARSET::get_isngram
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:484

UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:849

UNICHARSET::id_to_unichar
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

UNICHARSET::is_null_script
bool is_null_script(const char *script) const
Definition: unicharset.h:816

UNICHARMAP
Definition: unicharmap.h:27

UNICHARSET::get_isprivate
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:363

UNICHARSET::latin_sid
int latin_sid() const
Definition: unicharset.h:833

UNICHARSET::reserve
void reserve(int unichars_number)
Definition: unicharset.cpp:179

UNICHARSET::set_isngram
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:414

UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:670

CHAR_FRAGMENT::is_natural
bool is_natural() const
Definition: unicharset.h:107

TessResultCallback2
Definition: tesscallback.h:1688

UNICHARSET::set_ispunctuation
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:409

UNICHARSET::add_script
int add_script(const char *script)
Definition: unicharset.cpp:1002

UNICHARSET::load_from_inmemory_file
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:724

UNICHARSET::cyrillic_sid
int cyrillic_sid() const
Definition: unicharset.h:834

UNICHARSET::debug_utf8_str
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:294

CHAR_FRAGMENT::equals
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:79

CHAR_FRAGMENT::kMaxLen
static const int kMaxLen
Definition: unicharset.h:47

UNICHARSET::get_chartype
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:719

UNICHARSET::set_advance_stats
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:598

UNICHARSET::set_isdigit
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:404

UNICHARSET::get_isdigit
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:704

UNICHARMAP::contains
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101

UNICHARSET::get_islower
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:694

UNICHARSET::kSpecialUnicharCodes
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:147

MAX_UINT8
#define MAX_UINT8
Definition: host.h:121

UNICHARSET::U_CHAR_DIRECTION_COUNT
Definition: unicharset.h:170

UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526

UNICHARSET::encodable_string
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:222

UNICHARSET::get_properties
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:588

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesscallback.h

UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456

UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631

UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477

CHAR_FRAGMENT::is_ending
bool is_ending() const
Definition: unicharset.h:102

UNICHARSET::set_other_case
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:425

UNICHARSET::get_properties
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:715

CHAR_FRAGMENT::kMinLen
static const int kMinLen
Definition: unicharset.h:45

UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234

UNICHARSET::set_top_bottom
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:540

CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:64

helpers.h

UNICHARSET::get_isalpha
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:689

UNICHARSET::set_isalpha
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:389

UNICHARSET::set_direction
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:430

CHAR_FRAGMENT::set_total
void set_total(int t)
Definition: unicharset.h:63

UNICHARSET::get_script_from_script_id
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:802

UNICHARSET::script_has_upper_lower
bool script_has_upper_lower() const
Definition: unicharset.h:842

tesseract::TFile::FWrite
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:131

UNICHARSET::set_bearing_stats
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:581

UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:495

UNICHARSET::common_sid
int common_sid() const
Definition: unicharset.h:832

unicharmap.h

UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:638

UNICHARSET::U_RIGHT_TO_LEFT_OVERRIDE
Definition: unicharset.h:166

UNICHARSET::save_to_file
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:322

errcode.h

UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:826

UNICHARSET::set_script
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:420

UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449

UNICHARSET::U_BOUNDARY_NEUTRAL
Definition: unicharset.h:169

UNICHARSET::PropertiesIncomplete
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:604

UNICHARSET::SetPropertiesFromOther
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:503

UNICHARSET::set_mirror
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:435

UNICHARSET::set_normed_ids
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:348

UNICHARSET::UNICHARSET
UNICHARSET()
Definition: unicharset.cpp:159

UNICHARSET::U_LEFT_TO_RIGHT_OVERRIDE
Definition: unicharset.h:163

UNICHARSET::save_to_string
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:661

UNICHARSET
Definition: unicharset.h:139

UNICHARMAP::clear
void clear()
Definition: unicharmap.cpp:154

UNICHARSET::PartialSetPropertiesFromOther
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:380

CHAR_FRAGMENT::set_natural
void set_natural(bool value)
Definition: unicharset.h:108

UNICHARSET::default_sid
int default_sid() const
Definition: unicharset.h:839

CHAR_FRAGMENT::set_unichar
void set_unichar(const char *uch)
Definition: unicharset.h:58

UNICHARSET::AnyRepeatedUnicodes
bool AnyRepeatedUnicodes() const
Definition: unicharset.cpp:986

UNICHARSET::get_normed_unichar
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:776

UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:211

UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:164

UNICHARSET::get_ispunctuation
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:709

strngs.h

UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644

UNICHARSET::delete_pointers_in_unichars
void delete_pointers_in_unichars()
Definition: unicharset.h:256

STRING
Definition: strngs.h:44

UNICHARSET::U_LEFT_TO_RIGHT_EMBEDDING
Definition: unicharset.h:162

UNICHARSET::Direction
Direction
Definition: unicharset.h:150

UNICHARSET::U_DIR_NON_SPACING_MARK
Definition: unicharset.h:168

NULL
#define NULL
Definition: host.h:144

UNICHARSET::kCustomLigatures
static const char * kCustomLigatures[][2]
Definition: unicharset.h:144

CHAR_FRAGMENT::equals
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:84

UNICHARSET::get_bearing_stats
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:571

UNICHARSET::id_to_unichar_ext
const char *const id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:274

UNICHARSET::U_RIGHT_TO_LEFT_EMBEDDING
Definition: unicharset.h:165

UNICHARSET::~UNICHARSET
~UNICHARSET()
Definition: unicharset.cpp:175

UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:30

UNICHARSET::clear
void clear()
Definition: unicharset.h:266

GenericVector< UNICHAR_ID >

UNICHARSET::size
int size() const
Definition: unicharset.h:297

UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:152

UNICHARSET::post_load_setup
void post_load_setup()
Definition: unicharset.cpp:867

CHAR_FRAGMENT::set_all
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:52

UNICHARSET::U_LEFT_TO_RIGHT
Definition: unicharset.h:151

UNICHARSET::CopyFrom
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423

UNICHARSET::set_black_and_whitelist
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:948

CHAR_FRAGMENT::get_pos
int get_pos() const
Definition: unicharset.h:65

UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:732

UNICHARSET::get_width_stats
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:554

UNICHARSET::get_advance_stats
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:588

serialis.h

genericvector.h

inT16
short inT16
Definition: host.h:100

UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:242

CHAR_FRAGMENT::is_beginning
bool is_beginning() const
Definition: unicharset.h:99

UNICHARSET::save_to_file
bool save_to_file(FILE *file) const
Definition: unicharset.h:316

uinT8
unsigned char uinT8
Definition: host.h:99