tesseract v5.3.3.20231005
unicharset.h
Go to the documentation of this file.
1
2// File: unicharset.h
3// Description: Unicode character/ligature set class.
4// Author: Thomas Kielbus
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20#define TESSERACT_CCUTIL_UNICHARSET_H_
21
22#include "errcode.h"
23#include "unicharmap.h"
24
25#include <tesseract/unichar.h>
26#include "helpers.h"
27#include "serialis.h"
28
29#include <functional> // for std::function
30
31namespace tesseract {
32
33// Enum holding special values of unichar_id. Every unicharset has these.
34// Warning! Keep in sync with kSpecialUnicharCodes.
39
41};
42
43// Boolean flag for unichar_insert. It's a bit of a double negative to allow
44// the default value to be false.
46 kFalse,
47 kTrue,
48};
49
51public:
52 // Minimum number of characters used for fragment representation.
53 static const int kMinLen = 6;
54 // Maximum number of characters used for fragment representation.
55 static const int kMaxLen = 3 + UNICHAR_LEN + 2;
56 // Maximum number of fragments per character.
57 static const int kMaxChunks = 5;
58
59 // Setters and Getters.
60 inline void set_all(const char *unichar, int pos, int total, bool natural) {
61 set_unichar(unichar);
62 set_pos(pos);
63 set_total(total);
64 set_natural(natural);
65 }
66 inline void set_unichar(const char *uch) {
67 strncpy(this->unichar, uch, sizeof(this->unichar));
68 this->unichar[UNICHAR_LEN] = '\0';
69 }
70 inline void set_pos(int p) {
71 this->pos = p;
72 }
73 inline void set_total(int t) {
74 this->total = t;
75 }
76 inline const char *get_unichar() const {
77 return this->unichar;
78 }
79 inline int get_pos() const {
80 return this->pos;
81 }
82 inline int get_total() const {
83 return this->total;
84 }
85
86 // Returns the string that represents a fragment
87 // with the given unichar, pos and total.
88 static std::string to_string(const char *unichar, int pos, int total,
89 bool natural);
90 // Returns the string that represents this fragment.
91 std::string to_string() const {
92 return to_string(unichar, pos, total, natural);
93 }
94
95 // Checks whether a fragment has the same unichar,
96 // position and total as the given inputs.
97 inline bool equals(const char *other_unichar, int other_pos,
98 int other_total) const {
99 return (strcmp(this->unichar, other_unichar) == 0 &&
100 this->pos == other_pos && this->total == other_total);
101 }
102 inline bool equals(const CHAR_FRAGMENT *other) const {
103 return this->equals(other->get_unichar(), other->get_pos(),
104 other->get_total());
105 }
106
107 // Checks whether a given fragment is a continuation of this fragment.
108 // Assumes that the given fragment pointer is not nullptr.
109 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
110 return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
111 this->total == fragment->get_total() &&
112 this->pos == fragment->get_pos() + 1);
113 }
114
115 // Returns true if this fragment is a beginning fragment.
116 inline bool is_beginning() const {
117 return this->pos == 0;
118 }
119
120 // Returns true if this fragment is an ending fragment.
121 inline bool is_ending() const {
122 return this->pos == this->total - 1;
123 }
124
125 // Returns true if the fragment was a separate component to begin with,
126 // ie did not need chopping to be isolated, but may have been separated
127 // out from a multi-outline blob.
128 inline bool is_natural() const {
129 return natural;
130 }
131 void set_natural(bool value) {
132 natural = value;
133 }
134
135 // Parses the string to see whether it represents a character fragment
136 // (rather than a regular character). If so, allocates memory for a new
137 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
138 // information. Fragments are of the form:
139 // |m|1|2, meaning chunk 1 of 2 of character m, or
140 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
141 // to divide the parts, as they were already separate connected components.
142 //
143 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
144 // instance, otherwise (if the string does not represent a fragment or it
145 // looks like it does, but parsing it as a fragment fails) returns nullptr.
146 //
147 // Note: The caller is responsible for deallocating memory
148 // associated with the returned pointer.
149 static CHAR_FRAGMENT *parse_from_string(const char *str);
150
151private:
152 char unichar[UNICHAR_LEN + 1];
153 // True if the fragment was a separate component to begin with,
154 // ie did not need chopping to be isolated, but may have been separated
155 // out from a multi-outline blob.
156 bool natural;
157 int16_t pos; // fragment position in the character
158 int16_t total; // total number of fragments in the character
159};
160
// The UNICHARSET class is a utility class for Tesseract that holds the
162// set of characters that are used by the engine. Each character is identified
163// by a unique number, from 0 to (size - 1).
165public:
166 // Custom list of characters and their ligature forms (UTF8)
167 // These map to unicode values in the private use area (PUC) and are supported
168 // by only few font families (eg. Wyld, Adobe Caslon Pro).
169 static const char *kCustomLigatures[][2];
170
171 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
172 static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
173
174 // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
176 U_LEFT_TO_RIGHT = 0,
177 U_RIGHT_TO_LEFT = 1,
178 U_EUROPEAN_NUMBER = 2,
179 U_EUROPEAN_NUMBER_SEPARATOR = 3,
180 U_EUROPEAN_NUMBER_TERMINATOR = 4,
181 U_ARABIC_NUMBER = 5,
182 U_COMMON_NUMBER_SEPARATOR = 6,
183 U_BLOCK_SEPARATOR = 7,
184 U_SEGMENT_SEPARATOR = 8,
185 U_WHITE_SPACE_NEUTRAL = 9,
186 U_OTHER_NEUTRAL = 10,
187 U_LEFT_TO_RIGHT_EMBEDDING = 11,
188 U_LEFT_TO_RIGHT_OVERRIDE = 12,
189 U_RIGHT_TO_LEFT_ARABIC = 13,
190 U_RIGHT_TO_LEFT_EMBEDDING = 14,
191 U_RIGHT_TO_LEFT_OVERRIDE = 15,
192 U_POP_DIRECTIONAL_FORMAT = 16,
193 U_DIR_NON_SPACING_MARK = 17,
194 U_BOUNDARY_NEUTRAL = 18,
195 U_FIRST_STRONG_ISOLATE = 19,
196 U_LEFT_TO_RIGHT_ISOLATE = 20,
197 U_RIGHT_TO_LEFT_ISOLATE = 21,
198 U_POP_DIRECTIONAL_ISOLATE = 22,
199#ifndef U_HIDE_DEPRECATED_API
200 U_CHAR_DIRECTION_COUNT
201#endif // U_HIDE_DEPRECATED_API
202 };
203
204 // Create an empty UNICHARSET
205 UNICHARSET();
206
207 ~UNICHARSET();
208
209 // Return the UNICHAR_ID of a given unichar representation within the
210 // UNICHARSET.
211 UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;
212
213 // Return the UNICHAR_ID of a given unichar representation within the
214 // UNICHARSET. Only the first length characters from unichar_repr are used.
215 UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;
216
217 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
218 // while leaving the rest of the string encodable. Returns 0 if the
219 // beginning of the string is not encodable.
220 // WARNING: this function now encodes the whole string for precision.
221 // Use encode_string in preference to repeatedly calling step.
222 int step(const char *str) const;
223
224 // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
225 // If not encodable, write the first byte offset which cannot be converted
226 // into the second (return) argument.
227 bool encodable_string(const char *str, unsigned *first_bad_position) const;
228
229 // Encodes the given UTF-8 string with this UNICHARSET.
230 // Any part of the string that cannot be encoded (because the utf8 can't
231 // be broken up into pieces that are in the unicharset) then:
232 // if give_up_on_failure, stops and returns a partial encoding,
233 // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
234 // Returns true if the encoding succeeds completely, false if there is at
235 // least one failure.
236 // If lengths is not nullptr, then it is filled with the corresponding
237 // byte length of each encoded UNICHAR_ID.
238 // If encoded_length is not nullptr then on return it contains the length of
239 // str that was encoded. (if give_up_on_failure the location of the first
240 // failure, otherwise strlen(str).)
241 // WARNING: Caller must guarantee that str has already been cleaned of codes
242 // that do not belong in the unicharset, or encoding may fail.
243 // Use CleanupString to perform the cleaning.
244 bool encode_string(const char *str, bool give_up_on_failure,
245 std::vector<UNICHAR_ID> *encoding,
246 std::vector<char> *lengths,
247 unsigned *encoded_length) const;
248
249 // Return the unichar representation corresponding to the given UNICHAR_ID
250 // within the UNICHARSET.
251 const char *id_to_unichar(UNICHAR_ID id) const;
252
253 // Return the UTF8 representation corresponding to the given UNICHAR_ID after
254 // resolving any private encodings internal to Tesseract. This method is
255 // preferable to id_to_unichar for outputting text that will be visible to
256 // external applications.
257 const char *id_to_unichar_ext(UNICHAR_ID id) const;
258
259 // Return a string that reformats the utf8 str into the str followed
260 // by its hex unicodes.
261 static std::string debug_utf8_str(const char *str);
262
263 // Removes/replaces content that belongs in rendered text, but not in the
264 // unicharset.
265 static std::string CleanupString(const char *utf8_str) {
266 return CleanupString(utf8_str, strlen(utf8_str));
267 }
268 static std::string CleanupString(const char *utf8_str, size_t length);
269
270 // Return a string containing debug information on the unichar, including
271 // the id_to_unichar, its hex unicodes and the properties.
272 std::string debug_str(UNICHAR_ID id) const;
273 std::string debug_str(const char *unichar_repr) const {
274 return debug_str(unichar_to_id(unichar_repr));
275 }
276
277 // Adds a unichar representation to the set. If old_style is true, then
278 // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
279 // characters are ignored/skipped as if they don't exist and n-grams that
280 // can already be encoded are not added.
281 void unichar_insert(const char *const unichar_repr,
282 OldUncleanUnichars old_style);
283 void unichar_insert(const char *const unichar_repr) {
284 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
285 }
286 // Adds a unichar representation to the set. Avoids setting old_style to true,
287 // unless it is necessary to make the new unichar get added.
288 void unichar_insert_backwards_compatible(const char *const unichar_repr) {
289 std::string cleaned = CleanupString(unichar_repr);
290 if (cleaned != unichar_repr) {
291 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
292 } else {
293 auto old_size = size();
294 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
295 if (size() == old_size) {
296 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
297 }
298 }
299 }
300
301 // Return true if the given unichar id exists within the set.
302 // Relies on the fact that unichar ids are contiguous in the unicharset.
303 bool contains_unichar_id(UNICHAR_ID unichar_id) const {
304 return static_cast<size_t>(unichar_id) < unichars.size();
305 }
306
307 // Return true if the given unichar representation exists within the set.
308 bool contains_unichar(const char *const unichar_repr) const;
309 bool contains_unichar(const char *const unichar_repr, int length) const;
310
311 // Return true if the given unichar representation corresponds to the given
312 // UNICHAR_ID within the set.
313 bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;
314
315 // Delete CHAR_FRAGMENTs stored in properties of unichars array.
317 for (auto &unichar : unichars) {
318 delete unichar.properties.fragment;
319 unichar.properties.fragment = nullptr;
320 }
321 }
322
323 // Clear the UNICHARSET (all the previous data is lost).
324 void clear() {
325 if (script_table != nullptr) {
326 for (int i = 0; i < script_table_size_used; ++i) {
327 delete[] script_table[i];
328 }
329 delete[] script_table;
330 script_table = nullptr;
331 script_table_size_used = 0;
332 }
333 script_table_size_reserved = 0;
334 delete_pointers_in_unichars();
335 unichars.clear();
336 ids.clear();
337 top_bottom_set_ = false;
338 script_has_upper_lower_ = false;
339 script_has_xheight_ = false;
340 old_style_included_ = false;
341 null_sid_ = 0;
342 common_sid_ = 0;
343 latin_sid_ = 0;
344 cyrillic_sid_ = 0;
345 greek_sid_ = 0;
346 han_sid_ = 0;
347 hiragana_sid_ = 0;
348 katakana_sid_ = 0;
349 thai_sid_ = 0;
350 hangul_sid_ = 0;
351 default_sid_ = 0;
352 }
353
354 // Return the size of the set (the number of different UNICHAR it holds).
355 size_t size() const {
356 return unichars.size();
357 }
358
359 // Opens the file indicated by filename and saves unicharset to that file.
360 // Returns true if the operation is successful.
361 bool save_to_file(const char *const filename) const {
362 FILE *file = fopen(filename, "w+b");
363 if (file == nullptr) {
364 return false;
365 }
366 bool result = save_to_file(file);
367 fclose(file);
368 return result;
369 }
370
371 // Saves the content of the UNICHARSET to the given file.
372 // Returns true if the operation is successful.
373 bool save_to_file(FILE *file) const {
374 std::string str;
375 return save_to_string(str) &&
376 tesseract::Serialize(file, &str[0], str.length());
377 }
378
380 std::string str;
381 return save_to_string(str) && file->Serialize(&str[0], str.length());
382 }
383
384 // Saves the content of the UNICHARSET to the given string.
385 // Returns true if the operation is successful.
386 bool save_to_string(std::string &str) const;
387
388 // Opens the file indicated by filename and loads the UNICHARSET
389 // from the given file. The previous data is lost.
390 // Returns true if the operation is successful.
391 bool load_from_file(const char *const filename, bool skip_fragments) {
392 FILE *file = fopen(filename, "rb");
393 if (file == nullptr) {
394 return false;
395 }
396 bool result = load_from_file(file, skip_fragments);
397 fclose(file);
398 return result;
399 }
// As above, with skip_fragments set to false.
// Returns true if the operation is successful.
401 bool load_from_file(const char *const filename) {
402 return load_from_file(filename, false);
403 }
404
405 // Loads the UNICHARSET from the given file. The previous data is lost.
406 // Returns true if the operation is successful.
407 bool load_from_file(FILE *file, bool skip_fragments);
408 bool load_from_file(FILE *file) {
409 return load_from_file(file, false);
410 }
411 bool load_from_file(tesseract::TFile *file, bool skip_fragments);
412
413 // Sets up internal data after loading the file, based on the char
414 // properties. Called from load_from_file, but also needs to be run
415 // during set_unicharset_properties.
416 void post_load_setup();
417
418 // Returns true if right_to_left scripts are significant in the unicharset,
419 // but without being so sensitive that "universal" unicharsets containing
420 // characters from many scripts, like orientation and script detection,
421 // look like they are right_to_left.
422 bool major_right_to_left() const;
423
424 // Set a whitelist and/or blacklist of characters to recognize.
425 // An empty or nullptr whitelist enables everything (minus any blacklist).
426 // An empty or nullptr blacklist disables nothing.
427 // An empty or nullptr unblacklist has no effect.
428 // The blacklist overrides the whitelist.
429 // The unblacklist overrides the blacklist.
430 // Each list is a string of utf8 character strings. Boundaries between
431 // unicharset units are worked out automatically, and characters not in
432 // the unicharset are silently ignored.
433 void set_black_and_whitelist(const char *blacklist, const char *whitelist,
434 const char *unblacklist);
435
436 // Set the isalpha property of the given unichar to the given value.
437 void set_isalpha(UNICHAR_ID unichar_id, bool value) {
438 unichars[unichar_id].properties.isalpha = value;
439 }
440
441 // Set the islower property of the given unichar to the given value.
442 void set_islower(UNICHAR_ID unichar_id, bool value) {
443 unichars[unichar_id].properties.islower = value;
444 }
445
446 // Set the isupper property of the given unichar to the given value.
447 void set_isupper(UNICHAR_ID unichar_id, bool value) {
448 unichars[unichar_id].properties.isupper = value;
449 }
450
451 // Set the isdigit property of the given unichar to the given value.
452 void set_isdigit(UNICHAR_ID unichar_id, bool value) {
453 unichars[unichar_id].properties.isdigit = value;
454 }
455
456 // Set the ispunctuation property of the given unichar to the given value.
457 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
458 unichars[unichar_id].properties.ispunctuation = value;
459 }
460
461 // Set the isngram property of the given unichar to the given value.
462 void set_isngram(UNICHAR_ID unichar_id, bool value) {
463 unichars[unichar_id].properties.isngram = value;
464 }
465
466 // Set the script name of the given unichar to the given value.
467 // Value is copied and thus can be a temporary;
468 void set_script(UNICHAR_ID unichar_id, const char *value) {
469 unichars[unichar_id].properties.script_id = add_script(value);
470 }
471
472 // Set other_case unichar id in the properties for the given unichar id.
473 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
474 unichars[unichar_id].properties.other_case = other_case;
475 }
476
477 // Set the direction property of the given unichar to the given value.
479 unichars[unichar_id].properties.direction = value;
480 }
481
482 // Set mirror unichar id in the properties for the given unichar id.
483 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
484 unichars[unichar_id].properties.mirror = mirror;
485 }
486
487 // Record normalized version of unichar with the given unichar_id.
488 void set_normed(UNICHAR_ID unichar_id, const char *normed) {
489 unichars[unichar_id].properties.normed = normed;
490 unichars[unichar_id].properties.normed_ids.clear();
491 }
492 // Sets the normed_ids vector from the normed string. normed_ids is not
493 // stored in the file, and needs to be set when the UNICHARSET is loaded.
494 void set_normed_ids(UNICHAR_ID unichar_id);
495
496 // Return the isalpha property of the given unichar.
497 bool get_isalpha(UNICHAR_ID unichar_id) const {
498 if (INVALID_UNICHAR_ID == unichar_id) {
499 return false;
500 }
501 ASSERT_HOST(contains_unichar_id(unichar_id));
502 return unichars[unichar_id].properties.isalpha;
503 }
504
505 // Return the islower property of the given unichar.
506 bool get_islower(UNICHAR_ID unichar_id) const {
507 if (INVALID_UNICHAR_ID == unichar_id) {
508 return false;
509 }
510 ASSERT_HOST(contains_unichar_id(unichar_id));
511 return unichars[unichar_id].properties.islower;
512 }
513
514 // Return the isupper property of the given unichar.
515 bool get_isupper(UNICHAR_ID unichar_id) const {
516 if (INVALID_UNICHAR_ID == unichar_id) {
517 return false;
518 }
519 ASSERT_HOST(contains_unichar_id(unichar_id));
520 return unichars[unichar_id].properties.isupper;
521 }
522
523 // Return the isdigit property of the given unichar.
524 bool get_isdigit(UNICHAR_ID unichar_id) const {
525 if (INVALID_UNICHAR_ID == unichar_id) {
526 return false;
527 }
528 ASSERT_HOST(contains_unichar_id(unichar_id));
529 return unichars[unichar_id].properties.isdigit;
530 }
531
532 // Return the ispunctuation property of the given unichar.
533 bool get_ispunctuation(UNICHAR_ID unichar_id) const {
534 if (INVALID_UNICHAR_ID == unichar_id) {
535 return false;
536 }
537 ASSERT_HOST(contains_unichar_id(unichar_id));
538 return unichars[unichar_id].properties.ispunctuation;
539 }
540
541 // Return the isngram property of the given unichar.
542 bool get_isngram(UNICHAR_ID unichar_id) const {
543 if (INVALID_UNICHAR_ID == unichar_id) {
544 return false;
545 }
546 ASSERT_HOST(contains_unichar_id(unichar_id));
547 return unichars[unichar_id].properties.isngram;
548 }
549
550 // Returns whether the unichar id represents a unicode value in the private
551 // use area.
552 bool get_isprivate(UNICHAR_ID unichar_id) const;
553
554 // Returns true if the ids have useful min/max top/bottom values.
555 bool top_bottom_useful() const {
556 return top_bottom_set_;
557 }
558 // Sets all ranges to empty, so they can be expanded to set the values.
559 void set_ranges_empty();
560 // Sets all the properties for this unicharset given a src_unicharset with
561 // everything set. The unicharsets don't have to be the same, and graphemes
562 // are correctly accounted for.
564 PartialSetPropertiesFromOther(0, src);
565 }
566 // Sets properties from Other, starting only at the given index.
567 void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);
568 // Expands the tops and bottoms and widths for this unicharset given a
569 // src_unicharset with ranges in it. The unicharsets don't have to be the
570 // same, and graphemes are correctly accounted for.
571 void ExpandRangesFromOther(const UNICHARSET &src);
// Makes this a copy of src. Clears this completely first, so the automatic
573 // ids will not be present in this if not in src.
574 void CopyFrom(const UNICHARSET &src);
575 // For each id in src, if it does not occur in this, add it, as in
576 // SetPropertiesFromOther, otherwise expand the ranges, as in
577 // ExpandRangesFromOther.
578 void AppendOtherUnicharset(const UNICHARSET &src);
579 // Returns true if the acceptable ranges of the tops of the characters do
580 // not overlap, making their x-height calculations distinct.
581 bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
582 // Returns the min and max bottom and top of the given unichar in
583 // baseline-normalized coordinates, ie, where the baseline is
584 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
585 // (See normalis.h for the definitions).
586 void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,
587 int *min_top, int *max_top) const {
588 if (INVALID_UNICHAR_ID == unichar_id) {
589 *min_bottom = *min_top = 0;
590 *max_bottom = *max_top = 256; // kBlnCellHeight
591 return;
592 }
593 ASSERT_HOST(contains_unichar_id(unichar_id));
594 *min_bottom = unichars[unichar_id].properties.min_bottom;
595 *max_bottom = unichars[unichar_id].properties.max_bottom;
596 *min_top = unichars[unichar_id].properties.min_top;
597 *max_top = unichars[unichar_id].properties.max_top;
598 }
599 void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,
600 int min_top, int max_top) {
601 unichars[unichar_id].properties.min_bottom =
602 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
603 unichars[unichar_id].properties.max_bottom =
604 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
605 unichars[unichar_id].properties.min_top =
606 ClipToRange<int>(min_top, 0, UINT8_MAX);
607 unichars[unichar_id].properties.max_top =
608 ClipToRange<int>(max_top, 0, UINT8_MAX);
609 }
610 // Returns the width stats (as mean, sd) of the given unichar relative to the
611 // median advance of all characters in the character set.
612 void get_width_stats(UNICHAR_ID unichar_id, float *width,
613 float *width_sd) const {
614 if (INVALID_UNICHAR_ID == unichar_id) {
615 *width = 0.0f;
616 *width_sd = 0.0f;
617 return;
618 }
619 ASSERT_HOST(contains_unichar_id(unichar_id));
620 *width = unichars[unichar_id].properties.width;
621 *width_sd = unichars[unichar_id].properties.width_sd;
622 }
623 void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
624 unichars[unichar_id].properties.width = width;
625 unichars[unichar_id].properties.width_sd = width_sd;
626 }
627 // Returns the stats of the x-bearing (as mean, sd) of the given unichar
628 // relative to the median advance of all characters in the character set.
629 void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,
630 float *bearing_sd) const {
631 if (INVALID_UNICHAR_ID == unichar_id) {
632 *bearing = *bearing_sd = 0.0f;
633 return;
634 }
635 ASSERT_HOST(contains_unichar_id(unichar_id));
636 *bearing = unichars[unichar_id].properties.bearing;
637 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
638 }
639 void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,
640 float bearing_sd) {
641 unichars[unichar_id].properties.bearing = bearing;
642 unichars[unichar_id].properties.bearing_sd = bearing_sd;
643 }
644 // Returns the stats of the x-advance of the given unichar (as mean, sd)
645 // relative to the median advance of all characters in the character set.
646 void get_advance_stats(UNICHAR_ID unichar_id, float *advance,
647 float *advance_sd) const {
648 if (INVALID_UNICHAR_ID == unichar_id) {
649 *advance = *advance_sd = 0;
650 return;
651 }
652 ASSERT_HOST(contains_unichar_id(unichar_id));
653 *advance = unichars[unichar_id].properties.advance;
654 *advance_sd = unichars[unichar_id].properties.advance_sd;
655 }
656 void set_advance_stats(UNICHAR_ID unichar_id, float advance,
657 float advance_sd) {
658 unichars[unichar_id].properties.advance = advance;
659 unichars[unichar_id].properties.advance_sd = advance_sd;
660 }
661 // Returns true if the font metrics properties are empty.
662 bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
663 return unichars[unichar_id].properties.AnyRangeEmpty();
664 }
665
666 // Returns true if the script of the given id is space delimited.
// Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts.
668 bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
669 if (INVALID_UNICHAR_ID == unichar_id) {
670 return true;
671 }
672 int script_id = get_script(unichar_id);
673 return script_id != han_sid_ && script_id != thai_sid_ &&
674 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
675 script_id != katakana_sid_;
676 }
677
// Return the script id of the given unichar (null_sid_ for
// INVALID_UNICHAR_ID). An integer id is returned, not a pointer, so there is
// nothing for the caller to delete.
681 int get_script(UNICHAR_ID unichar_id) const {
682 if (INVALID_UNICHAR_ID == unichar_id) {
683 return null_sid_;
684 }
685 ASSERT_HOST(contains_unichar_id(unichar_id));
686 return unichars[unichar_id].properties.script_id;
687 }
688
689 // Return the character properties, eg. alpha/upper/lower/digit/punct,
690 // as a bit field of unsigned int.
691 unsigned int get_properties(UNICHAR_ID unichar_id) const;
692
693 // Return the character property as a single char. If a character has
694 // multiple attributes, the main property is defined by the following order:
695 // upper_case : 'A'
696 // lower_case : 'a'
697 // alpha : 'x'
698 // digit : '0'
699 // punctuation: 'p'
700 char get_chartype(UNICHAR_ID unichar_id) const;
701
702 // Get other_case unichar id in the properties for the given unichar id.
704 if (INVALID_UNICHAR_ID == unichar_id) {
705 return INVALID_UNICHAR_ID;
706 }
707 ASSERT_HOST(contains_unichar_id(unichar_id));
708 return unichars[unichar_id].properties.other_case;
709 }
710
711 // Returns the direction property of the given unichar.
713 if (INVALID_UNICHAR_ID == unichar_id) {
715 }
716 ASSERT_HOST(contains_unichar_id(unichar_id));
717 return unichars[unichar_id].properties.direction;
718 }
719
720 // Get mirror unichar id in the properties for the given unichar id.
721 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
722 if (INVALID_UNICHAR_ID == unichar_id) {
723 return INVALID_UNICHAR_ID;
724 }
725 ASSERT_HOST(contains_unichar_id(unichar_id));
726 return unichars[unichar_id].properties.mirror;
727 }
728
729 // Returns UNICHAR_ID of the corresponding lower-case unichar.
730 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
731 if (INVALID_UNICHAR_ID == unichar_id) {
732 return INVALID_UNICHAR_ID;
733 }
734 ASSERT_HOST(contains_unichar_id(unichar_id));
735 if (unichars[unichar_id].properties.islower) {
736 return unichar_id;
737 }
738 return unichars[unichar_id].properties.other_case;
739 }
740
741 // Returns UNICHAR_ID of the corresponding upper-case unichar.
742 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
743 if (INVALID_UNICHAR_ID == unichar_id) {
744 return INVALID_UNICHAR_ID;
745 }
746 ASSERT_HOST(contains_unichar_id(unichar_id));
747 if (unichars[unichar_id].properties.isupper) {
748 return unichar_id;
749 }
750 return unichars[unichar_id].properties.other_case;
751 }
752
753 // Returns true if this UNICHARSET has the special codes in
754 // SpecialUnicharCodes available. If false then there are normal unichars
755 // at these codes and they should not be used.
756 bool has_special_codes() const {
757 return get_fragment(UNICHAR_BROKEN) != nullptr &&
758 strcmp(id_to_unichar(UNICHAR_BROKEN),
759 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
760 }
761
762 // Returns true if there are any repeated unicodes in the normalized
763 // text of any unichar-id in the unicharset.
764 bool AnyRepeatedUnicodes() const;
765
766 // Return a pointer to the CHAR_FRAGMENT class if the given
767 // unichar id represents a character fragment.
768 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
769 if (INVALID_UNICHAR_ID == unichar_id) {
770 return nullptr;
771 }
772 ASSERT_HOST(contains_unichar_id(unichar_id));
773 return unichars[unichar_id].properties.fragment;
774 }
775
776 // Return the isalpha property of the given unichar representation.
777 bool get_isalpha(const char *const unichar_repr) const {
778 return get_isalpha(unichar_to_id(unichar_repr));
779 }
780
781 // Return the islower property of the given unichar representation.
782 bool get_islower(const char *const unichar_repr) const {
783 return get_islower(unichar_to_id(unichar_repr));
784 }
785
786 // Return the isupper property of the given unichar representation.
787 bool get_isupper(const char *const unichar_repr) const {
788 return get_isupper(unichar_to_id(unichar_repr));
789 }
790
791 // Return the isdigit property of the given unichar representation.
792 bool get_isdigit(const char *const unichar_repr) const {
793 return get_isdigit(unichar_to_id(unichar_repr));
794 }
795
796 // Return the ispunctuation property of the given unichar representation.
797 bool get_ispunctuation(const char *const unichar_repr) const {
798 return get_ispunctuation(unichar_to_id(unichar_repr));
799 }
800
801 // Return the character properties, eg. alpha/upper/lower/digit/punct,
802 // of the given unichar representation
803 unsigned int get_properties(const char *const unichar_repr) const {
804 return get_properties(unichar_to_id(unichar_repr));
805 }
806
807 char get_chartype(const char *const unichar_repr) const {
808 return get_chartype(unichar_to_id(unichar_repr));
809 }
810
// Return the script id of the given unichar representation. An integer id
// is returned, not a pointer, so there is nothing for the caller to delete.
814 int get_script(const char *const unichar_repr) const {
815 return get_script(unichar_to_id(unichar_repr));
816 }
817
818 // Return a pointer to the CHAR_FRAGMENT class struct if the given
819 // unichar representation represents a character fragment.
820 const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {
821 if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
822 !ids.contains(unichar_repr, false)) {
823 return nullptr;
824 }
825 return get_fragment(unichar_to_id(unichar_repr));
826 }
827
828 // Return the isalpha property of the given unichar representation.
829 // Only the first length characters from unichar_repr are used.
830 bool get_isalpha(const char *const unichar_repr, int length) const {
831 return get_isalpha(unichar_to_id(unichar_repr, length));
832 }
833
834 // Return the islower property of the given unichar representation.
835 // Only the first length characters from unichar_repr are used.
836 bool get_islower(const char *const unichar_repr, int length) const {
837 return get_islower(unichar_to_id(unichar_repr, length));
838 }
839
840 // Return the isupper property of the given unichar representation.
841 // Only the first length characters from unichar_repr are used.
842 bool get_isupper(const char *const unichar_repr, int length) const {
843 return get_isupper(unichar_to_id(unichar_repr, length));
844 }
845
846 // Return the isdigit property of the given unichar representation.
847 // Only the first length characters from unichar_repr are used.
848 bool get_isdigit(const char *const unichar_repr, int length) const {
849 return get_isdigit(unichar_to_id(unichar_repr, length));
850 }
851
852 // Return the ispunctuation property of the given unichar representation.
853 // Only the first length characters from unichar_repr are used.
854 bool get_ispunctuation(const char *const unichar_repr, int length) const {
855 return get_ispunctuation(unichar_to_id(unichar_repr, length));
856 }
857
858 // Returns normalized version of unichar with the given unichar_id.
859 const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
860 if (unichar_id == UNICHAR_SPACE) {
861 return " ";
862 }
863 return unichars[unichar_id].properties.normed.c_str();
864 }
865 // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
866 // version of the given id. There may be more than one UNICHAR_ID in the
867 // vector if unichar_id represents a ligature.
868 const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {
869 return unichars[unichar_id].properties.normed_ids;
870 }
871
872 // Return the script name of the given unichar representation.
873 // Only the first length characters from unichar_repr are used.
874 // The returned pointer will always be the same for the same script, it's
875 // managed by unicharset and thus MUST NOT be deleted
876 int get_script(const char *const unichar_repr, int length) const {
877 return get_script(unichar_to_id(unichar_repr, length));
878 }
879
880 // Return the (current) number of scripts in the script table
882 return script_table_size_used;
883 }
884
885 // Return the script string from its id
886 const char *get_script_from_script_id(int id) const {
887 if (id >= script_table_size_used || id < 0) {
888 return null_script;
889 }
890 return script_table[id];
891 }
892
893 // Returns the id of the script named script_name, or 0 if it is not found.
894 // Note that this is an expensive operation, since it iteratively compares
895 // strings in the script table; no hash is used, to avoid a dependency on
896 // the STL. Callers should look the id up once and save it (as is done for
897 // the scripts behind the *_sid() accessors) for fast comparisons later.
898 int get_script_id_from_name(const char *script_name) const;
899
900 // Return true if the given script is the null script
901 bool is_null_script(const char *script) const {
902 return script == null_script;
903 }
904
905 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
906 // then the same int is returned for both (presumably the script id; confirm).
907 // The script parameter is copied and thus can be a temporary.
908 int add_script(const char *script);
909
910 // Return the enabled property of the given unichar.
911 bool get_enabled(UNICHAR_ID unichar_id) const {
912 ASSERT_HOST(contains_unichar_id(unichar_id));
913 return unichars[unichar_id].properties.enabled;
914 }
915
916 int null_sid() const {
917 return null_sid_;
918 }
919 int common_sid() const {
920 return common_sid_;
921 }
922 int latin_sid() const {
923 return latin_sid_;
924 }
925 int cyrillic_sid() const {
926 return cyrillic_sid_;
927 }
928 int greek_sid() const {
929 return greek_sid_;
930 }
931 int han_sid() const {
932 return han_sid_;
933 }
934 int hiragana_sid() const {
935 return hiragana_sid_;
936 }
937 int katakana_sid() const {
938 return katakana_sid_;
939 }
940 int thai_sid() const {
941 return thai_sid_;
942 }
943 int hangul_sid() const {
944 return hangul_sid_;
945 }
946 int default_sid() const {
947 return default_sid_;
948 }
949
950 // Returns true if the unicharset has the concept of upper/lower case.
952 return script_has_upper_lower_;
953 }
954 // Returns true if the unicharset has the concept of x-height.
955 // script_has_xheight can be true even if script_has_upper_lower is not,
956 // when the script has a sufficiently predominant top line with ascenders,
957 // such as Devanagari and Thai.
958 bool script_has_xheight() const {
959 return script_has_xheight_;
960 }
961
962private:
// Property record kept for each unichar; UNICHAR_SLOT pairs one of these
// with the unichar's string representation.
963 struct TESS_API UNICHAR_PROPERTIES {
964 UNICHAR_PROPERTIES();
965 // Initializes all properties to sensible default values.
966 void Init();
967 // Sets all ranges wide open. Initialization default in case there are
968 // no useful values available.
969 void SetRangesOpen();
970 // Sets all ranges to empty. Used before expanding with font-based data.
971 void SetRangesEmpty();
972 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
973 // is empty.
974 bool AnyRangeEmpty() const;
975 // Expands the ranges with the ranges from the src properties.
976 void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
977 // Copies the properties from src into this.
978 void CopyFrom(const UNICHAR_PROPERTIES &src);
979
// Boolean flags of this unichar; enabled is the value reported by
// UNICHARSET::get_enabled().
980 bool isalpha;
981 bool islower;
982 bool isupper;
983 bool isdigit;
984 bool ispunctuation;
985 bool isngram;
986 bool enabled;
987 // Possible limits of the top and bottom of the bounding box in
988 // baseline-normalized coordinates, i.e., where the baseline is
989 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
990 // (see normalis.h for the definitions).
991 uint8_t min_bottom;
992 uint8_t max_bottom;
993 uint8_t min_top;
994 uint8_t max_top;
995 // Statistics of the widths of bounding box, relative to the median advance.
996 float width;
997 float width_sd;
998 // Stats of the x-bearing and advance, also relative to the median advance.
999 float bearing;
1000 float bearing_sd;
1001 float advance;
1002 float advance_sd;
1003 int script_id;
1004 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
1005 Direction direction; // direction of this unichar
1006 // Mirror property is useful for reverse DAWG lookup for words in
1007 // right-to-left languages, e.g. "(word)" would be
1008 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
1009 // However, what we want in our DAWG is
1010 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
1011 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
1012 UNICHAR_ID mirror;
1013 // A sequence of unichar_ids that represent the corresponding normed string.
1014 // For awkward characters like em-dash, this gives hyphen.
1015 // For ligatures, this gives the string of normal unichars.
1016 std::vector<UNICHAR_ID> normed_ids;
1017 std::string normed; // normalized version of this unichar
1018 // Contains meta information about the fragment if a unichar represents
1019 // a fragment of a character, otherwise should be set to nullptr.
1020 // It is assumed that character fragments are added to the unicharset
1021 // after the corresponding 'base' characters.
1022 CHAR_FRAGMENT *fragment;
1023 };
1024
// Element type of the unichars vector: a unichar's string representation
// together with its properties.
1025 struct UNICHAR_SLOT {
1026 char representation[UNICHAR_LEN + 1]; // UNICHAR_LEN chars plus one extra (presumably the terminating NUL)
1027 UNICHAR_PROPERTIES properties;
1028 };
1029
1030 // Internal recursive version of the public encode_string.
1031 // str is the start of the whole string.
1032 // str_index is the current position in str.
1033 // str_length is the length of str.
1034 // encoding is a working encoding of str.
1035 // lengths is a working set of lengths of each element of encoding.
1036 // best_total_length is the longest length of str that has been successfully
1037 // encoded so far.
1038 // On return:
1039 // best_encoding contains the encoding that used the longest part of str.
1040 // best_lengths (may be null) contains the lengths of best_encoding.
1041 void encode_string(const char *str, int str_index, int str_length,
1042 std::vector<UNICHAR_ID> *encoding,
1043 std::vector<char> *lengths, unsigned *best_total_length,
1044 std::vector<UNICHAR_ID> *best_encoding,
1045 std::vector<char> *best_lengths) const;
1046
1047 // Gets the properties for the grapheme string utf8_str into *props,
1048 // combining properties for multiple characters in a meaningful way where possible.
1049 // Returns false if no valid match was found in the unicharset.
1050 // NOTE that script_id, mirror, and other_case refer to this unicharset on
1051 // return and will need redirecting if the target unicharset is different.
1052 bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;
1053
1054 // Load ourselves from a "file" where our only interface to the file is
1055 // fgets_cb, an implementation of fgets(). This is the parsing primitive
1056 // accessed by the public load_from_file() routines.
// NOTE(review): skip_fragments presumably makes the loader ignore character
// fragment entries — confirm against the implementation.
1057 bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,
1058 bool skip_fragments);
1059
1060 // List of mappings to make when ingesting strings from the outside.
1061 // The substitutions clean up text that should exist for rendering of
1062 // synthetic data, but not in the recognition set.
1063 static const char *kCleanupMaps[][2];
// Script string returned by get_script_from_script_id() for out-of-range
// ids; is_null_script() compares against it by pointer.
1064 static const char *null_script;
1065
// Table of unichar slots, indexed by UNICHAR_ID (see get_enabled()).
1066 std::vector<UNICHAR_SLOT> unichars;
// Map queried with unichar representations (see get_fragment()).
1067 UNICHARMAP ids;
// Script strings indexed by script id; get_script_from_script_id() treats
// ids in [0, script_table_size_used) as valid entries.
1068 char **script_table;
1069 int script_table_size_used;
// NOTE(review): presumably the allocated capacity of script_table — confirm.
1070 int script_table_size_reserved;
1071 // True if the unichars have their tops/bottoms set.
1072 bool top_bottom_set_;
1073 // True if the unicharset has significant upper/lower case chars.
1074 bool script_has_upper_lower_;
1075 // True if the unicharset has a significant mean-line with significant
1076 // ascenders above that.
1077 bool script_has_xheight_;
1078 // True if the set contains chars that would be changed by the cleanup.
1079 bool old_style_included_;
1080
1081 // A few convenient script name-to-id mappings without using a hash.
1082 // These are initialized when the unicharset file is loaded. Anything
1083 // missing from this list can be looked up using get_script_id_from_name.
1084 int null_sid_;
1085 int common_sid_;
1086 int latin_sid_;
1087 int cyrillic_sid_;
1088 int greek_sid_;
1089 int han_sid_;
1090 int hiragana_sid_;
1091 int katakana_sid_;
1092 int thai_sid_;
1093 int hangul_sid_;
1094 // The most frequently occurring script in the charset.
1095 int default_sid_;
1096};
1097
1098} // namespace tesseract
1099
1100#endif // TESSERACT_CCUTIL_UNICHARSET_H_
#define UNICHAR_LEN
Definition: unichar.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:54
int value
const char * p
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236
OldUncleanUnichars
Definition: unicharset.h:45
int UNICHAR_ID
Definition: unichar.h:34
SpecialUnicharCodes
Definition: unicharset.h:35
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
@ UNICHAR_JOINED
Definition: unicharset.h:37
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:97
void set_unichar(const char *uch)
Definition: unicharset.h:66
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:60
bool is_ending() const
Definition: unicharset.h:121
void set_natural(bool value)
Definition: unicharset.h:131
std::string to_string() const
Definition: unicharset.h:91
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:109
const char * get_unichar() const
Definition: unicharset.h:76
bool is_natural() const
Definition: unicharset.h:128
bool is_beginning() const
Definition: unicharset.h:116
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:102
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:782
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:777
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:283
std::string debug_str(const char *unichar_repr) const
Definition: unicharset.h:273
int greek_sid() const
Definition: unicharset.h:928
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:483
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468
void delete_pointers_in_unichars()
Definition: unicharset.h:316
int default_sid() const
Definition: unicharset.h:946
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:886
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:681
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868
bool script_has_xheight() const
Definition: unicharset.h:958
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:876
int common_sid() const
Definition: unicharset.h:919
int han_sid() const
Definition: unicharset.h:931
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768
int get_script_table_size() const
Definition: unicharset.h:881
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:842
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool has_special_codes() const
Definition: unicharset.h:756
int cyrillic_sid() const
Definition: unicharset.h:925
int hiragana_sid() const
Definition: unicharset.h:934
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:712
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:447
bool script_has_upper_lower() const
Definition: unicharset.h:951
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:488
bool is_null_script(const char *script) const
Definition: unicharset.h:901
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:814
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:797
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:646
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:792
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:623
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:848
int null_sid() const
Definition: unicharset.h:916
int hangul_sid() const
Definition: unicharset.h:943
bool load_from_file(FILE *file)
Definition: unicharset.h:408
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:599
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:478
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:457
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:854
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:803
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:787
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586
int latin_sid() const
Definition: unicharset.h:922
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:742
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:629
int katakana_sid() const
Definition: unicharset.h:937
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:703
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:437
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:721
bool top_bottom_useful() const
Definition: unicharset.h:555
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:830
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:288
bool save_to_file(FILE *file) const
Definition: unicharset.h:373
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:473
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool load_from_file(const char *const filename)
Definition: unicharset.h:401
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:859
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:836
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:911
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:442
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533
size_t size() const
Definition: unicharset.h:355
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:668
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:807
int thai_sid() const
Definition: unicharset.h:940
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:563
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:656
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:612
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:452
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:379
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:462
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:820
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:639
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:730
#define TESS_API
Definition: export.h:32