tesseract  4.00.00dev
unicharset.cpp
Go to the documentation of this file.
1 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "unicharset.h"
21 
22 #include <assert.h>
23 #include <stdio.h>
24 #include <string.h>
25 
26 #include "params.h"
27 #include "serialis.h"
28 #include "tesscallback.h"
29 #include "tprintf.h"
30 #include "unichar.h"
31 
32 // TODO(rays) Move UNICHARSET to tesseract namespace.
33 using tesseract::char32;
34 using tesseract::UNICHAR;
35 
// Character used when representing character fragments as strings.
static const char kSeparator = '|';
// Character used to flag 'natural' character fragments.
static const char kNaturalFlag = 'n';

// Bit flags for the character classes, one bit per class, as combined by
// get_properties().
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static const int kMeanlineThreshold = 220;
// Let C be the number of alpha chars whose tops all lie above
// kMeanlineThreshold, and X the number of alpha chars whose tops all lie
// below it. The unicharset "has x-height" if either
//   X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction, or
//   more than half of the alpha characters are upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
59 
60 /*static */
61 const char* UNICHARSET::kCustomLigatures[][2] = {
62  {"ct", "\uE003"}, // c + t -> U+E003
63  {"ſh", "\uE006"}, // long-s + h -> U+E006
64  {"ſi", "\uE007"}, // long-s + i -> U+E007
65  {"ſl", "\uE008"}, // long-s + l -> U+E008
66  {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
67  {NULL, NULL}
68 };
69 
70 // List of mappings to make when ingesting strings from the outside.
71 // The substitutions clean up text that should exist for rendering of
72 // synthetic data, but not in the recognition set.
73 const char* UNICHARSET::kCleanupMaps[][2] = {
74  {"\u0640", ""}, // TATWEEL is deleted.
75  {"\ufb01", "fi"}, // fi ligature->fi pair.
76  {"\ufb02", "fl"}, // fl ligature->fl pair.
77  {nullptr, nullptr}};
78 
79 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
81  " ",
82  "Joined",
83  "|Broken|0|1"
84 };
85 
86 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
87  Init();
88 }
89 
90 // Initialize all properties to sensible default values.
91 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
92  isalpha = false;
93  islower = false;
94  isupper = false;
95  isdigit = false;
96  ispunctuation = false;
97  isngram = false;
98  enabled = false;
99  SetRangesOpen();
100  script_id = 0;
101  other_case = 0;
102  mirror = 0;
103  normed = "";
105  fragment = NULL;
106 }
107 
108 // Sets all ranges wide open. Initialization default in case there are
109 // no useful values available.
110 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
111  min_bottom = 0;
112  max_bottom = MAX_UINT8;
113  min_top = 0;
114  max_top = MAX_UINT8;
115  width = 0.0f;
116  width_sd = 0.0f;
117  bearing = 0.0f;
118  bearing_sd = 0.0f;
119  advance = 0.0f;
120  advance_sd = 0.0f;
121 }
122 
123 // Sets all ranges to empty. Used before expanding with font-based data.
124 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
125  min_bottom = MAX_UINT8;
126  max_bottom = 0;
127  min_top = MAX_UINT8;
128  max_top = 0;
129  width = 0.0f;
130  width_sd = 0.0f;
131  bearing = 0.0f;
132  bearing_sd = 0.0f;
133  advance = 0.0f;
134  advance_sd = 0.0f;
135 }
136 
137 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
138 // is emtpy.
139 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
140  return width == 0.0f || advance == 0.0f;
141 }
142 
143 // Expands the ranges with the ranges from the src properties.
144 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
145  const UNICHAR_PROPERTIES& src) {
146  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
147  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
148  UpdateRange(src.min_top, &min_top, &max_top);
149  UpdateRange(src.max_top, &min_top, &max_top);
150  if (src.width_sd > width_sd) {
151  width = src.width;
152  width_sd = src.width_sd;
153  }
154  if (src.bearing_sd > bearing_sd) {
155  bearing = src.bearing;
156  bearing_sd = src.bearing_sd;
157  }
158  if (src.advance_sd > advance_sd) {
159  advance = src.advance;
160  advance_sd = src.advance_sd;
161  }
162 }
163 
164 // Copies the properties from src into this.
165 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
166  // Apart from the fragment, everything else can be done with a default copy.
167  CHAR_FRAGMENT* saved_fragment = fragment;
168  *this = src; // Bitwise copy.
169  fragment = saved_fragment;
170 }
171 
173  unichars(NULL),
174  ids(),
175  size_used(0),
176  size_reserved(0),
177  script_table(NULL),
178  script_table_size_used(0),
179  null_script("NULL") {
180  clear();
181  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
183  if (i == UNICHAR_JOINED)
184  set_isngram(i, true);
185  }
186 }
187 
189  clear();
190 }
191 
192 void UNICHARSET::reserve(int unichars_number) {
193  if (unichars_number > size_reserved) {
194  UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
195  for (int i = 0; i < size_used; ++i)
196  unichars_new[i] = unichars[i];
197  for (int j = size_used; j < unichars_number; ++j) {
198  unichars_new[j].properties.script_id = add_script(null_script);
199  }
200  delete[] unichars;
201  unichars = unichars_new;
202  size_reserved = unichars_number;
203  }
204 }
205 
207 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
208  string cleaned =
209  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
210  return ids.contains(cleaned.data(), cleaned.size())
211  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
212  : INVALID_UNICHAR_ID;
213 }
214 
215 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
216  int length) const {
217  assert(length > 0 && length <= UNICHAR_LEN);
218  string cleaned(unichar_repr, length);
219  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
220  return ids.contains(cleaned.data(), cleaned.size())
221  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
222  : INVALID_UNICHAR_ID;
223 }
224 
225 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
226 // while leaving the rest of the string encodable. Returns 0 if the
227 // beginning of the string is not encodable.
228 // WARNING: this function now encodes the whole string for precision.
229 // Use encode_string in preference to repeatedly calling step.
230 int UNICHARSET::step(const char* str) const {
231  GenericVector<UNICHAR_ID> encoding;
232  GenericVector<char> lengths;
233  encode_string(str, true, &encoding, &lengths, NULL);
234  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
235  return lengths[0];
236 }
237 
238 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
239 // If not encodable, write the first byte offset which cannot be converted
240 // into the second (return) argument.
241 bool UNICHARSET::encodable_string(const char *str,
242  int *first_bad_position) const {
243  GenericVector<UNICHAR_ID> encoding;
244  return encode_string(str, true, &encoding, NULL, first_bad_position);
245 }
246 
247 // Encodes the given UTF-8 string with this UNICHARSET.
248 // Returns true if the encoding succeeds completely, false if there is at
249 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
250 // the rest of the string is still encoded.
251 // If lengths is not NULL, then it is filled with the corresponding
252 // byte length of each encoded UNICHAR_ID.
253 // WARNING: Caller must guarantee that str has already been cleaned of codes
254 // that do not belong in the unicharset, or encoding may fail.
255 // Use CleanupString to perform the cleaning.
256 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
257  GenericVector<UNICHAR_ID>* encoding,
258  GenericVector<char>* lengths,
259  int* encoded_length) const {
260  GenericVector<UNICHAR_ID> working_encoding;
261  GenericVector<char> working_lengths;
262  GenericVector<char> best_lengths;
263  encoding->truncate(0); // Just in case str is empty.
264  int str_length = strlen(str);
265  int str_pos = 0;
266  bool perfect = true;
267  while (str_pos < str_length) {
268  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
269  &str_pos, encoding, &best_lengths);
270  if (str_pos < str_length) {
271  // This is a non-match. Skip one utf-8 character.
272  perfect = false;
273  if (give_up_on_failure) break;
274  int step = UNICHAR::utf8_step(str + str_pos);
275  if (step == 0) step = 1;
276  encoding->push_back(INVALID_UNICHAR_ID);
277  best_lengths.push_back(step);
278  str_pos += step;
279  working_encoding = *encoding;
280  working_lengths = best_lengths;
281  }
282  }
283  if (lengths != NULL) *lengths = best_lengths;
284  if (encoded_length != NULL) *encoded_length = str_pos;
285  return perfect;
286 }
287 
288 const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
289  if (id == INVALID_UNICHAR_ID) {
290  return INVALID_UNICHAR;
291  }
292  ASSERT_HOST(id < this->size());
293  return unichars[id].representation;
294 }
295 
297  if (id == INVALID_UNICHAR_ID) {
298  return INVALID_UNICHAR;
299  }
300  ASSERT_HOST(id < this->size());
301  // Resolve from the kCustomLigatures table if this is a private encoding.
302  if (get_isprivate(id)) {
303  const char* ch = id_to_unichar(id);
304  for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
305  if (!strcmp(ch, kCustomLigatures[i][1])) {
306  return kCustomLigatures[i][0];
307  }
308  }
309  }
310  // Otherwise return the stored representation.
311  return unichars[id].representation;
312 }
313 
314 // Return a STRING that reformats the utf8 str into the str followed
315 // by its hex unicodes.
317  STRING result = str;
318  result += " [";
319  int step = 1;
320  // Chop into unicodes and code each as hex.
321  for (int i = 0; str[i] != '\0'; i += step) {
322  char hex[sizeof(int) * 2 + 1];
323  step = UNICHAR::utf8_step(str + i);
324  if (step == 0) {
325  step = 1;
326  sprintf(hex, "%x", str[i]);
327  } else {
328  UNICHAR ch(str + i, step);
329  sprintf(hex, "%x", ch.first_uni());
330  }
331  result += hex;
332  result += " ";
333  }
334  result += "]";
335  return result;
336 }
337 
338 // Return a STRING containing debug information on the unichar, including
339 // the id_to_unichar, its hex unicodes and the properties.
341  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
342  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
343  if (fragment) {
344  return fragment->to_string();
345  }
346  const char* str = id_to_unichar(id);
347  STRING result = debug_utf8_str(str);
348  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
349  if (get_isalpha(id)) {
350  if (get_islower(id))
351  result += "a";
352  else if (get_isupper(id))
353  result += "A";
354  else
355  result += "x";
356  }
357  // Append 0 if a digit.
358  if (get_isdigit(id)) {
359  result += "0";
360  }
361  // Append p is a punctuation symbol.
362  if (get_ispunctuation(id)) {
363  result += "p";
364  }
365  return result;
366 }
367 
368 // Sets the normed_ids vector from the normed string. normed_ids is not
369 // stored in the file, and needs to be set when the UNICHARSET is loaded.
371  unichars[unichar_id].properties.normed_ids.truncate(0);
372  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
373  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
374  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
375  true, &unichars[unichar_id].properties.normed_ids,
376  NULL, NULL)) {
377  unichars[unichar_id].properties.normed_ids.truncate(0);
378  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
379  }
380 }
381 
382 // Returns whether the unichar id represents a unicode value in the private use
383 // area. We use this range only internally to represent uncommon ligatures
384 // (eg. 'ct') that do not have regular unicode values.
385 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
386  UNICHAR uc(id_to_unichar(unichar_id), -1);
387  int uni = uc.first_uni();
388  return (uni >= 0xE000 && uni <= 0xF8FF);
389 }
390 
391 
392 // Sets all ranges to empty, so they can be expanded to set the values.
394  for (int id = 0; id < size_used; ++id) {
395  unichars[id].properties.SetRangesEmpty();
396  }
397 }
398 
399 // Sets all the properties for this unicharset given a src unicharset with
400 // everything set. The unicharsets don't have to be the same, and graphemes
401 // are correctly accounted for.
403  const UNICHARSET& src) {
404  for (int ch = start_index; ch < size_used; ++ch) {
405  const char* utf8 = id_to_unichar(ch);
406  UNICHAR_PROPERTIES properties;
407  if (src.GetStrProperties(utf8, &properties)) {
408  // Setup the script_id, other_case, and mirror properly.
409  const char* script = src.get_script_from_script_id(properties.script_id);
410  properties.script_id = add_script(script);
411  const char* other_case = src.id_to_unichar(properties.other_case);
412  if (contains_unichar(other_case)) {
413  properties.other_case = unichar_to_id(other_case);
414  } else {
415  properties.other_case = ch;
416  }
417  const char* mirror_str = src.id_to_unichar(properties.mirror);
418  if (contains_unichar(mirror_str)) {
419  properties.mirror = unichar_to_id(mirror_str);
420  } else {
421  properties.mirror = ch;
422  }
423  unichars[ch].properties.CopyFrom(properties);
424  set_normed_ids(ch);
425  }
426  }
427 }
428 
429 // Expands the tops and bottoms and widths for this unicharset given a
430 // src unicharset with ranges in it. The unicharsets don't have to be the
431 // same, and graphemes are correctly accounted for.
433  for (int ch = 0; ch < size_used; ++ch) {
434  const char* utf8 = id_to_unichar(ch);
435  UNICHAR_PROPERTIES properties;
436  if (src.GetStrProperties(utf8, &properties)) {
437  // Expand just the ranges from properties.
438  unichars[ch].properties.ExpandRangesFrom(properties);
439  }
440  }
441 }
442 
443 // Makes this a copy of src. Clears this completely first, so the automatic
444 // ids will not be present in this if not in src. Does NOT reorder the set!
446  clear();
447  for (int ch = 0; ch < src.size_used; ++ch) {
448  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
449  const char* utf8 = src.id_to_unichar(ch);
451  unichars[ch].properties.ExpandRangesFrom(src_props);
452  }
453  // Set properties, including mirror and other_case, WITHOUT reordering
454  // the unicharset.
456 }
457 
458 // For each id in src, if it does not occur in this, add it, as in
459 // SetPropertiesFromOther, otherwise expand the ranges, as in
460 // ExpandRangesFromOther.
462  int initial_used = size_used;
463  for (int ch = 0; ch < src.size_used; ++ch) {
464  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
465  const char* utf8 = src.id_to_unichar(ch);
466  int id = size_used;
467  if (contains_unichar(utf8)) {
468  id = unichar_to_id(utf8);
469  // Just expand current ranges.
470  unichars[id].properties.ExpandRangesFrom(src_props);
471  } else {
473  unichars[id].properties.SetRangesEmpty();
474  }
475  }
476  // Set properties, including mirror and other_case, WITHOUT reordering
477  // the unicharset.
478  PartialSetPropertiesFromOther(initial_used, src);
479 }
480 
481 // Returns true if the acceptable ranges of the tops of the characters do
482 // not overlap, making their x-height calculations distinct.
484  int overlap = MIN(unichars[id1].properties.max_top,
485  unichars[id2].properties.max_top) -
486  MAX(unichars[id1].properties.min_top,
487  unichars[id2].properties.min_top);
488  return overlap <= 0;
489 }
490 
491 // Internal recursive version of encode_string above.
492 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
493 // each UNICHAR_ID uses the least possible part of the utf8 str.
494 // It does this by depth-first tail recursion on increasing length matches
495 // to the UNICHARSET, saving the first encountered result that encodes the
496 // maximum total length of str. It stops on a failure to encode to make
497 // the overall process of encoding a partially failed string more efficient.
498 // See unicharset.h for definition of the args.
499 void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
500  GenericVector<UNICHAR_ID>* encoding,
501  GenericVector<char>* lengths,
502  int* best_total_length,
503  GenericVector<UNICHAR_ID>* best_encoding,
504  GenericVector<char>* best_lengths) const {
505  if (str_index > *best_total_length) {
506  // This is the best result so far.
507  *best_total_length = str_index;
508  *best_encoding = *encoding;
509  if (best_lengths != NULL)
510  *best_lengths = *lengths;
511  }
512  if (str_index == str_length) return;
513  int encoding_index = encoding->size();
514  // Find the length of the first matching unicharset member.
515  int length = ids.minmatch(str + str_index);
516  if (length == 0 || str_index + length > str_length) return;
517  do {
518  if (ids.contains(str + str_index, length)) {
519  // Successful encoding so far.
520  UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
521  encoding->push_back(id);
522  lengths->push_back(length);
523  encode_string(str, str_index + length, str_length, encoding, lengths,
524  best_total_length, best_encoding, best_lengths);
525  if (*best_total_length == str_length)
526  return; // Tail recursion success!
527  // Failed with that length, truncate back and try again.
528  encoding->truncate(encoding_index);
529  lengths->truncate(encoding_index);
530  }
531  int step = UNICHAR::utf8_step(str + str_index + length);
532  if (step == 0) step = 1;
533  length += step;
534  } while (length <= UNICHAR_LEN && str_index + length <= str_length);
535 }
536 
537 // Gets the properties for a grapheme string, combining properties for
538 // multiple characters in a meaningful way where possible.
539 // Returns false if no valid match was found in the unicharset.
540 // NOTE that script_id, mirror, and other_case refer to this unicharset on
541 // return and will need translation if the target unicharset is different.
542 bool UNICHARSET::GetStrProperties(const char* utf8_str,
543  UNICHAR_PROPERTIES* props) const {
544  props->Init();
545  props->SetRangesEmpty();
546  int total_unicodes = 0;
547  GenericVector<UNICHAR_ID> encoding;
548  if (!encode_string(utf8_str, true, &encoding, NULL, NULL))
549  return false; // Some part was invalid.
550  for (int i = 0; i < encoding.size(); ++i) {
551  int id = encoding[i];
552  const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
553  // Logical OR all the bools.
554  if (src_props.isalpha) props->isalpha = true;
555  if (src_props.islower) props->islower = true;
556  if (src_props.isupper) props->isupper = true;
557  if (src_props.isdigit) props->isdigit = true;
558  if (src_props.ispunctuation) props->ispunctuation = true;
559  if (src_props.isngram) props->isngram = true;
560  if (src_props.enabled) props->enabled = true;
561  // Min/max the tops/bottoms.
562  UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
563  UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
564  UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
565  UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
566  float bearing = props->advance + src_props.bearing;
567  if (total_unicodes == 0 || bearing < props->bearing) {
568  props->bearing = bearing;
569  props->bearing_sd = props->advance_sd + src_props.bearing_sd;
570  }
571  props->advance += src_props.advance;
572  props->advance_sd += src_props.advance_sd;
573  // With a single width, just use the widths stored in the unicharset.
574  props->width = src_props.width;
575  props->width_sd = src_props.width_sd;
576  // Use the first script id, other_case, mirror, direction.
577  // Note that these will need translation, except direction.
578  if (total_unicodes == 0) {
579  props->script_id = src_props.script_id;
580  props->other_case = src_props.other_case;
581  props->mirror = src_props.mirror;
582  props->direction = src_props.direction;
583  }
584  // The normed string for the compound character is the concatenation of
585  // the normed versions of the individual characters.
586  props->normed += src_props.normed;
587  ++total_unicodes;
588  }
589  if (total_unicodes > 1) {
590  // Estimate the total widths from the advance - bearing.
591  props->width = props->advance - props->bearing;
592  props->width_sd = props->advance_sd + props->bearing_sd;
593  }
594  return total_unicodes > 0;
595 }
596 
597 // TODO(rays) clean-up the order of functions to match unicharset.h.
598 
599 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
600  unsigned int properties = 0;
601  if (this->get_isalpha(id))
602  properties |= ISALPHA_MASK;
603  if (this->get_islower(id))
604  properties |= ISLOWER_MASK;
605  if (this->get_isupper(id))
606  properties |= ISUPPER_MASK;
607  if (this->get_isdigit(id))
608  properties |= ISDIGIT_MASK;
609  if (this->get_ispunctuation(id))
610  properties |= ISPUNCTUATION_MASK;
611  return properties;
612 }
613 
615  if (this->get_isupper(id)) return 'A';
616  if (this->get_islower(id)) return 'a';
617  if (this->get_isalpha(id)) return 'x';
618  if (this->get_isdigit(id)) return '0';
619  if (this->get_ispunctuation(id)) return 'p';
620  return 0;
621 }
622 
623 void UNICHARSET::unichar_insert(const char* const unichar_repr,
624  OldUncleanUnichars old_style) {
625  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
626  string cleaned =
627  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
628  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
629  const char* str = cleaned.c_str();
630  GenericVector<int> encoding;
631  if (!old_style_included_ &&
632  encode_string(str, true, &encoding, nullptr, nullptr))
633  return;
634  if (size_used == size_reserved) {
635  if (size_used == 0)
636  reserve(8);
637  else
638  reserve(2 * size_used);
639  }
640  int index = 0;
641  do {
642  if (index > UNICHAR_LEN) {
643  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
644  unichar_repr);
645  return;
646  }
647  unichars[size_used].representation[index++] = *str++;
648  } while (*str != '\0');
649  unichars[size_used].representation[index] = '\0';
650  this->set_script(size_used, null_script);
651  // If the given unichar_repr represents a fragmented character, set
652  // fragment property to a pointer to CHAR_FRAGMENT class instance with
653  // information parsed from the unichar representation. Use the script
654  // of the base unichar for the fragmented character if possible.
655  CHAR_FRAGMENT* frag =
656  CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
657  this->unichars[size_used].properties.fragment = frag;
658  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
659  this->unichars[size_used].properties.script_id =
660  this->get_script(frag->get_unichar());
661  }
662  this->unichars[size_used].properties.enabled = true;
663  ids.insert(unichars[size_used].representation, size_used);
664  ++size_used;
665  }
666 }
667 
668 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
669  string cleaned =
670  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
671  return ids.contains(cleaned.data(), cleaned.size());
672 }
673 
674 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
675  int length) const {
676  if (length == 0) {
677  return false;
678  }
679  string cleaned(unichar_repr, length);
680  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
681  return ids.contains(cleaned.data(), cleaned.size());
682 }
683 
684 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
685  const char* const unichar_repr) const {
686  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
687 }
688 
690  const int kFileBufSize = 1024;
691  char buffer[kFileBufSize + 1];
692  snprintf(buffer, kFileBufSize, "%d\n", this->size());
693  *str = buffer;
694  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
695  int min_bottom, max_bottom, min_top, max_top;
696  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
697  float width, width_sd;
698  get_width_stats(id, &width, &width_sd);
699  float bearing, bearing_sd;
700  get_bearing_stats(id, &bearing, &bearing_sd);
701  float advance, advance_sd;
702  get_advance_stats(id, &advance, &advance_sd);
703  unsigned int properties = this->get_properties(id);
704  if (strcmp(this->id_to_unichar(id), " ") == 0) {
705  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
706  this->get_script_from_script_id(this->get_script(id)),
707  this->get_other_case(id));
708  } else {
709  snprintf(buffer, kFileBufSize,
710  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
711  this->id_to_unichar(id), properties,
712  min_bottom, max_bottom, min_top, max_top, width, width_sd,
713  bearing, bearing_sd, advance, advance_sd,
714  this->get_script_from_script_id(this->get_script(id)),
715  this->get_other_case(id), this->get_direction(id),
716  this->get_mirror(id), this->get_normed_unichar(id),
717  this->debug_str(id).string());
718  }
719  *str += buffer;
720  }
721  return true;
722 }
723 
724 // TODO(rays) Replace with TFile everywhere.
726  public:
727  InMemoryFilePointer(const char *memory, int mem_size)
728  : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
729 
730  char *fgets(char *orig_dst, int size) {
731  const char *src_end = memory_ + mem_size_;
732  char *dst_end = orig_dst + size - 1;
733  if (size < 1) {
734  return fgets_ptr_ < src_end ? orig_dst : NULL;
735  }
736 
737  char *dst = orig_dst;
738  char ch = '^';
739  while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
740  ch = *dst++ = *fgets_ptr_++;
741  }
742  *dst = 0;
743  return (dst == orig_dst) ? NULL : orig_dst;
744  }
745 
746  private:
747  const char *memory_;
748  const char *fgets_ptr_;
749  const int mem_size_;
750 };
751 
753  const char *memory, int mem_size, bool skip_fragments) {
754  InMemoryFilePointer mem_fp(memory, mem_size);
757  bool success = load_via_fgets(fgets_cb, skip_fragments);
758  delete fgets_cb;
759  return success;
760 }
761 
763  public:
764  LocalFilePointer(FILE *stream) : fp_(stream) {}
765  char *fgets(char *dst, int size) {
766  return ::fgets(dst, size, fp_);
767  }
768  private:
769  FILE *fp_;
770 };
771 
772 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
773  LocalFilePointer lfp(file);
776  bool success = load_via_fgets(fgets_cb, skip_fragments);
777  delete fgets_cb;
778  return success;
779 }
780 
781 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
784  bool success = load_via_fgets(fgets_cb, skip_fragments);
785  delete fgets_cb;
786  return success;
787 }
788 
789 bool UNICHARSET::load_via_fgets(
791  bool skip_fragments) {
792  int unicharset_size;
793  char buffer[256];
794 
795  this->clear();
796  if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
797  sscanf(buffer, "%d", &unicharset_size) != 1) {
798  return false;
799  }
800  this->reserve(unicharset_size);
801  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
802  char unichar[256];
803  unsigned int properties;
804  char script[64];
805 
806  strcpy(script, null_script);
807  int min_bottom = 0;
808  int max_bottom = MAX_UINT8;
809  int min_top = 0;
810  int max_top = MAX_UINT8;
811  float width = 0.0f;
812  float width_sd = 0.0f;
813  float bearing = 0.0f;
814  float bearing_sd = 0.0f;
815  float advance = 0.0f;
816  float advance_sd = 0.0f;
817  // TODO(eger): check that this default it ok
818  // after enabling BiDi iterator for Arabic+Cube.
820  UNICHAR_ID other_case = id;
821  UNICHAR_ID mirror = id;
822  char normed[64];
823  int v = -1;
824  if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
825  ((v = sscanf(buffer,
826  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
827  unichar, &properties,
828  &min_bottom, &max_bottom, &min_top, &max_top,
829  &width, &width_sd, &bearing, &bearing_sd,
830  &advance, &advance_sd, script, &other_case,
831  &direction, &mirror, normed)) != 17 &&
832  (v = sscanf(buffer,
833  "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
834  unichar, &properties,
835  &min_bottom, &max_bottom, &min_top, &max_top,
836  &width, &width_sd, &bearing, &bearing_sd,
837  &advance, &advance_sd, script, &other_case,
838  &direction, &mirror)) != 16 &&
839  (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
840  unichar, &properties,
841  &min_bottom, &max_bottom, &min_top, &max_top,
842  script, &other_case, &direction, &mirror)) != 10 &&
843  (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
844  &min_bottom, &max_bottom, &min_top, &max_top,
845  script, &other_case)) != 8 &&
846  (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
847  script, &other_case)) != 4 &&
848  (v = sscanf(buffer, "%s %x %63s",
849  unichar, &properties, script)) != 3 &&
850  (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
851  return false;
852  }
853 
854  // Skip fragments if needed.
855  CHAR_FRAGMENT *frag = NULL;
856  if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
857  int num_pieces = frag->get_total();
858  delete frag;
859  // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
860  if (num_pieces > 1)
861  continue;
862  }
863  // Insert unichar into unicharset and set its properties.
864  if (strcmp(unichar, "NULL") == 0)
865  this->unichar_insert(" ");
866  else
868 
869  this->set_isalpha(id, properties & ISALPHA_MASK);
870  this->set_islower(id, properties & ISLOWER_MASK);
871  this->set_isupper(id, properties & ISUPPER_MASK);
872  this->set_isdigit(id, properties & ISDIGIT_MASK);
873  this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
874  this->set_isngram(id, false);
875  this->set_script(id, script);
876  this->unichars[id].properties.enabled = true;
877  this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
878  this->set_width_stats(id, width, width_sd);
879  this->set_bearing_stats(id, bearing, bearing_sd);
880  this->set_advance_stats(id, advance, advance_sd);
881  this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
882  this->set_other_case(
883  id, (v > 3 && other_case < unicharset_size) ? other_case : id);
884  this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
885  this->set_normed(id, (v>16) ? normed : unichar);
886  }
887  post_load_setup();
888  return true;
889 }
890 
891 // Sets up internal data after loading the file, based on the char
892 // properties. Called from load_from_file, but also needs to be run
893 // during set_unicharset_properties.
895  // Number of alpha chars with the case property minus those without,
896  // in order to determine that half the alpha chars have case.
897  int net_case_alphas = 0;
898  int x_height_alphas = 0;
899  int cap_height_alphas = 0;
900  top_bottom_set_ = false;
901  for (UNICHAR_ID id = 0; id < size_used; ++id) {
902  int min_bottom = 0;
903  int max_bottom = MAX_UINT8;
904  int min_top = 0;
905  int max_top = MAX_UINT8;
906  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
907  if (min_top > 0)
908  top_bottom_set_ = true;
909  if (get_isalpha(id)) {
910  if (get_islower(id) || get_isupper(id))
911  ++net_case_alphas;
912  else
913  --net_case_alphas;
914  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
915  ++x_height_alphas;
916  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
917  ++cap_height_alphas;
918  }
919  set_normed_ids(id);
920  }
921 
922  script_has_upper_lower_ = net_case_alphas > 0;
923  script_has_xheight_ = script_has_upper_lower_ ||
924  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
925  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
926 
927  null_sid_ = get_script_id_from_name(null_script);
928  ASSERT_HOST(null_sid_ == 0);
929  common_sid_ = get_script_id_from_name("Common");
930  latin_sid_ = get_script_id_from_name("Latin");
931  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
932  greek_sid_ = get_script_id_from_name("Greek");
933  han_sid_ = get_script_id_from_name("Han");
934  hiragana_sid_ = get_script_id_from_name("Hiragana");
935  katakana_sid_ = get_script_id_from_name("Katakana");
936  thai_sid_ = get_script_id_from_name("Thai");
937  hangul_sid_ = get_script_id_from_name("Hangul");
938 
939  // Compute default script. Use the highest-counting alpha script, that is
940  // not the common script, as that still contains some "alphas".
941  int* script_counts = new int[script_table_size_used];
942  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
943  for (int id = 0; id < size_used; ++id) {
944  if (get_isalpha(id)) {
945  ++script_counts[get_script(id)];
946  }
947  }
948  default_sid_ = 0;
949  for (int s = 1; s < script_table_size_used; ++s) {
950  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
951  default_sid_ = s;
952  }
953  delete [] script_counts;
954 }
955 
956 // Returns true if right_to_left scripts are significant in the unicharset,
957 // but without being so sensitive that "universal" unicharsets containing
958 // characters from many scripts, like orientation and script detection,
959 // look like they are right_to_left.
961  int ltr_count = 0;
962  int rtl_count = 0;
963  for (int id = 0; id < size_used; ++id) {
964  int dir = get_direction(id);
965  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
966  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
968  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
969  }
970  return rtl_count > ltr_count;
971 }
972 
973 // Set a whitelist and/or blacklist of characters to recognize.
974 // An empty or NULL whitelist enables everything (minus any blacklist).
975 // An empty or NULL blacklist disables nothing.
976 // An empty or NULL blacklist has no effect.
977 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
978  const char* whitelist,
979  const char* unblacklist) {
980  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
981  // Set everything to default
982  for (int ch = 0; ch < size_used; ++ch)
983  unichars[ch].properties.enabled = def_enabled;
984  if (!def_enabled) {
985  // Enable the whitelist.
986  GenericVector<UNICHAR_ID> encoding;
987  encode_string(whitelist, false, &encoding, NULL, NULL);
988  for (int i = 0; i < encoding.size(); ++i) {
989  if (encoding[i] != INVALID_UNICHAR_ID)
990  unichars[encoding[i]].properties.enabled = true;
991  }
992  }
993  if (blacklist != NULL && blacklist[0] != '\0') {
994  // Disable the blacklist.
995  GenericVector<UNICHAR_ID> encoding;
996  encode_string(blacklist, false, &encoding, NULL, NULL);
997  for (int i = 0; i < encoding.size(); ++i) {
998  if (encoding[i] != INVALID_UNICHAR_ID)
999  unichars[encoding[i]].properties.enabled = false;
1000  }
1001  }
1002  if (unblacklist != NULL && unblacklist[0] != '\0') {
1003  // Re-enable the unblacklist.
1004  GenericVector<UNICHAR_ID> encoding;
1005  encode_string(unblacklist, false, &encoding, NULL, NULL);
1006  for (int i = 0; i < encoding.size(); ++i) {
1007  if (encoding[i] != INVALID_UNICHAR_ID)
1008  unichars[encoding[i]].properties.enabled = true;
1009  }
1010  }
1011 }
1012 
1013 // Returns true if there are any repeated unicodes in the normalized
1014 // text of any unichar-id in the unicharset.
1016  int start_id = 0;
1018  for (int id = start_id; id < size_used; ++id) {
1019  // Convert to unicodes.
1020  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1021  for (int u = 1; u < unicodes.size(); ++u) {
1022  if (unicodes[u - 1] == unicodes[u]) return true;
1023  }
1024  }
1025  return false;
1026 }
1027 
1028 int UNICHARSET::add_script(const char* script) {
1029  for (int i = 0; i < script_table_size_used; ++i) {
1030  if (strcmp(script, script_table[i]) == 0)
1031  return i;
1032  }
1033  if (script_table_size_reserved == 0) {
1034  script_table_size_reserved = 8;
1035  script_table = new char*[script_table_size_reserved];
1036  } else if (script_table_size_used >= script_table_size_reserved) {
1037  assert(script_table_size_used == script_table_size_reserved);
1038  script_table_size_reserved += script_table_size_reserved;
1039  char** new_script_table = new char*[script_table_size_reserved];
1040  memcpy(new_script_table, script_table,
1041  script_table_size_used * sizeof(char*));
1042  delete[] script_table;
1043  script_table = new_script_table;
1044  }
1045  script_table[script_table_size_used] = new char[strlen(script) + 1];
1046  strcpy(script_table[script_table_size_used], script);
1047  return script_table_size_used++;
1048 }
1049 
1050 // Returns the string that represents a fragment
1051 // with the given unichar, pos and total.
1052 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1053  bool natural) {
1054  if (total == 1) return STRING(unichar);
1055  STRING result = "";
1056  result += kSeparator;
1057  result += unichar;
1058  char buffer[kMaxLen];
1059  snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1060  natural ? kNaturalFlag : kSeparator, total);
1061  result += buffer;
1062  return result;
1063 }
1064 
1066  const char *ptr = string;
1067  int len = strlen(string);
1068  if (len < kMinLen || *ptr != kSeparator) {
1069  return NULL; // this string can not represent a fragment
1070  }
1071  ptr++; // move to the next character
1072  int step = 0;
1073  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1074  step += UNICHAR::utf8_step(ptr + step);
1075  }
1076  if (step == 0 || step > UNICHAR_LEN) {
1077  return NULL; // no character for unichar or the character is too long
1078  }
1079  char unichar[UNICHAR_LEN + 1];
1080  strncpy(unichar, ptr, step);
1081  unichar[step] = '\0'; // null terminate unichar
1082  ptr += step; // move to the next fragment separator
1083  int pos = 0;
1084  int total = 0;
1085  bool natural = false;
1086  char *end_ptr = NULL;
1087  for (int i = 0; i < 2; i++) {
1088  if (ptr > string + len || *ptr != kSeparator) {
1089  if (i == 1 && *ptr == kNaturalFlag)
1090  natural = true;
1091  else
1092  return NULL; // Failed to parse fragment representation.
1093  }
1094  ptr++; // move to the next character
1095  i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1096  : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1097  ptr = end_ptr;
1098  }
1099  if (ptr != string + len) {
1100  return NULL; // malformed fragment representation
1101  }
1102  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
1103  fragment->set_all(unichar, pos, total, natural);
1104  return fragment;
1105 }
1106 
1107 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
1108  for (int i = 0; i < script_table_size_used; ++i) {
1109  if (strcmp(script_name, script_table[i]) == 0)
1110  return i;
1111  }
1112  return 0; // 0 is always the null_script
1113 }
1114 
1115 // Removes/replaces content that belongs in rendered text, but not in the
1116 // unicharset.
1117 /* static */
1118 string UNICHARSET::CleanupString(const char* utf8_str, int length) {
1119  string result;
1120  result.reserve(length);
1121  char ch;
1122  while ((ch = *utf8_str) != '\0' && --length >= 0) {
1123  int key_index = 0;
1124  const char* key;
1125  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1126  int match = 0;
1127  while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1128  if (key[match] == '\0') {
1129  utf8_str += match;
1130  break;
1131  }
1132  ++key_index;
1133  }
1134  if (key == nullptr) {
1135  result.push_back(ch);
1136  ++utf8_str;
1137  } else {
1138  result.append(kCleanupMaps[key_index][1]);
1139  }
1140  }
1141  return result;
1142 }
int get_script_id_from_name(const char *script_name) const
const double kMinCapHeightFraction
Definition: unicharset.cpp:58
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:450
bool empty() const
Definition: genericvector.h:91
#define MIN(x, y)
Definition: ndminx.h:28
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:154
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:296
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:622
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:435
static CHAR_FRAGMENT * parse_from_string(const char *str)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:567
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:481
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:471
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:86
bool AnyRepeatedUnicodes() const
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:445
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:37
int get_total() const
Definition: unicharset.h:73
void set_ranges_empty()
Definition: unicharset.cpp:393
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:599
#define MAX(x, y)
Definition: ndminx.h:24
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:639
virtual R Run(A1, A2)=0
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:432
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:853
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:461
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
LocalFilePointer(FILE *stream)
Definition: unicharset.cpp:764
int size() const
Definition: genericvector.h:72
const double kMinXHeightFraction
Definition: unicharset.cpp:57
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:385
#define MAX_UINT8
Definition: host.h:63
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:629
OldUncleanUnichars
Definition: unicharset.h:44
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:455
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:614
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
void truncate(int size)
STRING to_string() const
Definition: unicharset.h:80
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int size() const
Definition: unicharset.h:338
signed int char32
Definition: unichar.h:52
int push_back(T object)
bool has_special_codes() const
Definition: unicharset.h:721
static STRING to_string(const char *unichar, int pos, int total, bool natural)
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:752
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:440
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:827
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:132
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:689
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
int add_script(const char *script)
char * fgets(char *dst, int size)
Definition: unicharset.cpp:765
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:59
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:595
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:241
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:476
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:370
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
Definition: strngs.h:45
InMemoryFilePointer(const char *memory, int mem_size)
Definition: unicharset.cpp:727
bool major_right_to_left() const
Definition: unicharset.cpp:960
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:264
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:316
void post_load_setup()
Definition: unicharset.cpp:894
#define UNICHAR_LEN
Definition: unichar.h:31
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
#define ASSERT_HOST(x)
Definition: errcode.h:84
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:445
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:340
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:483
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:689
const char * get_unichar() const
Definition: unicharset.h:71
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:387
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:151
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:461
void clear()
Definition: unicharset.h:303
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:977
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:684
int first_uni() const
Definition: unichar.cpp:99
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:430
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:466
void reserve(int unichars_number)
Definition: unicharset.cpp:192
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:103
int step(const char *str) const
Definition: unicharset.cpp:230
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:730
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:402
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:581
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:696
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:612
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:606