20# include "config_auto.h"
// Display strings naming the word permuter types, one constant per type.
// The kPermuterTypeNames table that follows refers to these strings, and that
// table is what gets indexed by a permuter value to obtain a printable name.
53static const char kPermuterTypeNoPerm[] =
"None";
54static const char kPermuterTypePuncPerm[] =
"Punctuation";
55static const char kPermuterTypeTopPerm[] =
"Top Choice";
56static const char kPermuterTypeLowerPerm[] =
"Top Lower Case";
57static const char kPermuterTypeUpperPerm[] =
"Top Upper Case";
58static const char kPermuterTypeNgramPerm[] =
"Ngram";
59static const char kPermuterTypeNumberPerm[] =
"Number";
60static const char kPermuterTypeUserPatPerm[] =
"User Pattern";
61static const char kPermuterTypeSysDawgPerm[] =
"System Dictionary";
62static const char kPermuterTypeDocDawgPerm[] =
"Document Dictionary";
63static const char kPermuterTypeUserDawgPerm[] =
"User Dictionary";
64static const char kPermuterTypeFreqDawgPerm[] =
"Frequent Words Dictionary";
65static const char kPermuterTypeCompoundPerm[] =
"Compound";
67static const char *
const kPermuterTypeNames[] = {
69 kPermuterTypePuncPerm,
71 kPermuterTypeLowerPerm,
72 kPermuterTypeUpperPerm,
73 kPermuterTypeNgramPerm,
74 kPermuterTypeNumberPerm,
75 kPermuterTypeUserPatPerm,
76 kPermuterTypeSysDawgPerm,
77 kPermuterTypeDocDawgPerm,
78 kPermuterTypeUserDawgPerm,
79 kPermuterTypeFreqDawgPerm,
80 kPermuterTypeCompoundPerm
96 unichar_id_ = src_unichar_id;
98 certainty_ = src_cert;
101 script_id_ = src_script_id;
120 matrix_cell_ = other.matrix_cell_;
121 min_xheight_ = other.min_xheight_;
122 max_xheight_ = other.max_xheight_;
124 classifier_ = other.classifier_;
125#ifndef DISABLED_LEGACY_ENGINE
126 fonts_ = other.fonts_;
139 matrix_cell_ = other.matrix_cell_;
140 min_xheight_ = other.min_xheight_;
141 max_xheight_ = other.max_xheight_;
143 classifier_ = other.classifier_;
144#ifndef DISABLED_LEGACY_ENGINE
145 fonts_ = other.fonts_;
153 double baseline_diff = std::fabs(
yshift() - other.
yshift());
156 tprintf(
"Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_);
166 overlap /= denominator;
168 tprintf(
"PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_,
169 other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap);
179 BLOB_CHOICE_IT choice_it(bc_list);
180 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
190 return kPermuterTypeNames[
permuter];
194 switch (script_pos) {
214 : unicharset_(&unicharset) {
215 std::vector<UNICHAR_ID> encoding;
216 std::vector<char> lengths;
219 lengths.push_back(
'\0');
220 std::string src_lengths = &lengths[0];
221 this->
init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0,
NO_PERM);
239 float src_certainty, uint8_t src_permuter) {
240 int src_string_len = strlen(src_string);
241 if (src_string_len == 0) {
244 this->
init(src_lengths ? strlen(src_lengths) : src_string_len);
247 for (
unsigned i = 0;
i < length_; ++
i) {
248 int unichar_length = src_lengths ? src_lengths[
i] : 1;
249 unichar_ids_[
i] = unicharset_->
unichar_to_id(src_string + offset, unichar_length);
251 certainties_[
i] = src_certainty;
252 offset += unichar_length;
255 adjust_factor_ = 1.0f;
256 rating_ = src_rating;
257 certainty_ = src_certainty;
258 permuter_ = src_permuter;
259 dangerous_ambig_found_ =
false;
268 return kPermuterTypeNames[permuter_];
276 BLOB_CHOICE_LIST *result = ratings->
get(coord.
col, coord.
row);
277 if (result ==
nullptr) {
278 result =
new BLOB_CHOICE_LIST;
279 ratings->
put(coord.
col, coord.
row, result);
288 for (
unsigned i = 0;
i < index; ++
i) {
291 int row = col + state_[index] - 1;
298 unichar_ids_[index] = blob_choice->
unichar_id();
300 state_[index] = blob_count;
301 certainties_[index] = blob_choice->
certainty();
310 for (
unsigned i = 0;
i < length_; ++
i) {
328 for (
int i = 0;
i < num; ++
i) {
330 state_[start - 1] += state_[start +
i];
331 }
else if (start + num < length_) {
332 state_[start + num] += state_[start +
i];
335 for (
unsigned i = start;
i + num < length_; ++
i) {
336 unichar_ids_[
i] = unichar_ids_[
i + num];
337 script_pos_[
i] = script_pos_[
i + num];
338 state_[
i] = state_[
i + num];
339 certainties_[
i] = certainties_[
i + num];
350 for (
unsigned i = 0;
i < length_ / 2; ++
i) {
352 unichar_ids_[
i] = unicharset_->
get_mirror(unichar_ids_[length_ - 1 -
i]);
353 unichar_ids_[length_ - 1 -
i] = unicharset_->
get_mirror(tmp_id);
355 if (length_ % 2 != 0) {
356 unichar_ids_[length_ / 2] = unicharset_->
get_mirror(unichar_ids_[length_ / 2]);
380 while (end > 0 && unicharset_->
get_isdigit(unichar_ids_[end - 1]) &&
385 while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&
400 for (
auto i = start;
i < end;
i++) {
412 for (
unsigned i = 0;
i < length_; ++
i) {
429 if (word_lengths_str !=
nullptr) {
430 *word_lengths_str =
"";
432 for (
unsigned i = 0;
i < length_; ++
i) {
435 if (word_lengths_str !=
nullptr) {
436 *word_lengths_str += (char)strlen(
ch);
449 if (length_ == reserved_) {
464 while (reserved_ < length_ + second.
length()) {
467 const std::vector<UNICHAR_ID> &other_unichar_ids = second.
unichar_ids();
468 for (
unsigned i = 0;
i < second.
length(); ++
i) {
469 unichar_ids_[length_ +
i] = other_unichar_ids[
i];
470 state_[length_ +
i] = second.state_[
i];
471 certainties_[length_ +
i] = second.certainties_[
i];
474 length_ += second.
length();
475 if (second.adjust_factor_ > adjust_factor_) {
476 adjust_factor_ = second.adjust_factor_;
478 rating_ += second.
rating();
482 if (second.dangerous_ambig_found_) {
483 dangerous_ambig_found_ =
true;
500 while (reserved_ < source.
length()) {
504 unicharset_ = source.unicharset_;
505 const std::vector<UNICHAR_ID> &other_unichar_ids = source.
unichar_ids();
506 for (
unsigned i = 0;
i < source.
length(); ++
i) {
507 unichar_ids_[
i] = other_unichar_ids[
i];
508 state_[
i] = source.state_[
i];
509 certainties_[
i] = source.certainties_[
i];
512 length_ = source.
length();
513 adjust_factor_ = source.adjust_factor_;
514 rating_ = source.
rating();
519 dangerous_ambig_found_ = source.dangerous_ambig_found_;
530 for (
unsigned i = 0;
i < length_; ++
i) {
537 unsigned position_counts[4] = {0, 0, 0, 0};
540 for (
unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
544 if (!state_.empty()) {
545 for (
int i = 1;
i < state_[blob_index]; ++
i) {
547 tblob = word->
blobs[chunk_index];
551 script_pos_[blob_index] =
ScriptPositionOf(
false, *unicharset_, blob_box, uni_id);
555 position_counts[script_pos_[blob_index]]++;
563 "Most characters of %s are subscript or superscript.\n"
564 "That seems wrong, so I'll assume we got the baseline wrong\n",
567 for (
unsigned i = 0;
i < length_;
i++) {
571 position_counts[sp]--;
581 for (
unsigned blob_index = 0; blob_index < length_; ++blob_index) {
586 chunk_index += state_.empty() ? 1 : state_[blob_index];
593 for (
unsigned i = 0;
i < length_; ++
i) {
594 script_pos_[
i] = position;
602 int top = blob_box.
top();
603 int bottom = blob_box.
bottom();
604 int min_bottom, max_bottom, min_top, max_top;
612 }
else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
614 }
else if (bottom > sup_thresh_bot) {
621 "%s Character %s[bot:%d top: %d] "
622 "bot_range[%d,%d] top_range[%d, %d] "
623 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
625 max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot);
633 std::vector<unsigned> sid(max_script);
634 for (
unsigned x = 0;
x < length_; ++
x) {
651 unsigned max_sid = 0;
652 for (
unsigned x = 1;
x < max_script;
x++) {
653 if (sid[
x] >= sid[max_sid]) {
657 if (sid[max_sid] < length_ / 2) {
665 int total_chunks = 0;
666 for (
unsigned i = 0;
i < length_; ++
i) {
667 total_chunks += state_[
i];
668 if (total_chunks > blob_position) {
677 unsigned total_chunks = 0;
678 for (
unsigned i = 0;
i < length_; ++
i) {
679 total_chunks += state_[
i];
691 for (
unsigned i = 0;
i < length_; ++
i) {
694 tprintf(
" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
695 adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
697 for (
unsigned i = 0;
i < length_; ++
i) {
701 for (
unsigned i = 0;
i < length_; ++
i) {
705 for (
unsigned i = 0;
i < length_; ++
i) {
709 for (
unsigned i = 0;
i < length_; ++
i) {
718 for (
unsigned i = 0;
i < length_; ++
i) {
724#ifndef GRAPHICS_DISABLED
730 const int kNumColors = 6;
733 static std::vector<int> prev_drawn_state;
734 bool already_done = prev_drawn_state.size() == length_;
736 prev_drawn_state.clear();
737 prev_drawn_state.resize(length_);
739 for (
unsigned i = 0;
i < length_; ++
i) {
740 if (prev_drawn_state[
i] != state_[
i]) {
741 already_done =
false;
743 prev_drawn_state[
i] = state_[
i];
745 if (already_done || word->
blobs.empty()) {
750 if (segm_window ==
nullptr) {
751 segm_window =
new ScrollView(
"Segmentation", 5, 10, 500, 256, 2000.0, 256.0,
true);
753 segm_window->
Clear();
758 for (
unsigned c = 0; c < length_; ++c) {
760 for (
int i = 0;
i < state_[c]; ++
i, ++blob_index) {
763 blob->
plot(segm_window, color, color);
778 unsigned w1start, w1end;
780 unsigned w2start, w2end;
782 if (w1end - w1start != w2end - w2start) {
785 for (
unsigned i = 0;
i < w1end - w1start;
i++) {
806 if (ratings->empty()) {
814 c_it.set_to_list(ratings);
815 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
816 c_it.data()->print(&current_unicharset);
817 if (!c_it.at_last()) {
const int kMaxDropCapBottom
const double kMinXHeightMatch
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
void tprintf(const char *format,...)
const double kMaxBaselineDrift
const double kMaxOverlapDenominator
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
const char * ScriptPosToString(enum ScriptPos script_pos)
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
const int kMinSubscriptOffset
const int kMinSuperscriptOffset
const int kBlnBaselineOffset
void put(ICOORD pos, const T &thing)
TBOX bounding_box() const
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
int16_t fontinfo_id2() const
UNICHAR_ID unichar_id() const
float min_xheight() const
float max_xheight() const
int16_t fontinfo_id() const
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
void punct_stripped(unsigned *start_core, unsigned *end_core) const
float max_x_height() const
bool has_rtl_unichar_id() const
unsigned TotalOfStates() const
WERD_CHOICE & operator=(const WERD_CHOICE &source)
int GetTopScriptID() const
MATRIX_COORD MatrixCoord(unsigned index) const
void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice)
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
WERD_CHOICE(const UNICHARSET *unicharset)
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
WERD_CHOICE shallow_copy(unsigned start, unsigned end) const
UNICHAR_ID unichar_id(unsigned index) const
void print_state(const char *msg) const
static const float kBadRating
void make_bad()
Set the fields in this choice to be default (bad) values.
void GetNonSuperscriptSpan(int *start, int *end) const
void reverse_and_mirror_unichar_ids()
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
void init(unsigned reserved)
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
BLOB_CHOICE_LIST * blob_choices(unsigned index, MATRIX *ratings) const
const UNICHARSET * unicharset() const
float min_x_height() const
const std::vector< UNICHAR_ID > & unichar_ids() const
const char * permuter_name() const
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
void DisplaySegmentation(TWERD *word)
std::string & unichar_string()
void SetAllScriptPositions(ScriptPos position)
void UpdateStateForSplit(int blob_position)
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
void remove_unichar_ids(unsigned index, int num)
ScriptPos BlobPosition(unsigned index) const
TDimension bottom() const
void operator=(const ELIST_LINK &)
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
int get_script(UNICHAR_ID unichar_id) const
int get_script_table_size() const
Direction get_direction(UNICHAR_ID unichar_id) const
const char * id_to_unichar(UNICHAR_ID id) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
static std::string CleanupString(const char *utf8_str)
const char * id_to_unichar_ext(UNICHAR_ID id) const
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
void ZoomToRectangle(int x1, int y1, int x2, int y2)