24#if !defined(DISABLED_LEGACY_ENGINE)
67 const TBOX &word_box) {
69 truth_has_char_boxes_ =
false;
71 std::vector<UNICHAR_ID> encoding;
72 std::vector<char> lengths;
73 unicharset.
encode_string(truth_str,
false, &encoding, &lengths,
nullptr);
75 for (
size_t i = 0;
i < encoding.size(); total_length += lengths[
i++]) {
76 std::string uch(truth_str + total_length);
77 uch.resize(lengths[
i] - total_length);
79 if (
id != INVALID_UNICHAR_ID) {
82 truth_text_.push_back(uch);
89 const TBOX &char_box) {
90 std::string symbol_str(char_str);
92 if (
id != INVALID_UNICHAR_ID) {
94 if (normed_uch.length() > 0) {
95 symbol_str = normed_uch;
98 int length = truth_word_.
length();
99 truth_text_.push_back(symbol_str);
102 truth_has_char_boxes_ =
true;
103 }
else if (truth_word_.
BlobBox(length - 1) == char_box) {
104 truth_has_char_boxes_ =
false;
112 truth_has_char_boxes_ =
false;
117 if (word_choice ==
nullptr) {
121 std::string normed_choice_str;
122 for (
unsigned i = 0;
i < word_choice->
length(); ++
i) {
126 return truth_str == normed_choice_str;
131 for (
auto &text : this->truth_text_) {
134 if (!this->truth_has_char_boxes_) {
135 debug +=
" (no char boxes)";
137 if (choice !=
nullptr) {
139 std::string choice_str;
143 if (msg.length() > 0) {
153 norm_box_tolerance_ = kBlamerBoxTolerance * denorm.
x_scale();
158 for (
unsigned b = 0; b < truth_word_.
length(); ++b) {
160 topleft.
x = box.
left();
161 topleft.
y = box.
top();
166 TBOX norm_box(norm_topleft.
x, norm_botright.
y, norm_botright.
x, norm_topleft.
y);
176 std::string debug_str;
178 unsigned begin2_truth_index = 0;
179 if (incorrect_result_reason_ !=
IRR_NO_TRUTH && truth_has_char_boxes_) {
180 debug_str =
"Looking for truth split at";
181 debug_str +=
" end1_x " + std::to_string(word1_right);
182 debug_str +=
" begin2_x " + std::to_string(word2_left);
183 debug_str +=
"\nnorm_truth_word boxes:\n";
184 if (norm_truth_word_.
length() > 1) {
186 for (
unsigned b = 1; b < norm_truth_word_.
length(); ++b) {
188 if ((abs(word1_right - norm_truth_word_.
BlobBox(b - 1).
right()) < norm_box_tolerance_) &&
189 (abs(word2_left - norm_truth_word_.
BlobBox(b).
left()) < norm_box_tolerance_)) {
190 begin2_truth_index = b;
191 debug_str +=
"Split found";
200 if (begin2_truth_index > 0) {
201 bundle1->truth_has_char_boxes_ =
true;
202 bundle1->norm_box_tolerance_ = norm_box_tolerance_;
203 bundle2->truth_has_char_boxes_ =
true;
204 bundle2->norm_box_tolerance_ = norm_box_tolerance_;
206 for (
unsigned b = 0; b < norm_truth_word_.
length(); ++b) {
207 if (b == begin2_truth_index) {
212 curr_bb->truth_text_.push_back(truth_text_[b]);
218 debug_str +=
"Truth split not found";
219 debug_str += truth_has_char_boxes_ ?
"\n" :
" (no truth char boxes)\n";
228 std::string debug_str;
233 if (bundle1.incorrect_result_reason_ !=
IRR_CORRECT &&
236 debug_str +=
"Blame from part 1: ";
237 debug_str += bundle1.debug_;
238 irr = bundle1.incorrect_result_reason_;
240 if (bundle2.incorrect_result_reason_ !=
IRR_CORRECT &&
243 debug_str +=
"Blame from part 2: ";
244 debug_str += bundle2.debug_;
246 irr = bundle2.incorrect_result_reason_;
247 }
else if (irr != bundle2.incorrect_result_reason_) {
251 incorrect_result_reason_ = irr;
253 SetBlame(irr, debug_str,
nullptr,
debug);
261 const BLOB_CHOICE_LIST &choices,
bool debug) {
262 if (!truth_has_char_boxes_ || incorrect_result_reason_ !=
IRR_CORRECT) {
266 for (
unsigned b = 0; b < norm_truth_word_.
length(); ++b) {
267 const TBOX &truth_box = norm_truth_word_.
BlobBox(b);
271 if (blob_box.
x_almost_equal(truth_box, norm_box_tolerance_ / 2)) {
273 bool incorrect_adapted =
false;
274 UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
275 const char *truth_str = truth_text_[b].c_str();
278 BLOB_CHOICE_IT choices_it(
const_cast<BLOB_CHOICE_LIST *
>(&choices));
279 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) {
285 incorrect_adapted =
true;
290 std::string debug_str =
"unichar ";
291 debug_str += truth_str;
292 debug_str +=
" not found in classification list";
294 }
else if (incorrect_adapted) {
295 std::string debug_str =
"better rating for adapted ";
297 debug_str +=
" than for correct ";
298 debug_str += truth_str;
313 bool missing_chop =
false;
315 unsigned box_index = 0;
317 int16_t truth_x = -1;
318 while (box_index < truth_word_.
length() && blob_index < num_blobs) {
331 if (missing_chop || box_index < norm_truth_word_.
length()) {
332 std::string debug_str;
334 debug_str +=
"Detected missing chop (tolerance=" + std::to_string(norm_box_tolerance_);
335 debug_str +=
") at Bounding Box=";
338 debug_str +=
"\nNo chop for truth at x=" + std::to_string(truth_x);
340 debug_str +=
"Missing chops for last " + std::to_string(norm_truth_word_.
length() - box_index);
341 debug_str +=
" truth box(es)";
343 debug_str +=
"\nMaximally chopped word boxes:\n";
344 for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
349 debug_str +=
"Truth bounding boxes:\n";
350 for (box_index = 0; box_index < norm_truth_word_.
length(); ++box_index) {
364 bool valid_permuter,
bool debug) {
365 if (valid_permuter) {
367 best_choice_is_dict_and_top_choice_ =
true;
372 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
373 blob_choice_it.forward()) {
374 if (!(unicharset.
get_fragment(blob_choice_it.data()->unichar_id()))) {
375 first_choice = blob_choice_it.data();
381 best_choice_is_dict_and_top_choice_ =
false;
386 std::string debug_str;
387 if (best_choice_is_dict_and_top_choice_) {
388 debug_str =
"Best choice is: incorrect, top choice, dictionary word";
389 debug_str +=
" with permuter ";
392 debug_str =
"Classifier/Old LM tradeoff is to blame";
400#ifndef DISABLED_LEGACY_ENGINE
403 if (incorrect_result_reason_ !=
IRR_CORRECT || !truth_has_char_boxes_) {
407 std::string debug_str =
"Blamer computing correct_segmentation_cols\n";
408 int curr_box_col = 0;
409 int next_box_col = 0;
411 if (num_blobs == 0) {
415 int16_t next_box_x = word->
blobs[blob_index]->bounding_box().right();
416 for (
unsigned truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.
length();
419 int16_t curr_box_x = next_box_x;
420 if (blob_index + 1 < num_blobs) {
421 next_box_x = word->
blobs[blob_index + 1]->bounding_box().right();
423 int16_t truth_x = norm_truth_word_.
BlobBox(truth_idx).
right();
424 debug_str +=
"Box x coord vs. truth: " + std::to_string(curr_box_x);
425 debug_str +=
" " + std::to_string(truth_x);
427 if (curr_box_x > (truth_x + norm_box_tolerance_)) {
429 }
else if (curr_box_x >= truth_x - norm_box_tolerance_ &&
430 (blob_index + 1 >= num_blobs ||
431 next_box_x > truth_x + norm_box_tolerance_)) {
432 correct_segmentation_cols_.push_back(curr_box_col);
433 correct_segmentation_rows_.push_back(next_box_col - 1);
435 debug_str +=
"col=" + std::to_string(curr_box_col);
436 debug_str +=
" row=" + std::to_string(next_box_col - 1);
438 curr_box_col = next_box_col;
441 if (blob_index < num_blobs ||
442 correct_segmentation_cols_.size() != norm_truth_word_.
length()) {
444 "Blamer failed to find correct segmentation"
446 std::to_string(norm_box_tolerance_);
447 if (blob_index >= num_blobs) {
448 debug_str +=
" blob == nullptr";
451 debug_str +=
" path length " + std::to_string(correct_segmentation_cols_.size());
452 debug_str +=
" vs. truth " + std::to_string(norm_truth_word_.
length());
455 correct_segmentation_cols_.clear();
456 correct_segmentation_rows_.clear();
462 return incorrect_result_reason_ ==
IRR_CORRECT && !segsearch_is_looking_for_blame_ &&
466#if !defined(DISABLED_LEGACY_ENGINE)
469 UNICHAR_ID wildcard_id,
bool debug, std::string &debug_str,
472 segsearch_is_looking_for_blame_ =
true;
474 tprintf(
"segsearch starting to look for blame\n");
478 debug_str +=
"Correct segmentation:\n";
479 for (
unsigned idx = 0; idx < correct_segmentation_cols_.size(); ++idx) {
480 debug_str +=
"col=" + std::to_string(correct_segmentation_cols_[idx]);
481 debug_str +=
" row=" + std::to_string(correct_segmentation_rows_[idx]);
483 if (!ratings->
Classified(correct_segmentation_cols_[idx], correct_segmentation_rows_[idx],
486 correct_segmentation_cols_[idx], correct_segmentation_rows_[idx],
488 segsearch_is_looking_for_blame_ =
false;
489 debug_str +=
"\nFailed to insert pain point\n";
499 return segsearch_is_looking_for_blame_;
515 if (segsearch_is_looking_for_blame_) {
516 segsearch_is_looking_for_blame_ =
false;
517 if (best_choice_is_dict_and_top_choice_) {
518 debug_str =
"Best choice is: incorrect, top choice, dictionary word";
519 debug_str +=
" with permuter ";
522 }
else if (best_correctly_segmented_rating_ < best_choice->rating()) {
523 debug_str +=
"Correct segmentation state was not explored";
527 debug_str +=
"Correct segmentation paths were pruned by LM\n";
529 debug_str +=
"Best correct segmentation rating " +
530 std::to_string(best_correctly_segmented_rating_);
531 debug_str +=
" vs. best choice rating " + std::to_string(best_choice->
rating());
550 std::string debug_str =
"Choice is incorrect after recognition";
566 misadaption_debug_ =
"misadapt to word (";
568 misadaption_debug_ +=
"): ";
571 tprintf(
"%s\n", misadaption_debug_.c_str());
const char kBlameNoTruthSplit[]
const char kBlameSegsearchHeur[]
const char kBlameChopper[]
const char kBlameUnknown[]
void tprintf(const char *format,...)
const char kBlamePageLayout[]
const char kBlameSegsearchPP[]
const char kBlameClassOldLMTradeoff[]
const char kBlameNoTruth[]
const char kBlameClassifier[]
@ IRR_CLASS_OLD_LM_TRADEOFF
const char *const kIncorrectResultReasonNames[]
const char kBlameAdaption[]
const char kBlameClassLMTradeoff[]
const char kBlameCorrect[]
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
bool GuidedSegsearchStillGoing() const
static const char * IncorrectReasonName(IncorrectResultReason irr)
std::string TruthString() const
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str)
static void LastChanceBlame(bool debug, WERD_RES *word)
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
const std::string & debug() const
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box)
void SetChopperBlame(const WERD_RES *word, bool debug)
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points, double max_char_wh_ratio, WERD_RES *word_res)
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
const char * IncorrectReason() const
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box)
void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug)
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
void SetupNormTruthWord(const DENORM &denorm)
void SetupCorrectSegmentation(const TWERD *word, bool debug)
TBOX bounding_box() const
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
const TBOX & BlobBox(unsigned index) const
void InsertBox(unsigned index, const TBOX &box)
bool Classified(int col, int row, int wildcard_id) const
void NormTransform(const DENORM *first_norm, const TPOINT &pt, TPOINT *transformed) const
WERD_CHOICE * best_choice
BlamerBundle * blamer_bundle
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
void StartHypothesisList()
UNICHAR_ID unichar_id() const
static const char * permuter_name(uint8_t permuter)
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
UNICHAR_ID unichar_id(unsigned index) const
static const float kBadRating
const UNICHARSET * unicharset() const
void print_to_str(std::string &str) const
bool x_almost_equal(const TBOX &box, int tolerance) const
TDimension bottom() const
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
const char * id_to_unichar(UNICHAR_ID id) const
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)