tesseract-ocr.github.io/5.3.3/a00116_source.html

/**********************************************************************

 * File:        applybox.cpp  (Formerly applybox.c)

 * Description: Re segment rows according to box file data

 * Author:      Phil Cheatle

 *

 * (C) Copyright 1993, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


#ifndef DISABLED_LEGACY_ENGINE

#  include <allheaders.h>

#  include <cctype>

#  include <cerrno>

#  include <cstring>

#  include "boxread.h"

#endif // ndef DISABLED_LEGACY_ENGINE

#include <tesseract/unichar.h>

#include "pageres.h"

#include "tesseractclass.h"

#include "unicharset.h"


#ifndef DISABLED_LEGACY_ENGINE

// Upper bound on the number of consecutive blobs that FindSegmentation
// groups together and classifies as a single character candidate.
const int kMaxGroupSize = 4;

// Fraction of the median x-height by which a row's x-height may deviate
// before PreenXHeights overwrites it with the median.
const double kMaxXHeightDeviationFraction = 0.125;

#endif // ndef DISABLED_LEGACY_ENGINE


namespace tesseract {


#ifndef DISABLED_LEGACY_ENGINE

// Walks every block, row and word of block_list and resets the word text to
// the empty string. ResegmentWordBox treats a non-empty text as "already
// labelled", so this wipes any labels left over before boxes are applied.
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    BLOCK *block = blocks.data();
    ROW_IT rows(block->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      ROW *row = rows.data();
      WERD_IT words(row->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        WERD *word = words.data();
        word->set_text("");
      }
    }
  }
}


// Applies the box file based on the image name filename, and resegments

// the words in the block_list (page), with:

// blob-mode: one blob per line in the box file, words as input.

// word/line-mode: one blob per space-delimited unit after the #, and one word

// per line in the box file. (See comment above for box file format.)

// If find_segmentation is true, (word/line mode) then the classifier is used

// to re-segment words/lines to match the space-delimited truth string for

// each box. In this case, the input box may be for a word or even a whole

// text line, and the output words will contain multiple blobs corresponding

// to the space-delimited input string.

// With find_segmentation false, no classifier is needed, but the chopper

// can still be used to correctly segment touching characters with the help

// of the input boxes.

// In the returned PAGE_RES, the WERD_RES are setup as they would be returned

// from normal classification, ie. with a word, chopped_word, rebuild_word,

// seam_array, denorm, box_word, and best_state, but NO best_choice or

// raw_choice, as they would require a UNICHARSET, which we aim to avoid.

// Instead, the correct_text member of WERD_RES is set, and this may be later

// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords

// is not required before calling ApplyBoxTraining.

PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
                                BLOCK_LIST *block_list) {
  std::vector<TBOX> boxes;
  std::vector<std::string> texts, full_texts;
  const bool boxes_loaded =
      ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr);
  if (!boxes_loaded) {
    return nullptr; // Can't do it.
  }

  const int box_count = boxes.size();
  int box_failures = 0;

  // Blob mode works on the existing words, maximally chopped up front.
  // Word/line mode builds a word per box first and chops afterwards, so it
  // starts without a PAGE_RES.
  PAGE_RES *page_res = nullptr;
  if (!find_segmentation) {
    page_res = SetupApplyBoxes(boxes, block_list);
  }
  clear_any_old_text(block_list);

  // page_res is not reassigned inside the loop, so the mode is fixed here.
  const bool blob_mode = page_res != nullptr;
  for (int i = 0; i < box_count; i++) {
    const TBOX *next_box = (i == box_count - 1) ? nullptr : &boxes[i + 1];
    bool foundit = false;
    if (blob_mode) {
      const TBOX *prev_box = (i == 0) ? nullptr : &boxes[i - 1];
      foundit = ResegmentCharBox(page_res, prev_box, boxes[i], next_box, full_texts[i].c_str());
    } else {
      foundit = ResegmentWordBox(block_list, boxes[i], next_box, texts[i].c_str());
    }
    if (foundit) {
      continue;
    }
    box_failures++;
    ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
  }

  if (!blob_mode) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }

  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf("   Boxes read from boxfile:  %6d\n", box_count);
    if (box_failures > 0) {
      tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
    }
  }
  TidyUp(page_res);
  return page_res;
}


// Helper computes the median row x-height over all blocks of the page.
//
// Each row contributes its rounded x_height() once to a histogram whose
// range is [0, height of the first block's bounding box - 1].
//
// Returns 0.0 for an empty block list. The histogram range is taken from the
// first block, so an empty list must be rejected before block_it.data() is
// dereferenced.
static double MedianXHeight(BLOCK_LIST *block_list) {
  if (block_list->empty()) {
    return 0.0; // No blocks, hence no rows to measure.
  }
  BLOCK_IT block_it(block_list);
  STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
    }
  }
  return xheights.median();
}


// Normalizes outlier row x-heights: any row whose x-height differs from the
// page median by more than kMaxXHeightDeviationFraction of that median has
// its x-height replaced by the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  const double median_xheight = MedianXHeight(block_list);
  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW *row = row_it.data();
      const double diff = fabs(row->x_height() - median_xheight);
      if (diff > max_deviation) {
        if (applybox_debug) {
          tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
        }
        // Snap the outlier back to the page median.
        row->set_x_height(static_cast<float>(median_xheight));
      }
    }
  }
}


// Prepares the page for box application and returns a freshly allocated
// PAGE_RES in which every word has been run through MaximallyChopWord.
// Before that, row x-heights are normalized, words without any blobs are
// deleted and the fuzzy-space flags are cleared on the remaining words.
PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
  PreenXHeights(block_list);
  // Strip all fuzzy space markers to simplify the PAGE_RES, and drop words
  // that contain no blobs at all.
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      WERD_IT word_it(row_it.data()->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
        WERD *word = word_it.data();
        if (!word->cblob_list()->empty()) {
          word->set_flag(W_FUZZY_SP, false);
          word->set_flag(W_FUZZY_NON, false);
          continue;
        }
        delete word_it.extract();
      }
    }
  }
  auto *page_res = new PAGE_RES(false, block_list, nullptr);
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
  }
  return page_res;
}


// Chops the blobs of word_res as far as the chopper allows and fills the
// word with fake classifier results, one fake BLOB_CHOICE per final blob.
//
// boxes is forwarded to chop_one_blob; block and row are forwarded to
// SetupForRecognition. If SetupForRecognition fails, the word is only given
// a rebuild word via CloneChoppedToRebuild and no chopping is attempted.
// Chopping is skipped entirely when assume_fixed_pitch_char_segment is set.
void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,

                                  WERD_RES *word_res) {

  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,

                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,

                                     poly_allow_detailed_fx, row, block)) {

    word_res->CloneChoppedToRebuild();

    return;

  }

  if (chop_debug) {

    tprintf("Maximally chopping word at:");

    word_res->word->bounding_box().print();

  }

  // One fake choice per blob, kept index-aligned with chopped_word's blobs.
  std::vector<BLOB_CHOICE *> blob_choices;

  ASSERT_HOST(!word_res->chopped_word->blobs.empty());

  auto rating = static_cast<float>(INT8_MAX);

  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {

    // The rating and certainty are not quite arbitrary. Since

    // select_blob_to_chop uses the worst certainty to choose, they all have

    // to be different, so starting with INT8_MAX, subtract 1/8 for each blob

    // in here, and then divide by e each time they are chopped, which

    // should guarantee a set of unequal values for the whole tree of blobs

    // produced, however much chopping is required. The chops are thus only

    // limited by the ability of the chopper to find suitable chop points,

    // and not by the value of the certainties.

    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);

    blob_choices.push_back(choice);

    rating -= 0.125f;

  }

  const double e = exp(1.0); // The base of natural logs.

  unsigned blob_number;

  int right_chop_index = 0;

  if (!assume_fixed_pitch_char_segment) {

    // We only chop if the language is not fixed pitch like CJK.

    SEAM *seam = nullptr;

    // Keep chopping until chop_one_blob finds nothing more to split. Each
    // successful chop reports the index of the split blob in blob_number.
    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {

      word_res->InsertSeam(blob_number, seam);

      // The left half keeps the existing choice with its rating divided by e.
      BLOB_CHOICE *left_choice = blob_choices[blob_number];

      rating = left_choice->rating() / e;

      left_choice->set_rating(rating);

      left_choice->set_certainty(-rating);

      // combine confidence w/ serial #

      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,

                                           0.0f, 0.0f, BCC_FAKE);

      // The right half gets a new choice inserted directly after the left one.
      blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);

    }

  }

  word_res->CloneChoppedToRebuild();

  // NOTE(review): the BLOB_CHOICE objects are not deleted here; presumably
  // FakeClassifyWord takes ownership of them - confirm in pageres.cpp.
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);

}


// Measures how badly two boxes fail to coincide: the product of the fraction
// of box1's area outside the intersection and the fraction of box2's area
// outside the intersection. Lower means a better match. Both boxes must have
// non-zero area (asserted).
static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
  const int shared_area = box1.intersection(box2).area();
  const int area1 = box1.area();
  const int area2 = box2.area();
  ASSERT_HOST(area1 != 0 && area2 != 0);
  // Promote to double before multiplying the two uncovered areas.
  const double uncovered1 = 1.0 * (area1 - shared_area);
  return uncovered1 * (area2 - shared_area) / area1 / area2;
}


// Blob mode: finds a run of consecutive, still unclaimed blobs that match
// box and merges them into a single entry labelled with correct_text.
//
// page_res     - page whose words are searched; only box_word, best_state
//                and correct_text of the matching word are modified.
// prev_box     - box preceding this one in the box file, or nullptr.
// box          - the box to place.
// next_box     - box following this one in the box file, or nullptr. When
//                present, a blob that BoxMissMetric rates as a better match
//                for next_box ends the run.
// correct_text - label stored in correct_text for the merged entry.
//
// Returns true once a run has been merged. Returns false if no blob matched,
// or if the matched run does not fit the box while the box overlaps one of
// its neighbours.
bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,

                                 const TBOX *next_box, const char *correct_text) {

  if (applybox_debug > 1) {

    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);

  }

  PAGE_RES_IT page_res_it(page_res);

  WERD_RES *word_res;

  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {

    // Only words whose overall box majorly overlaps the target are candidates.
    if (!word_res->box_word->bounding_box().major_overlap(box)) {

      continue;

    }

    if (applybox_debug > 1) {

      tprintf("Checking word box:");

      word_res->box_word->bounding_box().print();

    }

    int word_len = word_res->box_word->length();

    // Try each blob index as the start of a run.
    for (int i = 0; i < word_len; ++i) {

      // Union of the boxes of the blobs accepted into the run so far.
      TBOX char_box = TBOX();

      int blob_count = 0;

      // Extend the run one blob at a time until a blob fails a test.
      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {

        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);

        if (!blob_box.major_overlap(box)) {

          break;

        }

        if (word_res->correct_text[i + blob_count].length() > 0) {

          break; // Blob is claimed already.

        }

        if (next_box != nullptr) {

          const double current_box_miss_metric = BoxMissMetric(blob_box, box);

          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);

          if (applybox_debug > 2) {

            tprintf("Checking blob:");

            blob_box.print();

            tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,

                    next_box_miss_metric);

          }

          if (current_box_miss_metric > next_box_miss_metric) {

            break; // Blob is a better match for next box.

          }

        }

        char_box += blob_box;

      }

      if (blob_count > 0) {

        if (applybox_debug > 1) {

          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);

        }

        // Give up on this box when the merged blobs do not reproduce it to
        // within a tolerance of 3 and x_gap against a neighbouring box is
        // below -3 (presumably a horizontal overlap of more than 3 pixels -
        // confirm against TBOX::x_gap).
        if (!char_box.almost_equal(box, 3) &&

            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||

             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {

          return false;

        }

        // We refine just the box_word, best_state and correct_text here.

        // The rebuild_word is made in TidyUp.

        // blob_count blobs are put together to match the box. Merge the

        // box_word boxes, save the blob_count in the state and the text.

        word_res->box_word->MergeBoxes(i, i + blob_count);

        word_res->best_state[i] = blob_count;

        word_res->correct_text[i] = correct_text;

        if (applybox_debug > 2) {

          tprintf("%d Blobs match: blob box:", blob_count);

          word_res->box_word->BlobBox(i).print();

          tprintf("Matches box:");

          box.print();

          if (next_box != nullptr) {

            tprintf("With next box:");

            next_box->print();

          }

        }

        // Eliminated best_state and correct_text entries for the consumed

        // blobs.

        // Always erases at i + 1: each erase shifts the next consumed entry
        // into that position.
        for (int j = 1; j < blob_count; ++j) {

          word_res->best_state.erase(word_res->best_state.begin() + i + 1);

          word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);

        }

        // Assume that no box spans multiple source words, so we are done with

        // this box.

        if (applybox_debug > 1) {

          tprintf("Best state = ");

          for (auto best_state : word_res->best_state) {

            tprintf("%d ", best_state);

          }

          tprintf("\n");

          tprintf("Correct text = [[ ");

          for (auto &it : word_res->correct_text) {

            tprintf("%s ", it.c_str());

          }

          tprintf("]]\n");

        }

        return true;

      }

    }

  }

  if (applybox_debug > 0) {

    tprintf("FAIL!\n");

  }

  return false; // Failure.

}


// Word/line mode: moves every source blob that matches box into one new
// word labelled with correct_text.
//
// block_list   - page to search; blobs are extracted from existing words and
//                a new WERD is appended to a row's word list.
// box          - the box to place.
// next_box     - box following this one in the box file, or nullptr. When
//                present, a blob that BoxMissMetric rates as a better match
//                for next_box is left where it is.
// correct_text - text assigned to the new word.
//
// Returns true if at least one blob matched, i.e. a new word was created.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,

                                 const char *correct_text) {

  if (applybox_debug > 1) {

    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);

  }

  // Created lazily on the first matching blob; shared by all later matches.
  WERD *new_word = nullptr;

  BLOCK_IT b_it(block_list);

  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {

    BLOCK *block = b_it.data();

    if (!box.major_overlap(block->pdblk.bounding_box())) {

      continue;

    }

    ROW_IT r_it(block->row_list());

    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {

      ROW *row = r_it.data();

      if (!box.major_overlap(row->bounding_box())) {

        continue;

      }

      WERD_IT w_it(row->word_list());

      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {

        WERD *word = w_it.data();

        if (applybox_debug > 2) {

          tprintf("Checking word:");

          word->bounding_box().print();

        }

        // A non-empty text marks a word produced for an earlier box (the new
        // word created below is also skipped this way).
        if (word->text() != nullptr && word->text()[0] != '\0') {

          continue; // Ignore words that are already done.

        }

        if (!box.major_overlap(word->bounding_box())) {

          continue;

        }

        C_BLOB_IT blob_it(word->cblob_list());

        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {

          C_BLOB *blob = blob_it.data();

          TBOX blob_box = blob->bounding_box();

          if (!blob_box.major_overlap(box)) {

            continue;

          }

          if (next_box != nullptr) {

            const double current_box_miss_metric = BoxMissMetric(blob_box, box);

            const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);

            if (applybox_debug > 2) {

              tprintf("Checking blob:");

              blob_box.print();

              tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,

                      next_box_miss_metric);

            }

            if (current_box_miss_metric > next_box_miss_metric) {

              continue; // Blob is a better match for next box.

            }

          }

          if (applybox_debug > 2) {

            tprintf("Blob match: blob:");

            blob_box.print();

            tprintf("Matches box:");

            box.print();

            if (next_box != nullptr) {

              tprintf("With next box:");

              next_box->print();

            }

          }

          if (new_word == nullptr) {

            // Make a new word with a single blob.

            new_word = word->shallow_copy();

            new_word->set_text(correct_text);

            // Appended to the end of the word list of the row being scanned.
            w_it.add_to_end(new_word);

          }

          // Move the blob out of its source word into the new word.
          C_BLOB_IT new_blob_it(new_word->cblob_list());

          new_blob_it.add_to_end(blob_it.extract());

        }

      }

    }

  }

  if (new_word == nullptr && applybox_debug > 0) {

    tprintf("FAIL!\n");

  }

  return new_word != nullptr;

}


// Word/line mode second pass: for every word that carries truth text,
// converts the text to unichar ids and asks FindSegmentation for a blob
// grouping that spells it. Words whose text cannot be converted or segmented
// are reported and deleted from the page; words without text are skipped.
void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res;
  while ((word_res = pr_it.word()) != nullptr) {
    const WERD *word = word_res->word;
    const bool has_text = word->text() != nullptr && word->text()[0] != '\0';
    if (has_text) {
      // Convert the correct text to a vector of UNICHAR_ID
      std::vector<UNICHAR_ID> target_text;
      if (!ConvertStringToUnichars(word->text(), &target_text)) {
        tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
        pr_it.DeleteCurrentWord();
      } else if (!FindSegmentation(target_text, word_res)) {
        tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
        pr_it.DeleteCurrentWord();
      }
    }
    pr_it.forward();
  }
}


bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {

  for (int step = 0; *utf8 != '\0'; utf8 += step) {

    const char *next_space = strchr(utf8, ' ');

    if (next_space == nullptr) {

      next_space = utf8 + strlen(utf8);

    }

    step = next_space - utf8;

    UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);

    if (class_id == INVALID_UNICHAR_ID) {

      return false;

    }

    while (utf8[step] == ' ') {

      ++step;

    }

    class_ids->push_back(class_id);

  }

  return true;

}


bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {

  // Classify all required combinations of blobs and save results in choices.

  const int word_length = word_res->box_word->length();

  auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];

  for (int i = 0; i < word_length; ++i) {

    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {

      BLOB_CHOICE_LIST *match_result =

          classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,

                         word_res->blamer_bundle);

      if (applybox_debug > 2) {

        tprintf("%d+%d:", i, j);

        print_ratings_list("Segment:", match_result, unicharset);

      }

      choices[i].push_back(match_result);

    }

  }

  // Search the segmentation graph for the target text. Must be an exact

  // match. Using wildcards makes it difficult to find the correct

  // segmentation even when it is there.

  word_res->best_state.clear();

  std::vector<int> search_segmentation;

  float best_rating = 0.0f;

  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,

                &word_res->best_state);

  for (int i = 0; i < word_length; ++i) {

    for (auto choice : choices[i]) {

      delete choice;

    }

  }

  delete[] choices;

  if (word_res->best_state.empty()) {

    // Build the original segmentation and if it is the same length as the

    // truth, assume it will do.

    int blob_count = 1;

    for (auto s : word_res->seam_array) {

      SEAM *seam = s;

      if (!seam->HasAnySplits()) {

        word_res->best_state.push_back(blob_count);

        blob_count = 1;

      } else {

        ++blob_count;

      }

    }

    word_res->best_state.push_back(blob_count);

    if (word_res->best_state.size() != target_text.size()) {

      word_res->best_state.clear(); // No good. Original segmentation bad size.

      return false;

    }

  }

  word_res->correct_text.clear();

  for (auto &text : target_text) {

    word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));

  }

  return true;

}


// Recursive helper that searches for a segmentation of the blobs from
// choices_pos onwards that spells target_text from text_index onwards.
//
// choices           - array indexed by starting blob; element [p][k - 1]
//                     holds the classifier results for k consecutive blobs
//                     starting at blob p (as built by FindSegmentation).
// choices_pos       - starting blob of the piece tried at this level.
// choices_length    - total number of blobs in the word.
// target_text       - unichar ids that must be matched exactly.
// text_index        - index in target_text matched at this level.
// rating            - summed rating of the pieces chosen so far.
// segmentation      - piece lengths chosen so far; restored to its incoming
//                     contents before returning.
// best_rating       - in/out: rating of the best complete match so far.
// best_segmentation - in/out: piece lengths of the best complete match;
//                     empty means none has been found yet.
//
// A piece matches if one of its choices is the target id itself, or has a
// 1-1 entry in the dangerous-ambiguity table whose correct_ngram_id is the
// target id.
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,

                              unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,

                              unsigned text_index, float rating, std::vector<int> *segmentation,

                              float *best_rating, std::vector<int> *best_segmentation) {

  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();

  // Try every available piece length starting at choices_pos.
  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {

    // Rating of matching choice or worst choice if no match.

    float choice_rating = 0.0f;

    // Find the corresponding best BLOB_CHOICE.

    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);

    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {

      const BLOB_CHOICE *choice = choice_it.data();

      choice_rating = choice->rating();

      auto class_id = choice->unichar_id();

      if (class_id == target_text[text_index]) {

        break;

      }

      // Search ambigs table.

      if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {

        AmbigSpec_IT spec_it(table[class_id]);

        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {

          const AmbigSpec *ambig_spec = spec_it.data();

          // We'll only do 1-1.

          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&

              ambig_spec->correct_ngram_id == target_text[text_index]) {

            break;

          }

        }

        if (!spec_it.cycled_list()) {

          break; // Found an ambig.

        }

      }

    }

    // The loop above only breaks early on a match; running off the end of the
    // list means no choice of this piece matched.
    if (choice_it.cycled_list()) {

      continue; // No match.

    }

    segmentation->push_back(length);

    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {

      // This is a complete match. If the rating is good record a new best.

      if (applybox_debug > 2) {

        tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",

                rating + choice_rating, *best_rating, segmentation->size(),

                best_segmentation->size());

      }

      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {

        *best_segmentation = *segmentation;

        *best_rating = rating + choice_rating;

      }

    // Blobs and text both remain: recurse. If only one of the two is
    // exhausted this piece is a dead end and nothing is done.
    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {

      if (applybox_debug > 3) {

        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],

                unicharset.id_to_unichar(target_text[text_index]),

                choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",

                choices_pos, length);

      }

      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,

                    rating + choice_rating, segmentation, best_rating, best_segmentation);

      if (applybox_debug > 3) {

        tprintf("End recursion for %d=%s\n", target_text[text_index],

                unicharset.id_to_unichar(target_text[text_index]));

      }

    }

    // Backtrack: drop the piece pushed above before trying the next length.
    segmentation->resize(segmentation->size() - 1);

  }

}


// Finalizes a PAGE_RES after the boxes have been applied.
//
// First pass: every word with at least one labelled blob (non-empty
// correct_text entry) gets a fake WERD_CHOICE, built from best_state, logged
// as both raw and cooked choice; words without any labelled blob are deleted.
// Second pass: the surviving words get their best state rebuilt, their box
// word set up and their W_BOL / W_EOL flags recomputed from the neighbouring
// rows. With applybox_debug > 0 a summary of the counts is printed.
void Tesseract::TidyUp(PAGE_RES *page_res) {
  int ok_blob_count = 0;
  int bad_blob_count = 0;
  int ok_word_count = 0;
  int unlabelled_words = 0;
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res;
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    int ok_in_word = 0;
    int blob_count = word_res->correct_text.size();
    auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
    word_choice->set_permuter(TOP_CHOICE_PERM);
    for (int c = 0; c < blob_count; ++c) {
      if (word_res->correct_text[c].length() > 0) {
        ++ok_in_word;
      }
      // Since we only need a fake word_res->best_choice, the actual
      // unichar_ids do not matter. Which is fortunate, since TidyUp()
      // can be called while training Tesseract, at the stage where
      // unicharset is not meaningful yet.
      word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
                                                     1.0f, -1.0f);
    }
    if (ok_in_word > 0) {
      // Count the kept word; without this the summary below always reported
      // the unlabelled blobs as being "in 0 words".
      ++ok_word_count;
      ok_blob_count += ok_in_word;
      bad_blob_count += word_res->correct_text.size() - ok_in_word;
      word_res->LogNewRawChoice(word_choice);
      word_res->LogNewCookedChoice(1, false, word_choice);
    } else {
      ++unlabelled_words;
      if (applybox_debug > 0) {
        tprintf("APPLY_BOXES: Unlabelled word at :");
        word_res->word->bounding_box().print();
      }
      pr_it.DeleteCurrentWord();
      // The choice was never handed to the word, so it is still ours to free.
      delete word_choice;
    }
  }
  pr_it.restart_page();
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    // Denormalize back to a BoxWord.
    word_res->RebuildBestState();
    word_res->SetupBoxWord();
    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
  }
  if (applybox_debug > 0) {
    tprintf("   Found %d good blobs.\n", ok_blob_count);
    if (bad_blob_count > 0) {
      tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
    }
    if (unlabelled_words > 0) {
      tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
    }
  }
}


// Prints a one-line diagnostic for a box that could not be applied.
// boxfile_lineno is zero-based and is printed one-based; the box is printed
// as ((left,bottom),(right,top)) followed by err_msg.
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
                                const char *err_msg) {
  const int printed_lineno = boxfile_lineno + 1;
  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", printed_lineno, box_ch,
          box.left(), box.bottom(), box.right(), box.top(), err_msg);
}


// Calls LearnWord with fontname on every word of page_res and prints how
// many words were processed.
void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  int word_count = 0;
  WERD_RES *word_res = pr_it.word();
  while (word_res != nullptr) {
    LearnWord(fontname.c_str(), word_res);
    ++word_count;
    word_res = pr_it.forward();
  }
  tprintf("Generated training data for %d words\n", word_count);
}


#endif // ndef DISABLED_LEGACY_ENGINE


// Replaces the word choices of every word in page_res with a single choice
// built from its correct_text: one unichar id per correct_text entry, paired
// with the matching best_state entry, logged as both raw and cooked choice.
//
// An entry for which split() yields no token (presumably an empty, i.e.
// unlabelled, entry) is given INVALID_UNICHAR_ID instead of indexing an
// empty token vector.
void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
    auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
    for (unsigned i = 0; i < word_res->correct_text.size(); ++i) {
      // The part before the first space is the real ground truth, and the
      // rest is the bounding box location and page number.
      const std::vector<std::string> tokens = split(word_res->correct_text[i], ' ');
      const UNICHAR_ID char_id =
          tokens.empty() ? INVALID_UNICHAR_ID : unicharset.unichar_to_id(tokens[0].c_str());
      choice->append_unichar_id_space_allocated(char_id, word_res->best_state[i], 0.0f, 0.0f);
    }
    word_res->ClearWordChoices();
    word_res->LogNewRawChoice(choice);
    word_res->LogNewCookedChoice(1, false, choice);
  }
}


} // namespace tesseract

unichar.h

kMaxGroupSize
const int kMaxGroupSize
Definition: applybox.cpp:33

kMaxXHeightDeviationFraction
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:36

tesseractclass.h

unicharset.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

pageres.h

boxread.h

TBOX
@ TBOX
Definition: cleanapi_test.cc:19

i
int i
Definition: gmock-matchers_test.cc:718

tesseract
Definition: baseapi.h:39

tesseract::W_BOL
@ W_BOL
start of line
Definition: werd.h:34

tesseract::W_FUZZY_SP
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41

tesseract::W_EOL
@ W_EOL
end of line
Definition: werd.h:35

tesseract::W_FUZZY_NON
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42

tesseract::UnicharAmbigsVector
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:140

tesseract::print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:170

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::ReadAllBoxes
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:76

tesseract::BCC_FAKE
@ BCC_FAKE
Definition: ratngs.h:53

tesseract::TOP_CHOICE_PERM
@ TOP_CHOICE_PERM
Definition: ratngs.h:238

tesseract::split
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43

tesseract::Tesseract::ResegmentWordBox
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:414

tesseract::Tesseract::ApplyBoxes
PAGE_RES * ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:110

tesseract::Tesseract::TidyUp
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:685

tesseract::Tesseract::ApplyBoxTraining
void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:751

tesseract::Tesseract::ResegmentCharBox
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:310

tesseract::Tesseract::ReSegmentByClassification
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:495

tesseract::Tesseract::ConvertStringToUnichars
bool ConvertStringToUnichars(const char *utf8, std::vector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:520

tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:480

tesseract::Tesseract::BestPix
Image BestPix() const
Definition: tesseractclass.h:238

tesseract::Tesseract::SearchForText
void SearchForText(const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
Definition: applybox.cpp:615

tesseract::Tesseract::PreenXHeights
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:174

tesseract::Tesseract::CorrectClassifyWords
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:764

tesseract::Tesseract::MaximallyChopWord
void MaximallyChopWord(const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:231

tesseract::Tesseract::ReportFailedBox
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:743

tesseract::Tesseract::FindSegmentation
bool FindSegmentation(const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:545

tesseract::Tesseract::SetupApplyBoxes
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197

tesseract::TWERD::blobs
std::vector< TBLOB * > blobs
Definition: blobs.h:462

tesseract::TWERD::NumBlobs
unsigned NumBlobs() const
Definition: blobs.h:449

tesseract::BoxWord::MergeBoxes
void MergeBoxes(unsigned start, unsigned end)
Definition: boxword.cpp:138

tesseract::BoxWord::bounding_box
const TBOX & bounding_box() const
Definition: boxword.h:78

tesseract::BoxWord::length
unsigned length() const
Definition: boxword.h:81

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84

tesseract::BLOCK
Definition: ocrblock.h:34

tesseract::BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185

tesseract::BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:111

tesseract::ROW
Definition: ocrrow.h:39

tesseract::ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:57

tesseract::ROW::x_height
float x_height() const
Definition: ocrrow.h:66

tesseract::ROW::bounding_box
TBOX bounding_box() const
Definition: ocrrow.h:90

tesseract::ROW::set_x_height
void set_x_height(float new_xheight)
Definition: ocrrow.h:69

tesseract::PAGE_RES
Definition: pageres.h:77

tesseract::BLOCK_RES::block
BLOCK * block
Definition: pageres.h:120

tesseract::ROW_RES::row
ROW * row
Definition: pageres.h:144

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:865

tesseract::WERD_RES::correct_text
std::vector< std::string > correct_text
Definition: pageres.h:287

tesseract::WERD_RES::FakeClassifyWord
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:908

tesseract::WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:210

tesseract::WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:419

tesseract::WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:304

tesseract::WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:629

tesseract::WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:250

tesseract::WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:837

tesseract::WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:201

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::best_state
std::vector< int > best_state
Definition: pageres.h:283

tesseract::WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:270

tesseract::WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:879

tesseract::WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:613

tesseract::WERD_RES::seam_array
std::vector< SEAM * > seam_array
Definition: pageres.h:212

tesseract::PAGE_RES_IT
Definition: pageres.h:682

tesseract::PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:769

tesseract::PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:743

tesseract::PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:763

tesseract::PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:710

tesseract::PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1488

tesseract::PAGE_RES_IT::next_row
ROW_RES * next_row() const
Definition: pageres.h:775

tesseract::PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:757

tesseract::PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:766

tesseract::PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67

tesseract::BLOB_CHOICE
Definition: ratngs.h:56

tesseract::BLOB_CHOICE::set_certainty
void set_certainty(float newrat)
Definition: ratngs.h:150

tesseract::BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81

tesseract::BLOB_CHOICE::set_rating
void set_rating(float newrat)
Definition: ratngs.h:147

tesseract::BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:84

tesseract::WERD_CHOICE
Definition: ratngs.h:258

tesseract::TBOX
Definition: rect.h:37

tesseract::TBOX::left
TDimension left() const
Definition: rect.h:82

tesseract::TBOX::almost_equal
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:272

tesseract::TBOX::top
TDimension top() const
Definition: rect.h:68

tesseract::TBOX::x_gap
int x_gap(const TBOX &box) const
Definition: rect.h:238

tesseract::TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:84

tesseract::TBOX::print
void print() const
Definition: rect.h:289

tesseract::TBOX::right
TDimension right() const
Definition: rect.h:89

tesseract::TBOX::bottom
TDimension bottom() const
Definition: rect.h:75

tesseract::TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:374

tesseract::TBOX::area
int32_t area() const
Definition: rect.h:134

tesseract::SEAM
Definition: seam.h:33

tesseract::SEAM::HasAnySplits
bool HasAnySplits() const
Definition: seam.h:52

tesseract::STATS
Definition: statistc.h:30

tesseract::C_BLOB
Definition: stepblob.h:40

tesseract::C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:250

tesseract::WERD
Definition: werd.h:58

tesseract::WERD::text
const char * text() const
Definition: werd.h:121

tesseract::WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131

tesseract::WERD::shallow_copy
WERD * shallow_copy()
Definition: werd.cpp:342

tesseract::WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:155

tesseract::WERD::set_text
void set_text(const char *new_text)
Definition: werd.h:124

tesseract::WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:96

tesseract::AmbigSpec
Definition: ambigs.h:112

tesseract::AmbigSpec::correct_ngram_id
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:132

tesseract::AmbigSpec::wrong_ngram
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:130

tesseract::UnicharAmbigs::dang_ambigs
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:157

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:61

tesseract::UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

tesseract::UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262

tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111

tesseract::Wordrec::chop_one_blob
SEAM * chop_one_blob(const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
Definition: chopper.cpp:367

tesseract::Wordrec::classify_piece
virtual BLOB_CHOICE_LIST * classify_piece(const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:49