tesseract-ocr.github.io/5.3.3/a00161_source.html

// File:        resultiterator.cpp

// Description: Iterator for tesseract results that is capable of

//              iterating in proper reading order over Bi Directional

//              (e.g. mixed Hebrew and English) text.

// Author:      David Eger

//

// (C) Copyright 2011, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//


#include <tesseract/resultiterator.h>


#include "pageres.h"

#include "tesseractclass.h"

#include "unicharset.h"


#include <allheaders.h>


#include <set>

#include <vector>


// Unicode directional marks, spelled as string literals. The text-building
// methods in this file append them to their output where the reading
// direction changes (start/end of a minor-direction run, after a word of
// mixed direction).
static const char *const kLRM = "\u200E"; // Left-to-Right Mark (U+200E)

static const char *const kRLM = "\u200F"; // Right-to-Left Mark (U+200F)


namespace tesseract {


// Builds a bidi-aware iterator on top of a copy of |resit|.
// The minor-run state starts out cleared, "preserve_interword_spaces" is read
// from the global and per-instance boolean parameter lists (false when the
// parameter is not found), the direction of the current paragraph is
// computed, and finally the iterator is moved to the logical start of the
// current text line.
ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) {
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;

  auto *spaces_param = ParamUtils::FindParam<BoolParam>(
      "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
  preserve_interword_spaces_ = (spaces_param != nullptr) ? static_cast<bool>(*spaces_param) : false;

  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}


// Factory: allocates a new ResultIterator constructed from |resit| with
// operator new and hands back the raw pointer. Nothing in this function keeps
// a reference to the object, so releasing it is up to whoever receives it.
ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) {
  auto *created = new ResultIterator(resit);
  return created;
}


// Accessor for the stored paragraph direction flag
// (true = left-to-right). No recomputation happens here.
bool ResultIterator::ParagraphIsLtr() const {
  const bool is_ltr = current_paragraph_is_ltr_;
  return is_ltr;
}


bool ResultIterator::CurrentParagraphIsLtr() const {

  if (!it_->word()) {

    return true; // doesn't matter.

  }

  LTRResultIterator it(*this);

  it.RestartParagraph();

  // Try to figure out the ltr-ness of the paragraph.  The rules below

  // make more sense in the context of a difficult paragraph example.

  // Here we denote {ltr characters, RTL CHARACTERS}:

  //

  //   "don't go in there!" DAIS EH

  //   EHT OTNI DEPMUJ FELSMIH NEHT DNA

  //                  .GNIDLIUB GNINRUB

  //

  // On the first line, the left-most word is LTR and the rightmost word

  // is RTL.  Thus, we are better off taking the majority direction for

  // the whole paragraph contents.  So instead of "the leftmost word is LTR"

  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs

  // would not do:  Typically an RTL paragraph would *not* start with an LTR

  // word.  So our heuristics are as follows:

  //

  // (1) If the first text line has an RTL word in the left-most position

  //     it is RTL.

  // (2) If the first text line has an LTR word in the right-most position

  //     it is LTR.

  // (3) If neither of the above is true, take the majority count for the

  //     paragraph -- if there are more rtl words, it is RTL.  If there

  //     are more LTR words, it's LTR.

  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;

  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;

  int num_ltr, num_rtl;

  num_rtl = leftmost_rtl ? 1 : 0;

  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;

  for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);

       it.Next(RIL_WORD)) {

    StrongScriptDirection dir = it.WordDirection();

    rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);

    num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;

    num_ltr += rightmost_ltr ? 1 : 0;

  }

  if (leftmost_rtl) {

    return false;

  }

  if (rightmost_ltr) {

    return true;

  }

  // First line is ambiguous.  Take statistics on the whole paragraph.

  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {

    do {

      StrongScriptDirection dir = it.WordDirection();

      num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;

      num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;

    } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));

  }

  return num_ltr >= num_rtl;

}


// Negative sentinel values that the static CalculateTextlineOrder() mixes in
// with the (non-negative) word indices of its reading-order output.
// Code in this file tells markers from words by testing for "< 0".

// Pushed immediately before the word indices of a minor-direction run.
const int ResultIterator::kMinorRunStart = -1;

// Pushed immediately after the word indices of a minor-direction run.
const int ResultIterator::kMinorRunEnd = -2;

// Pushed right after the index of a word whose direction is DIR_MIX.
const int ResultIterator::kComplexWord = -3;


// Fills |blob_indices| with the indices of the current word's blobs in the
// order in which they should be read. The vector is cleared first and stays
// empty when there is no current word.
//
// When the reading context (paragraph direction XOR "inside a minor run") is
// left-to-right, or the word reports its unichars as already being in reading
// order, the result is simply 0..word_length_-1. Otherwise every symbol is
// classified as L or R and the word is emitted right-to-left, keeping each
// maximal L run in its internal left-to-right order.
void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
  const bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
  blob_indices->clear();
  if (Empty(RIL_WORD)) {
    return;
  }
  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
    // Nothing to reorder.
    int next_index = 0;
    while (next_index < word_length_) {
      blob_indices->push_back(next_index);
      ++next_index;
    }
    return;
  }

  // From here on: blobs are stored left-to-right, but must be read
  // right-to-left.
  const int type_ltr = UNICHARSET::U_LEFT_TO_RIGHT;
  const int type_rtl = UNICHARSET::U_RIGHT_TO_LEFT;
  const int type_en = UNICHARSET::U_EUROPEAN_NUMBER;
  const int type_es = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
  const int type_et = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
  const int type_cs = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
  const int type_on = UNICHARSET::U_OTHER_NEUTRAL;

  // Step 1: find and mark European Number sequences
  //   [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
  std::vector<int> types;
  types.reserve(word_length_);
  for (int blob = 0; blob < word_length_; ++blob) {
    types.push_back(it_->word()->SymbolDirection(blob));
  }

  // 1a: a lone ES/CS with an EN on both sides becomes an EN.
  for (int mid = 1; mid + 1 < word_length_; ++mid) {
    const bool is_separator = (types[mid] == type_es || types[mid] == type_cs);
    if (types[mid - 1] == type_en && types[mid + 1] == type_en && is_separator) {
      types[mid] = type_en;
    }
  }

  // 1b: runs of ET that touch an EN (on either side) become ENs.
  for (int pos = 0; pos < word_length_; ++pos) {
    if (types[pos] != type_et) {
      continue;
    }
    // Look right, past any further terminators.
    int fwd = pos + 1;
    for (; fwd < word_length_ && types[fwd] == type_et; ++fwd) {
    }
    if (fwd < word_length_ && types[fwd] == type_en) {
      // [pos..fwd) joins the number.
      for (int k = pos; k < fwd; ++k) {
        types[k] = type_en;
      }
    }
    // Look left, past any preceding terminators.
    int back = pos - 1;
    for (; back > -1 && types[back] == type_et; --back) {
    }
    if (back > -1 && types[back] == type_en) {
      // [back..pos] joins the number.
      for (int k = back; k <= pos; ++k) {
        types[k] = type_en;
      }
    }
  }

  // Step 2: reduce everything to L or R.
  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
  // Everything else -> R.
  int cursor = 0;
  while (cursor < word_length_) {
    const int current = types[cursor];
    if (current != type_ltr && current != type_en) {
      types[cursor] = type_rtl;
      ++cursor;
      continue;
    }
    // Start of an L sequence: find the last L/EN reachable by skipping only
    // CS/ON symbols.
    int run_end = cursor;
    for (int probe = cursor + 1; probe < word_length_; ++probe) {
      const int probed = types[probe];
      if (probed == type_ltr || probed == type_en) {
        run_end = probe;
      } else if (probed != type_cs && probed != type_on) {
        break;
      }
    }
    // [cursor..run_end] is the L sequence.
    for (int k = cursor; k <= run_end; ++k) {
      types[k] = type_ltr;
    }
    cursor = run_end + 1;
  }

  // |types| now holds only L and R. Emit from the right edge; an L run is
  // output front to back once its left end has been located.
  int right = word_length_ - 1;
  while (right >= 0) {
    if (types[right] == type_rtl) {
      blob_indices->push_back(right);
      --right;
      continue;
    }
    int left = right - 1;
    while (left >= 0 && types[left] != type_rtl) {
      --left;
    }
    // (left, right] is an L run.
    for (int k = left + 1; k <= right; ++k) {
      blob_indices->push_back(k);
    }
    right = left;
  }
  ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_));
}


static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {

  for (auto dir : dirs) {

    switch (dir) {

      case DIR_NEUTRAL:

        tprintf("N ");

        break;

      case DIR_LEFT_TO_RIGHT:

        tprintf("L ");

        break;

      case DIR_RIGHT_TO_LEFT:

        tprintf("R ");

        break;

      case DIR_MIX:

        tprintf("Z ");

        break;

      default:

        tprintf("? ");

        break;

    }

  }

  tprintf("\n");

}


// Convenience overload for callers that only want the reading order of the
// words on |resit|'s text line: forwards to the four-argument overload with a
// throw-away vector for the per-word directions.
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
                                            std::vector<int> *word_indices) const {
  std::vector<StrongScriptDirection> discarded_dirs;
  CalculateTextlineOrder(paragraph_is_ltr, resit, &discarded_dirs, word_indices);
}


// Collects the direction of every word on the text line that |resit| is
// positioned in (visiting them strictly left to right on a private copy of
// |resit|) and converts them into a reading order.
//
//   paragraph_is_ltr  direction of the enclosing paragraph.
//   resit             iterator positioned somewhere on the line of interest.
//   dirs_arg          optional output: the per-word directions, left to
//                     right. May be nullptr, in which case a local scratch
//                     vector is used.
//   word_indices      output: word indices and negative marker values in
//                     reading order, as produced by the static overload.
//
// Both outputs are cleared before anything else happens, so a line without
// words yields empty vectors instead of leaving old contents of
// |word_indices| behind.
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
                                            std::vector<StrongScriptDirection> *dirs_arg,
                                            std::vector<int> *word_indices) const {
  std::vector<StrongScriptDirection> scratch_dirs;
  std::vector<StrongScriptDirection> *directions = (dirs_arg != nullptr) ? dirs_arg : &scratch_dirs;
  directions->clear();
  word_indices->clear();

  // A LTRResultIterator goes strictly left-to-right word order.
  LTRResultIterator ltr_it(resit);
  ltr_it.RestartRow();
  if (ltr_it.Empty(RIL_WORD)) {
    return;
  }
  do {
    directions->push_back(ltr_it.WordDirection());
  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));

  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
}


// Core ordering routine. |word_dirs| holds the direction of each word of a
// line from left to right; |reading_order| receives the word indices in
// reading order, interleaved with the negative markers kMinorRunStart /
// kMinorRunEnd (bracketing a run in the non-paragraph direction) and
// kComplexWord (following a DIR_MIX word outside such a bracketed run in the
// main loop). |reading_order| is cleared first and stays empty for an empty
// line.
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
                                            const std::vector<StrongScriptDirection> &word_dirs,
                                            std::vector<int> *reading_order) {
  reading_order->clear();
  if (word_dirs.empty()) {
    return;
  }

  // The line is walked in the paragraph's (major) direction: left to right
  // for an LTR paragraph, right to left otherwise.
  const int num_words = static_cast<int>(word_dirs.size());
  int first = paragraph_is_ltr ? 0 : num_words - 1;
  const int stop = paragraph_is_ltr ? num_words : -1;
  const int step = paragraph_is_ltr ? 1 : -1;
  const int major = paragraph_is_ltr ? DIR_LEFT_TO_RIGHT : DIR_RIGHT_TO_LEFT;
  const int minor = paragraph_is_ltr ? DIR_RIGHT_TO_LEFT : DIR_LEFT_TO_RIGHT;

  // Special rule (RTL paragraphs only): neutral words at the right-most end
  // of the line that directly follow a left-to-right word are read together
  // with it as one LTR sequence.
  if (!paragraph_is_ltr && word_dirs[first] == DIR_NEUTRAL) {
    int boundary = first;
    while (boundary > 0 && word_dirs[boundary] == DIR_NEUTRAL) {
      --boundary;
    }
    if (word_dirs[boundary] == DIR_LEFT_TO_RIGHT) {
      // LTR followed by neutrals. Extend the run to the left up to (but not
      // including) the nearest RTL word, ending on an LTR word.
      int run_left = boundary;
      for (int scan = boundary; scan >= 0 && word_dirs[scan] != DIR_RIGHT_TO_LEFT; --scan) {
        if (word_dirs[scan] == DIR_LEFT_TO_RIGHT) {
          run_left = scan;
        }
      }
      reading_order->push_back(kMinorRunStart);
      for (int word = run_left; word < num_words; ++word) {
        reading_order->push_back(word);
        if (word_dirs[word] == DIR_MIX) {
          reading_order->push_back(kComplexWord);
        }
      }
      reading_order->push_back(kMinorRunEnd);
      first = run_left - 1;
    }
  }

  // Main pass: words in the major direction are emitted as encountered;
  // each run of minor-direction words is emitted in reverse.
  int pos = first;
  while (pos != stop) {
    if (word_dirs[pos] != minor) {
      reading_order->push_back(pos);
      if (word_dirs[pos] == DIR_MIX) {
        reading_order->push_back(kComplexWord);
      }
      pos += step;
      continue;
    }
    // Advance to the next major-direction word (or the end of the line) ...
    int far_end = pos;
    while (far_end != stop && word_dirs[far_end] != major) {
      far_end += step;
    }
    if (far_end == stop) {
      far_end -= step;
    }
    // ... then back up to the last word that really is minor-direction.
    while (far_end != pos && word_dirs[far_end] != minor) {
      far_end -= step;
    }
    //  [far_end..pos] is a minor direction run.
    reading_order->push_back(kMinorRunStart);
    for (int word = far_end; word != pos; word -= step) {
      reading_order->push_back(word);
    }
    reading_order->push_back(pos);
    reading_order->push_back(kMinorRunEnd);
    pos = far_end + step;
  }
}


int ResultIterator::LTRWordIndex() const {

  int this_word_index = 0;

  LTRResultIterator textline(*this);

  textline.RestartRow();

  while (!textline.PositionedAtSameWord(it_)) {

    this_word_index++;

    textline.Next(RIL_WORD);

  }

  return this_word_index;

}


// Positions the iterator on the blob of the current word that is read first.
// A word of length zero is (re)started at offset 0. If the first blob in
// reading order is blob 0, or no order could be computed, nothing is moved.
void ResultIterator::MoveToLogicalStartOfWord() {
  if (word_length_ == 0) {
    BeginWord(0);
    return;
  }
  std::vector<int> order;
  CalculateBlobOrder(&order);
  const bool must_move = !order.empty() && order.front() != 0;
  if (must_move) {
    BeginWord(order.front());
  }
}


// True when the current blob is the last one of its word in reading order.
// Also true when there is no current word or no blob order is available.
bool ResultIterator::IsAtFinalSymbolOfWord() const {
  if (it_->word() == nullptr) {
    return true;
  }
  std::vector<int> order;
  CalculateBlobOrder(&order);
  if (order.empty()) {
    return true;
  }
  return order.back() == blob_index_;
}


// True when the current blob is the first one of its word in reading order.
// Also true when there is no current word or no blob order is available.
bool ResultIterator::IsAtFirstSymbolOfWord() const {
  if (it_->word() == nullptr) {
    return true;
  }
  std::vector<int> order;
  CalculateBlobOrder(&order);
  if (order.empty()) {
    return true;
  }
  return order.front() == blob_index_;
}


// Appends to |text| the directional mark (if any) that belongs after the
// current word. The decision is based on the marker entries that follow the
// word in the line's reading order:
//   - last marker is kComplexWord: mark of the current reading direction;
//   - last marker is kMinorRunEnd: mark of the paragraph direction;
//   - otherwise (or no current word / word not found): nothing is appended.
void ResultIterator::AppendSuffixMarks(std::string *text) const {
  if (it_->word() == nullptr) {
    return;
  }
  const bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;

  std::vector<int> order;
  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &order);
  const int this_word_index = LTRWordIndex();

  // Find this word in the reading order.
  size_t pos = 0;
  while (pos < order.size() && order[pos] != this_word_index) {
    ++pos;
  }
  if (pos == order.size()) {
    return;
  }

  // Remember the last of the (negative) markers that directly follow it.
  int last_marker = 0;
  for (size_t k = pos + 1; k < order.size() && order[k] < 0; ++k) {
    last_marker = order[k];
  }

  if (last_marker == kComplexWord) {
    *text += reading_direction_is_ltr ? kLRM : kRLM;
  } else if (last_marker == kMinorRunEnd) {
    // Leaving a minor run: switch back to the paragraph's direction.
    *text += current_paragraph_is_ltr_ ? kLRM : kRLM;
  }
}


// Moves the iterator to the word (and, within it, the symbol) that is read
// first on the current text line, and sets the minor-run state to match that
// position.
//
// Both in_minor_direction_ and at_beginning_of_minor_run_ are recomputed from
// scratch here. Several callers invoke this on a copy of an iterator that may
// sit in the middle of a minor run, and Next() only clears
// in_minor_direction_ before calling; without the reset a stale flag would
// leak into the new position and could produce a spurious direction mark or a
// wrong reading context at the start of the line.
void ResultIterator::MoveToLogicalStartOfTextline() {
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;

  std::vector<int> word_indices;
  RestartRow();
  CalculateTextlineOrder(current_paragraph_is_ltr_, static_cast<const LTRResultIterator &>(*this),
                         &word_indices);

  // Consume the markers that precede the first real word; they tell whether
  // the line opens with a minor-direction run.
  unsigned i = 0;
  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
    if (word_indices[i] == kMinorRunStart) {
      in_minor_direction_ = true;
    } else if (word_indices[i] == kMinorRunEnd) {
      in_minor_direction_ = false;
    }
  }
  // Being in a minor run at the very first word means being at its start.
  at_beginning_of_minor_run_ = in_minor_direction_;
  if (i >= word_indices.size()) {
    return; // no real word on this line.
  }

  // RestartRow() left us on the left-most word; step right to the word that
  // is first in reading order.
  const int first_word_index = word_indices[i];
  for (int j = 0; j < first_word_index; j++) {
    PageIterator::Next(RIL_WORD);
  }
  MoveToLogicalStartOfWord();
}


// Rewinds via LTRResultIterator::Begin(), clears the minor-run state,
// recomputes the paragraph direction for the new position and moves to the
// logical start of the text line found there.
void ResultIterator::Begin() {
  LTRResultIterator::Begin();
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}


// Advances the iterator to the next element at |level| in reading order.

// Returns false when the iterator is already past the last block or when the

// underlying PageIterator::Next() fails; the symbol and word paths return

// true once they have moved within the current line, and otherwise hand over

// to the next coarser level.

//

// Statement order matters throughout: the symbol case deliberately falls

// through into the word case, and the word case updates in_minor_direction_

// and at_beginning_of_minor_run_ while it walks the reading order.

bool ResultIterator::Next(PageIteratorLevel level) {

  if (it_->block() == nullptr) {

    return false; // already at end!

  }

  switch (level) {

    case RIL_BLOCK: // explicit fall-through

    case RIL_PARA:  // explicit fall-through

    case RIL_TEXTLINE:

      // Let the base class do the coarse move, then re-derive the bidi state

      // for the line we landed on.

      if (!PageIterator::Next(level)) {

        return false;

      }

      if (IsWithinFirstTextlineOfParagraph()) {

        // if we've advanced to a new paragraph,

        // recalculate current_paragraph_is_ltr_

        current_paragraph_is_ltr_ = CurrentParagraphIsLtr();

      }

      // NOTE(review): only in_minor_direction_ is cleared on this path;

      // at_beginning_of_minor_run_ keeps whatever value it had.

      in_minor_direction_ = false;

      MoveToLogicalStartOfTextline();

      return it_->block() != nullptr;

    case RIL_SYMBOL: {

      std::vector<int> blob_order;

      CalculateBlobOrder(&blob_order);

      // Locate the current blob in reading order ...

      unsigned next_blob = 0;

      while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {

        next_blob++;

      }

      // ... and step to the entry after it.

      next_blob++;

      if (next_blob < blob_order.size()) {

        // we're in the same word; simply advance one blob.

        BeginWord(blob_order[next_blob]);

        at_beginning_of_minor_run_ = false;

        return true;

      }

      level = RIL_WORD; // we've fallen through to the next word.

    }

      // Fall through.

    case RIL_WORD: // explicit fall-through.

    {

      if (it_->word() == nullptr) {

        // No word at this position: move on at block level instead.

        return Next(RIL_BLOCK);

      }

      std::vector<int> word_indices;

      int this_word_index = LTRWordIndex();

      CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);

      // Index of the last entry that is a real word (markers are negative).

      int final_real_index = word_indices.size() - 1;

      while (final_real_index > 0 && word_indices[final_real_index] < 0) {

        final_real_index--;

      }

      // Search for the current word among the entries before the final real

      // word; if it is the final word (or absent) we leave the line below.

      for (int i = 0; i < final_real_index; i++) {

        if (word_indices[i] == this_word_index) {

          // Skip the markers between this word and the next real word,

          // tracking whether they open or close a minor-direction run.

          int j = i + 1;

          for (; j < final_real_index && word_indices[j] < 0; j++) {

            if (word_indices[j] == kMinorRunStart) {

              in_minor_direction_ = true;

            }

            if (word_indices[j] == kMinorRunEnd) {

              in_minor_direction_ = false;

            }

          }

          // The entry right before the target tells whether the target word

          // is the first word of a minor run.

          at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);

          // awesome, we move to word_indices[j]

          if (BidiDebug(3)) {

            tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);

          }

          // word_indices[j] counts words from the left edge of the row, so

          // rewind to the row start and step forward that many words.

          PageIterator::RestartRow();

          for (int k = 0; k < word_indices[j]; k++) {

            PageIterator::Next(RIL_WORD);

          }

          MoveToLogicalStartOfWord();

          return true;

        }

      }

      if (BidiDebug(3)) {

        tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);

      }

      // we're going off the end of the text line.

      return Next(RIL_TEXTLINE);

    }

  }

  ASSERT_HOST(false); // shouldn't happen.

  return false;

}


// Reports whether the current position is the first one, in reading order,
// of the element at |level|.
//   - past the last block: always false;
//   - no word at this position (image block): always true;
//   - RIL_SYMBOL: always true;
//   - RIL_WORD: at the first symbol of the word in reading order;
//   - RIL_TEXTLINE: additionally on the word that starts the line logically;
//   - RIL_BLOCK: additionally the row's block differs from the previous one;
//   - RIL_PARA: at a block start, or at a line start whose row belongs to a
//     different paragraph than the previous row.
bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == nullptr) {
    return false; // Already at the end!
  }
  if (it_->word() == nullptr) {
    return true; // In an image block.
  }
  if (level == RIL_SYMBOL) {
    return true; // Always at beginning of a symbol.
  }

  const bool at_word_start = IsAtFirstSymbolOfWord();
  if (level == RIL_WORD) {
    return at_word_start;
  }

  // Compare against a copy that has been sent to the logical line start.
  ResultIterator line_start(*this);
  line_start.MoveToLogicalStartOfTextline();
  const bool on_first_word = (*line_start.it_ == *it_);
  const bool at_textline_start = at_word_start && on_first_word;
  if (level == RIL_TEXTLINE) {
    return at_textline_start;
  }

  // Block and paragraph boundaries are judged from the left-most word of
  // the row.
  line_start.RestartRow();
  bool at_block_start = false;
  if (at_textline_start) {
    at_block_start = (line_start.it_->block() != line_start.it_->prev_block());
  }
  if (level == RIL_BLOCK) {
    return at_block_start;
  }

  bool at_para_start = at_block_start;
  if (!at_para_start && at_textline_start) {
    at_para_start =
        (line_start.it_->row()->row->para() != line_start.it_->prev_row()->row->para());
  }
  if (level == RIL_PARA) {
    return at_para_start;
  }

  ASSERT_HOST(false); // shouldn't happen.
  return false;
}


// Reports whether the current |element| is the last one inside the enclosing
// |level|. An empty position counts as final.
//
// A copy of the iterator is advanced by one |element|. The answer is true if
// that runs off the page, or if the copy then sits at the beginning of every
// level in [level, element). Checking all of those levels is necessary when
// they are more than one apart: after stepping one symbol we might still be
// in the first word of a line, so the word start has to be verified too.
bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
  if (Empty(element)) {
    return true; // Already at the end!
  }
  ResultIterator probe(*this);
  probe.Next(element);
  if (probe.Empty(element)) {
    return true; // Reached the end of the page.
  }
  // Test from the level just above |element| up to and including |level|.
  for (int lvl = element - 1; lvl >= level; --lvl) {
    if (!probe.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl))) {
      return false;
    }
  }
  return true;
}


// Number of blanks in front of the current word. For a left-to-right
// paragraph (direction recomputed on each call) the base-class value is used;
// otherwise the answer is 0 at the logical start of a text line and 1
// everywhere else.
int ResultIterator::BlanksBeforeWord() const {
  if (!CurrentParagraphIsLtr()) {
    return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
  }
  return LTRResultIterator::BlanksBeforeWord();
}


// Returns the UTF-8 text of the element at |level| that contains the current
// position, in reading order and including LRM/RLM directional marks.
//
// Returns nullptr when there is no word at the current position. Otherwise
// the result is a NUL-terminated buffer allocated with new[]; this function
// does not retain it.
//
//   RIL_BLOCK     text of each paragraph from the current one onwards, for
//                 as long as the iteration stays inside the current block.
//   RIL_PARA      text of the current paragraph.
//   RIL_TEXTLINE  text of the current line, from its logical start.
//   RIL_WORD      text of the current word.
//   RIL_SYMBOL    text of the current symbol; it is prefixed with a mark when
//                 it opens a minor-direction run and followed by the word's
//                 suffix marks when it is the last symbol of its word, the
//                 same marks AppendUTF8WordText() puts around a whole word.
char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == nullptr) {
    return nullptr; // Already at the end!
  }
  std::string text;
  switch (level) {
    case RIL_BLOCK: {
      ResultIterator pp(*this);
      do {
        pp.AppendUTF8ParagraphText(&text);
      } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
    } break;
    case RIL_PARA:
      AppendUTF8ParagraphText(&text);
      break;
    case RIL_TEXTLINE: {
      ResultIterator it(*this);
      it.MoveToLogicalStartOfTextline();
      it.IterateAndAppendUTF8TextlineText(&text);
    } break;
    case RIL_WORD:
      AppendUTF8WordText(&text);
      break;
    case RIL_SYMBOL: {
      bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
      if (at_beginning_of_minor_run_) {
        text += reading_direction_is_ltr ? kLRM : kRLM;
      }
      // Append (not assign): assigning here would throw away the prefix
      // mark that was just added.
      text += it_->word()->BestUTF8(blob_index_, false);
      if (IsAtFinalSymbolOfWord()) {
        AppendSuffixMarks(&text);
      }
    } break;
  }
  // Hand the text back as a plain C string, terminator included.
  int length = text.length() + 1;
  char *result = new char[length];
  strncpy(result, text.c_str(), length);
  return result;
}

// Returns the address of the current word's segmented_timesteps member, or
// nullptr when there is no word at the current position. The pointer refers
// to data stored inside the word; nothing is copied.
std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
    *ResultIterator::GetRawLSTMTimesteps() const {
  auto *current_word = it_->word();
  if (current_word == nullptr) {
    return nullptr;
  }
  return &current_word->segmented_timesteps;
}


// Returns the address of the current word's CTC_symbol_choices member, or
// nullptr when there is no word at the current position. The pointer refers
// to data stored inside the word; nothing is copied.
std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
    const {
  auto *current_word = it_->word();
  if (current_word == nullptr) {
    return nullptr;
  }
  return &current_word->CTC_symbol_choices;
}


// Appends the text of the current word to |text|: an LRM/RLM prefix when the
// word opens a minor-direction run, then the best UTF-8 string of every blob
// in reading order, then whatever AppendSuffixMarks() adds. Does nothing
// when there is no current word; asserts that the word has a best_choice.
void ResultIterator::AppendUTF8WordText(std::string *text) const {
  if (it_->word() == nullptr) {
    return;
  }
  ASSERT_HOST(it_->word()->best_choice != nullptr);

  if (at_beginning_of_minor_run_) {
    const bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
    *text += reading_direction_is_ltr ? kLRM : kRLM;
  }

  std::vector<int> order;
  CalculateBlobOrder(&order);
  for (size_t n = 0; n < order.size(); ++n) {
    *text += it_->word()->BestUTF8(order[n], false);
  }
  AppendSuffixMarks(text);
}


// Appends the text of the line starting at the current word to |text| and
// leaves the iterator on whatever follows the line.
//
// Words are visited with Next(RIL_WORD) until a new text line begins or the
// iteration ends. Before each word either the word's own space count
// (preserve_interword_spaces_) or a single blank between words is inserted.
// The line separator is appended at the end, followed by the paragraph
// separator when the new position starts a paragraph. If there is no word at
// the current position, the iterator is merely advanced by one word.
// Diagnostic output is produced according to BidiDebug().
void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
  if (Empty(RIL_WORD)) {
    Next(RIL_WORD);
    return;
  }
  if (BidiDebug(1)) {
    std::vector<int> textline_order;
    std::vector<StrongScriptDirection> dirs;
    CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);
    void *row_ptr = static_cast<void *>(it_->row());
    const char *para_dir = current_paragraph_is_ltr_ ? "ltr" : "rtl";
    tprintf("Strong Script dirs     [%p/P=%s]: ", row_ptr, para_dir);
    PrintScriptDirs(dirs);
    tprintf("Logical textline order [%p/P=%s]: ", row_ptr, para_dir);
    for (size_t n = 0; n < textline_order.size(); ++n) {
      tprintf("%d ", textline_order[n]);
    }
    tprintf("\n");
  }

  int words_appended = 0;
  do {
    int numSpaces;
    if (preserve_interword_spaces_) {
      numSpaces = it_->word()->word->space();
    } else {
      // One blank between words, none before the first.
      numSpaces = (words_appended > 0) ? 1 : 0;
    }
    text->append(numSpaces, ' ');
    AppendUTF8WordText(text);
    words_appended++;
    if (BidiDebug(2)) {
      tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
    }
  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
  if (BidiDebug(1)) {
    tprintf("%d words printed\n", words_appended);
  }
  *text += line_separator_;
  // If we just finished a paragraph, add an extra newline.
  if (IsAtBeginningOf(RIL_PARA)) {
    *text += paragraph_separator_;
  }
}


void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {

  ResultIterator it(*this);

  it.RestartParagraph();

  it.MoveToLogicalStartOfTextline();

  if (it.Empty(RIL_WORD)) {

    return;

  }

  do {

    it.IterateAndAppendUTF8TextlineText(text);

  } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));

}


// True when bidi diagnostics at |min_level| are enabled, i.e. when the value
// of the "bidi_debug" integer parameter is at least |min_level|. The
// parameter is looked up on every call; a level of 1 is assumed when it
// cannot be found.
bool ResultIterator::BidiDebug(int min_level) const {
  auto *level_param = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
                                                      tesseract_->params()->int_params);
  const int debug_level = (level_param != nullptr) ? static_cast<int32_t>(*level_param) : 1;
  return min_level <= debug_level;
}


} // namespace tesseract.

resultiterator.h

tesseractclass.h

unicharset.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

pageres.h

p
const char * p
Definition: gmock-matchers_test.cc:4030

i
int i
Definition: gmock-matchers_test.cc:718

tesseract
Definition: baseapi.h:39

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::StrongScriptDirection
StrongScriptDirection
Definition: unichar.h:41

tesseract::DIR_MIX
@ DIR_MIX
Definition: unichar.h:45

tesseract::DIR_LEFT_TO_RIGHT
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43

tesseract::DIR_RIGHT_TO_LEFT
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44

tesseract::DIR_NEUTRAL
@ DIR_NEUTRAL
Definition: unichar.h:42

tesseract::GlobalParams
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:36

tesseract::PageIteratorLevel
PageIteratorLevel
Definition: publictypes.h:214

tesseract::RIL_BLOCK
@ RIL_BLOCK
Definition: publictypes.h:215

tesseract::RIL_PARA
@ RIL_PARA
Definition: publictypes.h:216

tesseract::RIL_TEXTLINE
@ RIL_TEXTLINE
Definition: publictypes.h:217

tesseract::RIL_SYMBOL
@ RIL_SYMBOL
Definition: publictypes.h:219

tesseract::RIL_WORD
@ RIL_WORD
Definition: publictypes.h:218

cpp.ast.next
def next(obj)
Definition: ast.py:56

tesseract::LTRResultIterator
Definition: ltrresultiterator.h:45

tesseract::LTRResultIterator::paragraph_separator_
const char * paragraph_separator_
Definition: ltrresultiterator.h:176

tesseract::LTRResultIterator::LTRResultIterator
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
Definition: ltrresultiterator.cpp:29

tesseract::LTRResultIterator::BlanksBeforeWord
int BlanksBeforeWord() const
Definition: ltrresultiterator.cpp:241

tesseract::LTRResultIterator::line_separator_
const char * line_separator_
Definition: ltrresultiterator.h:175

tesseract::PageIterator::RestartRow
virtual void RestartRow()
Definition: pageiterator.cpp:131

tesseract::PageIterator::Next
virtual bool Next(PageIteratorLevel level)
Definition: pageiterator.cpp:149

tesseract::PageIterator::it_
PAGE_RES_IT * it_
Definition: pageiterator.h:334

tesseract::PageIterator::word_length_
int word_length_
Definition: pageiterator.h:341

tesseract::PageIterator::Begin
virtual void Begin()
Definition: pageiterator.cpp:105

tesseract::PageIterator::IsWithinFirstTextlineOfParagraph
bool IsWithinFirstTextlineOfParagraph() const
Definition: pageiterator.cpp:125

tesseract::PageIterator::Empty
bool Empty(PageIteratorLevel level) const
Definition: pageiterator.cpp:373

tesseract::PageIterator::BeginWord
void BeginWord(int offset)
Definition: pageiterator.cpp:636

tesseract::PageIterator::blob_index_
int blob_index_
Definition: pageiterator.h:343

tesseract::PageIterator::tesseract_
Tesseract * tesseract_
Definition: pageiterator.h:329

tesseract::ResultIterator
Definition: resultiterator.h:32

tesseract::ResultIterator::CalculateTextlineOrder
static void CalculateTextlineOrder(bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
Definition: resultiterator.cpp:285

tesseract::ResultIterator::IsAtFinalElement
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
Definition: resultiterator.cpp:614

tesseract::ResultIterator::kMinorRunEnd
static const int kMinorRunEnd
Definition: resultiterator.h:135

tesseract::ResultIterator::ParagraphIsLtr
bool ParagraphIsLtr() const
Definition: resultiterator.cpp:56

tesseract::ResultIterator::kMinorRunStart
static const int kMinorRunStart
Definition: resultiterator.h:134

tesseract::ResultIterator::GetUTF8Text
virtual char * GetUTF8Text(PageIteratorLevel level) const
Definition: resultiterator.cpp:650

tesseract::ResultIterator::IsAtBeginningOf
bool IsAtBeginningOf(PageIteratorLevel level) const override
Definition: resultiterator.cpp:565

tesseract::ResultIterator::GetBestLSTMSymbolChoices
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
Definition: resultiterator.cpp:698

tesseract::ResultIterator::BlanksBeforeWord
int BlanksBeforeWord() const
Definition: resultiterator.cpp:639

tesseract::ResultIterator::Next
bool Next(PageIteratorLevel level) override
Definition: resultiterator.cpp:482

tesseract::ResultIterator::GetRawLSTMTimesteps
virtual std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * GetRawLSTMTimesteps() const
Definition: resultiterator.cpp:690

tesseract::ResultIterator::StartOfParagraph
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
Definition: resultiterator.cpp:52

tesseract::ResultIterator::Begin
void Begin() override
Definition: resultiterator.cpp:474

tesseract::ResultIterator::ResultIterator
ResultIterator(const LTRResultIterator &resit)
Definition: resultiterator.cpp:37

tesseract::ResultIterator::kComplexWord
static const int kComplexWord
Definition: resultiterator.h:136

tesseract::ROW::para
PARA * para() const
Definition: ocrrow.h:120

tesseract::ROW_RES::row
ROW * row
Definition: pageres.h:144

tesseract::WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:239

tesseract::WERD_RES::BestUTF8
const char * BestUTF8(unsigned blob_index, bool in_rtl_context) const
Definition: pageres.h:361

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::UnicharsInReadingOrder
bool UnicharsInReadingOrder() const
Definition: pageres.h:435

tesseract::WERD_RES::CTC_symbol_choices
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:224

tesseract::WERD_RES::SymbolDirection
UNICHARSET::Direction SymbolDirection(unsigned blob_index) const
Definition: pageres.h:387

tesseract::WERD_RES::segmented_timesteps
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:222

tesseract::PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:769

tesseract::PAGE_RES_IT::prev_block
BLOCK_RES * prev_block() const
Definition: pageres.h:760

tesseract::PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:763

tesseract::PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:757

tesseract::PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:766

tesseract::WERD::space
uint8_t space() const
Definition: werd.h:100

tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:53

tesseract::ParamsVectors::bool_params
std::vector< BoolParam * > bool_params
Definition: params.h:48

tesseract::ParamsVectors::int_params
std::vector< IntParam * > int_params
Definition: params.h:47

tesseract::UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR
@ U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:180

tesseract::UNICHARSET::U_COMMON_NUMBER_SEPARATOR
@ U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:182

tesseract::UNICHARSET::U_RIGHT_TO_LEFT
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:177

tesseract::UNICHARSET::U_OTHER_NEUTRAL
@ U_OTHER_NEUTRAL
Definition: unicharset.h:186

tesseract::UNICHARSET::U_EUROPEAN_NUMBER
@ U_EUROPEAN_NUMBER
Definition: unicharset.h:178

tesseract::UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR
@ U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:179

tesseract::UNICHARSET::U_LEFT_TO_RIGHT
@ U_LEFT_TO_RIGHT
Definition: unicharset.h:176