tesseract-ocr.github.io/5.3.3/a00995_source.html

/**********************************************************************

 * File:        validator.h

 * Description: Base class for various text validators. Intended mainly for

 *              scripts that use a virama character.

 * Author:      Ray Smith

 *

 * (C) Copyright 2017, Google Inc.

 * Licensed under the Apache License, Version 2.0 (the "License");

 * you may not use this file except in compliance with the License.

 * You may obtain a copy of the License at

 * http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 *

 **********************************************************************/


#ifndef TESSERACT_TRAINING_VALIDATOR_H_

#define TESSERACT_TRAINING_VALIDATOR_H_


#include "export.h"


#include <tesseract/unichar.h>


#include <memory>

#include <vector>


namespace tesseract {


// Selects how validated text is grouped into output units. This is not just
// for Indic scripts: in Indic a grapheme is a syllable unit that can span
// several unicodes, whereas in other scripts it is a base character combined
// with its accents/diacritics, since not every accented character has a
// single composed form.
enum class GraphemeNormMode : int {
  // The validation result is one single string, even for multi-word input.
  kSingleString = 0,
  // Standard unicode graphemes are validated and emitted one grapheme per unit.
  kCombined = 1,
  // Graphemes are validated and then sub-divided. In virama-using scripts the
  // generated units correspond to repeatable glyphs: mostly single unicodes,
  // with viramas and joiners paired with the most sensible neighbor. In
  // non-virama scripts base/accent pairs are separated, ie the output is
  // individual unicodes.
  kGlyphSplit = 2,
  // Every output unit is a single unicode, regardless of the script.
  kIndividualUnicodes = 3,
};


// The scripts that use a virama character. With the exception of kNonVirama,
// the value of every enumerator can be cast to the unicode (char32) value at
// which the unicode range of the corresponding script starts.
enum class ViramaScript : char32 {
  kNonVirama = 0x0000,
  kDevanagari = 0x0900,
  kBengali = 0x0980,
  kGurmukhi = 0x0a00,
  kGujarati = 0x0a80,
  kOriya = 0x0b00,
  kTamil = 0x0b80,
  kTelugu = 0x0c00,
  kKannada = 0x0c80,
  kMalayalam = 0x0d00,
  kSinhala = 0x0d80,
  kMyanmar = 0x1000,
  kKhmer = 0x1780,
  kJavanese = 0xa980,
};


// Base class offers a validation API and protected methods to allow subclasses

// to easily build the validated/segmented output.

class TESS_UNICHARSET_TRAINING_API Validator {

public:

  // Validates and cleans the src vector of unicodes to the *dest, according to

  // g_mode. In the case of kSingleString, a single vector containing the whole

  // result is added to *dest. With kCombined, multiple vectors are added to

  // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are

  // added to *dest with a smaller unit representing a glyph in each.

  // In case of validation error, returns false and as much as possible of the

  // input, without discarding invalid text.

  static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,

                                      const std::vector<char32> &src,

                                      std::vector<std::vector<char32>> *dest);


  // Returns true if the unicode ch is a non-printing zero-width mark of no

  // significance to OCR training or evaluation.

  static bool IsZeroWidthMark(char32 ch) {

    return ch == kZeroWidthSpace || ch == kLeftToRightMark || ch == kRightToLeftMark ||

           ch == kInvalid;

  }

  virtual ~Validator();


  // Some specific but universally useful unicodes.

  static const char32 kZeroWidthSpace;

  static const char32 kZeroWidthNonJoiner;

  static const char32 kZeroWidthJoiner;

  static const char32 kLeftToRightMark;

  static const char32 kRightToLeftMark;

  static const char32 kInvalid;


protected:

  // These are more or less the character class identifiers in the ISCII

  // standard, section 8.  They have been augmented with the Unicode meta

  // characters Zero Width Joiner and Zero Width Non Joiner, and the

  // Unicode Vedic Marks.

  // The best sources of information on Unicode and Indic scripts are:

  //   http://varamozhi.sourceforge.net/iscii91.pdf

  //   http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf

  //   http://unicode.org/faq/indic.html

  //   http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx

  enum class CharClass {

    // NOTE: The values of the enum members are meaningless and arbitrary, ie

    // they are not used for sorting, or any other risky application.

    // The reason they are what they are is they are a single character

    // abbreviation that can be used in a regexp/BNF definition of a grammar,

    // IN A COMMENT, and still not relied upon in the code.

    kConsonant = 'C',

    kVowel = 'V',

    kVirama = 'H',             // (aka Halant)

    kMatra = 'M',              // (aka Dependent Vowel)

    kMatraPiece = 'P',         // unicode provides pieces of Matras.

    kVowelModifier = 'D',      // (candrabindu, anusvara, visarga, other marks)

    kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C

    kZeroWidthJoiner = 'Z',    // Unicode Zero Width Joiner U+200D

    kVedicMark = 'v',          // Modifiers can come modify any indic syllable.

    kNukta = 'N',              // Occurs only immediately after consonants.

    kRobat = 'R',              // Khmer only.

    kOther = 'O',              // (digits, measures, non-Indic, etc)

    // Additional classes used only by ValidateGrapheme.

    kWhitespace = ' ',

    kCombiner = 'c', // Combiners other than virama.

  };

  using IndicPair = std::pair<CharClass, char32>;


  Validator(ViramaScript script, bool report_errors)

      : script_(script), codes_used_(0), output_used_(0), report_errors_(report_errors) {}


  // Factory method that understands how to map script to the right subclass.

  static std::unique_ptr<Validator> ScriptValidator(ViramaScript script, bool report_errors);


  // Internal version of the public static ValidateCleanAndSegment.

  // Validates and cleans the src vector of unicodes to the *dest, according to

  // its type and the given g_mode.

  // In case of validation error, returns false and returns as much as possible

  // of the input, without discarding invalid text.

  bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector<char32> &src,

                                       std::vector<std::vector<char32>> *dest);

  // Moves the results from parts_ or output_ to dest according to g_mode.

  void MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest);


  // Computes and returns the ViramaScript corresponding to the most frequent

  // virama-using script in the input, or kNonVirama if none are present.

  static ViramaScript MostFrequentViramaScript(const std::vector<char32> &utf32);

  // Returns true if the given UTF-32 unicode is a "virama" character.

  static bool IsVirama(char32 unicode);

  // Returns true if the given UTF-32 unicode is a vedic accent.

  static bool IsVedicAccent(char32 unicode);

  // Returns true if the script is one that uses subscripts for conjuncts.

  bool IsSubscriptScript() const;


  // Helper function appends the next element of codes_ only to output_,

  // without touching parts_

  // Returns true at the end of codes_.

  bool CodeOnlyToOutput() {

    output_.push_back(codes_[codes_used_].second);

    return ++codes_used_ == codes_.size();

  }


  // Helper function adds a length-element vector to parts_ from the last length

  // elements of output_. If there are more than length unused elements in

  // output_, adds unicodes as single-element vectors to parts_ to catch

  // output_used_ up to output->size() - length before adding the length-element

  // vector.

  void MultiCodePart(unsigned length) {

    while (output_used_ + length < output_.size()) {

      parts_.emplace_back(std::initializer_list<char32>{output_[output_used_++]});

    }

    parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});

    while (++output_used_ < output_.size()) {

      parts_.back().push_back(output_[output_used_]);

    }

  }


  // Helper function appends the next element of codes_ to output_, and then

  // calls MultiCodePart to add the appropriate components to parts_.

  // Returns true at the end of codes_.

  bool UseMultiCode(unsigned length) {

    output_.push_back(codes_[codes_used_].second);

    MultiCodePart(length);

    return ++codes_used_ == codes_.size();

  }


  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to

  // parts_ and output_. Returns true if a valid Grapheme was consumed,

  // otherwise does not increment codes_used_.

  virtual bool ConsumeGraphemeIfValid() = 0;

  // Sets codes_ to the class codes for the given unicode text.

  void ComputeClassCodes(const std::vector<char32> &text);

  // Returns the CharClass corresponding to the given Unicode ch.

  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;

  // Resets to the initial state.

  void Clear();


  // Number of unicodes in each Indic codepage.

  static const int kIndicCodePageSize = 128;

  // Lowest unicode value of any Indic script. (Devanagari).

  static const char32 kMinIndicUnicode = 0x900;

  // Highest unicode value of any consistent (ISCII-based) Indic script.

  static const char32 kMaxSinhalaUnicode = 0xdff;

  // Highest unicode value of any virama-using script. (Khmer).

  static const char32 kMaxViramaScriptUnicode = 0x17ff;

  // Some special unicodes.

  static const char32 kSinhalaVirama = 0xdca;

  static const char32 kMyanmarVirama = 0x1039;

  static const char32 kKhmerVirama = 0x17d2;

  // Javanese Script - aksarajawa

  static const char32 kJavaneseVirama = 0xa9c0;

  static const char32 kMaxJavaneseUnicode = 0xa9df;


  // Script we are operating on.

  ViramaScript script_;

  // Input unicodes with assigned CharClass is the data to be validated.

  std::vector<IndicPair> codes_;

  // Glyph-like components of the input.

  std::vector<std::vector<char32>> parts_;

  // Copied validated unicodes from codes_ that are OK to output.

  std::vector<char32> output_;

  // The number of elements of codes_ that have been processed so far.

  unsigned codes_used_;

  // The number of elements of output_ that have already been added to parts_.

  unsigned output_used_;

  // Log error messages for reasons why text is invalid.

  bool report_errors_;

};


} // namespace tesseract


#endif // TESSERACT_TRAINING_VALIDATOR_H_

unichar.h

ch
char ch
Definition: gmock-matchers_test.cc:4035

tesseract
Definition: baseapi.h:39

tesseract::GraphemeNormMode
GraphemeNormMode
Definition: validator.h:36

tesseract::GraphemeNormMode::kGlyphSplit
@ kGlyphSplit

tesseract::GraphemeNormMode::kIndividualUnicodes
@ kIndividualUnicodes

tesseract::GraphemeNormMode::kSingleString
@ kSingleString

tesseract::GraphemeNormMode::kCombined
@ kCombined

tesseract::char32
signed int char32
Definition: unichar.h:49

tesseract::ViramaScript
ViramaScript
Definition: validator.h:55

tesseract::ViramaScript::kOriya
@ kOriya

tesseract::ViramaScript::kDevanagari
@ kDevanagari

tesseract::ViramaScript::kGurmukhi
@ kGurmukhi

tesseract::ViramaScript::kMalayalam
@ kMalayalam

tesseract::ViramaScript::kTelugu
@ kTelugu

tesseract::ViramaScript::kTamil
@ kTamil

tesseract::ViramaScript::kJavanese
@ kJavanese

tesseract::ViramaScript::kGujarati
@ kGujarati

tesseract::ViramaScript::kMyanmar
@ kMyanmar

tesseract::ViramaScript::kNonVirama
@ kNonVirama

tesseract::ViramaScript::kKhmer
@ kKhmer

tesseract::ViramaScript::kKannada
@ kKannada

tesseract::ViramaScript::kBengali
@ kBengali

tesseract::ViramaScript::kSinhala
@ kSinhala

testing::internal::kOther
@ kOther
Definition: gmock-internal-utils.h:101

upload.dest
dest
Definition: upload.py:409

tesseract::Validator
Definition: validator.h:74

tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97

tesseract::Validator::script_
ViramaScript script_
Definition: validator.h:223

tesseract::Validator::UnicodeToCharClass
virtual CharClass UnicodeToCharClass(char32 ch) const =0

tesseract::Validator::output_
std::vector< char32 > output_
Definition: validator.h:229

tesseract::Validator::kInvalid
static const char32 kInvalid
Definition: validator.h:101

tesseract::Validator::kRightToLeftMark
static const char32 kRightToLeftMark
Definition: validator.h:100

tesseract::Validator::output_used_
unsigned output_used_
Definition: validator.h:233

tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:231

tesseract::Validator::UseMultiCode
bool UseMultiCode(unsigned length)
Definition: validator.h:189

tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:166

tesseract::Validator::~Validator
virtual ~Validator()

tesseract::Validator::MultiCodePart
void MultiCodePart(unsigned length)
Definition: validator.h:176

tesseract::Validator::kLeftToRightMark
static const char32 kLeftToRightMark
Definition: validator.h:99

tesseract::Validator::CharClass
CharClass
Definition: validator.h:113

tesseract::Validator::IndicPair
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:135

tesseract::Validator::kZeroWidthSpace
static const char32 kZeroWidthSpace
Definition: validator.h:96

tesseract::Validator::IsZeroWidthMark
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:89

tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:225

tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:98

tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:235

tesseract::Validator::Validator
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:137

tesseract::Validator::parts_
std::vector< std::vector< char32 > > parts_
Definition: validator.h:227

tesseract::Validator::ConsumeGraphemeIfValid
virtual bool ConsumeGraphemeIfValid()=0

export.h