tesseract-ocr.github.io/5.3.3/a02240_source.html

// (C) Copyright 2017, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.


#include <string>


#include <allheaders.h>


#include "include_gunit.h"

#include "log.h" // for LOG

#include "serialis.h"

#include "tprintf.h"

#include "unicharcompress.h"


namespace tesseract {


// Test fixture that builds a UnicharCompress encoding from a unicharset file
// and provides helpers that validate the resulting encoding.
class UnicharcompressTest : public ::testing::Test {
protected:
  // Installs the environment's default locale as the global locale and makes
  // the temporary directory before each test.
  void SetUp() override {
    std::locale::global(std::locale(""));
    file::MakeTmpdir();
  }

  // Loads and compresses the given unicharset.
  // unicharset_name is a file name relative to TESTDATA_DIR. The
  // radical-stroke table is read from LANGDATA_DIR. As a side effect the
  // encoding of the null char is stored in encoded_null_char_ and a text dump
  // of the whole encoding is written to
  // FLAGS_test_tmpdir/<unicharset_name>.encoding.txt.
  void LoadUnicharset(const std::string &unicharset_name) {
    const std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
    const std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
    std::string radical_data;
    CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
    CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
    // If the unicharset has special codes, UNICHAR_BROKEN doubles as the null
    // char; otherwise the first id past the end of the unicharset is used.
    null_char_ = unicharset_.has_special_codes() ? static_cast<int>(UNICHAR_BROKEN)
                                                 : static_cast<int>(unicharset_.size());
    // A failed encoding would make every later expectation meaningless, so
    // stop here instead of silently ignoring the result.
    CHECK(compressed_.ComputeEncoding(unicharset_, null_char_, &radical_data));
    // Get the encoding of the null char.
    RecodedCharID code;
    compressed_.EncodeUnichar(null_char_, &code);
    encoded_null_char_ = code(0);
    const std::string output_name =
        file::JoinPath(FLAGS_test_tmpdir, unicharset_name) + ".encoding.txt";
    const std::string encoding = compressed_.GetEncodingAsString(unicharset_);
    CHECK_OK(file::SetContents(output_name, encoding, file::Defaults()));
    LOG(INFO) << "Wrote encoding to:" << output_name;
  }

  // Serializes and de-serializes compressed_ over itself.
  void SerializeAndUndo() {
    std::vector<char> data;
    TFile wfp;
    wfp.OpenWrite(&data);
    EXPECT_TRUE(compressed_.Serialize(&wfp));
    TFile rfp;
    // data.data() stays well-defined even if nothing was written, unlike
    // &data[0] on an empty vector.
    EXPECT_TRUE(rfp.Open(data.data(), data.size()));
    EXPECT_TRUE(compressed_.DeSerialize(&rfp));
  }

  // Returns true if the lang is in CJK.
  bool IsCJKLang(const std::string &lang) {
    return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || lang == "jpn";
  }

  // Returns true if the lang is Indic.
  bool IsIndicLang(const std::string &lang) {
    return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || lang == "mar" ||
           lang == "nep" || lang == "san" || lang == "bod" || lang == "dzo" || lang == "guj" ||
           lang == "kan" || lang == "mal" || lang == "ori" || lang == "pan" || lang == "sin" ||
           lang == "tam" || lang == "tel";
  }

  // Expects the appropriate results from compressed_ for unicharset_.
  // lang selects the compression ratio that the code range is held to.
  void ExpectCorrect(const std::string &lang) {
    // Count the number of times each code is used in each element of
    // RecodedCharID.
    RecodedCharID zeros;
    for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
      zeros.Set(i, 0);
    }
    const int code_range = compressed_.code_range();
    // Unichar ids are handled as int throughout the test, so convert the size
    // once instead of mixing signed and unsigned in every comparison.
    const int num_unichars = static_cast<int>(unicharset_.size());
    std::vector<RecodedCharID> times_seen(code_range, zeros);
    for (int u = 0; u <= num_unichars; ++u) {
      if (u != UNICHAR_SPACE && u != null_char_ &&
          (u == num_unichars ||
           (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) {
        continue; // Not used so not encoded.
      }
      RecodedCharID code;
      const int len = compressed_.EncodeUnichar(u, &code);
      // Check round-trip encoding.
      const int unichar_id = (u == null_char_ || u == num_unichars) ? null_char_ : u;
      EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
      // Check that the codes are valid.
      for (int i = 0; i < len; ++i) {
        const int code_val = code(i);
        EXPECT_GE(code_val, 0);
        EXPECT_LT(code_val, code_range);
        if (code_val < 0 || code_val >= code_range) {
          // Already reported above; skip the count rather than index
          // times_seen out of bounds.
          continue;
        }
        times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
      }
    }
    // Check that each code is used in at least one position.
    for (int c = 0; c < code_range; ++c) {
      int num_used = 0;
      for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
        if (times_seen[c](i) != 0) {
          ++num_used;
        }
      }
      EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
    }
    // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
    // and create valid codes.
    RecodedCharID code;
    CheckCodeExtensions(code, times_seen);
    // Finally, we achieved all that using a codebook < 10% of the size of
    // the original unicharset for Chinese, Korean or Indic, and < 20% for
    // Japanese, but just no bigger for all others.
    if (IsCJKLang(lang) || IsIndicLang(lang)) {
      EXPECT_LT(code_range, num_unichars / (lang == "jpn" ? 5 : 10));
    } else {
      EXPECT_LE(code_range, num_unichars + 1);
    }
    LOG(INFO) << "Compressed unicharset of " << num_unichars << " to " << code_range;
  }

  // Checks for extensions of the current code that either finish a code, or
  // extend it and checks those extensions recursively.
  // times_seen[c](i) must hold the number of encodings that use code c at
  // position i, as counted by ExpectCorrect.
  void CheckCodeExtensions(const RecodedCharID &code,
                           const std::vector<RecodedCharID> &times_seen) {
    RecodedCharID extended = code;
    const int length = code.length();
    const int num_codes = static_cast<int>(times_seen.size());
    const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
    if (final_codes != nullptr) {
      for (int ending : *final_codes) {
        EXPECT_GE(ending, 0);
        EXPECT_LT(ending, num_codes);
        if (ending < 0 || ending >= num_codes) {
          continue; // Reported above; avoid indexing out of bounds.
        }
        EXPECT_GT(times_seen[ending](length), 0);
        extended.Set(length, ending);
        const int unichar_id = compressed_.DecodeUnichar(extended);
        EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
      }
    }
    const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
    if (next_codes != nullptr) {
      for (int extension : *next_codes) {
        EXPECT_GE(extension, 0);
        EXPECT_LT(extension, num_codes);
        if (extension < 0 || extension >= num_codes) {
          continue; // Reported above; avoid indexing out of bounds.
        }
        EXPECT_GT(times_seen[extension](length), 0);
        extended.Set(length, extension);
        CheckCodeExtensions(extended, times_seen);
      }
    }
  }

  // The encoder under test.
  UnicharCompress compressed_;
  // The unicharset that compressed_ was computed from.
  UNICHARSET unicharset_;
  // The unichar-id used as the null char when computing the encoding.
  int null_char_ = 0;
  // The encoding of the null_char_.
  int encoded_null_char_ = 0;
};


// Runs the encoding checks on the traditional and then the simplified Chinese
// unicharsets, reusing the same fixture for both.
TEST_F(UnicharcompressTest, DoesChinese) {
  const std::string traditional = "chi_tra";
  LOG(INFO) << "Testing " << traditional;
  LoadUnicharset(traditional + ".unicharset");
  ExpectCorrect(traditional);

  const std::string simplified = "chi_sim";
  LOG(INFO) << "Testing " << simplified;
  LoadUnicharset(simplified + ".unicharset");
  ExpectCorrect(simplified);
}


// Runs the encoding checks on the Japanese unicharset.
TEST_F(UnicharcompressTest, DoesJapanese) {
  const std::string lang = "jpn";
  LOG(INFO) << "Testing " << lang;
  LoadUnicharset(lang + ".unicharset");
  ExpectCorrect(lang);
}


// Runs the encoding checks on the Korean unicharset.
TEST_F(UnicharcompressTest, DoesKorean) {
  const std::string lang = "kor";
  LOG(INFO) << "Testing " << lang;
  LoadUnicharset(lang + ".unicharset");
  ExpectCorrect(lang);
}


// Runs the encoding checks on the Kannada unicharset, then repeats them after
// the encoder has been serialized and de-serialized over itself.
TEST_F(UnicharcompressTest, DoesKannada) {
  const std::string lang = "kan";
  LOG(INFO) << "Testing " << lang;
  LoadUnicharset(lang + ".unicharset");
  ExpectCorrect(lang);
  // The same expectations must still hold after a serialization round trip.
  SerializeAndUndo();
  ExpectCorrect(lang);
}


// Runs the encoding checks on the Marathi unicharset.
TEST_F(UnicharcompressTest, DoesMarathi) {
  const std::string lang = "mar";
  LOG(INFO) << "Testing " << lang;
  LoadUnicharset(lang + ".unicharset");
  ExpectCorrect(lang);
}


// Runs the encoding checks on the English unicharset.
TEST_F(UnicharcompressTest, DoesEnglish) {
  const std::string lang = "eng";
  LOG(INFO) << "Testing " << lang;
  LoadUnicharset(lang + ".unicharset");
  ExpectCorrect(lang);
}


// Tests that a unicharset that contains double-letter ligatures (eg ff) has
// no null char in the encoding at all.
TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
  LOG(INFO) << "Testing por with ligatures";
  LoadUnicharset("por.unicharset");
  ExpectCorrect("por");
  // Check that no unichar-id that is encoded with multiple codes contains
  // encoded_null_char_ anywhere in its code.
  // Convert the size once so the loop does not compare int against size_t.
  const int num_unichars = static_cast<int>(unicharset_.size());
  for (int u = 0; u <= num_unichars; ++u) {
    RecodedCharID code;
    const int len = compressed_.EncodeUnichar(u, &code);
    if (len > 1) {
      // There should not be any null char in the code.
      for (int i = 0; i < len; ++i) {
        EXPECT_NE(encoded_null_char_, code(i));
      }
    }
  }
}


// Tests that GetEncodingAsString returns the right result for a trivial
// unicharset.
TEST_F(UnicharcompressTest, GetEncodingAsString) {
  LoadUnicharset("trivial.unicharset");
  ExpectCorrect("trivial");
  const std::string encoding = compressed_.GetEncodingAsString(unicharset_);
  const std::vector<std::string> lines = split(encoding, '\n');
  // This must be a fatal assertion: the checks below index lines[0..4] and
  // would read out of bounds if fewer lines came back.
  ASSERT_EQ(5u, lines.size());
  // The first line is always space.
  EXPECT_EQ("0\t ", lines[0]);
  // Next we have i.
  EXPECT_EQ("1\ti", lines[1]);
  // Next we have f.
  EXPECT_EQ("2\tf", lines[2]);
  // Next we have the fi ligature: ﬁ. There are no nulls in it, as there are no
  // repeated letter ligatures in this unicharset, unlike por.unicharset above.
  EXPECT_EQ("2,1\tﬁ", lines[3]);
  // Finally the null character.
  EXPECT_EQ("3\t<nul>", lines[4]);
}


} // namespace tesseract

serialis.h

tprintf.h

unicharcompress.h

LOG
@ LOG
Definition: cleanapi_test.cc:19

log.h

INFO
@ INFO
Definition: log.h:28

i
int i
Definition: gmock-matchers_test.cc:718

EXPECT_EQ
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043

EXPECT_NE
#define EXPECT_NE(val1, val2)
Definition: gtest.h:2045

EXPECT_GT
#define EXPECT_GT(val1, val2)
Definition: gtest.h:2053

EXPECT_GE
#define EXPECT_GE(val1, val2)
Definition: gtest.h:2051

EXPECT_TRUE
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982

EXPECT_LE
#define EXPECT_LE(val1, val2)
Definition: gtest.h:2047

EXPECT_LT
#define EXPECT_LT(val1, val2)
Definition: gtest.h:2049

include_gunit.h

CHECK
#define CHECK(condition)
Definition: include_gunit.h:76

CHECK_OK
#define CHECK_OK(test)
Definition: include_gunit.h:84

tesseract
Definition: baseapi.h:39

tesseract::UNICHAR_SPACE
@ UNICHAR_SPACE
Definition: unicharset.h:36

tesseract::UNICHAR_BROKEN
@ UNICHAR_BROKEN
Definition: unicharset.h:38

tesseract::SPECIAL_UNICHAR_CODES_COUNT
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40

tesseract::split
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43

tesseract::TEST_F
TEST_F(EuroText, FastLatinOCR)
Definition: apiexample_test.cc:105

tesseract::TFile
Definition: serialis.h:61

tesseract::TFile::OpenWrite
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246

tesseract::TFile::Open
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140

tesseract::RecodedCharID
Definition: unicharcompress.h:32

tesseract::RecodedCharID::length
int length() const
Definition: unicharcompress.h:62

tesseract::RecodedCharID::Set
void Set(int index, int value)
Definition: unicharcompress.h:44

tesseract::RecodedCharID::kMaxCodeLen
static const int kMaxCodeLen
Definition: unicharcompress.h:35

tesseract::UnicharCompress
Definition: unicharcompress.h:139

tesseract::UnicharCompress::EncodeUnichar
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
Definition: unicharcompress.cpp:295

tesseract::UnicharCompress::DeSerialize
bool DeSerialize(TFile *fp)
Definition: unicharcompress.cpp:323

tesseract::UnicharCompress::GetEncodingAsString
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
Definition: unicharcompress.cpp:339

tesseract::UnicharCompress::GetFinalCodes
const std::vector< int > * GetFinalCodes(const RecodedCharID &code) const
Definition: unicharcompress.h:193

tesseract::UnicharCompress::code_range
int code_range() const
Definition: unicharcompress.h:171

tesseract::UnicharCompress::GetNextCodes
const std::vector< int > * GetNextCodes(const RecodedCharID &code) const
Definition: unicharcompress.h:187

tesseract::UnicharCompress::DecodeUnichar
int DecodeUnichar(const RecodedCharID &code) const
Definition: unicharcompress.cpp:305

tesseract::UnicharCompress::ComputeEncoding
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
Definition: unicharcompress.cpp:109

tesseract::UnicharCompress::Serialize
bool Serialize(TFile *fp) const
Definition: unicharcompress.cpp:318

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:756

tesseract::UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391

tesseract::UNICHARSET::size
size_t size() const
Definition: unicharset.h:355

file::Defaults
static int Defaults()
Definition: include_gunit.h:61

file::MakeTmpdir
static void MakeTmpdir()
Definition: include_gunit.h:38

file::JoinPath
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65

file::SetContents
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56

file::GetContents
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52

testing::Test
Definition: gtest.h:414

tesseract::UnicharcompressTest
Definition: unicharcompress_test.cc:24

tesseract::UnicharcompressTest::compressed_
UnicharCompress compressed_
Definition: unicharcompress_test.cc:158

tesseract::UnicharcompressTest::ExpectCorrect
void ExpectCorrect(const std::string &lang)
Definition: unicharcompress_test.cc:75

tesseract::UnicharcompressTest::SerializeAndUndo
void SerializeAndUndo()
Definition: unicharcompress_test.cc:53

tesseract::UnicharcompressTest::IsIndicLang
bool IsIndicLang(const std::string &lang)
Definition: unicharcompress_test.cc:67

tesseract::UnicharcompressTest::CheckCodeExtensions
void CheckCodeExtensions(const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)
Definition: unicharcompress_test.cc:135

tesseract::UnicharcompressTest::IsCJKLang
bool IsCJKLang(const std::string &lang)
Definition: unicharcompress_test.cc:63

tesseract::UnicharcompressTest::LoadUnicharset
void LoadUnicharset(const std::string &unicharset_name)
Definition: unicharcompress_test.cc:32

tesseract::UnicharcompressTest::SetUp
void SetUp() override
Definition: unicharcompress_test.cc:26

tesseract::UnicharcompressTest::null_char_
int null_char_
Definition: unicharcompress_test.cc:160

tesseract::UnicharcompressTest::unicharset_
UNICHARSET unicharset_
Definition: unicharcompress_test.cc:159

tesseract::UnicharcompressTest::encoded_null_char_
int encoded_null_char_
Definition: unicharcompress_test.cc:162