Inheritance diagram for tesseract::UnicharcompressTest:

Protected Member Functions
void	SetUp () override

void	LoadUnicharset (const std::string &unicharset_name)

void	SerializeAndUndo ()

bool	IsCJKLang (const std::string &lang)

bool	IsIndicLang (const std::string &lang)

void	ExpectCorrect (const std::string &lang)

void	CheckCodeExtensions (const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)

Protected Member Functions inherited from testing::Test
	Test ()

virtual void	SetUp ()

virtual void	TearDown ()

Protected Attributes
UnicharCompress	compressed_

UNICHARSET	unicharset_

int	null_char_

int	encoded_null_char_

Additional Inherited Members
Public Member Functions inherited from testing::Test
virtual	~Test ()

Static Public Member Functions inherited from testing::Test
static void	SetUpTestSuite ()

static void	TearDownTestSuite ()

static void	TearDownTestCase ()

static void	SetUpTestCase ()

static bool	HasFatalFailure ()

static bool	HasNonfatalFailure ()

static bool	IsSkipped ()

static bool	HasFailure ()

static void	RecordProperty (const std::string &key, const std::string &value)

static void	RecordProperty (const std::string &key, int value)

Detailed Description

Definition at line 24 of file unicharcompress_test.cc.

Member Function Documentation

◆ CheckCodeExtensions()

void tesseract::UnicharcompressTest::CheckCodeExtensions	(	const RecodedCharID &	code,
		const std::vector< RecodedCharID > &	times_seen
	)

inline protected

Definition at line 135 of file unicharcompress_test.cc.

                                                                       {
    RecodedCharID extended = code;
    int length = code.length();
    const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
    if (final_codes != nullptr) {
      for (int ending : *final_codes) {
        EXPECT_GT(times_seen[ending](length), 0);
        extended.Set(length, ending);
        int unichar_id = compressed_.DecodeUnichar(extended);
        EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
      }
    }
    const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
    if (next_codes != nullptr) {
      for (int extension : *next_codes) {
        EXPECT_GT(times_seen[extension](length), 0);
        extended.Set(length, extension);
        CheckCodeExtensions(extended, times_seen);
      }
    }
  }

◆ ExpectCorrect()

void tesseract::UnicharcompressTest::ExpectCorrect ( const std::string & lang )

inline protected

Definition at line 75 of file unicharcompress_test.cc.

                                            {
    // Count the number of times each code is used in each element of
    // RecodedCharID.
    RecodedCharID zeros;
    for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
      zeros.Set(i, 0);
    }
    int code_range = compressed_.code_range();
    std::vector<RecodedCharID> times_seen(code_range, zeros);
    for (int u = 0; u <= unicharset_.size(); ++u) {
      if (u != UNICHAR_SPACE && u != null_char_ &&
          (u == unicharset_.size() ||
           (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) {
        continue; // Not used so not encoded.
      }
      RecodedCharID code;
      int len = compressed_.EncodeUnichar(u, &code);
      // Check round-trip encoding.
      int unichar_id;
      std::vector<UNICHAR_ID> normed_ids;
      if (u == null_char_ || u == unicharset_.size()) {
        unichar_id = null_char_;
      } else {
        unichar_id = u;
      }
      EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
      // Check that the codes are valid.
      for (int i = 0; i < len; ++i) {
        int code_val = code(i);
        EXPECT_GE(code_val, 0);
        EXPECT_LT(code_val, code_range);
        times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
      }
    }
    // Check that each code is used in at least one position.
    for (int c = 0; c < code_range; ++c) {
      int num_used = 0;
      for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
        if (times_seen[c](i) != 0) {
          ++num_used;
        }
      }
      EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
    }
    // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
    // and create valid codes.
    RecodedCharID code;
    CheckCodeExtensions(code, times_seen);
    // Finally, we achieved all that using a codebook < 10% of the size of
    // the original unicharset, for CK or Indic, and 20% with J, but just
    // no bigger for all others.
    if (IsCJKLang(lang) || IsIndicLang(lang)) {
      EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
    } else {
      EXPECT_LE(code_range, unicharset_.size() + 1);
    }
    LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " << code_range;
  }

◆ IsCJKLang()

bool tesseract::UnicharcompressTest::IsCJKLang ( const std::string & lang )

inline protected

Definition at line 63 of file unicharcompress_test.cc.

                                        {
    return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || lang == "jpn";
  }

◆ IsIndicLang()

bool tesseract::UnicharcompressTest::IsIndicLang ( const std::string & lang )

inline protected

Definition at line 67 of file unicharcompress_test.cc.

                                          {
    return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || lang == "mar" ||
           lang == "nep" || lang == "san" || lang == "bod" || lang == "dzo" || lang == "guj" ||
           lang == "kan" || lang == "mal" || lang == "ori" || lang == "pan" || lang == "sin" ||
           lang == "tam" || lang == "tel";
  }

◆ LoadUnicharset()

void tesseract::UnicharcompressTest::LoadUnicharset ( const std::string & unicharset_name )

inline protected

Definition at line 32 of file unicharcompress_test.cc.

                                                        {
    std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
    std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
    std::string radical_data;
    CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
    CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
    std::string radical_str(radical_data.c_str());
    null_char_ = unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size();
    compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
    // Get the encoding of the null char.
    RecodedCharID code;
    compressed_.EncodeUnichar(null_char_, &code);
    encoded_null_char_ = code(0);
    std::string output_name =
        file::JoinPath(FLAGS_test_tmpdir, unicharset_name) + ".encoding.txt";
    std::string encoding = compressed_.GetEncodingAsString(unicharset_);
    std::string encoding_str(&encoding[0], encoding.size());
    CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
    LOG(INFO) << "Wrote encoding to:" << output_name;
  }

◆ SerializeAndUndo()

void tesseract::UnicharcompressTest::SerializeAndUndo ( )

inline protected

Definition at line 53 of file unicharcompress_test.cc.

                          {
    // Round-trips compressed_ through an in-memory buffer: serializes it into
    // `data`, then deserializes it back from the same bytes, expecting both
    // steps to succeed.
    std::vector<char> data;
    TFile wfp;
    wfp.OpenWrite(&data);
    EXPECT_TRUE(compressed_.Serialize(&wfp));
    TFile rfp;
    // Use data.data() rather than &data[0]: indexing an empty vector is
    // undefined behaviour, and the buffer is empty if Serialize wrote nothing.
    rfp.Open(data.data(), data.size());
    EXPECT_TRUE(compressed_.DeSerialize(&rfp));
  }

◆ SetUp()

void tesseract::UnicharcompressTest::SetUp ( )

inline override protected virtual

Reimplemented from testing::Test.

Definition at line 26 of file unicharcompress_test.cc.

                        {
    // Per-test setup: install the locale named "" as the global C++ locale,
    // then create the temporary directory used by the tests.
    const std::locale user_locale("");
    std::locale::global(user_locale);
    file::MakeTmpdir();
  }

Member Data Documentation

◆ compressed_

UnicharCompress tesseract::UnicharcompressTest::compressed_

protected

Definition at line 158 of file unicharcompress_test.cc.

◆ encoded_null_char_

int tesseract::UnicharcompressTest::encoded_null_char_

protected

Definition at line 162 of file unicharcompress_test.cc.

◆ null_char_

int tesseract::UnicharcompressTest::null_char_

protected

Definition at line 160 of file unicharcompress_test.cc.

◆ unicharset_

UNICHARSET tesseract::UnicharcompressTest::unicharset_

protected

Definition at line 159 of file unicharcompress_test.cc.

The documentation for this class was generated from the following file:

/media/home/debian/src/github/tesseract-ocr/tesseract/unittest/unicharcompress_test.cc

Protected Member Functions

Protected Attributes

Additional Inherited Members

Detailed Description

Member Function Documentation

◆ CheckCodeExtensions()

◆ ExpectCorrect()

◆ IsCJKLang()

◆ IsIndicLang()

◆ LoadUnicharset()

◆ SerializeAndUndo()

◆ SetUp()

Member Data Documentation

◆ compressed_

◆ encoded_null_char_

◆ null_char_

◆ unicharset_