tesseract v5.3.3.20231005
tesseract::RecodeBeamTest Class Reference
Inheritance diagram for tesseract::RecodeBeamTest:
testing::Test

Protected Member Functions

void SetUp () override
 
 RecodeBeamTest ()
 
 ~RecodeBeamTest () override
 
void LoadUnicharset (const std::string &unicharset_name)
 
void LoadDict (const std::string &lang)
 
void ExpectCorrect (const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)
 
void ExpectCorrect (const GENERIC_2D_ARRAY< float > &output, const std::string &truth_utf8, Dict *dict, PointerVector< WERD_RES > *words)
 
GENERIC_2D_ARRAY< float > GenerateRandomPaddedOutputs (const std::vector< int > &unichar_ids, int padding)
 
int EncodeUTF8 (const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)
 
GENERIC_2D_ARRAY< float > GenerateSyntheticOutputs (const char *chars1[], const float scores1[], const char *chars2[], const float scores2[], TRand *random)
 
- Protected Member Functions inherited from testing::Test
 Test ()
 
virtual void SetUp ()
 
virtual void TearDown ()
 

Protected Attributes

UnicharCompress recoder_
 
int unichar_null_char_ = 0
 
int encoded_null_char_ = 0
 
CCUtil ccutil_
 
Dict lstm_dict_
 

Additional Inherited Members

- Public Member Functions inherited from testing::Test
virtual ~Test ()
 
- Static Public Member Functions inherited from testing::Test
static void SetUpTestSuite ()
 
static void TearDownTestSuite ()
 
static void TearDownTestCase ()
 
static void SetUpTestCase ()
 
static bool HasFatalFailure ()
 
static bool HasNonfatalFailure ()
 
static bool IsSkipped ()
 
static bool HasFailure ()
 
static void RecordProperty (const std::string &key, const std::string &value)
 
static void RecordProperty (const std::string &key, int value)
 

Detailed Description

Definition at line 58 of file recodebeam_test.cc.

Constructor & Destructor Documentation

◆ RecodeBeamTest()

tesseract::RecodeBeamTest::RecodeBeamTest ( )
inline protected

◆ ~RecodeBeamTest()

tesseract::RecodeBeamTest::~RecodeBeamTest ( )
inline override protected

Definition at line 66 of file recodebeam_test.cc.

66 {
67 lstm_dict_.End();
68 }
void End()
Definition: dict.cpp:379

Member Function Documentation

◆ EncodeUTF8()

int tesseract::RecodeBeamTest::EncodeUTF8 ( const char *  utf8_str,
float  score,
int  start_t,
TRand *  random,
GENERIC_2D_ARRAY< float > *  outputs 
)
inline protected

Definition at line 244 of file recodebeam_test.cc.

245 {
246 int t = start_t;
247 std::vector<int> unichar_ids;
248 EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
249 if (unichar_ids.empty() || utf8_str[0] == '\0') {
250 unichar_ids.clear();
251 unichar_ids.push_back(unichar_null_char_);
252 }
253 int num_ids = unichar_ids.size();
254 for (int u = 0; u < num_ids; ++u) {
255 RecodedCharID code;
256 int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
257 EXPECT_NE(0, len);
258 for (int i = 0; i < len; ++i) {
259 // Apply the desired score.
260 (*outputs)(t++, code(i)) = score;
261 if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
262 int dups = static_cast<int>(random->UnsignedRand(3.0));
263 for (int d = 0; d < dups; ++d) {
264 // Duplicate the desired score.
265 (*outputs)(t++, code(i)) = score;
266 }
267 }
268 }
269 if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
270 int dups = static_cast<int>(random->UnsignedRand(3.0));
271 for (int d = 0; d < dups; ++d) {
272 // Add a random number of nulls as well.
273 (*outputs)(t++, encoded_null_char_) = score;
274 }
275 }
276 }
277 return t;
278 }
#define EXPECT_NE(val1, val2)
Definition: gtest.h:2045
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
UNICHARSET unicharset
Definition: ccutil.h:61
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ ExpectCorrect() [1/2]

void tesseract::RecodeBeamTest::ExpectCorrect ( const GENERIC_2D_ARRAY< float > &  output,
const std::string &  truth_utf8,
Dict *  dict,
PointerVector< WERD_RES > *  words 
)
inline protected

Definition at line 115 of file recodebeam_test.cc.

116 {
117 RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
118 beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
119 // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
120 // beam_search.DebugBeams(ccutil_.unicharset);
121 std::vector<int> labels, xcoords;
122 beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
123 LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
124 // Now decode using recoder_.
125 std::string decoded;
126 int end = 1;
127 for (unsigned start = 0; start < labels.size(); start = end) {
128 RecodedCharID code;
129 unsigned index = start;
130 int uni_id = INVALID_UNICHAR_ID;
131 do {
132 code.Set(code.length(), labels[index++]);
133 uni_id = recoder_.DecodeUnichar(code);
134 } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
135 (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
136 EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
137 // To the extent of truth_utf8, we expect decoded to match, but if
138 // transcription is shorter, that is OK too, as we may just be testing
139 // that we get a valid sequence when padded with random data.
140 if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
141 decoded += ccutil_.unicharset.id_to_unichar(uni_id);
142 }
143 end = index;
144 }
145 EXPECT_EQ(truth_utf8, decoded);
146
147 // Check that ExtractBestPathAsUnicharIds does the same thing.
148 std::vector<int> unichar_ids;
149 std::vector<float> certainties, ratings;
150 beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
151 &ratings, &xcoords);
152 std::string u_decoded;
153 float total_rating = 0.0f;
154 for (unsigned u = 0; u < unichar_ids.size(); ++u) {
155 // To the extent of truth_utf8, we expect decoded to match, but if
156 // transcription is shorter, that is OK too, as we may just be testing
157 // that we get a valid sequence when padded with random data.
158 if (u_decoded.size() < truth_utf8.size()) {
159 const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
160 total_rating += ratings[u];
161 LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
162 << certainties[u] << ", r=" << ratings[u] << "r_sum="
163 << total_rating << " @" << xcoords[u] << "\n";
164 if (str[0] == ' ') {
165 total_rating = 0.0f;
166 }
167 u_decoded += str;
168 }
169 }
170 EXPECT_EQ(truth_utf8, u_decoded);
171
172 // Check that ExtractBestPathAsWords does the same thing.
173 TBOX line_box(0, 0, 100, 10);
174 for (int i = 0; i < 2; ++i) {
175 beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
176 std::string w_decoded;
177 for (int w = 0; w < words->size(); ++w) {
178 const WERD_RES *word = (*words)[w];
179 if (w_decoded.size() < truth_utf8.size()) {
180 if (!w_decoded.empty() && word->word->space()) {
181 w_decoded += " ";
182 }
183 w_decoded += word->best_choice->unichar_string().c_str();
184 }
185 LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
186 << ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
187 << ", perm=" << word->best_choice->permuter() << "\n";
188 }
189 std::string w_trunc(w_decoded.data(), truth_utf8.size());
190 if (truth_utf8 != w_trunc) {
193 tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
194 w_trunc.assign(w_decoded.data(), truth_utf8.size());
195 }
196 EXPECT_EQ(truth_utf8, w_trunc);
197 }
198 }
@ TBOX
@ LOG
@ INFO
Definition: log.h:28
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
bool IsValidFirstCode(int code) const
int DecodeUnichar(const RecodedCharID &code) const
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ ExpectCorrect() [2/2]

void tesseract::RecodeBeamTest::ExpectCorrect ( const GENERIC_2D_ARRAY< float > &  output,
const std::vector< int > &  transcription 
)
inline protected

Definition at line 105 of file recodebeam_test.cc.

106 {
107 // Get the utf8 string of the transcription.
108 std::string truth_utf8;
109 for (int i : transcription) {
110 truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
111 }
112 PointerVector<WERD_RES> words;
113 ExpectCorrect(output, truth_utf8, nullptr, &words);
114 }
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)

◆ GenerateRandomPaddedOutputs()

GENERIC_2D_ARRAY< float > tesseract::RecodeBeamTest::GenerateRandomPaddedOutputs ( const std::vector< int > &  unichar_ids,
int  padding 
)
inline protected

Definition at line 201 of file recodebeam_test.cc.

202 {
203 int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
204 int num_codes = recoder_.code_range();
205 GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
206 // Fill with random data.
207 TRand random;
208 for (int t = 0; t < width; ++t) {
209 for (int i = 0; i < num_codes; ++i) {
210 outputs(t, i) = random.UnsignedRand(0.25);
211 }
212 }
213 int t = 0;
214 for (int unichar_id : unichar_ids) {
215 RecodedCharID code;
216 int len = recoder_.EncodeUnichar(unichar_id, &code);
217 EXPECT_NE(0, len);
218 for (int j = 0; j < len; ++j) {
219 // Make the desired answer a clear winner.
220 if (j > 0 && code(j) == code(j - 1)) {
221 // We will collapse adjacent equal codes so put a null in between.
222 outputs(t++, encoded_null_char_) = 1.0f;
223 }
224 outputs(t++, code(j)) = 1.0f;
225 }
226 // Put a 0 as a null char in between.
227 outputs(t++, encoded_null_char_) = 1.0f;
228 }
229 // Normalize the probs.
230 for (int t = 0; t < width; ++t) {
231 double sum = 0.0;
232 for (int i = 0; i < num_codes; ++i) {
233 sum += outputs(t, i);
234 }
235 for (int i = 0; i < num_codes; ++i) {
236 outputs(t, i) /= sum;
237 }
238 }
239
240 return outputs;
241 }

◆ GenerateSyntheticOutputs()

GENERIC_2D_ARRAY< float > tesseract::RecodeBeamTest::GenerateSyntheticOutputs ( const char *  chars1[],
const float  scores1[],
const char *  chars2[],
const float  scores2[],
TRand *  random
)
inline protected

Definition at line 283 of file recodebeam_test.cc.

285 {
286 int width = 0;
287 while (chars1[width] != nullptr) {
288 ++width;
289 }
290 int padding = width * RecodedCharID::kMaxCodeLen;
291 int num_codes = recoder_.code_range();
292 GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
293 int t = 0;
294 for (int i = 0; i < width; ++i) {
295 // In case there is overlap in the codes between 1st and 2nd choice, it
296 // is better to encode the 2nd choice first.
297 int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
298 int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
299 // Advance t to the max end, setting everything else to the leftovers.
300 int max_t = std::max(end_t1, end_t2);
301 while (t < max_t) {
302 double total_score = 0.0;
303 for (int j = 0; j < num_codes; ++j) {
304 total_score += outputs(t, j);
305 }
306 double null_remainder = (1.0 - total_score) / 2.0;
307 double remainder = null_remainder / (num_codes - 2);
308 if (outputs(t, encoded_null_char_) < null_remainder) {
309 outputs(t, encoded_null_char_) += null_remainder;
310 } else {
311 remainder += remainder;
312 }
313 for (int j = 0; j < num_codes; ++j) {
314 if (outputs(t, j) == 0.0f) {
315 outputs(t, j) = remainder;
316 }
317 }
318 ++t;
319 }
320 }
321 // Fill the rest with null chars.
322 while (t < width + padding) {
323 outputs(t++, encoded_null_char_) = 1.0f;
324 }
325 return outputs;
326 }
int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)

◆ LoadDict()

void tesseract::RecodeBeamTest::LoadDict ( const std::string &  lang)
inline protected

Definition at line 94 of file recodebeam_test.cc.

94 {
95 std::string traineddata_name = lang + ".traineddata";
96 std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
97 lstm_dict_.SetupForLoad(nullptr);
99 mgr.Init(traineddata_file.c_str());
100 lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
102 }
bool Init(const char *data_file_name)
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:291
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:180
bool FinishLoad()
Definition: dict.cpp:357
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65

◆ LoadUnicharset()

void tesseract::RecodeBeamTest::LoadUnicharset ( const std::string &  unicharset_name)
inline protected

Definition at line 71 of file recodebeam_test.cc.

71 {
72 std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
73 std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
74 std::string radical_data;
75 CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
76 CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
79 std::string radical_str(radical_data.c_str());
81 RecodedCharID code;
83 encoded_null_char_ = code(0);
84 // Space should encode as itself.
86 EXPECT_EQ(UNICHAR_SPACE, code(0));
87 std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
88 std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
89 std::string encoding_str(&encoding[0], encoding.size());
90 CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
91 LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
92 }
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool has_special_codes() const
Definition: unicharset.h:756
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355
static int Defaults()
Definition: include_gunit.h:61
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52

◆ SetUp()

void tesseract::RecodeBeamTest::SetUp ( )
inline override protected virtual

Reimplemented from testing::Test.

Definition at line 60 of file recodebeam_test.cc.

60 {
61 std::locale::global(std::locale(""));
62 file::MakeTmpdir();
63 }
static void MakeTmpdir()
Definition: include_gunit.h:38

Member Data Documentation

◆ ccutil_

CCUtil tesseract::RecodeBeamTest::ccutil_
protected

Definition at line 330 of file recodebeam_test.cc.

◆ encoded_null_char_

int tesseract::RecodeBeamTest::encoded_null_char_ = 0
protected

Definition at line 329 of file recodebeam_test.cc.

◆ lstm_dict_

Dict tesseract::RecodeBeamTest::lstm_dict_
protected

Definition at line 331 of file recodebeam_test.cc.

◆ recoder_

UnicharCompress tesseract::RecodeBeamTest::recoder_
protected

Definition at line 327 of file recodebeam_test.cc.

◆ unichar_null_char_

int tesseract::RecodeBeamTest::unichar_null_char_ = 0
protected

Definition at line 328 of file recodebeam_test.cc.


The documentation for this class was generated from the following file: