tesseract v5.3.3.20231005
recodebeam_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include "include_gunit.h"
13#include "log.h" // for LOG
14
15#include "matrix.h"
16#include "normstrngs.h"
17#include "pageres.h"
18#include "ratngs.h"
19#include "recodebeam.h"
20#include "unicharcompress.h"
22
23#include "helpers.h"
24
25namespace tesseract {
26
27// Number of characters to test beam search with.
// Used as the exclusive upper bound of the unichar-id loops in the Does* tests.
28const int kNumChars = 100;
29// Amount of extra random data to pad with after.
// Passed as the `padding` argument of GenerateRandomPaddedOutputs.
30const int kPadding = 64;
31// Dictionary test data.
32// The top choice is: "Gef s wordsright.".
33// The desired phrase is "Gets words right.".
34// There is a competing dictionary phrase: "Get swords right.".
35// ... due to the following errors from the network:
36// f stronger than t in "Get".
37// weak space between Gef and s and between s and words.
38// weak space between words and right.
// Each string table ends with nullptr; GenerateSyntheticOutputs counts the
// entries of the top-choice table up to that terminator and reads the score
// tables at the same indices. An empty string is encoded as the null char
// (see EncodeUTF8).
39const char *kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
40 "s", "", "r", "i", "g", "h", "t", ".", nullptr};
41const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99,
42 0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
43const char *kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h",
44 "S", " ", "t", "I", "9", "b", "f", ",", nullptr};
45const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01,
46 0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
47
// Chinese dictionary test data. The top choices concatenate to the phrase
// that the ChiDictionary test expects.
48const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
49const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
50const char *kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
51const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
52
// Vietnamese test data used by the MultiCodeSequences test. The top choices
// concatenate to "vậy tội".
53const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
54const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
55const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
56const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
57
59protected:
// Fixture setup hook: installs the environment's preferred locale
// (std::locale("")) as the global C++ locale.
// NOTE(review): the listing numbers embedded in this extract skip 62, so a
// statement appears to be missing here — confirm against the original file.
60 void SetUp() override {
61 std::locale::global(std::locale(""));
63 }
64
// Fixture destructor. The body is empty as shown.
// NOTE(review): the listing numbers embedded in this extract skip 67, so a
// cleanup statement may be missing here — confirm against the original file.
66 ~RecodeBeamTest() override {
68 }
69
70 // Loads and compresses the given unicharset.
// unicharset_name is a file name relative to TESTDATA_DIR. The radical-stroke
// table is read from LANGDATA_DIR. Both reads are guarded by CHECK_OK / CHECK.
// As a side effect the encoding returned by recoder_.GetEncodingAsString is
// written to "testenc.txt" under FLAGS_test_tmpdir and the path is logged.
// NOTE(review): the listing numbers embedded in this extract skip 77, 78, 80,
// 82 and 85. The statements that compute the encoding and fill `code` before
// each `code(0)` read are not visible here — confirm against the original file.
71 void LoadUnicharset(const std::string &unicharset_name) {
72 std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
73 std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
74 std::string radical_data;
75 CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
76 CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
// Copy of the radical-stroke data up to its first NUL byte (built via c_str()).
79 std::string radical_str(radical_data.c_str());
81 RecodedCharID code;
// The first code of `code` is remembered as the encoded null char.
83 encoded_null_char_ = code(0);
84 // Space should encode as itself.
86 EXPECT_EQ(UNICHAR_SPACE, code(0));
87 std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
88 std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
89 std::string encoding_str(&encoding[0], encoding.size());
90 CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
91 LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
92 }
93 // Loads the dictionary.
// The traineddata file is TESTDATA_DIR/<lang>.traineddata. It is opened with
// mgr.Init and handed to lstm_dict_.LoadLSTM.
// NOTE(review): the listing numbers embedded in this extract skip 98 and 101.
// The declaration of `mgr` and any statement after LoadLSTM are not visible
// here — confirm against the original file.
94 void LoadDict(const std::string &lang) {
95 std::string traineddata_name = lang + ".traineddata";
96 std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
97 lstm_dict_.SetupForLoad(nullptr);
99 mgr.Init(traineddata_file.c_str());
100 lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
102 }
103
104 // Expects the appropriate results from the compressed_ ccutil_.unicharset.
// Convenience overload: converts the unichar-id transcription to its UTF-8
// string and forwards to the string overload of ExpectCorrect with no
// dictionary (nullptr).
// NOTE(review): the listing numbers embedded in this extract skip 105 and 112.
// The first line of the signature and the declaration of `words` are not
// visible here — confirm against the original file.
106 const std::vector<int> &transcription) {
107 // Get the utf8 string of the transcription.
108 std::string truth_utf8;
109 for (int i : transcription) {
110 truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
111 }
113 ExpectCorrect(output, truth_utf8, nullptr, &words);
114 }
// Runs a RecodeBeamSearch over `output` and expects that the best path,
// extracted three ways (as labels, as unichar ids, and as words), decodes to
// truth_utf8. In each case the decoded text is only accumulated while it is
// shorter than truth_utf8, so trailing padding data is not compared.
//   output:     synthetic network scores to decode.
//   truth_utf8: expected UTF-8 text.
//   dict:       dictionary passed to the beam search; callers pass nullptr
//               for a dictionary-free search.
//   words:      receives the result of ExtractBestPathAsWords.
115 void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,
116 Dict *dict, PointerVector<WERD_RES> *words) {
117 RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
118 beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
119 // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
120 // beam_search.DebugBeams(ccutil_.unicharset);
121 std::vector<int> labels, xcoords;
122 beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
123 LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
124 // Now decode using recoder_.
125 std::string decoded;
126 int end = 1;
// Each outer iteration consumes the labels of one unichar, starting at `start`.
127 for (unsigned start = 0; start < labels.size(); start = end) {
128 RecodedCharID code;
129 unsigned index = start;
130 int uni_id = INVALID_UNICHAR_ID;
// Append labels to `code` until it decodes to a valid unichar id and the next
// label is a valid first code, or until labels or kMaxCodeLen run out.
131 do {
132 code.Set(code.length(), labels[index++]);
133 uni_id = recoder_.DecodeUnichar(code);
134 } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
135 (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
136 EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
137 // To the extent of truth_utf8, we expect decoded to match, but if
138 // transcription is shorter, that is OK too, as we may just be testing
139 // that we get a valid sequence when padded with random data.
140 if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
141 decoded += ccutil_.unicharset.id_to_unichar(uni_id);
142 }
143 end = index;
144 }
145 EXPECT_EQ(truth_utf8, decoded);
146
147 // Check that ExtractBestPathAsUnicharIds does the same thing.
148 std::vector<int> unichar_ids;
149 std::vector<float> certainties, ratings;
150 beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
151 &ratings, &xcoords);
152 std::string u_decoded;
// Running sum of ratings, used only for the log line; reset after a space.
153 float total_rating = 0.0f;
154 for (unsigned u = 0; u < unichar_ids.size(); ++u) {
155 // To the extent of truth_utf8, we expect decoded to match, but if
156 // transcription is shorter, that is OK too, as we may just be testing
157 // that we get a valid sequence when padded with random data.
158 if (u_decoded.size() < truth_utf8.size()) {
159 const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
160 total_rating += ratings[u];
161 LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
162 << certainties[u] << ", r=" << ratings[u] << "r_sum="
163 << total_rating << " @" << xcoords[u] << "\n";
164 if (str[0] == ' ') {
165 total_rating = 0.0f;
166 }
167 u_decoded += str;
168 }
169 }
170 EXPECT_EQ(truth_utf8, u_decoded);
171
172 // Check that ExtractBestPathAsWords does the same thing.
173 TBOX line_box(0, 0, 100, 10);
// The extraction and comparison are performed twice into the same `words`.
// NOTE(review): presumably this checks that repeated extraction is stable —
// confirm the intent.
174 for (int i = 0; i < 2; ++i) {
175 beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
176 std::string w_decoded;
177 for (int w = 0; w < words->size(); ++w) {
178 const WERD_RES *word = (*words)[w];
179 if (w_decoded.size() < truth_utf8.size()) {
// Re-insert a separating space before every word after the first when the
// word reports leading space.
180 if (!w_decoded.empty() && word->word->space()) {
181 w_decoded += " ";
182 }
183 w_decoded += word->best_choice->unichar_string().c_str();
184 }
185 LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
186 << ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
187 << ", perm=" << word->best_choice->permuter() << "\n";
188 }
// NOTE(review): this copies truth_utf8.size() bytes starting at
// w_decoded.data(); if w_decoded is shorter than truth_utf8 the read goes
// past the end of w_decoded — consider w_decoded.substr(0, truth_utf8.size()).
189 std::string w_trunc(w_decoded.data(), truth_utf8.size());
190 if (truth_utf8 != w_trunc) {
// NOTE(review): the listing numbers embedded in this extract skip 191-192, so
// the head of the call whose argument tail follows is not visible here. The
// tail rewrites w_decoded in place before the retry — confirm against the
// original file.
193 tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
194 w_trunc.assign(w_decoded.data(), truth_utf8.size());
195 }
196 EXPECT_EQ(truth_utf8, w_trunc);
197 }
198 }
199 // Generates easy encoding of the given unichar_ids, and pads with at least
200 // padding of random data.
201 GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const std::vector<int> &unichar_ids,
202 int padding) {
203 int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
204 int num_codes = recoder_.code_range();
205 GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
206 // Fill with random data.
207 TRand random;
208 for (int t = 0; t < width; ++t) {
209 for (int i = 0; i < num_codes; ++i) {
210 outputs(t, i) = random.UnsignedRand(0.25);
211 }
212 }
213 int t = 0;
214 for (int unichar_id : unichar_ids) {
215 RecodedCharID code;
216 int len = recoder_.EncodeUnichar(unichar_id, &code);
217 EXPECT_NE(0, len);
218 for (int j = 0; j < len; ++j) {
219 // Make the desired answer a clear winner.
220 if (j > 0 && code(j) == code(j - 1)) {
221 // We will collapse adjacent equal codes so put a null in between.
222 outputs(t++, encoded_null_char_) = 1.0f;
223 }
224 outputs(t++, code(j)) = 1.0f;
225 }
226 // Put a 0 as a null char in between.
227 outputs(t++, encoded_null_char_) = 1.0f;
228 }
229 // Normalize the probs.
230 for (int t = 0; t < width; ++t) {
231 double sum = 0.0;
232 for (int i = 0; i < num_codes; ++i) {
233 sum += outputs(t, i);
234 }
235 for (int i = 0; i < num_codes; ++i) {
236 outputs(t, i) /= sum;
237 }
238 }
239
240 return outputs;
241 }
242 // Encodes a utf8 string (character) as unichar_id, then recodes, and sets
243 // the score for the appropriate sequence of codes, returning the ending t.
244 int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random,
245 GENERIC_2D_ARRAY<float> *outputs) {
246 int t = start_t;
247 std::vector<int> unichar_ids;
248 EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
249 if (unichar_ids.empty() || utf8_str[0] == '\0') {
250 unichar_ids.clear();
251 unichar_ids.push_back(unichar_null_char_);
252 }
253 int num_ids = unichar_ids.size();
254 for (int u = 0; u < num_ids; ++u) {
255 RecodedCharID code;
256 int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
257 EXPECT_NE(0, len);
258 for (int i = 0; i < len; ++i) {
259 // Apply the desired score.
260 (*outputs)(t++, code(i)) = score;
261 if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
262 int dups = static_cast<int>(random->UnsignedRand(3.0));
263 for (int d = 0; d < dups; ++d) {
264 // Duplicate the desired score.
265 (*outputs)(t++, code(i)) = score;
266 }
267 }
268 }
269 if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
270 int dups = static_cast<int>(random->UnsignedRand(3.0));
271 for (int d = 0; d < dups; ++d) {
272 // Add a random number of nulls as well.
273 (*outputs)(t++, encoded_null_char_) = score;
274 }
275 }
276 }
277 return t;
278 }
279 // Generates an encoding of the given 4 arrays as synthetic network scores.
280 // uses scores1 for chars1 and scores2 for chars2, and everything else gets
281 // the leftovers shared out equally. Note that empty string encodes as the
282 // null_char_.
283 GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
284 const char *chars2[], const float scores2[],
285 TRand *random) {
286 int width = 0;
287 while (chars1[width] != nullptr) {
288 ++width;
289 }
290 int padding = width * RecodedCharID::kMaxCodeLen;
291 int num_codes = recoder_.code_range();
292 GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
293 int t = 0;
294 for (int i = 0; i < width; ++i) {
295 // In case there is overlap in the codes between 1st and 2nd choice, it
296 // is better to encode the 2nd choice first.
297 int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
298 int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
299 // Advance t to the max end, setting everything else to the leftovers.
300 int max_t = std::max(end_t1, end_t2);
301 while (t < max_t) {
302 double total_score = 0.0;
303 for (int j = 0; j < num_codes; ++j) {
304 total_score += outputs(t, j);
305 }
306 double null_remainder = (1.0 - total_score) / 2.0;
307 double remainder = null_remainder / (num_codes - 2);
308 if (outputs(t, encoded_null_char_) < null_remainder) {
309 outputs(t, encoded_null_char_) += null_remainder;
310 } else {
311 remainder += remainder;
312 }
313 for (int j = 0; j < num_codes; ++j) {
314 if (outputs(t, j) == 0.0f) {
315 outputs(t, j) = remainder;
316 }
317 }
318 ++t;
319 }
320 }
321 // Fill the rest with null chars.
322 while (t < width + padding) {
323 outputs(t++, encoded_null_char_) = 1.0f;
324 }
325 return outputs;
326 }
332};
333
334TEST_F(RecodeBeamTest, DoesChinese) {
335 LOG(INFO) << "Testing chi_tra"
336 << "\n";
337 LoadUnicharset("chi_tra.unicharset");
338 // Correctly reproduce the first kNumchars characters from easy output.
339 std::vector<int> transcription;
340 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
341 transcription.push_back(i);
342 }
343 GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
344 ExpectCorrect(outputs, transcription);
345 LOG(INFO) << "Testing chi_sim"
346 << "\n";
347 LoadUnicharset("chi_sim.unicharset");
348 // Correctly reproduce the first kNumchars characters from easy output.
349 transcription.clear();
350 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
351 transcription.push_back(i);
352 }
353 outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
354 ExpectCorrect(outputs, transcription);
355}
356
357TEST_F(RecodeBeamTest, DoesJapanese) {
358 LOG(INFO) << "Testing jpn"
359 << "\n";
360 LoadUnicharset("jpn.unicharset");
361 // Correctly reproduce the first kNumchars characters from easy output.
362 std::vector<int> transcription;
363 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
364 transcription.push_back(i);
365 }
366 GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
367 ExpectCorrect(outputs, transcription);
368}
369
370TEST_F(RecodeBeamTest, DoesKorean) {
371 LOG(INFO) << "Testing kor"
372 << "\n";
373 LoadUnicharset("kor.unicharset");
374 // Correctly reproduce the first kNumchars characters from easy output.
375 std::vector<int> transcription;
376 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
377 transcription.push_back(i);
378 }
379 GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
380 ExpectCorrect(outputs, transcription);
381}
382
383TEST_F(RecodeBeamTest, DoesKannada) {
384 LOG(INFO) << "Testing kan"
385 << "\n";
386 LoadUnicharset("kan.unicharset");
387 // Correctly reproduce the first kNumchars characters from easy output.
388 std::vector<int> transcription;
389 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
390 transcription.push_back(i);
391 }
392 GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
393 ExpectCorrect(outputs, transcription);
394}
395
396TEST_F(RecodeBeamTest, DoesMarathi) {
397 LOG(INFO) << "Testing mar"
398 << "\n";
399 LoadUnicharset("mar.unicharset");
400 // Correctly reproduce the first kNumchars characters from easy output.
401 std::vector<int> transcription;
402 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
403 transcription.push_back(i);
404 }
405 GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
406 ExpectCorrect(outputs, transcription);
407}
408
409TEST_F(RecodeBeamTest, DoesEnglish) {
410 LOG(INFO) << "Testing eng"
411 << "\n";
412 LoadUnicharset("eng.unicharset");
413 // Correctly reproduce the first kNumchars characters from easy output.
414 std::vector<int> transcription;
415 for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
416 transcription.push_back(i);
417 }
418 GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
419 ExpectCorrect(outputs, transcription);
420}
421
// Disabled test (DISABLED_ prefix). Decodes synthetic scores built from the
// kGWR* tables twice: without a dictionary the concatenated top choices are
// expected, and with the "eng_beam" dictionary "Gets words right." is expected.
// NOTE(review): the listing numbers embedded in this extract skip 426 and 432.
// The declarations of `outputs` and `words` are not visible here — confirm
// against the original file.
422TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
423 LOG(INFO) << "Testing eng dictionary"
424 << "\n";
425 LoadUnicharset("eng_beam.unicharset");
427 GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
// The dictionary-free expectation is the concatenation of the top choices.
428 std::string default_str;
429 for (int i = 0; kGWRTops[i] != nullptr; ++i) {
430 default_str += kGWRTops[i];
431 }
433 ExpectCorrect(outputs, default_str, nullptr, &words);
434 // Now try again with the dictionary.
435 LoadDict("eng_beam");
436 ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words);
437}
438
// Disabled test (DISABLED_ prefix). Decodes synthetic scores built from the
// kZH* tables. Without a dictionary it expects 7 words, each with permuter
// TOP_CHOICE_PERM. With the "zh_hans" dictionary it expects the same text
// split into the 5 words listed in kWords, with the permuters in kWordPerms.
// NOTE(review): the listing numbers embedded in this extract skip 443, 445
// and 461. The declarations of `outputs` and `words` and the tail of the
// kWordPerms initializer are not visible here — confirm against the original
// file.
439TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
440 LOG(INFO) << "Testing zh_hans dictionary"
441 << "\n";
442 LoadUnicharset("zh_hans.unicharset");
444 GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
446 ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
447 // Each is an individual word, with permuter = top choice.
448 EXPECT_EQ(7, words.size());
449 for (int w = 0; w < words.size(); ++w) {
450 EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());
451 }
452 // Now try again with the dictionary.
453 LoadDict("zh_hans");
454 ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words);
455 // Number of words expected.
456 const int kNumWords = 5;
457 // Content of the words.
458 const char *kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
459 // Permuters of the words.
460 const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM,
462 EXPECT_EQ(kNumWords, words.size());
// The loop is also bounded by words.size() so a wrong word count does not
// index past the end of `words`.
463 for (int w = 0; w < kNumWords && w < words.size(); ++w) {
464 EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
465 EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());
466 }
467}
468
469// Tests that a recoder built with decomposed unicode allows true ctc
470// arbitrary duplicates and inserted nulls inside the multicode sequence.
// Disabled test (DISABLED_ prefix). A TRand is passed to
// GenerateSyntheticOutputs, which forwards it to EncodeUTF8 so that random
// duplicates and nulls are inserted into the synthetic scores.
// NOTE(review): the listing numbers embedded in this extract skip 477, 479
// and 481. The declarations of `outputs` and `words` and the head of the call
// that fills truth_str are not visible here — confirm against the original
// file.
471TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
472 LOG(INFO) << "Testing duplicates in multi-code sequences"
473 << "\n";
474 LoadUnicharset("vie.d.unicharset");
475 tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
476 TRand random;
478 GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
480 std::string truth_str;
482 tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
483 ExpectCorrect(outputs, truth_str, nullptr, &words);
484}
485
486} // namespace tesseract
@ LOG
@ INFO
Definition: log.h:28
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_NE(val1, val2)
Definition: gtest.h:2045
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_STREQ(s1, s2)
Definition: gtest.h:2112
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
const float kGWR2ndScores[]
const char * kGWRTops[]
const float kZH2ndScores[]
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
const char * kVi2nds[]
const char * kViTops[]
const float kViTopScores[]
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
const int kNumChars
const char * kZH2nds[]
const float kVi2ndScores[]
const char * kZHTops[]
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
@ TOP_CHOICE_PERM
Definition: ratngs.h:238
TEST_F(EuroText, FastLatinOCR)
const float kZHTopScores[]
const float kGWRTopScores[]
const char * kGWR2nds[]
const int kPadding
WERD_CHOICE * best_choice
Definition: pageres.h:239
float certainty() const
Definition: ratngs.h:315
uint8_t permuter() const
Definition: ratngs.h:331
std::string & unichar_string()
Definition: ratngs.h:519
float rating() const
Definition: ratngs.h:312
uint8_t space() const
Definition: werd.h:100
UNICHARSET unicharset
Definition: ccutil.h:61
unsigned size() const
Definition: genericvector.h:70
double UnsignedRand(double range)
Definition: helpers.h:82
bool Init(const char *data_file_name)
void Set(int index, int value)
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool IsValidFirstCode(int code) const
int DecodeUnichar(const RecodedCharID &code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
bool has_special_codes() const
Definition: unicharset.h:756
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
size_t size() const
Definition: unicharset.h:355
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:291
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:180
void End()
Definition: dict.cpp:379
bool FinishLoad()
Definition: dict.cpp:357
void Decode(const NetworkIO &output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode=0)
Definition: recodebeam.cpp:83
void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset, std::vector< int > *unichar_ids, std::vector< float > *certs, std::vector< float > *ratings, std::vector< int > *xcoords) const
Definition: recodebeam.cpp:224
void ExtractBestPathAsLabels(std::vector< int > *labels, std::vector< int > *xcoords) const
Definition: recodebeam.cpp:201
void ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug, const UNICHARSET *unicharset, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
Definition: recodebeam.cpp:239
static int Defaults()
Definition: include_gunit.h:61
static void MakeTmpdir()
Definition: include_gunit.h:38
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52
int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)
GENERIC_2D_ARRAY< float > GenerateRandomPaddedOutputs(const std::vector< int > &unichar_ids, int padding)
void LoadUnicharset(const std::string &unicharset_name)
void LoadDict(const std::string &lang)
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::string &truth_utf8, Dict *dict, PointerVector< WERD_RES > *words)
GENERIC_2D_ARRAY< float > GenerateSyntheticOutputs(const char *chars1[], const float scores1[], const char *chars2[], const float scores2[], TRand *random)