const char *kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
                          "s", "",  "r", "i", "g", "h", "t", ".", nullptr};
const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99,
                               0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
const char *kGWR2nds[] = {"C", "c", "t", "",  "S", "",  "W", "O", "t", "h",
                          "S", " ", "t", "I", "9", "b", "f", ",", nullptr};
const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01,
                               0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
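// Top/second-choice characters and scores for the zh_hans dictionary test;
// the expected decode is "实学储啬投学生".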
const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
const char *kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
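// Top/second-choice characters and scores for the multi-code sequence test,
// which uses the decomposed vie.d.unicharset.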
const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
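// The rest of this listing is a partial extract of the RecodeBeamTest fixture
// and its test bodies; source lines missing from the extract are marked "// ...".

// Fixture initialization: select the user's default locale.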
  std::locale::global(std::locale(""));
  std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
  std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
  std::string radical_data;
  // ...
  std::string radical_str(radical_data.c_str());
  // ...
  std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
  // ...
  std::string encoding_str(&encoding[0], encoding.size());
  // ...
  LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
  std::string traineddata_name = lang + ".traineddata";
  std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
  // ...
  mgr.Init(traineddata_file.c_str());
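// ExpectCorrect(output, transcription): the transcription is given as unichar
// ids and converted to a UTF-8 truth string before the decode is checked.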
void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output,
                   const std::vector<int> &transcription) {
  // ...
  std::string truth_utf8;
  for (int i : transcription) {
    // ...
  }
  // ...
}
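// Fragments of the four-argument overload, ExpectCorrect(output, truth_utf8,
// dict, words): run the beam search on the output matrix and verify every way
// of extracting the best path (labels, unichar ids, words) against truth_utf8.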
  beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
  // ...
  std::vector<int> labels, xcoords;
  // ...
  LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
  // ...
  for (unsigned start = 0; start < labels.size(); start = end) {
    // ...
    unsigned index = start;
    int uni_id = INVALID_UNICHAR_ID;
    // ...
    code.Set(code.length(), labels[index++]);
    // ...
    EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
    // ...
  }
  std::vector<int> unichar_ids;
  std::vector<float> certainties, ratings;
  // ...
  std::string u_decoded;
  float total_rating = 0.0f;
  for (unsigned u = 0; u < unichar_ids.size(); ++u) {
    // ...
    if (u_decoded.size() < truth_utf8.size()) {
      // ...
      total_rating += ratings[u];
      LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
                << certainties[u] << ", r=" << ratings[u] << "r_sum="
                << total_rating << " @" << xcoords[u] << "\n";
      // ...
    }
  }
  TBOX line_box(0, 0, 100, 10);
  for (int i = 0; i < 2; ++i) {
    // ...
    std::string w_decoded;
    for (int w = 0; w < words->size(); ++w) {
      // ...
      if (w_decoded.size() < truth_utf8.size()) {
        if (!w_decoded.empty() && word->word->space()) {
          // ...
        }
        // ...
      }
      // ...
    }
    // ...
    std::string w_trunc(w_decoded.data(), truth_utf8.size());
    if (truth_utf8 != w_trunc) {
      // ...
      w_trunc.assign(w_decoded.data(), truth_utf8.size());
      // ...
    }
    // ...
  }
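// Fragments of GenerateRandomPaddedOutputs(unichar_ids, padding): initialize
// every timestep, make the code sequence of each unichar id the clear winner
// at its timesteps (with special handling when adjacent codes repeat), then
// normalize each timestep so its scores sum to 1.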
  for (int t = 0; t < width; ++t) {
    for (int i = 0; i < num_codes; ++i) {
      // ...
    }
  }
  // ...
  for (int unichar_id : unichar_ids) {
    // ...
    for (int j = 0; j < len; ++j) {
      // ...
      if (j > 0 && code(j) == code(j - 1)) {
        // ...
      }
      outputs(t++, code(j)) = 1.0f;
    }
  }
  // ...
  for (int t = 0; t < width; ++t) {
    // ...
    for (int i = 0; i < num_codes; ++i) {
      sum += outputs(t, i);
    }
    for (int i = 0; i < num_codes; ++i) {
      outputs(t, i) /= sum;
    }
  }
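// Fragments of EncodeUTF8(utf8_str, score, start_t, random, outputs): encode a
// single UTF-8 character to unichar ids, recode it, and write `score` at the
// corresponding timesteps, optionally inserting a random number of duplicate
// timesteps; returns the ending timestep.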
  std::vector<int> unichar_ids;
  // ...
  if (unichar_ids.empty() || utf8_str[0] == '\0') {
    // ...
  }
  int num_ids = unichar_ids.size();
  for (int u = 0; u < num_ids; ++u) {
    // ...
    for (int i = 0; i < len; ++i) {
      // ...
      (*outputs)(t++, code(i)) = score;
      // ...
      for (int d = 0; d < dups; ++d) {
        (*outputs)(t++, code(i)) = score;
      }
      // ...
    }
    // ...
    for (int d = 0; d < dups; ++d) {
      // ...
    }
  }
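// Fragments of GenerateSyntheticOutputs(chars1, scores1, chars2, scores2,
// random): at each position, encode the second choice before the top choice
// (presumably so the stronger top choice wins any overlap), then share the
// leftover probability between the null char and all still-unset codes.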
GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
                                                 const char *chars2[], const float scores2[],
                                                 TRand *random) {
  // ...
  while (chars1[width] != nullptr) {
    // ...
  }
  // ...
  for (int i = 0; i < width; ++i) {
    // ...
    int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
    int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
    // ...
    int max_t = std::max(end_t1, end_t2);
    // ...
    double total_score = 0.0;
    for (int j = 0; j < num_codes; ++j) {
      total_score += outputs(t, j);
    }
    double null_remainder = (1.0 - total_score) / 2.0;
    double remainder = null_remainder / (num_codes - 2);
    // ...
    remainder += remainder;
    // ...
    for (int j = 0; j < num_codes; ++j) {
      if (outputs(t, j) == 0.0f) {
        outputs(t, j) = remainder;
      }
    }
    // ...
  }
  // ...
  while (t < width + padding) {
    // ...
  }
  // ...
}
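// Test bodies. The script tests (Chinese, Japanese, Korean, Kannada, Marathi,
// English) all follow the same pattern: load a unicharset, build a
// transcription of unichar ids, generate random padded outputs for it, and
// expect the beam search to decode the transcription back exactly.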
  LoadUnicharset("chi_tra.unicharset");
  // ...
  std::vector<int> transcription;
  // ...
  transcription.push_back(i);
  // ...
  ExpectCorrect(outputs, transcription);
  // ...
  LoadUnicharset("chi_sim.unicharset");
  // ...
  transcription.clear();
  // ...
  transcription.push_back(i);
  // ...
  outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
  ExpectCorrect(outputs, transcription);
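// The same round trip is repeated for the jpn, kor, kan, mar, and eng
// unicharsets.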
  LoadUnicharset("jpn.unicharset");
  // ...
  std::vector<int> transcription;
  // ...
  transcription.push_back(i);
  // ...
  ExpectCorrect(outputs, transcription);

  LoadUnicharset("kor.unicharset");
  // ...
  std::vector<int> transcription;
  // ...
  transcription.push_back(i);
  // ...
  ExpectCorrect(outputs, transcription);

  LoadUnicharset("kan.unicharset");
  // ...
  std::vector<int> transcription;
  // ...
  transcription.push_back(i);
  // ...
  ExpectCorrect(outputs, transcription);

  LoadUnicharset("mar.unicharset");
  // ...
  std::vector<int> transcription;
  // ...
  transcription.push_back(i);
  // ...
  ExpectCorrect(outputs, transcription);

  LoadUnicharset("eng.unicharset");
  // ...
  std::vector<int> transcription;
  // ...
  transcription.push_back(i);
  // ...
  ExpectCorrect(outputs, transcription);
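// English dictionary test: decode the synthetic kGWR* outputs without a
// dictionary first, then again with the eng_beam dictionary loaded, where the
// expected text is "Gets words right.".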
  LOG(INFO) << "Testing eng dictionary";
  LoadUnicharset("eng_beam.unicharset");
  // ...
  std::string default_str;
  // ...
  ExpectCorrect(outputs, default_str, nullptr, &words);
  // ...
  LoadDict("eng_beam");
  ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words);
  LOG(INFO) << "Testing zh_hans dictionary";
  LoadUnicharset("zh_hans.unicharset");
  // ...
  ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
  // ...
  for (int w = 0; w < words.size(); ++w) {
    // ...
  }
  // ...
  ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words);
  // ...
  const int kNumWords = 5;
  // ...
  const char *kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
  // ...
  for (int w = 0; w < kNumWords && w < words.size(); ++w) {
    EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
    EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());
  }
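// Multi-code sequence test: with the decomposed vie.d.unicharset, the beam
// search must handle duplicates and inserted nulls inside multi-code
// sequences; the decode (presumably of synthetic outputs built from the kVi*
// arrays above) is checked against truth_str without a dictionary.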
  LOG(INFO) << "Testing duplicates in multi-code sequences";
  LoadUnicharset("vie.d.unicharset");
  // ...
  std::string truth_str;
  // ...
  ExpectCorrect(outputs, truth_str, nullptr, &words);