14#include <allheaders.h>
27 std::locale::global(std::locale(
""));
33 std::string radical_stroke_file =
file::JoinPath(LANGDATA_DIR,
"radical-stroke.txt");
34 std::string unicharset_file =
file::JoinPath(TESTDATA_DIR, unicharset_name);
35 std::string radical_data;
38 std::string radical_str(radical_data.c_str());
45 std::string output_name =
46 file::JoinPath(FLAGS_test_tmpdir, unicharset_name) +
".encoding.txt";
48 std::string encoding_str(&encoding[0], encoding.size());
50 LOG(
INFO) <<
"Wrote encoding to:" << output_name;
54 std::vector<char> data;
59 rfp.
Open(&data[0], data.size());
64 return lang ==
"chi_sim" || lang ==
"chi_tra" || lang ==
"kor" || lang ==
"jpn";
68 return lang ==
"asm" || lang ==
"ben" || lang ==
"bih" || lang ==
"hin" || lang ==
"mar" ||
69 lang ==
"nep" || lang ==
"san" || lang ==
"bod" || lang ==
"dzo" || lang ==
"guj" ||
70 lang ==
"kan" || lang ==
"mal" || lang ==
"ori" || lang ==
"pan" || lang ==
"sin" ||
71 lang ==
"tam" || lang ==
"tel";
83 std::vector<RecodedCharID> times_seen(code_range, zeros);
94 std::vector<UNICHAR_ID> normed_ids;
102 for (
int i = 0;
i < len; ++
i) {
103 int code_val = code(
i);
106 times_seen[code_val].Set(
i, times_seen[code_val](
i) + 1);
110 for (
int c = 0; c < code_range; ++c) {
113 if (times_seen[c](
i) != 0) {
117 EXPECT_GE(num_used, 1) <<
"c=" << c <<
"/" << code_range;
136 const std::vector<RecodedCharID> &times_seen) {
138 int length = code.
length();
140 if (final_codes !=
nullptr) {
141 for (
int ending : *final_codes) {
142 EXPECT_GT(times_seen[ending](length), 0);
143 extended.
Set(length, ending);
145 EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
149 if (next_codes !=
nullptr) {
150 for (
int extension : *next_codes) {
151 EXPECT_GT(times_seen[extension](length), 0);
152 extended.
Set(length, extension);
166 LOG(
INFO) <<
"Testing chi_tra";
167 LoadUnicharset(
"chi_tra.unicharset");
168 ExpectCorrect(
"chi_tra");
169 LOG(
INFO) <<
"Testing chi_sim";
170 LoadUnicharset(
"chi_sim.unicharset");
171 ExpectCorrect(
"chi_sim");
176 LoadUnicharset(
"jpn.unicharset");
177 ExpectCorrect(
"jpn");
182 LoadUnicharset(
"kor.unicharset");
183 ExpectCorrect(
"kor");
188 LoadUnicharset(
"kan.unicharset");
189 ExpectCorrect(
"kan");
191 ExpectCorrect(
"kan");
196 LoadUnicharset(
"mar.unicharset");
197 ExpectCorrect(
"mar");
202 LoadUnicharset(
"eng.unicharset");
203 ExpectCorrect(
"eng");
209 LOG(
INFO) <<
"Testing por with ligatures";
210 LoadUnicharset(
"por.unicharset");
211 ExpectCorrect(
"por");
214 for (
int u = 0; u <= unicharset_.size(); ++u) {
216 int len = compressed_.EncodeUnichar(u, &code);
219 for (
int i = 0;
i < len; ++
i) {
229 LoadUnicharset(
"trivial.unicharset");
230 ExpectCorrect(
"trivial");
231 std::string encoding = compressed_.GetEncodingAsString(unicharset_);
232 std::string encoding_str(&encoding[0], encoding.length());
233 std::vector<std::string> lines =
split(encoding_str,
'\n');
#define EXPECT_EQ(val1, val2)
#define EXPECT_NE(val1, val2)
#define EXPECT_GT(val1, val2)
#define EXPECT_GE(val1, val2)
#define EXPECT_TRUE(condition)
#define EXPECT_LE(val1, val2)
#define EXPECT_LT(val1, val2)
@ SPECIAL_UNICHAR_CODES_COUNT
const std::vector< std::string > split(const std::string &s, char c)
TEST_F(EuroText, FastLatinOCR)
void OpenWrite(std::vector< char > *data)
bool Open(const char *filename, FileReader reader)
void Set(int index, int value)
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool DeSerialize(TFile *fp)
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
const std::vector< int > * GetFinalCodes(const RecodedCharID &code) const
const std::vector< int > * GetNextCodes(const RecodedCharID &code) const
int DecodeUnichar(const RecodedCharID &code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool Serialize(TFile *fp) const
bool has_special_codes() const
bool load_from_file(const char *const filename, bool skip_fragments)
static std::string JoinPath(const std::string &s1, const std::string &s2)
static bool SetContents(const std::string &name, const std::string &contents, bool)
static bool GetContents(const std::string &filename, std::string *out, int)
UnicharCompress compressed_
void ExpectCorrect(const std::string &lang)
bool IsIndicLang(const std::string &lang)
void CheckCodeExtensions(const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)
bool IsCJKLang(const std::string &lang)
void LoadUnicharset(const std::string &unicharset_name)