21#include <allheaders.h>
31using ::testing::ContainsRegex;
32using ::testing::HasSubstr;
// Language codes used by the multi-instance tests. The list is terminated by
// a nullptr sentinel so callers can count entries by scanning until nullptr.
static const char *langs[] = {
    "eng",
    "vie",
    "hin",
    "ara",
    nullptr,  // sentinel: end of list
};
35static const char *image_files[] = {
"HelloGoogle.tif",
"viet.tif",
"raaj.tif",
"arabic.tif",
// Expected (ground-truth) OCR text, indexed in step with `langs`: the test
// compares gt_text[j] against the result produced for langs[j]. Non-ASCII
// entries are spelled as UTF-8 byte escapes; the list ends with a nullptr
// sentinel.
static const char *gt_text[] = {
    "Hello Google",
    "\x74\x69\xe1\xba\xbf\x6e\x67",                      // Latin script with Vietnamese diacritics
    "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",              // Devanagari script
    "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a",  // Arabic script
    nullptr,                                             // sentinel: end of list
};
48 std::string ocr_result = result;
74 std::string truth_text;
109 EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));
110 boxaDestroy(&block_boxes);
111 boxaDestroy(&para_boxes);
135 EXPECT_THAT(result, HasSubstr(
"<div class='ocr_page'"));
156 result, std::regex{
"<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+"}));
164#ifdef DISABLED_LEGACY_ENGINE
168 static const char *kTrainingPages[] = {
"136.tif",
"256.tif",
"410.tif",
"432.tif",
"540.tif",
169 "692.tif",
"779.tif",
"793.tif",
"808.tif",
"815.tif",
170 "12.tif",
"12.tif",
nullptr};
171 static const char *kTrainingText[] = {
"1 3 6",
"2 5 6",
"4 1 0",
"4 3 2",
"5 4 0",
172 "6 9 2",
"7 7 9",
"7 9 3",
"8 0 8",
"8 1 5",
173 "1 2",
"1 2",
nullptr};
174 static const char *kTestPages[] = {
"324.tif",
"433.tif",
"12.tif",
nullptr};
175 static const char *kTestText[] = {
"324",
"433",
"12",
nullptr};
177 std::string truth_text;
178 std::string ocr_text;
184 api.
SetVariable(
"matcher_sufficient_examples_for_prototyping",
"1");
185 api.
SetVariable(
"classify_class_pruner_threshold",
"220");
187 for (
int i = 0; kTrainingPages[
i] !=
nullptr; ++
i) {
189 Image src_pix = pixRead(image_file.c_str());
193 <<
"Failed to adapt to text \"" << kTrainingText[
i] <<
"\" on image " << image_file;
199 for (
int i = 0; kTestPages[
i] !=
nullptr; ++
i) {
213 std::string truth_text;
214 std::string ocr_text;
282 const char *langs[] = {
"eng",
"chi_tra",
"jpn",
"vie"};
283 std::unique_ptr<tesseract::TessBaseAPI> api;
285 for (
auto &lang : langs) {
286 api = std::make_unique<tesseract::TessBaseAPI>();
290 LOG(
INFO) <<
"Lang " << lang <<
" took " << timer.
GetInMs() <<
"ms in regular init";
293 std::vector<std::string> vars_vec, vars_values;
294 vars_vec.emplace_back(
"tessedit_init_config_only");
295 vars_values.emplace_back(
"1");
296 LOG(
INFO) <<
"Switching to config only initialization:";
297 for (
auto &lang : langs) {
298 api = std::make_unique<tesseract::TessBaseAPI>();
301 &vars_vec, &vars_values,
false));
303 LOG(
INFO) <<
"Lang " << lang <<
" took " << timer.
GetInMs() <<
"ms in config-only init";
313TEST(TesseractInstanceTest, TestMultipleTessInstances) {
315 while (langs[num_langs] !=
nullptr) {
319 const std::string kTessdataPath = TESSDATA_DIR;
322 std::vector<Image > pix(num_langs);
323 for (
int i = 0;
i < num_langs; ++
i) {
324 std::string tracestring =
"Single instance test with lang = ";
325 tracestring += langs[
i];
328 pix[
i] = pixRead(path.c_str());
329 QCHECK(pix[
i] !=
nullptr) <<
"Could not read " << path;
338 std::string ocr_result[2];
339 for (
int i = 0;
i < num_langs; ++
i) {
340 for (
int j =
i + 1; j < num_langs; ++j) {
342 tess1.
Init(kTessdataPath.c_str(), langs[
i]);
343 tess2.
Init(kTessdataPath.c_str(), langs[j]);
349 strcmp(gt_text[j], ocr_result[1].c_str()))
350 <<
"OCR failed on language pair " << langs[
i] <<
"-" << langs[j];
354 for (
int i = 0;
i < num_langs; ++
i) {
360TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
361 std::string illegal_name =
"an_illegal_name";
362 std::string langs[2] = {
"eng",
"hin"};
363 std::string int_param_name =
"tessedit_pageseg_mode";
364 int int_param[2] = {1, 2};
365 std::string int_param_str[2] = {
"1",
"2"};
366 std::string bool_param_name =
"tessedit_ambigs_training";
367 bool bool_param[2] = {
false,
true};
368 std::string bool_param_str[2] = {
"F",
"T"};
369 std::string str_param_name =
"tessedit_char_blacklist";
370 std::string str_param[2] = {
"abc",
"def"};
371 std::string double_param_name =
"segment_penalty_dict_frequent_word";
372 std::string double_param_str[2] = {
"0.01",
"2"};
373 double double_param[2] = {0.01, 2};
375 const std::string kTessdataPath = TESSDATA_DIR;
378 for (
int i = 0;
i < 2; ++
i) {
380 api->
Init(kTessdataPath.c_str(), langs[
i].c_str());
382 api->
SetVariable(int_param_name.c_str(), int_param_str[
i].c_str());
383 api->
SetVariable(bool_param_name.c_str(), bool_param_str[
i].c_str());
384 api->
SetVariable(str_param_name.c_str(), str_param[
i].c_str());
385 api->
SetVariable(double_param_name.c_str(), double_param_str[
i].c_str());
387 for (
int i = 0;
i < 2; ++
i) {
#define EXPECT_THAT(value, matcher)
#define ASSERT_EQ(val1, val2)
#define EXPECT_EQ(val1, val2)
#define SCOPED_TRACE(message)
#define EXPECT_GE(val1, val2)
#define EXPECT_TRUE(condition)
#define EXPECT_STREQ(s1, s2)
#define EXPECT_FALSE(condition)
#define EXPECT_LT(val1, val2)
#define CHECK_GT(test, value)
@ PSM_SINGLE_WORD
Treat the image as a single word.
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
std::string TestDataNameToPath(const std::string &name)
TEST_F(EuroText, FastLatinOCR)
std::string GetCleanedTextResult(tesseract::TessBaseAPI *tess, Image pix)
TEST(TesseractInstanceTest, TestMultipleTessInstances)
int Recognize(ETEXT_DESC *monitor)
void SetPageSegMode(PageSegMode mode)
bool GetIntVariable(const char *name, int *value) const
bool SetVariable(const char *name, const char *value)
const PAGE_RES * GetPageRes() const
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
const char * GetStringVariable(const char *name) const
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
bool GetBoolVariable(const char *name, bool *value) const
bool GetDoubleVariable(const char *name, double *value) const
const TBOX & BlobBox(unsigned index) const
FCOORD re_rotation() const
WERD_CHOICE * best_choice
tesseract::BoxWord * box_word
BLOCK_RES * block() const
WERD_RES * restart_page()
void rotate(const FCOORD &vec)
TDimension bottom() const
TBOX bounding_box() const
static std::string TessdataPath()
static std::string TestDataNameToPath(const std::string &name)
static std::string JoinPath(const std::string &s1, const std::string &s2)
static bool GetContents(const std::string &filename, std::string *out, int)