tesseract v5.3.3.20231005
baseapi_thread_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12// Unit test to run Tesseract instances in parallel threads and verify
13// the OCR result.
14
15// Note that success of running this test as-is does NOT verify
16// thread-safety. For that, you need to run this binary under TSAN using the
17// associated baseapi_thread_test_with_tsan.sh script.
18//
19// The tests are partitioned by instance to allow running Tesseract/Cube/both
20// and by stage to run initialization/recognition/both. See flag descriptions
21// for details.
22
23#include <functional>
24#include <memory>
25#include <string>
26#ifdef INCLUDE_TENSORFLOW
27# include <tensorflow/core/lib/core/threadpool.h>
28#endif
29#include <allheaders.h>
30#include <tesseract/baseapi.h>
31#include "commandlineflags.h"
32#include "include_gunit.h"
33#include "log.h"
34#include "image.h"
35
// Run with Tesseract instances.
// When set, SetUpTestCase() adds every entry of the kTess* tables to the list
// of languages under test.
BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances");
// Run with Cube instances.
// When set, SetUpTestCase() adds every entry of the kCube* tables to the list
// of languages under test. At least one of test_tesseract / test_cube must be
// set, otherwise SetUpTestCase() fails its CHECK.
// Note that with TSAN, Cube typically takes much longer to test. Ignoring
// std::string operations using the associated tess_tsan.ignore file when
// testing Cube significantly reduces testing time.
BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances");

// Number of times each language is repeated. The fixture loads
// (number of languages) * reps images and the threaded tests schedule that
// many jobs.
// When used with TSAN, having more repetitions can help in finding hidden
// thread-safety violations at the expense of increased testing time.
INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.");

// Size of the worker pool. Any value below 1 (including the default 0) makes
// SetUpTestCase() fall back to (number of languages) * reps. The flag is only
// read when INCLUDE_TENSORFLOW is defined.
INT_PARAM_FLAG(max_concurrent_instances, 0,
               "Maximum number of instances to run in parallel at any given "
               "instant. The number of concurrent instances cannot exceed "
               "reps * number_of_langs_tested, which is also the default value.");
52
53namespace tesseract {
54
// Test-case tables. Within each group the three arrays are parallel: index i
// of the *Langs, *Images and *TruthText arrays together describe one case
// (language code, image file name, expected OCR text). Every array ends with
// a nullptr sentinel, which SetUpTestCase() uses to find the end of the table.
// Image names are resolved relative to TESTING_DIR. The non-ASCII ground
// truth is spelled as hex-escaped UTF-8 bytes.

// Cases enabled by FLAGS_test_tesseract.
static const char *kTessLangs[] = {"eng", "vie", nullptr};
static const char *kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr};
// Second entry is UTF-8 for U+0074 U+0069 U+1EBF U+006E U+0067 ("tiếng").
static const char *kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", nullptr};

// Cases enabled by FLAGS_test_cube.
static const char *kCubeLangs[] = {"hin", "ara", nullptr};
static const char *kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr};
// First entry is UTF-8 for U+0930 U+093E U+091C ("राज"); second entry is
// UTF-8 for U+0627 U+0644 U+0639 U+0631 U+0628 U+064A ("العربي").
static const char *kCubeTruthText[] = {"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
                                       "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr};
63
65protected:
66 static void SetUpTestCase() {
67 CHECK(FLAGS_test_tesseract || FLAGS_test_cube)
68 << "Need to test at least one of Tesseract/Cube!";
69 // Form a list of langs/gt_text/image_files we will work with.
70 std::vector<std::string> image_files;
71 if (FLAGS_test_tesseract) {
72 int i = 0;
73 while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) {
74 langs_.emplace_back(kTessLangs[i]);
75 gt_text_.emplace_back(kTessTruthText[i]);
76 image_files.emplace_back(kTessImages[i]);
77 ++i;
78 }
79 LOG(INFO) << "Testing Tesseract on " << i << " languages.";
80 }
81 if (FLAGS_test_cube) {
82 int i = 0;
83 while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) {
84 langs_.emplace_back(kCubeLangs[i]);
85 gt_text_.emplace_back(kCubeTruthText[i]);
86 image_files.emplace_back(kCubeImages[i]);
87 ++i;
88 }
89 LOG(INFO) << "Testing Cube on " << i << " languages.";
90 }
91 num_langs_ = langs_.size();
92
93 // Pre-load the images into an array. We will be making multiple copies of
94 // an image here if FLAGS_reps > 1 and that is intentional. In this test, we
95 // wish to not make any assumptions about the thread-safety of Pix objects,
96 // and so entirely disallow concurrent access of a Pix instance.
97 const int n = num_langs_ * FLAGS_reps;
98 for (int i = 0; i < n; ++i) {
99 std::string path = TESTING_DIR "/" + image_files[i % num_langs_];
100 Image new_pix = pixRead(path.c_str());
101 QCHECK(new_pix != nullptr) << "Could not read " << path;
102 pix_.push_back(new_pix);
103 }
104
105#ifdef INCLUDE_TENSORFLOW
106 pool_size_ = (FLAGS_max_concurrent_instances < 1) ? num_langs_ * FLAGS_reps
107 : FLAGS_max_concurrent_instances;
108#endif
109 }
110
111 static void TearDownTestCase() {
112 for (auto &pix : pix_) {
113 pix.destroy();
114 }
115 }
116
117#ifdef INCLUDE_TENSORFLOW
118 void ResetPool() {
119 pool_.reset(
120 new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_));
121 }
122
123 void WaitForPoolWorkers() {
124 pool_.reset(nullptr);
125 }
126
127 std::unique_ptr<tensorflow::thread::ThreadPool> pool_;
128 static int pool_size_;
129#endif
130 static std::vector<Image > pix_;
131 static std::vector<std::string> langs_;
132 static std::vector<std::string> gt_text_;
133 static int num_langs_;
134};
135
136// static member variable declarations.
137#ifdef INCLUDE_TENSORFLOW
138int BaseapiThreadTest::pool_size_;
139#endif
140std::vector<Image > BaseapiThreadTest::pix_;
141std::vector<std::string> BaseapiThreadTest::langs_;
142std::vector<std::string> BaseapiThreadTest::gt_text_;
144
// Initializes `tess` for language `lang` using the data in TESSDATA_DIR.
//
// tess: instance to initialize; must not be null (enforced with CHECK).
// lang: language code passed to TessBaseAPI::Init.
//
// A non-zero return value from Init is reported as a non-fatal gtest failure
// (EXPECT_EQ), so the caller continues with the instance in whatever state
// Init left it.
static void InitTessInstance(TessBaseAPI *tess, const std::string &lang) {
  CHECK(tess != nullptr);
  EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str()));
}
149
150static void GetCleanedText(TessBaseAPI *tess, Image pix, std::string &ocr_text) {
151 tess->SetImage(pix);
152 char *result = tess->GetUTF8Text();
153 ocr_text = result;
154 delete[] result;
155 trim(ocr_text);
156}
157
158#ifdef INCLUDE_TENSORFLOW
159static void VerifyTextResult(TessBaseAPI *tess, Image pix, const std::string &lang,
160 const std::string &expected_text) {
161 TessBaseAPI *tess_local = nullptr;
162 if (tess) {
163 tess_local = tess;
164 } else {
165 tess_local = new TessBaseAPI;
166 InitTessInstance(tess_local, lang);
167 }
168 std::string ocr_text;
169 GetCleanedText(tess_local, pix, ocr_text);
170 EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str());
171 if (tess_local != tess) {
172 delete tess_local;
173 }
174}
175#endif
176
177// Check that Tesseract/Cube produce the correct results in single-threaded
178// operation. If not, it is pointless to run the real multi-threaded tests.
179TEST_F(BaseapiThreadTest, TestBasicSanity) {
180 for (int i = 0; i < num_langs_; ++i) {
181 TessBaseAPI tess;
182 InitTessInstance(&tess, langs_[i]);
183 std::string ocr_text;
184 GetCleanedText(&tess, pix_[i], ocr_text);
185 CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) << "Failed with lang = " << langs_[i];
186 }
187}
188
189// Test concurrent instance initialization.
191#ifdef INCLUDE_TENSORFLOW
192 const int n = num_langs_ * FLAGS_reps;
193 ResetPool();
194 std::vector<TessBaseAPI> tess(n);
195 for (int i = 0; i < n; ++i) {
196 pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_]));
197 }
198 WaitForPoolWorkers();
199#endif
200}
201
202// Test concurrent recognition.
203TEST_F(BaseapiThreadTest, TestRecognition) {
204#ifdef INCLUDE_TENSORFLOW
205 const int n = num_langs_ * FLAGS_reps;
206 std::vector<TessBaseAPI> tess(n);
207 // Initialize api instances in a single thread.
208 for (int i = 0; i < n; ++i) {
209 InitTessInstance(&tess[i], langs_[i % num_langs_]);
210 }
211
212 ResetPool();
213 for (int i = 0; i < n; ++i) {
214 pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], langs_[i % num_langs_],
215 gt_text_[i % num_langs_]));
216 }
217 WaitForPoolWorkers();
218#endif
219}
220
222#ifdef INCLUDE_TENSORFLOW
223 const int n = num_langs_ * FLAGS_reps;
224 ResetPool();
225 for (int i = 0; i < n; ++i) {
226 pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], langs_[i % num_langs_],
227 gt_text_[i % num_langs_]));
228 }
229 WaitForPoolWorkers();
230#endif
231}
232} // namespace tesseract
struct TessBaseAPI TessBaseAPI
Definition: capi.h:60
@ LOG
@ INFO
Definition: log.h:28
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_STREQ(s1, s2)
Definition: gtest.h:2112
#define CHECK(condition)
Definition: include_gunit.h:76
INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.")
BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances")
TEST_F(EuroText, FastLatinOCR)
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:368
static std::vector< std::string > langs_
static std::vector< Image > pix_
static std::vector< std::string > gt_text_