tesseract v5.3.3.20231005
classifier_tester.cpp
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14// Filename: classifier_tester.cpp
15// Purpose: Tests a character classifier on data as formatted for training,
16// but doesn't have to be the same as the training data.
17// Author: Ray Smith
18
19#include <tesseract/baseapi.h>
20#include <algorithm>
21#include <cstdio>
22#include "commontraining.h"
23#include "mastertrainer.h"
24#include "params.h"
25#include "tessclassifier.h"
26#include "tesseractclass.h"
27
28using namespace tesseract;
29
30static STRING_PARAM_FLAG(classifier, "", "Classifier to test");
31static STRING_PARAM_FLAG(lang, "eng", "Language to test");
32static STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
33
35
36static const char *names[] = {"pruner", "full"};
37
38static tesseract::ShapeClassifier *InitializeClassifier(const char *classifer_name,
39 const UNICHARSET &unicharset, int argc,
40 char **argv, tesseract::TessBaseAPI **api) {
41 // Decode the classifier string.
42 ClassifierName classifier = CN_COUNT;
43 for (int c = 0; c < CN_COUNT; ++c) {
44 if (strcmp(classifer_name, names[c]) == 0) {
45 classifier = static_cast<ClassifierName>(c);
46 break;
47 }
48 }
49 if (classifier == CN_COUNT) {
50 fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
51 return nullptr;
52 }
53
54 // We need to initialize tesseract to test.
55 *api = new tesseract::TessBaseAPI;
58 tesseract::Classify *classify = nullptr;
59 if (classifier == CN_PRUNER || classifier == CN_FULL) {
60 if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(), engine_mode) < 0) {
61 fprintf(stderr, "Tesseract initialization failed!\n");
62 return nullptr;
63 }
64 tesseract = const_cast<tesseract::Tesseract *>((*api)->tesseract());
65 classify = static_cast<tesseract::Classify *>(tesseract);
66 if (classify->shape_table() == nullptr) {
67 fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
68 return nullptr;
69 }
70 }
71 tesseract::ShapeClassifier *shape_classifier = nullptr;
72
73 if (classifier == CN_PRUNER) {
74 shape_classifier = new tesseract::TessClassifier(true, classify);
75 } else if (classifier == CN_FULL) {
76 shape_classifier = new tesseract::TessClassifier(false, classify);
77 }
78 tprintf("Testing classifier %s:\n", classifer_name);
79 return shape_classifier;
80}
81
82// This program has complex setup requirements, so here is some help:
83// Two different modes, tr files and serialized mastertrainer.
84// From tr files:
85// classifier_tester -U unicharset -F font_properties -X xheights
86// -classifier x -lang lang [-output_trainer trainer] *.tr
87// From a serialized trainer:
88// classifier_tester -input_trainer trainer [-lang lang] -classifier x
89//
90// In the first case, the unicharset must be the unicharset from within
91// the classifier under test, and the font_properties and xheights files must
92// match the files used during training.
93// In the second case, the trainer file must have been prepared from
94// some previous run of shapeclustering, mftraining, or classifier_tester
95// using the same conditions as above, ie matching unicharset/font_properties.
96//
97// Available values of classifier (x above) are:
98// pruner : Tesseract class pruner only.
99// full : Tesseract full classifier.
100// with an input trainer.)
101int main(int argc, char **argv) {
102 tesseract::CheckSharedLibraryVersion();
103 ParseArguments(&argc, &argv);
104 std::string file_prefix;
105 auto trainer = tesseract::LoadTrainingData(argv + 1, false, nullptr, file_prefix);
107 // Decode the classifier string.
108 tesseract::ShapeClassifier *shape_classifier =
109 InitializeClassifier(FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
110 if (shape_classifier == nullptr) {
111 fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
112 return EXIT_FAILURE;
113 }
114
115 // We want to test junk as well if it is available.
116 // trainer->IncludeJunk();
117 // We want to test with replicated samples too.
118 trainer->ReplicateAndRandomizeSamplesIfRequired();
119
120 trainer->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR,
121 std::max(3, static_cast<int>(FLAGS_debug_level)), false,
122 shape_classifier, nullptr);
123 delete shape_classifier;
124 delete api;
125
126 return EXIT_SUCCESS;
127} /* main */
struct TessBaseAPI TessBaseAPI
Definition: capi.h:60
#define STRING_PARAM_FLAG(name, val, comment)
int main(int argc, char **argv)
ClassifierName
@ CN_FULL
@ CN_PRUNER
@ CN_COUNT
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:264
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void ParseArguments(int *argc, char ***argv)
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
@ CT_UNICHAR_TOP1_ERR
Definition: errorcounter.h:74
const ShapeTable * shape_table() const
Definition: classify.h:102