All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract_cube_combiner.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tesseract_cube_combiner.h
3  * Description: Declaration of the Tesseract & Cube results combiner Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The TesseractCubeCombiner class provides the functionality of combining
21 // the recognition results of Tesseract and Cube at the word level
22 
23 #ifndef TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
24 #define TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
25 
26 #include <string>
27 #include <vector>
28 #include "pageres.h"
29 
30 #ifdef _WIN32
31 #include <windows.h>
32 using namespace std;
33 #endif
34 
35 #ifdef USE_STD_NAMESPACE
36 using std::string;
37 using std::vector;
38 #endif
39 
40 namespace tesseract {
41 
42 class CubeObject;
43 class NeuralNet;
44 class CubeRecoContext;
45 class WordAltList;
46 
46 
47 class TesseractCubeCombiner {
48  public:
49  explicit TesseractCubeCombiner(CubeRecoContext *cube_cntxt);
50  virtual ~TesseractCubeCombiner();
51 
52  // There are 2 public methods for combining the results of tesseract
53  // and cube. Both return the probability that the Tesseract result is
54  // correct. The difference between the two interfaces is in how the
55  // passed-in CubeObject is used.
56 
57  // The CubeObject parameter is used for 2 purposes: 1) to retrieve
58  // cube's alt list, and 2) to compute cube's word cost for the
59  // tesseract result. Both uses may modify the state of the
60  // CubeObject (including the BeamSearch state) with a call to
61  // RecognizeWord().
62  float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj);
63 
64  // The alt_list parameter is expected to have been extracted from the
65  // CubeObject that recognized the word to be combined. The cube_obj
66  // parameter passed in is a separate instance to be used only by
67  // the combiner.
68  float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj,
69  WordAltList *alt_list);
70 
71  // Public method for computing the combiner features. The agreement
72  // output parameter will be true if both answers are identical,
73  // false otherwise. Modifies the cube_alt_list, so no assumptions
74  // should be made about its state upon return.
75  bool ComputeCombinerFeatures(const string &tess_res,
76  int tess_confidence,
77  CubeObject *cube_obj,
78  WordAltList *cube_alt_list,
79  vector<double> *features,
80  bool *agreement);
81 
82  // Returns whether the word is valid according to Tesseract's language model.
83  bool ValidWord(const string &str);
84 
85  // Loads the combiner neural network from file, using cube_cntxt_
86  // to find path.
87  bool LoadCombinerNet();
88  private:
89  // Normalizes a UTF-8 string: converts it to UTF-32, optionally strips
90  // punctuation and/or normalizes case, and then converts back.
91  string NormalizeString(const string &str, bool remove_punc, bool norm_case);
92 
93  // Compares two strings after optionally normalizing their case and/or
94  // stripping punctuation.
95  int CompareStrings(const string &str1, const string &str2, bool ignore_punc,
96  bool norm_case);
97 
98  NeuralNet *combiner_net_; // pointer to the combiner NeuralNet object
99  CubeRecoContext *cube_cntxt_; // used for language ID and data paths
100 };
101 }
102 
103 #endif // TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
STL namespace.