All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
lm_state.h
Go to the documentation of this file.
1 // File: lm_state.h
3 // Description: Structures and functionality for capturing the state of
4 // segmentation search guided by the language model.
5 //
6 // Author: Rika Antonova
7 // Created: Mon Jun 20 11:26:43 PST 2012
8 //
9 // (C) Copyright 2012, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
23 #define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
24 
25 #include "associate.h"
26 #include "elst.h"
27 #include "dawg.h"
28 #include "lm_consistency.h"
29 #include "matrix.h"
30 #include "ratngs.h"
31 #include "stopper.h"
32 #include "strngs.h"
33 
34 namespace tesseract {
35 
// Used for expressing various language model flags.
37 typedef unsigned char LanguageModelFlagsType;
38 
57 
64  }
66  delete active_dawgs;
67  }
70 };
71 
75  LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
78  STRING context; //< context string
86  bool pruned;
88  float ngram_cost;
91 };
92 
95 struct ViterbiStateEntry : public ELIST_LINK {
97  BLOB_CHOICE *b, float c, float ol,
98  const LMConsistencyInfo &ci,
99  const AssociateStats &as,
100  LanguageModelFlagsType tcf,
103  const char *debug_uch)
104  : cost(c), curr_b(b), parent_vse(pe), competing_vse(NULL),
105  ratings_sum(b->rating()),
106  min_certainty(b->certainty()), adapted(b->IsAdapted()), length(1),
109  updated(true) {
110  debug_str = (debug_uch == NULL) ? NULL : new STRING();
111  if (pe != NULL) {
112  ratings_sum += pe->ratings_sum;
113  if (pe->min_certainty < min_certainty) {
115  }
116  adapted += pe->adapted;
117  length += pe->length;
119  if (debug_uch != NULL) *debug_str += *(pe->debug_str);
120  }
121  if (debug_str != NULL && debug_uch != NULL) *debug_str += debug_uch;
122  }
124  delete dawg_info;
125  delete ngram_info;
126  delete debug_str;
127  }
130  static int Compare(const void *e1, const void *e2) {
131  const ViterbiStateEntry *ve1 =
132  *reinterpret_cast<const ViterbiStateEntry * const *>(e1);
133  const ViterbiStateEntry *ve2 =
134  *reinterpret_cast<const ViterbiStateEntry * const *>(e2);
135  return (ve1->cost < ve2->cost) ? -1 : 1;
136  }
137  inline bool Consistent() const {
139  return true;
140  }
141  return consistency_info.Consistent();
142  }
145  bool HasAlnumChoice(const UNICHARSET& unicharset) {
146  if (curr_b == NULL) return false;
147  UNICHAR_ID unichar_id = curr_b->unichar_id();
148  if (unicharset.get_isalpha(unichar_id) ||
149  unicharset.get_isdigit(unichar_id))
150  return true;
151  return false;
152  }
153  void Print(const char *msg) const;
154 
157  float cost;
158 
165 
168  float ratings_sum; //< sum of ratings of character on the path
169  float min_certainty; //< minimum certainty on the path
170  int adapted; //< number of BLOB_CHOICES from adapted templates
171  int length; //< number of characters on the path
172  float outline_length; //< length of the outline so far
173  LMConsistencyInfo consistency_info; //< path consistency info
174  AssociateStats associate_stats; //< character widths/gaps/seams
175 
178  LanguageModelFlagsType top_choice_flags;
179 
183 
187 
188  bool updated; //< set to true if the entry has just been created/updated
192 };
193 
195 
203 
205  void Clear();
206 
207  void Print(const char *msg);
208 
210  ViterbiStateEntry_LIST viterbi_state_entries;
216 };
217 
220  explicit BestChoiceBundle(int matrix_dimension)
221  : updated(false), best_vse(NULL) {
222  beam.reserve(matrix_dimension);
223  for (int i = 0; i < matrix_dimension; ++i)
224  beam.push_back(new LanguageModelState);
225  }
227 
229  bool updated;
238 };
239 
240 } // namespace tesseract
241 
242 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
LMConsistencyInfo consistency_info
Definition: lm_state.h:173
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
Definition: lm_state.h:212
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:237
ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol, const LMConsistencyInfo &ci, const AssociateStats &as, LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n, const char *debug_uch)
Definition: lm_state.h:96
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
Definition: lm_state.h:160
PermuterType
Definition: ratngs.h:240
ELISTIZEH(AmbigSpec)
void Print(const char *msg)
Definition: lm_state.cpp:70
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:186
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:182
LanguageModelDawgInfo(DawgPositionVector *a, PermuterType pt)
Definition: lm_state.h:62
ViterbiStateEntry * parent_vse
Definition: lm_state.h:161
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:178
bool Consistent() const
Definition: lm_state.h:137
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
Definition: lm_state.h:90
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:145
AssociateStats associate_stats
Definition: lm_state.h:174
ParamsEditor * pe
Definition: pgedit.cpp:108
DawgPositionVector * active_dawgs
Definition: lm_state.h:68
LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
Definition: lm_state.h:75
int UNICHAR_ID
Definition: unichar.h:33
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:130
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:231
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:213
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:215
PointerVector< LanguageModelState > beam
Definition: lm_state.h:235
ViterbiStateEntry * competing_vse
Definition: lm_state.h:164
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:229
void Clear()
Clears the viterbi search state back to its initial conditions.
Definition: lm_state.cpp:63
#define MAX_FLOAT32
Definition: host.h:124
Definition: strngs.h:44
void Print(const char *msg) const
Definition: lm_state.cpp:27
Struct to store information maintained by various language model components.
Definition: lm_state.h:197
#define NULL
Definition: host.h:144
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:88
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:210
BestChoiceBundle(int matrix_dimension)
Definition: lm_state.h:220
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:219