All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
word_list_lang_model.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: word_list_lang_model.h
3  * Description: Declaration of the Word List Language Model Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The WordListLangModel class abstracts a language model that is based on
21 // a list of words. It inherits from the LangModel abstract class.
22 // Besides providing the methods inherited from the LangModel abstract class,
23 // the class provides methods to add new strings to the Language Model:
24 // AddString & AddString32
25 
26 #ifndef WORD_LIST_LANG_MODEL_H
27 #define WORD_LIST_LANG_MODEL_H
28 
29 #include <vector>
30 
31 #include "cube_reco_context.h"
32 #include "lang_model.h"
33 #include "tess_lang_mod_edge.h"
34 
35 namespace tesseract {
36 
37 class Trie;
38 
39 class WordListLangModel : public LangModel {
40  public:
41  explicit WordListLangModel(CubeRecoContext *cntxt);
43  // Returns an edge pointer to the Root
44  LangModEdge *Root();
45  // Returns the edges that fan out of the specified edge, and their count
46  LangModEdge **GetEdges(CharAltList *alt_list,
47  LangModEdge *edge,
48  int *edge_cnt);
49  // Returns whether a sequence of 32-bit characters is valid within this
50  // language model or net. An EndOfWord flag is specified: if true, the
51  // sequence has to end on a valid word. The function also optionally returns
52  // the list of language model edges traversed to parse the string
53  bool IsValidSequence(const char_32 *sequence,
54  bool eow_flag,
55  LangModEdge **edges);
56  bool IsLeadingPunc(char_32 ch) { return false; } // not yet implemented; always returns false
57  bool IsTrailingPunc(char_32 ch) { return false; } // not yet implemented; always returns false
58  bool IsDigit(char_32 ch) { return false; } // not yet implemented; always returns false
59  // Adds a new UTF-8 string to the language model
60  bool AddString(const char *char_ptr);
61  // Adds a new UTF-32 string to the language model
62  bool AddString32(const char_32 *char_32_ptr);
63  // Computes all the variants of a 32-bit string in terms of the class-ids.
64  // This is needed for languages that have ligatures. A word can then have
65  // more than one spelling in terms of the class-ids.
66  static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset,
67  string_32 str32,
68  vector<WERD_CHOICE *> *word_variants);
69  private:
70  // Constants needed to configure the language model
71  static const int kMaxEdge = 512;
72 
73  CubeRecoContext *cntxt_;
74  Trie *dawg_;
75  bool init_;
76  // Initializes the language model
77  bool Init();
78  // Cleanup
79  void Cleanup();
80  // Recursive helper function for the public WordVariants() overload.
81  static void WordVariants(
82  const CharSet &char_set,
83  string_32 prefix_str32, WERD_CHOICE *word_so_far,
84  string_32 str32,
85  vector<WERD_CHOICE *> *word_variants);
86 };
87 } // tesseract
88 
89 #endif // WORD_LIST_LANG_MODEL_H
bool AddString32(const char_32 *char_32_ptr)
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **edges)
basic_string< char_32 > string_32
Definition: string_32.h:41
WordListLangModel(CubeRecoContext *cntxt)
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)
signed int char_32
Definition: string_32.h:40
bool AddString(const char *char_ptr)