All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
context.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: context.c (Formerly context.c)
5  * Description: Context checking functions
6  * Author: Mark Seaman, OCR Technology
7  * Created: Thu Feb 15 11:18:24 1990
8  * Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Experimental (Do Not Distribute)
12  *
13  * (c) Copyright 1990, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 
26 #include "dict.h"
27 #include "tprintf.h"
28 #include "unicharset.h"
29 
30 namespace tesseract {
31 
namespace {

// Words with fewer unichars than this are never reported as absolute garbage.
const int kMinAbsoluteGarbageWordLength = 10;

// A sufficiently long word is reported as absolute garbage when the fraction
// of its unichars that are alphabetic or digits is below this value.
const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;

}  // namespace
34 
// State transition table used to check the capitalization of a word.
//
// Each row is the current state; each column is the class of the next
// character:
//   [0] P - anything that is not upper case, lower case or a digit
//   [1] U - upper case
//   [2] L - lower case
//   [3] D - digit
// An entry is the state to move to, or -1 when the transition is a case error.
const int case_state_table[6][4] = {
    // P   U   L   D
    {0, 1, 5, 4},    // 0. Beginning of word
    {0, 3, 2, 4},    // 1. After initial capital
    {0, -1, 2, -1},  // 2. After lower case
    {0, 3, -1, 4},   // 3. After upper case
    {0, -1, -1, 4},  // 4. After a digit
    {5, -1, 2, -1},  // 5. After initial lower case
};
57 
58 int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
59  int state = 0;
60  int x;
61  for (x = 0; x < word.length(); ++x) {
62  UNICHAR_ID ch_id = word.unichar_id(x);
63  if (unicharset.get_isupper(ch_id))
64  state = case_state_table[state][1];
65  else if (unicharset.get_islower(ch_id))
66  state = case_state_table[state][2];
67  else if (unicharset.get_isdigit(ch_id))
68  state = case_state_table[state][3];
69  else
70  state = case_state_table[state][0];
71  if (state == -1) return false;
72  }
73  return state != 5; // single lower is bad
74 }
75 
77  const UNICHARSET &unicharset) {
78  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
79  int num_alphanum = 0;
80  for (int x = 0; x < word.length(); ++x) {
81  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
82  unicharset.get_isdigit(word.unichar_id(x)));
83  }
84  return (static_cast<float>(num_alphanum) /
85  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
86 }
87 
88 } // namespace tesseract
int length() const
Definition: ratngs.h:300
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:76
const int case_state_table[6][4]
Definition: context.cpp:35
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int UNICHAR_ID
Definition: unichar.h:33
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449