tesseract-ocr.github.io/3.x/a01058_source.html

 /* -*-C-*-

  ********************************************************************************

  *

  * File:        context.c  (Formerly context.c)

  * Description:  Context checking functions

  * Author:       Mark Seaman, OCR Technology

  * Created:      Thu Feb 15 11:18:24 1990

  * Modified:     Tue Jul  9 17:38:16 1991 (Mark Seaman) marks@hpgrlt

  * Language:     C

  * Package:      N/A

  * Status:       Experimental (Do Not Distribute)

  *

  * (c) Copyright 1990, Hewlett-Packard Company.

  ** Licensed under the Apache License, Version 2.0 (the "License");

  ** you may not use this file except in compliance with the License.

  ** You may obtain a copy of the License at

  ** http://www.apache.org/licenses/LICENSE-2.0

  ** Unless required by applicable law or agreed to in writing, software

  ** distributed under the License is distributed on an "AS IS" BASIS,

  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  ** See the License for the specific language governing permissions and

  ** limitations under the License.

  *

  *********************************************************************************/


 #include "dict.h"

 #include "tprintf.h"

 #include "unicharset.h"


 namespace tesseract {


 // Thresholds used by Dict::absolute_garbage().
 // Words with fewer unichars than this are never reported as garbage.
 static const int kMinAbsoluteGarbageWordLength = 10;
 // A long-enough word is reported as garbage when the fraction of its
 // unichars that are alphabetic or digits is strictly below this value.
 static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;


 // State-transition table driving Dict::case_ok().
 //
 // Rows are the current state; columns are the class of the next character:
 //   column 0 (P): anything that is not upper case, lower case or a digit
 //                 (presumably punctuation),
 //   column 1 (U): upper case,
 //   column 2 (L): lower case,
 //   column 3 (D): digit.
 // Each entry is the next state; -1 means "error on case".
 const int case_state_table[6][4] = {
     /*  P   U   L   D                                   */
     {   0,  1,  5,  4 },  /* 0. Beginning of word        */
     {   0,  3,  2,  4 },  /* 1. After initial capital    */
     {   0, -1,  2, -1 },  /* 2. After lower case         */
     {   0,  3, -1,  4 },  /* 3. After upper case         */
     {   0, -1, -1,  4 },  /* 4. After a digit            */
     {   5, -1,  2, -1 },  /* 5. After initial lower case */
 };


 int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {

   int state = 0;

   int x;

   for (x = 0; x < word.length(); ++x) {

     UNICHAR_ID ch_id = word.unichar_id(x);

     if (unicharset.get_isupper(ch_id))

       state = case_state_table[state][1];

     else if (unicharset.get_islower(ch_id))

       state = case_state_table[state][2];

     else if (unicharset.get_isdigit(ch_id))

       state = case_state_table[state][3];

     else

       state = case_state_table[state][0];

     if (state == -1) return false;

   }

   return state != 5; // single lower is bad

 }


 bool Dict::absolute_garbage(const WERD_CHOICE &word,

                             const UNICHARSET &unicharset) {

   if (word.length() < kMinAbsoluteGarbageWordLength) return false;

   int num_alphanum = 0;

   for (int x = 0; x < word.length(); ++x) {

     num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||

                      unicharset.get_isdigit(word.unichar_id(x)));

   }

   return (static_cast<float>(num_alphanum) /

           static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);

 }


 }  // namespace tesseract

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

unicharset.h

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

tesseract::Dict::absolute_garbage
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:76

WERD_CHOICE
Definition: ratngs.h:271

dict.h

tprintf.h

tesseract::case_state_table
const int case_state_table[6][4]
Definition: context.cpp:35

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456

tesseract
Definition: baseapi.cpp:83

UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449

UNICHARSET
Definition: unicharset.h:139