tesseract-ocr.github.io/3.x/a00701_source.html

 /******************************************************************

  * File:        fixspace.cpp  (Formerly fixspace.c)

  * Description: Implements a pass over the page res, exploring the alternative

  *              spacing possibilities, trying to use context to improve the

  *              word spacing

 * Author:               Phil Cheatle

 * Created:              Thu Oct 21 11:38:43 BST 1993

 *

 * (C) Copyright 1993, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


 #include <ctype.h>

 #include "reject.h"

 #include "statistc.h"

 #include "control.h"

 #include "fixspace.h"

 #include "genblob.h"

 #include "tessvars.h"

 #include "tessbox.h"

 #include "globals.h"

 #include "tesseractclass.h"


 #define PERFECT_WERDS   999

 #define MAXSPACING      128      /*max expected spacing in pix */


 namespace tesseract {


 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,

                                  inT32 word_count,

                                  PAGE_RES *page_res) {

   BLOCK_RES_IT block_res_it;

   ROW_RES_IT row_res_it;

   WERD_RES_IT word_res_it_from;

   WERD_RES_IT word_res_it_to;

   WERD_RES *word_res;

   WERD_RES_LIST fuzzy_space_words;

   inT16 new_length;

   BOOL8 prevent_null_wd_fixsp;   // DONT process blobless wds

   inT32 word_index;              // current word


   block_res_it.set_to_list(&page_res->block_res_list);

   word_index = 0;

   for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();

        block_res_it.forward()) {

     row_res_it.set_to_list(&block_res_it.data()->row_res_list);

     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();

          row_res_it.forward()) {

       word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);

       while (!word_res_it_from.at_last()) {

         word_res = word_res_it_from.data();

         while (!word_res_it_from.at_last() &&

                !(word_res->combination ||

                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||

                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {

           fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,

                          block_res_it.data()->block);

           word_res = word_res_it_from.forward();

           word_index++;

           if (monitor != NULL) {

             monitor->ocr_alive = TRUE;

             monitor->progress = 90 + 5 * word_index / word_count;

             if (monitor->deadline_exceeded() ||

                 (monitor->cancel != NULL &&

                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))

             return;

           }

         }


         if (!word_res_it_from.at_last()) {

           word_res_it_to = word_res_it_from;

           prevent_null_wd_fixsp =

             word_res->word->cblob_list()->empty();

           if (check_debug_pt(word_res, 60))

             debug_fix_space_level.set_value(10);

           word_res_it_to.forward();

           word_index++;

           if (monitor != NULL) {

             monitor->ocr_alive = TRUE;

             monitor->progress = 90 + 5 * word_index / word_count;

             if (monitor->deadline_exceeded() ||

                 (monitor->cancel != NULL &&

                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))

             return;

           }

           while (!word_res_it_to.at_last () &&

                  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||

                   word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {

             if (check_debug_pt(word_res, 60))

               debug_fix_space_level.set_value(10);

             if (word_res->word->cblob_list()->empty())

               prevent_null_wd_fixsp = TRUE;

             word_res = word_res_it_to.forward();

           }

           if (check_debug_pt(word_res, 60))

             debug_fix_space_level.set_value(10);

           if (word_res->word->cblob_list()->empty())

             prevent_null_wd_fixsp = TRUE;

           if (prevent_null_wd_fixsp) {

             word_res_it_from = word_res_it_to;

           } else {

             fuzzy_space_words.assign_to_sublist(&word_res_it_from,

                                                 &word_res_it_to);

             fix_fuzzy_space_list(fuzzy_space_words,

                                  row_res_it.data()->row,

                                  block_res_it.data()->block);

             new_length = fuzzy_space_words.length();

             word_res_it_from.add_list_before(&fuzzy_space_words);

             for (;

                  !word_res_it_from.at_last() && new_length > 0;

                  new_length--) {

               word_res_it_from.forward();

             }

           }

           if (test_pt)

             debug_fix_space_level.set_value(0);

         }

         fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,

                        block_res_it.data()->block);

         // Last word in row

       }

     }

   }

 }


 /**
  * Searches alternative spacings for a run of fuzzily separated words.
  *
  * The incoming list is scored with eval_word_spacing(). Unless it is already
  * PERFECT_WERDS, a working copy is made by initialise_search() and then
  * repeatedly re-recognised, scored and advanced with
  * transform_to_next_perm() until either a perfect score is found or the
  * working list has been emptied. Whenever a candidate beats the best score
  * so far, best_perm is replaced by a deep copy of it.
  *
  * @param best_perm In: the current words. Out: the best spacing found.
  * @param row       Row passed through to match_current_words().
  * @param block     Block passed through to match_current_words().
  */
 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
                                      ROW *row,
                                      BLOCK* block) {
   WERD_RES_LIST current_perm;
   BOOL8 improved = FALSE;

   // The unmodified input provides the baseline score.
   inT16 best_score = eval_word_spacing(best_perm);
   dump_words(best_perm, best_score, 1, improved);

   if (best_score != PERFECT_WERDS) {
     initialise_search(best_perm, current_perm);

     while (!current_perm.empty() && best_score != PERFECT_WERDS) {
       match_current_words(current_perm, row, block);
       const inT16 current_score = eval_word_spacing(current_perm);
       dump_words(current_perm, current_score, 2, improved);

       if (current_score > best_score) {
         // Keep a private copy; current_perm carries on being transformed.
         best_perm.clear();
         best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
         best_score = current_score;
         improved = TRUE;
       }
       if (current_score < PERFECT_WERDS)
         transform_to_next_perm(current_perm);
     }
   }

   dump_words(best_perm, best_score, 3, improved);
 }


 }  // namespace tesseract


 /**
  * Fills new_list with deep copies of the words in src_list that are not
  * combinations. Each copy has its combination and part_of_combo flags
  * cleared and is appended in source order.
  */
 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
   WERD_RES_IT source_it(&src_list);
   WERD_RES_IT dest_it(&new_list);

   source_it.mark_cycle_pt();
   while (!source_it.cycled_list()) {
     WERD_RES *original = source_it.data();
     if (!original->combination) {
       WERD_RES *clone = WERD_RES::deep_copy(original);
       clone->combination = FALSE;
       clone->part_of_combo = FALSE;
       dest_it.add_after_then_move(clone);
     }
     source_it.forward();
   }
 }


 namespace tesseract {

 /**
  * Runs pass-2 recognition on each word in the list that is not part of a
  * combination and has no box_word yet.
  *
  * @param words Words to (re)classify in place.
  * @param row   Row used to build the WordData for each word.
  * @param block Block used to build the WordData for each word.
  */
 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
                                     BLOCK* block) {
   // The words are not reached through a PAGE_RES iterator here, so
   // prev_word_best_choice_ has to be maintained by hand around each call
   // into the classifier.
   prev_word_best_choice_ = NULL;

   WERD_RES_IT it(&words);
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     WERD_RES *current = it.data();
     const bool needs_classifying =
         !current->part_of_combo && current->box_word == NULL;
     if (needs_classifying) {
       WordData word_data(block, row, current);
       SetupWordPassN(2, &word_data);
       classify_word_and_language(2, NULL, &word_data);
     }
     prev_word_best_choice_ = current->best_choice;
   }
 }


 /**
  * Scores one candidate spacing of a list of words.
  *
  * Words flagged part_of_combo are skipped. For every other word:
  *  - If fixspace_thinks_word_done() accepts it and there is no 1/digit
  *    clash with the end of the previous word, its length (taken from the
  *    reject map) is held back in prev_word_score and added to the total once
  *    the following word has been examined (or at the end of the list).
  *  - One point is added for every '1' that is joined to a neighbouring
  *    character in the same word.
  *  - If tessedit_prefer_joined_punct is set, one point is added in the same
  *    way for every joined character from punct_chars.
  *
  * NOTE(review): the do/while dereferences the first element without a
  * check, so callers presumably never pass an empty list - confirm.
  *
  * @param word_res_list Words to evaluate.
  * @return PERFECT_WERDS if every counted word was considered done,
  *         otherwise the accumulated score.
  */
 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
   WERD_RES_IT word_res_it(&word_res_list);
   inT16 total_score = 0;
   inT16 word_count = 0;
   inT16 done_word_count = 0;
   inT16 word_len;
   inT16 i;
   inT16 offset;
   WERD_RES *word;                 // current word
   inT16 prev_word_score = 0;
   BOOL8 prev_word_done = FALSE;
   BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
   BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
   BOOL8 current_char_1 = FALSE;
   BOOL8 current_word_ok_so_far;
   STRING punct_chars = "!\"`',.:;";
   BOOL8 prev_char_punct = FALSE;
   BOOL8 current_char_punct = FALSE;
   BOOL8 word_done = FALSE;

   do {
     word = word_res_it.data();
     word_done = fixspace_thinks_word_done(word);
     word_count++;
     if (word->tess_failed) {
       // A failed word cannot clash with its predecessor: bank the previous
       // word's score and reset all carried state.
       total_score += prev_word_score;
       if (prev_word_done)
         done_word_count++;
       prev_word_score = 0;
       prev_char_1 = FALSE;
       prev_char_digit = FALSE;
       prev_word_done = FALSE;
     } else {
       /*
         Can we add the prev word score and potentially count this word?
         Yes IF it didnt end in a 1 when the first char of this word is a digit
           AND it didnt end in a digit when the first char of this word is a 1
       */
       word_len = word->reject_map.length();
       current_word_ok_so_far = FALSE;
       if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
             (prev_char_digit && (
                 (word_done &&
                  word->best_choice->unichar_lengths().string()[0] == 1 &&
                  word->best_choice->unichar_string()[0] == '1') ||
                 (!word_done && STRING(conflict_set_I_l_1).contains(
                       word->best_choice->unichar_string()[0])))))) {
         total_score += prev_word_score;
         if (prev_word_done)
           done_word_count++;
         current_word_ok_so_far = word_done;
       }

       if (current_word_ok_so_far) {
         prev_word_done = TRUE;
         prev_word_score = word_len;
       } else {
         prev_word_done = FALSE;
         prev_word_score = 0;
       }

       /* Add 1 to total score for every joined 1 regardless of context and
          rejtn */
       // word_len counts unichars, while unichar_string() is a byte string,
       // so step through it by each unichar's byte length (as the
       // punctuation loop below does) instead of indexing it by i directly.
       for (i = 0, offset = 0, prev_char_1 = FALSE; i < word_len;
            offset += word->best_choice->unichar_lengths()[i++]) {
         current_char_1 =
           word->best_choice->unichar_string()[offset] == '1';
         if (prev_char_1 || (current_char_1 && (i > 0)))
           total_score++;
         prev_char_1 = current_char_1;
       }

       /* Add 1 to total score for every joined punctuation regardless of context
         and rejtn */
       if (tessedit_prefer_joined_punct) {
         for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
              offset += word->best_choice->unichar_lengths()[i++]) {
           current_char_punct =
             punct_chars.contains(word->best_choice->unichar_string()[offset]);
           if (prev_char_punct || (current_char_punct && i > 0))
             total_score++;
           prev_char_punct = current_char_punct;
         }
       }
       // Remember how this word ends for the boundary test on the next word.
       prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
       // Find the byte offset of the last unichar.
       for (i = 0, offset = 0; i < word_len - 1;
            offset += word->best_choice->unichar_lengths()[i++]);
       prev_char_1 =
           ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
            || (!word_done && STRING(conflict_set_I_l_1).contains(
                    word->best_choice->unichar_string()[offset])));
     }
     /* Find next word */
     do {
       word_res_it.forward();
     } while (word_res_it.data()->part_of_combo);
   } while (!word_res_it.at_first());
   // Bank the score held back for the final word.
   total_score += prev_word_score;
   if (prev_word_done)
     done_word_count++;
   if (done_word_count == word_count)
     return PERFECT_WERDS;
   else
     return total_score;
 }


 /**
  * Tests whether the unichar at char_position in the word's best choice is a
  * digit according to the word's unicharset, or, when the best choice came
  * from NUMBER_PERM, whether its first byte is one of the characters in
  * numeric_punctuation.
  *
  * @param word          Word whose best_choice is inspected.
  * @param char_position Index of the unichar (not the byte) to test.
  */
 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
   const WERD_CHOICE *choice = word->best_choice;

   // Convert the unichar index into a byte offset by summing the byte
   // lengths of all preceding unichars.
   int index = 0;
   int byte_offset = 0;
   while (index < char_position) {
     byte_offset += choice->unichar_lengths()[index];
     ++index;
   }

   if (word->uch_set->get_isdigit(
           choice->unichar_string().string() + byte_offset,
           choice->unichar_lengths()[index]))
     return TRUE;

   return choice->permuter() == NUMBER_PERM &&
          STRING(numeric_punctuation).contains(
              choice->unichar_string().string()[byte_offset]);
 }


 }  // namespace tesseract


 /**
  * Advances the list of words to the next candidate spacing by closing the
  * narrowest gap(s).
  *
  * A first pass over the words that are not part_of_combo finds the smallest
  * horizontal gap between one word's right edge and the next word's left
  * edge. If no gap was found the list is cleared, which callers use as the
  * signal that the search is finished. Otherwise a second pass joins every
  * neighbouring pair whose gap is no larger than that minimum: the pair is
  * merged into a combination WERD_RES placed before the first of the joined
  * words, and the originals are flagged part_of_combo (or, if the right-hand
  * word was itself a combination, it is merged and deleted).
  *
  * @param words List to transform in place; emptied when nothing can be
  *              joined.
  */
 void transform_to_next_perm(WERD_RES_LIST &words) {

   WERD_RES_IT word_it(&words);

   // Trails word_it: the left-hand word (or combination) of the current pair.
   WERD_RES_IT prev_word_it(&words);

   WERD_RES *word;

   WERD_RES *prev_word;

   WERD_RES *combo;

   WERD *copy_word;

   // -MAX_INT16 is the sentinel for "no previous word seen yet".
   inT16 prev_right = -MAX_INT16;

   TBOX box;

   inT16 gap;

   // MAX_INT16 is the sentinel for "no gap measured".
   inT16 min_gap = MAX_INT16;


   // Pass 1: find the smallest gap between consecutive visible words.
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {

     word = word_it.data();

     if (!word->part_of_combo) {

       box = word->word->bounding_box();

       if (prev_right > -MAX_INT16) {

         gap = box.left() - prev_right;

         if (gap < min_gap)

           min_gap = gap;

       }

       prev_right = box.right();

     }

   }

   if (min_gap < MAX_INT16) {

     // Pass 2: join every pair separated by no more than min_gap.
     prev_right = -MAX_INT16;        // back to start

     word_it.set_to_list(&words);

     // Note: we can't use cycle_pt due to inserted combos at start of list.

     for (; (prev_right == -MAX_INT16) || !word_it.at_first();

          word_it.forward()) {

       word = word_it.data();

       if (!word->part_of_combo) {

         box = word->word->bounding_box();

         if (prev_right > -MAX_INT16) {

           gap = box.left() - prev_right;

           if (gap <= min_gap) {

             prev_word = prev_word_it.data();

             if (prev_word->combination) {

               // Left-hand side is already a combination: extend it.
               combo = prev_word;

             } else {

               /* Make a new combination and insert before

                * the first word being joined. */

               copy_word = new WERD;

               *copy_word = *(prev_word->word);

               // deep copy

               combo = new WERD_RES(copy_word);

               combo->combination = TRUE;

               combo->x_height = prev_word->x_height;

               prev_word->part_of_combo = TRUE;

               prev_word_it.add_before_then_move(combo);

             }

             // The combination takes its end-of-line flag from the word
             // being appended.
             combo->word->set_flag(W_EOL, word->word->flag(W_EOL));

             if (word->combination) {

               combo->word->join_on(word->word);

               // Move blobs to combo

               // old combo no longer needed

               delete word_it.extract();

             } else {

               // Copy current wd to combo

               combo->copy_on(word);

               word->part_of_combo = TRUE;

             }

             // The merged word has not been recognised yet.
             combo->done = FALSE;

             combo->ClearResults();

           } else {

             prev_word_it = word_it;  // catch up

           }

         }

         prev_right = box.right();

       }

     }

   } else {

     words.clear();  // signal termination

   }

 }


 namespace tesseract {

 /**
  * Debug output for the spacing search; silent unless
  * debug_fix_space_level is above 0.
  *
  * In mode 1 the text of the words is saved in stats_.dump_words_str. When
  * debug_fix_space_level is above 1, the list is printed with a heading that
  * depends on the mode (1: EXTRACTED, 2: TESTED, 3: RETURNED). Otherwise the
  * list is printed only if improved is set, prefixed by the saved text.
  * Words flagged part_of_combo are never printed.
  *
  * @param perm     Words to print.
  * @param score    Score shown in the heading.
  * @param mode     1, 2 or 3 as described above.
  * @param improved Whether the search has improved on the original.
  */
 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
                            inT16 mode, BOOL8 improved) {
   WERD_RES_IT it(&perm);

   if (!(debug_fix_space_level > 0))
     return;

   if (mode == 1) {
     // Remember the original text so a later "FIX SPACING" line can show it.
     stats_.dump_words_str = "";
     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
       WERD_RES *wd = it.data();
       if (wd->part_of_combo)
         continue;
       stats_.dump_words_str += wd->best_choice->unichar_string();
       stats_.dump_words_str += ' ';
     }
   }

   const bool verbose = debug_fix_space_level > 1;
   if (!verbose && !improved)
     return;

   // Heading: opens a quoted list that is closed after the words.
   if (verbose) {
     if (mode == 1) {
       tprintf("EXTRACTED (%d): \"", score);
     } else if (mode == 2) {
       tprintf("TESTED (%d): \"", score);
     } else if (mode == 3) {
       tprintf("RETURNED (%d): \"", score);
     }
   } else {
     tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
   }

   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     WERD_RES *wd = it.data();
     if (wd->part_of_combo)
       continue;
     tprintf("%s/%1d ",
             wd->best_choice->unichar_string().string(),
             (int)wd->best_choice->permuter());
   }
   tprintf("\"\n");
 }


 /**
  * Decides whether a word is good enough to count as "done" when scoring a
  * spacing.
  *
  * A word already marked done always qualifies. Otherwise the standard pass 2
  * conditions for mode 5 of set_done() in reject.c are applied, except that
  * ambiguous words are NOT rejected: for spacing purposes it does not matter
  * whether the word is of/at, on/an, etc.
  *
  * @return TRUE if the word is considered done, FALSE otherwise.
  */
 BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
   if (word->done)
     return TRUE;

   // Mode 0 (or below) disables the relaxed test entirely.
   if (!(fixsp_done_mode > 0))
     return FALSE;

   const bool acceptable =
       word->tess_accepted ||
       (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
       fixsp_done_mode == 3;
   if (!acceptable)
     return FALSE;

   // A result containing a space is never treated as a finished word.
   if (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)
     return FALSE;

   // Only dictionary or number results qualify.
   switch (word->best_choice->permuter()) {
     case SYSTEM_DAWG_PERM:
     case FREQ_DAWG_PERM:
     case USER_DAWG_PERM:
     case NUMBER_PERM:
       return TRUE;
     default:
       return FALSE;
   }
 }


 /**
  * Tries to split the word under the iterator at noise blobs.
  *
  * Nothing is done for repeated-character words, combinations, parts of a
  * combination, words without W_DONT_CHOP, or words for which
  * worst_noise_blob() finds no candidate. Otherwise the word is moved into a
  * one-element list, fix_noisy_space_list() is run on it, and the resulting
  * word(s) are spliced back in front of the iterator position, which is then
  * advanced so that it rests on the last of them.
  *
  * @param word_res_it Iterator positioned on the word to examine.
  * @param row         Row passed through to fix_noisy_space_list().
  * @param block       Block passed through to fix_noisy_space_list().
  */
 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
                                BLOCK* block) {
   WERD_RES_LIST sub_word_list;
   WERD_RES_IT sub_it(&sub_word_list);

   WERD_RES *target = word_res_it.data();
   const bool not_eligible = target->word->flag(W_REP_CHAR) ||
                             target->combination ||
                             target->part_of_combo ||
                             !target->word->flag(W_DONT_CHOP);
   if (not_eligible)
     return;

   float unused_score;  // only the returned index matters here
   if (worst_noise_blob(target, &unused_score) < 0)
     return;

   if (debug_fix_space_level > 1) {
     tprintf("FP fixspace working on \"%s\"\n",
             target->best_choice->unichar_string().string());
   }

   target->word->rej_cblob_list()->sort(c_blob_comparator);

   // Work on the word in a list of its own, then put the outcome back.
   sub_it.add_after_stay_put(word_res_it.extract());
   fix_noisy_space_list(sub_word_list, row, block);
   inT16 remaining = sub_word_list.length();
   word_res_it.add_list_before(&sub_word_list);
   while (!word_res_it.at_last() && remaining > 1) {
     word_res_it.forward();
     --remaining;
   }
 }


 /**
  * Searches for a better result by breaking a word at its noisiest blobs.
  *
  * The incoming list is scored with fp_eval_word_spacing(). A deep copy of
  * its first word is then split with break_noisiest_blob_word(), and the
  * pieces are repeatedly re-recognised, scored and split again until a
  * PERFECT_WERDS score is reached or the working list has been emptied.
  * Whenever a candidate beats the best score so far, best_perm is replaced
  * by a deep copy of it.
  *
  * @param best_perm In: list holding the word. Out: the best split found.
  * @param row       Row passed through to match_current_words().
  * @param block     Block passed through to match_current_words().
  */
 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
                                      BLOCK* block) {
   WERD_RES_IT best_perm_it(&best_perm);
   WERD_RES_LIST current_perm;
   WERD_RES_IT current_perm_it(&current_perm);
   BOOL8 improved = FALSE;

   // The unmodified word provides the baseline score.
   inT16 best_score = fp_eval_word_spacing(best_perm);
   dump_words(best_perm, best_score, 1, improved);

   // Even deep_copy doesn't copy the underlying WERD unless its combination
   // flag is true, so set the flag just for the duration of the copy.
   WERD_RES *seed = best_perm_it.data();
   seed->combination = TRUE;
   WERD_RES *seed_copy = WERD_RES::deep_copy(seed);
   seed->combination = FALSE;
   current_perm_it.add_to_end(seed_copy);

   break_noisiest_blob_word(current_perm);

   while (!current_perm.empty() && best_score != PERFECT_WERDS) {
     match_current_words(current_perm, row, block);
     const inT16 current_score = fp_eval_word_spacing(current_perm);
     dump_words(current_perm, current_score, 2, improved);

     if (current_score > best_score) {
       best_perm.clear();
       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
       best_score = current_score;
       improved = TRUE;
     }
     if (current_score < PERFECT_WERDS)
       break_noisiest_blob_word(current_perm);
   }

   dump_words(best_perm, best_score, 3, improved);
 }


 /**
  * Splits one word of the list in two at its noisiest blob.
  *
  * worst_noise_blob() is asked for a candidate blob in every word, and the
  * word whose candidate has the lowest noise score is chosen. If no word has
  * a candidate, the list is cleared, which callers use as the signal that
  * the search is finished. Otherwise the blobs before the noise blob are
  * moved into a new word inserted in front of the chosen word, the noise
  * blob itself is deleted, and the rejected blobs lying to the left of the
  * noise blob follow the new word. Recognition results of the remaining
  * (right-hand) word are cleared.
  *
  * @param words List to modify in place; emptied when nothing can be split.
  */
 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {

   WERD_RES_IT word_it(&words);

   WERD_RES_IT worst_word_it;

   // Start above any score of interest so the first candidate is accepted.
   float worst_noise_score = 9999;

   int worst_blob_index = -1;     // Noisiest blob of noisiest wd

   int blob_index;                // of wds noisiest blob

   float noise_score;             // of wds noisiest blob

   WERD_RES *word_res;

   C_BLOB_IT blob_it;

   C_BLOB_IT rej_cblob_it;

   C_BLOB_LIST new_blob_list;

   C_BLOB_IT new_blob_it;

   C_BLOB_IT new_rej_cblob_it;

   WERD *new_word;

   inT16 start_of_noise_blob;

   inT16 i;


   // Pick the word whose worst blob has the lowest (noisiest) score.
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {

     blob_index = worst_noise_blob(word_it.data(), &noise_score);

     if (blob_index > -1 && worst_noise_score > noise_score) {

       worst_noise_score = noise_score;

       worst_blob_index = blob_index;

       worst_word_it = word_it;

     }

   }

   if (worst_blob_index < 0) {

     words.clear();          // signal termination

     return;

   }


   /* Now split the worst_word_it */


   word_res = worst_word_it.data();


   /* Move blobs before noise blob to a new bloblist */


   new_blob_it.set_to_list(&new_blob_list);

   blob_it.set_to_list(word_res->word->cblob_list());

   for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {

     new_blob_it.add_after_then_move(blob_it.extract());

   }

   // blob_it now rests on the noise blob; note where it starts before
   // discarding it.
   start_of_noise_blob = blob_it.data()->bounding_box().left();

   delete blob_it.extract();     // throw out noise blob


   // Build the left-hand word from the moved blobs.
   // NOTE(review): presumably the second argument makes the new WERD inherit
   // the original word's attributes - confirm against the WERD constructor.
   new_word = new WERD(&new_blob_list, word_res->word);

   new_word->set_flag(W_EOL, FALSE);

   word_res->word->set_flag(W_BOL, FALSE);

   word_res->word->set_blanks(1);  // After break


   // Rejected blobs left of the noise blob go with the new left-hand word.
   new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());

   rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());

   for (;

        (!rej_cblob_it.empty() &&

         (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));

        rej_cblob_it.forward()) {

     new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());

   }


   WERD_RES* new_word_res = new WERD_RES(new_word);

   new_word_res->combination = TRUE;

   worst_word_it.add_before_then_move(new_word_res);


   // The right-hand word lost blobs, so its old results are discarded.
   word_res->ClearResults();

 }


 /**
  * Finds the noisiest blob in a word that is far enough from both ends.
  *
  * Each blob gets a score: non_noise_limit if its reject map entry is
  * accepted, otherwise blob_noise_score(). Blobs scoring at least
  * non_noise_limit count as non-noise. A candidate must have at least
  * fixsp_non_noise_limit non-noise blobs on its left and on its right, and
  * must score below small_limit; the lowest-scoring candidate wins.
  *
  * @param word_res          Word to examine.
  * @param worst_noise_score Out: score of the returned blob. It is written
  *                          only once the candidate range has been
  *                          established, so it must not be read when an
  *                          earlier return yields -1.
  * @return Index of the noisiest blob, or -1 if the word has no
  *         rebuild_word, has fewer than 5 blobs, or has no candidate.
  */
 inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
                                   float *worst_noise_score) {
   float noise_score[512];
   int i;
   int min_noise_blob;            // 1st contender
   int max_noise_blob;            // last contender
   int non_noise_count;
   int worst_noise_blob;          // Worst blob
   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
   float non_noise_limit = kBlnXHeight * 0.8;

   if (word_res->rebuild_word == NULL)
     return -1;  // Can't handle cube words.

   // Normalised.
   int blob_count = word_res->box_word->length();
   ASSERT_HOST(blob_count <= 512);
   // Scores are taken from rebuild_word, so never look past its blobs.
   // Without this clamp a shorter rebuild_word would leave the tail of
   // noise_score[] uninitialised while the scans below still read it.
   if (blob_count > word_res->rebuild_word->NumBlobs())
     blob_count = word_res->rebuild_word->NumBlobs();
   if (blob_count < 5)
     return -1;                   // too short to split

   /* Get the noise scores for all blobs */

   #ifndef SECURE_NAMES
   if (debug_fix_space_level > 5)
     tprintf("FP fixspace Noise metrics for \"%s\": ",
             word_res->best_choice->unichar_string().string());
   #endif

   for (i = 0; i < blob_count; i++) {
     TBLOB* blob = word_res->rebuild_word->blobs[i];
     if (word_res->reject_map[i].accepted())
       noise_score[i] = non_noise_limit;
     else
       noise_score[i] = blob_noise_score(blob);

     if (debug_fix_space_level > 5)
       tprintf("%1.1f ", noise_score[i]);
   }
   if (debug_fix_space_level > 5)
     tprintf("\n");

   /* Now find the worst one which is far enough away from the end of the word */

   // Skip past the first fixsp_non_noise_limit non-noise blobs on the left.
   non_noise_count = 0;
   for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
   }
   if (non_noise_count < fixsp_non_noise_limit)
     return -1;

   min_noise_blob = i;

   // Likewise from the right-hand end.
   non_noise_count = 0;
   for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
        i--) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
   }
   if (non_noise_count < fixsp_non_noise_limit)
     return -1;

   max_noise_blob = i;

   if (min_noise_blob > max_noise_blob)
     return -1;

   // Only blobs scoring below small_limit can be chosen.
   *worst_noise_score = small_limit;
   worst_noise_blob = -1;
   for (i = min_noise_blob; i <= max_noise_blob; i++) {
     if (noise_score[i] < *worst_noise_score) {
       worst_noise_blob = i;
       *worst_noise_score = noise_score[i];
     }
   }
   return worst_noise_blob;
 }


 /**
  * Scores how noise-like a blob is; smaller means noisier.
  *
  * The score is the longer bounding-box side of the blob's largest outline.
  * It is doubled when the blob has more than 5 outlines, then halved (as an
  * integer) when the blob's bounding box has its bottom above
  * kBlnBaselineOffset * 4 or its top below kBlnBaselineOffset / 2.
  *
  * @param blob Blob whose outlines are measured.
  */
 float Tesseract::blob_noise_score(TBLOB *blob) {
   inT16 num_outlines = 0;
   inT16 biggest = 0;

   for (TESSLINE* outline = blob->outlines; outline != NULL;
        outline = outline->next) {
     ++num_outlines;
     const TBOX outline_box = outline->bounding_box();
     const inT16 longest_side =
         (outline_box.height() > outline_box.width()) ? outline_box.height()
                                                      : outline_box.width();
     if (biggest < longest_side)
       biggest = longest_side;
   }

   // penalise LOTS of blobs
   if (num_outlines > 5)
     biggest *= 2;

   // Lax blob is if high or low
   const TBOX blob_box = blob->bounding_box();
   if (blob_box.bottom() > kBlnBaselineOffset * 4 ||
       blob_box.top() < kBlnBaselineOffset / 2)
     biggest /= 2;

   return biggest;
 }

 }  // namespace tesseract


 /**
  * Prints a debug summary of a word: its bounding box, recognised text, blob
  * counts, reject map, and its tess_accepted and done flags. The per-character
  * reject map detail is compiled in but switched off.
  */
 void fixspace_dbg(WERD_RES *word) {
   const BOOL8 show_map_detail = FALSE;  // flip to dump each map entry
   TBOX word_box = word->word->bounding_box();

   word_box.print();
   tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
   tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
           word->word->cblob_list()->length(),
           word->rebuild_word->NumBlobs(),
           word->box_word->length());
   word->reject_map.print(debug_fp);
   tprintf("\n");

   if (show_map_detail) {
     tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
     inT16 ch_index = 0;
     while (word->best_choice->unichar_string()[ch_index] != '\0') {
       tprintf("**** \"%c\" ****\n",
               word->best_choice->unichar_string()[ch_index]);
       word->reject_map[ch_index].full_print(debug_fp);
       ch_index++;
     }
   }

   const char *accepted_text = word->tess_accepted ? "TRUE" : "FALSE";
   const char *done_text = word->done ? "TRUE" : "FALSE";
   tprintf("Tess Accepted: %s\n", accepted_text);
   tprintf("Done flag: %s\n\n", done_text);
 }


 namespace tesseract {

 /**
  * Scores a list of words for the noise-blob splitting search.
  *
  * Words without a rebuild_word are ignored. A word contributes only if it
  * is done, tess_accepted, has a system/frequent/user dictionary permuter,
  * or safe_dict_word() returns a positive value. Within such a word each
  * character costs one point if it is a space or its blob scores below
  * small_limit, and otherwise earns one point if its reject map entry is
  * accepted.
  *
  * @param word_res_list Words to evaluate.
  * @return The accumulated score, never less than 0.
  */
 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
   const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
   inT16 score = 0;

   WERD_RES_IT it(&word_res_list);
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     WERD_RES *word = it.data();
     if (word->rebuild_word == NULL)
       continue;  // Can't handle cube words.

     const bool trusted =
         word->done ||
         word->tess_accepted ||
         word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
         word->best_choice->permuter() == FREQ_DAWG_PERM ||
         word->best_choice->permuter() == USER_DAWG_PERM ||
         safe_dict_word(word) > 0;
     if (!trusted)
       continue;

     const int num_blobs = word->rebuild_word->NumBlobs();
     const UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
     for (inT16 ch = 0; ch < word->best_choice->length() && ch < num_blobs;
          ++ch) {
       TBLOB* blob = word->rebuild_word->blobs[ch];
       const bool suspicious = word->best_choice->unichar_id(ch) == space ||
                               blob_noise_score(blob) < small_limit;
       if (suspicious) {
         score -= 1;  // penalise possibly erroneous non-space
       } else if (word->reject_map[ch].accepted()) {
         score++;
       }
     }
   }

   return score < 0 ? 0 : score;
 }


 }  // namespace tesseract

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:28

WERD_RES::tess_accepted
BOOL8 tess_accepted
Definition: pageres.h:280

TBLOB
Definition: blobs.h:261

globals.h

WERD::join_on
void join_on(WERD *other)
Definition: werd.cpp:211

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:250

tesseract::Tesseract::break_noisiest_blob_word
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:616

WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1140

PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62

UNICHARSET::unichar_to_id
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

tesseract::Tesseract::worst_noise_blob
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:681

WERD_RES
Definition: pageres.h:155

tesseract::Tesseract::numeric_punctuation
char * numeric_punctuation
Definition: tesseractclass.h:975

REJMAP::length
inT32 length() const
Definition: rejctmap.h:237

initialise_search
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:177

control.h

tesseract::TesseractStats::dict_words
inT32 dict_words
Definition: tesseractclass.h:134

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:271

USER_DAWG_PERM
Definition: ratngs.h:251

tessbox.h

TESSLINE
Definition: blobs.h:180

transform_to_next_perm
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:373

tesseract::Tesseract::fixsp_done_mode
int fixsp_done_mode
Definition: tesseractclass.h:972

ETEXT_DESC::ocr_alive
volatile inT8 ocr_alive
Definition: ocrclass.h:117

tprintf
#define tprintf(...)
Definition: tprintf.h:31

fixspace_dbg
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:796

ETEXT_DESC::cancel_this
void * cancel_this
Definition: ocrclass.h:120

TBOX::print
void print() const
Definition: rect.h:270

W_FUZZY_NON
Definition: werd.h:43

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:531

BOOL8
unsigned char BOOL8
Definition: host.h:113

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:160

mode
CMD_EVENTS mode
Definition: pgedit.cpp:116

ETEXT_DESC
Definition: ocrclass.h:112

tesseract::Tesseract::debug_fix_space_level
int debug_fix_space_level
Definition: tesseractclass.h:973

TBOX::right
inT16 right() const
Definition: rect.h:75

WERD_RES::x_height
float x_height
Definition: pageres.h:295

tesseract::Tesseract::safe_dict_word
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607

tesseract::Tesseract::eval_word_spacing
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:240

WERD_RES::deep_copy
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:630

tesseract::WordData
Definition: tesseractclass.h:144

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

ROW
Definition: ocrrow.h:32

tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

W_BOL
Definition: werd.h:35

WERD_RES::part_of_combo
BOOL8 part_of_combo
Definition: pageres.h:316

WERD_RES::combination
BOOL8 combination
Definition: pageres.h:315

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:425

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

tessvars.h

REJMAP::full_print
void full_print(FILE *fp)
Definition: rejctmap.cpp:406

tesseract::Tesseract::fix_fuzzy_space_list
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:145

ETEXT_DESC::cancel
CANCEL_FUNC cancel
Definition: ocrclass.h:119

W_EOL
Definition: werd.h:36

tesseract::Tesseract::check_debug_pt
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767

TBOX::left
inT16 left() const
Definition: rect.h:68

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:244

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:144

PAGE_RES
Definition: pageres.h:58

W_DONT_CHOP
Definition: werd.h:40

BLOCK
Definition: ocrblock.h:30

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

W_REP_CHAR
Definition: werd.h:41

tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1043

W_FUZZY_SP
Definition: werd.h:42

FREQ_DAWG_PERM
Definition: ratngs.h:252

NUMBER_PERM
Definition: ratngs.h:247

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

WERD_CHOICE::permuter
uinT8 permuter() const
Definition: ratngs.h:343

tesseract::Tesseract::fixsp_non_noise_limit
int fixsp_non_noise_limit
Definition: tesseractclass.h:969

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:29

statistc.h

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesseract::Tesseract::fix_noisy_space_list
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:570

WERD
Definition: werd.h:60

tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171

tesseract::Tesseract::fixsp_small_outlines_size
double fixsp_small_outlines_size
Definition: tesseractclass.h:970

reject.h

TBOX::bottom
inT16 bottom() const
Definition: rect.h:61

WERD_RES::done
BOOL8 done
Definition: pageres.h:282

tesseract::Tesseract::match_current_words
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196

WERD_RES::word
WERD * word
Definition: pageres.h:175

c_blob_comparator
int c_blob_comparator(const void *blob1p, const void *blob2p)
Definition: genblob.cpp:30

TBOX::height
inT16 height() const
Definition: rect.h:104

tesseract::Tesseract::blob_noise_score
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:761

tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268

genblob.h

TBOX::width
inT16 width() const
Definition: rect.h:111

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

tesseract::Tesseract::fixspace_thinks_word_done
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:504

tesseract::BoxWord::length
const int length() const
Definition: boxword.h:85

tesseract::Tesseract::fp_eval_word_spacing
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:831

FALSE
#define FALSE
Definition: capi.h:29

tesseract
Definition: baseapi.cpp:83

WERD_RES::tess_failed
BOOL8 tess_failed
Definition: pageres.h:272

ETEXT_DESC::progress
inT16 progress
Definition: ocrclass.h:115

REJMAP::reject_count
inT16 reject_count()
Definition: rejctmap.h:243

TBOX
Definition: rect.h:30

PERFECT_WERDS
#define PERFECT_WERDS
Definition: fixspace.cpp:33

TRUE
#define TRUE
Definition: capi.h:28

MAX_INT16
#define MAX_INT16
Definition: host.h:119

debug_fp
FILE * debug_fp
Definition: tessvars.cpp:24

tesseract::TesseractStats::dump_words_str
STRING dump_words_str
Definition: tesseractclass.h:135

WERD::flag
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128

STRING
Definition: strngs.h:44

NULL
#define NULL
Definition: host.h:144

tesseract::Tesseract::fix_sp_fp_word
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:536

TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:482

TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:377

tesseract::Tesseract::test_pt
bool test_pt
Definition: tesseractclass.h:886

WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:641

REJMAP::print
void print(FILE *fp)
Definition: rejctmap.cpp:394

STRING::string
const char * string() const
Definition: strngs.cpp:193

TBOX::top
inT16 top() const
Definition: rect.h:54

WERD::set_flag
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129

tesseract::Tesseract::tessedit_prefer_joined_punct
bool tessedit_prefer_joined_punct
Definition: tesseractclass.h:971

fixspace.h

tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48

tesseract::Tesseract::digit_or_numeric_punct
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:344

WERD::set_blanks
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107

WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95

WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:100

tesseract::Tesseract::dump_words
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:450

STRING::contains
BOOL8 contains(const char c) const
Definition: strngs.cpp:184

inT16
short inT16
Definition: host.h:100

inT32
int inT32
Definition: host.h:102

tesseractclass.h