All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
output.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: output.cpp (Formerly output.c)
3  * Description: Output pass
4  * Author: Phil Cheatle
5  * Created: Thu Aug 4 10:56:08 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include <string.h>
25 #include <ctype.h>
26 #ifdef __UNIX__
27 #include <assert.h>
28 #include <unistd.h>
29 #include <errno.h>
30 #endif
31 #include "helpers.h"
32 #include "tessvars.h"
33 #include "control.h"
34 #include "reject.h"
35 #include "docqual.h"
36 #include "output.h"
37 #include "globals.h"
38 #include "tesseractclass.h"
39 
40 #define EPAPER_EXT ".ep"
41 #define PAGE_YSIZE 3508
42 #define CTRL_INSET '\024' //dc4=text inset
43 #define CTRL_FONT '\016' //so=font change
44 #define CTRL_DEFAULT '\017' //si=default font
45 #define CTRL_SHIFT '\022' //dc2=x shift
46 #define CTRL_TAB '\011' //tab
47 #define CTRL_NEWLINE '\012' //newline
48 #define CTRL_HARDLINE '\015' //cr
49 
50 /**********************************************************************
51  * pixels_to_pts
52  *
53  * Convert an integer number of pixels to the nearest integer
54  * number of points.
55  **********************************************************************/
56 
57 inT32 pixels_to_pts( //convert coords
58  inT32 pixels,
59  inT32 pix_res //resolution
60  ) {
61  float pts; //converted value
62 
63  pts = pixels * 72.0 / pix_res;
64  return (inT32) (pts + 0.5); //round it
65 }
66 
67 namespace tesseract {
// Output pass over a page.
//
// Restarts page_res_it at the start of the page and visits every word.
// When target_word_box is non-NULL, a word is skipped unless the centre of
// its bounding box lies inside that box. Each remaining word is passed to
// write_results() with the newline type from determine_newline_type() and
// a force_eol flag.
//
// page_res_it      iterator over the page results; advanced to the end
// target_word_box  optional filter box, or NULL to process every word
68 void Tesseract::output_pass( //Tess output pass //send to api
69  PAGE_RES_IT &page_res_it,
70  const TBOX *target_word_box) {
71  BLOCK_RES *block_of_last_word;
72  BOOL8 force_eol; //During output
73  BLOCK *nextblock; //block of next word
74  WERD *nextword; //next word
75 
76  page_res_it.restart_page ();
77  block_of_last_word = NULL;
78  while (page_res_it.word () != NULL) {
79  check_debug_pt (page_res_it.word (), 120);
80 
81  if (target_word_box)
82  {
83 
84  TBOX current_word_box=page_res_it.word ()->word->bounding_box();
85  FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
86  if (!target_word_box->contains(center_pt))
87  {
    // Word lies outside the requested box: move on without output.
88  page_res_it.forward ();
89  continue;
90  }
91 
92  }
    // NOTE(review): listing line 93 is absent here (numbering jumps from 92
    // to 94). The next line is the tail of an `if (` condition whose opening
    // is not visible; restore it from the real source before compiling.
94  block_of_last_word != page_res_it.block ()) {
95  block_of_last_word = page_res_it.block ();
96  }
97 
    // Force an end of line after the last word, and at a block change when
    // tessedit_write_block_separators is set.
98  force_eol = (tessedit_write_block_separators &&
99  (page_res_it.block () != page_res_it.next_block ())) ||
100  (page_res_it.next_word () == NULL);
101 
    // Look ahead one word/block; either may be missing at the end.
102  if (page_res_it.next_word () != NULL)
103  nextword = page_res_it.next_word ()->word;
104  else
105  nextword = NULL;
106  if (page_res_it.next_block () != NULL)
107  nextblock = page_res_it.next_block ()->block;
108  else
109  nextblock = NULL;
110  //regardless of tilde crunching
111  write_results(page_res_it,
112  determine_newline_type(page_res_it.word()->word,
113  page_res_it.block()->block,
114  nextword, nextblock), force_eol);
115  page_res_it.forward();
116  }
117 }
118 
119 
120 /*************************************************************************
121  * write_results()
122  *
123  * All recognition and rejection has now been done. Generate the following:
124  * .txt file - giving the final best choices with NO highlighting
125  * .raw file - giving the tesseract top choice output for each word
126  * .map file - showing how the .txt file has been rejected in the .ep file
127  * epchoice list - a list of one element per word, containing the text for the
128  * epaper. Reject strings are inserted.
129  * inset list - a list of bounding boxes of reject insets - indexed by the
130  * reject strings in the epchoice text.
131  *************************************************************************/
 // NOTE(review): listing line 132, which carries the function name and its
 // first parameter, is absent above this point. Further gaps inside the
 // body are flagged where they occur.
 //
 // newline_type  newline code for this word; any non-zero value counts as
 //               "at end of line" in the code below
 // force_eol     TRUE to force end-of-line bookkeeping for this word
133  char newline_type, // type of newline
134  BOOL8 force_eol) { // override tilde crunch?
135  WERD_RES *word = page_res_it.word();
136  const UNICHARSET &uchset = *word->uch_set;
137  int i;
138  BOOL8 need_reject = FALSE;
     // Id of the " " unichar; compared against best_choice entries below.
139  UNICHAR_ID space = uchset.unichar_to_id(" ");
140 
     // Crunched or empty word: only the stats_ flags are updated in the
     // visible code, then the function returns early.
141  if ((word->unlv_crunch_mode != CR_NONE ||
142  word->best_choice->length() == 0) &&
     // NOTE(review): listing line 143 (the rest of this condition and the
     // opening brace) is absent.
144  if ((word->unlv_crunch_mode != CR_DELETE) &&
145  (!stats_.tilde_crunch_written ||
146  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
147  (word->word->space () > 0) &&
148  !word->word->flag (W_FUZZY_NON) &&
149  !word->word->flag (W_FUZZY_SP)))) {
150  if (!word->word->flag (W_BOL) &&
151  (word->word->space () > 0) &&
152  !word->word->flag (W_FUZZY_NON) &&
153  !word->word->flag (W_FUZZY_SP)) {
154  stats_.last_char_was_tilde = false;
155  }
156  need_reject = TRUE;
157  }
158  if ((need_reject && !stats_.last_char_was_tilde) ||
159  (force_eol && stats_.write_results_empty_block)) {
160  /* Write a reject char - mark as rejected unless zero_rejection mode */
161  stats_.last_char_was_tilde = TRUE;
162  stats_.tilde_crunch_written = true;
163  stats_.last_char_was_newline = false;
164  stats_.write_results_empty_block = false;
165  }
166 
167  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
168  stats_.tilde_crunch_written = false;
169  stats_.last_char_was_newline = true;
170  stats_.last_char_was_tilde = false;
171  }
172 
173  if (force_eol)
174  stats_.write_results_empty_block = true;
175  return;
176  }
177 
178  /* NORMAL PROCESSING of non tilde crunched words */
179 
180  stats_.tilde_crunch_written = false;
181  if (newline_type)
182  stats_.last_char_was_newline = true;
183  else
184  stats_.last_char_was_newline = false;
185  stats_.write_results_empty_block = force_eol; // about to write a real word
186 
187  if (unlv_tilde_crunching &&
188  stats_.last_char_was_tilde &&
189  (word->word->space() == 0) &&
     // NOTE(review): listing line 190 (one more term of this condition) is
     // absent.
191  (word->best_choice->unichar_id(0) == space)) {
192  /* Prevent adjacent tilde across words - we know that adjacent tildes within
193  words have been removed */
194  word->MergeAdjacentBlobs(0);
195  }
196  if (newline_type ||
     // NOTE(review): listing line 197 (the rest of this condition) is absent.
198  stats_.last_char_was_tilde = false;
199  else {
200  if (word->reject_map.length () > 0) {
     // Remember whether the word's last character is the space unichar.
201  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
202  stats_.last_char_was_tilde = true;
203  else
204  stats_.last_char_was_tilde = false;
205  }
206  else if (word->word->space () > 0)
207  stats_.last_char_was_tilde = false;
208  /* else it is unchanged as there are no output chars */
209  }
210 
211  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
212 
213  set_unlv_suspects(word);
214  check_debug_pt (word, 120);
     // NOTE(review): listing line 215 (the `if` guarding this debug print)
     // is absent.
216  tprintf ("Dict word: \"%s\": %d\n",
217  word->best_choice->debug_string().string(),
218  dict_word(*(word->best_choice)));
219  }
220  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
     // NOTE(review): listing line 221 (the `if` opening this block) is
     // absent.
222  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
223  for (i = 0; i < word->best_choice->length(); ++i) {
224  if (word->reject_map[i].rejected())
225  word->reject_map[i].setrej_minimal_rej_accept();
226  }
227  }
     // NOTE(review): listing line 228 (the `if` opening this block) is
     // absent. Unlike the loop above, this one leaves rejected space
     // unichars rejected.
229  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
230  for (i = 0; i < word->best_choice->length(); ++i) {
231  if ((word->best_choice->unichar_id(i) != space) &&
232  word->reject_map[i].rejected())
233  word->reject_map[i].setrej_minimal_rej_accept();
234  }
235  }
236  }
237 }
238 } // namespace tesseract
239 
240 /**********************************************************************
241  * determine_newline_type
242  *
243  * Find whether we have a wrapping or hard newline.
244  * Return FALSE if not at end of line.
245  **********************************************************************/
246 
247 char determine_newline_type( //test line ends
248  WERD *word, //word to do
249  BLOCK *block, //current block
250  WERD *next_word, //next word
251  BLOCK *next_block //block of next word
252  ) {
253  inT16 end_gap; //to right edge
254  inT16 width; //of next word
255  TBOX word_box; //bounding
256  TBOX next_box; //next word
257  TBOX block_box; //block bounding
258 
259  if (!word->flag (W_EOL))
260  return FALSE; //not end of line
261  if (next_word == NULL || next_block == NULL || block != next_block)
262  return CTRL_NEWLINE;
263  if (next_word->space () > 0)
264  return CTRL_HARDLINE; //it is tabbed
265  word_box = word->bounding_box ();
266  next_box = next_word->bounding_box ();
267  block_box = block->bounding_box ();
268  //gap to eol
269  end_gap = block_box.right () - word_box.right ();
270  end_gap -= (inT32) block->space ();
271  width = next_box.right () - next_box.left ();
272  // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
273  // block_box.right(),word_box.right(),end_gap,
274  // next_box.right(),next_box.left(),width,
275  // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
276  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
277 }
278 
279 /*************************************************************************
280  * get_rep_char()
281  * Return the first accepted character from the repetition string. This is the
282  * character which is repeated - as determined earlier by fix_rep_char()
283  *************************************************************************/
284 namespace tesseract {
285 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
286  int i;
287  for (i = 0; ((i < word->reject_map.length()) &&
288  (word->reject_map[i].rejected())); ++i);
289 
290  if (i < word->reject_map.length()) {
291  return word->best_choice->unichar_id(i);
292  } else {
293  return word->uch_set->unichar_to_id(unrecognised_char.string());
294  }
295 }
296 
297 /*************************************************************************
298  * SUSPECT LEVELS
299  *
300  * 0 - dont reject ANYTHING
301  * 1,2 - partial rejection
302  * 3 - BEST
303  *
304  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
305  * tessedit_minimal_rejection.
306  *************************************************************************/
308  int len = word_res->reject_map.length();
309  const WERD_CHOICE &word = *(word_res->best_choice);
310  const UNICHARSET &uchset = *word.unicharset();
311  int i;
312  float rating_per_ch;
313 
314  if (suspect_level == 0) {
315  for (i = 0; i < len; i++) {
316  if (word_res->reject_map[i].rejected())
317  word_res->reject_map[i].setrej_minimal_rej_accept();
318  }
319  return;
320  }
321 
322  if (suspect_level >= 3)
323  return; //Use defaults
324 
325  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
326 
327  if (safe_dict_word(word_res) &&
328  (count_alphas(word) > suspect_short_words)) {
329  /* Unreject alphas in dictionary words */
330  for (i = 0; i < len; ++i) {
331  if (word_res->reject_map[i].rejected() &&
332  uchset.get_isalpha(word.unichar_id(i)))
333  word_res->reject_map[i].setrej_minimal_rej_accept();
334  }
335  }
336 
337  rating_per_ch = word.rating() / word_res->reject_map.length();
338 
339  if (rating_per_ch >= suspect_rating_per_ch)
340  return; //Dont touch bad ratings
341 
342  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
343  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
344  for (i = 0; i < len; ++i) {
345  if (word_res->reject_map[i].rejected() &&
346  (!uchset.eq(word.unichar_id(i), " ")))
347  word_res->reject_map[i].setrej_minimal_rej_accept();
348  }
349  }
350 
351  for (i = 0; i < len; i++) {
352  if (word_res->reject_map[i].rejected()) {
353  if (word_res->reject_map[i].flag(R_DOC_REJ))
354  word_res->reject_map[i].setrej_minimal_rej_accept();
355  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
356  word_res->reject_map[i].setrej_minimal_rej_accept();
357  if (word_res->reject_map[i].flag(R_ROW_REJ))
358  word_res->reject_map[i].setrej_minimal_rej_accept();
359  }
360  }
361 
362  if (suspect_level == 2)
363  return;
364 
365  if (!suspect_constrain_1Il ||
366  (word_res->reject_map.length() <= suspect_short_words)) {
367  for (i = 0; i < len; i++) {
368  if (word_res->reject_map[i].rejected()) {
369  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
370  word_res->reject_map[i].flag(R_POSTNN_1IL)))
371  word_res->reject_map[i].setrej_minimal_rej_accept();
372 
373  if (!suspect_constrain_1Il &&
374  word_res->reject_map[i].flag(R_MM_REJECT))
375  word_res->reject_map[i].setrej_minimal_rej_accept();
376  }
377  }
378  }
379 
380  if (acceptable_word_string(*word_res->uch_set,
381  word.unichar_string().string(),
382  word.unichar_lengths().string()) !=
383  AC_UNACCEPTABLE ||
385  word.unichar_lengths().string())) {
386  if (word_res->reject_map.length() > suspect_short_words) {
387  for (i = 0; i < len; i++) {
388  if (word_res->reject_map[i].rejected() &&
389  (!word_res->reject_map[i].perm_rejected() ||
390  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
391  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
392  word_res->reject_map[i].flag (R_MM_REJECT))) {
393  word_res->reject_map[i].setrej_minimal_rej_accept();
394  }
395  }
396  }
397  }
398 }
399 
401  int count = 0;
402  for (int i = 0; i < word.length(); ++i) {
403  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
404  count++;
405  }
406  return count;
407 }
408 
409 
411  int count = 0;
412  for (int i = 0; i < word.length(); ++i) {
413  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
414  word.unicharset()->get_isdigit(word.unichar_id(i)))
415  count++;
416  }
417  return count;
418 }
419 
420 
422  const char *lengths) {
423  BOOL8 prev_digit = FALSE;
424 
425  if (*lengths == 1 && *s == '(')
426  s++;
427 
428  if (*lengths == 1 &&
429  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
430  s++;
431 
432  for (; *s != '\0'; s += *(lengths++)) {
433  if (unicharset.get_isdigit(s, *lengths))
434  prev_digit = TRUE;
435  else if (prev_digit &&
436  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
437  prev_digit = FALSE;
438  else if (prev_digit && *lengths == 1 &&
439  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
440  return TRUE;
441  else if (prev_digit &&
442  *lengths == 1 && (*s == '%') &&
443  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
444  (*(s + *lengths + *(lengths + 1)) == '\0'))
445  return TRUE;
446  else
447  return FALSE;
448  }
449  return TRUE;
450 }
451 } // namespace tesseract
BOOL8 tess_accepted
Definition: pageres.h:280
#define CTRL_NEWLINE
Definition: output.cpp:47
inT16 count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:400
float rating() const
Definition: ratngs.h:324
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:247
inT32 length() const
Definition: rejctmap.h:237
inT32 pixels_to_pts(inT32 pixels, inT32 pix_res)
Definition: output.cpp:57
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
const STRING & unichar_lengths() const
Definition: ratngs.h:531
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
inT16 right() const
Definition: rect.h:75
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:124
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
BLOCK * block
Definition: pageres.h:99
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
Definition: output.cpp:132
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:410
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: werd.h:35
BLOCK_RES * block() const
Definition: pageres.h:739
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
WERD_RES * forward()
Definition: pageres.h:713
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
WERD_RES * restart_page()
Definition: pageres.h:680
BOOL8 acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:421
Definition: werd.h:36
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
inT16 left() const
Definition: rect.h:68
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
Definition: ocrblock.h:30
const UNICHARSET * uch_set
Definition: pageres.h:192
const STRING debug_string() const
Definition: ratngs.h:502
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:285
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
int UNICHAR_ID
Definition: unichar.h:33
BLOCK_RES * next_block() const
Definition: pageres.h:748
Definition: werd.h:60
inT16 bottom() const
Definition: rect.h:61
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:307
WERD * word
Definition: pageres.h:175
Unacceptable word.
Definition: control.h:36
#define CTRL_HARDLINE
Definition: output.cpp:48
#define FALSE
Definition: capi.h:29
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
Definition: rect.h:30
#define TRUE
Definition: capi.h:28
uinT8 space()
Definition: werd.h:104
WERD_RES * next_word() const
Definition: pageres.h:742
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:968
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool contains(const FCOORD pt) const
Definition: rect.h:323
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
#define NULL
Definition: host.h:144
inT16 space() const
return spacing
Definition: ocrblock.h:102
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
Definition: points.h:189
WERD_RES * word() const
Definition: pageres.h:733
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102