All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
gap_map.cpp
Go to the documentation of this file.
1 #include "statistc.h"
2 #include "gap_map.h"
3 
4 #define EXTERN
// Tunable parameters for gap-map construction.
// gapmap_debug: when set, the constructor prints "Table found" for any block
// in which at least one table-gap bucket was detected.
5 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables");
// NOTE(review): the opening lines of the next two declarations are absent
// from this extract; only their description strings remain. They presumably
// declare gapmap_use_ends and gapmap_no_isolated_quanta - confirm the macro
// and default values against the real file.
7 "Use large space at start and end of rows");
9 "Ensure gaps not less than 2quanta wide");
// gapmap_big_gaps: multiplied by a row's xheight to give the width a gap must
// exceed before it is counted as a wide gap.
10 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier");
11 
12 /*************************************************************************
13  * A block gap map is a quantised histogram of whitespace regions in the
14  * block. It is a vertical projection of wide gaps WITHIN lines
15  *
16  * The map is held as an array of counts of rows which have a wide gap
17  * covering that region of the row. Each bucket in the map represents a width
18  * of about half an xheight - (The median of the xhts in the rows is used.)
19  *
20  * The block is considered RECTANGULAR - delimited by the left and right
21  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
22  * counted.
23  *
24  *************************************************************************/
25 
26 GAPMAP::GAPMAP( //Constructor
27  TO_BLOCK *block //block
28  ) {
29  TO_ROW_IT row_it; //row iterator
30  TO_ROW *row; //current row
31  BLOBNBOX_IT blob_it; //iterator
32  TBOX blob_box;
33  TBOX prev_blob_box;
34  inT16 gap_width;
35  inT16 start_of_row;
36  inT16 end_of_row;
37  STATS xht_stats (0, 128);
38  inT16 min_quantum;
39  inT16 max_quantum;
40  inT16 i;
41 
42  row_it.set_to_list (block->get_rows ());
43  /*
44  Find left and right extremes and bucket size
45  */
46  map = NULL;
47  min_left = MAX_INT16;
48  max_right = -MAX_INT16;
49  total_rows = 0;
50  any_tabs = FALSE;
51  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
52  row = row_it.data ();
53  if (!row->blob_list ()->empty ()) {
54  total_rows++;
55  xht_stats.add ((inT16) floor (row->xheight + 0.5), 1);
56  blob_it.set_to_list (row->blob_list ());
57  start_of_row = blob_it.data ()->bounding_box ().left ();
58  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
59  if (min_left > start_of_row)
60  min_left = start_of_row;
61  if (max_right < end_of_row)
62  max_right = end_of_row;
63  }
64  }
65  if ((total_rows < 3) || (min_left >= max_right)) {
66  total_rows = 0;
67  min_left = max_right = 0;
68  return;
69  }
70  bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2;
71  map_max = (max_right - min_left) / bucket_size;
72  map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16));
73  for (i = 0; i <= map_max; i++)
74  map[i] = 0;
75 
76  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
77  row = row_it.data ();
78  if (!row->blob_list ()->empty ()) {
79  blob_it.set_to_list (row->blob_list ());
80  blob_it.mark_cycle_pt ();
81  blob_box = box_next (&blob_it);
82  prev_blob_box = blob_box;
83  if (gapmap_use_ends) {
84  /* Leading space */
85  gap_width = blob_box.left () - min_left;
86  if ((gap_width > gapmap_big_gaps * row->xheight)
87  && gap_width > 2) {
88  max_quantum = (blob_box.left () - min_left) / bucket_size;
89  if (max_quantum > map_max) max_quantum = map_max;
90  for (i = 0; i <= max_quantum; i++)
91  map[i]++;
92  }
93  }
94  while (!blob_it.cycled_list ()) {
95  blob_box = box_next (&blob_it);
96  gap_width = blob_box.left () - prev_blob_box.right ();
97  if ((gap_width > gapmap_big_gaps * row->xheight)
98  && gap_width > 2) {
99  min_quantum =
100  (prev_blob_box.right () - min_left) / bucket_size;
101  max_quantum = (blob_box.left () - min_left) / bucket_size;
102  if (max_quantum > map_max) max_quantum = map_max;
103  for (i = min_quantum; i <= max_quantum; i++)
104  map[i]++;
105  }
106  prev_blob_box = blob_box;
107  }
108  if (gapmap_use_ends) {
109  /* Trailing space */
110  gap_width = max_right - prev_blob_box.right ();
111  if ((gap_width > gapmap_big_gaps * row->xheight)
112  && gap_width > 2) {
113  min_quantum =
114  (prev_blob_box.right () - min_left) / bucket_size;
115  if (min_quantum < 0) min_quantum = 0;
116  for (i = min_quantum; i <= map_max; i++)
117  map[i]++;
118  }
119  }
120  }
121  }
122  for (i = 0; i <= map_max; i++) {
123  if (map[i] > total_rows / 2) {
125  (((i == 0) &&
126  (map[i + 1] <= total_rows / 2)) ||
127  ((i == map_max) &&
128  (map[i - 1] <= total_rows / 2)) ||
129  ((i > 0) &&
130  (i < map_max) &&
131  (map[i - 1] <= total_rows / 2) &&
132  (map[i + 1] <= total_rows / 2)))) {
133  map[i] = 0; //prevent isolated quantum
134  }
135  else
136  any_tabs = TRUE;
137  }
138  }
139  if (gapmap_debug && any_tabs)
140  tprintf ("Table found\n");
141 }
142 
143 
144 /*************************************************************************
145  * GAPMAP::table_gap()
146  * Is there a bucket in the specified range where more than half the rows in the
147  * block have a wide gap?
148  *************************************************************************/
149 
150 BOOL8 GAPMAP::table_gap( //Is gap a table?
151  inT16 left, //From here
152  inT16 right //To here
153  ) {
154  inT16 min_quantum;
155  inT16 max_quantum;
156  inT16 i;
157  BOOL8 tab_found = FALSE;
158 
159  if (!any_tabs)
160  return FALSE;
161 
162  min_quantum = (left - min_left) / bucket_size;
163  max_quantum = (right - min_left) / bucket_size;
164  // Clip to the bounds of the array. In some circumstances (big blob followed
165  // by small blob) max_quantum can exceed the map_max bounds, but we clip
166  // here instead, as it provides better long-term safety.
167  if (min_quantum < 0) min_quantum = 0;
168  if (max_quantum > map_max) max_quantum = map_max;
169  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
170  if (map[i] > total_rows / 2)
171  tab_found = TRUE;
172  return tab_found;
173 }
#define double_VAR(name, val, comment)
Definition: params.h:286
#define tprintf(...)
Definition: tprintf.h:31
Definition: statistc.h:33
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
#define BOOL_VAR(name, val, comment)
Definition: params.h:280
unsigned char BOOL8
Definition: host.h:113
EXTERN bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:9
BOOL8 table_gap(inT16 left, inT16 right)
Definition: gap_map.cpp:150
inT16 right() const
Definition: rect.h:75
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
EXTERN double gapmap_big_gaps
Definition: gap_map.cpp:10
double median() const
Definition: statistc.cpp:243
inT16 left() const
Definition: rect.h:68
#define EXTERN
Definition: gap_map.cpp:4
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:629
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
#define FALSE
Definition: capi.h:29
GAPMAP(TO_BLOCK *block)
Definition: gap_map.cpp:26
Definition: rect.h:30
#define TRUE
Definition: capi.h:28
#define MAX_INT16
Definition: host.h:119
EXTERN bool gapmap_use_ends
Definition: gap_map.cpp:7
void * alloc_mem(inT32 count)
Definition: memry.cpp:47
#define NULL
Definition: host.h:144
float xheight
Definition: blobbox.h:653
EXTERN bool gapmap_debug
Definition: gap_map.cpp:5
short inT16
Definition: host.h:100