tesseract v5.3.3.20231005
shapetable.cpp
Go to the documentation of this file.
1// Copyright 2010 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
4// File: shapetable.cpp
5// Description: Class to map a classifier shape index to unicharset
6// indices and font indices.
7// Author: Ray Smith
8//
9// (C) Copyright 2010, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#include "shapetable.h"
23
24#include "bitvector.h"
25#include "fontinfo.h"
26#include "intfeaturespace.h"
27#include "unicharset.h"
28#include "unicity_table.h"
29
30#include <algorithm>
31
32namespace tesseract {
33
34// Helper function to get the index of the first result with the required
35// unichar_id. If the results are sorted by rating, this will also be the
36// best result with the required unichar_id.
37// Returns -1 if the unichar_id is not found
38int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,
39 const ShapeTable &shape_table, UNICHAR_ID unichar_id) {
40 for (unsigned r = 0; r < results.size(); ++r) {
41 const auto shape_id = results[r].shape_id;
42 const Shape &shape = shape_table.GetShape(shape_id);
43 if (shape.ContainsUnichar(unichar_id)) {
44 return r;
45 }
46 }
47 return -1;
48}
49
50// Helper function to get the index of the first result with the required
51// unichar_id. If the results are sorted by rating, this will also be the
52// best result with the required unichar_id.
53// Returns -1 if the unichar_id is not found
54int UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results,
55 UNICHAR_ID unichar_id) {
56 for (unsigned r = 0; r < results.size(); ++r) {
57 if (results[r].unichar_id == unichar_id) {
58 return r;
59 }
60 }
61 return -1;
62}
63
64// Writes to the given file. Returns false in case of error.
65bool UnicharAndFonts::Serialize(FILE *fp) const {
67}
68
69// Reads from the given file. Returns false in case of error.
71 return fp->DeSerialize(&unichar_id) && fp->DeSerialize(font_ids);
72}
73
74// Sort function to sort a pair of UnicharAndFonts by unichar_id.
75int UnicharAndFonts::SortByUnicharId(const void *v1, const void *v2) {
76 const auto *p1 = static_cast<const UnicharAndFonts *>(v1);
77 const auto *p2 = static_cast<const UnicharAndFonts *>(v2);
78 return p1->unichar_id - p2->unichar_id;
79}
80
82 return v1.unichar_id < v2.unichar_id;
83}
84
85// Writes to the given file. Returns false in case of error.
86bool Shape::Serialize(FILE *fp) const {
87 uint8_t sorted = unichars_sorted_;
88 return tesseract::Serialize(fp, &sorted) && tesseract::Serialize(fp, unichars_);
89}
90// Reads from the given file. Returns false in case of error.
91
93 uint8_t sorted;
94 if (!fp->DeSerialize(&sorted)) {
95 return false;
96 }
97 unichars_sorted_ = sorted != 0;
98 return fp->DeSerialize(unichars_);
99}
100
101// Adds a font_id for the given unichar_id. If the unichar_id is not
102// in the shape, it is added.
103void Shape::AddToShape(int unichar_id, int font_id) {
104 for (auto &unichar : unichars_) {
105 if (unichar.unichar_id == unichar_id) {
106 // Found the unichar in the shape table.
107 std::vector<int> &font_list = unichar.font_ids;
108 for (int f : font_list) {
109 if (f == font_id) {
110 return; // Font is already there.
111 }
112 }
113 font_list.push_back(font_id);
114 return;
115 }
116 }
117 // Unichar_id is not in shape, so add it to shape.
118 unichars_.emplace_back(unichar_id, font_id);
119 unichars_sorted_ = unichars_.size() <= 1;
120}
121
122// Adds everything in other to this.
123void Shape::AddShape(const Shape &other) {
124 for (const auto &unichar : other.unichars_) {
125 for (unsigned f = 0; f < unichar.font_ids.size(); ++f) {
126 AddToShape(unichar.unichar_id, unichar.font_ids[f]);
127 }
128 }
129 unichars_sorted_ = unichars_.size() <= 1;
130}
131
132// Returns true if the shape contains the given unichar_id, font_id pair.
133bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
134 for (const auto &unichar : unichars_) {
135 if (unichar.unichar_id == unichar_id) {
136 // Found the unichar, so look for the font.
137 auto &font_list = unichar.font_ids;
138 for (int f : font_list) {
139 if (f == font_id) {
140 return true;
141 }
142 }
143 return false;
144 }
145 }
146 return false;
147}
148
149// Returns true if the shape contains the given unichar_id, ignoring font.
150bool Shape::ContainsUnichar(int unichar_id) const {
151 for (const auto &unichar : unichars_) {
152 if (unichar.unichar_id == unichar_id) {
153 return true;
154 }
155 }
156 return false;
157}
158
159// Returns true if the shape contains the given font, ignoring unichar_id.
160bool Shape::ContainsFont(int font_id) const {
161 for (const auto &unichar : unichars_) {
162 auto &font_list = unichar.font_ids;
163 for (int f : font_list) {
164 if (f == font_id) {
165 return true;
166 }
167 }
168 }
169 return false;
170}
171// Returns true if the shape contains the given font properties, ignoring
172// unichar_id.
173bool Shape::ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const {
174 for (const auto &unichar : unichars_) {
175 auto &font_list = unichar.font_ids;
176 for (int f : font_list) {
177 if (font_table.at(f).properties == properties) {
178 return true;
179 }
180 }
181 }
182 return false;
183}
184// Returns true if the shape contains multiple different font properties,
185// ignoring unichar_id.
187 uint32_t properties = font_table.at(unichars_[0].font_ids[0]).properties;
188 for (const auto &unichar : unichars_) {
189 auto &font_list = unichar.font_ids;
190 for (int f : font_list) {
191 if (font_table.at(f).properties != properties) {
192 return true;
193 }
194 }
195 }
196 return false;
197}
198
199// Returns true if this shape is equal to other (ignoring order of unichars
200// and fonts).
201bool Shape::operator==(const Shape &other) const {
202 return IsSubsetOf(other) && other.IsSubsetOf(*this);
203}
204
205// Returns true if this is a subset (including equal) of other.
206bool Shape::IsSubsetOf(const Shape &other) const {
207 for (const auto &unichar : unichars_) {
208 int unichar_id = unichar.unichar_id;
209 const std::vector<int> &font_list = unichar.font_ids;
210 for (int f : font_list) {
211 if (!other.ContainsUnicharAndFont(unichar_id, f)) {
212 return false;
213 }
214 }
215 }
216 return true;
217}
218
219// Returns true if the lists of unichar ids are the same in this and other,
220// ignoring fonts.
221// NOT const, as it will sort the unichars on demand.
223 if (unichars_.size() != other->unichars_.size()) {
224 return false;
225 }
226 if (!unichars_sorted_) {
227 SortUnichars();
228 }
229 if (!other->unichars_sorted_) {
230 other->SortUnichars();
231 }
232 for (unsigned c = 0; c < unichars_.size(); ++c) {
233 if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) {
234 return false;
235 }
236 }
237 return true;
238}
239
240// Sorts the unichars_ vector by unichar.
241void Shape::SortUnichars() {
242 std::sort(unichars_.begin(), unichars_.end(), UnicharAndFonts::StdSortByUnicharId);
243 unichars_sorted_ = true;
244}
245
246ShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {}
247ShapeTable::ShapeTable(const UNICHARSET &unicharset) : unicharset_(&unicharset), num_fonts_(0) {}
248
249// Writes to the given file. Returns false in case of error.
250bool ShapeTable::Serialize(FILE *fp) const {
251 return tesseract::Serialize(fp, shape_table_);
252}
253// Reads from the given file. Returns false in case of error.
254
256 if (!fp->DeSerialize(shape_table_)) {
257 return false;
258 }
259 num_fonts_ = 0;
260 return true;
261}
262
263// Returns the number of fonts used in this ShapeTable, computing it if
264// necessary.
266 if (num_fonts_ <= 0) {
267 for (auto shape_id : shape_table_) {
268 const Shape &shape = *shape_id;
269 for (int c = 0; c < shape.size(); ++c) {
270 for (int font_id : shape[c].font_ids) {
271 if (font_id >= num_fonts_) {
272 num_fonts_ = font_id + 1;
273 }
274 }
275 }
276 }
277 }
278 return num_fonts_;
279}
280
281// Re-indexes the class_ids in the shapetable according to the given map.
282// Useful in conjunction with set_unicharset.
283void ShapeTable::ReMapClassIds(const std::vector<int> &unicharset_map) {
284 for (auto shape : shape_table_) {
285 for (int c = 0; c < shape->size(); ++c) {
286 shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
287 }
288 }
289}
290
291// Returns a string listing the classes/fonts in a shape.
292std::string ShapeTable::DebugStr(unsigned shape_id) const {
293 if (shape_id >= shape_table_.size()) {
294 return "INVALID_UNICHAR_ID";
295 }
296 const Shape &shape = GetShape(shape_id);
297 std::string result;
298 result += "Shape" + std::to_string(shape_id);
299 if (shape.size() > 100) {
300 result += " Num unichars=" + std::to_string(shape.size());
301 return result;
302 }
303 for (int c = 0; c < shape.size(); ++c) {
304 result += " c_id=" + std::to_string(shape[c].unichar_id);
305 result += "=";
306 result += unicharset_->id_to_unichar(shape[c].unichar_id);
307 if (shape.size() < 10) {
308 result += ", " + std::to_string(shape[c].font_ids.size());
309 result += " fonts =";
310 int num_fonts = shape[c].font_ids.size();
311 if (num_fonts > 10) {
312 result += " " + std::to_string(shape[c].font_ids[0]);
313 result += " ... " + std::to_string(shape[c].font_ids[num_fonts - 1]);
314 } else {
315 for (int f = 0; f < num_fonts; ++f) {
316 result += " " + std::to_string(shape[c].font_ids[f]);
317 }
318 }
319 }
320 }
321 return result;
322}
323
324// Returns a debug string summarizing the table.
325std::string ShapeTable::SummaryStr() const {
326 int max_unichars = 0;
327 int num_multi_shapes = 0;
328 int num_master_shapes = 0;
329 for (unsigned s = 0; s < shape_table_.size(); ++s) {
330 if (MasterDestinationIndex(s) != s) {
331 continue;
332 }
333 ++num_master_shapes;
334 int shape_size = GetShape(s).size();
335 if (shape_size > 1) {
336 ++num_multi_shapes;
337 }
338 if (shape_size > max_unichars) {
339 max_unichars = shape_size;
340 }
341 }
342 std::string result;
343 result += "Number of shapes = " + std::to_string(num_master_shapes);
344 result += " max unichars = " + std::to_string(max_unichars);
345 result += " number with multiple unichars = " + std::to_string(num_multi_shapes);
346 return result;
347}
348
349// Adds a new shape starting with the given unichar_id and font_id.
350// Returns the assigned index.
351unsigned ShapeTable::AddShape(int unichar_id, int font_id) {
352 auto index = shape_table_.size();
353 auto *shape = new Shape;
354 shape->AddToShape(unichar_id, font_id);
355 shape_table_.push_back(shape);
356 num_fonts_ = std::max(num_fonts_, font_id + 1);
357 return index;
358}
359
360// Adds a copy of the given shape unless it is already present.
361// Returns the assigned index or index of existing shape if already present.
362unsigned ShapeTable::AddShape(const Shape &other) {
363 unsigned index;
364 for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) {
365 continue;
366 }
367 if (index == shape_table_.size()) {
368 auto *shape = new Shape(other);
369 shape_table_.push_back(shape);
370 }
371 num_fonts_ = 0;
372 return index;
373}
374
375// Removes the shape given by the shape index.
376void ShapeTable::DeleteShape(unsigned shape_id) {
377 delete shape_table_[shape_id];
378 shape_table_.erase(shape_table_.begin() + shape_id);
379}
380
381// Adds a font_id to the given existing shape index for the given
382// unichar_id. If the unichar_id is not in the shape, it is added.
383void ShapeTable::AddToShape(unsigned shape_id, int unichar_id, int font_id) {
384 Shape &shape = *shape_table_[shape_id];
385 shape.AddToShape(unichar_id, font_id);
386 num_fonts_ = std::max(num_fonts_, font_id + 1);
387}
388
389// Adds the given shape to the existing shape with the given index.
390void ShapeTable::AddShapeToShape(unsigned shape_id, const Shape &other) {
391 Shape &shape = *shape_table_[shape_id];
392 shape.AddShape(other);
393 num_fonts_ = 0;
394}
395
396// Returns the id of the shape that contains the given unichar and font.
397// If not found, returns -1.
398// If font_id < 0, the font_id is ignored and the first shape that matches
399// the unichar_id is returned.
400int ShapeTable::FindShape(int unichar_id, int font_id) const {
401 for (unsigned s = 0; s < shape_table_.size(); ++s) {
402 const Shape &shape = GetShape(s);
403 for (int c = 0; c < shape.size(); ++c) {
404 if (shape[c].unichar_id == unichar_id) {
405 if (font_id < 0) {
406 return s; // We don't care about the font.
407 }
408 for (int f : shape[c].font_ids) {
409 if (f == font_id) {
410 return s;
411 }
412 }
413 }
414 }
415 }
416 return -1;
417}
418
419// Returns the first unichar_id and font_id in the given shape.
420void ShapeTable::GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const {
421 const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0];
422 *unichar_id = unichar_and_fonts.unichar_id;
423 *font_id = unichar_and_fonts.font_ids[0];
424}
425
426// Expands all the classes/fonts in the shape individually to build
427// a ShapeTable.
428int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) {
429 BitVector shape_map(master_shapes.NumShapes());
430 for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
431 for (unsigned f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
432 int c = shape[u_ind].unichar_id;
433 int f = shape[u_ind].font_ids[f_ind];
434 int master_id = master_shapes.FindShape(c, f);
435 if (master_id >= 0) {
436 shape_map.SetBit(master_id);
437 } else if (FindShape(c, f) < 0) {
438 AddShape(c, f);
439 }
440 }
441 }
442 int num_masters = 0;
443 for (unsigned s = 0; s < master_shapes.NumShapes(); ++s) {
444 if (shape_map[s]) {
445 AddShape(master_shapes.GetShape(s));
446 ++num_masters;
447 }
448 }
449 return num_masters;
450}
451
452// Returns true if the shapes are already merged.
453bool ShapeTable::AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const {
454 return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
455}
456
457// Returns true if any shape contains multiple unichars.
459 auto num_shapes = NumShapes();
460 for (unsigned s1 = 0; s1 < num_shapes; ++s1) {
461 if (MasterDestinationIndex(s1) != s1) {
462 continue;
463 }
464 if (GetShape(s1).size() > 1) {
465 return true;
466 }
467 }
468 return false;
469}
470
471// Returns the maximum number of unichars over all shapes.
473 int max_num_unichars = 0;
474 int num_shapes = NumShapes();
475 for (int s = 0; s < num_shapes; ++s) {
476 if (GetShape(s).size() > max_num_unichars) {
477 max_num_unichars = GetShape(s).size();
478 }
479 }
480 return max_num_unichars;
481}
482
483// Merges shapes with a common unichar over the [start, end) interval.
484// Assumes single unichar per shape.
485void ShapeTable::ForceFontMerges(unsigned start, unsigned end) {
486 for (unsigned s1 = start; s1 < end; ++s1) {
487 if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
488 int unichar_id = GetShape(s1)[0].unichar_id;
489 for (auto s2 = s1 + 1; s2 < end; ++s2) {
490 if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
491 unichar_id == GetShape(s2)[0].unichar_id) {
492 MergeShapes(s1, s2);
493 }
494 }
495 }
496 }
497 ShapeTable compacted(*unicharset_);
498 compacted.AppendMasterShapes(*this, nullptr);
499 *this = compacted;
500}
501
502// Returns the number of unichars in the master shape.
503unsigned ShapeTable::MasterUnicharCount(unsigned shape_id) const {
504 int master_id = MasterDestinationIndex(shape_id);
505 return GetShape(master_id).size();
506}
507
508// Returns the sum of the font counts in the master shape.
509int ShapeTable::MasterFontCount(unsigned shape_id) const {
510 int master_id = MasterDestinationIndex(shape_id);
511 const Shape &shape = GetShape(master_id);
512 int font_count = 0;
513 for (int c = 0; c < shape.size(); ++c) {
514 font_count += shape[c].font_ids.size();
515 }
516 return font_count;
517}
518
519// Returns the number of unichars that would result from merging the shapes.
520int ShapeTable::MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const {
521 // Do it the easy way for now.
522 int master_id1 = MasterDestinationIndex(shape_id1);
523 int master_id2 = MasterDestinationIndex(shape_id2);
524 Shape combined_shape(*shape_table_[master_id1]);
525 combined_shape.AddShape(*shape_table_[master_id2]);
526 return combined_shape.size();
527}
528
529// Merges two shape_ids, leaving shape_id2 marked as merged.
530void ShapeTable::MergeShapes(unsigned shape_id1, unsigned shape_id2) {
531 auto master_id1 = MasterDestinationIndex(shape_id1);
532 auto master_id2 = MasterDestinationIndex(shape_id2);
533 // Point master_id2 (and all merged shapes) to master_id1.
534 shape_table_[master_id2]->set_destination_index(master_id1);
535 // Add all the shapes of master_id2 to master_id1.
536 shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
537}
538
539// Swaps two shape_ids.
540void ShapeTable::SwapShapes(unsigned shape_id1, unsigned shape_id2) {
541 Shape *tmp = shape_table_[shape_id1];
542 shape_table_[shape_id1] = shape_table_[shape_id2];
543 shape_table_[shape_id2] = tmp;
544}
545
546// Returns the destination of this shape, (if merged), taking into account
547// the fact that the destination may itself have been merged.
548unsigned ShapeTable::MasterDestinationIndex(unsigned shape_id) const {
549 auto dest_id = shape_table_[shape_id]->destination_index();
550 if (static_cast<unsigned>(dest_id) == shape_id || dest_id < 0) {
551 return shape_id; // Is master already.
552 }
553 auto master_id = shape_table_[dest_id]->destination_index();
554 if (master_id == dest_id || master_id < 0) {
555 return dest_id; // Dest is the master and shape_id points to it.
556 }
557 master_id = MasterDestinationIndex(master_id);
558 return master_id;
559}
560
561// Returns false if the unichars in neither shape is a subset of the other.
562bool ShapeTable::SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const {
563 const Shape &shape1 = GetShape(shape_id1);
564 const Shape &shape2 = GetShape(shape_id2);
565 int c1, c2;
566 for (c1 = 0; c1 < shape1.size(); ++c1) {
567 int unichar_id1 = shape1[c1].unichar_id;
568 if (!shape2.ContainsUnichar(unichar_id1)) {
569 break;
570 }
571 }
572 for (c2 = 0; c2 < shape2.size(); ++c2) {
573 int unichar_id2 = shape2[c2].unichar_id;
574 if (!shape1.ContainsUnichar(unichar_id2)) {
575 break;
576 }
577 }
578 return c1 == shape1.size() || c2 == shape2.size();
579}
580
581// Returns false if the unichars in neither shape is a subset of the other.
582bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const {
583 const Shape &merge1 = GetShape(merge_id1);
584 const Shape &merge2 = GetShape(merge_id2);
585 const Shape &shape = GetShape(shape_id);
586 int cm1, cm2, cs;
587 for (cs = 0; cs < shape.size(); ++cs) {
588 int unichar_id = shape[cs].unichar_id;
589 if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {
590 break; // Shape is not a subset of the merge.
591 }
592 }
593 for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
594 int unichar_id1 = merge1[cm1].unichar_id;
595 if (!shape.ContainsUnichar(unichar_id1)) {
596 break; // Merge is not a subset of shape
597 }
598 }
599 for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
600 int unichar_id2 = merge2[cm2].unichar_id;
601 if (!shape.ContainsUnichar(unichar_id2)) {
602 break; // Merge is not a subset of shape
603 }
604 }
605 return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
606}
607
608// Returns true if the unichar sets are equal between the shapes.
609bool ShapeTable::EqualUnichars(unsigned shape_id1, unsigned shape_id2) const {
610 const Shape &shape1 = GetShape(shape_id1);
611 const Shape &shape2 = GetShape(shape_id2);
612 for (int c1 = 0; c1 < shape1.size(); ++c1) {
613 int unichar_id1 = shape1[c1].unichar_id;
614 if (!shape2.ContainsUnichar(unichar_id1)) {
615 return false;
616 }
617 }
618 for (int c2 = 0; c2 < shape2.size(); ++c2) {
619 int unichar_id2 = shape2[c2].unichar_id;
620 if (!shape1.ContainsUnichar(unichar_id2)) {
621 return false;
622 }
623 }
624 return true;
625}
626
627// Returns true if the unichar sets are equal between the shapes.
628bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const {
629 const Shape &merge1 = GetShape(merge_id1);
630 const Shape &merge2 = GetShape(merge_id2);
631 const Shape &shape = GetShape(shape_id);
632 for (int cs = 0; cs < shape.size(); ++cs) {
633 int unichar_id = shape[cs].unichar_id;
634 if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) {
635 return false; // Shape has a unichar that appears in neither merge.
636 }
637 }
638 for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
639 int unichar_id1 = merge1[cm1].unichar_id;
640 if (!shape.ContainsUnichar(unichar_id1)) {
641 return false; // Merge has a unichar that is not in shape.
642 }
643 }
644 for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
645 int unichar_id2 = merge2[cm2].unichar_id;
646 if (!shape.ContainsUnichar(unichar_id2)) {
647 return false; // Merge has a unichar that is not in shape.
648 }
649 }
650 return true;
651}
652
653// Returns true if there is a common unichar between the shapes.
654bool ShapeTable::CommonUnichars(unsigned shape_id1, unsigned shape_id2) const {
655 const Shape &shape1 = GetShape(shape_id1);
656 const Shape &shape2 = GetShape(shape_id2);
657 for (int c1 = 0; c1 < shape1.size(); ++c1) {
658 int unichar_id1 = shape1[c1].unichar_id;
659 if (shape2.ContainsUnichar(unichar_id1)) {
660 return true;
661 }
662 }
663 return false;
664}
665
666// Returns true if there is a common font id between the shapes.
667bool ShapeTable::CommonFont(unsigned shape_id1, unsigned shape_id2) const {
668 const Shape &shape1 = GetShape(shape_id1);
669 const Shape &shape2 = GetShape(shape_id2);
670 for (int c1 = 0; c1 < shape1.size(); ++c1) {
671 const std::vector<int> &font_list1 = shape1[c1].font_ids;
672 for (int f : font_list1) {
673 if (shape2.ContainsFont(f)) {
674 return true;
675 }
676 }
677 }
678 return false;
679}
680
681// Appends the master shapes from other to this.
682// If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
683void ShapeTable::AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map) {
684 if (shape_map != nullptr) {
685 shape_map->clear();
686 shape_map->resize(other.NumShapes(), -1);
687 }
688 for (unsigned s = 0; s < other.shape_table_.size(); ++s) {
689 if (other.shape_table_[s]->destination_index() < 0) {
690 int index = AddShape(*other.shape_table_[s]);
691 if (shape_map != nullptr) {
692 (*shape_map)[s] = index;
693 }
694 }
695 }
696}
697
698// Returns the number of master shapes remaining after merging.
700 int num_shapes = 0;
701 for (auto s : shape_table_) {
702 if (s->destination_index() < 0) {
703 ++num_shapes;
704 }
705 }
706 return num_shapes;
707}
708
709// Adds the unichars of the given shape_id to the vector of results. Any
710// unichar_id that is already present just has the fonts added to the
711// font set for that result without adding a new entry in the vector.
712// NOTE: it is assumed that the results are given to this function in order
713// of decreasing rating.
714// The unichar_map vector indicates the index of the results entry containing
715// each unichar, or -1 if the unichar is not yet included in results.
716void ShapeTable::AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,
717 std::vector<UnicharRating> *results) const {
718 if (shape_rating.joined) {
719 AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, results);
720 }
721 if (shape_rating.broken) {
722 AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, results);
723 }
724 const Shape &shape = GetShape(shape_rating.shape_id);
725 for (int u = 0; u < shape.size(); ++u) {
726 int result_index =
727 AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results);
728 for (int font_id : shape[u].font_ids) {
729 (*results)[result_index].fonts.emplace_back(font_id,
730 IntCastRounded(shape_rating.rating * INT16_MAX));
731 }
732 }
733}
734
735// Adds the given unichar_id to the results if needed, updating unichar_map
736// and returning the index of unichar in results.
737int ShapeTable::AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,
738 std::vector<UnicharRating> *results) const {
739 int result_index = unichar_map->at(unichar_id);
740 if (result_index < 0) {
741 UnicharRating result(unichar_id, rating);
742 result_index = results->size();
743 results->push_back(result);
744 (*unichar_map)[unichar_id] = result_index;
745 }
746 return result_index;
747}
748
749} // namespace tesseract
int IntCastRounded(double x)
Definition: helpers.h:170
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236
int UNICHAR_ID
Definition: unichar.h:34
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ UNICHAR_JOINED
Definition: unicharset.h:37
uint32_t properties
Definition: fontinfo.h:135
void SetBit(int index)
Definition: bitvector.h:78
T & at(int index) const
Definition: genericvector.h:89
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
static int FirstResultWithUnichar(const std::vector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:54
static int FirstResultWithUnichar(const std::vector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:38
static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2)
Definition: shapetable.cpp:81
std::vector< int32_t > font_ids
Definition: shapetable.h:144
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:75
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:70
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:65
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:206
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:186
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:133
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:160
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:103
int size() const
Definition: shapetable.h:169
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:86
void AddShape(const Shape &other)
Definition: shapetable.cpp:123
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:222
bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const
Definition: shapetable.cpp:173
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:92
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:150
bool operator==(const Shape &other) const
Definition: shapetable.cpp:201
void SwapShapes(unsigned shape_id1, unsigned shape_id2)
Definition: shapetable.cpp:540
void AddToShape(unsigned shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:383
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:458
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:255
int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:520
bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const
Definition: shapetable.cpp:628
void ReMapClassIds(const std::vector< int > &unicharset_map)
Definition: shapetable.cpp:283
bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const
Definition: shapetable.cpp:582
std::string DebugStr(unsigned shape_id) const
Definition: shapetable.cpp:292
unsigned AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:351
int NumMasterShapes() const
Definition: shapetable.cpp:699
std::string SummaryStr() const
Definition: shapetable.cpp:325
unsigned MasterDestinationIndex(unsigned shape_id) const
Definition: shapetable.cpp:548
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
int MasterFontCount(unsigned shape_id) const
Definition: shapetable.cpp:509
unsigned NumShapes() const
Definition: shapetable.h:248
void DeleteShape(unsigned shape_id)
Definition: shapetable.cpp:376
bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:453
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292
void AddShapeToShape(unsigned shape_id, const Shape &other)
Definition: shapetable.cpp:390
void AddShapeToResults(const ShapeRating &shape_rating, std::vector< int > *unichar_map, std::vector< UnicharRating > *results) const
Definition: shapetable.cpp:716
int MaxNumUnichars() const
Definition: shapetable.cpp:472
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes)
Definition: shapetable.cpp:428
bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:609
void MergeShapes(unsigned shape_id1, unsigned shape_id2)
Definition: shapetable.cpp:530
unsigned MasterUnicharCount(unsigned shape_id) const
Definition: shapetable.cpp:503
void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:420
bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:562
bool CommonFont(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:667
void ForceFontMerges(unsigned start, unsigned end)
Definition: shapetable.cpp:485
bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:654
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:400
void AppendMasterShapes(const ShapeTable &other, std::vector< int > *shape_map)
Definition: shapetable.cpp:683