tesseract v5.3.3.20231005
fontinfo.h
Go to the documentation of this file.
1
2// File: fontinfo.h
3// Description: Font information classes abstracted from intproto.h/cpp.
4// Author: rays@google.com (Ray Smith)
5//
6// (C) Copyright 2011, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
20#define TESSERACT_CCSTRUCT_FONTINFO_H_
21
#include "errcode.h"

#include <tesseract/unichar.h>
#include "genericvector.h"

#include <cstdint> // for uint16_t, uint32_t
#include <cstdio>  // for FILE
#include <cstring> // for strcmp
#include <vector>
30
31namespace tesseract {
32
// Forward declaration only: this header refers to UnicityTable<FontInfo>
// through a pointer (see FontInfoTable::MoveTo), so the full definition is
// not needed here.
template <typename T>
class UnicityTable;
35
// Simple struct to hold a font and a score. The scores come from the low-level
// integer matcher, so they are in the uint16_t range. Fonts are an index to
// fontinfo_table.
// These get copied around a lot, so best to keep them small.
struct ScoredFont {
  // Default constructor, so that containers of ScoredFont can be resized.
  // NOTE(review): the source listing drops the line that preceded the
  // two-argument constructor; the "no font" sentinel of -1 and the zero score
  // used here should be confirmed against the upstream header.
  ScoredFont() : fontinfo_id(-1), score(0) {}

  // Builds an entry for the font with index font_id that was given
  // classifier_score by the classifier.
  ScoredFont(int font_id, uint16_t classifier_score)
      : fontinfo_id(font_id), score(classifier_score) {}

  // Index into fontinfo table, but inside the classifier, may be a shapetable
  // index.
  int32_t fontinfo_id;
  // Raw score from the low-level classifier.
  uint16_t score;
};
51
52// Struct for information about spacing between characters in a particular font.
54 int16_t x_gap_before;
55 int16_t x_gap_after;
56 std::vector<UNICHAR_ID> kerned_unichar_ids;
57 std::vector<int16_t> kerned_x_gaps;
58};
59
60/*
61 * font_properties contains properties about boldness, italicness, fixed pitch,
62 * serif, fraktur
63 */
64struct FontInfo {
65 FontInfo() : name(nullptr), properties(0), universal_id(0), spacing_vec(nullptr) {}
66 ~FontInfo() = default;
67
68 bool operator==(const FontInfo &rhs) const {
69 return strcmp(name, rhs.name) == 0;
70 }
71
72 // Writes to the given file. Returns false in case of error.
73 bool Serialize(FILE *fp) const;
74 // Reads from the given file. Returns false in case of error.
75 // If swap is true, assumes a big/little-endian swap is needed.
76 bool DeSerialize(TFile *fp);
77
78 // Reserves unicharset_size spots in spacing_vec.
79 void init_spacing(int unicharset_size) {
80 spacing_vec = new std::vector<FontSpacingInfo *>(unicharset_size);
81 }
82 // Adds the given pointer to FontSpacingInfo to spacing_vec member
83 // (FontInfo class takes ownership of the pointer).
84 // Note: init_spacing should be called before calling this function.
85 void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
86 ASSERT_HOST(static_cast<size_t>(uch_id) < spacing_vec->size());
87 (*spacing_vec)[uch_id] = spacing_info;
88 }
89
90 // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.
91 const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
92 return (spacing_vec == nullptr || spacing_vec->size() <= static_cast<size_t>(uch_id)) ? nullptr
93 : (*spacing_vec)[uch_id];
94 }
95
96 // Fills spacing with the value of the x gap expected between the two given
97 // UNICHAR_IDs. Returns true on success.
98 bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const {
99 const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
100 const FontSpacingInfo *fsi = this->get_spacing(uch_id);
101 if (prev_fsi == nullptr || fsi == nullptr) {
102 return false;
103 }
104 size_t i = 0;
105 for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
106 if (prev_fsi->kerned_unichar_ids[i] == uch_id) {
107 break;
108 }
109 }
110 if (i < prev_fsi->kerned_unichar_ids.size()) {
111 *spacing = prev_fsi->kerned_x_gaps[i];
112 } else {
113 *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
114 }
115 return true;
116 }
117
118 bool is_italic() const {
119 return properties & 1;
120 }
121 bool is_bold() const {
122 return (properties & 2) != 0;
123 }
124 bool is_fixed_pitch() const {
125 return (properties & 4) != 0;
126 }
127 bool is_serif() const {
128 return (properties & 8) != 0;
129 }
130 bool is_fraktur() const {
131 return (properties & 16) != 0;
132 }
133
134 char *name;
135 uint32_t properties;
136 // The universal_id is a field reserved for the initialization process
137 // to assign a unique id number to all fonts loaded for the current
138 // combination of languages. This id will then be returned by
139 // ResultIterator::WordFontAttributes.
141 // Horizontal spacing between characters (indexed by UNICHAR_ID).
142 std::vector<FontSpacingInfo *> *spacing_vec;
143};
144
// Every class (character) owns a FontSet that represents all the fonts that can
// render this character.
// Since almost all the characters from the same script share the same set of
// fonts, the sets are shared over multiple classes (see
// Classify::fontset_table_). Thus, a class only stores an id to a set.
// Because some fonts cannot render just one character of a set, there are a
// lot of FontSets that differ only by one font. Rather than storing directly
// the FontInfo in the FontSet structure, it's better to share FontInfos among
// FontSets (Classify::fontinfo_table_).
using FontSet = std::vector<int>;
155
156// Class that adds a bit of functionality on top of GenericVector to
157// implement a table of FontInfo that replaces UniCityTable<FontInfo>.
158// TODO(rays) change all references once all existing traineddata files
159// are replaced.
160class FontInfoTable : public GenericVector<FontInfo> {
161public:
162 TESS_API // when you remove inheritance from GenericVector, move this on
163 // class level
167
168 // Writes to the given file. Returns false in case of error.
170 bool Serialize(FILE *fp) const;
171 // Reads from the given file. Returns false in case of error.
172 // If swap is true, assumes a big/little-endian swap is needed.
174 bool DeSerialize(TFile *fp);
175
176 // Returns true if the given set of fonts includes one with the same
177 // properties as font_id.
179 bool SetContainsFontProperties(int font_id, const std::vector<ScoredFont> &font_set) const;
180 // Returns true if the given set of fonts includes multiple properties.
182 bool SetContainsMultipleFontProperties(const std::vector<ScoredFont> &font_set) const;
183
184 // Moves any non-empty FontSpacingInfo entries from other to this.
187 // Moves this to the target unicity table.
189 void MoveTo(UnicityTable<FontInfo> *target);
190};
191
192// Deletion callbacks for GenericVector.
194
195// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
196bool read_info(TFile *f, FontInfo *fi);
197bool write_info(FILE *f, const FontInfo &fi);
198bool read_spacing_info(TFile *f, FontInfo *fi);
199bool write_spacing_info(FILE *f, const FontInfo &fi);
200bool write_set(FILE *f, const FontSet &fs);
201
202} // namespace tesseract.
203
204#endif /* THIRD_PARTY_TESSERACT_CCSTRUCT_FONTINFO_H_ */
#define ASSERT_HOST(x)
Definition: errcode.h:54
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:129
bool write_set(FILE *f, const FontSet &fs)
Definition: fontinfo.cpp:222
bool write_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:157
bool write_spacing_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:194
std::vector< int > FontSet
Definition: fontinfo.h:154
int UNICHAR_ID
Definition: unichar.h:34
bool read_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:143
bool read_spacing_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:163
ScoredFont(int font_id, uint16_t classifier_score)
Definition: fontinfo.h:42
std::vector< int16_t > kerned_x_gaps
Definition: fontinfo.h:57
std::vector< UNICHAR_ID > kerned_unichar_ids
Definition: fontinfo.h:56
const FontSpacingInfo * get_spacing(UNICHAR_ID uch_id) const
Definition: fontinfo.h:91
int32_t universal_id
Definition: fontinfo.h:140
bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const
Definition: fontinfo.h:98
bool is_italic() const
Definition: fontinfo.h:118
bool operator==(const FontInfo &rhs) const
Definition: fontinfo.h:68
bool DeSerialize(TFile *fp)
Definition: fontinfo.cpp:37
std::vector< FontSpacingInfo * > * spacing_vec
Definition: fontinfo.h:142
bool is_fixed_pitch() const
Definition: fontinfo.h:124
bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:26
bool is_bold() const
Definition: fontinfo.h:121
uint32_t properties
Definition: fontinfo.h:135
void init_spacing(int unicharset_size)
Definition: fontinfo.h:79
bool is_fraktur() const
Definition: fontinfo.h:130
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
Definition: fontinfo.h:85
bool is_serif() const
Definition: fontinfo.h:127
TESS_API bool DeSerialize(TFile *fp)
Definition: fontinfo.cpp:60
TESS_API FontInfoTable()
Definition: fontinfo.cpp:47
TESS_API void MoveSpacingInfoFrom(FontInfoTable *other)
Definition: fontinfo.cpp:95
TESS_API bool SetContainsFontProperties(int font_id, const std::vector< ScoredFont > &font_set) const
Definition: fontinfo.cpp:67
TESS_API bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:55
TESS_API void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:116
TESS_API bool SetContainsMultipleFontProperties(const std::vector< ScoredFont > &font_set) const
Definition: fontinfo.cpp:79
#define TESS_API
Definition: export.h:32