tesseract v5.3.3.20231005
ligature_table.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: ligature_table.cpp
3 * Description: Class for adding and removing optional latin ligatures,
4 * conditional on codepoint support by a specified font
5 * (if specified).
6 * Author: Ranjith Unnikrishnan
7 *
8 * (C) Copyright 2013, Google Inc.
9 * Licensed under the Apache License, Version 2.0 (the "License");
10 * you may not use this file except in compliance with the License.
11 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 **********************************************************************/
20
21#include "ligature_table.h"
22
23#include <tesseract/unichar.h>
24#include "pango_font_info.h"
25#include "tlog.h"
26#include "unicharset.h"
27#include "unicode/errorcode.h" // from libicu
28#include "unicode/normlzr.h" // from libicu
29#include "unicode/unistr.h" // from libicu
30#include "unicode/utypes.h" // from libicu
31
32#include <utility>
33
34namespace tesseract {
35
36static std::string EncodeAsUTF8(const char32 ch32) {
37 UNICHAR uni_ch(ch32);
38 return std::string(uni_ch.utf8(), uni_ch.utf8_len());
39}
40
41// Range of optional latin ligature characters in Unicode to build ligatures
42// from. Note that this range does not contain the custom ligatures that we
43// encode in the private use area.
// Inclusive bounds of the code point range that the table-building loop
// iterates over (lig = kMinLigature; lig <= kMaxLigature).
44const int kMinLigature = 0xfb00;
45const int kMaxLigature = 0xfb17; // Don't put the wide Hebrew letters in.
46
47/* static */
// Out-of-class definition of the static member holding the shared
// LigatureTable. It is default-constructed here, so it starts out null.
48std::unique_ptr<LigatureTable> LigatureTable::instance_;
49
50/* static */
52 if (instance_ == nullptr) {
53 instance_.reset(new LigatureTable());
54 instance_->Init();
55 }
56 return instance_.get();
57}
58
60 : min_lig_length_(0), max_lig_length_(0), min_norm_length_(0), max_norm_length_(0) {}
61
63 if (norm_to_lig_table_.empty()) {
64 for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
65 // For each char in the range, convert to utf8, nfc normalize, and if
66 // the strings are different put the both mappings in the hash_maps.
67 std::string lig8 = EncodeAsUTF8(lig);
68 icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
69 icu::UnicodeString normed8_result;
70 icu::ErrorCode status;
71 icu::Normalizer::normalize(unicode_lig8, UNORM_NFC, 0, normed8_result, status);
72 std::string normed8;
73 normed8_result.toUTF8String(normed8);
74 int lig_length = lig8.length();
75 int norm_length = normed8.size();
76 if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
77 norm_to_lig_table_[normed8] = lig8;
78 lig_to_norm_table_[lig8] = normed8;
79 if (min_lig_length_ == 0 || lig_length < min_lig_length_) {
80 min_lig_length_ = lig_length;
81 }
82 if (lig_length > max_lig_length_) {
83 max_lig_length_ = lig_length;
84 }
85 if (min_norm_length_ == 0 || norm_length < min_norm_length_) {
86 min_norm_length_ = norm_length;
87 }
88 if (norm_length > max_norm_length_) {
89 max_norm_length_ = norm_length;
90 }
91 }
92 }
93 // Add custom extra ligatures.
94 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
96 int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
97 if (min_norm_length_ == 0 || norm_length < min_norm_length_) {
98 min_norm_length_ = norm_length;
99 }
100 if (norm_length > max_norm_length_) {
101 max_norm_length_ = norm_length;
102 }
103
105 }
106 }
107}
108
109std::string LigatureTable::RemoveLigatures(const std::string &str) const {
110 std::string result;
111 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
112 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
113 char tmp[5];
114 int len;
115 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
116 len = it.get_utf8(tmp);
117 tmp[len] = '\0';
118 auto lig_it = lig_to_norm_table_.find(tmp);
119 if (lig_it != lig_to_norm_table_.end()) {
120 result += lig_it->second;
121 } else {
122 result += tmp;
123 }
124 }
125 return result;
126}
127
128std::string LigatureTable::RemoveCustomLigatures(const std::string &str) const {
129 std::string result;
130 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
131 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
132 char tmp[5];
133 int len;
134 int norm_ind;
135 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
136 len = it.get_utf8(tmp);
137 tmp[len] = '\0';
138 norm_ind = -1;
139 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) {
140 if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {
141 norm_ind = i;
142 }
143 }
144 if (norm_ind >= 0) {
145 result += UNICHARSET::kCustomLigatures[norm_ind][0];
146 } else {
147 result += tmp;
148 }
149 }
150 return result;
151}
152
153std::string LigatureTable::AddLigatures(const std::string &str, const PangoFontInfo *font) const {
154 std::string result;
155 int len = str.size();
156 int step = 0;
157 int i = 0;
158 for (i = 0; i < len - min_norm_length_ + 1; i += step) {
159 step = 0;
160 for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
161 if (i + liglen <= len) {
162 std::string lig_cand = str.substr(i, liglen);
163 auto it = norm_to_lig_table_.find(lig_cand);
164 if (it != norm_to_lig_table_.end()) {
165 tlog(3, "Considering %s -> %s\n", lig_cand.c_str(), it->second.c_str());
166 if (font) {
167 // Test for renderability.
168 if (!font->CanRenderString(it->second.data(), it->second.length())) {
169 continue; // Not renderable
170 }
171 }
172 // Found a match so convert it.
173 step = liglen;
174 result += it->second;
175 tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(), it->second.c_str());
176 break;
177 }
178 }
179 }
180 if (step == 0) {
181 result += str[i];
182 step = 1;
183 }
184 }
185 result += str.substr(i, len - i);
186 return result;
187}
188
189} // namespace tesseract
#define tlog(level,...)
Definition: tlog.h:36
signed int char32
signed int char32
Definition: unichar.h:49
const int kMinLigature
const int kMaxLigature
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213
static const char * kCustomLigatures[][2]
Definition: unicharset.h:169
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
static std::unique_ptr< LigatureTable > instance_
std::string RemoveLigatures(const std::string &str) const
static LigatureTable * Get()
std::string RemoveCustomLigatures(const std::string &str) const
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const