33#include "unicode/uchar.h"
34#include "unicode/uscript.h"
41 for (
size_t unichar_id = 0; unichar_id < unicharset->
size(); ++unichar_id) {
43 const char *unichar_str = unicharset->
id_to_unichar(unichar_id);
56 bool unichar_isalpha =
false;
57 bool unichar_islower =
false;
58 bool unichar_isupper =
false;
59 bool unichar_isdigit =
false;
60 bool unichar_ispunct =
false;
62 for (
char32 u_ch : uni_vector) {
63 if (u_isalpha(u_ch)) {
64 unichar_isalpha =
true;
66 if (u_islower(u_ch)) {
67 unichar_islower =
true;
69 if (u_isupper(u_ch)) {
70 unichar_isupper =
true;
72 if (u_isdigit(u_ch)) {
73 unichar_isdigit =
true;
75 if (u_ispunct(u_ch)) {
76 unichar_ispunct =
true;
80 unicharset->
set_isalpha(unichar_id, unichar_isalpha);
81 unicharset->
set_islower(unichar_id, unichar_islower);
82 unicharset->
set_isupper(unichar_id, unichar_isupper);
83 unicharset->
set_isdigit(unichar_id, unichar_isdigit);
87 unicharset->
set_script(unichar_id, uscript_getName(uscript_getScript(uni_vector[0], err)));
89 const int num_code_points = uni_vector.size();
92 if (unichar_islower || unichar_isupper) {
93 std::vector<char32> other_case(num_code_points, 0);
94 for (
int i = 0;
i < num_code_points; ++
i) {
99 other_case[
i] = unichar_islower ? u_toupper(uni_vector[
i]) : u_tolower(uni_vector[
i]);
103 if (other_case_id != INVALID_UNICHAR_ID) {
106 tprintf(
"Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str);
111 std::vector<char32> mirrors(num_code_points, 0);
112 for (
int i = 0;
i < num_code_points; ++
i) {
113 mirrors[
i] = u_charMirror(uni_vector[
i]);
121 if (mirror_uch_id != INVALID_UNICHAR_ID) {
122 unicharset->
set_mirror(unichar_id, mirror_uch_id);
123 }
else if (report_errors) {
124 tprintf(
"Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str);
128 std::string normed_str;
129 if (unichar_id != 0 &&
134 !normed_str.empty()) {
135 unicharset->
set_normed(unichar_id, normed_str.c_str());
137 unicharset->
set_normed(unichar_id, unichar_str);
148 std::string filename =
154 tprintf(
"Failed to load script unicharset from:%s\n", filename.c_str());
159 tprintf(
"Warning: properties incomplete for index %d = %s\n", c,
167 std::string xheights_str;
171 std::string script_heights;
173 xheights_str += script_heights;
185 const std::string &input_unicharset_file,
186 const std::string &output_unicharset_file,
187 const std::string &output_xheights_file) {
192 tprintf(
"Loaded unicharset of size %zu from file %s\n", unicharset.
size(),
193 input_unicharset_file.c_str());
196 tprintf(
"Setting unichar properties\n");
198 tprintf(
"Setting script properties\n");
200 if (!output_xheights_file.empty()) {
206 tprintf(
"Writing unicharset to file %s\n", output_unicharset_file.c_str());
207 unicharset.
save_to_file(output_unicharset_file.c_str());
std::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset)
void tprintf(const char *format,...)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void SetPropertiesForInputFile(const std::string &script_dir, const std::string &input_unicharset_file, const std::string &output_unicharset_file, const std::string &output_xheights_file)
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
@ SPECIAL_UNICHAR_CODES_COUNT
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
void set_script(UNICHAR_ID unichar_id, const char *value)
const char * get_script_from_script_id(int id) const
int get_script_table_size() const
void set_isupper(UNICHAR_ID unichar_id, bool value)
void set_normed(UNICHAR_ID unichar_id, const char *normed)
bool load_from_file(const char *const filename, bool skip_fragments)
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
const char * id_to_unichar(UNICHAR_ID id) const
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
bool save_to_file(const char *const filename) const
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
static const char * kCustomLigatures[][2]
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void set_islower(UNICHAR_ID unichar_id, bool value)
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
void SetPropertiesFromOther(const UNICHARSET &src)
void set_isdigit(UNICHAR_ID unichar_id, bool value)
static bool ReadFileToString(const std::string &filename, std::string *out)
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)