37 "Normalization mode: 1=Combine graphemes, "
38 "2=Split graphemes, 3=Pure unicode");
44static void AddStringsToUnicharset(
const std::vector<std::string> &strings,
int norm_mode,
46 for (
const auto &
string : strings) {
47 std::vector<std::string> normalized;
50 true,
string.c_str(), &normalized)) {
51 for (
const std::string &normed : normalized) {
59 tprintf(
"Normalization failed for string '%s'\n",
string.c_str());
64static int Main(
int argc,
char **argv) {
67 for (
int arg = 1; arg < argc; ++arg) {
68 std::filesystem::path filePath = argv[arg];
70 if (file_data.empty()) {
73 std::vector<std::string> texts;
74 if (filePath.extension() ==
".box") {
75 tprintf(
"Extracting unicharset from box file %s\n", argv[arg]);
77 false,
nullptr, &texts,
80 tprintf(
"Cannot read box data from '%s'\n", argv[arg]);
84 tprintf(
"Extracting unicharset from plain text file %s\n", argv[arg]);
86 texts =
split(file_data,
'\n');
88 AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
92 if (unicharset.
save_to_file(FLAGS_output_unicharset.c_str())) {
93 tprintf(
"Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
95 tprintf(
"Cannot save unicharset file %s\n", FLAGS_output_unicharset.c_str());
103int main(
int argc,
char **argv) {
104 tesseract::CheckSharedLibraryVersion();
110 "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
111 " box_or_text_file [...]\n",
113 tprintf(
"Where mode means:\n");
114 tprintf(
" 1=combine graphemes (use for Latin and other simple scripts)\n");
115 tprintf(
" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
116 tprintf(
" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
117 tprintf(
"Reads box or plain text files to extract the unicharset.\n");
120 return tesseract::Main(argc, argv);
#define INT_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
int main(int argc, char **argv)
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void tprintf(const char *format,...)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
std::string ReadFile(const std::string &filename, FileReader reader)
const std::vector< std::string > split(const std::string &s, char c)
bool IsUTF8Whitespace(const char *text)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool save_to_file(const char *const filename) const