tesseract-ocr.github.io/5.3.3/a00869_source.html

// File:        unicharset_extractor.cpp

// Description: Unicode character/ligature set extractor.

// Author:      Thomas Kielbus

//

// (C) Copyright 2006, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//


// Given a list of box files or text files on the command line, this program

// normalizes the text according to command-line options and generates

// a unicharset.


#include <cstdlib>

#include <filesystem>

#include "boxread.h"

#include "commandlineflags.h"

#include "commontraining.h" // CheckSharedLibraryVersion

#include "lang_model_helpers.h"

#include "normstrngs.h"

#include "unicharset.h"

#include "unicharset_training_utils.h"


using namespace tesseract;


// Command-line flag --output_unicharset: the path that Main() passes to
// UNICHARSET::save_to_file. The value given here is "unicharset".
static STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");

// Command-line flag --norm_mode: integer that AddStringsToUnicharset casts to
// GraphemeNormMode to choose how text is segmented. The value given here is 1.
static INT_PARAM_FLAG(norm_mode, 1,

                      "Normalization mode: 1=Combine graphemes, "

                      "2=Split graphemes, 3=Pure unicode");


namespace tesseract {


// Normalizes each entry of |strings| (NFC, no OCR normalization) and segments
// it according to |norm_mode|, which is interpreted as a GraphemeNormMode
// value. Every resulting segment that is neither empty nor UTF-8 whitespace is
// inserted into |unicharset|. A string that fails normalization is reported
// via tprintf and skipped; the remaining strings are still processed.
static void AddStringsToUnicharset(const std::vector<std::string> &strings, int norm_mode,
                                   UNICHARSET *unicharset) {
  // The segmentation mode is the same for every string, so convert it once.
  const auto grapheme_mode = static_cast<GraphemeNormMode>(norm_mode);
  for (const std::string &text : strings) {
    std::vector<std::string> segments;
    const bool normalized_ok =
        NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, grapheme_mode,
                                     /*report_errors*/ true, text.c_str(), &segments);
    if (!normalized_ok) {
      tprintf("Normalization failed for string '%s'\n", text.c_str());
      continue;
    }
    // Each segment is a UTF-8 encoded string.
    for (const std::string &segment : segments) {
      const bool insertable = !segment.empty() && !IsUTF8Whitespace(segment.c_str());
      if (insertable) {
        unicharset->unichar_insert(segment.c_str());
      }
    }
  }
}


// Builds one unicharset from all input files named in argv[1..argc-1] and
// saves it to the path held by FLAGS_output_unicharset.
//
// Files whose name ends in the extension ".box" are parsed with ReadMemBoxes;
// every other file is treated as plain text and split on '\n'. A file for
// which ReadFile returns empty data is skipped. A box file that cannot be
// parsed stops processing immediately.
//
// Returns EXIT_SUCCESS when the unicharset was written, EXIT_FAILURE when a
// box file could not be read or the unicharset could not be saved.
static int Main(int argc, char **argv) {
  UNICHARSET unicharset;
  // Collect the text of every input file into the unicharset.
  for (int index = 1; index < argc; ++index) {
    const char *input_name = argv[index];
    std::string contents = tesseract::ReadFile(input_name);
    if (contents.empty()) {
      continue;
    }
    const std::filesystem::path input_path = input_name;
    const bool is_box_file = input_path.extension() == ".box";
    std::vector<std::string> texts;
    if (!is_box_file) {
      tprintf("Extracting unicharset from plain text file %s\n", input_name);
      texts = split(contents, '\n');
    } else {
      tprintf("Extracting unicharset from box file %s\n", input_name);
      // Only the text of each box is requested; the other outputs are null.
      const bool boxes_read =
          ReadMemBoxes(-1, /*skip_blanks*/ true, contents.c_str(),
                       /*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
                       /*box_texts*/ nullptr, /*pages*/ nullptr);
      if (!boxes_read) {
        tprintf("Cannot read box data from '%s'\n", input_name);
        return EXIT_FAILURE;
      }
    }
    AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
  }
  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false, &unicharset);
  // Write the unicharset file and report the outcome.
  const char *output_path = FLAGS_output_unicharset.c_str();
  if (!unicharset.save_to_file(output_path)) {
    tprintf("Cannot save unicharset file %s\n", output_path);
    return EXIT_FAILURE;
  }
  tprintf("Wrote unicharset file %s\n", output_path);
  return EXIT_SUCCESS;
}


} // namespace tesseract


// Prints the command-line usage text, with |program_name| substituted into
// the first line.
static void PrintUsage(const char *program_name) {
  tprintf(
      "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
      " box_or_text_file [...]\n",
      program_name);
  tprintf("Where mode means:\n");
  tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
  tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
  tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
  tprintf("Reads box or plain text files to extract the unicharset.\n");
}

// Program entry point. Checks the shared library version, parses the
// command-line flags when any arguments were supplied, and then hands the
// remaining arguments to tesseract::Main. If no argument is left after flag
// parsing, the usage text is printed and EXIT_FAILURE is returned.
int main(int argc, char **argv) {
  tesseract::CheckSharedLibraryVersion();
  const bool has_arguments = argc > 1;
  if (has_arguments) {
    // argc/argv are passed by address, so they may be updated by the parser.
    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
  }
  // Evaluated after flag parsing: at least one input file must remain.
  const bool has_input_files = argc >= 2;
  if (has_input_files) {
    return tesseract::Main(argc, argv);
  }
  PrintUsage(argv[0]);
  return EXIT_FAILURE;
}

unicharset.h

boxread.h

commandlineflags.h

INT_PARAM_FLAG
#define INT_PARAM_FLAG(name, val, comment)
Definition: commandlineflags.h:26

STRING_PARAM_FLAG
#define STRING_PARAM_FLAG(name, val, comment)
Definition: commandlineflags.h:32

commontraining.h

main
int main(int argc, char **argv)
Definition: unicharset_extractor.cpp:103

lang_model_helpers.h

unicharset_training_utils.h

normstrngs.h

tesseract
Definition: baseapi.h:39

tesseract::GraphemeNormMode
GraphemeNormMode
Definition: validator.h:36

tesseract::ReadMemBoxes
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:97

tesseract::ParseCommandLineFlags
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
Definition: commandlineflags.cpp:168

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::UnicodeNormMode::kNFC
@ kNFC

tesseract::OCRNorm::kNone
@ kNone

tesseract::SetupBasicProperties
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
Definition: unicharset_training_utils.cpp:40

tesseract::NormalizeCleanAndSegmentUTF8
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:179

tesseract::ReadFile
std::string ReadFile(const std::string &filename, FileReader reader)
Definition: lang_model_helpers.cpp:63

tesseract::split
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43

tesseract::IsUTF8Whitespace
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654

tesseract::UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361