All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
unicharset_extractor.cpp File Reference
#include <stdio.h>
#include <locale.h>
#include "boxread.h"
#include "rect.h"
#include "strngs.h"
#include "tessopt.h"
#include "unichar.h"
#include "unicharset.h"

Go to the source code of this file.

Functions

UNICHAR_ID wc_to_unichar_id (const UNICHARSET &unicharset, int wc)
 
void set_properties (UNICHARSET *unicharset, const char *const c_string)
 
int main (int argc, char **argv)
 

Function Documentation

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

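The result of this program is a binary inttemp file used by the OCR engine.

Note: the description above appears to have been carried over from a different training tool. According to the listing below, this program reads the box files named on the command line, inserts each box's character string into a UNICHARSET, and writes the resulting unicharset file into the output directory (default ".", overridable with -D).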

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
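0 on success; -1 if a box file cannot be opened or the unicharset file cannot be saved. The process exits with status 1 after printing the usage message when no arguments are given.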
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 96 of file unicharset_extractor.cpp.

// Body of main(): builds a UNICHARSET from the box files named on the command
// line and saves it as <output_directory>/<kUnicharsetFileName>.
// Returns 0 on success and -1 when a box file cannot be opened or the
// unicharset cannot be saved; calls exit(1) after printing usage when no
// arguments are given.
96  {
97  int option;
98  const char* output_directory = ".";
99  STRING unicharset_file_name;
100  // Special characters are now included by default.
101  UNICHARSET unicharset;
102 
// Adopt the locale from the environment; the isw*/tow* classification calls
// used by set_properties() are locale-dependent.
103  setlocale(LC_ALL, "");
104 
105  // Print usage
106  if (argc <= 1) {
107  printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
108  exit(1);
109 
110  }
111 
112  // Parse arguments
// Only 'D' is handled; any other value returned by tessopt() falls through
// the switch with no action.
113  while ((option = tessopt(argc, argv, "D" )) != EOF) {
114  switch (option) {
115  case 'D':
116  output_directory = tessoptarg;
// NOTE(review): presumably steps over the DIRECTORY argument because tessopt()
// does not advance past it itself -- confirm against tessopt.cpp.
117  ++tessoptind;
118  break;
119  }
120  }
121 
122  // Save file name
123  unicharset_file_name = output_directory;
124  unicharset_file_name += "/";
125  unicharset_file_name += kUnicharsetFileName;
126 
127  // Load box files
// Every remaining argument, from tessoptind onward, is treated as a box file path.
128  for (; tessoptind < argc; ++tessoptind) {
129  printf("Extracting unicharset from %s\n", argv[tessoptind]);
130 
131  FILE* box_file = fopen(argv[tessoptind], "rb");
132  if (box_file == NULL) {
133  printf("Cannot open box file %s\n", argv[tessoptind]);
134  return -1;
135  }
136 
137  TBOX box;
138  STRING unichar_string;
// line_number restarts at 0 for each box file.
139  int line_number = 0;
// For each box read, insert its string into the unicharset and then set that
// entry's properties. The bounding box itself is not used here.
140  while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
141  unicharset.unichar_insert(unichar_string.string());
142  set_properties(&unicharset, unichar_string.string());
143  }
// NOTE(review): no fclose(box_file) appears in this listing, so each opened
// box file stays open until the process exits.
144  }
145 
146  // Write unicharset file
147  if (unicharset.save_to_file(unicharset_file_name.string())) {
148  printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
149  }
150  else {
151  printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
152  return -1;
153  }
154  return 0;
155 }
int tessopt(inT32 argc, char *argv[], const char *arglist)
Definition: tessopt.cpp:33
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:118
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
Definition: rect.h:30
char * tessoptarg
Definition: tessopt.cpp:25
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void set_properties(UNICHARSET *unicharset, const char *const c_string)
const char * string() const
Definition: strngs.cpp:193
int tessoptind
Definition: tessopt.cpp:24
void set_properties ( UNICHARSET * unicharset,
const char *const  c_string 
)

Definition at line 55 of file unicharset_extractor.cpp.

// Body of set_properties(): when USING_WCTYPE is defined, looks up the unichar
// id of c_string in *unicharset and sets its alpha/lower/upper/digit/
// punctuation flags and its other_case entry from the wide-character
// classification of the string's first code point. When USING_WCTYPE is not
// defined the body is empty and the function does nothing.
55  {
56 #ifdef USING_WCTYPE
57  UNICHAR_ID id;
58  int wc;
59 
60  // Convert the string to a unichar id.
// NOTE(review): id is used below without any validity check; the callers in
// this file insert c_string into the unicharset just before calling.
61  id = unicharset->unichar_to_id(c_string);
62 
63  // Set the other_case property to be this unichar id by default.
// This default is applied before the UTF-8 check below, so it remains in
// effect even when the function returns early.
64  unicharset->set_other_case(id, id);
65 
66  int step = UNICHAR::utf8_step(c_string);
67  if (step == 0)
68  return; // Invalid utf-8.
69 
70  // Get the next Unicode code point in the string.
// Only the first `step` bytes are examined, so a multi-character string is
// classified by its first code point alone.
71  UNICHAR ch(c_string, step);
72  wc = ch.first_uni();
73 
74  /* Copy the properties. */
75  if (iswalpha(wc)) {
76  unicharset->set_isalpha(id, 1);
77  if (iswlower(wc)) {
78  unicharset->set_islower(id, 1);
// NOTE(review): the id returned by wc_to_unichar_id() is stored unchecked; what
// it is when the upper-case form is absent from the unicharset is not visible
// here.
79  unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
80  towupper(wc)));
81  }
82  if (iswupper(wc)) {
83  unicharset->set_isupper(id, 1);
// Same unchecked lookup as in the lower-case branch, here for the lower-case form.
84  unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
85  towlower(wc)));
86  }
87  }
// The digit and punctuation tests are independent of the alpha test above.
88  if (iswdigit(wc))
89  unicharset->set_isdigit(id, 1);
90  if(iswpunct(wc))
91  unicharset->set_ispunctuation(id, 1);
92 
93 #endif
94 }
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:399
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:394
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:409
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:404
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc)
int UNICHAR_ID
Definition: unichar.h:33
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:425
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:389
UNICHAR_ID wc_to_unichar_id ( const UNICHARSET & unicharset,
int  wc 
)

Definition at line 43 of file unicharset_extractor.cpp.

// Body of wc_to_unichar_id(): returns the id that unicharset reports for the
// string form of the code point wc.
43  {
// Wrap the code point in a UNICHAR and obtain its string representation.
44  UNICHAR uch(wc);
45  char *unichar = uch.utf8_str();
46  UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
// The buffer from utf8_str() is released here with delete[], which is why the
// id is stored in a local before returning.
47  delete[] unichar;
48  return unichar_id;
49 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int UNICHAR_ID
Definition: unichar.h:33