All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (const GenericVector< int > *allowed_scripts, OSResults *osr, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 94 of file osdetect.h.

Constructor & Destructor Documentation

ScriptDetector::ScriptDetector ( const GenericVector< int > *  allowed_scripts,
OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 448 of file osdetect.cpp.

449  {
450  osr_ = osr;
451  tess_ = tess;
452  allowed_scripts_ = allowed_scripts;
453  katakana_id_ = tess_->unicharset.add_script(katakana_script);
454  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
455  han_id_ = tess_->unicharset.add_script(han_script);
456  hangul_id_ = tess_->unicharset.add_script(hangul_script);
457  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
458  korean_id_ = tess_->unicharset.add_script(korean_script_);
459  latin_id_ = tess_->unicharset.add_script(latin_script);
460  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
461 }
UNICHARSET unicharset
Definition: ccutil.h:72
int add_script(const char *script)

Member Function Documentation

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 466 of file osdetect.cpp.

466  {
467  bool done[kMaxNumberOfScripts];
468  for (int i = 0; i < 4; ++i) {
469  for (int j = 0; j < kMaxNumberOfScripts; ++j)
470  done[j] = false;
471 
472  BLOB_CHOICE_IT choice_it;
473  choice_it.set_to_list(scores + i);
474 
475  float prev_score = -1;
476  int script_count = 0;
477  int prev_id = -1;
478  int prev_fontinfo_id = -1;
479  const char* prev_unichar = "";
480  const char* unichar = "";
481 
482  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
483  choice_it.forward()) {
484  BLOB_CHOICE* choice = choice_it.data();
485  int id = choice->script_id();
486  if (allowed_scripts_ != NULL && !allowed_scripts_->empty()) {
487  // Check that the choice is in an allowed script.
488  int s = 0;
489  for (s = 0; s < allowed_scripts_->size(); ++s) {
490  if ((*allowed_scripts_)[s] == id) break;
491  }
492  if (s == allowed_scripts_->size()) continue; // Not found in list.
493  }
494  // Script already processed before.
495  if (done[id]) continue;
496  done[id] = true;
497 
498  unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
499  // Save data from the first match
500  if (prev_score < 0) {
501  prev_score = -choice->certainty();
502  script_count = 1;
503  prev_id = id;
504  prev_unichar = unichar;
505  prev_fontinfo_id = choice->fontinfo_id();
506  } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
507  ++script_count;
508  }
509 
510  if (strlen(prev_unichar) == 1)
511  if (unichar[0] >= '0' && unichar[0] <= '9')
512  break;
513 
514  // if script_count is >= 2, character is ambiguous, skip other matches
515  // since they are useless.
516  if (script_count >= 2)
517  break;
518  }
519  // Character is non ambiguous
520  if (script_count == 1) {
521  // Update the score of the winning script
522  osr_->scripts_na[i][prev_id] += 1.0;
523 
524  // Workaround for Fraktur
525  if (prev_id == latin_id_) {
526  if (prev_fontinfo_id >= 0) {
527  const tesseract::FontInfo &fi =
528  tess_->get_fontinfo_table().get(prev_fontinfo_id);
529  //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
530  // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
531  // fi.is_serif(), fi.is_fraktur(),
532  // prev_unichar);
533  if (fi.is_fraktur()) {
534  osr_->scripts_na[i][prev_id] -= 1.0;
535  osr_->scripts_na[i][fraktur_id_] += 1.0;
536  }
537  }
538  }
539 
540  // Update Japanese / Korean pseudo-scripts
541  if (prev_id == katakana_id_)
542  osr_->scripts_na[i][japanese_id_] += 1.0;
543  if (prev_id == hiragana_id_)
544  osr_->scripts_na[i][japanese_id_] += 1.0;
545  if (prev_id == hangul_id_)
546  osr_->scripts_na[i][korean_id_] += 1.0;
547  if (prev_id == han_id_) {
548  osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
549  osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
550  }
551  }
552  } // iterate over each orientation
553 }
int size() const
Definition: genericvector.h:72
int script_id() const
Definition: ratngs.h:111
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:76
UNICHARSET unicharset
Definition: ccutil.h:72
const int kMaxNumberOfScripts
Definition: osdetect.h:36
const float kHanRatioInKorean
Definition: osdetect.cpp:45
inT16 fontinfo_id() const
Definition: ratngs.h:85
const float kNonAmbiguousMargin
Definition: osdetect.cpp:48
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
bool empty() const
Definition: genericvector.h:84
bool is_fraktur() const
Definition: fontinfo.h:115
#define NULL
Definition: host.h:144
const float kHanRatioInJapanese
Definition: osdetect.cpp:46
float certainty() const
Definition: ratngs.h:82
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool ScriptDetector::must_stop ( int  orientation)

Definition at line 555 of file osdetect.cpp.

555  {
556  osr_->update_best_script(orientation);
557  return osr_->best_result.sconfidence > 1;
558 }
float sconfidence
Definition: osdetect.h:43
void update_best_script(int orientation_id)
Definition: osdetect.cpp:94
OSBestResult best_result
Definition: osdetect.h:79

The documentation for this class was generated from the following files: