tesseract  4.00.00dev
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (const GenericVector< int > *allowed_scripts, OSResults *osr, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 93 of file osdetect.h.

Constructor & Destructor Documentation

◆ ScriptDetector()

ScriptDetector::ScriptDetector ( const GenericVector< int > *  allowed_scripts,
OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 447 of file osdetect.cpp.

448  {
449  osr_ = osr;
450  tess_ = tess;
451  allowed_scripts_ = allowed_scripts;
452  katakana_id_ = tess_->unicharset.add_script(katakana_script);
453  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
454  han_id_ = tess_->unicharset.add_script(han_script);
455  hangul_id_ = tess_->unicharset.add_script(hangul_script);
456  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
457  korean_id_ = tess_->unicharset.add_script(korean_script_);
458  latin_id_ = tess_->unicharset.add_script(latin_script);
459  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
460 }
UNICHARSET unicharset
Definition: ccutil.h:68
int add_script(const char *script)

Member Function Documentation

◆ detect_blob()

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 465 of file osdetect.cpp.

465  {
466  bool done[kMaxNumberOfScripts];
467  for (int i = 0; i < 4; ++i) {
468  for (int j = 0; j < kMaxNumberOfScripts; ++j)
469  done[j] = false;
470 
471  BLOB_CHOICE_IT choice_it;
472  choice_it.set_to_list(scores + i);
473 
474  float prev_score = -1;
475  int script_count = 0;
476  int prev_id = -1;
477  int prev_fontinfo_id = -1;
478  const char* prev_unichar = "";
479  const char* unichar = "";
480 
481  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
482  choice_it.forward()) {
483  BLOB_CHOICE* choice = choice_it.data();
484  int id = choice->script_id();
485  if (allowed_scripts_ != NULL && !allowed_scripts_->empty()) {
486  // Check that the choice is in an allowed script.
487  int s = 0;
488  for (s = 0; s < allowed_scripts_->size(); ++s) {
489  if ((*allowed_scripts_)[s] == id) break;
490  }
491  if (s == allowed_scripts_->size()) continue; // Not found in list.
492  }
493  // Script already processed before.
494  if (done[id]) continue;
495  done[id] = true;
496 
497  unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
498  // Save data from the first match
499  if (prev_score < 0) {
500  prev_score = -choice->certainty();
501  script_count = 1;
502  prev_id = id;
503  prev_unichar = unichar;
504  prev_fontinfo_id = choice->fontinfo_id();
505  } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
506  ++script_count;
507  }
508 
509  if (strlen(prev_unichar) == 1)
510  if (unichar[0] >= '0' && unichar[0] <= '9')
511  break;
512 
513  // if script_count is >= 2, character is ambiguous, skip other matches
514  // since they are useless.
515  if (script_count >= 2)
516  break;
517  }
518  // Character is non ambiguous
519  if (script_count == 1) {
520  // Update the score of the winning script
521  osr_->scripts_na[i][prev_id] += 1.0;
522 
523  // Workaround for Fraktur
524  if (prev_id == latin_id_) {
525  if (prev_fontinfo_id >= 0) {
526  const tesseract::FontInfo &fi =
527  tess_->get_fontinfo_table().get(prev_fontinfo_id);
528  //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
529  // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
530  // fi.is_serif(), fi.is_fraktur(),
531  // prev_unichar);
532  if (fi.is_fraktur()) {
533  osr_->scripts_na[i][prev_id] -= 1.0;
534  osr_->scripts_na[i][fraktur_id_] += 1.0;
535  }
536  }
537  }
538 
539  // Update Japanese / Korean pseudo-scripts
540  if (prev_id == katakana_id_)
541  osr_->scripts_na[i][japanese_id_] += 1.0;
542  if (prev_id == hiragana_id_)
543  osr_->scripts_na[i][japanese_id_] += 1.0;
544  if (prev_id == hangul_id_)
545  osr_->scripts_na[i][korean_id_] += 1.0;
546  if (prev_id == han_id_) {
547  osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
548  osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
549  }
550  }
551  } // iterate over each orientation
552 }
bool empty() const
Definition: genericvector.h:91
int script_id() const
Definition: ratngs.h:111
const float kHanRatioInJapanese
Definition: osdetect.cpp:45
int size() const
Definition: genericvector.h:72
const float kNonAmbiguousMargin
Definition: osdetect.cpp:47
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:76
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
UNICHARSET unicharset
Definition: ccutil.h:68
const float kHanRatioInKorean
Definition: osdetect.cpp:44
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
const int kMaxNumberOfScripts
Definition: osdetect.h:36
inT16 fontinfo_id() const
Definition: ratngs.h:85
float certainty() const
Definition: ratngs.h:82
bool is_fraktur() const
Definition: fontinfo.h:115
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ must_stop()

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 554 of file osdetect.cpp.

554  {
555  osr_->update_best_script(orientation);
556  return osr_->best_result.sconfidence > 1;
557 }
float sconfidence
Definition: osdetect.h:43
OSBestResult best_result
Definition: osdetect.h:79
void update_best_script(int orientation_id)
Definition: osdetect.cpp:88

The documentation for this class was generated from the following files: