tesseract  4.00.00dev
WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for WERD_CHOICE:
ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uinT8 src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
int length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const UNICHAR_ID * unichar_ids () const
 
UNICHAR_ID unichar_id (int index) const
 
int state (int index) const
 
tesseract::ScriptPos BlobPosition (int index) const
 
float rating () const
 
float certainty () const
 
float certainty (int index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uinT8 permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (int index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (int index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, int index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uinT8 perm)
 
void set_length (int len)
 
void double_the_size ()
 Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays. More...
 
void init (int reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uinT8 src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, int index)
 
void set_blob_choice (int index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (int index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (int index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (int *start_core, int *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (int start, int end) const
 
void string_and_lengths (STRING *word_str, STRING *word_lengths_str) const
 
const STRING debug_string () const
 
bool ContainsAnyNonSpaceDelimited () const
 
bool IsAllSpaces () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
const STRING & unichar_string () const
 
const STRING & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word)
 
void SetScriptPositions (const tesseract::ScriptPos *positions, int length)
 
void SetAllScriptPositions (tesseract::ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
int TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uinT8 permuter)
 
static tesseract::ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 269 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset)
inline

Definition at line 274 of file ratngs.h.

275  : unicharset_(unicharset) { this->init(8); }
void init(int reserved)
Definition: ratngs.h:405
UNICHARSET unicharset_

◆ WERD_CHOICE() [2/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset,
int  reserved 
)
inline

Definition at line 276 of file ratngs.h.

277  : unicharset_(unicharset) { this->init(reserved); }
void init(int reserved)
Definition: ratngs.h:405
UNICHARSET unicharset_

◆ WERD_CHOICE() [3/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uinT8  src_permuter,
const UNICHARSET &  unicharset 
)
inline

Definition at line 278 of file ratngs.h.

284  : unicharset_(&unicharset) {
285  this->init(src_string, src_lengths, src_rating,
286  src_certainty, src_permuter);
287  }
void init(int reserved)
Definition: ratngs.h:405
UNICHARSET unicharset_

◆ WERD_CHOICE() [4/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET &  unicharset 
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not NULL.

Definition at line 199 of file ratngs.cpp.

201  : unicharset_(&unicharset){
202  GenericVector<UNICHAR_ID> encoding;
203  GenericVector<char> lengths;
204  string cleaned = unicharset.CleanupString(src_string);
205  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
206  NULL)) {
207  lengths.push_back('\0');
208  STRING src_lengths = &lengths[0];
209  this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
210  } else { // There must have been an invalid unichar in the string.
211  this->init(8);
212  this->make_bad();
213  }
214 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
void init(int reserved)
Definition: ratngs.h:405
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
const char * string() const
Definition: strngs.cpp:198
UNICHARSET unicharset_
Definition: strngs.h:45
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:439

◆ WERD_CHOICE() [5/5]

WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE &  word)
inline

Definition at line 289 of file ratngs.h.

290  : ELIST_LINK(word), unicharset_(word.unicharset_) {
291  this->init(word.length());
292  this->operator=(word);
293  }
void init(int reserved)
Definition: ratngs.h:405
UNICHARSET unicharset_
ELIST_LINK()
Definition: elst.h:92
int length() const
Definition: ratngs.h:299
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:502

◆ ~WERD_CHOICE()

WERD_CHOICE::~WERD_CHOICE ( )

WERD_CHOICE::~WERD_CHOICE

Definition at line 257 of file ratngs.cpp.

257  {
258  delete[] unichar_ids_;
259  delete[] script_pos_;
260  delete[] state_;
261  delete[] certainties_;
262 }

Member Function Documentation

◆ adjust_factor()

float WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 302 of file ratngs.h.

302  {
303  return adjust_factor_;
304  }

◆ append_unichar_id()

void WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 449 of file ratngs.cpp.

451  {
452  if (length_ == reserved_) {
453  this->double_the_size();
454  }
455  this->append_unichar_id_space_allocated(unichar_id, blob_count,
456  rating, certainty);
457 }
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:383
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:448

◆ append_unichar_id_space_allocated()

void WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 448 of file ratngs.h.

450  {
451  assert(reserved_ > length_);
452  length_++;
453  this->set_unichar_id(unichar_id, blob_count,
454  rating, certainty, length_-1);
455  }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:355
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311

◆ blob_choices()

BLOB_CHOICE_LIST * WERD_CHOICE::blob_choices ( int  index,
MATRIX *  ratings 
) const

Definition at line 271 of file ratngs.cpp.

271  {
272  MATRIX_COORD coord = MatrixCoord(index);
273  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
274  if (result == NULL) {
275  result = new BLOB_CHOICE_LIST;
276  ratings->put(coord.col, coord.row, result);
277  }
278  return result;
279 }
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:283
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
T get(ICOORD pos) const
Definition: matrix.h:227

◆ BlobPosition()

tesseract::ScriptPos WERD_CHOICE::BlobPosition ( int  index) const
inline

Definition at line 318 of file ratngs.h.

318  {
319  if (index < 0 || index >= length_)
320  return tesseract::SP_NORMAL;
321  return script_pos_[index];
322  }

◆ certainty() [1/2]

float WERD_CHOICE::certainty ( ) const
inline

Definition at line 326 of file ratngs.h.

326  {
327  return certainty_;
328  }

◆ certainty() [2/2]

float WERD_CHOICE::certainty ( int  index) const
inline

Definition at line 329 of file ratngs.h.

329  {
330  return certainties_[index];
331  }

◆ contains_unichar_id()

bool WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 307 of file ratngs.cpp.

307  {
308  for (int i = 0; i < length_; ++i) {
309  if (unichar_ids_[i] == unichar_id) {
310  return true;
311  }
312  }
313  return false;
314 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311

◆ ContainsAnyNonSpaceDelimited()

bool WERD_CHOICE::ContainsAnyNonSpaceDelimited ( ) const
inline

Definition at line 510 of file ratngs.h.

510  {
511  for (int i = 0; i < length_; ++i) {
512  if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true;
513  }
514  return false;
515  }
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:651
UNICHARSET unicharset_

◆ dangerous_ambig_found()

bool WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 359 of file ratngs.h.

359  {
360  return dangerous_ambig_found_;
361  }

◆ debug_string()

const STRING WERD_CHOICE::debug_string ( ) const
inline

Definition at line 501 of file ratngs.h.

501  {
502  STRING word_str;
503  for (int i = 0; i < length_; ++i) {
504  word_str += unicharset_->debug_str(unichar_ids_[i]);
505  word_str += " ";
506  }
507  return word_str;
508  }
UNICHARSET unicharset_
Definition: strngs.h:45
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:340

◆ DisplaySegmentation()

void WERD_CHOICE::DisplaySegmentation ( TWERD *  word)

Definition at line 750 of file ratngs.cpp.

750  {
751 #ifndef GRAPHICS_DISABLED
752  // Number of different colors to draw with.
753  const int kNumColors = 6;
754  static ScrollView *segm_window = NULL;
755  // Check the state against the static prev_drawn_state.
756  static GenericVector<int> prev_drawn_state;
757  bool already_done = prev_drawn_state.size() == length_;
758  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
759  for (int i = 0; i < length_; ++i) {
760  if (prev_drawn_state[i] != state_[i]) {
761  already_done = false;
762  }
763  prev_drawn_state[i] = state_[i];
764  }
765  if (already_done || word->blobs.empty()) return;
766 
767  // Create the window if needed.
768  if (segm_window == NULL) {
769  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
770  2000.0, 256.0, true);
771  } else {
772  segm_window->Clear();
773  }
774 
775  TBOX bbox;
776  int blob_index = 0;
777  for (int c = 0; c < length_; ++c) {
778  ScrollView::Color color =
779  static_cast<ScrollView::Color>(c % kNumColors + 3);
780  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
781  TBLOB* blob = word->blobs[blob_index];
782  bbox += blob->bounding_box();
783  blob->plot(segm_window, color, color);
784  }
785  }
786  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
787  bbox.right(), bbox.bottom());
788  segm_window->Update();
789  window_wait(segm_window);
790 #endif
791 }
bool empty() const
Definition: genericvector.h:91
static void Update()
Definition: scrollview.cpp:715
int size() const
Definition: genericvector.h:72
inT16 top() const
Definition: rect.h:54
void Clear()
Definition: scrollview.cpp:595
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:261
char window_wait(ScrollView *win)
Definition: callcpp.cpp:111
TBOX bounding_box() const
Definition: blobs.cpp:482
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:524
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
inT16 right() const
Definition: rect.h:75
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void init_to_size(int size, T t)

◆ double_the_size()

void WERD_CHOICE::double_the_size ( )
inline

Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays.

Definition at line 383 of file ratngs.h.

383  {
384  if (reserved_ > 0) {
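385  unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
386  reserved_, unichar_ids_);
387  script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
388  reserved_, script_pos_);
389  state_ = GenericVector<int>::double_the_size_memcpy(
390  reserved_, state_);
391  certainties_ = GenericVector<float>::double_the_size_memcpy(
392  reserved_, certainties_);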
393  reserved_ *= 2;
394  } else {
395  unichar_ids_ = new UNICHAR_ID[1];
396  script_pos_ = new tesseract::ScriptPos[1];
397  state_ = new int[1];
398  certainties_ = new float[1];
399  reserved_ = 1;
400  }
401  }
static T * double_the_size_memcpy(int current_size, T *data)
int UNICHAR_ID
Definition: unichar.h:35

◆ GetNonSuperscriptSpan()

void WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 378 of file ratngs.cpp.

378  {
379  int end = length();
380  while (end > 0 &&
381  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
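382  BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
383  end--;
384  }
385  int start = 0;
386  while (start < end &&
387  unicharset_->get_isdigit(unichar_ids_[start]) &&
388  BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
389  start++;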
390  }
391  *pstart = start;
392  *pend = end;
393 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
UNICHARSET unicharset_
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:318
int length() const
Definition: ratngs.h:299

◆ GetTopScriptID()

int WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 656 of file ratngs.cpp.

656  {
657  int max_script = unicharset_->get_script_table_size();
658  int *sid = new int[max_script];
659  int x;
660  for (x = 0; x < max_script; x++) sid[x] = 0;
661  for (x = 0; x < length_; ++x) {
662  int script_id = unicharset_->get_script(unichar_id(x));
663  sid[script_id]++;
664  }
665  if (unicharset_->han_sid() != unicharset_->null_sid()) {
666  // Add the Hiragana & Katakana counts to Han and zero them out.
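667  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
668  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
669  sid[unicharset_->hiragana_sid()] = 0;
670  }
671  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
672  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];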
673  sid[unicharset_->katakana_sid()] = 0;
674  }
675  }
676  // Note that high script ID overrides lower one on a tie, thus biasing
677  // towards non-Common script (if sorted that way in unicharset file).
678  int max_sid = 0;
679  for (x = 1; x < max_script; x++)
680  if (sid[x] >= sid[max_sid]) max_sid = x;
681  if (sid[max_sid] < length_ / 2)
682  max_sid = unicharset_->null_sid();
683  delete[] sid;
684  return max_sid;
685 }
int han_sid() const
Definition: unicharset.h:887
int katakana_sid() const
Definition: unicharset.h:889
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int hiragana_sid() const
Definition: unicharset.h:888
UNICHARSET unicharset_
int get_script_table_size() const
Definition: unicharset.h:848
int null_sid() const
Definition: unicharset.h:882
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662

◆ has_rtl_unichar_id()

bool WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 412 of file ratngs.cpp.

412  {
413  int i;
414  for (i = 0; i < length_; ++i) {
415  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
416  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
417  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
418  return true;
419  }
420  }
421  return false;
422 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:689
UNICHARSET unicharset_

◆ init() [1/2]

void WERD_CHOICE::init ( int  reserved)
inline

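Initializes WERD_CHOICE - reserves the requested number of slots (reserved) in the unichar_ids_, script_pos_, state_ and certainties_ arrays, or sets those arrays to NULL when reserved is not positive. Sets other values to default (blank) values.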

Definition at line 405 of file ratngs.h.

405  {
406  reserved_ = reserved;
407  if (reserved > 0) {
408  unichar_ids_ = new UNICHAR_ID[reserved];
409  script_pos_ = new tesseract::ScriptPos[reserved];
410  state_ = new int[reserved];
411  certainties_ = new float[reserved];
412  } else {
413  unichar_ids_ = NULL;
414  script_pos_ = NULL;
415  state_ = NULL;
416  certainties_ = NULL;
417  }
418  length_ = 0;
419  adjust_factor_ = 1.0f;
420  rating_ = 0.0;
421  certainty_ = MAX_FLOAT32;
422  min_x_height_ = 0.0f;
423  max_x_height_ = MAX_FLOAT32;
424  permuter_ = NO_PERM;
425  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
426  dangerous_ambig_found_ = false;
427  }
#define MAX_FLOAT32
Definition: host.h:66
int UNICHAR_ID
Definition: unichar.h:35

◆ init() [2/2]

void WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uinT8  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not NULL. src_lengths argument could be NULL, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not NULL. src_lengths argument could be NULL, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 226 of file ratngs.cpp.

230  {
231  int src_string_len = strlen(src_string);
232  if (src_string_len == 0) {
233  this->init(8);
234  } else {
235  this->init(src_lengths ? strlen(src_lengths): src_string_len);
236  length_ = reserved_;
237  int offset = 0;
238  for (int i = 0; i < length_; ++i) {
239  int unichar_length = src_lengths ? src_lengths[i] : 1;
240  unichar_ids_[i] =
241  unicharset_->unichar_to_id(src_string+offset, unichar_length);
242  state_[i] = 1;
243  certainties_[i] = src_certainty;
244  offset += unichar_length;
245  }
246  }
247  adjust_factor_ = 1.0f;
248  rating_ = src_rating;
249  certainty_ = src_certainty;
250  permuter_ = src_permuter;
251  dangerous_ambig_found_ = false;
252 }
void init(int reserved)
Definition: ratngs.h:405
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
UNICHARSET unicharset_

◆ IsAllSpaces()

bool WERD_CHOICE::IsAllSpaces ( ) const
inline

Definition at line 517 of file ratngs.h.

517  {
518  for (int i = 0; i < length_; ++i) {
519  if (unichar_ids_[i] != UNICHAR_SPACE) return false;
520  }
521  return true;
522  }

◆ length()

int WERD_CHOICE::length ( ) const
inline

Definition at line 299 of file ratngs.h.

299  {
300  return length_;
301  }

◆ make_bad()

void WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 439 of file ratngs.h.

439  {
440  length_ = 0;
441  rating_ = kBadRating;
442  certainty_ = -MAX_FLOAT32;
443  }
static const float kBadRating
Definition: ratngs.h:271
#define MAX_FLOAT32
Definition: host.h:66

◆ MatrixCoord()

MATRIX_COORD WERD_CHOICE::MatrixCoord ( int  index) const

Definition at line 283 of file ratngs.cpp.

283  {
284  int col = 0;
285  for (int i = 0; i < index; ++i)
286  col += state_[i];
287  int row = col + state_[index] - 1;
288  return MATRIX_COORD(col, row);
289 }

◆ max_x_height()

float WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 335 of file ratngs.h.

335  {
336  return max_x_height_;
337  }

◆ min_x_height()

float WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 332 of file ratngs.h.

332  {
333  return min_x_height_;
334  }

◆ operator+=()

WERD_CHOICE & WERD_CHOICE::operator+= ( const WERD_CHOICE &  second)

WERD_CHOICE::operator+=

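Concatenates a second word choice onto the end of this one. The ratings are added and the certainty is the minimum of the two. If this word's permuter is NO_PERM it takes the second word's permuter; otherwise, if the second word's permuter is not NO_PERM and differs from this one's, the permuter is set to COMPOUND_PERM.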

Definition at line 466 of file ratngs.cpp.

466  {
467  ASSERT_HOST(unicharset_ == second.unicharset_);
468  while (reserved_ < length_ + second.length()) {
469  this->double_the_size();
470  }
471  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
472  for (int i = 0; i < second.length(); ++i) {
473  unichar_ids_[length_ + i] = other_unichar_ids[i];
474  state_[length_ + i] = second.state_[i];
475  certainties_[length_ + i] = second.certainties_[i];
476  script_pos_[length_ + i] = second.BlobPosition(i);
477  }
478  length_ += second.length();
479  if (second.adjust_factor_ > adjust_factor_)
480  adjust_factor_ = second.adjust_factor_;
481  rating_ += second.rating(); // add ratings
482  if (second.certainty() < certainty_) // take min
483  certainty_ = second.certainty();
484  if (second.dangerous_ambig_found_)
485  dangerous_ambig_found_ = true;
486  if (permuter_ == NO_PERM) {
487  permuter_ = second.permuter();
488  } else if (second.permuter() != NO_PERM &&
489  second.permuter() != permuter_) {
490  permuter_ = COMPOUND_PERM;
491  }
492  return *this;
493 }
uinT8 permuter() const
Definition: ratngs.h:342
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:383
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:308
UNICHARSET unicharset_
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:318
#define ASSERT_HOST(x)
Definition: errcode.h:84
int length() const
Definition: ratngs.h:299
int UNICHAR_ID
Definition: unichar.h:35

◆ operator=()

WERD_CHOICE & WERD_CHOICE::operator= ( const WERD_CHOICE &  source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 502 of file ratngs.cpp.

502  {
503  while (reserved_ < source.length()) {
504  this->double_the_size();
505  }
506 
507  unicharset_ = source.unicharset_;
508  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
509  for (int i = 0; i < source.length(); ++i) {
510  unichar_ids_[i] = other_unichar_ids[i];
511  state_[i] = source.state_[i];
512  certainties_[i] = source.certainties_[i];
513  script_pos_[i] = source.BlobPosition(i);
514  }
515  length_ = source.length();
516  adjust_factor_ = source.adjust_factor_;
517  rating_ = source.rating();
518  certainty_ = source.certainty();
519  min_x_height_ = source.min_x_height();
520  max_x_height_ = source.max_x_height();
521  permuter_ = source.permuter();
522  dangerous_ambig_found_ = source.dangerous_ambig_found_;
523  return *this;
524 }
uinT8 permuter() const
Definition: ratngs.h:342
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:383
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
float min_x_height() const
Definition: ratngs.h:332
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:308
UNICHARSET unicharset_
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:318
int length() const
Definition: ratngs.h:299
float max_x_height() const
Definition: ratngs.h:335
int UNICHAR_ID
Definition: unichar.h:35

◆ permuter()

uinT8 WERD_CHOICE::permuter ( ) const
inline

Definition at line 342 of file ratngs.h.

342  {
343  return permuter_;
344  }

◆ permuter_name() [1/2]

const char * WERD_CHOICE::permuter_name ( uinT8  permuter)
static

Definition at line 175 of file ratngs.cpp.

175  {
176  return kPermuterTypeNames[permuter];
177 }
uinT8 permuter() const
Definition: ratngs.h:342

◆ permuter_name() [2/2]

const char * WERD_CHOICE::permuter_name ( ) const

Definition at line 264 of file ratngs.cpp.

264  {
265  return kPermuterTypeNames[permuter_];
266 }

◆ print() [1/2]

void WERD_CHOICE::print ( ) const
inline

Definition at line 576 of file ratngs.h.

576 { this->print(""); }
void print() const
Definition: ratngs.h:576

◆ print() [2/2]

void WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 713 of file ratngs.cpp.

713  {
714  tprintf("%s : ", msg);
715  for (int i = 0; i < length_; ++i) {
716  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
717  }
718  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
719  rating_, certainty_, adjust_factor_, permuter_,
720  min_x_height_, max_x_height_, dangerous_ambig_found_);
721  tprintf("pos");
722  for (int i = 0; i < length_; ++i) {
723  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
724  }
725  tprintf("\nstr");
726  for (int i = 0; i < length_; ++i) {
727  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
728  }
729  tprintf("\nstate:");
730  for (int i = 0; i < length_; ++i) {
731  tprintf("\t%d ", state_[i]);
732  }
733  tprintf("\nC");
734  for (int i = 0; i < length_; ++i) {
735  tprintf("\t%.3f", certainties_[i]);
736  }
737  tprintf("\n");
738 }
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset_
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:181

◆ print_state()

void WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 741 of file ratngs.cpp.

741  {
742  tprintf("%s", msg);
743  for (int i = 0; i < length_; ++i)
744  tprintf(" %d", state_[i]);
745  tprintf("\n");
746 }
#define tprintf(...)
Definition: tprintf.h:31

◆ punct_stripped()

void WERD_CHOICE::punct_stripped ( int *  start,
int *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 364 of file ratngs.cpp.

364  {
365  *start = 0;
366  *end = length() - 1;
367  while (*start < length() &&
368  unicharset()->get_ispunctuation(unichar_id(*start))) {
369  (*start)++;
370  }
371  while (*end > -1 &&
372  unicharset()->get_ispunctuation(unichar_id(*end))) {
373  (*end)--;
374  }
375  (*end)++;
376 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:296
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int length() const
Definition: ratngs.h:299

◆ rating()

float WERD_CHOICE::rating ( ) const
inline

Definition at line 323 of file ratngs.h.

323  {
324  return rating_;
325  }

◆ remove_last_unichar_id()

void WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 479 of file ratngs.h.

479 { --length_; }

◆ remove_unichar_id()

void WERD_CHOICE::remove_unichar_id ( int  index)
inline

Definition at line 480 of file ratngs.h.

480  {
481  this->remove_unichar_ids(index, 1);
482  }
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:323

◆ remove_unichar_ids()

void WERD_CHOICE::remove_unichar_ids ( int  start,
int  num 
)

remove_unichar_ids

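Removes num unichar ids starting from index start from unichar_ids_ (together with the matching script_pos_, state_ and certainties_ entries) and updates length_ to reflect this change. The states of the removed entries are added to the preceding entry, or to the following entry when start is 0, to account for the merged blobs. Note: this function does not modify rating_ and certainty_.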

Definition at line 323 of file ratngs.cpp.

323  {
324  ASSERT_HOST(start >= 0 && start + num <= length_);
325  // Accumulate the states to account for the merged blobs.
326  for (int i = 0; i < num; ++i) {
327  if (start > 0)
328  state_[start - 1] += state_[start + i];
329  else if (start + num < length_)
330  state_[start + num] += state_[start + i];
331  }
332  for (int i = start; i + num < length_; ++i) {
333  unichar_ids_[i] = unichar_ids_[i + num];
334  script_pos_[i] = script_pos_[i + num];
335  state_[i] = state_[i + num];
336  certainties_[i] = certainties_[i + num];
337  }
338  length_ -= num;
339 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ reverse_and_mirror_unichar_ids()

void WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 346 of file ratngs.cpp.

346  {
347  for (int i = 0; i < length_ / 2; ++i) {
348  UNICHAR_ID tmp_id = unichar_ids_[i];
349  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
350  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
351  }
352  if (length_ % 2 != 0) {
353  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
354  }
355 }
UNICHARSET unicharset_
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:696
int UNICHAR_ID
Definition: unichar.h:35

◆ ScriptPositionOf()

ScriptPos WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET &  unicharset,
const TBOX &  blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 618 of file ratngs.cpp.

621  {
622  ScriptPos retval = tesseract::SP_NORMAL;
623  int top = blob_box.top();
624  int bottom = blob_box.bottom();
625  int min_bottom, max_bottom, min_top, max_top;
626  unicharset.get_top_bottom(unichar_id,
627  &min_bottom, &max_bottom,
628  &min_top, &max_top);
629 
630  int sub_thresh_top = min_top - kMinSubscriptOffset;
631  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
632  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
633  if (bottom <= kMaxDropCapBottom) {
634  retval = tesseract::SP_DROPCAP;
635  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
636  retval = tesseract::SP_SUBSCRIPT;
637  } else if (bottom > sup_thresh_bot) {
638  retval = tesseract::SP_SUPERSCRIPT;
639  }
640 
641  if (print_debug) {
642  const char *pos = ScriptPosToString(retval);
643  tprintf("%s Character %s[bot:%d top: %d] "
644  "bot_range[%d,%d] top_range[%d, %d] "
645  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
646  pos, unicharset.id_to_unichar(unichar_id),
647  bottom, top,
648  min_bottom, max_bottom, min_top, max_top,
649  sub_thresh_bot, sub_thresh_top,
650  sup_thresh_bot);
651  }
652  return retval;
653 }
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:567
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
#define tprintf(...)
Definition: tprintf.h:31
inT16 top() const
Definition: rect.h:54
inT16 bottom() const
Definition: rect.h:61
const int kMaxDropCapBottom
Definition: ratngs.cpp:46
const int kMinSuperscriptOffset
Definition: ratngs.cpp:44
const int kBlnBaselineOffset
Definition: normalis.h:29
const int kMinSubscriptOffset
Definition: ratngs.cpp:42
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:181

◆ set_adjust_factor()

void WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 305 of file ratngs.h.

305  {
306  adjust_factor_ = factor;
307  }

◆ set_blob_choice()

void WERD_CHOICE::set_blob_choice ( int  index,
int  blob_count,
const BLOB_CHOICE * blob_choice 
)

Definition at line 293 of file ratngs.cpp.

294  {
295  unichar_ids_[index] = blob_choice->unichar_id();
296  script_pos_[index] = tesseract::SP_NORMAL;
297  state_[index] = blob_count;
298  certainties_[index] = blob_choice->certainty();
299 }
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
float certainty() const
Definition: ratngs.h:82

◆ set_certainty()

void WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 368 of file ratngs.h.

368  {
369  certainty_ = new_val;
370  }

◆ set_dangerous_ambig_found_()

void WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 362 of file ratngs.h.

362  {
363  dangerous_ambig_found_ = value;
364  }

◆ set_length()

void WERD_CHOICE::set_length ( int  len)
inline

Definition at line 377 of file ratngs.h.

377  {
378  ASSERT_HOST(reserved_ >= len);
379  length_ = len;
380  }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ set_permuter()

void WERD_CHOICE::set_permuter ( uinT8  perm)
inline

Definition at line 371 of file ratngs.h.

371  {
372  permuter_ = perm;
373  }

◆ set_rating()

void WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 365 of file ratngs.h.

365  {
366  rating_ = new_val;
367  }

◆ set_unichar_id() [1/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  index 
)
inline

Definition at line 355 of file ratngs.h.

355  {
356  assert(index < length_);
357  unichar_ids_[index] = unichar_id;
358  }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311

◆ set_unichar_id() [2/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
int  index 
)
inline

Definition at line 460 of file ratngs.h.

461  {
462  assert(index < length_);
463  unichar_ids_[index] = unichar_id;
464  state_[index] = blob_count;
465  certainties_[index] = certainty;
466  script_pos_[index] = tesseract::SP_NORMAL;
467  rating_ += rating;
468  if (certainty < certainty_) {
469  certainty_ = certainty;
470  }
471  }
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311

◆ set_unichars_in_script_order()

bool WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 527 of file ratngs.h.

527  {
528  return unichars_in_script_order_ = in_script_order;
529  }

◆ set_x_heights()

void WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 338 of file ratngs.h.

338  {
339  min_x_height_ = min_height;
340  max_x_height_ = max_height;
341  }

◆ SetAllScriptPositions()

void WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 612 of file ratngs.cpp.

612  {
613  for (int i = 0; i < length_; ++i)
614  script_pos_[i] = position;
615 }

◆ SetScriptPositions() [1/2]

void WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD * word 
)

Definition at line 531 of file ratngs.cpp.

531  {
532  // Since WERD_CHOICE isn't supposed to depend on a Tesseract,
533  // we don't have easy access to the flags Tesseract stores. Therefore, debug
534  // for this module is hard compiled in.
535  int debug = 0;
536 
537  // Initialize to normal.
538  for (int i = 0; i < length_; ++i)
539  script_pos_[i] = tesseract::SP_NORMAL;
540  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
541  return;
542  }
543 
544  int position_counts[4];
545  for (int i = 0; i < 4; i++) {
546  position_counts[i] = 0;
547  }
548 
549  int chunk_index = 0;
550  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
551  TBLOB* tblob = word->blobs[chunk_index];
552  int uni_id = unichar_id(blob_index);
553  TBOX blob_box = tblob->bounding_box();
554  if (state_ != NULL) {
555  for (int i = 1; i < state_[blob_index]; ++i) {
556  ++chunk_index;
557  tblob = word->blobs[chunk_index];
558  blob_box += tblob->bounding_box();
559  }
560  }
561  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
562  uni_id);
563  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
564  script_pos_[blob_index] = tesseract::SP_NORMAL;
565  }
566  position_counts[script_pos_[blob_index]]++;
567  }
568  // If almost everything looks like a superscript or subscript,
569  // we most likely just got the baseline wrong.
570  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
571  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
572  if (debug >= 2) {
573  tprintf("Most characters of %s are subscript or superscript.\n"
574  "That seems wrong, so I'll assume we got the baseline wrong\n",
575  unichar_string().string());
576  }
577  for (int i = 0; i < length_; i++) {
578  ScriptPos sp = script_pos_[i];
579  if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
580  position_counts[sp]--;
581  position_counts[tesseract::SP_NORMAL]++;
582  script_pos_[i] = tesseract::SP_NORMAL;
583  }
584  }
585  }
586 
587  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
588  debug >= 2) {
589  tprintf("SetScriptPosition on %s\n", unichar_string().string());
590  int chunk_index = 0;
591  for (int blob_index = 0; blob_index < length_; ++blob_index) {
592  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
593  TBLOB* tblob = word->blobs[chunk_index];
594  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
595  unichar_id(blob_index));
596  }
597  chunk_index += state_ != NULL ? state_[blob_index] : 1;
598  }
599  }
600 }
bool empty() const
Definition: genericvector.h:91
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
#define tprintf(...)
Definition: tprintf.h:31
int TotalOfStates() const
Definition: ratngs.cpp:700
UNICHARSET unicharset_
Definition: rect.h:30
Definition: blobs.h:261
int NumBlobs() const
Definition: blobs.h:425
TBOX bounding_box() const
Definition: blobs.cpp:482
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:618
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const STRING & unichar_string() const
Definition: ratngs.h:537

◆ SetScriptPositions() [2/2]

void WERD_CHOICE::SetScriptPositions ( const tesseract::ScriptPos positions,
int  length 
)

Definition at line 602 of file ratngs.cpp.

603  {
604  ASSERT_HOST(length == length_);
605  if (positions != script_pos_) {
606  delete [] script_pos_;
607  script_pos_ = new ScriptPos[length];
608  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
609  }
610 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
int length() const
Definition: ratngs.h:299

◆ shallow_copy()

WERD_CHOICE WERD_CHOICE::shallow_copy ( int  start,
int  end 
) const

Definition at line 395 of file ratngs.cpp.

395  {
396  ASSERT_HOST(start >= 0 && start <= length_);
397  ASSERT_HOST(end >= 0 && end <= length_);
398  if (end < start) { end = start; }
399  WERD_CHOICE retval(unicharset_, end - start);
400  for (int i = start; i < end; i++) {
401  retval.append_unichar_id_space_allocated(
402  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
403  }
404  return retval;
405 }
UNICHARSET unicharset_
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ state()

int WERD_CHOICE::state ( int  index) const
inline

Definition at line 315 of file ratngs.h.

315  {
316  return state_[index];
317  }

◆ string_and_lengths()

void WERD_CHOICE::string_and_lengths ( STRING * word_str,
STRING * word_lengths_str 
) const

string_and_lengths

Populates the given word_str with unichars from unichar_ids and word_lengths_str with the corresponding unichar lengths.

Definition at line 430 of file ratngs.cpp.

431  {
432  *word_str = "";
433  if (word_lengths_str != NULL) *word_lengths_str = "";
434  for (int i = 0; i < length_; ++i) {
435  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
436  *word_str += ch;
437  if (word_lengths_str != NULL) {
438  *word_lengths_str += strlen(ch);
439  }
440  }
441 }
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:296
UNICHARSET unicharset_

◆ TotalOfStates()

int WERD_CHOICE::TotalOfStates ( ) const

Definition at line 700 of file ratngs.cpp.

700  {
701  int total_chunks = 0;
702  for (int i = 0; i < length_; ++i) {
703  total_chunks += state_[i];
704  }
705  return total_chunks;
706 }

◆ unichar_id()

UNICHAR_ID WERD_CHOICE::unichar_id ( int  index) const
inline

Definition at line 311 of file ratngs.h.

311  {
312  assert(index < length_);
313  return unichar_ids_[index];
314  }

◆ unichar_ids()

const UNICHAR_ID* WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 308 of file ratngs.h.

308  {
309  return unichar_ids_;
310  }

◆ unichar_lengths()

const STRING& WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 544 of file ratngs.h.

544  {
545  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
546  return unichar_lengths_;
547  }
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:430

◆ unichar_string()

const STRING& WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 537 of file ratngs.h.

537  {
538  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
539  return unichar_string_;
540  }
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:430

◆ unichars_in_script_order()

bool WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 531 of file ratngs.h.

531  {
532  return unichars_in_script_order_;
533  }

◆ unicharset()

const UNICHARSET* WERD_CHOICE::unicharset ( ) const
inline

Definition at line 296 of file ratngs.h.

296  {
297  return unicharset_;
298  }
UNICHARSET unicharset_

◆ UpdateStateForSplit()

void WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 688 of file ratngs.cpp.

688  {
689  int total_chunks = 0;
690  for (int i = 0; i < length_; ++i) {
691  total_chunks += state_[i];
692  if (total_chunks > blob_position) {
693  ++state_[i];
694  return;
695  }
696  }
697 }

Member Data Documentation

◆ kBadRating

const float WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 271 of file ratngs.h.


The documentation for this class was generated from the following files: