tesseract v5.3.3.20231005
tesseract::WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for tesseract::WERD_CHOICE:
tesseract::ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
bool empty () const
 
unsigned length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const std::vector< UNICHAR_ID > & unichar_ids () const
 
UNICHAR_ID unichar_id (unsigned index) const
 
unsigned state (unsigned index) const
 
ScriptPos BlobPosition (unsigned index) const
 
float rating () const
 
float certainty () const
 
float certainty (unsigned index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uint8_t permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (unsigned index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (unsigned index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, unsigned index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uint8_t perm)
 
void set_length (unsigned len)
 
void double_the_size ()
 Make more space in unichar_id_ and fragment_lengths_ arrays. More...
 
void init (unsigned reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, unsigned index)
 
void set_blob_choice (unsigned index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (unsigned index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (unsigned index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (unsigned *start_core, unsigned *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (unsigned start, unsigned end) const
 
void string_and_lengths (std::string *word_str, std::string *word_lengths_str) const
 
std::string debug_string () const
 
bool ContainsAnyNonSpaceDelimited () const
 
bool IsAllSpaces () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
std::string & unichar_string ()
 
const std::string & unichar_string () const
 
const std::string & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word, int debug=0)
 
void SetAllScriptPositions (ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
unsigned TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from tesseract::ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uint8_t permuter)
 
static ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 258 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset)
inline

Definition at line 263 of file ratngs.h.

263 : unicharset_(unicharset) {
264 this->init(8);
265 }
void init(unsigned reserved)
Definition: ratngs.h:386
const UNICHARSET * unicharset() const
Definition: ratngs.h:281

◆ WERD_CHOICE() [2/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset,
int  reserved 
)
inline

Definition at line 266 of file ratngs.h.

266 : unicharset_(unicharset) {
267 this->init(reserved);
268 }

◆ WERD_CHOICE() [3/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter,
const UNICHARSET & unicharset
)
inline

Definition at line 269 of file ratngs.h.

271 : unicharset_(&unicharset) {
272 this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);
273 }

◆ WERD_CHOICE() [4/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET & unicharset
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not nullptr.

Definition at line 213 of file ratngs.cpp.

214 : unicharset_(&unicharset) {
215 std::vector<UNICHAR_ID> encoding;
216 std::vector<char> lengths;
217 std::string cleaned = unicharset.CleanupString(src_string);
218 if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) {
219 lengths.push_back('\0');
220 std::string src_lengths = &lengths[0];
221 this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
222 } else { // There must have been an invalid unichar in the string.
223 this->init(8);
224 this->make_bad();
225 }
226}
@ NO_PERM
Definition: ratngs.h:236
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:419
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265

◆ WERD_CHOICE() [5/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE & word)
inline

Definition at line 275 of file ratngs.h.

275 : ELIST_LINK(word), unicharset_(word.unicharset_) {
276 this->init(word.length());
277 this->operator=(word);
278 }
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:499

◆ ~WERD_CHOICE()

tesseract::WERD_CHOICE::~WERD_CHOICE ( )
default

Member Function Documentation

◆ adjust_factor()

float tesseract::WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 290 of file ratngs.h.

290 {
291 return adjust_factor_;
292 }

◆ append_unichar_id()

void tesseract::WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 447 of file ratngs.cpp.

448 {
449 if (length_ == reserved_) {
450 this->double_the_size();
451 }
452 this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty);
453}
float certainty() const
Definition: ratngs.h:315
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:428
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:372
float rating() const
Definition: ratngs.h:312

◆ append_unichar_id_space_allocated()

void tesseract::WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 428 of file ratngs.h.

429 {
430 assert(reserved_ > length_);
431 length_++;
432 this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);
433 }
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:344

◆ blob_choices()

BLOB_CHOICE_LIST * tesseract::WERD_CHOICE::blob_choices ( unsigned  index,
MATRIX * ratings
) const

Definition at line 274 of file ratngs.cpp.

274 {
275 MATRIX_COORD coord = MatrixCoord(index);
276 BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
277 if (result == nullptr) {
278 result = new BLOB_CHOICE_LIST;
279 ratings->put(coord.col, coord.row, result);
280 }
281 return result;
282}
MATRIX_COORD MatrixCoord(unsigned index) const
Definition: ratngs.cpp:286

◆ BlobPosition()

ScriptPos tesseract::WERD_CHOICE::BlobPosition ( unsigned  index) const
inline

Definition at line 306 of file ratngs.h.

306 {
307 if (index >= length_) {
308 return SP_NORMAL;
309 }
310 return script_pos_[index];
311 }
@ SP_NORMAL
Definition: ratngs.h:254

◆ certainty() [1/2]

float tesseract::WERD_CHOICE::certainty ( ) const
inline

Definition at line 315 of file ratngs.h.

315 {
316 return certainty_;
317 }

◆ certainty() [2/2]

float tesseract::WERD_CHOICE::certainty ( unsigned  index) const
inline

Definition at line 318 of file ratngs.h.

318 {
319 return certainties_[index];
320 }

◆ contains_unichar_id()

bool tesseract::WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 309 of file ratngs.cpp.

309 {
310 for (unsigned i = 0; i < length_; ++i) {
311 if (unichar_ids_[i] == unichar_id) {
312 return true;
313 }
314 }
315 return false;
316}

◆ ContainsAnyNonSpaceDelimited()

bool tesseract::WERD_CHOICE::ContainsAnyNonSpaceDelimited ( ) const
inline

Definition at line 488 of file ratngs.h.

488 {
489 for (unsigned i = 0; i < length_; ++i) {
490 if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
491 return true;
492 }
493 }
494 return false;
495 }
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:668

◆ dangerous_ambig_found()

bool tesseract::WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 348 of file ratngs.h.

348 {
349 return dangerous_ambig_found_;
350 }

◆ debug_string()

std::string tesseract::WERD_CHOICE::debug_string ( ) const
inline

Definition at line 479 of file ratngs.h.

479 {
480 std::string word_str;
481 for (unsigned i = 0; i < length_; ++i) {
482 word_str += unicharset_->debug_str(unichar_ids_[i]);
483 word_str += " ";
484 }
485 return word_str;
486 }
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331

◆ DisplaySegmentation()

void tesseract::WERD_CHOICE::DisplaySegmentation ( TWERD * word)

Definition at line 728 of file ratngs.cpp.

728 {
729 // Number of different colors to draw with.
730 const int kNumColors = 6;
731 static ScrollView *segm_window = nullptr;
732 // Check the state against the static prev_drawn_state.
733 static std::vector<int> prev_drawn_state;
734 bool already_done = prev_drawn_state.size() == length_;
735 if (!already_done) {
736 prev_drawn_state.clear();
737 prev_drawn_state.resize(length_);
738 }
739 for (unsigned i = 0; i < length_; ++i) {
740 if (prev_drawn_state[i] != state_[i]) {
741 already_done = false;
742 }
743 prev_drawn_state[i] = state_[i];
744 }
745 if (already_done || word->blobs.empty()) {
746 return;
747 }
748
749 // Create the window if needed.
750 if (segm_window == nullptr) {
751 segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true);
752 } else {
753 segm_window->Clear();
754 }
755
756 TBOX bbox;
757 int blob_index = 0;
758 for (unsigned c = 0; c < length_; ++c) {
759 auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
760 for (int i = 0; i < state_[c]; ++i, ++blob_index) {
761 TBLOB *blob = word->blobs[blob_index];
762 bbox += blob->bounding_box();
763 blob->plot(segm_window, color, color);
764 }
765 }
766 segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom());
767 segm_window->Update();
768 segm_window->Wait();
769}
@ TBOX

◆ double_the_size()

void tesseract::WERD_CHOICE::double_the_size ( )
inline

Make more space in unichar_id_ and fragment_lengths_ arrays.

Definition at line 372 of file ratngs.h.

372 {
373 if (reserved_ > 0) {
374 reserved_ *= 2;
375 } else {
376 reserved_ = 1;
377 }
378 unichar_ids_.resize(reserved_);
379 script_pos_.resize(reserved_);
380 state_.resize(reserved_);
381 certainties_.resize(reserved_);
382 }

◆ empty()

bool tesseract::WERD_CHOICE::empty ( ) const
inline

Definition at line 284 of file ratngs.h.

284 {
285 return length_ == 0;
286 }

◆ GetNonSuperscriptSpan()

void tesseract::WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 378 of file ratngs.cpp.

378 {
379 int end = length();
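380 while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
381 BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
382 end--;
383 }
384 int start = 0;
385 while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&
386 BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
387 start++;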
388 }
389 *pstart = start;
390 *pend = end;
391}
@ SP_SUPERSCRIPT
Definition: ratngs.h:254
unsigned length() const
Definition: ratngs.h:287
ScriptPos BlobPosition(unsigned index) const
Definition: ratngs.h:306
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

◆ GetTopScriptID()

int tesseract::WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 631 of file ratngs.cpp.

631 {
632 unsigned max_script = unicharset_->get_script_table_size();
633 std::vector<unsigned> sid(max_script);
634 for (unsigned x = 0; x < length_; ++x) {
635 int script_id = unicharset_->get_script(unichar_id(x));
636 sid[script_id]++;
637 }
638 if (unicharset_->han_sid() != unicharset_->null_sid()) {
639 // Add the Hiragana & Katakana counts to Han and zero them out.
640 if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
641 sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
642 sid[unicharset_->hiragana_sid()] = 0;
643 }
644 if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
645 sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
646 sid[unicharset_->katakana_sid()] = 0;
647 }
648 }
649 // Note that high script ID overrides lower one on a tie, thus biasing
650 // towards non-Common script (if sorted that way in unicharset file).
651 unsigned max_sid = 0;
652 for (unsigned x = 1; x < max_script; x++) {
653 if (sid[x] >= sid[max_sid]) {
654 max_sid = x;
655 }
656 }
657 if (sid[max_sid] < length_ / 2) {
658 max_sid = unicharset_->null_sid();
659 }
660 return max_sid;
661}
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:681
int han_sid() const
Definition: unicharset.h:931
int get_script_table_size() const
Definition: unicharset.h:881
int hiragana_sid() const
Definition: unicharset.h:934
int null_sid() const
Definition: unicharset.h:916
int katakana_sid() const
Definition: unicharset.h:937

◆ has_rtl_unichar_id()

bool tesseract::WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 411 of file ratngs.cpp.

411 {
412 for (unsigned i = 0; i < length_; ++i) {
413 UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
414 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
415 return true;
416 }
417 }
418 return false;
419}
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:712

◆ init() [1/2]

void tesseract::WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 238 of file ratngs.cpp.

239 {
240 int src_string_len = strlen(src_string);
241 if (src_string_len == 0) {
242 this->init(8);
243 } else {
244 this->init(src_lengths ? strlen(src_lengths) : src_string_len);
245 length_ = reserved_;
246 int offset = 0;
247 for (unsigned i = 0; i < length_; ++i) {
248 int unichar_length = src_lengths ? src_lengths[i] : 1;
249 unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
250 state_[i] = 1;
251 certainties_[i] = src_certainty;
252 offset += unichar_length;
253 }
254 }
255 adjust_factor_ = 1.0f;
256 rating_ = src_rating;
257 certainty_ = src_certainty;
258 permuter_ = src_permuter;
259 dangerous_ambig_found_ = false;
260}
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

◆ init() [2/2]

void tesseract::WERD_CHOICE::init ( unsigned  reserved)
inline

Initializes WERD_CHOICE - reserves `reserved` slots in the unichar_ids_, script_pos_, state_ and certainties_ arrays (or clears them when `reserved` is 0). Sets other values to default (blank) values.

Definition at line 386 of file ratngs.h.

386 {
387 reserved_ = reserved;
388 if (reserved > 0) {
389 unichar_ids_.resize(reserved);
390 script_pos_.resize(reserved);
391 state_.resize(reserved);
392 certainties_.resize(reserved);
393 } else {
394 unichar_ids_.clear();
395 script_pos_.clear();
396 state_.clear();
397 certainties_.clear();
398 }
399 length_ = 0;
400 adjust_factor_ = 1.0f;
401 rating_ = 0.0;
402 certainty_ = FLT_MAX;
403 min_x_height_ = 0.0f;
404 max_x_height_ = FLT_MAX;
405 permuter_ = NO_PERM;
406 unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
407 dangerous_ambig_found_ = false;
408 }

◆ IsAllSpaces()

bool tesseract::WERD_CHOICE::IsAllSpaces ( ) const
inline

Definition at line 497 of file ratngs.h.

497 {
498 for (unsigned i = 0; i < length_; ++i) {
499 if (unichar_ids_[i] != UNICHAR_SPACE) {
500 return false;
501 }
502 }
503 return true;
504 }
@ UNICHAR_SPACE
Definition: unicharset.h:36

◆ length()

unsigned tesseract::WERD_CHOICE::length ( ) const
inline

Definition at line 287 of file ratngs.h.

287 {
288 return length_;
289 }

◆ make_bad()

void tesseract::WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 419 of file ratngs.h.

419 {
420 length_ = 0;
421 rating_ = kBadRating;
422 certainty_ = -FLT_MAX;
423 }
static const float kBadRating
Definition: ratngs.h:260

◆ MatrixCoord()

MATRIX_COORD tesseract::WERD_CHOICE::MatrixCoord ( unsigned  index) const

Definition at line 286 of file ratngs.cpp.

286 {
287 int col = 0;
288 for (unsigned i = 0; i < index; ++i) {
289 col += state_[i];
290 }
291 int row = col + state_[index] - 1;
292 return MATRIX_COORD(col, row);
293}

◆ max_x_height()

float tesseract::WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 324 of file ratngs.h.

324 {
325 return max_x_height_;
326 }

◆ min_x_height()

float tesseract::WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 321 of file ratngs.h.

321 {
322 return min_x_height_;
323 }

◆ operator+=()

WERD_CHOICE & tesseract::WERD_CHOICE::operator+= ( const WERD_CHOICE & second)

WERD_CHOICE::operator+=

Concatenates a second word rating onto the end of this one. The ratings are added and the certainty is the minimum of the two. If the permuters are NOT the same, the permuter is set to COMPOUND_PERM.

Definition at line 462 of file ratngs.cpp.

462 {
463 ASSERT_HOST(unicharset_ == second.unicharset_);
464 while (reserved_ < length_ + second.length()) {
465 this->double_the_size();
466 }
467 const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
468 for (unsigned i = 0; i < second.length(); ++i) {
469 unichar_ids_[length_ + i] = other_unichar_ids[i];
470 state_[length_ + i] = second.state_[i];
471 certainties_[length_ + i] = second.certainties_[i];
472 script_pos_[length_ + i] = second.BlobPosition(i);
473 }
474 length_ += second.length();
475 if (second.adjust_factor_ > adjust_factor_) {
476 adjust_factor_ = second.adjust_factor_;
477 }
478 rating_ += second.rating(); // add ratings
479 if (second.certainty() < certainty_) { // take min
480 certainty_ = second.certainty();
481 }
482 if (second.dangerous_ambig_found_) {
483 dangerous_ambig_found_ = true;
484 }
485 if (permuter_ == NO_PERM) {
486 permuter_ = second.permuter();
487 } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) {
488 permuter_ = COMPOUND_PERM;
489 }
490 return *this;
491}
#define ASSERT_HOST(x)
Definition: errcode.h:54
@ COMPOUND_PERM
Definition: ratngs.h:248

◆ operator=()

WERD_CHOICE & tesseract::WERD_CHOICE::operator= ( const WERD_CHOICE & source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 499 of file ratngs.cpp.

499 {
500 while (reserved_ < source.length()) {
501 this->double_the_size();
502 }
503
504 unicharset_ = source.unicharset_;
505 const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
506 for (unsigned i = 0; i < source.length(); ++i) {
507 unichar_ids_[i] = other_unichar_ids[i];
508 state_[i] = source.state_[i];
509 certainties_[i] = source.certainties_[i];
510 script_pos_[i] = source.BlobPosition(i);
511 }
512 length_ = source.length();
513 adjust_factor_ = source.adjust_factor_;
514 rating_ = source.rating();
515 certainty_ = source.certainty();
516 min_x_height_ = source.min_x_height();
517 max_x_height_ = source.max_x_height();
518 permuter_ = source.permuter();
519 dangerous_ambig_found_ = source.dangerous_ambig_found_;
520 return *this;
521}

◆ permuter()

uint8_t tesseract::WERD_CHOICE::permuter ( ) const
inline

Definition at line 331 of file ratngs.h.

331 {
332 return permuter_;
333 }

◆ permuter_name() [1/2]

const char * tesseract::WERD_CHOICE::permuter_name ( ) const

Definition at line 267 of file ratngs.cpp.

267 {
268 return kPermuterTypeNames[permuter_];
269}

◆ permuter_name() [2/2]

const char * tesseract::WERD_CHOICE::permuter_name ( uint8_t  permuter)
static

Definition at line 189 of file ratngs.cpp.

189 {
190 return kPermuterTypeNames[permuter];
191}
uint8_t permuter() const
Definition: ratngs.h:331

◆ print() [1/2]

void tesseract::WERD_CHOICE::print ( ) const
inline

Definition at line 561 of file ratngs.h.

561 {
562 this->print("");
563 }
void print() const
Definition: ratngs.h:561

◆ print() [2/2]

void tesseract::WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 689 of file ratngs.cpp.

689 {
690 tprintf("%s : ", msg);
691 for (unsigned i = 0; i < length_; ++i) {
692 tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
693 }
694 tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
695 adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
696 tprintf("pos");
697 for (unsigned i = 0; i < length_; ++i) {
698 tprintf("\t%s", ScriptPosToString(script_pos_[i]));
699 }
700 tprintf("\nstr");
701 for (unsigned i = 0; i < length_; ++i) {
702 tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
703 }
704 tprintf("\nstate:");
705 for (unsigned i = 0; i < length_; ++i) {
706 tprintf("\t%d ", state_[i]);
707 }
708 tprintf("\nC");
709 for (unsigned i = 0; i < length_; ++i) {
710 tprintf("\t%.3f", certainties_[i]);
711 }
712 tprintf("\n");
713}
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:193
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ print_state()

void tesseract::WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 716 of file ratngs.cpp.

716 {
717 tprintf("%s", msg);
718 for (unsigned i = 0; i < length_; ++i) {
719 tprintf(" %d", state_[i]);
720 }
721 tprintf("\n");
722}

◆ punct_stripped()

void tesseract::WERD_CHOICE::punct_stripped ( unsigned *  start,
unsigned *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 367 of file ratngs.cpp.

367 {
368 *start = 0;
369 *end = length();
370 while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
371 (*start)++;
372 }
373 while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
374 (*end)--;
375 }
376}

◆ rating()

float tesseract::WERD_CHOICE::rating ( ) const
inline

Definition at line 312 of file ratngs.h.

312 {
313 return rating_;
314 }

◆ remove_last_unichar_id()

void tesseract::WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 455 of file ratngs.h.

455 {
456 --length_;
457 }

◆ remove_unichar_id()

void tesseract::WERD_CHOICE::remove_unichar_id ( unsigned  index)
inline

Definition at line 458 of file ratngs.h.

458 {
459 this->remove_unichar_ids(index, 1);
460 }
void remove_unichar_ids(unsigned index, int num)
Definition: ratngs.cpp:325

◆ remove_unichar_ids()

void tesseract::WERD_CHOICE::remove_unichar_ids ( unsigned  start,
int  num 
)

remove_unichar_ids

Removes num unichar ids starting from index start from unichar_ids_ and updates length_ and fragment_lengths_ to reflect this change. Note: this function does not modify rating_ and certainty_.

Definition at line 325 of file ratngs.cpp.

325 {
326 ASSERT_HOST(start + num <= length_);
327 // Accumulate the states to account for the merged blobs.
328 for (int i = 0; i < num; ++i) {
329 if (start > 0) {
330 state_[start - 1] += state_[start + i];
331 } else if (start + num < length_) {
332 state_[start + num] += state_[start + i];
333 }
334 }
335 for (unsigned i = start; i + num < length_; ++i) {
336 unichar_ids_[i] = unichar_ids_[i + num];
337 script_pos_[i] = script_pos_[i + num];
338 state_[i] = state_[i + num];
339 certainties_[i] = certainties_[i + num];
340 }
341 length_ -= num;
342}

◆ reverse_and_mirror_unichar_ids()

void tesseract::WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 349 of file ratngs.cpp.

349 {
350 for (unsigned i = 0; i < length_ / 2; ++i) {
351 UNICHAR_ID tmp_id = unichar_ids_[i];
352 unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
353 unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
354 }
355 if (length_ % 2 != 0) {
356 unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]);
357 }
358}
int UNICHAR_ID
Definition: unichar.h:34
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:721

◆ ScriptPositionOf()

ScriptPos tesseract::WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET & unicharset,
const TBOX & blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 599 of file ratngs.cpp.

600 {
601 ScriptPos retval = tesseract::SP_NORMAL;
602 int top = blob_box.top();
603 int bottom = blob_box.bottom();
604 int min_bottom, max_bottom, min_top, max_top;
605 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
606
607 int sub_thresh_top = min_top - kMinSubscriptOffset;
608 int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
609 int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
610 if (bottom <= kMaxDropCapBottom) {
611 retval = tesseract::SP_DROPCAP;
612 } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
613 retval = tesseract::SP_SUBSCRIPT;
614 } else if (bottom > sup_thresh_bot) {
615 retval = tesseract::SP_SUPERSCRIPT;
616 }
617
618 if (print_debug) {
619 const char *pos = ScriptPosToString(retval);
620 tprintf(
621 "%s Character %s[bot:%d top: %d] "
622 "bot_range[%d,%d] top_range[%d, %d] "
623 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
624 pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top,
625 max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot);
626 }
627 return retval;
628}
const int kMaxDropCapBottom
Definition: ratngs.cpp:43
@ SP_SUBSCRIPT
Definition: ratngs.h:254
@ SP_DROPCAP
Definition: ratngs.h:254
const int kMinSubscriptOffset
Definition: ratngs.cpp:39
const int kMinSuperscriptOffset
Definition: ratngs.cpp:41
const int kBlnBaselineOffset
Definition: normalis.h:34
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

◆ set_adjust_factor()

void tesseract::WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 293 of file ratngs.h.

293 {
294 adjust_factor_ = factor;
295 }

◆ set_blob_choice()

void tesseract::WERD_CHOICE::set_blob_choice ( unsigned  index,
int  blob_count,
const BLOB_CHOICE * blob_choice
)

Definition at line 297 of file ratngs.cpp.

297 {
298 unichar_ids_[index] = blob_choice->unichar_id();
299 script_pos_[index] = tesseract::SP_NORMAL;
300 state_[index] = blob_count;
301 certainties_[index] = blob_choice->certainty();
302}

◆ set_certainty()

void tesseract::WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 357 of file ratngs.h.

357 {
358 certainty_ = new_val;
359 }

◆ set_dangerous_ambig_found_()

void tesseract::WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 351 of file ratngs.h.

351 {
352 dangerous_ambig_found_ = value;
353 }
int value

◆ set_length()

void tesseract::WERD_CHOICE::set_length ( unsigned  len)
inline

Definition at line 366 of file ratngs.h.

366 {
367 ASSERT_HOST(reserved_ >= len);
368 length_ = len;
369 }

◆ set_permuter()

void tesseract::WERD_CHOICE::set_permuter ( uint8_t  perm)
inline

Definition at line 360 of file ratngs.h.

360 {
361 permuter_ = perm;
362 }

◆ set_rating()

void tesseract::WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 354 of file ratngs.h.

354 {
355 rating_ = new_val;
356 }

◆ set_unichar_id() [1/2]

void tesseract::WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
unsigned  index 
)
inline

Definition at line 437 of file ratngs.h.

438 {
439 assert(index < length_);
440 unichar_ids_[index] = unichar_id;
441 state_[index] = blob_count;
442 certainties_[index] = certainty;
443 script_pos_[index] = SP_NORMAL;
444 rating_ += rating;
445 if (certainty < certainty_) {
446 certainty_ = certainty;
447 }
448 }

◆ set_unichar_id() [2/2]

void tesseract::WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
unsigned  index 
)
inline

Definition at line 344 of file ratngs.h.

344 {
345 assert(index < length_);
346 unichar_ids_[index] = unichar_id;
347 }

◆ set_unichars_in_script_order()

bool tesseract::WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 509 of file ratngs.h.

509 {
510 return unichars_in_script_order_ = in_script_order;
511 }

◆ set_x_heights()

void tesseract::WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 327 of file ratngs.h.

327 {
328 min_x_height_ = min_height;
329 max_x_height_ = max_height;
330 }

◆ SetAllScriptPositions()

void tesseract::WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 592 of file ratngs.cpp.

592 {
593 for (unsigned i = 0; i < length_; ++i) {
594 script_pos_[i] = position;
595 }
596}

◆ SetScriptPositions()

void tesseract::WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD word,
int  debug = 0 
)

Definition at line 528 of file ratngs.cpp.

528 {
529 // Initialize to normal.
530 for (unsigned i = 0; i < length_; ++i) {
531 script_pos_[i] = tesseract::SP_NORMAL;
532 }
533 if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
534 return;
535 }
536
537 unsigned position_counts[4] = {0, 0, 0, 0};
538
539 int chunk_index = 0;
540 for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
541 TBLOB *tblob = word->blobs[chunk_index];
542 int uni_id = unichar_id(blob_index);
543 TBOX blob_box = tblob->bounding_box();
544 if (!state_.empty()) {
545 for (int i = 1; i < state_[blob_index]; ++i) {
546 ++chunk_index;
547 tblob = word->blobs[chunk_index];
548 blob_box += tblob->bounding_box();
549 }
550 }
551 script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id);
552 if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
553 script_pos_[blob_index] = tesseract::SP_NORMAL;
554 }
555 position_counts[script_pos_[blob_index]]++;
556 }
557 // If almost everything looks like a superscript or subscript,
558 // we most likely just got the baseline wrong.
559 if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||
560 4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {
561 if (debug >= 2) {
562 tprintf(
563 "Most characters of %s are subscript or superscript.\n"
564 "That seems wrong, so I'll assume we got the baseline wrong\n",
565 unichar_string().c_str());
566 }
567 for (unsigned i = 0; i < length_; i++) {
568 ScriptPos sp = script_pos_[i];
569 if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
570 ASSERT_HOST(position_counts[sp] > 0);
571 position_counts[sp]--;
572 position_counts[tesseract::SP_NORMAL]++;
573 script_pos_[i] = tesseract::SP_NORMAL;
574 }
575 }
576 }
577
578 if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
579 tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
580 int chunk_index = 0;
581 for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {
582 if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
583 TBLOB *tblob = word->blobs[chunk_index];
584 ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
585 }
586 chunk_index += state_.empty() ? 1 : state_[blob_index];
587 }
588 }
589}
unsigned TotalOfStates() const
Definition: ratngs.cpp:676
static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:599
std::string & unichar_string()
Definition: ratngs.h:519

◆ shallow_copy()

WERD_CHOICE tesseract::WERD_CHOICE::shallow_copy ( unsigned  start,
unsigned  end 
) const

Definition at line 393 of file ratngs.cpp.

393 {
394 ASSERT_HOST(start <= length_);
395 ASSERT_HOST(end <= length_);
396 if (end < start) {
397 end = start;
398 }
399 WERD_CHOICE retval(unicharset_, end - start);
400 for (auto i = start; i < end; i++) {
401 retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
402 }
403 return retval;
404}
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:263

◆ state()

unsigned tesseract::WERD_CHOICE::state ( unsigned  index) const
inline

Definition at line 303 of file ratngs.h.

303 {
304 return state_[index];
305 }

◆ string_and_lengths()

void tesseract::WERD_CHOICE::string_and_lengths ( std::string *  word_str,
std::string *  word_lengths_str 
) const

string_and_lengths

Populates the given word_str with unichars from unichar_ids and word_lengths_str with the corresponding unichar lengths.

Definition at line 427 of file ratngs.cpp.

427 {
428 *word_str = "";
429 if (word_lengths_str != nullptr) {
430 *word_lengths_str = "";
431 }
432 for (unsigned i = 0; i < length_; ++i) {
433 const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
434 *word_str += ch;
435 if (word_lengths_str != nullptr) {
436 *word_lengths_str += (char)strlen(ch);
437 }
438 }
439}
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:287

◆ TotalOfStates()

unsigned tesseract::WERD_CHOICE::TotalOfStates ( ) const

Definition at line 676 of file ratngs.cpp.

676 {
677 unsigned total_chunks = 0;
678 for (unsigned i = 0; i < length_; ++i) {
679 total_chunks += state_[i];
680 }
681 return total_chunks;
682}

◆ unichar_id()

UNICHAR_ID tesseract::WERD_CHOICE::unichar_id ( unsigned  index) const
inline

Definition at line 299 of file ratngs.h.

299 {
300 assert(index < length_);
301 return unichar_ids_[index];
302 }

◆ unichar_ids()

const std::vector< UNICHAR_ID > & tesseract::WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 296 of file ratngs.h.

296 {
297 return unichar_ids_;
298 }

◆ unichar_lengths()

const std::string & tesseract::WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 533 of file ratngs.h.

533 {
534 this->string_and_lengths(&unichar_string_, &unichar_lengths_);
535 return unichar_lengths_;
536 }
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427

◆ unichar_string() [1/2]

std::string & tesseract::WERD_CHOICE::unichar_string ( )
inline

Definition at line 519 of file ratngs.h.

519 {
520 this->string_and_lengths(&unichar_string_, &unichar_lengths_);
521 return unichar_string_;
522 }

◆ unichar_string() [2/2]

const std::string & tesseract::WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 526 of file ratngs.h.

526 {
527 this->string_and_lengths(&unichar_string_, &unichar_lengths_);
528 return unichar_string_;
529 }

◆ unichars_in_script_order()

bool tesseract::WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 513 of file ratngs.h.

513 {
514 return unichars_in_script_order_;
515 }

◆ unicharset()

const UNICHARSET * tesseract::WERD_CHOICE::unicharset ( ) const
inline

Definition at line 281 of file ratngs.h.

281 {
282 return unicharset_;
283 }

◆ UpdateStateForSplit()

void tesseract::WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 664 of file ratngs.cpp.

664 {
665 int total_chunks = 0;
666 for (unsigned i = 0; i < length_; ++i) {
667 total_chunks += state_[i];
668 if (total_chunks > blob_position) {
669 ++state_[i];
670 return;
671 }
672 }
673}

Member Data Documentation

◆ kBadRating

const float tesseract::WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 260 of file ratngs.h.


The documentation for this class was generated from the following files: