tesseract v5.3.3.20231005
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract () override
 
Dict & getDict () override
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Image * mutable_pix_binary ()
 
Image pix_binary () const
 
Image pix_grey () const
 
void set_pix_grey (Image grey_pix)
 
Image pix_original () const
 
void set_pix_original (Image original_pix)
 
Image BestPix () const
 
void set_pix_thresholds (Image thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Image scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Image color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix, Image *music_mask_pix)
 
void PrerecAllWordsPar (const std::vector< WordData > &words)
 
bool TrainLineRecognizer (const char *input_imagename, const std::string &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
bool recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
bool check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
bool acceptable_number_string (const char *s, const char *lengths)
 
int16_t count_alphanums (const WERD_CHOICE &word)
 
int16_t count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
 
int init_tesseract (const std::string &datapath, const std::string &language, OcrEngineMode oem)
 
int init_tesseract_internal (const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
void recognize_page (std::string &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
 
void ParseLanguageString (const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
bool process_cmd_win_event (int32_t cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
bool word_display (PAGE_RES_IT *pr_it)
 
bool word_bln_display (PAGE_RES_IT *pr_it)
 
bool word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
bool word_set_display (PAGE_RES_IT *pr_it)
 
bool word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, int16_t pass)
 
bool one_ell_conflict (WERD_RES *word_res, bool update_map)
 
int16_t first_alphanum_index (const char *word, const char *word_lengths)
 
int16_t first_alphanum_offset (const char *word, const char *word_lengths)
 
int16_t alpha_count (const char *word, const char *word_lengths)
 
bool word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
int16_t count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
bool non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, int16_t pass)
 
int16_t safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
bool word_adaptable (WERD_RES *word, uint16_t mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, bool ok_dict_word)
 
bool potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
int16_t word_blob_quality (WERD_RES *word)
 
void word_char_quality (WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word)
 
int16_t count_outline_errs (char c, int16_t outline_count)
 
int16_t word_outline_errs (WERD_RES *word)
 
bool terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, int16_t &delete_mode)
 
int16_t failure_count (WERD_RES *word)
 
bool noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, std::vector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const std::string &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
 BOOL_VAR_H (tessedit_resegment_from_boxes)
 
 BOOL_VAR_H (tessedit_resegment_from_line_boxes)
 
 BOOL_VAR_H (tessedit_train_from_boxes)
 
 BOOL_VAR_H (tessedit_make_boxes_from_boxes)
 
 BOOL_VAR_H (tessedit_train_line_recognizer)
 
 BOOL_VAR_H (tessedit_dump_pageseg_images)
 
 BOOL_VAR_H (tessedit_do_invert)
 
 double_VAR_H (invert_threshold)
 
 INT_VAR_H (tessedit_pageseg_mode)
 
 INT_VAR_H (thresholding_method)
 
 BOOL_VAR_H (thresholding_debug)
 
 double_VAR_H (thresholding_window_size)
 
 double_VAR_H (thresholding_kfactor)
 
 double_VAR_H (thresholding_tile_size)
 
 double_VAR_H (thresholding_smooth_kernel_size)
 
 double_VAR_H (thresholding_score_fraction)
 
 INT_VAR_H (tessedit_ocr_engine_mode)
 
 STRING_VAR_H (tessedit_char_blacklist)
 
 STRING_VAR_H (tessedit_char_whitelist)
 
 STRING_VAR_H (tessedit_char_unblacklist)
 
 BOOL_VAR_H (tessedit_ambigs_training)
 
 INT_VAR_H (pageseg_devanagari_split_strategy)
 
 INT_VAR_H (ocr_devanagari_split_strategy)
 
 STRING_VAR_H (tessedit_write_params_to_file)
 
 BOOL_VAR_H (tessedit_adaption_debug)
 
 INT_VAR_H (bidi_debug)
 
 INT_VAR_H (applybox_debug)
 
 INT_VAR_H (applybox_page)
 
 STRING_VAR_H (applybox_exposure_pattern)
 
 BOOL_VAR_H (applybox_learn_chars_and_char_frags_mode)
 
 BOOL_VAR_H (applybox_learn_ngrams_mode)
 
 BOOL_VAR_H (tessedit_display_outwords)
 
 BOOL_VAR_H (tessedit_dump_choices)
 
 BOOL_VAR_H (tessedit_timing_debug)
 
 BOOL_VAR_H (tessedit_fix_fuzzy_spaces)
 
 BOOL_VAR_H (tessedit_unrej_any_wd)
 
 BOOL_VAR_H (tessedit_fix_hyphens)
 
 BOOL_VAR_H (tessedit_enable_doc_dict)
 
 BOOL_VAR_H (tessedit_debug_fonts)
 
 INT_VAR_H (tessedit_font_id)
 
 BOOL_VAR_H (tessedit_debug_block_rejection)
 
 BOOL_VAR_H (tessedit_enable_bigram_correction)
 
 BOOL_VAR_H (tessedit_enable_dict_correction)
 
 INT_VAR_H (tessedit_bigram_debug)
 
 BOOL_VAR_H (enable_noise_removal)
 
 INT_VAR_H (debug_noise_removal)
 
 double_VAR_H (noise_cert_basechar)
 
 double_VAR_H (noise_cert_disjoint)
 
 double_VAR_H (noise_cert_punc)
 
 double_VAR_H (noise_cert_factor)
 
 INT_VAR_H (noise_maxperblob)
 
 INT_VAR_H (noise_maxperword)
 
 INT_VAR_H (debug_x_ht_level)
 
 STRING_VAR_H (chs_leading_punct)
 
 STRING_VAR_H (chs_trailing_punct1)
 
 STRING_VAR_H (chs_trailing_punct2)
 
 double_VAR_H (quality_rej_pc)
 
 double_VAR_H (quality_blob_pc)
 
 double_VAR_H (quality_outline_pc)
 
 double_VAR_H (quality_char_pc)
 
 INT_VAR_H (quality_min_initial_alphas_reqd)
 
 INT_VAR_H (tessedit_tess_adaption_mode)
 
 BOOL_VAR_H (tessedit_minimal_rej_pass1)
 
 BOOL_VAR_H (tessedit_test_adaption)
 
 BOOL_VAR_H (test_pt)
 
 double_VAR_H (test_pt_x)
 
 double_VAR_H (test_pt_y)
 
 INT_VAR_H (multilang_debug_level)
 
 INT_VAR_H (paragraph_debug_level)
 
 BOOL_VAR_H (paragraph_text_based)
 
 BOOL_VAR_H (lstm_use_matrix)
 
 STRING_VAR_H (outlines_odd)
 
 STRING_VAR_H (outlines_2)
 
 BOOL_VAR_H (tessedit_good_quality_unrej)
 
 BOOL_VAR_H (tessedit_use_reject_spaces)
 
 double_VAR_H (tessedit_reject_doc_percent)
 
 double_VAR_H (tessedit_reject_block_percent)
 
 double_VAR_H (tessedit_reject_row_percent)
 
 double_VAR_H (tessedit_whole_wd_rej_row_percent)
 
 BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds)
 
 BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds)
 
 BOOL_VAR_H (tessedit_dont_blkrej_good_wds)
 
 BOOL_VAR_H (tessedit_dont_rowrej_good_wds)
 
 INT_VAR_H (tessedit_preserve_min_wd_len)
 
 BOOL_VAR_H (tessedit_row_rej_good_docs)
 
 double_VAR_H (tessedit_good_doc_still_rowrej_wd)
 
 BOOL_VAR_H (tessedit_reject_bad_qual_wds)
 
 BOOL_VAR_H (tessedit_debug_doc_rejection)
 
 BOOL_VAR_H (tessedit_debug_quality_metrics)
 
 BOOL_VAR_H (bland_unrej)
 
 double_VAR_H (quality_rowrej_pc)
 
 BOOL_VAR_H (unlv_tilde_crunching)
 
 BOOL_VAR_H (hocr_font_info)
 
 BOOL_VAR_H (hocr_char_boxes)
 
 BOOL_VAR_H (crunch_early_merge_tess_fails)
 
 BOOL_VAR_H (crunch_early_convert_bad_unlv_chs)
 
 double_VAR_H (crunch_terrible_rating)
 
 BOOL_VAR_H (crunch_terrible_garbage)
 
 double_VAR_H (crunch_poor_garbage_cert)
 
 double_VAR_H (crunch_poor_garbage_rate)
 
 double_VAR_H (crunch_pot_poor_rate)
 
 double_VAR_H (crunch_pot_poor_cert)
 
 double_VAR_H (crunch_del_rating)
 
 double_VAR_H (crunch_del_cert)
 
 double_VAR_H (crunch_del_min_ht)
 
 double_VAR_H (crunch_del_max_ht)
 
 double_VAR_H (crunch_del_min_width)
 
 double_VAR_H (crunch_del_high_word)
 
 double_VAR_H (crunch_del_low_word)
 
 double_VAR_H (crunch_small_outlines_size)
 
 INT_VAR_H (crunch_rating_max)
 
 INT_VAR_H (crunch_pot_indicators)
 
 BOOL_VAR_H (crunch_leave_ok_strings)
 
 BOOL_VAR_H (crunch_accept_ok)
 
 BOOL_VAR_H (crunch_leave_accept_strings)
 
 BOOL_VAR_H (crunch_include_numerals)
 
 INT_VAR_H (crunch_leave_lc_strings)
 
 INT_VAR_H (crunch_leave_uc_strings)
 
 INT_VAR_H (crunch_long_repetitions)
 
 INT_VAR_H (crunch_debug)
 
 INT_VAR_H (fixsp_non_noise_limit)
 
 double_VAR_H (fixsp_small_outlines_size)
 
 BOOL_VAR_H (tessedit_prefer_joined_punct)
 
 INT_VAR_H (fixsp_done_mode)
 
 INT_VAR_H (debug_fix_space_level)
 
 STRING_VAR_H (numeric_punctuation)
 
 INT_VAR_H (x_ht_acceptance_tolerance)
 
 INT_VAR_H (x_ht_min_change)
 
 INT_VAR_H (superscript_debug)
 
 double_VAR_H (superscript_worse_certainty)
 
 double_VAR_H (superscript_bettered_certainty)
 
 double_VAR_H (superscript_scaledown_ratio)
 
 double_VAR_H (subscript_max_y_top)
 
 double_VAR_H (superscript_min_y_bottom)
 
 BOOL_VAR_H (tessedit_write_block_separators)
 
 BOOL_VAR_H (tessedit_write_rep_codes)
 
 BOOL_VAR_H (tessedit_write_unlv)
 
 BOOL_VAR_H (tessedit_create_txt)
 
 BOOL_VAR_H (tessedit_create_hocr)
 
 BOOL_VAR_H (tessedit_create_alto)
 
 BOOL_VAR_H (tessedit_create_lstmbox)
 
 BOOL_VAR_H (tessedit_create_tsv)
 
 BOOL_VAR_H (tessedit_create_wordstrbox)
 
 BOOL_VAR_H (tessedit_create_pdf)
 
 BOOL_VAR_H (textonly_pdf)
 
 INT_VAR_H (jpg_quality)
 
 INT_VAR_H (user_defined_dpi)
 
 INT_VAR_H (min_characters_to_try)
 
 STRING_VAR_H (unrecognised_char)
 
 INT_VAR_H (suspect_level)
 
 INT_VAR_H (suspect_short_words)
 
 BOOL_VAR_H (suspect_constrain_1Il)
 
 double_VAR_H (suspect_rating_per_ch)
 
 double_VAR_H (suspect_accept_rating)
 
 BOOL_VAR_H (tessedit_minimal_rejection)
 
 BOOL_VAR_H (tessedit_zero_rejection)
 
 BOOL_VAR_H (tessedit_word_for_word)
 
 BOOL_VAR_H (tessedit_zero_kelvin_rejection)
 
 INT_VAR_H (tessedit_reject_mode)
 
 BOOL_VAR_H (tessedit_rejection_debug)
 
 BOOL_VAR_H (tessedit_flip_0O)
 
 double_VAR_H (tessedit_lower_flip_hyphen)
 
 double_VAR_H (tessedit_upper_flip_hyphen)
 
 BOOL_VAR_H (rej_trust_doc_dawg)
 
 BOOL_VAR_H (rej_1Il_use_dict_word)
 
 BOOL_VAR_H (rej_1Il_trust_permuter_type)
 
 BOOL_VAR_H (rej_use_tess_accepted)
 
 BOOL_VAR_H (rej_use_tess_blanks)
 
 BOOL_VAR_H (rej_use_good_perm)
 
 BOOL_VAR_H (rej_use_sensible_wd)
 
 BOOL_VAR_H (rej_alphas_in_number_perm)
 
 double_VAR_H (rej_whole_of_mostly_reject_word_fract)
 
 INT_VAR_H (tessedit_image_border)
 
 STRING_VAR_H (ok_repeated_ch_non_alphanum_wds)
 
 STRING_VAR_H (conflict_set_I_l_1)
 
 INT_VAR_H (min_sane_x_ht_pixels)
 
 BOOL_VAR_H (tessedit_create_boxfile)
 
 INT_VAR_H (tessedit_page_number)
 
 BOOL_VAR_H (tessedit_write_images)
 
 BOOL_VAR_H (interactive_display_mode)
 
 STRING_VAR_H (file_type)
 
 BOOL_VAR_H (tessedit_override_permuter)
 
 STRING_VAR_H (tessedit_load_sublangs)
 
 BOOL_VAR_H (tessedit_use_primary_params_model)
 
 double_VAR_H (min_orientation_margin)
 
 BOOL_VAR_H (textord_tabfind_show_vlines)
 
 BOOL_VAR_H (textord_use_cjk_fp_model)
 
 BOOL_VAR_H (poly_allow_detailed_fx)
 
 BOOL_VAR_H (tessedit_init_config_only)
 
 BOOL_VAR_H (textord_equation_detect)
 
 BOOL_VAR_H (textord_tabfind_vertical_text)
 
 BOOL_VAR_H (textord_tabfind_force_vertical_text)
 
 double_VAR_H (textord_tabfind_vertical_text_ratio)
 
 double_VAR_H (textord_tabfind_aligned_gap_fraction)
 
 INT_VAR_H (tessedit_parallelize)
 
 BOOL_VAR_H (preserve_interword_spaces)
 
 STRING_VAR_H (page_separator)
 
 INT_VAR_H (lstm_choice_mode)
 
 INT_VAR_H (lstm_choice_iterations)
 
 double_VAR_H (lstm_rating_coefficient)
 
 BOOL_VAR_H (pageseg_apply_music_mask)
 
FILE * init_recog_training (const char *filename)
 
void recog_training_segmented (const char *filename, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e., the word is done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 (all chars in a numeric word plus 2 sides of a "1" joined).

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

bool digit_or_numeric_punct (WERD_RES *word, int char_position)
 
int16_t eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

int16_t fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
int16_t worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
transform_to_next_perm()

Examines the current word list to find the smallest word gap size. Then walks the word list closing any gaps of this size by either inserting new combination words or extending existing ones.

The routine COULD be limited to stop it building words longer than N blobs.

If there are no more gaps then it DELETES the entire list and returns the empty list to cause termination.

void dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
 
bool fixspace_thinks_word_done (WERD_RES *word)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 BOOL_VAR_H (merge_fragments_in_matrix)
 
 BOOL_VAR_H (wordrec_enable_assoc)
 
 BOOL_VAR_H (force_word_assoc)
 
 INT_VAR_H (repair_unchopped_blobs)
 
 double_VAR_H (tessedit_certainty_threshold)
 
 INT_VAR_H (chop_debug)
 
 BOOL_VAR_H (chop_enable)
 
 BOOL_VAR_H (chop_vertical_creep)
 
 INT_VAR_H (chop_split_length)
 
 INT_VAR_H (chop_same_distance)
 
 INT_VAR_H (chop_min_outline_points)
 
 INT_VAR_H (chop_seam_pile_size)
 
 BOOL_VAR_H (chop_new_seam_pile)
 
 INT_VAR_H (chop_inside_angle)
 
 INT_VAR_H (chop_min_outline_area)
 
 double_VAR_H (chop_split_dist_knob)
 
 double_VAR_H (chop_overlap_knob)
 
 double_VAR_H (chop_center_knob)
 
 INT_VAR_H (chop_centered_maxwidth)
 
 double_VAR_H (chop_sharpness_knob)
 
 double_VAR_H (chop_width_change_knob)
 
 double_VAR_H (chop_ok_split)
 
 double_VAR_H (chop_good_split)
 
 INT_VAR_H (chop_x_y_weight)
 
 BOOL_VAR_H (assume_fixed_pitch_char_segment)
 
 INT_VAR_H (wordrec_debug_level)
 
 INT_VAR_H (wordrec_max_join_chunks)
 
 BOOL_VAR_H (wordrec_skip_no_truth_words)
 
 BOOL_VAR_H (wordrec_debug_blamer)
 
 BOOL_VAR_H (wordrec_run_blamer)
 
 INT_VAR_H (segsearch_debug_level)
 
 INT_VAR_H (segsearch_max_pain_points)
 
 INT_VAR_H (segsearch_max_futile_classifications)
 
 double_VAR_H (segsearch_max_char_wh_ratio)
 
 BOOL_VAR_H (save_alt_choices)
 
 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void program_editup (const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (int32_t elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, ScrollView::Color color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const std::vector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const std::vector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const std::vector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, unsigned *blob_number)
 
SEAM * improve_one_blob (const std::vector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, unsigned *blob_number)
 
SEAM * chop_one_blob (const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending)
 
int select_blob_to_split (const std::vector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
int GetFontinfoId (ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, std::vector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
 
ADAPT_TEMPLATES_STRUCT * ReadAdaptedTemplates (TFile *File)
 
void ConvertProto (PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class)
 
INT_TEMPLATES_STRUCT * CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, ADAPT_TEMPLATES_STRUCT *Templates)
 
void AmbigClassifier (const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES_STRUCT *templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, const std::vector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
std::string ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, std::vector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES_STRUCT *adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES_STRUCT * ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const char *filename)
 
 BOOL_VAR_H (allow_blob_division)
 
 BOOL_VAR_H (prioritize_division)
 
 BOOL_VAR_H (classify_enable_learning)
 
 INT_VAR_H (classify_debug_level)
 
 INT_VAR_H (classify_norm_method)
 
 double_VAR_H (classify_char_norm_range)
 
 double_VAR_H (classify_max_rating_ratio)
 
 double_VAR_H (classify_max_certainty_margin)
 
 BOOL_VAR_H (tess_cn_matching)
 
 BOOL_VAR_H (tess_bn_matching)
 
 BOOL_VAR_H (classify_enable_adaptive_matcher)
 
 BOOL_VAR_H (classify_use_pre_adapted_templates)
 
 BOOL_VAR_H (classify_save_adapted_templates)
 
 BOOL_VAR_H (classify_enable_adaptive_debugger)
 
 BOOL_VAR_H (classify_nonlinear_norm)
 
 INT_VAR_H (matcher_debug_level)
 
 INT_VAR_H (matcher_debug_flags)
 
 INT_VAR_H (classify_learning_debug_level)
 
 double_VAR_H (matcher_good_threshold)
 
 double_VAR_H (matcher_reliable_adaptive_result)
 
 double_VAR_H (matcher_perfect_threshold)
 
 double_VAR_H (matcher_bad_match_pad)
 
 double_VAR_H (matcher_rating_margin)
 
 double_VAR_H (matcher_avg_noise_size)
 
 INT_VAR_H (matcher_permanent_classes_min)
 
 INT_VAR_H (matcher_min_examples_for_prototyping)
 
 INT_VAR_H (matcher_sufficient_examples_for_prototyping)
 
 double_VAR_H (matcher_clustering_max_angle_delta)
 
 double_VAR_H (classify_misfit_junk_penalty)
 
 double_VAR_H (rating_scale)
 
 double_VAR_H (tessedit_class_miss_scale)
 
 double_VAR_H (classify_adapted_pruning_factor)
 
 double_VAR_H (classify_adapted_pruning_threshold)
 
 INT_VAR_H (classify_adapt_proto_threshold)
 
 INT_VAR_H (classify_adapt_feature_threshold)
 
 BOOL_VAR_H (disable_character_fragments)
 
 double_VAR_H (classify_character_fragments_garbage_certainty_threshold)
 
 BOOL_VAR_H (classify_debug_character_fragments)
 
 BOOL_VAR_H (matcher_debug_separate_windows)
 
 STRING_VAR_H (classify_learn_debug_str)
 
 INT_VAR_H (classify_class_pruner_threshold)
 
 INT_VAR_H (classify_class_pruner_multiplier)
 
 INT_VAR_H (classify_cp_cutoff_strength)
 
 INT_VAR_H (classify_integer_matcher_multiplier)
 
 BOOL_VAR_H (classify_bln_numeric_mode)
 
 double_VAR_H (speckle_large_max_size)
 
 double_VAR_H (speckle_rating_penalty)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const std::string &argv0, const std::string &basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 
 INT_VAR_H (ambigs_debug_level)
 
 BOOL_VAR_H (use_ambigs_for_adaption)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, std::vector< INT_FEATURE_STRUCT > *bl_features, std::vector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, std::vector< int > *outline_cn_counts)
 
- Public Attributes inherited from tesseract::Wordrec
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
INT_TEMPLATES_STRUCT * PreTrainedTemplates = nullptr
 
ADAPT_TEMPLATES_STRUCT * AdaptedTemplates = nullptr
 
ADAPT_TEMPLATES_STRUCT * BackupAdaptedTemplates = nullptr
 
BIT_VECTOR AllProtosOn = nullptr
 
BIT_VECTOR AllConfigsOn = nullptr
 
BIT_VECTOR AllConfigsOff = nullptr
 
BIT_VECTOR TempProtoMask = nullptr
 
NORM_PROTOS * NormProtos = nullptr
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning = true
 
- Public Attributes inherited from tesseract::CCUtil
std::string datadir
 
std::string imagebasename
 
std::string lang
 
std::string language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
std::string imagefile
 
std::string directory
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, std::vector< SegSearchPending > &pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, std::string &blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_ = nullptr
 

Detailed Description

Definition at line 178 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 53 of file tesseractclass.cpp.

54 : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
55 "Take segmentation and labeling from box file", this->params())
56 , BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
57 "Conversion of word/line box file to char box file", this->params())
58 , BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars",
59 this->params())
60 , BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars",
61 this->params())
62 , BOOL_MEMBER(tessedit_train_line_recognizer, false,
63 "Break input into lines and remap boxes if present", this->params())
64 , BOOL_MEMBER(tessedit_dump_pageseg_images, false,
65 "Dump intermediate images made during page segmentation", this->params())
66 // TODO: remove deprecated tessedit_do_invert in release 6.
67 , BOOL_MEMBER(tessedit_do_invert, true,
68 "Try inverted line image if necessary (deprecated, will be "
69 "removed in release 6, use the 'invert_threshold' parameter instead)",
70 this->params())
71 , double_MEMBER(invert_threshold, 0.7,
72 "For lines with a mean confidence below this value, OCR is also tried with an inverted image",
73 this->params())
74 ,
75 // The default for pageseg_mode is the old behaviour, so as not to
76 // upset anything that relies on that.
77 INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
78 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, "
79 "4=column,"
80 " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
81 "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
82 " (Values from PageSegMode enum in tesseract/publictypes.h)",
83 this->params())
84 , INT_MEMBER(thresholding_method,
85 static_cast<int>(ThresholdMethod::Otsu),
86 "Thresholding method: 0 = Otsu, 1 = LeptonicaOtsu, 2 = "
87 "Sauvola",
88 this->params())
89 , BOOL_MEMBER(thresholding_debug, false,
90 "Debug the thresholding process",
91 this->params())
92 , double_MEMBER(thresholding_window_size, 0.33,
93 "Window size for measuring local statistics (to be "
94 "multiplied by image DPI). "
95 "This parameter is used by the Sauvola thresholding method",
96 this->params())
97 , double_MEMBER(thresholding_kfactor, 0.34,
98 "Factor for reducing threshold due to variance. "
99 "This parameter is used by the Sauvola thresholding method."
100 " Normal range: 0.2-0.5",
101 this->params())
102 , double_MEMBER(thresholding_tile_size, 0.33,
103 "Desired tile size (to be multiplied by image DPI). "
104 "This parameter is used by the LeptonicaOtsu thresholding "
105 "method",
106 this->params())
107 , double_MEMBER(thresholding_smooth_kernel_size, 0.0,
108 "Size of convolution kernel applied to threshold array "
109 "(to be multiplied by image DPI). Use 0 for no smoothing. "
110 "This parameter is used by the LeptonicaOtsu thresholding "
111 "method",
112 this->params())
113 , double_MEMBER(thresholding_score_fraction, 0.1,
114 "Fraction of the max Otsu score. "
115 "This parameter is used by the LeptonicaOtsu thresholding "
116 "method. "
117 "For standard Otsu use 0.0, otherwise 0.1 is recommended",
118 this->params())
119 , INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
120 "Which OCR engine(s) to run (Tesseract, LSTM, both)."
121 " Defaults to loading and running the most accurate"
122 " available.",
123 this->params())
124 , STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize",
125 this->params())
126 , STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params())
127 , STRING_MEMBER(tessedit_char_unblacklist, "",
128 "List of chars to override tessedit_char_blacklist", this->params())
129 , BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities",
130 this->params())
131 , INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
132 "Whether to use the top-line splitting process for Devanagari "
133 "documents while performing page-segmentation.",
134 this->params())
135 , INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
136 "Whether to use the top-line splitting process for Devanagari "
137 "documents while performing ocr.",
138 this->params())
139 , STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.",
140 this->params())
141 , BOOL_MEMBER(tessedit_adaption_debug, false,
142 "Generate and print debug"
143 " information for adaption",
144 this->params())
145 , INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params())
146 , INT_MEMBER(applybox_debug, 1, "Debug level", this->params())
147 , INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params())
148 , STRING_MEMBER(applybox_exposure_pattern, ".exp",
149 "Exposure value follows"
150 " this pattern in the image filename. The name of the image"
151 " files are expected to be in the form"
152 " [lang].[fontname].exp[num].tif",
153 this->params())
154 , BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
155 "Learn both character fragments (as is done in the"
156 " special low exposure mode) as well as unfragmented"
157 " characters.",
158 this->params())
159 , BOOL_MEMBER(applybox_learn_ngrams_mode, false,
160 "Each bounding box"
161 " is assumed to contain ngrams. Only learn the ngrams"
162 " whose outlines overlap horizontally.",
163 this->params())
164 , BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params())
165 , BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params())
166 , BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params())
167 , BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params())
168 , BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility",
169 this->params())
170 , BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params())
171 , BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",
172 this->params())
173 , BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())
174 , INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())
175 , BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())
176 , BOOL_MEMBER(tessedit_enable_bigram_correction, true,
177 "Enable correction based on the word bigram dictionary.", this->params())
178 , BOOL_MEMBER(tessedit_enable_dict_correction, false,
179 "Enable single word correction based on the dictionary.", this->params())
180 , INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.",
181 this->params())
182 , BOOL_MEMBER(enable_noise_removal, true,
183 "Remove and conditionally reassign small outlines when they"
184 " confuse layout analysis, determining diacritics vs noise",
185 this->params())
186 , INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params())
187 ,
188 // Worst (min) certainty, for which a diacritic is allowed to make the
189 // base
190 // character worse and still be included.
191 double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params())
192 ,
193 // Worst (min) certainty, for which a non-overlapping diacritic is allowed
194 // to make the base character worse and still be included.
195 double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params())
196 ,
197 // Worst (min) certainty, for which a diacritic is allowed to make a new
198 // stand-alone blob.
199 double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params())
200 ,
201 // Factor of certainty margin for adding diacritics to not count as worse.
202 double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint",
203 this->params())
204 , INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params())
205 , INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params())
206 , INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params())
207 , STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params())
208 , STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params())
209 , STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params())
210 , double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params())
211 , double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params())
212 , double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit",
213 this->params())
214 , double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params())
215 , INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params())
216 , INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess",
217 this->params())
218 , BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output",
219 this->params())
220 , BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params())
221 , BOOL_MEMBER(test_pt, false, "Test for point", this->params())
222 , double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params())
223 , double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params())
224 , INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params())
225 , INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params())
226 , BOOL_MEMBER(paragraph_text_based, true,
227 "Run paragraph detection on the post-text-recognition "
228 "(more accurate)",
229 this->params())
230 , BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params())
231 , STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params())
232 , STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params())
233 , BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs",
234 this->params())
235 , BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params())
236 , double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc",
237 this->params())
238 , double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block",
239 this->params())
240 , double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row",
241 this->params())
242 , double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
243 "Number of row rejects in whole word rejects"
244 " which prevents whole row rejection",
245 this->params())
246 , BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
247 "Only rej partially rejected words in block rejection", this->params())
248 , BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
249 "Only rej partially rejected words in row rejection", this->params())
250 , BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric",
251 this->params())
252 , BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric",
253 this->params())
254 , INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this",
255 this->params())
256 , BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs",
257 this->params())
258 , double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
259 "rej good doc wd if more than this fraction rejected", this->params())
260 , BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params())
261 , BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params())
262 , BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file",
263 this->params())
264 , BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params())
265 , double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params())
266 , BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params())
267 , BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
268 , BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
269 this->params())
270 , BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
271 , BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
272 , double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
273 , BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params())
274 , double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params())
275 , double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params())
276 , double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params())
277 , double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params())
278 , double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params())
279 , double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params())
280 , double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params())
281 , double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params())
282 , double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params())
283 , double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params())
284 , double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params())
285 , double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params())
286 , INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params())
287 , INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params())
288 , BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params())
289 , BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params())
290 , BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings",
291 this->params())
292 , BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params())
293 , INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings",
294 this->params())
295 , INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings",
296 this->params())
297 , INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params())
298 , INT_MEMBER(crunch_debug, 0, "As it says", this->params())
299 , INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params())
300 , double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params())
301 , BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params())
302 , INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params())
303 , INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params())
304 , STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params())
305 , INT_MEMBER(x_ht_acceptance_tolerance, 8,
306 "Max allowed deviation of blob top outside of font data", this->params())
307 , INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params())
308 , INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params())
309 , double_MEMBER(superscript_worse_certainty, 2.0,
310 "How many times worse "
311 "certainty does a superscript position glyph need to be for "
312 "us to try classifying it as a char with a different "
313 "baseline?",
314 this->params())
315 , double_MEMBER(superscript_bettered_certainty, 0.97,
316 "What reduction in "
317 "badness do we think sufficient to choose a superscript "
318 "over what we'd thought. For example, a value of 0.6 means "
319 "we want to reduce badness of certainty by at least 40%",
320 this->params())
321 , double_MEMBER(superscript_scaledown_ratio, 0.4,
322 "A superscript scaled down more than this is unbelievably "
323 "small. For example, 0.3 means we expect the font size to "
324 "be no smaller than 30% of the text line font size.",
325 this->params())
326 , double_MEMBER(subscript_max_y_top, 0.5,
327 "Maximum top of a character measured as a multiple of "
328 "x-height above the baseline for us to reconsider whether "
329 "it's a subscript.",
330 this->params())
331 , double_MEMBER(superscript_min_y_bottom, 0.3,
332 "Minimum bottom of a character measured as a multiple of "
333 "x-height above the baseline for us to reconsider whether "
334 "it's a superscript.",
335 this->params())
336 , BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output",
337 this->params())
338 , BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params())
339 , BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params())
340 , BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
341 , BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
342 , BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
343 , BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
344 this->params())
345 , BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
346 , BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
347 this->params())
348 , BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params())
349 , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
350 this->params())
351 , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
352 , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
353 , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
354 this->params())
355 , STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params())
356 , INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params())
357 , INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params())
358 , BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params())
359 , double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params())
360 , double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params())
361 , BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params())
362 , BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params())
363 , BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD",
364 this->params())
365 , BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL",
366 this->params())
367 , INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params())
368 , BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params())
369 , BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params())
370 , double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params())
371 , double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params())
372 , BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params())
373 , BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params())
374 , BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params())
375 , BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params())
376 , BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params())
377 , BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params())
378 , BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params())
379 , BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params())
380 , double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params())
381 , INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params())
382 , STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params())
383 , STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params())
384 , INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params())
385 , BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params())
386 , INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process",
387 this->params())
388 , BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params())
389 , BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params())
390 , STRING_MEMBER(file_type, ".tif", "Filename extension", this->params())
391 , BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params())
392 , STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one",
393 this->params())
394 , BOOL_MEMBER(tessedit_use_primary_params_model, false,
395 "In multilingual mode use params model of the"
396 " primary language",
397 this->params())
398 , double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin",
399 this->params())
400 , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
401 , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
402 , BOOL_MEMBER(poly_allow_detailed_fx, false,
403 "Allow feature extractors to see the original outline", this->params())
404 , BOOL_INIT_MEMBER(tessedit_init_config_only, false,
405 "Only initialize with the config file. Useful if the "
406 "instance is not going to be used for OCR but say only "
407 "for layout analysis.",
408 this->params())
409#ifndef DISABLED_LEGACY_ENGINE
410 , BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params())
411#endif // ndef DISABLED_LEGACY_ENGINE
412 , BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params())
413 , BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode",
414 this->params())
415 , double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
416 "Fraction of textlines deemed vertical to use vertical page "
417 "mode",
418 this->params())
419 , double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
420 "Fraction of height used as a minimum gap for aligned blobs.", this->params())
421 , INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params())
422 , BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces",
423 this->params())
424 , STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)",
425 this->params())
426 , INT_MEMBER(lstm_choice_mode, 0,
427 "Allows to include alternative symbols choices in the hOCR output. "
428 "Valid input values are 0, 1 and 2. 0 is the default value. "
429 "With 1 the alternative symbol choices per timestep are included. "
430 "With 2 alternative symbol choices are extracted from the CTC "
431 "process instead of the lattice. The choices are mapped per "
432 "character.",
433 this->params())
434 , INT_MEMBER(lstm_choice_iterations, 5,
435 "Sets the number of cascading iterations for the Beamsearch in "
436 "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
437 "value greater than 0 to produce results.",
438 this->params())
439 , double_MEMBER(lstm_rating_coefficient, 5,
440 "Sets the rating coefficient for the lstm choices. The smaller the "
441 "coefficient, the better are the ratings for each choice and less "
442 "information is lost due to the cut off at 0. The standard value is "
443 "5",
444 this->params())
445 , BOOL_MEMBER(pageseg_apply_music_mask, false,
446 "Detect music staff and remove intersecting components", this->params())
447 ,
448
449 backup_config_file_(nullptr)
450 , pix_binary_(nullptr)
451 , pix_grey_(nullptr)
452 , pix_original_(nullptr)
453 , pix_thresholds_(nullptr)
454 , source_resolution_(0)
455 , textord_(this)
456 , right_to_left_(false)
457 , scaled_color_(nullptr)
458 , scaled_factor_(-1)
459 , deskew_(1.0f, 0.0f)
460 , reskew_(1.0f, 0.0f)
461 , most_recently_used_(this)
462 , font_table_size_(0)
463#ifndef DISABLED_LEGACY_ENGINE
464 , equ_detect_(nullptr)
465#endif // ndef DISABLED_LEGACY_ENGINE
466 , lstm_recognizer_(nullptr)
467 , train_line_page_num_(0) {}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:369
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:377
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:379
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:375
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:373
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:371
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:166
ParamsVectors * params()
Definition: ccutil.h:53

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )
override

Definition at line 469 of file tesseractclass.cpp.

469 {
470 Clear();
471 pix_original_.destroy();
473 for (auto *lang : sub_langs_) {
474 delete lang;
475 }
476 delete lstm_recognizer_;
477 lstm_recognizer_ = nullptr;
478}
void destroy()
Definition: image.cpp:32
std::string lang
Definition: ccutil.h:59

Member Function Documentation

◆ acceptable_number_string()

bool tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 386 of file output.cpp.

386 {
387 bool prev_digit = false;
388
389 if (*lengths == 1 && *s == '(') {
390 s++;
391 }
392
393 if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
394 s++;
395 }
396
397 for (; *s != '\0'; s += *(lengths++)) {
398 if (unicharset.get_isdigit(s, *lengths)) {
399 prev_digit = true;
400 } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
401 prev_digit = false;
402 } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
403 ((*s == '%') || (*s == ')'))) {
404 return true;
405 } else if (prev_digit && *lengths == 1 && (*s == '%') &&
406 (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
407 (*(s + *lengths + *(lengths + 1)) == '\0')) {
408 return true;
409 } else {
410 return false;
411 }
412 }
413 return true;
414}
UNICHARSET unicharset
Definition: ccutil.h:61
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET & char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1692 of file control.cpp.

1693 {
1694 int i = 0;
1695 int offset = 0;
1696 int leading_punct_count;
1697 int upper_count = 0;
1698 int hyphen_pos = -1;
1699 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1700
1701 if (strlen(lengths) > 20) {
1702 return word_type;
1703 }
1704
1705 /* Single Leading punctuation char*/
1706
1707 if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
1708 offset += lengths[i++];
1709 }
1710 leading_punct_count = i;
1711
1712 /* Initial cap */
1713 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1714 offset += lengths[i++];
1715 upper_count++;
1716 }
1717 if (upper_count > 1) {
1718 word_type = AC_UPPER_CASE;
1719 } else {
1720 /* Lower case word, possibly with an initial cap */
1721 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1722 offset += lengths[i++];
1723 }
1724 if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
1725 goto not_a_word;
1726 }
1727 /*
1728 Allow a single hyphen in a lower case word
1729 - don't trust upper case - I've seen several cases of "H" -> "I-I"
1730 */
1731 if (lengths[i] == 1 && s[offset] == '-') {
1732 hyphen_pos = i;
1733 offset += lengths[i++];
1734 if (s[offset] != '\0') {
1735 while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
1736 offset += lengths[i++];
1737 }
1738 if (i < hyphen_pos + 3) {
1739 goto not_a_word;
1740 }
1741 }
1742 } else {
1743 /* Allow "'s" in NON hyphenated lower case words */
1744 if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
1745 (s[offset + lengths[i]] == 's')) {
1746 offset += lengths[i++];
1747 offset += lengths[i++];
1748 }
1749 }
1750 if (upper_count > 0) {
1751 word_type = AC_INITIAL_CAP;
1752 } else {
1753 word_type = AC_LOWER_CASE;
1754 }
1755 }
1756
1757 /* Up to two different, constrained trailing punctuation chars */
1758 if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
1759 offset += lengths[i++];
1760 }
1761 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
1762 chs_trailing_punct2.contains(s[offset])) {
1763 offset += lengths[i++];
1764 }
1765
1766 if (s[offset] != '\0') {
1767 word_type = AC_UNACCEPTABLE;
1768 }
1769
1770not_a_word:
1771
1772 if (word_type == AC_UNACCEPTABLE) {
1773 /* Look for abbreviation string */
1774 i = 0;
1775 offset = 0;
1776 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1777 word_type = AC_UC_ABBREV;
1778 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
1779 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1780 offset += lengths[i++];
1781 offset += lengths[i++];
1782 }
1783 } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1784 word_type = AC_LC_ABBREV;
1785 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
1786 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1787 offset += lengths[i++];
1788 offset += lengths[i++];
1789 }
1790 }
1791 if (s[offset] != '\0') {
1792 word_type = AC_UNACCEPTABLE;
1793 }
1794 }
1795
1796 return word_type;
1797}
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
@ AC_UC_ABBREV
A.B.C.
Definition: control.h:34
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32
@ AC_LC_ABBREV
a.b.c.
Definition: control.h:33
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:30

◆ alpha_count()

int16_t tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 483 of file reject.cpp.

483 {
484 int16_t i;
485 int16_t offset;
486 int16_t count = 0;
487
488 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489 if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
490 count++;
491 }
492 }
493 return count;
494}
int * count
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT * pr_it,
FILE *  output_file 
)

Definition at line 203 of file recogtraining.cpp.

204 {
205 // Classify word.
206 fflush(stdout);
207 WordData word_data(*pr_it);
208 SetupWordPassN(1, &word_data);
209 classify_word_and_language(1, pr_it, &word_data);
210 WERD_RES *werd_res = word_data.word;
211 WERD_CHOICE *best_choice = werd_res->best_choice;
212 ASSERT_HOST(best_choice != nullptr);
213
214 // Compute the number of unichars in the label.
215 std::vector<UNICHAR_ID> encoding;
216 if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
217 tprintf("Not outputting illegal unichar %s\n", label);
218 return;
219 }
220
221 // Dump all paths through the ratings matrix (which is normally small).
222 int dim = werd_res->ratings->dimension();
223 const auto **blob_choices = new const BLOB_CHOICE *[dim];
224 PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
225 delete[] blob_choices;
226}
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 302 of file tesseractclass.h.

302 {
303 if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
304 return true;
305 }
306 for (auto &lang : sub_langs_) {
307 if (lang->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
308 return true;
309 }
310 }
311 return false;
312 }
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:264

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 290 of file tesseractclass.h.

290 {
291 if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
292 return true;
293 }
294 for (auto &lang : sub_langs_) {
295 if (lang->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
296 return true;
297 }
298 }
299 return false;
300 }

◆ ApplyBoxes()

PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const char *  filename,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 110 of file applybox.cpp.

111 {
112 std::vector<TBOX> boxes;
113 std::vector<std::string> texts, full_texts;
114 if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
115 return nullptr; // Can't do it.
116 }
117
118 const int box_count = boxes.size();
119 int box_failures = 0;
120
121 // In word mode, we use the boxes to make a word for each box, but
122 // in blob mode we use the existing words and maximally chop them first.
123 PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
124 clear_any_old_text(block_list);
125
126 for (int i = 0; i < box_count; i++) {
127 bool foundit = false;
128 if (page_res != nullptr) {
129 foundit =
130 ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
131 (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
132 } else {
133 foundit = ResegmentWordBox(block_list, boxes[i],
134 (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
135 }
136 if (!foundit) {
137 box_failures++;
138 ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
139 }
140 }
141
142 if (page_res == nullptr) {
143 // In word/line mode, we now maximally chop all the words and resegment
144 // them with the classifier.
145 page_res = SetupApplyBoxes(boxes, block_list);
146 ReSegmentByClassification(page_res);
147 }
148 if (applybox_debug > 0) {
149 tprintf("APPLY_BOXES:\n");
150 tprintf(" Boxes read from boxfile: %6d\n", box_count);
151 if (box_failures > 0) {
152 tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
153 }
154 }
155 TidyUp(page_res);
156 return page_res;
157}
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:76
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:414
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:685
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:310
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:495
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:743
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining ( const std::string &  fontname,
PAGE_RES * page_res 
)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 751 of file applybox.cpp.

751 {
752 PAGE_RES_IT pr_it(page_res);
753 int word_count = 0;
754 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
755 LearnWord(fontname.c_str(), word_res);
756 ++word_count;
757 }
758 tprintf("Generated training data for %d words\n", word_count);
759}
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const std::vector< C_OUTLINE * > &  outlines,
int  pass,
WERD * real_word,
PAGE_RES_IT * pr_it,
std::vector< bool > *  word_wanted,
std::vector< C_BLOB * > *  target_blobs 
)

Definition at line 1036 of file control.cpp.

1039 {
1040 std::vector<bool> blob_wanted;
1041 word_wanted->clear();
1042 word_wanted->resize(outlines.size());
1043 target_blobs->clear();
1044 target_blobs->resize(outlines.size());
1045 // Check for outlines that need to be turned into stand-alone blobs.
1046 for (unsigned i = 0; i < outlines.size(); ++i) {
1047 if (outlines[i] == nullptr) {
1048 continue;
1049 }
1050 // Get a set of adjacent outlines that don't overlap any existing blob.
1051 blob_wanted.clear();
1052 blob_wanted.resize(outlines.size());
1053 int num_blob_outlines = 0;
1054 TBOX total_ol_box(outlines[i]->bounding_box());
1055 while (i < outlines.size() && outlines[i] != nullptr) {
1056 blob_wanted[i] = true;
1057 total_ol_box += outlines[i]->bounding_box();
1058 ++i;
1059 ++num_blob_outlines;
1060 }
1061 // Find the insertion point.
1062 C_BLOB_IT blob_it(real_word->cblob_list());
1063 while (!blob_it.at_last() &&
1064 blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
1065 blob_it.forward();
1066 }
1067 // Choose which combination of them we actually want and where to put
1068 // them.
1069 if (debug_noise_removal) {
1070 tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1071 }
1072 C_BLOB *left_blob = blob_it.data();
1073 TBOX left_box = left_blob->bounding_box();
1074 C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1075 if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1076 !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1077 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
1078 num_blob_outlines, &blob_wanted)) {
1079 if (debug_noise_removal) {
1080 tprintf("Added to left blob\n");
1081 }
1082 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1083 if (blob_wanted[j]) {
1084 (*word_wanted)[j] = true;
1085 (*target_blobs)[j] = left_blob;
1086 }
1087 }
1088 } else if (right_blob != nullptr &&
1089 (!left_box.x_overlap(total_ol_box) ||
1090 right_blob->bounding_box().x_overlap(total_ol_box)) &&
1091 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
1092 num_blob_outlines, &blob_wanted)) {
1093 if (debug_noise_removal) {
1094 tprintf("Added to right blob\n");
1095 }
1096 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1097 if (blob_wanted[j]) {
1098 (*word_wanted)[j] = true;
1099 (*target_blobs)[j] = right_blob;
1100 }
1101 }
1102 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
1103 num_blob_outlines, &blob_wanted)) {
1104 if (debug_noise_removal) {
1105 tprintf("Fitted between blobs\n");
1106 }
1107 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1108 if (blob_wanted[j]) {
1109 (*word_wanted)[j] = true;
1110 (*target_blobs)[j] = nullptr;
1111 }
1112 }
1113 }
1114 }
1115}
@ TBOX
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
Definition: control.cpp:1120

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const std::vector< C_OUTLINE * > &  outlines,
int  pass,
WERD * real_word,
PAGE_RES_IT * pr_it,
std::vector< bool > *  word_wanted,
std::vector< bool > *  overlapped_any_blob,
std::vector< C_BLOB * > *  target_blobs 
)

Definition at line 981 of file control.cpp.

985 {
986 std::vector<bool> blob_wanted;
987 word_wanted->clear();
988 word_wanted->resize(outlines.size());
989 overlapped_any_blob->clear();
990 overlapped_any_blob->resize(outlines.size());
991 target_blobs->clear();
992 target_blobs->resize(outlines.size());
993 // For each real blob, find the outlines that seriously overlap it.
994 // A single blob could be several merged characters, so there can be quite
995 // a few outlines overlapping, and the full engine needs to be used to chop
996 // and join to get a sensible result.
997 C_BLOB_IT blob_it(real_word->cblob_list());
998 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
999 C_BLOB *blob = blob_it.data();
1000 const TBOX blob_box = blob->bounding_box();
1001 blob_wanted.clear();
1002 blob_wanted.resize(outlines.size());
1003 int num_blob_outlines = 0;
1004 for (unsigned i = 0; i < outlines.size(); ++i) {
1005 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1006 blob_wanted[i] = true;
1007 (*overlapped_any_blob)[i] = true;
1008 ++num_blob_outlines;
1009 }
1010 }
1011 if (debug_noise_removal) {
1012 tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1013 blob_box.print();
1014 }
1015 // If any outlines overlap the blob, and not too many, classify the blob
1016 // (using the full engine, languages and all), and choose the maximal
1017 // combination of outlines that doesn't hurt the end-result classification
1018 // by too much. Mark them as wanted.
1019 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1020 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1021 num_blob_outlines, &blob_wanted)) {
1022 for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1023 if (blob_wanted[i]) {
1024 // Claim the outline and record where it is going.
1025 (*word_wanted)[i] = true;
1026 (*target_blobs)[i] = blob;
1027 }
1028 }
1029 }
1030 }
1031 }
1032}

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract * osd_tess,
OSResults * osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired, (osd or only_osd) then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 199 of file pagesegmain.cpp.

200 {
201 Image photomask_pix = nullptr;
202 Image musicmask_pix = nullptr;
203 // The blocks made by the ColumnFinder. Moved to blocks before return.
204 BLOCK_LIST found_blocks;
205 TO_BLOCK_LIST temp_blocks;
206
207 ColumnFinder *finder = SetupPageSegAndDetectOrientation(
208 pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
209 pageseg_apply_music_mask ? &musicmask_pix : nullptr);
210 int result = 0;
211 if (finder != nullptr) {
212 TO_BLOCK_IT to_block_it(&temp_blocks);
213 TO_BLOCK *to_block = to_block_it.data();
214 if (musicmask_pix != nullptr) {
215 // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
216 // blocks separately. For now combine with photomask_pix.
217 photomask_pix |= musicmask_pix;
218 }
219#ifndef DISABLED_LEGACY_ENGINE
220 if (equ_detect_) {
221 finder->SetEquationDetect(equ_detect_);
222 }
223#endif // ndef DISABLED_LEGACY_ENGINE
224 result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
225 photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
226 &found_blocks, diacritic_blobs, to_blocks);
227 if (result >= 0) {
228 finder->GetDeskewVectors(&deskew_, &reskew_);
229 }
230 delete finder;
231 }
232 photomask_pix.destroy();
233 musicmask_pix.destroy();
234 if (result < 0) {
235 return result;
236 }
237
238 blocks->clear();
239 BLOCK_IT block_it(blocks);
240 // Move the found blocks to the input/output blocks.
241 block_it.add_list_after(&found_blocks);
242 return result;
243}
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix, Image *music_mask_pix)

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES & word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output
[in] word: The word whose best_choice we're evaluating
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 503 of file superscript.cpp.

504 {
505 unsigned initial_ok_run_count = 0;
506 unsigned ok_run_count = 0;
507 float worst_certainty = 0.0f;
508 const WERD_CHOICE &wc = *word.best_choice;
509
510 const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
511 for (unsigned i = 0; i < wc.length(); i++) {
512 TBLOB *blob = word.rebuild_word->blobs[i];
513 UNICHAR_ID unichar_id = wc.unichar_id(i);
514 float char_certainty = wc.certainty(i);
515 bool bad_certainty = char_certainty < certainty_threshold;
516 bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
517 bool is_italic = word.fontinfo && word.fontinfo->is_italic();
518 BLOB_CHOICE *choice = word.GetBlobChoice(i);
519 if (choice && fontinfo_table.size() > 0) {
520 // Get better information from the specific choice, if available.
521 int font_id1 = choice->fontinfo_id();
522 bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;
523 int font_id2 = choice->fontinfo_id2();
524 is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());
525 }
526
527 float height_fraction = 1.0f;
528 float char_height = blob->bounding_box().height();
529 float normal_height = char_height;
530 if (wc.unicharset()->top_bottom_useful()) {
531 int min_bot, max_bot, min_top, max_top;
532 wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
533 float hi_height = max_top - max_bot;
534 float lo_height = min_top - min_bot;
535 normal_height = (hi_height + lo_height) / 2;
536 if (normal_height >= kBlnXHeight) {
537 // Only ding characters that we have decent information for because
538 // they're supposed to be normal sized, not tiny specks or dashes.
539 height_fraction = char_height / normal_height;
540 }
541 }
542 bool bad_height = height_fraction < superscript_scaledown_ratio;
543
544 if (debug) {
545 if (is_italic) {
546 tprintf(" Rejecting: superscript is italic.\n");
547 }
548 if (is_punc) {
549 tprintf(" Rejecting: punctuation present.\n");
550 }
551 const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
552 if (bad_certainty) {
553 tprintf(
554 " Rejecting: don't believe character %s with certainty %.2f "
555 "which is less than threshold %.2f\n",
556 char_str, char_certainty, certainty_threshold);
557 }
558 if (bad_height) {
559 tprintf(
560 " Rejecting: character %s seems too small @ %.2f versus "
561 "expected %.2f\n",
562 char_str, char_height, normal_height);
563 }
564 }
565 if (bad_certainty || bad_height || is_punc || is_italic) {
566 if (ok_run_count == i) {
567 initial_ok_run_count = ok_run_count;
568 }
569 ok_run_count = 0;
570 } else {
571 ok_run_count++;
572 }
573 if (char_certainty < worst_certainty) {
574 worst_certainty = char_certainty;
575 }
576 }
577 bool all_ok = ok_run_count == wc.length();
578 if (all_ok && debug) {
579 tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
580 }
581 if (!all_ok) {
582 if (left_ok) {
583 *left_ok = initial_ok_run_count;
584 }
585 if (right_ok) {
586 *right_ok = ok_run_count;
587 }
588 }
589 return all_ok;
590}
const int kBlnXHeight
Definition: normalis.h:33
int UNICHAR_ID
Definition: unichar.h:34
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324

◆ BestPix()

Image tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 238 of file tesseractclass.h.

238 {
239 if (pixGetWidth(pix_original_) == ImageWidth()) {
240 return pix_original_;
241 } else if (pix_grey_ != nullptr) {
242 return pix_grey_;
243 } else {
244 return pix_binary_;
245 }
246 }

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES * page_res)

Definition at line 456 of file control.cpp.

456 {
457 PAGE_RES_IT word_it(page_res);
458
459 WERD_RES *w_prev = nullptr;
460 WERD_RES *w = word_it.word();
461 while (true) {
462 w_prev = w;
463 while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
464 // advance word_it, skipping over parts of combos
465 }
466 if (!word_it.word()) {
467 break;
468 }
469 w = word_it.word();
470 if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
471 continue;
472 }
473 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
474 if (tessedit_bigram_debug) {
475 tprintf("Skipping because one of the words is W_REP_CHAR\n");
476 }
477 continue;
478 }
479 // Two words sharing the same language model, excellent!
480 std::vector<WERD_CHOICE *> overrides_word1;
481 std::vector<WERD_CHOICE *> overrides_word2;
482
483 const auto orig_w1_str = w_prev->best_choice->unichar_string();
484 const auto orig_w2_str = w->best_choice->unichar_string();
485 WERD_CHOICE prev_best(w->uch_set);
486 {
487 int w1start, w1end;
488 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
489 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
490 }
491 WERD_CHOICE this_best(w->uch_set);
492 {
493 int w2start, w2end;
494 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
495 this_best = w->best_choice->shallow_copy(w2start, w2end);
496 }
497
498 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
499 if (tessedit_bigram_debug) {
500 tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
501 orig_w2_str.c_str());
502 }
503 continue;
504 }
505 if (tessedit_bigram_debug > 2) {
506 tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
507 }
508 if (tessedit_bigram_debug > 1) {
509 if (!w_prev->best_choices.singleton()) {
510 w_prev->PrintBestChoices();
511 }
512 if (!w->best_choices.singleton()) {
513 w->PrintBestChoices();
514 }
515 }
516 float best_rating = 0.0;
517 int best_idx = 0;
518 WERD_CHOICE_IT prev_it(&w_prev->best_choices);
519 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
520 WERD_CHOICE *p1 = prev_it.data();
521 WERD_CHOICE strip1(w->uch_set);
522 {
523 int p1start, p1end;
524 p1->GetNonSuperscriptSpan(&p1start, &p1end);
525 strip1 = p1->shallow_copy(p1start, p1end);
526 }
527 WERD_CHOICE_IT w_it(&w->best_choices);
528 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
529 WERD_CHOICE *p2 = w_it.data();
530 WERD_CHOICE strip2(w->uch_set);
531 {
532 int p2start, p2end;
533 p2->GetNonSuperscriptSpan(&p2start, &p2end);
534 strip2 = p2->shallow_copy(p2start, p2end);
535 }
536 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
537 overrides_word1.push_back(p1);
538 overrides_word2.push_back(p2);
539 if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
540 best_rating = p1->rating() + p2->rating();
541 best_idx = overrides_word1.size() - 1;
542 }
543 }
544 }
545 }
546 if (!overrides_word1.empty()) {
547 // Excellent, we have some bigram matches.
548 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
549 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
550 if (tessedit_bigram_debug > 1) {
551 tprintf(
552 "Top choice \"%s %s\" verified (sans case) by bigram "
553 "model.\n",
554 orig_w1_str.c_str(), orig_w2_str.c_str());
555 }
556 continue;
557 }
558 const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
559 const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
560 if (new_w1_str != orig_w1_str) {
561 w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
562 }
563 if (new_w2_str != orig_w2_str) {
564 w->ReplaceBestChoice(overrides_word2[best_idx]);
565 }
566 if (tessedit_bigram_debug > 0) {
567 std::string choices_description;
568 int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
569 if (num_bigram_choices == 1) {
570 choices_description = "This was the unique bigram choice.";
571 } else {
572 if (tessedit_bigram_debug > 1) {
573 std::string bigrams_list;
574 const int kMaxChoicesToPrint = 20;
575 for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
576 if (i > 0) {
577 bigrams_list += ", ";
578 }
579 WERD_CHOICE *p1 = overrides_word1[i];
580 WERD_CHOICE *p2 = overrides_word2[i];
581 bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
582 }
583 choices_description = "There were many choices: {";
584 choices_description += bigrams_list;
585 choices_description += "}";
586 } else {
587 choices_description += "There were " + std::to_string(num_bigram_choices);
588 choices_description += " compatible bigrams.";
589 }
590 }
591 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
592 orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
593 choices_description.c_str());
594 }
595 }
596 }
597}
@ W_REP_CHAR
repeated character
Definition: werd.h:40
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:773

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES * page_res)

Definition at line 683 of file control.cpp.

683 {
684 if (!wordrec_run_blamer) {
685 return;
686 }
687 PAGE_RES_IT page_res_it(page_res);
688 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
689 WERD_RES *word = page_res_it.word();
690 BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
691 page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
692 }
693 tprintf("Blame reasons:\n");
694 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
695 tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)),
696 page_res->blame_reasons[bl]);
697 }
698 if (page_res->misadaption_log.size() > 0) {
699 tprintf("Misadaption log:\n");
700 for (auto &log : page_res->misadaption_log) {
701 tprintf("%s\n", log.c_str());
702 }
703 }
704}
IncorrectResultReason
Definition: blamer.h:56
@ IRR_NUM_REASONS
Definition: blamer.h:103
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:540

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display ( PAGE_RES * page_res,
const TBOX & selection_box 
)

Definition at line 913 of file pgedit.cpp.

913 {
914# ifndef DISABLED_LEGACY_ENGINE
915 PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
916 if (it != nullptr) {
917 WERD_RES *word_res = it->word();
918 word_res->x_height = it->row()->row->x_height();
919 word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
920 classify_bln_numeric_mode, textord_use_cjk_fp_model,
921 poly_allow_detailed_fx, it->row()->row, it->block()->block);
922 TWERD *bln_word = word_res->chopped_word;
923 TBLOB *bln_blob = bln_word->blobs[0];
924 INT_FX_RESULT_STRUCT fx_info;
925 std::vector<INT_FEATURE_STRUCT> bl_features;
926 std::vector<INT_FEATURE_STRUCT> cn_features;
927 Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features, &cn_features,
928 &fx_info, nullptr);
929 // Display baseline features.
930 ScrollView *bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
931 ClearFeatureSpaceWindow(baseline, bl_win);
932 for (auto &bl_feature : bl_features) {
933 RenderIntFeature(bl_win, &bl_feature, ScrollView::GREEN);
934 }
935 bl_win->Update();
936 // Display cn features.
937 ScrollView *cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
938 ClearFeatureSpaceWindow(character, cn_win);
939 for (auto &cn_feature : cn_features) {
940 RenderIntFeature(cn_win, &cn_feature, ScrollView::GREEN);
941 }
942 cn_win->Update();
943
944 it->DeleteCurrentWord();
945 delete it;
946 }
947# endif // ndef DISABLED_LEGACY_ENGINE
948}
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:38
@ character
Definition: mfoutline.h:53
@ baseline
Definition: mfoutline.h:53
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1622
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1500
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:889
Image BestPix() const
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, std::vector< INT_FEATURE_STRUCT > *bl_features, std::vector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, std::vector< int > *outline_cn_counts)
Definition: intfx.cpp:436

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB * blob)

Definition at line 772 of file fixspace.cpp.

772 {
773 TBOX box; // BB of outline
774 int16_t outline_count = 0;
775 int16_t max_dimension;
776 int16_t largest_outline_dimension = 0;
777
778 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
779 outline_count++;
780 box = ol->bounding_box();
781 if (box.height() > box.width()) {
782 max_dimension = box.height();
783 } else {
784 max_dimension = box.width();
785 }
786
787 if (largest_outline_dimension < max_dimension) {
788 largest_outline_dimension = max_dimension;
789 }
790 }
791
792 if (outline_count > 5) {
793 // penalise LOTS of blobs
794 largest_outline_dimension *= 2;
795 }
796
797 box = blob->bounding_box();
798 if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {
799 // Lax blob is if high or low
800 largest_outline_dimension /= 2;
801 }
802
803 return largest_outline_dimension;
804}
const int kBlnBaselineOffset
Definition: normalis.h:34

◆ BOOL_VAR_H() [1/91]

tesseract::Tesseract::BOOL_VAR_H ( applybox_learn_chars_and_char_frags_mode  )

◆ BOOL_VAR_H() [2/91]

tesseract::Tesseract::BOOL_VAR_H ( applybox_learn_ngrams_mode  )

◆ BOOL_VAR_H() [3/91]

tesseract::Tesseract::BOOL_VAR_H ( bland_unrej  )

◆ BOOL_VAR_H() [4/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_accept_ok  )

◆ BOOL_VAR_H() [5/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_early_convert_bad_unlv_chs  )

◆ BOOL_VAR_H() [6/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_early_merge_tess_fails  )

◆ BOOL_VAR_H() [7/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_include_numerals  )

◆ BOOL_VAR_H() [8/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_leave_accept_strings  )

◆ BOOL_VAR_H() [9/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_leave_ok_strings  )

◆ BOOL_VAR_H() [10/91]

tesseract::Tesseract::BOOL_VAR_H ( crunch_terrible_garbage  )

◆ BOOL_VAR_H() [11/91]

tesseract::Tesseract::BOOL_VAR_H ( enable_noise_removal  )

◆ BOOL_VAR_H() [12/91]

tesseract::Tesseract::BOOL_VAR_H ( hocr_char_boxes  )

◆ BOOL_VAR_H() [13/91]

tesseract::Tesseract::BOOL_VAR_H ( hocr_font_info  )

◆ BOOL_VAR_H() [14/91]

tesseract::Tesseract::BOOL_VAR_H ( interactive_display_mode  )

◆ BOOL_VAR_H() [15/91]

tesseract::Tesseract::BOOL_VAR_H ( lstm_use_matrix  )

◆ BOOL_VAR_H() [16/91]

tesseract::Tesseract::BOOL_VAR_H ( pageseg_apply_music_mask  )

◆ BOOL_VAR_H() [17/91]

tesseract::Tesseract::BOOL_VAR_H ( paragraph_text_based  )

◆ BOOL_VAR_H() [18/91]

tesseract::Tesseract::BOOL_VAR_H ( poly_allow_detailed_fx  )

◆ BOOL_VAR_H() [19/91]

tesseract::Tesseract::BOOL_VAR_H ( preserve_interword_spaces  )

◆ BOOL_VAR_H() [20/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_1Il_trust_permuter_type  )

◆ BOOL_VAR_H() [21/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_1Il_use_dict_word  )

◆ BOOL_VAR_H() [22/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_alphas_in_number_perm  )

◆ BOOL_VAR_H() [23/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_trust_doc_dawg  )

◆ BOOL_VAR_H() [24/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_good_perm  )

◆ BOOL_VAR_H() [25/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_sensible_wd  )

◆ BOOL_VAR_H() [26/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_tess_accepted  )

◆ BOOL_VAR_H() [27/91]

tesseract::Tesseract::BOOL_VAR_H ( rej_use_tess_blanks  )

◆ BOOL_VAR_H() [28/91]

tesseract::Tesseract::BOOL_VAR_H ( suspect_constrain_1Il  )

◆ BOOL_VAR_H() [29/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_adaption_debug  )

◆ BOOL_VAR_H() [30/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_ambigs_training  )

◆ BOOL_VAR_H() [31/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_alto  )

◆ BOOL_VAR_H() [32/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_boxfile  )

◆ BOOL_VAR_H() [33/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_hocr  )

◆ BOOL_VAR_H() [34/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_lstmbox  )

◆ BOOL_VAR_H() [35/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_pdf  )

◆ BOOL_VAR_H() [36/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_tsv  )

◆ BOOL_VAR_H() [37/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_txt  )

◆ BOOL_VAR_H() [38/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_create_wordstrbox  )

◆ BOOL_VAR_H() [39/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_block_rejection  )

◆ BOOL_VAR_H() [40/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_doc_rejection  )

◆ BOOL_VAR_H() [41/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_fonts  )

◆ BOOL_VAR_H() [42/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_debug_quality_metrics  )

◆ BOOL_VAR_H() [43/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_display_outwords  )

◆ BOOL_VAR_H() [44/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_do_invert  )

◆ BOOL_VAR_H() [45/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dont_blkrej_good_wds  )

◆ BOOL_VAR_H() [46/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dont_rowrej_good_wds  )

◆ BOOL_VAR_H() [47/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dump_choices  )

◆ BOOL_VAR_H() [48/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_dump_pageseg_images  )

◆ BOOL_VAR_H() [49/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_enable_bigram_correction  )

◆ BOOL_VAR_H() [50/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_enable_dict_correction  )

◆ BOOL_VAR_H() [51/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_enable_doc_dict  )

◆ BOOL_VAR_H() [52/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_fix_fuzzy_spaces  )

◆ BOOL_VAR_H() [53/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_fix_hyphens  )

◆ BOOL_VAR_H() [54/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_flip_0O  )

◆ BOOL_VAR_H() [55/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_good_quality_unrej  )

◆ BOOL_VAR_H() [56/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_init_config_only  )

◆ BOOL_VAR_H() [57/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_make_boxes_from_boxes  )

◆ BOOL_VAR_H() [58/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_minimal_rej_pass1  )

◆ BOOL_VAR_H() [59/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_minimal_rejection  )

◆ BOOL_VAR_H() [60/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_override_permuter  )

◆ BOOL_VAR_H() [61/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_prefer_joined_punct  )

◆ BOOL_VAR_H() [62/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_preserve_blk_rej_perfect_wds  )

◆ BOOL_VAR_H() [63/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_preserve_row_rej_perfect_wds  )

◆ BOOL_VAR_H() [64/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_reject_bad_qual_wds  )

◆ BOOL_VAR_H() [65/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_rejection_debug  )

◆ BOOL_VAR_H() [66/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_resegment_from_boxes  )

◆ BOOL_VAR_H() [67/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_resegment_from_line_boxes  )

◆ BOOL_VAR_H() [68/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_row_rej_good_docs  )

◆ BOOL_VAR_H() [69/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_test_adaption  )

◆ BOOL_VAR_H() [70/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_timing_debug  )

◆ BOOL_VAR_H() [71/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_train_from_boxes  )

◆ BOOL_VAR_H() [72/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_train_line_recognizer  )

◆ BOOL_VAR_H() [73/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_unrej_any_wd  )

◆ BOOL_VAR_H() [74/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_use_primary_params_model  )

◆ BOOL_VAR_H() [75/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_use_reject_spaces  )

◆ BOOL_VAR_H() [76/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_word_for_word  )

◆ BOOL_VAR_H() [77/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_block_separators  )

◆ BOOL_VAR_H() [78/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_images  )

◆ BOOL_VAR_H() [79/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_rep_codes  )

◆ BOOL_VAR_H() [80/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_write_unlv  )

◆ BOOL_VAR_H() [81/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_zero_kelvin_rejection  )

◆ BOOL_VAR_H() [82/91]

tesseract::Tesseract::BOOL_VAR_H ( tessedit_zero_rejection  )

◆ BOOL_VAR_H() [83/91]

tesseract::Tesseract::BOOL_VAR_H ( test_pt  )

◆ BOOL_VAR_H() [84/91]

tesseract::Tesseract::BOOL_VAR_H ( textonly_pdf  )

◆ BOOL_VAR_H() [85/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_equation_detect  )

◆ BOOL_VAR_H() [86/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_tabfind_force_vertical_text  )

◆ BOOL_VAR_H() [87/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_tabfind_show_vlines  )

◆ BOOL_VAR_H() [88/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_tabfind_vertical_text  )

◆ BOOL_VAR_H() [89/91]

tesseract::Tesseract::BOOL_VAR_H ( textord_use_cjk_fp_model  )

◆ BOOL_VAR_H() [90/91]

tesseract::Tesseract::BOOL_VAR_H ( thresholding_debug  )

◆ BOOL_VAR_H() [91/91]

tesseract::Tesseract::BOOL_VAR_H ( unlv_tilde_crunching  )

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 621 of file fixspace.cpp.

621 {
622 WERD_RES_IT word_it(&words);
623 WERD_RES_IT worst_word_it;
624 float worst_noise_score = 9999;
625 int worst_blob_index = -1; // Noisiest blob of noisiest wd
626 int blob_index; // of wds noisiest blob
627 float noise_score; // of wds noisiest blob
628 WERD_RES *word_res;
629 C_BLOB_IT blob_it;
630 C_BLOB_IT rej_cblob_it;
631 C_BLOB_LIST new_blob_list;
632 C_BLOB_IT new_blob_it;
633 C_BLOB_IT new_rej_cblob_it;
634 WERD *new_word;
635 int16_t start_of_noise_blob;
636 int16_t i;
637
638 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
639 blob_index = worst_noise_blob(word_it.data(), &noise_score);
640 if (blob_index > -1 && worst_noise_score > noise_score) {
641 worst_noise_score = noise_score;
642 worst_blob_index = blob_index;
643 worst_word_it = word_it;
644 }
645 }
646 if (worst_blob_index < 0) {
647 words.clear(); // signal termination
648 return;
649 }
650
651 /* Now split the worst_word_it */
652
653 word_res = worst_word_it.data();
654
655 /* Move blobs before noise blob to a new bloblist */
656
657 new_blob_it.set_to_list(&new_blob_list);
658 blob_it.set_to_list(word_res->word->cblob_list());
659 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
660 new_blob_it.add_after_then_move(blob_it.extract());
661 }
662 start_of_noise_blob = blob_it.data()->bounding_box().left();
663 delete blob_it.extract(); // throw out noise blob
664
665 new_word = new WERD(&new_blob_list, word_res->word);
666 new_word->set_flag(W_EOL, false);
667 word_res->word->set_flag(W_BOL, false);
668 word_res->word->set_blanks(1); // After break
669
670 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
671 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
672 for (; (!rej_cblob_it.empty() &&
673 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
674 rej_cblob_it.forward()) {
675 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
676 }
677
678 auto *new_word_res = new WERD_RES(new_word);
679 new_word_res->combination = true;
680 worst_word_it.add_before_then_move(new_word_res);
681
682 word_res->ClearResults();
683}
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:685

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

build_menu()

Construct the menu tree used by the command window

Definition at line 275 of file pgedit.cpp.

275 {
276 SVMenuNode *parent_menu;
277 auto *root_menu_item = new SVMenuNode();
278
279 SVMenuNode *modes_menu_item = root_menu_item->AddChild("MODES");
280
281 modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
282 modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
283 modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
284 modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
285 modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
286 modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
287 modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
288 modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
289
290 parent_menu = root_menu_item->AddChild("DISPLAY");
291
292 parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
293 parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
294 parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
295 parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
296 parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
297 parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
298 parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
299 parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
300 parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
301 parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
302 parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
303 parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
304 parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
305 parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
306 parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
307
308 parent_menu = root_menu_item->AddChild("OTHER");
309
310 parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
311 parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
312 parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
313 parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
314 parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
315 parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
316
317 return root_menu_item;
318}
@ SHOW_SUBSCRIPT_CMD_EVENT
Definition: pgedit.cpp:72
@ DEBUG_WERD_CMD_EVENT
Definition: pgedit.cpp:56
@ SHOW_UNDERLINE_CMD_EVENT
Definition: pgedit.cpp:76
@ SHOW_SERIF_CMD_EVENT
Definition: pgedit.cpp:78
@ BASELINES_CMD_EVENT
Definition: pgedit.cpp:65
@ SHOW_BOLD_CMD_EVENT
Definition: pgedit.cpp:75
@ BLAMER_CMD_EVENT
Definition: pgedit.cpp:57
@ SHOW_BLN_WERD_CMD_EVENT
Definition: pgedit.cpp:55
@ RECOG_PSEUDO
Definition: pgedit.cpp:70
@ SHOW_SUPERSCRIPT_CMD_EVENT
Definition: pgedit.cpp:73
@ BL_NORM_CMD_EVENT
Definition: pgedit.cpp:61
@ REFRESH_CMD_EVENT
Definition: pgedit.cpp:67
@ BITMAP_CMD_EVENT
Definition: pgedit.cpp:62
@ DUMP_WERD_CMD_EVENT
Definition: pgedit.cpp:53
@ SHOW_BLOB_FEATURES
Definition: pgedit.cpp:71
@ SHOW_POINT_CMD_EVENT
Definition: pgedit.cpp:54
@ IMAGE_CMD_EVENT
Definition: pgedit.cpp:63
@ RECOG_WERDS
Definition: pgedit.cpp:69
@ SHOW_DROPCAPS_CMD_EVENT
Definition: pgedit.cpp:80
@ SHOW_FIXEDPITCH_CMD_EVENT
Definition: pgedit.cpp:77
@ CHANGE_DISP_CMD_EVENT
Definition: pgedit.cpp:52
@ CORRECT_TEXT_CMD_EVENT
Definition: pgedit.cpp:59
@ BOUNDING_BOX_CMD_EVENT
Definition: pgedit.cpp:58
@ BLOCKS_CMD_EVENT
Definition: pgedit.cpp:64
@ POLYGONAL_CMD_EVENT
Definition: pgedit.cpp:60
@ UNIFORM_DISP_CMD_EVENT
Definition: pgedit.cpp:66
@ QUIT_CMD_EVENT
Definition: pgedit.cpp:68
@ SHOW_SMALLCAPS_CMD_EVENT
Definition: pgedit.cpp:79
@ SHOW_ITALIC_CMD_EVENT
Definition: pgedit.cpp:74

◆ check_debug_pt()

bool tesseract::Tesseract::check_debug_pt ( WERD_RES * word,
int  location 
)

Definition at line 1799 of file control.cpp.

1799 {
1800 bool show_map_detail = false;
1801 int16_t i;
1802
1803 if (!test_pt) {
1804 return false;
1805 }
1806
1807 tessedit_rejection_debug.set_value(false);
1808 debug_x_ht_level.set_value(0);
1809
1810 if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
1811 if (location < 0) {
1812 return true; // For breakpoint use
1813 }
1814 tessedit_rejection_debug.set_value(true);
1815 debug_x_ht_level.set_value(2);
1816 tprintf("\n\nTESTWD::");
1817 switch (location) {
1818 case 0:
1819 tprintf("classify_word_pass1 start\n");
1820 word->word->print();
1821 break;
1822 case 10:
1823 tprintf("make_reject_map: initial map");
1824 break;
1825 case 20:
1826 tprintf("make_reject_map: after NN");
1827 break;
1828 case 30:
1829 tprintf("classify_word_pass2 - START");
1830 break;
1831 case 40:
1832 tprintf("classify_word_pass2 - Pre Xht");
1833 break;
1834 case 50:
1835 tprintf("classify_word_pass2 - END");
1836 show_map_detail = true;
1837 break;
1838 case 60:
1839 tprintf("fixspace");
1840 break;
1841 case 70:
1842 tprintf("MM pass START");
1843 break;
1844 case 80:
1845 tprintf("MM pass END");
1846 break;
1847 case 90:
1848 tprintf("After Poor quality rejection");
1849 break;
1850 case 100:
1851 tprintf("unrej_good_quality_words - START");
1852 break;
1853 case 110:
1854 tprintf("unrej_good_quality_words - END");
1855 break;
1856 case 120:
1857 tprintf("Write results pass");
1858 show_map_detail = true;
1859 break;
1860 }
1861 if (word->best_choice != nullptr) {
1862 tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1863 word->reject_map.print(debug_fp);
1864 tprintf("\n");
1865 if (show_map_detail) {
1866 tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1867 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1868 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1869 word->reject_map[i].full_print(debug_fp);
1870 }
1871 }
1872 } else {
1873 tprintf("null best choice\n");
1874 }
1875 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1876 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1877 return true;
1878 } else {
1879 return false;
1880 }
1881}
FILE * debug_fp
Definition: tessvars.cpp:24

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT * pr_it,
WordData * word_data 
)

Definition at line 1302 of file control.cpp.

1302 {
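1303#ifdef DISABLED_LEGACY_ENGINE
1304 WordRecognizer recognizer = &Tesseract::classify_word_pass1;
1305#else
1306 WordRecognizer recognizer =
1307 pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
1308#endif // def DISABLED_LEGACY_ENGINE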
1309
1310 // Best result so far.
1311 PointerVector<WERD_RES> best_words;
1312 // Points to the best result. May be word or in lang_words.
1313 const WERD_RES *word = word_data->word;
1314 clock_t start_t = clock();
1315 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1316 if (debug) {
1317 tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1318 most_recently_used_->lang.c_str());
1319 word->word->bounding_box().print();
1320 }
1321 if (word->done) {
1322 // If done on pass1, leave it as-is.
1323 if (!word->tess_failed) {
1324 most_recently_used_ = word->tesseract;
1325 }
1326 return;
1327 }
1328 auto sub = sub_langs_.size();
1329 if (most_recently_used_ != this) {
1330 // Get the index of the most_recently_used_.
1331 for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1332 }
1333 }
1334 most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1335 &best_words);
1336 Tesseract *best_lang_tess = most_recently_used_;
1337 if (!WordsAcceptable(best_words)) {
1338 // Try all the other languages to see if they are any better.
1339 if (most_recently_used_ != this &&
1340 this->RetryWithLanguage(*word_data, recognizer, debug,
1341 &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1342 best_lang_tess = this;
1343 }
1344 for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1345 if (most_recently_used_ != sub_langs_[i] &&
1346 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1347 &best_words) > 0) {
1348 best_lang_tess = sub_langs_[i];
1349 }
1350 }
1351 }
1352 most_recently_used_ = best_lang_tess;
1353 if (!best_words.empty()) {
1354 if (best_words.size() == 1 && !best_words[0]->combination) {
1355 // Move the best single result to the main word.
1356 word_data->word->ConsumeWordResults(best_words[0]);
1357 } else {
1358 // Words came from LSTM, and must be moved to the PAGE_RES properly.
1359 word_data->word = best_words.back();
1360 pr_it->ReplaceCurrentWord(&best_words);
1361 }
1362 ASSERT_HOST(word_data->word->box_word != nullptr);
1363 } else {
1364 tprintf("no best words!!\n");
1365 }
1366 clock_t ocr_t = clock();
1367 if (tessedit_timing_debug) {
1368 tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
1369 static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
1370 }
1371}
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1379
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:873
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1535

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1 ( const WordData & word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1379 of file control.cpp.

1380 {
1381 ROW *row = word_data.row;
1382 BLOCK *block = word_data.block;
1383 prev_word_best_choice_ =
1384 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1385#ifdef DISABLED_LEGACY_ENGINE
1386 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1387#else
1388 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
1389 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
1390#endif // def DISABLED_LEGACY_ENGINE
1391 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1392 LSTMRecognizeWord(*block, row, *in_word, out_words);
1393 if (!out_words->empty()) {
1394 return; // Successful lstm recognition.
1395 }
1396 }
1397 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1398 // No fallback allowed, so use a fake.
1399 (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1400 return;
1401 }
1402
1403#ifndef DISABLED_LEGACY_ENGINE
1404 // Fall back to tesseract for failed words or odd words.
1405 (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
1406 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1407 poly_allow_detailed_fx, row, block);
1408#endif // ndef DISABLED_LEGACY_ENGINE
1409 }
1410
1411#ifndef DISABLED_LEGACY_ENGINE
1412 WERD_RES *word = *in_word;
1413 match_word_pass_n(1, word, row, block);
1414 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1415 word->tess_would_adapt = AdaptableWord(word);
1416 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1417
1418 if (adapt_ok) {
1419 // Send word to adaptive classifier for training.
1420 word->BestChoiceToCorrectText();
1421 LearnWord(nullptr, word);
1422 // Mark misadaptions if running blamer.
1423 if (word->blamer_bundle != nullptr) {
1424 word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
1425 }
1426 }
1427
1428 if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
1429 tess_add_doc_word(word->best_choice);
1430 }
1431 }
1432#endif // ndef DISABLED_LEGACY_ENGINE
1433}
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:266
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:230
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1589
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:73
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:811
const UNICHARSET & GetUnicharset() const
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2 ( const WordData & word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1535 of file control.cpp.

1536 {
1537 // Return if we do not want to run Tesseract.
1538 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1539 return;
1540 }
1541#ifndef DISABLED_LEGACY_ENGINE
1542 ROW *row = word_data.row;
1543 BLOCK *block = word_data.block;
1544 WERD_RES *word = *in_word;
1545 prev_word_best_choice_ =
1546 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1547
1548 check_debug_pt(word, 30);
1549 if (!word->done) {
1550 word->caps_height = 0.0;
1551 if (word->x_height == 0.0f) {
1552 word->x_height = row->x_height();
1553 }
1554 match_word_pass_n(2, word, row, block);
1555 check_debug_pt(word, 40);
1556 }
1557
1558 SubAndSuperscriptFix(word);
1559
1560 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1561 if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
1562 block->classify_rotation().y() == 0.0f) {
1563 // Use the tops and bottoms since they are available.
1564 TrainedXheightFix(word, block, row);
1565 }
1566 }
1567# ifndef GRAPHICS_DISABLED
1568 if (tessedit_display_outwords) {
1569 if (fx_win == nullptr) {
1570 create_fx_win();
1571 }
1572 clear_fx_win();
1573 word->rebuild_word->plot(fx_win);
1574 TBOX wbox = word->rebuild_word->bounding_box();
1575 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1576 ScrollView::Update();
1577 }
1578# endif
1579 check_debug_pt(word, 50);
1580#endif // ndef DISABLED_LEGACY_ENGINE
1581}
void clear_fx_win()
Definition: drawfx.cpp:61
void create_fx_win()
Definition: drawfx.cpp:50
ScrollView * fx_win
Definition: drawfx.cpp:42
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1455
bool script_has_xheight() const
Definition: unicharset.h:958
bool top_bottom_useful() const
Definition: unicharset.h:555
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:742
static void Update()
Definition: scrollview.cpp:700

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
std::string &  best_str,
float *  c2 
)

Definition at line 1252 of file control.cpp.

1253 {
1254 WERD *real_word = pr_it->word()->word;
1255 WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1256 C_BLOB::deep_copy(blob));
1257 WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1258 // Get a new iterator that points to the new word.
1259 PAGE_RES_IT it(pr_it->page_res);
1260 while (it.word() != word_res && it.word() != nullptr) {
1261 it.forward();
1262 }
1263 ASSERT_HOST(it.word() == word_res);
1264 WordData wd(it);
1265 // Force full initialization.
1266 SetupWordPassN(1, &wd);
1267 classify_word_and_language(pass_n, &it, &wd);
1268 if (debug_noise_removal) {
1269 if (wd.word->raw_choice != nullptr) {
1270 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1271 wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());
1272 } else {
1273 tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1274 wd.row->x_height());
1275 }
1276 }
1277 float cert = 0.0f;
1278 if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1279 cert = wd.word->raw_choice->certainty();
1280 float rat = wd.word->raw_choice->rating();
1281 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1282 best_str = wd.word->raw_choice->unichar_string();
1283 } else {
1284 *c2 = 0.0f;
1285 best_str.clear();
1286 }
1287 it.DeleteCurrentWord();
1288 pr_it->ResetWordIterator();
1289 return cert;
1290}
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const std::vector< bool > &  ok_outlines,
const std::vector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
std::string &  best_str 
)

Definition at line 1207 of file control.cpp.

1209 {
1210 C_OUTLINE_IT ol_it;
1211 C_OUTLINE *first_to_keep = nullptr;
1212 C_BLOB *local_blob = nullptr;
1213 if (blob != nullptr) {
1214 // Add the required outlines to the blob.
1215 ol_it.set_to_list(blob->out_list());
1216 first_to_keep = ol_it.data();
1217 }
1218 for (unsigned i = 0; i < ok_outlines.size(); ++i) {
1219 if (ok_outlines[i]) {
1220 // This outline is to be added.
1221 if (blob == nullptr) {
1222 local_blob = new C_BLOB(outlines[i]);
1223 blob = local_blob;
1224 ol_it.set_to_list(blob->out_list());
1225 } else {
1226 ol_it.add_before_stay_put(outlines[i]);
1227 }
1228 }
1229 }
1230 float c2;
1231 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1232 ol_it.move_to_first();
1233 if (first_to_keep == nullptr) {
1234 // We created blob. Empty its outlines and delete it.
1235 for (; !ol_it.empty(); ol_it.forward()) {
1236 ol_it.extract();
1237 }
1238 delete local_blob;
1239 cert = -c2;
1240 } else {
1241 // Remove the outlines that we put in.
1242 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1243 ol_it.extract();
1244 }
1245 }
1246 return cert;
1247}
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
Definition: control.cpp:1252

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 489 of file tesseractclass.cpp.

489 {
490 std::string debug_name = imagebasename + "_debug.pdf";
491 pixa_debug_.WritePDF(debug_name.c_str());
492 pix_binary_.destroy();
493 pix_grey_.destroy();
494 pix_thresholds_.destroy();
495 scaled_color_.destroy();
496 deskew_ = FCOORD(1.0f, 0.0f);
497 reskew_ = FCOORD(1.0f, 0.0f);
498 splitter_.Clear();
499 scaled_factor_ = -1;
500 for (auto &sub_lang : sub_langs_) {
501 sub_lang->Clear();
502 }
503}
void WritePDF(const char *filename)
Definition: debugpixa.h:42
std::string imagebasename
Definition: ccutil.h:58

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES * word_res,
float *  baseline_shift 
)

Definition at line 105 of file fixxht.cpp.

105 {
106 STATS top_stats(0, UINT8_MAX - 1);
107 STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1);
108 int bottom_shift = 0;
109 int num_blobs = word_res->rebuild_word->NumBlobs();
110 do {
111 top_stats.clear();
112 shift_stats.clear();
113 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
114 TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
115 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
116 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
117 int top = blob->bounding_box().top() + bottom_shift;
118 // Clip the top to the limit of normalized feature space.
119 if (top >= INT_FEAT_RANGE) {
120 top = INT_FEAT_RANGE - 1;
121 }
122 int bottom = blob->bounding_box().bottom() + bottom_shift;
123 int min_bottom, max_bottom, min_top, max_top;
124 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
125 // Chars with a wild top range would mess up the result so ignore them.
126 if (max_top - min_top > kMaxCharTopRange) {
127 continue;
128 }
129 int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
130 top - (max_top + x_ht_acceptance_tolerance));
131 int height = top - kBlnBaselineOffset;
132 if (debug_x_ht_level >= 2) {
133 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
134 unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
135 max_top, bottom, top);
136 }
137 // Use only chars that fit in the expected bottom range, and where
138 // the range of tops is sensibly near the xheight.
139 if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
140 bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
141 max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
142 // Compute the x-height position using proportionality between the
143 // actual height and expected height.
144 int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
145 int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
146 if (debug_x_ht_level >= 2) {
147 tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
148 }
149 // The range of expected heights gets a vote equal to the distance
150 // of the actual top from the expected top.
151 for (int y = min_xht; y <= max_xht; ++y) {
152 top_stats.add(y, misfit_dist);
153 }
154 } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
155 bottom - x_ht_acceptance_tolerance > max_bottom) &&
156 bottom_shift == 0) {
157 // Get the range of required bottom shift.
158 int min_shift = min_bottom - bottom;
159 int max_shift = max_bottom - bottom;
160 if (debug_x_ht_level >= 2) {
161 tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
162 }
163 // The range of expected shifts gets a vote equal to the min distance
164 // of the actual bottom from the expected bottom, spread over the
165 // range of its acceptance.
166 int misfit_weight = abs(min_shift);
167 if (max_shift > min_shift) {
168 misfit_weight /= max_shift - min_shift;
169 }
170 for (int y = min_shift; y <= max_shift; ++y) {
171 shift_stats.add(y, misfit_weight);
172 }
173 } else {
174 if (bottom_shift == 0) {
175 // Things with bottoms that are already ok need to say so, on the
176 // 1st iteration only.
177 shift_stats.add(0, kBlnBaselineOffset);
178 }
179 if (debug_x_ht_level >= 2) {
180 tprintf(" already OK\n");
181 }
182 }
183 }
184 }
185 if (shift_stats.get_total() > top_stats.get_total()) {
186 bottom_shift = IntCastRounded(shift_stats.median());
187 if (debug_x_ht_level >= 2) {
188 tprintf("Applying bottom shift=%d\n", bottom_shift);
189 }
190 }
191 } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
192 // Baseline shift is opposite sign to the bottom shift.
193 *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194 if (debug_x_ht_level >= 2) {
195 tprintf("baseline shift=%g\n", *baseline_shift);
196 }
197 if (top_stats.get_total() == 0) {
198 return bottom_shift != 0 ? word_res->x_height : 0.0f;
199 }
200 // The new xheight is just the median vote, which is then scaled out
201 // of BLN space back to pixel space to get the x-height in pixel space.
202 float new_xht = top_stats.median();
203 if (debug_x_ht_level >= 2) {
204 tprintf("Median xht=%f\n", new_xht);
205 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
206 new_xht / word_res->denorm.y_scale());
207 }
208 // The xheight must change by at least x_ht_min_change to be used.
209 if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
210 return new_xht / word_res->denorm.y_scale();
211 } else {
212 return bottom_shift != 0 ? word_res->x_height : 0.0f;
213 }
214}
#define INT_FEAT_RANGE
Definition: float2int.h:27
const double y
int IntCastRounded(double x)
Definition: helpers.h:170
int DivRounded(int a, int b)
Definition: helpers.h:162
const int kMaxCharTopRange
Definition: fixxht.cpp:69
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES * word_res)

Definition at line 594 of file docqual.cpp.

594 {
595 int i;
596 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
597 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
598 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
599 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
600 for (i = 0; i < word_res->reject_map.length(); ++i) {
601 if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
602 word_res->best_choice->set_unichar_id(unichar_dash, i);
603 if (word_res->reject_map[i].accepted()) {
604 word_res->reject_map[i].setrej_unlv_rej();
605 }
606 }
607 if (word_res->best_choice->unichar_id(i) == unichar_pow) {
608 word_res->best_choice->set_unichar_id(unichar_space, i);
609 if (word_res->reject_map[i].accepted()) {
610 word_res->reject_map[i].setrej_unlv_rej();
611 }
612 }
613 }
614}

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
std::vector< UNICHAR_ID > *  class_ids 
)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns
false if an invalid UNICHAR_ID is encountered.

Definition at line 520 of file applybox.cpp.

520 {
521 for (int step = 0; *utf8 != '\0'; utf8 += step) {
522 const char *next_space = strchr(utf8, ' ');
523 if (next_space == nullptr) {
524 next_space = utf8 + strlen(utf8);
525 }
526 step = next_space - utf8;
527 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
528 if (class_id == INVALID_UNICHAR_ID) {
529 return false;
530 }
531 while (utf8[step] == ' ') {
532 ++step;
533 }
534 class_ids->push_back(class_id);
535 }
536 return true;
537}
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES * page_res)

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 764 of file applybox.cpp.

764 {
765 PAGE_RES_IT pr_it(page_res);
766 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
767 auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
768 for (auto &correct_text : word_res->correct_text) {
769 // The part before the first space is the real ground truth, and the
770 // rest is the bounding box location and page number.
771 std::vector<std::string> tokens = split(correct_text, ' ');
772 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
773 choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
774 }
775 word_res->ClearWordChoices();
776 word_res->LogNewRawChoice(choice);
777 word_res->LogNewCookedChoice(1, false, choice);
778 }
779}
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43

◆ count_alphanums() [1/2]

int16_t tesseract::Tesseract::count_alphanums ( const WERD_CHOICE & word)

Definition at line 375 of file output.cpp.

375 {
376 int count = 0;
377 for (unsigned i = 0; i < word.length(); ++i) {
378 if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
379 word.unicharset()->get_isdigit(word.unichar_id(i))) {
380 count++;
381 }
382 }
383 return count;
384}

◆ count_alphanums() [2/2]

int16_t tesseract::Tesseract::count_alphanums ( WERD_RES * word)

Definition at line 542 of file reject.cpp.

542 {
543 int count = 0;
544 const WERD_CHOICE *best_choice = word_res->best_choice;
545 for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
546 if ((word_res->reject_map[i].accepted()) &&
547 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
548 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
549 count++;
550 }
551 }
552 return count;
553}

◆ count_alphas()

int16_t tesseract::Tesseract::count_alphas ( const WERD_CHOICE & word)

Definition at line 365 of file output.cpp.

365 {
366 int count = 0;
367 for (unsigned i = 0; i < word.length(); ++i) {
368 if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
369 count++;
370 }
371 }
372 return count;
373}

◆ count_outline_errs()

int16_t tesseract::Tesseract::count_outline_errs ( char  c,
int16_t  outline_count 
)

Definition at line 107 of file docqual.cpp.

107 {
108 int expected_outline_count;
109
110 if (outlines_odd.contains(c)) {
111 return 0; // Don't use this char
112 } else if (outlines_2.contains(c)) {
113 expected_outline_count = 2;
114 } else {
115 expected_outline_count = 1;
116 }
117 return abs(outline_count - expected_outline_count);
118}

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES * word_res)

Definition at line 72 of file fixxht.cpp.

72 {
73 int bad_blobs = 0;
74 int num_blobs = word_res->rebuild_word->NumBlobs();
75 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
76 TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
77 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
78 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
79 int top = blob->bounding_box().top();
80 if (top >= INT_FEAT_RANGE) {
81 top = INT_FEAT_RANGE - 1;
82 }
83 int min_bottom, max_bottom, min_top, max_top;
84 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
85 if (max_top - min_top > kMaxCharTopRange) {
86 continue;
87 }
88 bool bad =
89 top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance;
90 if (bad) {
91 ++bad_blobs;
92 }
93 if (debug_x_ht_level >= 1) {
94 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
95 unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
96 static_cast<int>(x_ht_acceptance_tolerance));
97 }
98 }
99 }
100 return bad_blobs;
101}

◆ debug_word()

void tesseract::Tesseract::debug_word ( PAGE_RES * page_res,
const TBOX & selection_box
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 640 of file pgedit.cpp.

640 {
641# ifndef DISABLED_LEGACY_ENGINE
642 ResetAdaptiveClassifier();
643# endif
644 recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);
645}
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES * page_res)

Definition at line 2057 of file control.cpp.

2057 {
2058 PAGE_RES_IT word_it(page_res);
2059 for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2060 if (word->best_choices.singleton()) {
2061 continue; // There are no alternates.
2062 }
2063
2064 const WERD_CHOICE *best = word->best_choice;
2065 if (word->tesseract->getDict().valid_word(*best) != 0) {
2066 continue; // The best choice is in the dictionary.
2067 }
2068
2069 WERD_CHOICE_IT choice_it(&word->best_choices);
2070 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2071 WERD_CHOICE *alternate = choice_it.data();
2072 if (word->tesseract->getDict().valid_word(*alternate)) {
2073 // The alternate choice is in the dictionary.
2074 if (tessedit_bigram_debug) {
2075 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2076 best->unichar_string().c_str(), alternate->unichar_string().c_str());
2077 }
2078 // Replace the 'best' choice with a better choice.
2079 word->ReplaceBestChoice(alternate);
2080 break;
2081 }
2082 }
2083 }
2084}

◆ digit_or_numeric_punct()

bool tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES * word,
int  char_position 
)

Definition at line 366 of file fixspace.cpp.

366 {
367 int i;
368 int offset;
369
370 for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) {
371 ;
372 }
373 return (
374 word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
375 word->best_choice->unichar_lengths()[i]) ||
376 (word->best_choice->permuter() == NUMBER_PERM &&
377 numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
378}
@ NUMBER_PERM
Definition: ratngs.h:242

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 325 of file pgedit.cpp.

325 {
326 int block_count = 1;
327
328 image_win->Clear();
329 if (display_image) {
330 image_win->Draw(pix_binary_, 0, 0);
331 }
332
333 image_win->Brush(ScrollView::NONE);
334 PAGE_RES_IT pr_it(current_page_res);
335 for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
336 (this->*word_painter)(&pr_it);
337 if (display_baselines && pr_it.row() != pr_it.prev_row()) {
338 pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
339 }
340 if (display_blocks && pr_it.block() != pr_it.prev_block()) {
341 pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
342 }
343 }
344 image_win->Update();
345}
void Draw(Image image, int x_pos, int y_pos)
Definition: scrollview.cpp:750
void Brush(Color color)
Definition: scrollview.cpp:716

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT & page_res_it,
bool  good_quality_doc 
)

Definition at line 210 of file docqual.cpp.

211 {
212 int16_t block_no = 0;
213 int16_t row_no = 0;
214 BLOCK_RES *current_block;
215 ROW_RES *current_row;
216
217 bool rej_word;
218 bool prev_word_rejected;
219 int16_t char_quality = 0;
220 int16_t accepted_char_quality;
221
222 if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
223 tessedit_reject_doc_percent) {
224 reject_whole_page(page_res_it);
225 if (tessedit_debug_doc_rejection) {
226 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
227 page_res_it.page_res->rej_count);
228 }
229 } else {
230 if (tessedit_debug_doc_rejection) {
231 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
232 page_res_it.page_res->rej_count);
233 }
234
235 /* Walk blocks testing for block rejection */
236
237 page_res_it.restart_page();
238 WERD_RES *word;
239 while ((word = page_res_it.word()) != nullptr) {
240 current_block = page_res_it.block();
241 block_no = current_block->block->pdblk.index();
242 if (current_block->char_count > 0 &&
243 (current_block->rej_count * 100.0 / current_block->char_count) >
244 tessedit_reject_block_percent) {
245 if (tessedit_debug_block_rejection) {
246 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
247 current_block->char_count, current_block->rej_count);
248 }
249 prev_word_rejected = false;
250 while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
251 if (tessedit_preserve_blk_rej_perfect_wds) {
252 rej_word = word->reject_map.reject_count() > 0 ||
253 word->reject_map.length() < tessedit_preserve_min_wd_len;
254 if (rej_word && tessedit_dont_blkrej_good_wds &&
255 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
256 acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
257 word->best_choice->unichar_lengths().c_str()) !=
258 AC_UNACCEPTABLE) {
259 word_char_quality(word, &char_quality, &accepted_char_quality);
260 rej_word = char_quality != word->reject_map.length();
261 }
262 } else {
263 rej_word = true;
264 }
265 if (rej_word) {
266 /*
267 Reject spacing if both current and prev words are rejected.
268 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
269 generated more space errors.
270*/
271 if (tessedit_use_reject_spaces && prev_word_rejected &&
272 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
273 word->reject_spaces = true;
274 }
275 word->reject_map.rej_word_block_rej();
276 }
277 prev_word_rejected = rej_word;
278 page_res_it.forward();
279 }
280 } else {
281 if (tessedit_debug_block_rejection) {
282 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
283 page_res_it.block()->char_count, page_res_it.block()->rej_count);
284 }
285
286 /* Walk rows in block testing for row rejection */
287 row_no = 0;
288 while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
289 current_row = page_res_it.row();
290 row_no++;
291 /* Reject whole row if:
292 fraction of chars on row which are rejected exceed a limit AND
293 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
294 limit
295*/
296 if (current_row->char_count > 0 &&
297 (current_row->rej_count * 100.0 / current_row->char_count) >
298 tessedit_reject_row_percent &&
299 (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
300 tessedit_whole_wd_rej_row_percent) {
301 if (tessedit_debug_block_rejection) {
302 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
303 current_row->char_count, current_row->rej_count);
304 }
305 prev_word_rejected = false;
306 while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
307 /* Preserve words on good docs unless they are mostly rejected*/
308 if (!tessedit_row_rej_good_docs && good_quality_doc) {
309 rej_word = word->reject_map.reject_count() /
310 static_cast<float>(word->reject_map.length()) >
311 tessedit_good_doc_still_rowrej_wd;
312 } else if (tessedit_preserve_row_rej_perfect_wds) {
313 /* Preserve perfect words anyway */
314 rej_word = word->reject_map.reject_count() > 0 ||
315 word->reject_map.length() < tessedit_preserve_min_wd_len;
316 if (rej_word && tessedit_dont_rowrej_good_wds &&
317 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
318 acceptable_word_string(
319 *word->uch_set, word->best_choice->unichar_string().c_str(),
320 word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
321 word_char_quality(word, &char_quality, &accepted_char_quality);
322 rej_word = char_quality != word->reject_map.length();
323 }
324 } else {
325 rej_word = true;
326 }
327 if (rej_word) {
328 /*
329 Reject spacing if both current and prev words are rejected.
330 NOTE - this is NOT restricted to FUZZY spaces. - When tried
331 this generated more space errors.
332*/
333 if (tessedit_use_reject_spaces && prev_word_rejected &&
334 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
335 word->reject_spaces = true;
336 }
337 word->reject_map.rej_word_row_rej();
338 }
339 prev_word_rejected = rej_word;
340 page_res_it.forward();
341 }
342 } else {
343 if (tessedit_debug_block_rejection) {
344 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
345 current_row->char_count, current_row->rej_count);
346 }
347 while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
348 page_res_it.forward();
349 }
350 }
351 }
352 }
353 }
354 }
355}
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:363
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1692

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES * word)

Definition at line 513 of file reject.cpp.

513 {
514 int word_len = word->reject_map.length();
515 const char *s = word->best_choice->unichar_string().c_str();
516 const char *lengths = word->best_choice->unichar_lengths().c_str();
517 bool accepted_1Il = false;
518
519 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
520 if (word->reject_map[i].accepted()) {
521 if (conflict_set_I_l_1.contains(s[offset])) {
522 accepted_1Il = true;
523 } else {
524 if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
525 word->uch_set->get_isdigit(s + offset, lengths[i])) {
526 return; // >=1 non 1Il ch accepted
527 }
528 }
529 }
530 }
531 if (!accepted_1Il) {
532 return; // Nothing to worry about
533 }
534
535 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
536 if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
537 word->reject_map[i].setrej_postNN_1Il();
538 }
539 }
540}

◆ double_VAR_H() [1/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_cert  )

◆ double_VAR_H() [2/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_high_word  )

◆ double_VAR_H() [3/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_low_word  )

◆ double_VAR_H() [4/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_max_ht  )

◆ double_VAR_H() [5/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_min_ht  )

◆ double_VAR_H() [6/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_min_width  )

◆ double_VAR_H() [7/50]

tesseract::Tesseract::double_VAR_H ( crunch_del_rating  )

◆ double_VAR_H() [8/50]

tesseract::Tesseract::double_VAR_H ( crunch_poor_garbage_cert  )

◆ double_VAR_H() [9/50]

tesseract::Tesseract::double_VAR_H ( crunch_poor_garbage_rate  )

◆ double_VAR_H() [10/50]

tesseract::Tesseract::double_VAR_H ( crunch_pot_poor_cert  )

◆ double_VAR_H() [11/50]

tesseract::Tesseract::double_VAR_H ( crunch_pot_poor_rate  )

◆ double_VAR_H() [12/50]

tesseract::Tesseract::double_VAR_H ( crunch_small_outlines_size  )

◆ double_VAR_H() [13/50]

tesseract::Tesseract::double_VAR_H ( crunch_terrible_rating  )

◆ double_VAR_H() [14/50]

tesseract::Tesseract::double_VAR_H ( fixsp_small_outlines_size  )

◆ double_VAR_H() [15/50]

tesseract::Tesseract::double_VAR_H ( invert_threshold  )

◆ double_VAR_H() [16/50]

tesseract::Tesseract::double_VAR_H ( lstm_rating_coefficient  )

◆ double_VAR_H() [17/50]

tesseract::Tesseract::double_VAR_H ( min_orientation_margin  )

◆ double_VAR_H() [18/50]

tesseract::Tesseract::double_VAR_H ( noise_cert_basechar  )

◆ double_VAR_H() [19/50]

tesseract::Tesseract::double_VAR_H ( noise_cert_disjoint  )

◆ double_VAR_H() [20/50]

tesseract::Tesseract::double_VAR_H ( noise_cert_factor  )

◆ double_VAR_H() [21/50]

tesseract::Tesseract::double_VAR_H ( noise_cert_punc  )

◆ double_VAR_H() [22/50]

tesseract::Tesseract::double_VAR_H ( quality_blob_pc  )

◆ double_VAR_H() [23/50]

tesseract::Tesseract::double_VAR_H ( quality_char_pc  )

◆ double_VAR_H() [24/50]

tesseract::Tesseract::double_VAR_H ( quality_outline_pc  )

◆ double_VAR_H() [25/50]

tesseract::Tesseract::double_VAR_H ( quality_rej_pc  )

◆ double_VAR_H() [26/50]

tesseract::Tesseract::double_VAR_H ( quality_rowrej_pc  )

◆ double_VAR_H() [27/50]

tesseract::Tesseract::double_VAR_H ( rej_whole_of_mostly_reject_word_fract  )

◆ double_VAR_H() [28/50]

tesseract::Tesseract::double_VAR_H ( subscript_max_y_top  )

◆ double_VAR_H() [29/50]

tesseract::Tesseract::double_VAR_H ( superscript_bettered_certainty  )

◆ double_VAR_H() [30/50]

tesseract::Tesseract::double_VAR_H ( superscript_min_y_bottom  )

◆ double_VAR_H() [31/50]

tesseract::Tesseract::double_VAR_H ( superscript_scaledown_ratio  )

◆ double_VAR_H() [32/50]

tesseract::Tesseract::double_VAR_H ( superscript_worse_certainty  )

◆ double_VAR_H() [33/50]

tesseract::Tesseract::double_VAR_H ( suspect_accept_rating  )

◆ double_VAR_H() [34/50]

tesseract::Tesseract::double_VAR_H ( suspect_rating_per_ch  )

◆ double_VAR_H() [35/50]

tesseract::Tesseract::double_VAR_H ( tessedit_good_doc_still_rowrej_wd  )

◆ double_VAR_H() [36/50]

tesseract::Tesseract::double_VAR_H ( tessedit_lower_flip_hyphen  )

◆ double_VAR_H() [37/50]

tesseract::Tesseract::double_VAR_H ( tessedit_reject_block_percent  )

◆ double_VAR_H() [38/50]

tesseract::Tesseract::double_VAR_H ( tessedit_reject_doc_percent  )

◆ double_VAR_H() [39/50]

tesseract::Tesseract::double_VAR_H ( tessedit_reject_row_percent  )

◆ double_VAR_H() [40/50]

tesseract::Tesseract::double_VAR_H ( tessedit_upper_flip_hyphen  )

◆ double_VAR_H() [41/50]

tesseract::Tesseract::double_VAR_H ( tessedit_whole_wd_rej_row_percent  )

◆ double_VAR_H() [42/50]

tesseract::Tesseract::double_VAR_H ( test_pt_x  )

◆ double_VAR_H() [43/50]

tesseract::Tesseract::double_VAR_H ( test_pt_y  )

◆ double_VAR_H() [44/50]

tesseract::Tesseract::double_VAR_H ( textord_tabfind_aligned_gap_fraction  )

◆ double_VAR_H() [45/50]

tesseract::Tesseract::double_VAR_H ( textord_tabfind_vertical_text_ratio  )

◆ double_VAR_H() [46/50]

tesseract::Tesseract::double_VAR_H ( thresholding_kfactor  )

◆ double_VAR_H() [47/50]

tesseract::Tesseract::double_VAR_H ( thresholding_score_fraction  )

◆ double_VAR_H() [48/50]

tesseract::Tesseract::double_VAR_H ( thresholding_smooth_kernel_size  )

◆ double_VAR_H() [49/50]

tesseract::Tesseract::double_VAR_H ( thresholding_tile_size  )

◆ double_VAR_H() [50/50]

tesseract::Tesseract::double_VAR_H ( thresholding_window_size  )

◆ dump_words()

void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
int16_t  score,
int16_t  mode,
bool  improved 
)

Definition at line 467 of file fixspace.cpp.

467 {
468 WERD_RES_IT word_res_it(&perm);
469
470 if (debug_fix_space_level > 0) {
471 if (mode == 1) {
472 stats_.dump_words_str = "";
473 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
474 if (!word_res_it.data()->part_of_combo) {
475 stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
476 stats_.dump_words_str += ' ';
477 }
478 }
479 }
480
481 if (debug_fix_space_level > 1) {
482 switch (mode) {
483 case 1:
484 tprintf("EXTRACTED (%d): \"", score);
485 break;
486 case 2:
487 tprintf("TESTED (%d): \"", score);
488 break;
489 case 3:
490 tprintf("RETURNED (%d): \"", score);
491 break;
492 }
493
494 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
495 if (!word_res_it.data()->part_of_combo) {
496 tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
497 static_cast<int>(word_res_it.data()->best_choice->permuter()));
498 }
499 }
500 tprintf("\"\n");
501 } else if (improved) {
502 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
503 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
504 if (!word_res_it.data()->part_of_combo) {
505 tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
506 static_cast<int>(word_res_it.data()->best_choice->permuter()));
507 }
508 }
509 tprintf("\"\n");
510 }
511 }
512}

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 457 of file tessedit.cpp.

457 {
458 end_recog();
459}

◆ eval_word_spacing()

int16_t tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 260 of file fixspace.cpp.

260 {
261 WERD_RES_IT word_res_it(&word_res_list);
262 int16_t total_score = 0;
263 int16_t word_count = 0;
264 int16_t done_word_count = 0;
265 int i;
266 int16_t offset;
267 int16_t prev_word_score = 0;
268 bool prev_word_done = false;
269 bool prev_char_1 = false; // prev ch a "1/I/l"?
270 bool prev_char_digit = false; // prev ch 2..9 or 0
271 const char *punct_chars = "!\"`',.:;";
272 bool prev_char_punct = false;
273
274 do {
275 // current word
276 WERD_RES *word = word_res_it.data();
277 bool word_done = fixspace_thinks_word_done(word);
278 word_count++;
279 if (word->tess_failed) {
280 total_score += prev_word_score;
281 if (prev_word_done) {
282 done_word_count++;
283 }
284 prev_word_score = 0;
285 prev_char_1 = false;
286 prev_char_digit = false;
287 prev_word_done = false;
288 } else {
289 /*
290 Can we add the prev word score and potentially count this word?
291 Yes IF it didn't end in a 1 when the first char of this word is a digit
292 AND it didn't end in a digit when the first char of this word is a 1
293*/
294 auto word_len = word->reject_map.length();
295 bool current_word_ok_so_far = false;
296 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
297 (prev_char_digit &&
298 ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
299 word->best_choice->unichar_string()[0] == '1') ||
300 (!word_done &&
301 conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
302 total_score += prev_word_score;
303 if (prev_word_done) {
304 done_word_count++;
305 }
306 current_word_ok_so_far = word_done;
307 }
308
309 if (current_word_ok_so_far) {
310 prev_word_done = true;
311 prev_word_score = word_len;
312 } else {
313 prev_word_done = false;
314 prev_word_score = 0;
315 }
316
317 /* Add 1 to total score for every joined 1 regardless of context and
318 rejtn */
319 for (i = 0, prev_char_1 = false; i < word_len; i++) {
320 bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
321 if (prev_char_1 || (current_char_1 && (i > 0))) {
322 total_score++;
323 }
324 prev_char_1 = current_char_1;
325 }
326
327 /* Add 1 to total score for every joined punctuation regardless of context
328 and rejtn */
329 if (tessedit_prefer_joined_punct) {
330 for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
331 offset += word->best_choice->unichar_lengths()[i++]) {
332 bool current_char_punct =
333 strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
334 if (prev_char_punct || (current_char_punct && i > 0)) {
335 total_score++;
336 }
337 prev_char_punct = current_char_punct;
338 }
339 }
340 prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
341 for (i = 0, offset = 0; i < word_len - 1;
342 offset += word->best_choice->unichar_lengths()[i++]) {
343 ;
344 }
345 prev_char_1 =
346 ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
347 (!word_done &&
348 conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
349 }
350 /* Find next word */
351 do {
352 word_res_it.forward();
353 } while (word_res_it.data()->part_of_combo);
354 } while (!word_res_it.at_first());
355 total_score += prev_word_score;
356 if (prev_word_done) {
357 done_word_count++;
358 }
359 if (done_word_count == word_count) {
360 return PERFECT_WERDS;
361 } else {
362 return total_score;
363 }
364}
#define PERFECT_WERDS
Definition: fixspace.cpp:48
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:514
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:366

◆ failure_count()

int16_t tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 895 of file docqual.cpp.

895 {
896 const char *str = word->best_choice->unichar_string().c_str();
897 int tess_rejs = 0;
898
899 for (; *str != '\0'; str++) {
900 if (*str == ' ') {
901 tess_rejs++;
902 }
903 }
904 return tess_rejs;
905}

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation ( const std::vector< UNICHAR_ID > &  target_text,
WERD_RES * word_res 
)

Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.

Definition at line 545 of file applybox.cpp.

545 {
546 // Classify all required combinations of blobs and save results in choices.
547 const int word_length = word_res->box_word->length();
548 auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
549 for (int i = 0; i < word_length; ++i) {
550 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
551 BLOB_CHOICE_LIST *match_result =
552 classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
553 word_res->blamer_bundle);
554 if (applybox_debug > 2) {
555 tprintf("%d+%d:", i, j);
556 print_ratings_list("Segment:", match_result, unicharset);
557 }
558 choices[i].push_back(match_result);
559 }
560 }
561 // Search the segmentation graph for the target text. Must be an exact
562 // match. Using wildcards makes it difficult to find the correct
563 // segmentation even when it is there.
564 word_res->best_state.clear();
565 std::vector<int> search_segmentation;
566 float best_rating = 0.0f;
567 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
568 &word_res->best_state);
569 for (int i = 0; i < word_length; ++i) {
570 for (auto choice : choices[i]) {
571 delete choice;
572 }
573 }
574 delete[] choices;
575 if (word_res->best_state.empty()) {
576 // Build the original segmentation and if it is the same length as the
577 // truth, assume it will do.
578 int blob_count = 1;
579 for (auto s : word_res->seam_array) {
580 SEAM *seam = s;
581 if (!seam->HasAnySplits()) {
582 word_res->best_state.push_back(blob_count);
583 blob_count = 1;
584 } else {
585 ++blob_count;
586 }
587 }
588 word_res->best_state.push_back(blob_count);
589 if (word_res->best_state.size() != target_text.size()) {
590 word_res->best_state.clear(); // No good. Original segmentation bad size.
591 return false;
592 }
593 }
594 word_res->correct_text.clear();
595 for (auto &text : target_text) {
596 word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
597 }
598 return true;
599}
const int kMaxGroupSize
Definition: applybox.cpp:33
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
void SearchForText(const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
Definition: applybox.cpp:615
virtual BLOB_CHOICE_LIST * classify_piece(const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:49

◆ first_alphanum_index()

int16_t tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 457 of file reject.cpp.

457 {
458 int16_t i;
459 int16_t offset;
460
461 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
462 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
463 unicharset.get_isdigit(word + offset, word_lengths[i])) {
464 return i;
465 }
466 }
467 return -1;
468}

◆ first_alphanum_offset()

int16_t tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 470 of file reject.cpp.

470 {
471 int16_t i;
472 int16_t offset;
473
474 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476 unicharset.get_isdigit(word + offset, word_lengths[i])) {
477 return offset;
478 }
479 }
480 return -1;
481}

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 171 of file fixspace.cpp.

171 {
172 int16_t best_score;
173 WERD_RES_LIST current_perm;
174 int16_t current_score;
175 bool improved = false;
176
177 best_score = eval_word_spacing(best_perm); // default score
178 dump_words(best_perm, best_score, 1, improved);
179
180 if (best_score != PERFECT_WERDS) {
181 initialise_search(best_perm, current_perm);
182 }
183
184 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
185 match_current_words(current_perm, row, block);
186 current_score = eval_word_spacing(current_perm);
187 dump_words(current_perm, current_score, 2, improved);
188 if (current_score > best_score) {
189 best_perm.clear();
190 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
191 best_score = current_score;
192 improved = true;
193 }
194 if (current_score < PERFECT_WERDS) {
195 transform_to_next_perm(current_perm);
196 }
197 }
198 dump_words(best_perm, best_score, 3, improved);
199}
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:391
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:201
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:467
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:260
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:218
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:655

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
int32_t  word_count,
PAGE_RES * page_res 
)

Definition at line 77 of file fixspace.cpp.

77 {
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
82 WERD_RES *word_res;
83 WERD_RES_LIST fuzzy_space_words;
84 int16_t new_length;
85 bool prevent_null_wd_fixsp; // DON'T process blobless wds
86 int32_t word_index; // current word
87
88 block_res_it.set_to_list(&page_res->block_res_list);
89 word_index = 0;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
91 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
92 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
93 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
94 while (!word_res_it_from.at_last()) {
95 word_res = word_res_it_from.data();
96 while (!word_res_it_from.at_last() &&
97 !(word_res->combination ||
98 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
99 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
100 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
101 word_res = word_res_it_from.forward();
102 word_index++;
103 if (monitor != nullptr) {
104 monitor->ocr_alive = true;
105 monitor->progress = 90 + 5 * word_index / word_count;
106 if (monitor->deadline_exceeded() ||
107 (monitor->cancel != nullptr &&
108 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
109 return;
110 }
111 }
112 }
113
114 if (!word_res_it_from.at_last()) {
115 word_res_it_to = word_res_it_from;
116 prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
117 if (check_debug_pt(word_res, 60)) {
118 debug_fix_space_level.set_value(10);
119 }
120 word_res_it_to.forward();
121 word_index++;
122 if (monitor != nullptr) {
123 monitor->ocr_alive = true;
124 monitor->progress = 90 + 5 * word_index / word_count;
125 if (monitor->deadline_exceeded() ||
126 (monitor->cancel != nullptr &&
127 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
128 return;
129 }
130 }
131 while (!word_res_it_to.at_last() &&
132 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
133 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
134 if (check_debug_pt(word_res, 60)) {
135 debug_fix_space_level.set_value(10);
136 }
137 if (word_res->word->cblob_list()->empty()) {
138 prevent_null_wd_fixsp = true;
139 }
140 word_res = word_res_it_to.forward();
141 }
142 if (check_debug_pt(word_res, 60)) {
143 debug_fix_space_level.set_value(10);
144 }
145 if (word_res->word->cblob_list()->empty()) {
146 prevent_null_wd_fixsp = true;
147 }
148 if (prevent_null_wd_fixsp) {
149 word_res_it_from = word_res_it_to;
150 } else {
151 fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
152 fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
153 block_res_it.data()->block);
154 new_length = fuzzy_space_words.length();
155 word_res_it_from.add_list_before(&fuzzy_space_words);
156 for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
157 word_res_it_from.forward();
158 }
159 }
160 if (test_pt) {
161 debug_fix_space_level.set_value(0);
162 }
163 }
164 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
165 // Last word in row
166 }
167 }
168 }
169}
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:545
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:171

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 577 of file fixspace.cpp.

577 {
578 int16_t best_score;
579 WERD_RES_IT best_perm_it(&best_perm);
580 WERD_RES_LIST current_perm;
581 WERD_RES_IT current_perm_it(&current_perm);
582 WERD_RES *old_word_res;
583 int16_t current_score;
584 bool improved = false;
585
586 best_score = fp_eval_word_spacing(best_perm); // default score
587
588 dump_words(best_perm, best_score, 1, improved);
589
590 old_word_res = best_perm_it.data();
591 // Even deep_copy doesn't copy the underlying WERD unless its combination
592 // flag is true!.
593 old_word_res->combination = true; // Kludge to force deep copy
594 current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
595 old_word_res->combination = false; // Undo kludge
596
597 break_noisiest_blob_word(current_perm);
598
599 while (best_score != PERFECT_WERDS && !current_perm.empty()) {
600 match_current_words(current_perm, row, block);
601 current_score = fp_eval_word_spacing(current_perm);
602 dump_words(current_perm, current_score, 2, improved);
603 if (current_score > best_score) {
604 best_perm.clear();
605 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
606 best_score = current_score;
607 improved = true;
608 }
609 if (current_score < PERFECT_WERDS) {
610 break_noisiest_blob_word(current_perm);
611 }
612 }
613 dump_words(best_perm, best_score, 3, improved);
614}
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:621
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:837

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT * page_res_it)

fix_rep_char(): The word is a repeated char (a leader). Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of spaces in between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1665 of file control.cpp.

1665 {
1666 WERD_RES *word_res = page_res_it->word();
1667 const WERD_CHOICE &word = *(word_res->best_choice);
1668
1669 // Find the frequency of each unique character in the word.
1670 SortHelper<UNICHAR_ID> rep_ch(word.length());
1671 for (unsigned i = 0; i < word.length(); ++i) {
1672 rep_ch.Add(word.unichar_id(i), 1);
1673 }
1674
1675 // Find the most frequent result.
1676 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1677 int max_count = rep_ch.MaxCount(&maxch_id);
1678 // Find the best exemplar of a classifier result for maxch_id.
1679 BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1680 if (best_choice == nullptr) {
1681 tprintf("Failed to find a choice for %s, occurring %d times\n",
1682 word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1683 return;
1684 }
1685 word_res->done = true;
1686
1687 // Just correct existing classification.
1688 CorrectRepcharChoices(best_choice, word_res);
1689 word_res->reject_map.initialise(word.length());
1690}

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW * row,
BLOCK * block 
)

Definition at line 545 of file fixspace.cpp.

545 {
546 WERD_RES *word_res;
547 WERD_RES_LIST sub_word_list;
548 WERD_RES_IT sub_word_list_it(&sub_word_list);
549 int16_t blob_index;
550 int16_t new_length;
551 float junk;
552
553 word_res = word_res_it.data();
554 if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
555 !word_res->word->flag(W_DONT_CHOP)) {
556 return;
557 }
558
559 blob_index = worst_noise_blob(word_res, &junk);
560 if (blob_index < 0) {
561 return;
562 }
563
564 if (debug_fix_space_level > 1) {
565 tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
566 }
567 word_res->word->rej_cblob_list()->sort(c_blob_comparator);
568 sub_word_list_it.add_after_stay_put(word_res_it.extract());
569 fix_noisy_space_list(sub_word_list, row, block);
570 new_length = sub_word_list.length();
571 word_res_it.add_list_before(&sub_word_list);
572 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
573 word_res_it.forward();
574 }
575}
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:577

◆ fixspace_thinks_word_done()

bool tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES * word)

Definition at line 514 of file fixspace.cpp.

514 {
515 if (word->done) {
516 return true;
517 }
518
519 /*
520 Use all the standard pass 2 conditions for mode 5 in set_done() in
521 reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
522 CARE WHETHER WE HAVE of/at on/an etc.
523*/
524 if (fixsp_done_mode > 0 &&
525 (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
526 fixsp_done_mode == 3) &&
527 (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
528 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
529 (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
530 (word->best_choice->permuter() == USER_DAWG_PERM) ||
531 (word->best_choice->permuter() == NUMBER_PERM))) {
532 return true;
533 } else {
534 return false;
535 }
536}
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:246
@ FREQ_DAWG_PERM
Definition: ratngs.h:247

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES * word)

Definition at line 660 of file reject.cpp.

660 {
661 WERD_CHOICE *best_choice = word_res->best_choice;
662 TBOX out_box;
663
664 if (!tessedit_flip_0O) {
665 return;
666 }
667
668 auto num_blobs = word_res->rebuild_word->NumBlobs();
669 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
670 TBLOB *blob = word_res->rebuild_word->blobs[i];
671 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
672 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
673 out_box = blob->bounding_box();
674 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
675 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
676 return; // Beware words with sub/superscripts
677 }
678 }
679 }
680 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
681 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
682 if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
683 unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
684 return; // 0 or O are not present/enabled in unicharset
685 }
686 for (unsigned i = 1; i < best_choice->length(); ++i) {
687 if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
688 /* A0A */
689 if ((i + 1) < best_choice->length() &&
690 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
691 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
692 best_choice->set_unichar_id(unichar_O, i);
693 }
694 /* A00A */
695 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
696 (i + 1) < best_choice->length() &&
697 (best_choice->unichar_id(i + 1) == unichar_0 ||
698 best_choice->unichar_id(i + 1) == unichar_O) &&
699 (i + 2) < best_choice->length() &&
700 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
701 best_choice->set_unichar_id(unichar_O, i);
702 i++;
703 }
704 /* AA0<non digit or end of word> */
705 if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
706 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
707 (((i + 1) < best_choice->length() &&
708 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
709 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
710 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
711 (i == best_choice->length() - 1))) {
712 best_choice->set_unichar_id(unichar_O, i);
713 }
714 /* 9O9 */
715 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
716 (i + 1) < best_choice->length() &&
717 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
718 best_choice->set_unichar_id(unichar_0, i);
719 }
720 /* 9OOO */
721 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
722 (i + 2) < best_choice->length() &&
723 (best_choice->unichar_id(i + 1) == unichar_0 ||
724 best_choice->unichar_id(i + 1) == unichar_O) &&
725 (best_choice->unichar_id(i + 2) == unichar_0 ||
726 best_choice->unichar_id(i + 2) == unichar_O)) {
727 best_choice->set_unichar_id(unichar_0, i);
728 best_choice->set_unichar_id(unichar_0, i + 1);
729 best_choice->set_unichar_id(unichar_0, i + 2);
730 i += 2;
731 }
732 /* 9OO<non upper> */
733 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
734 (i + 2) < best_choice->length() &&
735 (best_choice->unichar_id(i + 1) == unichar_0 ||
736 best_choice->unichar_id(i + 1) == unichar_O) &&
737 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
738 best_choice->set_unichar_id(unichar_0, i);
739 best_choice->set_unichar_id(unichar_0, i + 1);
740 i++;
741 }
742 /* 9O<non upper> */
743 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
744 (i + 1) < best_choice->length() &&
745 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
746 best_choice->set_unichar_id(unichar_0, i);
747 }
748 /* 9[.,]OOO.. */
749 if ((i > 1) &&
750 (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
751 word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
752 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
753 best_choice->unichar_id(i - 2) == unichar_O)) {
754 if (best_choice->unichar_id(i - 2) == unichar_O) {
755 best_choice->set_unichar_id(unichar_0, i - 2);
756 }
757 while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
758 best_choice->unichar_id(i) == unichar_0)) {
759 best_choice->set_unichar_id(unichar_0, i);
760 i++;
761 }
762 i--;
763 }
764 }
765 }
766}
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:768
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:772

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES * word)

Definition at line 602 of file reject.cpp.

602 {
603 WERD_CHOICE *best_choice = word_res->best_choice;
604 int prev_right = -9999;
605 int next_left;
606 TBOX out_box;
607 float aspect_ratio;
608
609 if (tessedit_lower_flip_hyphen <= 1) {
610 return;
611 }
612
613 auto num_blobs = word_res->rebuild_word->NumBlobs();
614 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
615 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
616 TBLOB *blob = word_res->rebuild_word->blobs[i];
617 out_box = blob->bounding_box();
618 if (i + 1 == num_blobs) {
619 next_left = 9999;
620 } else {
621 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
622 }
623 // Don't touch small or touching blobs - it is too dangerous.
624 if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
625 (out_box.right() < next_left)) {
626 aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
627 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
628 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
629 word_res->uch_set->contains_unichar_id(unichar_dash) &&
630 word_res->uch_set->get_enabled(unichar_dash)) {
631 /* Certain HYPHEN */
632 best_choice->set_unichar_id(unichar_dash, i);
633 if (word_res->reject_map[i].rejected()) {
634 word_res->reject_map[i].setrej_hyphen_accept();
635 }
636 }
637 if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
638 // Suspected HYPHEN
639 word_res->reject_map[i].setrej_hyphen();
640 }
641 } else if (best_choice->unichar_id(i) == unichar_dash) {
642 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
643 word_res->reject_map[i].setrej_hyphen_accept();
644 }
645 // Certain HYPHEN
646
647 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
648 // Suspected HYPHEN
649 word_res->reject_map[i].setrej_hyphen();
650 }
651 }
652 }
653 prev_right = out_box.right();
654 }
655}

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES * page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 2003 of file control.cpp.

2003 {
2004 PAGE_RES_IT page_res_it(page_res);
2005 WERD_RES *word; // current word
2006 STATS doc_fonts(0, font_table_size_ - 1); // font counters
2007
2008 // Gather font id statistics.
2009 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2010 word = page_res_it.word();
2011 if (word->fontinfo != nullptr) {
2012 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2013 }
2014 if (word->fontinfo2 != nullptr) {
2015 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2016 }
2017 }
2018 int16_t doc_font; // modal font
2019 int8_t doc_font_count; // modal font
2020 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2021 if (doc_font_count == 0) {
2022 return;
2023 }
2024 // Get the modal font pointer.
2025 const FontInfo *modal_font = nullptr;
2026 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2027 word = page_res_it.word();
2028 if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2029 modal_font = word->fontinfo;
2030 break;
2031 }
2032 if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2033 modal_font = word->fontinfo2;
2034 break;
2035 }
2036 }
2037 ASSERT_HOST(modal_font != nullptr);
2038
2039 // Assign modal font to weak words.
2040 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2041 word = page_res_it.word();
2042 const int length = word->best_choice->length();
2043
2044 const int count = word->fontinfo_id_count;
2045 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2046 word->fontinfo = modal_font;
2047 // Counts only get 1 as it came from the doc.
2048 word->fontinfo_id_count = 1;
2049 }
2050 }
2051}

◆ fp_eval_word_spacing()

int16_t tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

fp_eval_word_spacing() Evaluation function for fixed pitch word lists.

Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars.

Definition at line 837 of file fixspace.cpp.

837 {
838 WERD_RES_IT word_it(&word_res_list);
839 WERD_RES *word;
840 int16_t score = 0;
841 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
842
843 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
844 word = word_it.data();
845 if (word->rebuild_word == nullptr) {
846 continue; // Can't handle cube words.
847 }
848 if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
849 word->best_choice->permuter() == FREQ_DAWG_PERM ||
850 word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
851 auto num_blobs = word->rebuild_word->NumBlobs();
852 UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
853 for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
854 TBLOB *blob = word->rebuild_word->blobs[i];
855 if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
856 score -= 1; // penalise possibly erroneous non-space
857 } else if (word->reject_map[i].accepted()) {
858 score++;
859 }
860 }
861 }
862 }
863 if (score < 0) {
864 score = 0;
865 }
866 return score;
867}
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:772
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
bool  ok_dict_word 
)

Definition at line 616 of file docqual.cpp.

616 {
617 enum STATES {
618 JUNK,
619 FIRST_UPPER,
620 FIRST_LOWER,
621 FIRST_NUM,
622 SUBSEQUENT_UPPER,
623 SUBSEQUENT_LOWER,
624 SUBSEQUENT_NUM
625 };
626 const char *str = word->best_choice->unichar_string().c_str();
627 const char *lengths = word->best_choice->unichar_lengths().c_str();
628 STATES state = JUNK;
629 int len = 0;
630 int isolated_digits = 0;
631 int isolated_alphas = 0;
632 int bad_char_count = 0;
633 int tess_rejs = 0;
634 int dodgy_chars = 0;
635 int ok_chars;
636 UNICHAR_ID last_char = -1;
637 int alpha_repetition_count = 0;
638 int longest_alpha_repetition_count = 0;
639 int longest_lower_run_len = 0;
640 int lower_string_count = 0;
641 int longest_upper_run_len = 0;
642 int upper_string_count = 0;
643 int total_alpha_count = 0;
644 int total_digit_count = 0;
645
646 for (; *str != '\0'; str += *(lengths++)) {
647 len++;
648 if (word->uch_set->get_isupper(str, *lengths)) {
649 total_alpha_count++;
650 switch (state) {
651 case SUBSEQUENT_UPPER:
652 case FIRST_UPPER:
653 state = SUBSEQUENT_UPPER;
654 upper_string_count++;
655 if (longest_upper_run_len < upper_string_count) {
656 longest_upper_run_len = upper_string_count;
657 }
658 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
659 alpha_repetition_count++;
660 if (longest_alpha_repetition_count < alpha_repetition_count) {
661 longest_alpha_repetition_count = alpha_repetition_count;
662 }
663 } else {
664 last_char = word->uch_set->unichar_to_id(str, *lengths);
665 alpha_repetition_count = 1;
666 }
667 break;
668 case FIRST_NUM:
669 isolated_digits++;
670 // Fall through.
671 default:
672 state = FIRST_UPPER;
673 last_char = word->uch_set->unichar_to_id(str, *lengths);
674 alpha_repetition_count = 1;
675 upper_string_count = 1;
676 break;
677 }
678 } else if (word->uch_set->get_islower(str, *lengths)) {
679 total_alpha_count++;
680 switch (state) {
681 case SUBSEQUENT_LOWER:
682 case FIRST_LOWER:
683 state = SUBSEQUENT_LOWER;
684 lower_string_count++;
685 if (longest_lower_run_len < lower_string_count) {
686 longest_lower_run_len = lower_string_count;
687 }
688 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
689 alpha_repetition_count++;
690 if (longest_alpha_repetition_count < alpha_repetition_count) {
691 longest_alpha_repetition_count = alpha_repetition_count;
692 }
693 } else {
694 last_char = word->uch_set->unichar_to_id(str, *lengths);
695 alpha_repetition_count = 1;
696 }
697 break;
698 case FIRST_NUM:
699 isolated_digits++;
700 // Fall through.
701 default:
702 state = FIRST_LOWER;
703 last_char = word->uch_set->unichar_to_id(str, *lengths);
704 alpha_repetition_count = 1;
705 lower_string_count = 1;
706 break;
707 }
708 } else if (word->uch_set->get_isdigit(str, *lengths)) {
709 total_digit_count++;
710 switch (state) {
711 case FIRST_NUM:
712 state = SUBSEQUENT_NUM;
713 case SUBSEQUENT_NUM:
714 break;
715 case FIRST_UPPER:
716 case FIRST_LOWER:
717 isolated_alphas++;
718 // Fall through.
719 default:
720 state = FIRST_NUM;
721 break;
722 }
723 } else {
724 if (*lengths == 1 && *str == ' ') {
725 tess_rejs++;
726 } else {
727 bad_char_count++;
728 }
729 switch (state) {
730 case FIRST_NUM:
731 isolated_digits++;
732 break;
733 case FIRST_UPPER:
734 case FIRST_LOWER:
735 isolated_alphas++;
736 default:
737 break;
738 }
739 state = JUNK;
740 }
741 }
742
743 switch (state) {
744 case FIRST_NUM:
745 isolated_digits++;
746 break;
747 case FIRST_UPPER:
748 case FIRST_LOWER:
749 isolated_alphas++;
750 default:
751 break;
752 }
753
754 if (crunch_include_numerals) {
755 total_alpha_count += total_digit_count - isolated_digits;
756 }
757
758 if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
759 longest_alpha_repetition_count < crunch_long_repetitions) {
760 if ((crunch_accept_ok &&
761 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
762 longest_lower_run_len > crunch_leave_lc_strings ||
763 longest_upper_run_len > crunch_leave_uc_strings) {
764 return G_NEVER_CRUNCH;
765 }
766 }
767 if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
768 (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
769 word->best_choice->permuter() == FREQ_DAWG_PERM ||
770 word->best_choice->permuter() == USER_DAWG_PERM ||
771 word->best_choice->permuter() == NUMBER_PERM ||
772 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
773 return G_OK;
774 }
775
776 ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
777
778 if (crunch_debug > 3) {
779 tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
780 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
781 isolated_digits, isolated_alphas, tess_rejs);
782 }
783 if (bad_char_count == 0 && tess_rejs == 0 &&
784 (len > isolated_digits + isolated_alphas || len <= 2)) {
785 return G_OK;
786 }
787
788 if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
789 return G_TERRIBLE;
790 }
791
792 if (len > 4) {
793 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
794 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
795 return G_DODGY;
796 } else {
797 return G_OK;
798 }
799 } else {
800 dodgy_chars = 2 * tess_rejs + bad_char_count;
801 if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
802 return G_DODGY;
803 } else {
804 return G_OK;
805 }
806 }
807}
@ G_TERRIBLE
Definition: docqual.h:30
@ G_NEVER_CRUNCH
Definition: docqual.h:30
@ G_OK
Definition: docqual.h:30
@ G_DODGY
Definition: docqual.h:30

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 247 of file output.cpp.

247 { // what char is repeated?
248 int i;
249 for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
250 ;
251 }
252
253 if (i < word->reject_map.length()) {
254 return word->best_choice->unichar_id(i);
255 } else {
256 return word->uch_set->unichar_to_id(unrecognised_char.c_str());
257 }
258}

◆ get_sub_lang()

Tesseract * tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 286 of file tesseractclass.h.

286 {
287 return sub_langs_[index];
288 }

◆ getDict()

Dict & tesseract::Tesseract::getDict ( )
overridevirtual

Reimplemented from tesseract::Classify.

Definition at line 480 of file tesseractclass.cpp.

480 {
481 if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang()) {
482 if (lstm_recognizer_ && lstm_recognizer_->GetDict()) {
483 return *lstm_recognizer_->GetDict();
484 }
485 }
486 return Classify::getDict();
487}
bool AnyLSTMLang() const
virtual Dict & getDict()
Definition: classify.h:98
const Dict * GetDict() const

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData ( const TBOX &  line_box,
const std::vector< TBOX > &  boxes,
const std::vector< std::string > &  texts,
int  start_box,
int  end_box,
const BLOCK &  block 
)

Definition at line 133 of file linerec.cpp.

135 {
136 TBOX revised_box;
137 ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);
138 if (image_data == nullptr) {
139 return nullptr;
140 }
141 image_data->set_page_number(applybox_page);
142 // Copy the boxes and shift them so they are relative to the image.
143 FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
144 ICOORD shift = -revised_box.botleft();
145 std::vector<TBOX> line_boxes;
146 std::vector<std::string> line_texts;
147 for (int b = start_box; b < end_box; ++b) {
148 TBOX box = boxes[b];
149 box.rotate(block_rotation);
150 box.move(shift);
151 line_boxes.push_back(box);
152 line_texts.push_back(texts[b]);
153 }
154 std::vector<int> page_numbers(line_boxes.size(), applybox_page);
155 image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156 return image_data;
157}
const int kImagePadding
Definition: imagedata.h:39
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:165

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage ( const TBOX &  box,
const BLOCK &  block,
int  padding,
TBOX *  revised_box 
) const

Definition at line 165 of file linerec.cpp.

166 {
167 TBOX wbox = box;
168 wbox.pad(padding, padding);
169 *revised_box = wbox;
170 // Number of clockwise 90 degree rotations needed to get back to tesseract
171 // coords from the clipped image.
172 int num_rotations = 0;
173 if (block.re_rotation().y() > 0.0f) {
174 num_rotations = 1;
175 } else if (block.re_rotation().x() < 0.0f) {
176 num_rotations = 2;
177 } else if (block.re_rotation().y() < 0.0f) {
178 num_rotations = 3;
179 }
180 // Handle two cases automatically: 1 the box came from the block, 2 the box
181 // came from a box file, and refers to the image, which the block may not.
182 if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
183 revised_box->rotate(block.re_rotation());
184 }
185 // Now revised_box always refers to the image.
186 // BestPix is never colormapped, but may be of any depth.
187 Image pix = BestPix();
188 int width = pixGetWidth(pix);
189 int height = pixGetHeight(pix);
190 TBOX image_box(0, 0, width, height);
191 // Clip to image bounds;
192 *revised_box &= image_box;
193 if (revised_box->null_box()) {
194 return nullptr;
195 }
196 Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
197 revised_box->height());
198 Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
199 boxDestroy(&clip_box);
200 if (box_pix == nullptr) {
201 return nullptr;
202 }
203 if (num_rotations > 0) {
204 Image rot_pix = pixRotateOrth(box_pix, num_rotations);
205 box_pix.destroy();
206 box_pix = rot_pix;
207 }
208 // Convert sub-8-bit images to 8 bit.
209 int depth = pixGetDepth(box_pix);
210 if (depth < 8) {
211 Image grey;
212 grey = pixConvertTo8(box_pix, false);
213 box_pix.destroy();
214 box_pix = grey;
215 }
216 bool vertical_text = false;
217 if (num_rotations > 0) {
218 // Rotated the clipped revised box back to internal coordinates.
219 FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
220 revised_box->rotate(rotation);
221 if (num_rotations != 2) {
222 vertical_text = true;
223 }
224 }
225 return new ImageData(vertical_text, box_pix);
226}

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES *  word,
int *  num_rebuilt_leading,
ScriptPos *  leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos *  trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging)
[out] leading_certainty: the worst certainty in the leading blobs.
[out] num_rebuilt_trailing: the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging)
[out] trailing_certainty: the worst certainty in the trailing blobs.
[out] avg_certainty: the average certainty of "normal" blobs in the word.
[out] unlikely_threshold: the threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 250 of file superscript.cpp.

254 {
255 *avg_certainty = *unlikely_threshold = 0.0f;
256 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
257 *leading_certainty = *trailing_certainty = 0.0f;
258
259 int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
260 int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
261
262 // Step one: Get an average certainty for "normally placed" characters.
263
264 // Counts here are of blobs in the rebuild_word / unichars in best_choice.
265 *leading_pos = *trailing_pos = SP_NORMAL;
266 int leading_outliers = 0;
267 int trailing_outliers = 0;
268 int num_normal = 0;
269 float normal_certainty_total = 0.0f;
270 float worst_normal_certainty = 0.0f;
271 ScriptPos last_pos = SP_NORMAL;
272 int num_blobs = word->rebuild_word->NumBlobs();
273 for (int b = 0; b < num_blobs; ++b) {
274 TBOX box = word->rebuild_word->blobs[b]->bounding_box();
275 ScriptPos pos = SP_NORMAL;
276 if (box.bottom() >= super_y_bottom) {
277 pos = SP_SUPERSCRIPT;
278 } else if (box.top() <= sub_y_top) {
279 pos = SP_SUBSCRIPT;
280 }
281 if (pos == SP_NORMAL) {
282 if (word->best_choice->unichar_id(b) != 0) {
283 float char_certainty = word->best_choice->certainty(b);
284 if (char_certainty < worst_normal_certainty) {
285 worst_normal_certainty = char_certainty;
286 }
287 num_normal++;
288 normal_certainty_total += char_certainty;
289 }
290 if (trailing_outliers == b) {
291 leading_outliers = trailing_outliers;
292 *leading_pos = last_pos;
293 }
294 trailing_outliers = 0;
295 } else {
296 if (last_pos == pos) {
297 trailing_outliers++;
298 } else {
299 trailing_outliers = 1;
300 }
301 }
302 last_pos = pos;
303 }
304 *trailing_pos = last_pos;
305 if (num_normal >= 3) { // throw out the worst as an outlier.
306 num_normal--;
307 normal_certainty_total -= worst_normal_certainty;
308 }
309 if (num_normal > 0) {
310 *avg_certainty = normal_certainty_total / num_normal;
311 *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
312 }
313 if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
314 return;
315 }
316
317 // Step two: Try to split off bits of the word that are both outliers
318 // and have much lower certainty than average
319 // Calculate num_leading and leading_certainty.
320 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
321 (*num_rebuilt_leading)++) {
322 float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
323 if (char_certainty > *unlikely_threshold) {
324 break;
325 }
326 if (char_certainty < *leading_certainty) {
327 *leading_certainty = char_certainty;
328 }
329 }
330
331 // Calculate num_trailing and trailing_certainty.
332 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
333 *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
334 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
335 float char_certainty = word->best_choice->certainty(blob_idx);
336 if (char_certainty > *unlikely_threshold) {
337 break;
338 }
339 if (char_certainty < *trailing_certainty) {
340 *trailing_certainty = char_certainty;
341 }
342 }
343}
@ SP_SUBSCRIPT
Definition: ratngs.h:254
@ SP_NORMAL
Definition: ratngs.h:254
@ SP_SUPERSCRIPT
Definition: ratngs.h:254

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 260 of file tesseractclass.h.

260 {
261 return pixGetHeight(pix_binary_);
262 }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 257 of file tesseractclass.h.

257 {
258 return pixGetWidth(pix_binary_);
259 }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const char *  filename)

Definition at line 36 of file recogtraining.cpp.

36 {
37 if (tessedit_ambigs_training) {
38 tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39 tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
40 // Explore all segmentations.
41 getDict().stopper_no_acceptable_choices.set_value(true);
42 }
43
44 std::string output_fname = filename;
45 const char *lastdot = strrchr(output_fname.c_str(), '.');
46 if (lastdot != nullptr) {
47 output_fname[lastdot - output_fname.c_str()] = '\0';
48 }
49 output_fname += ".txt";
50 FILE *output_file = fopen(output_fname.c_str(), "a+");
51 if (output_file == nullptr) {
52 tprintf("Error: Could not open file %s\n", output_fname.c_str());
53 ASSERT_HOST(output_file);
54 }
55 return output_file;
56}
Dict & getDict() override

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract ( const std::string &  arg0,
const std::string &  textbase,
const std::string &  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const std::vector< std::string > *  vars_vec,
const std::vector< std::string > *  vars_values,
bool  set_only_non_debug_params,
TessdataManager *  mgr 
)

Definition at line 288 of file tessedit.cpp.

292 {
293 std::vector<std::string> langs_to_load;
294 std::vector<std::string> langs_not_to_load;
295 ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296
297 for (auto *lang : sub_langs_) {
298 delete lang;
299 }
300
301 // Set the basename, compute the data directory.
302 main_setup(arg0, textbase);
303
304 sub_langs_.clear();
305 // Find the first loadable lang and load into this.
306 // Add any languages that this language requires
307 bool loaded_primary = false;
308 // Load the rest into sub_langs_.
309 // WARNING: A range based for loop does not work here because langs_to_load
310 // might be changed in the loop when a new submodel is found.
311 for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
312 auto &lang_to_load = langs_to_load[lang_index];
313 if (!IsStrInList(lang_to_load, langs_not_to_load)) {
314 const char *lang_str = lang_to_load.c_str();
315 Tesseract *tess_to_init;
316 if (!loaded_primary) {
317 tess_to_init = this;
318 } else {
319 tess_to_init = new Tesseract;
320 tess_to_init->main_setup(arg0, textbase);
321 }
322
323 int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
324 configs_size, vars_vec, vars_values,
325 set_only_non_debug_params, mgr);
326 // Forget that language, but keep any reader we were given.
327 mgr->Clear();
328
329 if (!loaded_primary) {
330 if (result < 0) {
331 tprintf("Failed loading language '%s'\n", lang_str);
332 } else {
333 ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
334 &langs_not_to_load);
335 loaded_primary = true;
336 }
337 } else {
338 if (result < 0) {
339 tprintf("Failed loading language '%s'\n", lang_str);
340 delete tess_to_init;
341 } else {
342 sub_langs_.push_back(tess_to_init);
343 // Add any languages that this language requires
344 ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
345 &langs_not_to_load);
346 }
347 }
348 }
349 }
350 if (!loaded_primary && !langs_to_load.empty()) {
351 tprintf("Tesseract couldn't load any languages!\n");
352 return -1; // Couldn't load any language!
353 }
354#ifndef DISABLED_LEGACY_ENGINE
355 if (!sub_langs_.empty()) {
356 // In multilingual mode word ratings have to be directly comparable,
357 // so use the same language model weights for all languages:
358 // use the primary language's params model if
359 // tessedit_use_primary_params_model is set,
360 // otherwise use default language model weights.
361 if (tessedit_use_primary_params_model) {
362 for (auto &sub_lang : sub_langs_) {
363 sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
364 }
365 tprintf("Using params model of the primary language\n");
366 } else {
367 this->language_model_->getParamsModel().Clear();
368 for (auto &sub_lang : sub_langs_) {
369 sub_lang->language_model_->getParamsModel().Clear();
370 }
371 }
372 }
373
374 SetupUniversalFontIds();
375#endif // ndef DISABLED_LEGACY_ENGINE
376 return 0;
377}
void ParseLanguageString(const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
Definition: tessedit.cpp:244
void SetupUniversalFontIds()
Definition: tessedit.cpp:436
void main_setup(const std::string &argv0, const std::string &basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: ccutil.cpp:46
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract ( const std::string &  datapath,
const std::string &  language,
OcrEngineMode  oem 
)
inline

Definition at line 500 of file tesseractclass.h.

500 {
501 TessdataManager mgr;
502 return init_tesseract(datapath, {}, language, oem, nullptr, 0, nullptr, nullptr, false, &mgr);
503 }
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:288

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal ( const std::string &  arg0,
const std::string &  textbase,
const std::string &  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const std::vector< std::string > *  vars_vec,
const std::vector< std::string > *  vars_values,
bool  set_only_non_debug_params,
TessdataManager *  mgr 
)

Definition at line 395 of file tessedit.cpp.

400 {
401 if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
402 vars_values, set_only_non_debug_params, mgr)) {
403 return -1;
404 }
405 if (tessedit_init_config_only) {
406 return 0;
407 }
408 // If only LSTM will be used, skip loading Tesseract classifier's
409 // pre-trained templates and dictionary.
410 bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
411 program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
412 return 0; // Normal exit
413}
bool init_tesseract_lang_data(const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:76
void program_editup(const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:39

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data ( const std::string &  arg0,
const std::string &  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const std::vector< std::string > *  vars_vec,
const std::vector< std::string > *  vars_values,
bool  set_only_non_debug_params,
TessdataManager *  mgr 
)

Definition at line 76 of file tessedit.cpp.

81 {
82 // Set the language data path prefix
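83 lang = !language.empty() ? language : "eng";
84 language_data_path_prefix = datadir;
85 language_data_path_prefix += lang;
86 language_data_path_prefix += ".";
87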
88 // Initialize TessdataManager.
89 std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
90 if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
91 tprintf("Error opening data file %s\n", tessdata_path.c_str());
92 tprintf(
93 "Please make sure the TESSDATA_PREFIX environment variable is set"
94 " to your \"tessdata\" directory.\n");
95 return false;
96 }
97#ifdef DISABLED_LEGACY_ENGINE
98 tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
99#else
100 if (oem == OEM_DEFAULT) {
101 // Set the engine mode from availability, which can then be overridden by
102 // the config file when we read it below.
103 if (!mgr->IsLSTMAvailable()) {
104 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
105 } else if (!mgr->IsBaseAvailable()) {
106 tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
107 } else {
108 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
109 }
110 }
111#endif // ndef DISABLED_LEGACY_ENGINE
112
113 // If a language specific config file (lang.config) exists, load it in.
114 TFile fp;
115 if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
116 ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
117 }
118
119 SetParamConstraint set_params_constraint =
120 set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
121 // Load tesseract variables from config files. This is done after loading
122 // language-specific variables from [lang].traineddata file, so that custom
123 // config files can override values in [lang].traineddata file.
124 for (int i = 0; i < configs_size; ++i) {
125 read_config_file(configs[i], set_params_constraint);
126 }
127
128 // Set params specified in vars_vec (done after setting params from config
129 // files, so that params in vars_vec can override those from files).
130 if (vars_vec != nullptr && vars_values != nullptr) {
131 for (unsigned i = 0; i < vars_vec->size(); ++i) {
132 if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
133 set_params_constraint, this->params())) {
134 tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
135 }
136 }
137 }
138
139 if (!tessedit_write_params_to_file.empty()) {
140 FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
141 if (params_file != nullptr) {
142 ParamUtils::PrintParams(params_file, this->params());
143 fclose(params_file);
144 } else {
145 tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
146 }
147 }
148
149#ifndef DISABLED_LEGACY_ENGINE
150 // Determine which ocr engine(s) should be loaded and used for recognition.
151 if (oem != OEM_DEFAULT) {
152 tessedit_ocr_engine_mode.set_value(oem);
153 }
154#endif
155
156 // If we are only loading the config file (and so not planning on doing any
157 // recognition) then there's nothing else do here.
158 if (tessedit_init_config_only) {
159 return true;
160 }
161
162// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
163// which engine-specific data files need to be loaded. If LSTM_ONLY is
164// requested, the base Tesseract files are *Not* required.
165#ifdef DISABLED_LEGACY_ENGINE
166 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
167#else
168 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
169 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
170#endif // ndef DISABLED_LEGACY_ENGINE
171 if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
172 lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
173 ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
174 } else {
175 tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
176 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
177 }
178 }
179
180 // Load the unicharset
181 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
182 // Avoid requiring a unicharset when we aren't running base tesseract.
183 unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
184 }
185#ifndef DISABLED_LEGACY_ENGINE
186 else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
187 tprintf(
188 "Error: Tesseract (legacy) engine requested, but components are "
189 "not present in %s!!\n",
190 tessdata_path.c_str());
191 return false;
192 }
193#endif // ndef DISABLED_LEGACY_ENGINE
194 if (unicharset.size() > MAX_NUM_CLASSES) {
195 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
196 return false;
197 }
198 right_to_left_ = unicharset.major_right_to_left();
199
200#ifndef DISABLED_LEGACY_ENGINE
201
202 // Setup initial unichar ambigs table and read universal ambigs.
203 UNICHARSET encoder_unicharset;
204 encoder_unicharset.CopyFrom(unicharset);
205 unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
206 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
207
208 if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
209 unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
210 use_ambigs_for_adaption, &unicharset);
211 }
212
213 // Init ParamsModel.
214 // Load pass1 and pass2 weights (for now these two sets are the same, but in
215 // the future separate sets of weights can be generated).
216 for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
217 language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
218 if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
219 if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
220 return false;
221 }
222 }
223 }
224#endif // ndef DISABLED_LEGACY_ENGINE
225
226 return true;
227}
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
const char * p
SetParamConstraint
Definition: params.h:39
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:42
@ SET_PARAM_CONSTRAINT_NONE
Definition: params.h:40
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_LANG_CONFIG
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:46
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:64
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:51
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:72
std::string language_data_path_prefix
Definition: ccutil.h:60
std::string datadir
Definition: ccutil.h:57
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:63
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:51
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:81
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
bool major_right_to_left() const
Definition: unicharset.cpp:983
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:438
size_t size() const
Definition: unicharset.h:355
bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr)

◆ INT_VAR_H() [1/43]

tesseract::Tesseract::INT_VAR_H ( applybox_debug  )

◆ INT_VAR_H() [2/43]

tesseract::Tesseract::INT_VAR_H ( applybox_page  )

◆ INT_VAR_H() [3/43]

tesseract::Tesseract::INT_VAR_H ( bidi_debug  )

◆ INT_VAR_H() [4/43]

tesseract::Tesseract::INT_VAR_H ( crunch_debug  )

◆ INT_VAR_H() [5/43]

tesseract::Tesseract::INT_VAR_H ( crunch_leave_lc_strings  )

◆ INT_VAR_H() [6/43]

tesseract::Tesseract::INT_VAR_H ( crunch_leave_uc_strings  )

◆ INT_VAR_H() [7/43]

tesseract::Tesseract::INT_VAR_H ( crunch_long_repetitions  )

◆ INT_VAR_H() [8/43]

tesseract::Tesseract::INT_VAR_H ( crunch_pot_indicators  )

◆ INT_VAR_H() [9/43]

tesseract::Tesseract::INT_VAR_H ( crunch_rating_max  )

◆ INT_VAR_H() [10/43]

tesseract::Tesseract::INT_VAR_H ( debug_fix_space_level  )

◆ INT_VAR_H() [11/43]

tesseract::Tesseract::INT_VAR_H ( debug_noise_removal  )

◆ INT_VAR_H() [12/43]

tesseract::Tesseract::INT_VAR_H ( debug_x_ht_level  )

◆ INT_VAR_H() [13/43]

tesseract::Tesseract::INT_VAR_H ( fixsp_done_mode  )

◆ INT_VAR_H() [14/43]

tesseract::Tesseract::INT_VAR_H ( fixsp_non_noise_limit  )

◆ INT_VAR_H() [15/43]

tesseract::Tesseract::INT_VAR_H ( jpg_quality  )

◆ INT_VAR_H() [16/43]

tesseract::Tesseract::INT_VAR_H ( lstm_choice_iterations  )

◆ INT_VAR_H() [17/43]

tesseract::Tesseract::INT_VAR_H ( lstm_choice_mode  )

◆ INT_VAR_H() [18/43]

tesseract::Tesseract::INT_VAR_H ( min_characters_to_try  )

◆ INT_VAR_H() [19/43]

tesseract::Tesseract::INT_VAR_H ( min_sane_x_ht_pixels  )

◆ INT_VAR_H() [20/43]

tesseract::Tesseract::INT_VAR_H ( multilang_debug_level  )

◆ INT_VAR_H() [21/43]

tesseract::Tesseract::INT_VAR_H ( noise_maxperblob  )

◆ INT_VAR_H() [22/43]

tesseract::Tesseract::INT_VAR_H ( noise_maxperword  )

◆ INT_VAR_H() [23/43]

tesseract::Tesseract::INT_VAR_H ( ocr_devanagari_split_strategy  )

◆ INT_VAR_H() [24/43]

tesseract::Tesseract::INT_VAR_H ( pageseg_devanagari_split_strategy  )

◆ INT_VAR_H() [25/43]

tesseract::Tesseract::INT_VAR_H ( paragraph_debug_level  )

◆ INT_VAR_H() [26/43]

tesseract::Tesseract::INT_VAR_H ( quality_min_initial_alphas_reqd  )

◆ INT_VAR_H() [27/43]

tesseract::Tesseract::INT_VAR_H ( superscript_debug  )

◆ INT_VAR_H() [28/43]

tesseract::Tesseract::INT_VAR_H ( suspect_level  )

◆ INT_VAR_H() [29/43]

tesseract::Tesseract::INT_VAR_H ( suspect_short_words  )

◆ INT_VAR_H() [30/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_bigram_debug  )

◆ INT_VAR_H() [31/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_font_id  )

◆ INT_VAR_H() [32/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_image_border  )

◆ INT_VAR_H() [33/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_ocr_engine_mode  )

◆ INT_VAR_H() [34/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_page_number  )

◆ INT_VAR_H() [35/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_pageseg_mode  )

◆ INT_VAR_H() [36/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_parallelize  )

◆ INT_VAR_H() [37/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_preserve_min_wd_len  )

◆ INT_VAR_H() [38/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_reject_mode  )

◆ INT_VAR_H() [39/43]

tesseract::Tesseract::INT_VAR_H ( tessedit_tess_adaption_mode  )

◆ INT_VAR_H() [40/43]

tesseract::Tesseract::INT_VAR_H ( thresholding_method  )

◆ INT_VAR_H() [41/43]

tesseract::Tesseract::INT_VAR_H ( user_defined_dpi  )

◆ INT_VAR_H() [42/43]

tesseract::Tesseract::INT_VAR_H ( x_ht_acceptance_tolerance  )

◆ INT_VAR_H() [43/43]

tesseract::Tesseract::INT_VAR_H ( x_ht_min_change  )

◆ join_words()

void tesseract::Tesseract::join_words ( WERD_RES *  word,
WERD_RES *  word2,
BlamerBundle *  orig_bb 
) const

Definition at line 216 of file tfacepp.cpp.

216 {
217 TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
218 TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
219 // Tack the word2 outputs onto the end of the word outputs.
220 word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
221 word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
222 word2->chopped_word->blobs.clear();
223 word2->rebuild_word->blobs.clear();
224 TPOINT split_pt;
225 split_pt.x = (prev_box.right() + blob_box.left()) / 2;
226 split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
227 // Move the word2 seams onto the end of the word1 seam_array.
228 // Since the seam list is one element short, an empty seam marking the
229 // end of the last blob in the first word is needed first.
230 word->seam_array.push_back(new SEAM(0.0f, split_pt));
231 word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
232 word2->seam_array.clear();
233 // Fix widths and gaps.
234 word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
235 word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
236 // Fix the ratings matrix.
237 int rat1 = word->ratings->dimension();
238 int rat2 = word2->ratings->dimension();
239 word->ratings->AttachOnCorner(word2->ratings);
240 ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
241 word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
242 // Append the word choices.
243 *word->raw_choice += *word2->raw_choice;
244
245 // How many alt choices from each should we try to get?
246 const int kAltsPerPiece = 2;
247 // When do we start throwing away extra alt choices?
248 const int kTooManyAltChoices = 100;
249
250 // Construct the cartesian product of the best_choices of word(1) and word2.
251 WERD_CHOICE_LIST joined_choices;
252 WERD_CHOICE_IT jc_it(&joined_choices);
253 WERD_CHOICE_IT bc1_it(&word->best_choices);
254 WERD_CHOICE_IT bc2_it(&word2->best_choices);
255 int num_word1_choices = word->best_choices.length();
256 int total_joined_choices = num_word1_choices;
257 // Nota Bene: For the main loop here, we operate only on the 2nd and greater
258 // word2 choices, and put them in the joined_choices list. The 1st word2
259 // choice gets added to the original word1 choices in-place after we have
260 // finished with them.
261 int bc2_index = 1;
262 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
263 if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
264 break;
265 }
266 int bc1_index = 0;
267 for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
268 if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
269 break;
270 }
271 auto *wc = new WERD_CHOICE(*bc1_it.data());
272 *wc += *bc2_it.data();
273 jc_it.add_after_then_move(wc);
274 ++total_joined_choices;
275 }
276 }
277 // Now that we've filled in as many alternates as we want, paste the best
278 // choice for word2 onto the original word alt_choices.
279 bc1_it.move_to_first();
280 bc2_it.move_to_first();
281 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
282 *bc1_it.data() += *bc2_it.data();
283 }
284 bc1_it.move_to_last();
285 bc1_it.add_list_after(&joined_choices);
286
287 // Restore the pointer to original blamer bundle and combine blamer
288 // information recorded in the splits.
289 if (orig_bb != nullptr) {
290 orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
291 delete word->blamer_bundle;
292 word->blamer_bundle = orig_bb;
293 }
294 word->SetupBoxWord();
295 word->reject_map.initialise(word->box_word->length());
296 delete word2;
297}
@ TPOINT

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK &  block,
ROW *  row,
WERD_RES *  word,
PointerVector< WERD_RES > *  words 
)

Definition at line 230 of file linerec.cpp.

231 {
232 TBOX word_box = word->word->bounding_box();
233 // Get the word image - no frills.
234 if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
235 // In single word mode, use the whole image without any other row/word
236 // interpretation.
237 word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238 } else {
239 float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240 if (baseline + row->descenders() < word_box.bottom()) {
241 word_box.set_bottom(baseline + row->descenders());
242 }
243 if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
244 word_box.set_top(baseline + row->x_height() + row->ascenders());
245 }
246 }
247 ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
248 if (im_data == nullptr) {
249 return;
250 }
251
252 bool do_invert = tessedit_do_invert;
253 float threshold = do_invert ? double(invert_threshold) : 0.0f;
254 lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,
255 kWorstDictCertainty / kCertaintyScale, word_box, words,
256 lstm_choice_mode, lstm_choice_iterations);
257 delete im_data;
258 SearchWords(words);
259}
const float kWorstDictCertainty
Definition: linerec.cpp:35
@ PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:168
const float kCertaintyScale
Definition: linerec.cpp:33
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:264
void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map ( WERD_RES *  word,
ROW *  row,
int16_t  pass 
)

Definition at line 96 of file reject.cpp.

96 {
97 flip_0O(word);
98 check_debug_pt(word, -1); // For trap only
99 set_done(word, pass); // Set acceptance
100 word->reject_map.initialise(word->best_choice->unichar_lengths().length());
101 reject_blanks(word);
102 /*
103 0: Rays original heuristic - the baseline
104 */
105 if (tessedit_reject_mode == 0) {
106 if (!word->done) {
107 reject_poor_matches(word);
108 }
109 } else if (tessedit_reject_mode == 5) {
110 /*
111 5: Reject I/1/l from words where there is no strong contextual confirmation;
112 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
113 and the whole of any words which are very small
114 */
115 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
116 word->reject_map.rej_word_small_xht();
117 } else {
118 one_ell_conflict(word, true);
119 /*
120 Originally the code here just used the done flag. Now I have duplicated
121 and unpacked the conditions for setting the done flag so that each
122 mechanism can be turned on or off independently. This works WITHOUT
123 affecting the done flag setting.
124*/
125 if (rej_use_tess_accepted && !word->tess_accepted) {
126 word->reject_map.rej_word_not_tess_accepted();
127 }
128
129 if (rej_use_tess_blanks &&
130 (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
131 word->reject_map.rej_word_contains_blanks();
132 }
133
134 WERD_CHOICE *best_choice = word->best_choice;
135 if (rej_use_good_perm) {
136 if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
137 best_choice->permuter() == FREQ_DAWG_PERM ||
138 best_choice->permuter() == USER_DAWG_PERM) &&
139 (!rej_use_sensible_wd ||
140 acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
141 best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
142 // PASSED TEST
143 } else if (best_choice->permuter() == NUMBER_PERM) {
144 if (rej_alphas_in_number_perm) {
145 for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
146 offset += best_choice->unichar_lengths()[i++]) {
147 if (word->reject_map[i].accepted() &&
148 word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
149 best_choice->unichar_lengths()[i])) {
150 word->reject_map[i].setrej_bad_permuter();
151 }
152 // rej alpha
153 }
154 }
155 } else {
156 word->reject_map.rej_word_bad_permuter();
157 }
158 }
159 /* Ambig word rejection was here once !!*/
160 }
161 } else {
162 tprintf("BAD tessedit_reject_mode\n");
163 ASSERT_HOST("Fatal error encountered!" == nullptr);
164 }
165
166 if (tessedit_image_border > -1) {
167 reject_edge_blobs(word);
168 }
169
170 check_debug_pt(word, 10);
171 if (tessedit_rejection_debug) {
172 tprintf("Permuter Type = %d\n", word->best_choice->permuter());
173 tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
174 word->best_choice->rating());
175 tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
176 }
177
178 flip_hyphens(word);
179 check_debug_pt(word, 20);
180}
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:208
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:182
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:260
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:287
void set_done(WERD_RES *word, int16_t pass)
Definition: reject.cpp:62
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:602
void flip_0O(WERD_RES *word)
Definition: reject.cpp:660
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:86

◆ match_current_words()

void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW *  row,
BLOCK *  block 
)

Definition at line 218 of file fixspace.cpp.

218 {
219 WERD_RES_IT word_it(&words);
220 WERD_RES *word;
221 // Since we are not using PAGE_RES to iterate over words, we need to update
222 // prev_word_best_choice_ before calling classify_word_pass2().
223 prev_word_best_choice_ = nullptr;
224 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
225 word = word_it.data();
226 if ((!word->part_of_combo) && (word->box_word == nullptr)) {
227 WordData word_data(block, row, word);
228 SetupWordPassN(2, &word_data);
229 classify_word_and_language(2, nullptr, &word_data);
230 }
231 prev_word_best_choice_ = word->best_choice;
232 }
233}

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES *  word,
ROW *  row,
BLOCK *  block 
)

match_word_pass2

Baseline normalize the word and pass it to Tess.

Definition at line 1589 of file control.cpp.

1589 {
1590 if (word->tess_failed) {
1591 return;
1592 }
1593 tess_segment_pass_n(pass_n, word);
1594
1595 if (!word->tess_failed) {
1596 if (!word->word->flag(W_REP_CHAR)) {
1597 word->fix_quotes();
1598 if (tessedit_fix_hyphens) {
1599 word->fix_hyphens();
1600 }
1601 /* Don't trust fix_quotes! - though I think I've fixed the bug */
1602 if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1603 tprintf(
1604 "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1605 " #Blobs=%u\n",
1606 word->best_choice->debug_string().c_str(), word->best_choice->length(),
1607 word->box_word->length());
1608 }
1609 word->tess_accepted = tess_acceptable_word(word);
1610
1611 // Also sets word->done flag
1612 make_reject_map(word, row, pass_n);
1613 }
1614 }
1615 set_word_fonts(word);
1616
1617 ASSERT_HOST(word->raw_choice != nullptr);
1618}
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1915
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:64
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: reject.cpp:96

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord ( const std::vector< TBOX > &  boxes,
BLOCK *  block,
ROW *  row,
WERD_RES *  word_res 
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 231 of file applybox.cpp.

232 {
233 if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
234 classify_bln_numeric_mode, textord_use_cjk_fp_model,
235 poly_allow_detailed_fx, row, block)) {
236 word_res->CloneChoppedToRebuild();
237 return;
238 }
239 if (chop_debug) {
240 tprintf("Maximally chopping word at:");
241 word_res->word->bounding_box().print();
242 }
243 std::vector<BLOB_CHOICE *> blob_choices;
244 ASSERT_HOST(!word_res->chopped_word->blobs.empty());
245 auto rating = static_cast<float>(INT8_MAX);
246 for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
247 // The rating and certainty are not quite arbitrary. Since
248 // select_blob_to_chop uses the worst certainty to choose, they all have
249 // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
250 // in here, and then divide by e each time they are chopped, which
251 // should guarantee a set of unequal values for the whole tree of blobs
252 // produced, however much chopping is required. The chops are thus only
253 // limited by the ability of the chopper to find suitable chop points,
254 // and not by the value of the certainties.
255 auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
256 blob_choices.push_back(choice);
257 rating -= 0.125f;
258 }
259 const double e = exp(1.0); // The base of natural logs.
260 unsigned blob_number;
261 int right_chop_index = 0;
262 if (!assume_fixed_pitch_char_segment) {
263 // We only chop if the language is not fixed pitch like CJK.
264 SEAM *seam = nullptr;
265 while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
266 word_res->InsertSeam(blob_number, seam);
267 BLOB_CHOICE *left_choice = blob_choices[blob_number];
268 rating = left_choice->rating() / e;
269 left_choice->set_rating(rating);
270 left_choice->set_certainty(-rating);
271 // combine confidence w/ serial #
272 auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
273 0.0f, 0.0f, BCC_FAKE);
274 blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
275 }
276 }
277 word_res->CloneChoppedToRebuild();
278 word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
279}
@ BCC_FAKE
Definition: ratngs.h:53
SEAM * chop_one_blob(const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
Definition: chopper.cpp:367

◆ mutable_pix_binary()

Image * tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 204 of file tesseractclass.h.

204 {
205 pix_binary_.destroy();
206 return &pix_binary_;
207 }

◆ mutable_textord()

Textord * tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 276 of file tesseractclass.h.

276 {
277 return &textord_;
278 }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)

◆ noise_outlines()

bool tesseract::Tesseract::noise_outlines ( TWERD *  word)

Definition at line 907 of file docqual.cpp.

907 {
908 TBOX box; // BB of outline
909 int16_t outline_count = 0;
910 int16_t small_outline_count = 0;
911 int16_t max_dimension;
912 float small_limit = kBlnXHeight * crunch_small_outlines_size;
913
914 for (unsigned b = 0; b < word->NumBlobs(); ++b) {
915 TBLOB *blob = word->blobs[b];
916 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
917 outline_count++;
918 box = ol->bounding_box();
919 if (box.height() > box.width()) {
920 max_dimension = box.height();
921 } else {
922 max_dimension = box.width();
923 }
924 if (max_dimension < small_limit) {
925 small_outline_count++;
926 }
927 }
928 }
929 return small_outline_count >= outline_count;
930}

◆ non_0_digit()

bool tesseract::Tesseract::non_0_digit ( const UNICHARSET &  ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 772 of file reject.cpp.

772 {
773 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
774}

◆ non_O_upper()

bool tesseract::Tesseract::non_O_upper ( const UNICHARSET &  ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 768 of file reject.cpp.

768 {
769 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
770}

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 283 of file tesseractclass.h.

283 {
284 return sub_langs_.size();
285 }

◆ one_ell_conflict()

bool tesseract::Tesseract::one_ell_conflict ( WERD_RES *  word_res,
bool  update_map 
)

Definition at line 287 of file reject.cpp.

287 {
288 const char *word;
289 const char *lengths;
290 int16_t word_len; // its length
291 int16_t first_alphanum_index_;
292 int16_t first_alphanum_offset_;
293 int16_t i;
294 int16_t offset;
295 bool non_conflict_set_char; // non conf set a/n?
296 bool conflict = false;
297 bool allow_1s;
298 ACCEPTABLE_WERD_TYPE word_type;
299 bool dict_perm_type;
300 bool dict_word_ok;
301 int dict_word_type;
302
303 word = word_res->best_choice->unichar_string().c_str();
304 lengths = word_res->best_choice->unichar_lengths().c_str();
305 word_len = strlen(lengths);
306 /*
307 If there are no occurrences of the conflict set characters then the word
308 is OK.
309*/
310 if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
311 return false;
312 }
313
314 /*
315 There is a conflict if there are NO other (confirmed) alphanumerics apart
316 from those in the conflict set.
317*/
318
319 for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
320 offset += lengths[i++]) {
321 non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
322 word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
323 !conflict_set_I_l_1.contains(word[offset]);
324 }
325 if (!non_conflict_set_char) {
326 if (update_map) {
327 reject_I_1_L(word_res);
328 }
329 return true;
330 }
331
332 /*
333 If the word is accepted by a dawg permuter, and the first alpha character
334 is "I" or "l", check to see if the alternative is also a dawg word. If it
335 is, then there is a potential error otherwise the word is ok.
336*/
337
338 dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
339 (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
340 (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
341 (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
342 dict_word_type = dict_word(*(word_res->best_choice));
343 dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
344
345 if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
346 (dict_perm_type && dict_word_ok)) {
347 first_alphanum_index_ = first_alphanum_index(word, lengths);
348 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
349 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
350 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
351 if (safe_dict_word(word_res) > 0) {
352 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
353 if (update_map) {
354 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
355 }
356 return true;
357 } else {
358 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
359 return false;
360 }
361 }
362
363 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
364 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365 if (safe_dict_word(word_res) > 0) {
366 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
367 if (update_map) {
368 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
369 }
370 return true;
371 } else {
372 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
373 return false;
374 }
375 }
376 return false;
377 }
378
379 /*
380 NEW 1Il code. The old code relied on permuter types too much. In fact,
381 tess will use TOP_CHOICE permute for good things like "palette".
382 In this code the string is examined independently to see if it looks like
383 a well formed word.
384*/
385
386 /*
387 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
388 dictionary word.
389*/
390 first_alphanum_index_ = first_alphanum_index(word, lengths);
391 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
392 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
393 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
394 if (safe_dict_word(word_res) > 0) {
395 return false;
396 } else {
397 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
398 }
399 } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
400 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
401 if (safe_dict_word(word_res) > 0) {
402 return false;
403 } else {
404 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405 }
406 }
407 /*
408 For strings containing digits:
409 If there are no alphas OR the numeric permuter liked the word,
410 reject any non 1 conflict chs
411 Else reject all conflict chs
412*/
413 if (word_contains_non_1_digit(word, lengths)) {
414 allow_1s =
415 (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
416
417 int16_t offset;
418 conflict = false;
419 for (i = 0, offset = 0; word[offset] != '\0';
420 offset += word_res->best_choice->unichar_lengths()[i++]) {
421 if ((!allow_1s || (word[offset] != '1')) &&
422 conflict_set_I_l_1.contains(word[offset])) {
423 if (update_map) {
424 word_res->reject_map[i].setrej_1Il_conflict();
425 }
426 conflict = true;
427 }
428 }
429 return conflict;
430 }
431 /*
432 For anything else. See if it conforms to an acceptable word type. If so,
433 treat accordingly.
434*/
435 word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
436 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
437 first_alphanum_index_ = first_alphanum_index(word, lengths);
438 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
439 if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
440 if (update_map) {
441 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
442 }
443 return true;
444 } else {
445 return false;
446 }
447 } else if (word_type == AC_UPPER_CASE) {
448 return false;
449 } else {
450 if (update_map) {
451 reject_I_1_L(word_res);
452 }
453 return true;
454 }
455}
@ DOC_DAWG_PERM
Definition: ratngs.h:245
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:457
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:470
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:483
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:195
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:496

◆ output_pass()

void tesseract::Tesseract::output_pass ( PAGE_RES_IT &  page_res_it,
const TBOX *  target_word_box 
)

Definition at line 39 of file output.cpp.

40 {
41 BLOCK_RES *block_of_last_word;
42 bool force_eol; // During output
43 BLOCK *nextblock; // block of next word
44 WERD *nextword; // next word
45
46 page_res_it.restart_page();
47 block_of_last_word = nullptr;
48 while (page_res_it.word() != nullptr) {
49 check_debug_pt(page_res_it.word(), 120);
50
51 if (target_word_box) {
52 TBOX current_word_box = page_res_it.word()->word->bounding_box();
53 FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
54 (current_word_box.bottom() + current_word_box.top()) / 2);
55 if (!target_word_box->contains(center_pt)) {
56 page_res_it.forward();
57 continue;
58 }
59 }
60 if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
61 block_of_last_word = page_res_it.block();
62 }
63
64 force_eol =
65 (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
66 (page_res_it.next_word() == nullptr);
67
68 if (page_res_it.next_word() != nullptr) {
69 nextword = page_res_it.next_word()->word;
70 } else {
71 nextword = nullptr;
72 }
73 if (page_res_it.next_block() != nullptr) {
74 nextblock = page_res_it.next_block()->block;
75 } else {
76 nextblock = nullptr;
77 }
78 // regardless of tilde crunching
79 write_results(page_res_it,
80 determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
81 nextword, nextblock),
82 force_eol);
83 page_res_it.forward();
84 }
85}
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:207
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:99

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString ( const std::string &  lang_str,
std::vector< std::string > *  to_load,
std::vector< std::string > *  not_to_load 
)

Definition at line 244 of file tessedit.cpp.

245 {
246 std::string remains(lang_str);
247 // Look whether the model file uses a prefix which must be applied to
248 // included model files as well.
249 std::string prefix;
250 size_t found = lang.find_last_of('/');
251 if (found != std::string::npos) {
252 // A prefix was found.
253 prefix = lang.substr(0, found + 1);
254 }
255 while (!remains.empty()) {
256 // Find the start of the lang code and which vector to add to.
257 const char *start = remains.c_str();
258 while (*start == '+') {
259 ++start;
260 }
261 std::vector<std::string> *target = to_load;
262 if (*start == '~') {
263 target = not_to_load;
264 ++start;
265 }
266 // Find the index of the end of the lang code in string start.
267 int end = strlen(start);
268 const char *plus = strchr(start, '+');
269 if (plus != nullptr && plus - start < end) {
270 end = plus - start;
271 }
272 std::string lang_code(start);
273 lang_code.resize(end);
274 std::string next(start + end);
275 remains = next;
276 lang_code = prefix + lang_code;
277 // Check whether lang_code is already in the target vector and add.
278 if (!IsStrInList(lang_code, *target)) {
279 target->push_back(lang_code);
280 }
281 }
282}
def next(obj)
Definition: ast.py:56

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES *  page_res 
)

pgeditor_main()

Top level editor operation: Setup a new window and an according event handler

Definition at line 355 of file pgedit.cpp.

355 {
356 current_page_res = page_res;
357 if (current_page_res->block_res_list.empty()) {
358 return;
359 }
360
361 recog_done = false;
362 stillRunning = true;
363
364 build_image_window(width, height);
365 word_display_mode.set(DF_EDGE_STEP);
366 do_re_display(&tesseract::Tesseract::word_set_display);
367# ifndef GRAPHICS_DISABLED
368 pe = new ParamsEditor(this, image_win);
369# endif
370 PGEventHandler pgEventHandler(this);
371
372 image_win->AddEventHandler(&pgEventHandler);
373 image_win->AddMessageBox();
374
375 SVMenuNode *svMenuRoot = build_menu_new();
376
377 svMenuRoot->BuildMenu(image_win);
378 image_win->SetVisible(true);
379
380 image_win->AwaitEvent(SVET_DESTROY);
381 image_win->AddEventHandler(nullptr);
382}
@ SVET_DESTROY
Definition: scrollview.h:54
@ DF_EDGE_STEP
Edge steps.
Definition: werd.h:51
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:275
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:900
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:325
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
std::unique_ptr< SVEvent > AwaitEvent(SVEventType type)
Definition: scrollview.cpp:432
void SetVisible(bool visible)
Definition: scrollview.cpp:515
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:408

◆ pix_binary()

Image tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 208 of file tesseractclass.h.

208 {
209 return pix_binary_;
210 }

◆ pix_grey()

Image tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 211 of file tesseractclass.h.

211 {
212 return pix_grey_;
213 }

◆ pix_original()

Image tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 218 of file tesseractclass.h.

218 {
219 return pix_original_;
220 }

◆ potential_word_crunch()

bool tesseract::Tesseract::potential_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level,
bool  ok_dict_word 
)

Definition at line 488 of file docqual.cpp.

489 {
490 float rating_per_ch;
491 int adjusted_len;
492 const char *str = word->best_choice->unichar_string().c_str();
493 const char *lengths = word->best_choice->unichar_lengths().c_str();
494 bool word_crunchable;
495 int poor_indicator_count = 0;
496
497 word_crunchable =
498 !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
499 (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);
500
501 adjusted_len = word->reject_map.length();
502 if (adjusted_len > 10) {
503 adjusted_len = 10;
504 }
505 rating_per_ch = word->best_choice->rating() / adjusted_len;
506
507 if (rating_per_ch > crunch_pot_poor_rate) {
508 if (crunch_debug > 2) {
509 tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
510 }
511 poor_indicator_count++;
512 }
513
514 if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
515 if (crunch_debug > 2) {
516 tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
517 }
518 poor_indicator_count++;
519 }
520
521 if (garbage_level != G_OK) {
522 if (crunch_debug > 2) {
523 tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
524 }
525 poor_indicator_count++;
526 }
527 return poor_indicator_count >= crunch_pot_indicators;
528}

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 174 of file applybox.cpp.

174 {
175 const double median_xheight = MedianXHeight(block_list);
176 const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
177 // Strip all fuzzy space markers to simplify the PAGE_RES.
178 BLOCK_IT b_it(block_list);
179 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
180 BLOCK *block = b_it.data();
181 ROW_IT r_it(block->row_list());
182 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
183 ROW *row = r_it.data();
184 const double diff = fabs(row->x_height() - median_xheight);
185 if (diff > max_deviation) {
186 if (applybox_debug) {
187 tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
188 }
189 row->set_x_height(static_cast<float>(median_xheight));
190 }
191 }
192 }
193}
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:36

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 557 of file tesseractclass.cpp.

557 {
558 textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
559 // Find the max splitter strategy over all langs.
560 auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
561 static_cast<int32_t>(pageseg_devanagari_split_strategy));
562 for (auto &sub_lang : sub_langs_) {
563 auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
564 static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));
565 if (pageseg_strategy > max_pageseg_strategy) {
566 max_pageseg_strategy = pageseg_strategy;
567 }
568 sub_lang->pix_binary_.destroy();
569 sub_lang->pix_binary_ = pix_binary().clone();
570 }
571 // Perform shiro-rekha (top-line) splitting and replace the current image by
572 // the newly split image.
573 splitter_.set_orig_pix(pix_binary());
574 splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
575 if (splitter_.Split(true, &pixa_debug_)) {
576 ASSERT_HOST(splitter_.splitted_image());
577 pix_binary_.destroy();
578 pix_binary_ = splitter_.splitted_image().clone();
579 }
580}
Image pix_binary() const
Image clone() const
Definition: image.cpp:24
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:101

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract osd_tess,
OSResults osr 
)

Definition at line 587 of file tesseractclass.cpp.

587 {
588 // Find the max splitter strategy over all langs.
589 auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
590 static_cast<int32_t>(ocr_devanagari_split_strategy));
591 for (auto &sub_lang : sub_langs_) {
592 auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
593 static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy));
594 if (ocr_strategy > max_ocr_strategy) {
595 max_ocr_strategy = ocr_strategy;
596 }
597 }
598 // Utilize the segmentation information available.
599 splitter_.set_segmentation_block_list(block_list);
600 splitter_.set_ocr_split_strategy(max_ocr_strategy);
601 // Run the splitter for OCR
602 bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
603 // Restore pix_binary to the binarized original pix for future reference.
604 ASSERT_HOST(splitter_.orig_pix());
605 pix_binary_.destroy();
606 pix_binary_ = splitter_.orig_pix().clone();
607 // If the pageseg and ocr strategies are different, refresh the block list
608 // (from the last SegmentImage call) with blobs from the real image to be used
609 // for OCR.
610 if (splitter_.HasDifferentSplitStrategies()) {
611 BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_));
612 Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix();
613 extract_edges(pix_for_ocr, &block);
614 splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
615 }
616 // The splitter isn't needed any more after this, so save memory by clearing.
617 splitter_.Clear();
618}
void extract_edges(Image pix, BLOCK *block)
Definition: edgblob.cpp:347
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
void set_segmentation_block_list(BLOCK_LIST *block_list)
void set_ocr_split_strategy(SplitStrategy strategy)

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const std::vector< WordData > &  words)

Definition at line 38 of file par_control.cpp.

38 {
39 // Prepare all the blobs.
40 std::vector<BlobData> blobs;
41 for (const auto &w : words) {
42 if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {
43 for (size_t s = 0; s < w.lang_words.size(); ++s) {
44 Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
45 const WERD_RES &word = *w.lang_words[s];
46 for (unsigned b = 0; b < word.chopped_word->NumBlobs(); ++b) {
47 blobs.emplace_back(b, sub, word);
48 }
49 }
50 }
51 }
52 // Pre-classify all the blobs.
53 if (tessedit_parallelize > 1) {
54#ifdef _OPENMP
55# pragma omp parallel for num_threads(10)
56#endif // _OPENMP
57 // NOLINTNEXTLINE(modernize-loop-convert)
58 for (size_t b = 0; b < blobs.size(); ++b) {
59 *blobs[b].choices =
60 blobs[b].tesseract->classify_blob(blobs[b].blob, "par", ScrollView::WHITE, nullptr);
61 }
62 } else {
63 // TODO(AMD) parallelize this.
64 for (auto &blob : blobs) {
65 *blob.choices = blob.tesseract->classify_blob(blob.blob, "par", ScrollView::WHITE, nullptr);
66 }
67 }
68}

◆ process_cmd_win_event()

bool tesseract::Tesseract::process_cmd_win_event ( int32_t  cmd_event,
char *  new_value 
)

process_cmd_win_event()

Process a command returned from the command window (just call the appropriate command handler).

Definition at line 391 of file pgedit.cpp.

394 {
395 char msg[160];
396 bool exit = false;
397
398 color_mode = CM_RAINBOW;
399
400 // Run recognition on the full page if needed.
401 switch (cmd_event) {
402 case BLAMER_CMD_EVENT:
412 if (!recog_done) {
413 recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
414 recog_done = true;
415 }
416 break;
417 default:
418 break;
419 }
420
421 char *parameter;
422
423 switch (cmd_event) {
424 case NULL_CMD_EVENT:
425 break;
426
431 case RECOG_WERDS:
432 case RECOG_PSEUDO:
434 mode = static_cast<CMD_EVENTS>(cmd_event);
435 break;
438 parameter = image_win->ShowInputDialog("Config File Name");
439 word_config_ = parameter;
440 delete[] parameter;
441 break;
443 if (new_value[0] == 'T') {
444 word_display_mode.set(DF_BOX);
445 } else {
446 word_display_mode.reset(DF_BOX);
447 }
449 break;
450 case BLAMER_CMD_EVENT:
451 if (new_value[0] == 'T') {
452 word_display_mode.set(DF_BLAMER);
453 } else {
454 word_display_mode.reset(DF_BLAMER);
455 }
458 break;
460 if (new_value[0] == 'T') {
461 word_display_mode.set(DF_TEXT);
462 } else {
463 word_display_mode.reset(DF_TEXT);
464 }
466 break;
468 if (new_value[0] == 'T') {
469 word_display_mode.set(DF_POLYGONAL);
470 } else {
471 word_display_mode.reset(DF_POLYGONAL);
472 }
474 break;
476 if (new_value[0] == 'T') {
477 word_display_mode.set(DF_BN_POLYGONAL);
478 } else {
479 word_display_mode.reset(DF_BN_POLYGONAL);
480 }
482 break;
483 case BITMAP_CMD_EVENT:
484 if (new_value[0] == 'T') {
485 word_display_mode.set(DF_EDGE_STEP);
486 } else {
487 word_display_mode.reset(DF_EDGE_STEP);
488 }
490 break;
493 break;
494 case IMAGE_CMD_EVENT:
495 display_image = (new_value[0] == 'T');
497 break;
498 case BLOCKS_CMD_EVENT:
499 display_blocks = (new_value[0] == 'T');
501 break;
503 display_baselines = (new_value[0] == 'T');
505 break;
507 color_mode = CM_SUBSCRIPT;
509 break;
511 color_mode = CM_SUPERSCRIPT;
513 break;
515 color_mode = CM_ITALIC;
517 break;
519 color_mode = CM_BOLD;
521 break;
523 color_mode = CM_UNDERLINE;
525 break;
527 color_mode = CM_FIXEDPITCH;
529 break;
531 color_mode = CM_SERIF;
533 break;
535 color_mode = CM_SMALLCAPS;
537 break;
539 color_mode = CM_DROPCAPS;
541 break;
544 break;
545 case QUIT_CMD_EVENT:
546 exit = true;
548 break;
549
550 default:
551 snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
552 image_win->AddMessage(msg);
553 break;
554 }
555 return exit;
556}
@ NULL_CMD_EVENT
Definition: pgedit.cpp:51
@ CM_ITALIC
Definition: pgedit.cpp:87
@ CM_SUBSCRIPT
Definition: pgedit.cpp:85
@ CM_RAINBOW
Definition: pgedit.cpp:84
@ CM_FIXEDPITCH
Definition: pgedit.cpp:90
@ CM_BOLD
Definition: pgedit.cpp:88
@ CM_SMALLCAPS
Definition: pgedit.cpp:92
@ CM_SUPERSCRIPT
Definition: pgedit.cpp:86
@ CM_SERIF
Definition: pgedit.cpp:91
@ CM_DROPCAPS
Definition: pgedit.cpp:93
@ CM_UNDERLINE
Definition: pgedit.cpp:89
@ DF_POLYGONAL
Polygonal approximation.
Definition: werd.h:50
@ DF_BLAMER
Blamer information.
Definition: werd.h:53
@ DF_BOX
Bounding box.
Definition: werd.h:48
@ DF_BN_POLYGONAL
BL normalised polyapx.
Definition: werd.h:52
@ DF_TEXT
Correct ASCII.
Definition: werd.h:49
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:702
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:722
void AddMessage(const char *message)
Definition: scrollview.cpp:533
static void Exit()
Definition: scrollview.cpp:559

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 567 of file pgedit.cpp.

568 {
569 // The following variable should remain static, since it is used by
570 // debug editor, which uses a single Tesseract instance.
571 static ICOORD down;
572 ICOORD up;
573 TBOX selection_box;
574 char msg[80];
575
576 switch (event.type) {
577 case SVET_SELECTION:
578 if (event.type == SVET_SELECTION) {
579 down.set_x(event.x + event.x_size);
580 down.set_y(event.y + event.y_size);
581 if (mode == SHOW_POINT_CMD_EVENT) {
582 show_point(current_page_res, event.x, event.y);
583 }
584 }
585
586 up.set_x(event.x);
587 up.set_y(event.y);
588
589 selection_box = TBOX(down, up);
590
591 switch (mode) {
593 process_selected_words(current_page_res, selection_box,
595 break;
597 process_selected_words(current_page_res, selection_box,
599 break;
601 process_selected_words(current_page_res, selection_box,
603 break;
605 debug_word(current_page_res, selection_box);
606 break;
608 break; // ignore up event
609
610 case RECOG_WERDS:
611# ifndef DISABLED_LEGACY_ENGINE
612 image_win->AddMessage("Recogging selected words");
613 this->process_selected_words(current_page_res, selection_box,
615# endif // ndef DISABLED_LEGACY_ENGINE
616 break;
617 case RECOG_PSEUDO:
618 image_win->AddMessage("Recogging selected blobs");
619 recog_pseudo_word(current_page_res, selection_box);
620 break;
622 blob_feature_display(current_page_res, selection_box);
623 break;
624
625 default:
626 snprintf(msg, sizeof(msg), "Mode %d not yet implemented", mode);
627 image_win->AddMessage(msg);
628 break;
629 }
630 default:
631 break;
632 }
633}
@ SVET_SELECTION
Definition: scrollview.h:57
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:76
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:677
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:877
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:667
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:640
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:913

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words ( PAGE_RES page_res,
TBOX selection_box,
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

32 {
33 for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; page_res_it.forward()) {
34 WERD *word = page_res_it.word()->word;
35 if (word->bounding_box().overlap(selection_box)) {
36 if (!(this->*word_processor)(&page_res_it)) {
37 return;
38 }
39 }
40 }
41}

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 118 of file control.cpp.

119 {
120 if (word_config != nullptr) {
121 if (word_box.major_overlap(target_word_box)) {
122 if (backup_config_file_ == nullptr) {
123 backup_config_file_ = kBackUpConfigFile;
124 FILE *config_fp = fopen(backup_config_file_, "wb");
125 if (config_fp == nullptr) {
126 tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
127 } else {
128 ParamUtils::PrintParams(config_fp, params());
129 fclose(config_fp);
130 }
132 }
133 } else {
134 if (backup_config_file_ != nullptr) {
136 backup_config_file_ = nullptr;
137 }
138 }
139 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
140 return false;
141 }
142 return true;
143}
const char *const kBackUpConfigFile
Definition: control.cpp:47
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:41
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
bool  good_quality_doc 
)

Definition at line 120 of file docqual.cpp.

120 {
121 if ((tessedit_good_quality_unrej && good_quality_doc)) {
122 unrej_good_quality_words(page_res_it);
123 }
124 doc_and_block_rejection(page_res_it, good_quality_doc);
125 if (unlv_tilde_crunching) {
126 tilde_crunch(page_res_it);
127 tilde_delete(page_res_it);
128 }
129}
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:530
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:373
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:210
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:142

◆ read_config_file()

void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 46 of file tessedit.cpp.

46 {
47 std::string path = datadir;
48 path += "configs/";
49 path += filename;
50 FILE *fp;
51 if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
52 fclose(fp);
53 } else {
54 path = datadir;
55 path += "tessconfigs/";
56 path += filename;
57 if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
58 fclose(fp);
59 } else {
60 path = filename;
61 }
62 }
63 ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
64}

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 914 of file control.cpp.

914 {
915 *make_next_word_fuzzy = false;
916 WERD *real_word = pr_it->word()->word;
917 if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
918 real_word->rej_cblob_list()->length() > noise_maxperword) {
919 return false;
920 }
921 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
922 // Get the noise outlines into a vector with matching bool map.
923 std::vector<C_OUTLINE *> outlines;
924 real_word->GetNoiseOutlines(&outlines);
925 std::vector<bool> word_wanted;
926 std::vector<bool> overlapped_any_blob;
927 std::vector<C_BLOB *> target_blobs;
928 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
929 &overlapped_any_blob, &target_blobs);
930 // Filter the outlines that overlapped any blob and put them into the word
931 // now. This simplifies the remaining task and also makes it more accurate
932 // as it has more completed blobs to work on.
933 std::vector<bool> wanted;
934 std::vector<C_BLOB *> wanted_blobs;
935 std::vector<C_OUTLINE *> wanted_outlines;
936 int num_overlapped = 0;
937 int num_overlapped_used = 0;
938 for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
939 if (overlapped_any_blob[i]) {
940 ++num_overlapped;
941 if (word_wanted[i]) {
942 ++num_overlapped_used;
943 }
944 wanted.push_back(word_wanted[i]);
945 wanted_blobs.push_back(target_blobs[i]);
946 wanted_outlines.push_back(outlines[i]);
947 outlines[i] = nullptr;
948 }
949 }
950 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
951 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
952 int non_overlapped = 0;
953 int non_overlapped_used = 0;
954 for (unsigned i = 0; i < word_wanted.size(); ++i) {
955 if (word_wanted[i]) {
956 ++non_overlapped_used;
957 }
958 if (outlines[i] != nullptr) {
959 ++non_overlapped_used;
960 }
961 }
962 if (debug_noise_removal) {
963 tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", num_overlapped_used,
964 num_overlapped, non_overlapped_used, non_overlapped);
965 real_word->bounding_box().print();
966 }
967 // Now we have decided which outlines we want, put them into the real_word.
968 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
969 pr_it->MakeCurrentWordFuzzy();
970 }
971 // TODO(rays) Parts of combos have a deep copy of the real word, and need
972 // to have their noise outlines moved/assigned in the same way!!
973 return num_overlapped_used != 0 || non_overlapped_used != 0;
974}
void AssignDiacriticsToNewBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:1036
void AssignDiacriticsToOverlappingBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:981
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words ( PAGE_RES page_res,
ETEXT_DESC monitor,
const TBOX target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s), otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification.) Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 just pass 1, 2 passes 2 and higher

Definition at line 287 of file control.cpp.

289 {
290 PAGE_RES_IT page_res_it(page_res);
291
292 if (tessedit_minimal_rej_pass1) {
293 tessedit_test_adaption.set_value(true);
294 tessedit_minimal_rejection.set_value(true);
295 }
296
297 if (dopasses == 0 || dopasses == 1) {
298 page_res_it.restart_page();
299 // ****************** Pass 1 *******************
300
301#ifndef DISABLED_LEGACY_ENGINE
302 // If the adaptive classifier is full switch to one we prepared earlier,
303 // ie on the previous page. If the current adaptive classifier is non-empty,
304 // prepare a backup starting at this page, in case it fills up. Do all this
305 // independently for each language.
308 } else if (!AdaptiveClassifierIsEmpty()) {
310 }
311 // Now check the sub-langs as well.
312 for (auto &lang : sub_langs_) {
313 if (lang->AdaptiveClassifierIsFull()) {
314 lang->SwitchAdaptiveClassifier();
315 } else if (!lang->AdaptiveClassifierIsEmpty()) {
316 lang->StartBackupAdaptiveClassifier();
317 }
318 }
319
320#endif // ndef DISABLED_LEGACY_ENGINE
321
322 // Set up all words ready for recognition, so that if parallelism is on
323 // all the input and output classes are ready to run the classifier.
324 std::vector<WordData> words;
325 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
326#ifndef DISABLED_LEGACY_ENGINE
327 if (tessedit_parallelize) {
328 PrerecAllWordsPar(words);
329 }
330#endif // ndef DISABLED_LEGACY_ENGINE
331
332 stats_.word_count = words.size();
333
334 stats_.dict_words = 0;
335 stats_.doc_blob_quality = 0;
336 stats_.doc_outline_errs = 0;
337 stats_.doc_char_quality = 0;
338 stats_.good_char_count = 0;
339 stats_.doc_good_char_quality = 0;
340
341 most_recently_used_ = this;
342 // Run pass 1 word recognition.
343 if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
344 return false;
345 }
346 // Pass 1 post-processing.
347 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
348 if (page_res_it.word()->word->flag(W_REP_CHAR)) {
349 fix_rep_char(&page_res_it);
350 continue;
351 }
352
353 // Count dict words.
354 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
355 ++(stats_.dict_words);
356 }
357
358 // Update misadaption log (we only need to do it on pass 1, since
359 // adaption only happens on this pass).
360 if (page_res_it.word()->blamer_bundle != nullptr &&
361 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
362 page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
363 }
364 }
365 }
366
367 if (dopasses == 1) {
368 return true;
369 }
370
371#ifndef DISABLED_LEGACY_ENGINE
372
373 // ****************** Pass 2 *******************
374 if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
375 page_res_it.restart_page();
376 std::vector<WordData> words;
377 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
378 if (tessedit_parallelize) {
379 PrerecAllWordsPar(words);
380 }
381 most_recently_used_ = this;
382 // Run pass 2 word recognition.
383 if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
384 return false;
385 }
386 }
387
388 // The next passes are only required for Tess-only.
389 if (AnyTessLang() && !AnyLSTMLang()) {
390 // ****************** Pass 3 *******************
391 // Fix fuzzy spaces.
392
393 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
394 !right_to_left()) {
395 fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
396 }
397
398 // ****************** Pass 4 *******************
399 if (tessedit_enable_dict_correction) {
401 }
402 if (tessedit_enable_bigram_correction) {
403 bigram_correction_pass(page_res);
404 }
405
406 // ****************** Pass 5,6 *******************
407 rejection_passes(page_res, monitor, target_word_box, word_config);
408
409 // ****************** Pass 8 *******************
410 font_recognition_pass(page_res);
411
412 // ****************** Pass 9 *******************
413 // Check the correctness of the final results.
414 blamer_pass(page_res);
415 script_pos_pass(page_res);
416 }
417
418#endif // ndef DISABLED_LEGACY_ENGINE
419
420 // Write results pass.
421 // This is now redundant, but retained commented so show how to obtain
422 // bounding boxes and style information.
423
424#ifndef DISABLED_LEGACY_ENGINE
425 // changed by jetsoft
426 // needed for dll to output memory structure
427 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
428 output_pass(page_res_it, target_word_box);
429 }
430// end jetsoft
431#endif // ndef DISABLED_LEGACY_ENGINE
432
433 const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
434 textord_.CleanupSingleRowResult(pageseg_mode, page_res);
435
436 // Remove empty words, as these mess up the result iterators.
437 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
438 const WERD_RES *word = page_res_it.word();
439 const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
440 ? page_res_it.block()->block->pdblk.poly_block()
441 : nullptr;
442 if (word->best_choice == nullptr || word->best_choice->empty() ||
443 (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
444 page_res_it.DeleteCurrentWord();
445 }
446 }
447
448 if (monitor != nullptr) {
449 monitor->progress = 100;
450 }
451 return true;
452}
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:456
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1665
void PrerecAllWordsPar(const std::vector< WordData > &words)
Definition: par_control.cpp:38
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:599
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:39
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
Definition: control.cpp:198
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2057
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2003
bool AnyTessLang() const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:77
bool right_to_left() const
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:707
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:683
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
Definition: control.cpp:146
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:268
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:625
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:609
bool AdaptiveClassifierIsFull() const
Definition: classify.h:265
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:264

◆ recog_interactive()

bool tesseract::Tesseract::recog_interactive ( PAGE_RES_IT pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 76 of file control.cpp.

76 {
77 WordData word_data(*pr_it);
78 SetupWordPassN(2, &word_data);
79 // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
80 if (lstm_recognizer_ == nullptr) {
81#ifndef DISABLED_LEGACY_ENGINE
82 classify_word_and_language(2, pr_it, &word_data);
83#endif // ndef DISABLED_LEGACY_ENGINE
84 } else {
85 classify_word_and_language(1, pr_it, &word_data);
86 }
87#ifndef DISABLED_LEGACY_ENGINE
88 if (tessedit_debug_quality_metrics) {
89 int16_t char_qual;
90 int16_t good_char_qual;
91 WERD_RES *word_res = pr_it->word();
92 word_char_quality(word_res, &char_qual, &good_char_qual);
93 tprintf(
94 "\n%d chars; word_blob_quality: %d; outline_errs: %d; "
95 "char_quality: %d; good_char_quality: %d\n",
96 word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
97 char_qual, good_char_qual);
98 }
99#endif // ndef DISABLED_LEGACY_ENGINE
100 return true;
101}
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES page_res,
TBOX selection_box 
)

Definition at line 62 of file control.cpp.

62 {
63 PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
64 if (it != nullptr) {
66 it->DeleteCurrentWord();
67 delete it;
68 }
69}

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented ( const char *  filename,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 86 of file recogtraining.cpp.

87 {
88 std::string box_fname = filename;
89 const char *lastdot = strrchr(box_fname.c_str(), '.');
90 if (lastdot != nullptr) {
91 box_fname[lastdot - box_fname.c_str()] = '\0';
92 }
93 box_fname += ".box";
94 // ReadNextBox() will close box_file
95 FILE *box_file = fopen(box_fname.c_str(), "r");
96 if (box_file == nullptr) {
97 tprintf("Error: Could not open file %s\n", box_fname.c_str());
98 ASSERT_HOST(box_file);
99 }
100
101 PAGE_RES_IT page_res_it;
102 page_res_it.page_res = page_res;
103 page_res_it.restart_page();
104 std::string label;
105
106 // Process all the words on this page.
107 TBOX tbox; // tesseract-identified box
108 TBOX bbox; // box from the box file
109 bool keep_going;
110 int line_number = 0;
111 int examined_words = 0;
112 do {
113 keep_going = read_t(&page_res_it, &tbox);
114 keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
115 // Align bottom left points of the TBOXes.
116 while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117 if (bbox.bottom() < tbox.bottom()) {
118 page_res_it.forward();
119 keep_going = read_t(&page_res_it, &tbox);
120 } else {
121 keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
122 }
123 }
124 while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125 if (bbox.left() > tbox.left()) {
126 page_res_it.forward();
127 keep_going = read_t(&page_res_it, &tbox);
128 } else {
129 keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
130 }
131 }
132 // OCR the word if top right points of the TBOXes are similar.
133 if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
134 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
135 ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
136 examined_words++;
137 }
138 page_res_it.forward();
139 } while (keep_going);
140
141 // Set up scripts on all of the words that did not get sent to
142 // ambigs_classify_and_output. They all should have, but if all the
143 // werd_res's don't get uch_sets, tesseract will crash when you try
144 // to iterate over them. :-(
145 int total_words = 0;
146 for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
147 if (page_res_it.word()) {
148 if (page_res_it.word()->uch_set == nullptr) {
149 page_res_it.word()->SetupFake(unicharset);
150 }
151 total_words++;
152 }
153 }
154 if (examined_words < 0.85 * total_words) {
155 tprintf(
156 "TODO(antonova): clean up recog_training_segmented; "
157 " It examined only a small fraction of the ambigs image.\n");
158 }
159 tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
160}
const int16_t kMaxBoxEdgeDiff
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:153
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES word)

Definition at line 37 of file tfacepp.cpp.

37 {
38 if (wordrec_skip_no_truth_words &&
39 (word->blamer_bundle == nullptr ||
40 word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
41 if (classify_debug_level) {
42 tprintf("No truth for word - skipping\n");
43 }
44 word->tess_failed = true;
45 return;
46 }
47 ASSERT_HOST(!word->chopped_word->blobs.empty());
49 word->SetupBoxWord();
50 ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());
51 // Check that the ratings matrix size matches the sum of all the
52 // segmentation states.
53 if (!word->StatesAllValid()) {
54 tprintf("Not all words have valid states relative to ratings matrix!!");
55 word->DebugWordChoices(true, nullptr);
56 ASSERT_HOST(word->StatesAllValid());
57 }
58 if (tessedit_override_permuter) {
59 /* Override the permuter type if a straight dictionary check disagrees. */
60 uint8_t perm_type = word->best_choice->permuter();
61 if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&
62 (perm_type != USER_DAWG_PERM)) {
63 uint8_t real_dict_perm_type = dict_word(*word->best_choice);
64 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||
65 (real_dict_perm_type == USER_DAWG_PERM)) &&
66 (alpha_count(word->best_choice->unichar_string().c_str(),
67 word->best_choice->unichar_lengths().c_str()) > 0)) {
68 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
69 }
70 }
71 if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
72 tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
73 }
74 }
75 // Factored out from control.cpp
76 ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
77 if (word->best_choice == nullptr || word->best_choice->empty() ||
78 strspn(word->best_choice->unichar_string().c_str(), " ") ==
79 word->best_choice->length()) {
80 word->tess_failed = true;
81 word->reject_map.initialise(word->box_word->length());
82 word->reject_map.rej_word_tess_failure();
83 } else {
84 word->tess_failed = false;
85 }
86}
@ IRR_NO_TRUTH
Definition: blamer.h:98
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:94

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES word)

Definition at line 94 of file tfacepp.cpp.

94 {
95 auto word_length = word->chopped_word->NumBlobs(); // no of blobs
96 if (word_length > MAX_UNDIVIDED_LENGTH) {
97 return split_and_recog_word(word);
98 }
99 cc_recog(word);
100 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
101
102 // Do sanity checks and minor fixes on best_choice.
103 if (word->best_choice->length() > word_length) {
104 word->best_choice->make_bad(); // should never happen
105 tprintf(
106 "recog_word: Discarded long string \"%s\""
107 " (%d characters vs %d blobs)\n",
108 word->best_choice->unichar_string().c_str(), word->best_choice->length(), word_length);
109 tprintf("Word is at:");
110 word->word->bounding_box().print();
111 }
112 if (word->best_choice->length() < word_length) {
113 UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
114 while (word->best_choice->length() < word_length) {
115 word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());
116 }
117 }
118}
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:28
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:126
void cc_recog(WERD_RES *word)
Definition: tface.cpp:119

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC *  monitor,
PAGE_RES_IT *  pr_it,
std::vector< WordData > *  words 
)

Definition at line 198 of file control.cpp.

199 {
200 // TODO(rays) Before this loop can be parallelized (it would yield a massive
201 // speed-up) all remaining member globals need to be converted to local/heap
202 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
203 // added. The results will be significantly different with adaption on, and
204 // deterioration will need investigation.
205 pr_it->restart_page();
206 for (unsigned w = 0; w < words->size(); ++w) {
207 WordData *word = &(*words)[w];
208 if (w > 0) {
209 word->prev_word = &(*words)[w - 1];
210 }
211 if (monitor != nullptr) {
212 monitor->ocr_alive = true;
213 if (pass_n == 1) {
214 monitor->progress = 70 * w / words->size();
215 } else {
216 monitor->progress = 70 + 30 * w / words->size();
217 }
218 if (monitor->progress_callback2 != nullptr) {
219 TBOX box = pr_it->word()->word->bounding_box();
220 (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
221 }
222 if (monitor->deadline_exceeded() ||
223 (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
224 // Timeout. Fake out the rest of the words.
225 for (; w < words->size(); ++w) {
226 (*words)[w].word->SetupFake(unicharset);
227 }
228 return false;
229 }
230 }
231 if (word->word->tess_failed) {
232 unsigned s;
233 for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
234 }
235 // If all are failed, skip it. Image words are skipped by this test.
236 if (s > word->lang_words.size()) {
237 continue;
238 }
239 }
240 // Sync pr_it with the WordData.
241 while (pr_it->word() != nullptr && pr_it->word() != word->word) {
242 pr_it->forward();
243 }
244 ASSERT_HOST(pr_it->word() != nullptr);
245 bool make_next_word_fuzzy = false;
246#ifndef DISABLED_LEGACY_ENGINE
247 if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
248 // Needs to be setup again to see the new outlines in the chopped_word.
249 SetupWordPassN(pass_n, word);
250 }
251#endif // ndef DISABLED_LEGACY_ENGINE
252
253 classify_word_and_language(pass_n, pr_it, word);
254 if (tessedit_dump_choices || debug_noise_removal) {
255 tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
256 word->word->best_choice->debug_string().c_str());
257 }
258 pr_it->forward();
259 if (make_next_word_fuzzy && pr_it->word() != nullptr) {
260 pr_it->MakeCurrentWordFuzzy();
261 }
262 }
263 return true;
264}
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:914

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( std::string &  image_name)

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES *  word)

Definition at line 260 of file reject.cpp.

260 {
261 TBOX word_box = word->word->bounding_box();
262 // Use the box_word as it is already denormed back to image coordinates.
263 int blobcount = word->box_word->length();
264
265 if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
266 word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
267 word_box.top() + tessedit_image_border > ImageHeight() - 1) {
268 ASSERT_HOST(word->reject_map.length() == blobcount);
269 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
270 TBOX blob_box = word->box_word->BlobBox(blobindex);
271 if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
272 blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
273 blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
274 word->reject_map[blobindex].setrej_edge_char();
275 // Close to edge
276 }
277 }
278 }
279}

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES *  word)

Definition at line 195 of file reject.cpp.

195 {
196 int16_t i;
197 int16_t offset;
198
199 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
200 offset += word->best_choice->unichar_lengths()[i], i += 1) {
201 if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
202 // rej 1Il conflict
203 word->reject_map[i].setrej_1Il_conflict();
204 }
205 }
206}

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES *  word)

Definition at line 556 of file reject.cpp.

556 {
557 /* Reject the whole of the word if the fraction of rejects exceeds a limit */
558
559 if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
560 rej_whole_of_mostly_reject_word_fract) {
561 word->reject_map.rej_word_mostly_rej();
562 }
563}

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config 
)

Definition at line 599 of file control.cpp.

600 {
601 PAGE_RES_IT page_res_it(page_res);
602 // ****************** Pass 5 *******************
603 // Gather statistics on rejects.
604 int word_index = 0;
605 while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
606 WERD_RES *word = page_res_it.word();
607 word_index++;
608 if (monitor != nullptr) {
609 monitor->ocr_alive = true;
610 monitor->progress = 95 + 5 * word_index / stats_.word_count;
611 }
612 if (word->rebuild_word == nullptr) {
613 // Word was not processed by tesseract.
614 page_res_it.forward();
615 continue;
616 }
617 check_debug_pt(word, 70);
618
619 // changed by jetsoft
620 // specific to its needs to extract one word when need
621 if (target_word_box &&
622 !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
623 page_res_it.forward();
624 continue;
625 }
626 // end jetsoft
627
628 page_res_it.rej_stat_word();
629 const int chars_in_word = word->reject_map.length();
630 const int rejects_in_word = word->reject_map.reject_count();
631
632 const int blob_quality = word_blob_quality(word);
633 stats_.doc_blob_quality += blob_quality;
634 const int outline_errs = word_outline_errs(word);
635 stats_.doc_outline_errs += outline_errs;
636 int16_t all_char_quality;
637 int16_t accepted_all_char_quality;
638 word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
639 stats_.doc_char_quality += all_char_quality;
640 const uint8_t permuter_type = word->best_choice->permuter();
641 if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
642 (permuter_type == USER_DAWG_PERM)) {
643 stats_.good_char_count += chars_in_word - rejects_in_word;
644 stats_.doc_good_char_quality += accepted_all_char_quality;
645 }
646 check_debug_pt(word, 80);
647 if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
648 word->reject_map.rej_word_bad_quality();
649 }
650 check_debug_pt(word, 90);
651 page_res_it.forward();
652 }
653
654 if (tessedit_debug_quality_metrics) {
655 tprintf(
656 "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
657 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
658 page_res->char_count, page_res->rej_count,
659 page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
660 stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
661 stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
662 stats_.doc_char_quality / static_cast<float>(page_res->char_count),
663 stats_.doc_good_char_quality,
664 (stats_.good_char_count > 0)
665 ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
666 : 0.0);
667 }
668 bool good_quality_doc =
669 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
670 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
671 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
672 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);
673
674 // ****************** Pass 6 *******************
675 // Do whole document or whole block rejection pass
676 if (!tessedit_test_adaption) {
677 quality_based_rejection(page_res_it, good_quality_doc);
678 }
679}
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118

◆ repeated_nonalphanum_wd()

bool tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES *  word,
ROW *  row 
)

Definition at line 565 of file reject.cpp.

565 {
566 if (word->best_choice->unichar_lengths().length() <= 1) {
567 return false;
568 }
569
570 if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
571 return false;
572 }
573
574 UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
575 for (unsigned i = 1; i < word->best_choice->length(); ++i) {
576 if (word->best_choice->unichar_id(i) != uch_id) {
577 return false;
578 }
579 }
580
581 int16_t char_quality;
582 int16_t accepted_char_quality;
583 word_char_quality(word, &char_quality, &accepted_char_quality);
584
585 if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
586 (char_quality == accepted_char_quality)) {
587 return true;
588 } else {
589 return false;
590 }
591}

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Logs a bad box by line in the box file and box coords.

Definition at line 743 of file applybox.cpp.

744 {
745 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
746 box.left(), box.bottom(), box.right(), box.top(), err_msg);
747}

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES *  word,
WERD_RES *  new_word 
)

Definition at line 1436 of file control.cpp.

1437 {
1438 tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1439 word->best_choice->debug_string().c_str());
1440 word->reject_map.print(debug_fp);
1441 tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1442 new_word->best_choice->debug_string().c_str());
1443 new_word->reject_map.print(debug_fp);
1444 tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1445 new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1446 accept_new_word ? "ACCEPTED" : "");
1447}

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES *  page_res)

Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.

Definition at line 495 of file applybox.cpp.

495 {
496 PAGE_RES_IT pr_it(page_res);
497 WERD_RES *word_res;
498 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
499 const WERD *word = word_res->word;
500 if (word->text() == nullptr || word->text()[0] == '\0') {
501 continue; // Ignore words that have no text.
502 }
503 // Convert the correct text to a vector of UNICHAR_ID
504 std::vector<UNICHAR_ID> target_text;
505 if (!ConvertStringToUnichars(word->text(), &target_text)) {
506 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
507 pr_it.DeleteCurrentWord();
508 continue;
509 }
510 if (!FindSegmentation(target_text, word_res)) {
511 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
512 pr_it.DeleteCurrentWord();
513 continue;
514 }
515 }
516}
bool ConvertStringToUnichars(const char *utf8, std::vector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:520
bool FindSegmentation(const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:545

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES *  page_res,
const TBOX *  prev_box,
const TBOX &  box,
const TBOX *  next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 310 of file applybox.cpp.

311 {
312 if (applybox_debug > 1) {
313 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
314 }
315 PAGE_RES_IT page_res_it(page_res);
316 WERD_RES *word_res;
317 for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
318 if (!word_res->box_word->bounding_box().major_overlap(box)) {
319 continue;
320 }
321 if (applybox_debug > 1) {
322 tprintf("Checking word box:");
323 word_res->box_word->bounding_box().print();
324 }
325 int word_len = word_res->box_word->length();
326 for (int i = 0; i < word_len; ++i) {
327 TBOX char_box = TBOX();
328 int blob_count = 0;
329 for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
330 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
331 if (!blob_box.major_overlap(box)) {
332 break;
333 }
334 if (word_res->correct_text[i + blob_count].length() > 0) {
335 break; // Blob is claimed already.
336 }
337 if (next_box != nullptr) {
338 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
339 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
340 if (applybox_debug > 2) {
341 tprintf("Checking blob:");
342 blob_box.print();
343 tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
344 next_box_miss_metric);
345 }
346 if (current_box_miss_metric > next_box_miss_metric) {
347 break; // Blob is a better match for next box.
348 }
349 }
350 char_box += blob_box;
351 }
352 if (blob_count > 0) {
353 if (applybox_debug > 1) {
354 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
355 }
356 if (!char_box.almost_equal(box, 3) &&
357 ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
358 (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
359 return false;
360 }
361 // We refine just the box_word, best_state and correct_text here.
362 // The rebuild_word is made in TidyUp.
363 // blob_count blobs are put together to match the box. Merge the
364 // box_word boxes, save the blob_count in the state and the text.
365 word_res->box_word->MergeBoxes(i, i + blob_count);
366 word_res->best_state[i] = blob_count;
367 word_res->correct_text[i] = correct_text;
368 if (applybox_debug > 2) {
369 tprintf("%d Blobs match: blob box:", blob_count);
370 word_res->box_word->BlobBox(i).print();
371 tprintf("Matches box:");
372 box.print();
373 if (next_box != nullptr) {
374 tprintf("With next box:");
375 next_box->print();
376 }
377 }
378 // Eliminated best_state and correct_text entries for the consumed
379 // blobs.
380 for (int j = 1; j < blob_count; ++j) {
381 word_res->best_state.erase(word_res->best_state.begin() + i + 1);
382 word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
383 }
384 // Assume that no box spans multiple source words, so we are done with
385 // this box.
386 if (applybox_debug > 1) {
387 tprintf("Best state = ");
388 for (auto best_state : word_res->best_state) {
389 tprintf("%d ", best_state);
390 }
391 tprintf("\n");
392 tprintf("Correct text = [[ ");
393 for (auto &it : word_res->correct_text) {
394 tprintf("%s ", it.c_str());
395 }
396 tprintf("]]\n");
397 }
398 return true;
399 }
400 }
401 }
402 if (applybox_debug > 0) {
403 tprintf("FAIL!\n");
404 }
405 return false; // Failure.
406}

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX &  box,
const TBOX *  next_box,
const char *  correct_text 
)

Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an overlapping blob for a box.

Definition at line 414 of file applybox.cpp.

415 {
416 if (applybox_debug > 1) {
417 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
418 }
419 WERD *new_word = nullptr;
420 BLOCK_IT b_it(block_list);
421 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
422 BLOCK *block = b_it.data();
423 if (!box.major_overlap(block->pdblk.bounding_box())) {
424 continue;
425 }
426 ROW_IT r_it(block->row_list());
427 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
428 ROW *row = r_it.data();
429 if (!box.major_overlap(row->bounding_box())) {
430 continue;
431 }
432 WERD_IT w_it(row->word_list());
433 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
434 WERD *word = w_it.data();
435 if (applybox_debug > 2) {
436 tprintf("Checking word:");
437 word->bounding_box().print();
438 }
439 if (word->text() != nullptr && word->text()[0] != '\0') {
440 continue; // Ignore words that are already done.
441 }
442 if (!box.major_overlap(word->bounding_box())) {
443 continue;
444 }
445 C_BLOB_IT blob_it(word->cblob_list());
446 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
447 C_BLOB *blob = blob_it.data();
448 TBOX blob_box = blob->bounding_box();
449 if (!blob_box.major_overlap(box)) {
450 continue;
451 }
452 if (next_box != nullptr) {
453 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
454 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
455 if (applybox_debug > 2) {
456 tprintf("Checking blob:");
457 blob_box.print();
458 tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
459 next_box_miss_metric);
460 }
461 if (current_box_miss_metric > next_box_miss_metric) {
462 continue; // Blob is a better match for next box.
463 }
464 }
465 if (applybox_debug > 2) {
466 tprintf("Blob match: blob:");
467 blob_box.print();
468 tprintf("Matches box:");
469 box.print();
470 if (next_box != nullptr) {
471 tprintf("With next box:");
472 next_box->print();
473 }
474 }
475 if (new_word == nullptr) {
476 // Make a new word with a single blob.
477 new_word = word->shallow_copy();
478 new_word->set_text(correct_text);
479 w_it.add_to_end(new_word);
480 }
481 C_BLOB_IT new_blob_it(new_word->cblob_list());
482 new_blob_it.add_to_end(blob_it.extract());
483 }
484 }
485 }
486 }
487 if (new_word == nullptr && applybox_debug > 0) {
488 tprintf("FAIL!\n");
489 }
490 return new_word != nullptr;
491}

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 513 of file tesseractclass.cpp.

513 {
514 ResetAdaptiveClassifierInternal();
515 for (auto &sub_lang : sub_langs_) {
516 sub_lang->ResetAdaptiveClassifierInternal();
517 }
518}
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:596

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 523 of file tesseractclass.cpp.

523 {
524 getDict().ResetDocumentDictionary();
525 for (auto &sub_lang : sub_langs_) {
526 sub_lang->getDict().ResetDocumentDictionary();
527 }
528}
void ResetDocumentDictionary()
Definition: dict.h:297

◆ reskew()

const FCOORD & tesseract::Tesseract::reskew ( ) const
inline

Definition at line 200 of file tesseractclass.h.

200 {
201 return reskew_;
202 }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage ( const WordData &  word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 873 of file control.cpp.

874 {
875 if (debug) {
876 tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
877 static_cast<int>(tessedit_ocr_engine_mode));
878 }
879 // Run the recognizer on the word.
880 PointerVector<WERD_RES> new_words;
881 (this->*recognizer)(word_data, in_word, &new_words);
882 if (new_words.empty()) {
883 // Transfer input word to new_words, as the classifier must have put
884 // the result back in the input.
885 new_words.push_back(*in_word);
886 *in_word = nullptr;
887 }
888 if (debug) {
889 for (unsigned i = 0; i < new_words.size(); ++i) {
890 new_words[i]->DebugTopChoice("Lang result");
891 }
892 }
893 // Initial version is a bit of a hack based on better certainty and rating
894 // or a dictionary vs non-dictionary word.
895 return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
896 &new_words, best_words);
897}

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 280 of file tesseractclass.h.

280 {
281 return right_to_left_;
282 }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht ( WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)

◆ safe_dict_word()

int16_t tesseract::Tesseract::safe_dict_word ( const WERD_RES *  werd_res)

Definition at line 593 of file reject.cpp.

593 {
594 const WERD_CHOICE &word = *werd_res->best_choice;
595 int dict_word_type = werd_res->tesseract->dict_word(word);
596 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
597}

◆ scaled_color()

Image tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 263 of file tesseractclass.h.

263 {
264 return scaled_color_;
265 }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 266 of file tesseractclass.h.

266 {
267 return scaled_factor_;
268 }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES *  page_res)

Definition at line 707 of file control.cpp.

707 {
708 PAGE_RES_IT page_res_it(page_res);
709 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
710 WERD_RES *word = page_res_it.word();
711 if (word->word->flag(W_REP_CHAR)) {
712 page_res_it.forward();
713 continue;
714 }
715 const float x_height = page_res_it.block()->block->x_height();
716 float word_x_height = word->x_height;
717 if (word_x_height < word->best_choice->min_x_height() ||
718 word_x_height > word->best_choice->max_x_height()) {
719 word_x_height =
720 (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
721 }
722 // Test for small caps. Word capheight must be close to block xheight,
723 // and word must contain no lower case letters, and at least one upper case.
724 const double small_cap_xheight = x_height * kXHeightCapRatio;
725 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
726 if (word->uch_set->script_has_xheight() &&
727 small_cap_xheight - small_cap_delta <= word_x_height &&
728 word_x_height <= small_cap_xheight + small_cap_delta) {
729 // Scan for upper/lower.
730 int num_upper = 0;
731 int num_lower = 0;
732 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
733 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
734 ++num_upper;
735 } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
736 ++num_lower;
737 }
738 }
739 if (num_upper > 0 && num_lower == 0) {
740 word->small_caps = true;
741 }
742 }
743 word->SetScriptPositions();
744 }
745}
static const double kXHeightCapRatio
Definition: ccstruct.h:35

◆ SearchForText()

void tesseract::Tesseract::SearchForText ( const std::vector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
unsigned  choices_length,
const std::vector< UNICHAR_ID > &  target_text,
unsigned  text_index,
float  rating,
std::vector< int > *  segmentation,
float *  best_rating,
std::vector< int > *  best_segmentation 
)

Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).

Parameters
choices is an array of vectors of length choices_length, with each element representing a starting position in the word, and the vector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
choices_pos
choices_length
target_text
text_index
rating
segmentation
best_rating
best_segmentation

Definition at line 615 of file applybox.cpp.

618 {
619 const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
620 for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
621 // Rating of matching choice or worst choice if no match.
622 float choice_rating = 0.0f;
623 // Find the corresponding best BLOB_CHOICE.
624 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
625 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
626 const BLOB_CHOICE *choice = choice_it.data();
627 choice_rating = choice->rating();
628 auto class_id = choice->unichar_id();
629 if (class_id == target_text[text_index]) {
630 break;
631 }
632 // Search ambigs table.
633 if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
634 AmbigSpec_IT spec_it(table[class_id]);
635 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
636 const AmbigSpec *ambig_spec = spec_it.data();
637 // We'll only do 1-1.
638 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
639 ambig_spec->correct_ngram_id == target_text[text_index]) {
640 break;
641 }
642 }
643 if (!spec_it.cycled_list()) {
644 break; // Found an ambig.
645 }
646 }
647 }
648 if (choice_it.cycled_list()) {
649 continue; // No match.
650 }
651 segmentation->push_back(length);
652 if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
653 // This is a complete match. If the rating is good record a new best.
654 if (applybox_debug > 2) {
655 tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
656 rating + choice_rating, *best_rating, segmentation->size(),
657 best_segmentation->size());
658 }
659 if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
660 *best_segmentation = *segmentation;
661 *best_rating = rating + choice_rating;
662 }
663 } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
664 if (applybox_debug > 3) {
665 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
666 unicharset.id_to_unichar(target_text[text_index]),
667 choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
668 choices_pos, length);
669 }
670 SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
671 rating + choice_rating, segmentation, best_rating, best_segmentation);
672 if (applybox_debug > 3) {
673 tprintf("End recursion for %d=%s\n", target_text[text_index],
674 unicharset.id_to_unichar(target_text[text_index]));
675 }
676 }
677 segmentation->resize(segmentation->size() - 1);
678 }
679}
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:140
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:157
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 264 of file linerec.cpp.

264 {
265 // Run the segmentation search on the network outputs and make a BoxWord
266 // for each of the output words.
267 // If we drop a word as junk, then there is always a space in front of the
268 // next.
269 const Dict *stopper_dict = lstm_recognizer_->GetDict();
270 if (stopper_dict == nullptr) {
271 stopper_dict = &getDict();
272 }
273 for (unsigned w = 0; w < words->size(); ++w) {
274 WERD_RES *word = (*words)[w];
275 if (word->best_choice == nullptr) {
276 // It is a dud.
277 word->SetupFake(lstm_recognizer_->GetUnicharset());
278 } else {
279 // Set the best state.
280 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
281 int length = word->best_choice->state(i);
282 word->best_state.push_back(length);
283 }
284 word->reject_map.initialise(word->best_choice->length());
285 word->tess_failed = false;
286 word->tess_accepted = true;
287 word->tess_would_adapt = false;
288 word->done = true;
289 word->tesseract = this;
290 float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
291 word_certainty *= kCertaintyScale;
292 if (getDict().stopper_debug_level >= 1) {
293 tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
294 word->best_choice->certainty(), word->space_certainty,
295 std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
296 word_certainty);
297 word->best_choice->print();
298 }
299 word->best_choice->set_certainty(word_certainty);
300
301 word->tess_accepted = stopper_dict->AcceptableResult(word);
302 }
303 }
304}

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage ( const char *  input_file,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.

Definition at line 101 of file pagesegmain.cpp.

102 {
103 ASSERT_HOST(pix_binary_ != nullptr);
104 int width = pixGetWidth(pix_binary_);
105 int height = pixGetHeight(pix_binary_);
106 // Get page segmentation mode.
107 auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
108 // If a UNLV zone file can be found, use that instead of segmentation.
109 if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
110 std::string name = input_file;
111 std::size_t lastdot = name.find_last_of(".");
112 name = name.substr(0, lastdot);
113 read_unlv_file(name, width, height, blocks);
114 }
115 if (blocks->empty()) {
116 // No UNLV file present. Work according to the PageSegMode.
117 // First make a single block covering the whole image.
118 BLOCK_IT block_it(blocks);
119 auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
120 block->set_right_to_left(right_to_left());
121 block_it.add_to_end(block);
122 } else {
123 // UNLV file present. Use PSM_SINGLE_BLOCK.
124 pageseg_mode = PSM_SINGLE_BLOCK;
125 }
126 // The diacritic_blobs holds noise blobs that may be diacritics. They
127 // are separated out on areas of the image that seem noisy and short-circuit
128 // the layout process, going straight from the initial partition creation
129 // right through to after word segmentation, where they are added to the
130 // rej_cblobs list of the most appropriate word. From there classification
131 // will determine whether they are used.
132 BLOBNBOX_LIST diacritic_blobs;
133 int auto_page_seg_ret_val = 0;
134 TO_BLOCK_LIST to_blocks;
135 if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
136 PSM_SPARSE(pageseg_mode)) {
137 auto_page_seg_ret_val =
138 AutoPageSeg(pageseg_mode, blocks, &to_blocks,
139 enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
140 if (pageseg_mode == PSM_OSD_ONLY) {
141 return auto_page_seg_ret_val;
142 }
143 // To create blobs from the image region bounds uncomment this line:
144 // to_blocks.clear(); // Uncomment to go back to the old mode.
145 } else {
146 deskew_ = FCOORD(1.0f, 0.0f);
147 reskew_ = FCOORD(1.0f, 0.0f);
148 if (pageseg_mode == PSM_CIRCLE_WORD) {
149 Image pixcleaned = RemoveEnclosingCircle(pix_binary_);
150 if (pixcleaned != nullptr) {
151 pix_binary_.destroy();
152 pix_binary_ = pixcleaned;
153 }
154 }
155 }
156
157 if (auto_page_seg_ret_val < 0) {
158 return -1;
159 }
160
161 if (blocks->empty()) {
162 if (textord_debug_tabfind) {
163 tprintf("Empty page\n");
164 }
165 return 0; // AutoPageSeg found an empty page.
166 }
167 bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
168 bool cjk_mode = textord_use_cjk_fp_model;
169
170 textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
171 pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
172 return auto_page_seg_ret_val;
173}
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:186
@ PSM_CIRCLE_WORD
Treat the image as a single word in a circle.
Definition: publictypes.h:169
@ PSM_OSD_ONLY
Orientation and script detection only.
Definition: publictypes.h:158
bool read_unlv_file(std::string &name, int32_t xsize, int32_t ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:36
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:192
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:195
int textord_debug_tabfind
Definition: alignedblob.cpp:29
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:198
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Image binary_pix, Image thresholds_pix, Image grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:177

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
const std::vector< C_OUTLINE * > &  outlines,
int  num_outlines,
std::vector< bool > *  ok_outlines 
)

Definition at line 1120 of file control.cpp.

1123 {
1124 std::string best_str;
1125 float target_cert = certainty_threshold;
1126 if (blob != nullptr) {
1127 float target_c2;
1128 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1129 if (debug_noise_removal) {
1130 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1131 target_c2);
1132 blob->bounding_box().print();
1133 }
1134 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1135 }
1136 std::vector<bool> test_outlines = *ok_outlines;
1137 // Start with all the outlines in.
1138 std::string all_str;
1139 std::vector<bool> best_outlines = *ok_outlines;
1140 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1141 if (debug_noise_removal) {
1142 TBOX ol_box;
1143 for (unsigned i = 0; i < test_outlines.size(); ++i) {
1144 if (test_outlines[i]) {
1145 ol_box += outlines[i]->bounding_box();
1146 }
1147 }
1148 tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1149 best_cert - target_cert);
1150 ol_box.print();
1151 }
1152 // Iteratively zero out the bit that improves the certainty the most, until
1153 // we get past the threshold, have zero bits, or fail to improve.
1154 int best_index = 0; // To zero out.
1155 while (num_outlines > 1 && best_index >= 0 &&
1156 (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1157 // Find the best bit to zero out.
1158 best_index = -1;
1159 for (unsigned i = 0; i < outlines.size(); ++i) {
1160 if (test_outlines[i]) {
1161 test_outlines[i] = false;
1162 std::string str;
1163 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1164 if (debug_noise_removal) {
1165 TBOX ol_box;
1166 for (unsigned j = 0; j < outlines.size(); ++j) {
1167 if (test_outlines[j]) {
1168 ol_box += outlines[j]->bounding_box();
1169 }
1170 tprintf("%c", test_outlines[j] ? 'T' : 'F');
1171 }
1172 tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1173 cert - target_cert);
1174 ol_box.print();
1175 }
1176 if (cert > best_cert) {
1177 best_cert = cert;
1178 best_index = i;
1179 best_outlines = test_outlines;
1180 }
1181 test_outlines[i] = true;
1182 }
1183 }
1184 if (best_index >= 0) {
1185 test_outlines[best_index] = false;
1186 --num_outlines;
1187 }
1188 }
1189 if (best_cert >= target_cert) {
1190 // Save the best combination.
1191 *ok_outlines = best_outlines;
1192 if (debug_noise_removal) {
1193 tprintf("%s noise combination ", blob ? "Adding" : "New");
1194 for (auto &&best_outline : best_outlines) {
1195 tprintf("%c", best_outline ? 'T' : 'F');
1196 }
1197 tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1198 }
1199 return true;
1200 }
1201
1202 return false;
1203}
float ClassifyBlobPlusOutlines(const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
Definition: control.cpp:1207

◆ set_done()

void tesseract::Tesseract::set_done ( WERD_RES *  word,
int16_t  pass 
)

Definition at line 62 of file reject.cpp.

62 {
63 word->done =
64 word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
65 bool word_is_ambig = word->best_choice->dangerous_ambig_found();
66 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67 word->best_choice->permuter() == FREQ_DAWG_PERM ||
68 word->best_choice->permuter() == USER_DAWG_PERM;
69 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
70 one_ell_conflict(word, false)) {
71 if (tessedit_rejection_debug) {
72 tprintf("one_ell_conflict detected\n");
73 }
74 word->done = false;
75 }
76 if (word->done &&
77 ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78 if (tessedit_rejection_debug) {
79 tprintf("non-dict or ambig word detected\n");
80 }
81 word->done = false;
82 }
83 if (tessedit_rejection_debug) {
84 tprintf("set_done(): done=%d\n", word->done);
85 word->best_choice->print("");
86 }
87}

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Image  grey_pix)
inline

Definition at line 214 of file tesseractclass.h.

214 {
215 pix_grey_.destroy();
216 pix_grey_ = grey_pix;
217 }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Image  original_pix)
inline

Definition at line 222 of file tesseractclass.h.

222 {
223 pix_original_.destroy();
224 pix_original_ = original_pix;
225 // Clone to sublangs as well.
226 for (auto &lang : sub_langs_) {
227 lang->set_pix_original(original_pix ? original_pix.clone() : nullptr);
228 }
229 }

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Image  thresholds)
inline

Definition at line 247 of file tesseractclass.h.

247 {
248 pix_thresholds_.destroy();
249 pix_thresholds_ = thresholds;
250 }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 254 of file tesseractclass.h.

254 {
255 source_resolution_ = ppi;
256 }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES *  word)

Definition at line 270 of file output.cpp.

270 {
271 int len = word_res->reject_map.length();
272 const WERD_CHOICE &word = *(word_res->best_choice);
273 const UNICHARSET &uchset = *word.unicharset();
274 int i;
275 float rating_per_ch;
276
277 if (suspect_level == 0) {
278 for (i = 0; i < len; i++) {
279 if (word_res->reject_map[i].rejected()) {
280 word_res->reject_map[i].setrej_minimal_rej_accept();
281 }
282 }
283 return;
284 }
285
286 if (suspect_level >= 3) {
287 return; // Use defaults
288 }
289
290 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
291
292 if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
293 /* Unreject alphas in dictionary words */
294 for (i = 0; i < len; ++i) {
295 if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
296 word_res->reject_map[i].setrej_minimal_rej_accept();
297 }
298 }
299 }
300
301 rating_per_ch = word.rating() / word_res->reject_map.length();
302
303 if (rating_per_ch >= suspect_rating_per_ch) {
304 return; // Don't touch bad ratings
305 }
306
307 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
308 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
309 for (i = 0; i < len; ++i) {
310 if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
311 word_res->reject_map[i].setrej_minimal_rej_accept();
312 }
313 }
314 }
315
316 for (i = 0; i < len; i++) {
317 if (word_res->reject_map[i].rejected()) {
318 if (word_res->reject_map[i].flag(R_DOC_REJ)) {
319 word_res->reject_map[i].setrej_minimal_rej_accept();
320 }
321 if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
322 word_res->reject_map[i].setrej_minimal_rej_accept();
323 }
324 if (word_res->reject_map[i].flag(R_ROW_REJ)) {
325 word_res->reject_map[i].setrej_minimal_rej_accept();
326 }
327 }
328 }
329
330 if (suspect_level == 2) {
331 return;
332 }
333
334 if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
335 for (i = 0; i < len; i++) {
336 if (word_res->reject_map[i].rejected()) {
337 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
338 word_res->reject_map[i].flag(R_POSTNN_1IL))) {
339 word_res->reject_map[i].setrej_minimal_rej_accept();
340 }
341
342 if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
343 word_res->reject_map[i].setrej_minimal_rej_accept();
344 }
345 }
346 }
347 }
348
349 if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
350 word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
351 acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
352 if (word_res->reject_map.length() > suspect_short_words) {
353 for (i = 0; i < len; i++) {
354 if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
355 word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
356 word_res->reject_map[i].flag(R_POSTNN_1IL) ||
357 word_res->reject_map[i].flag(R_MM_REJECT))) {
358 word_res->reject_map[i].setrej_minimal_rej_accept();
359 }
360 }
361 }
362 }
363}
@ R_ROW_REJ
Definition: rejctmap.h:81
@ R_DOC_REJ
Definition: rejctmap.h:79
@ R_BLOCK_REJ
Definition: rejctmap.h:80
@ R_POSTNN_1IL
Definition: rejctmap.h:57
@ R_MM_REJECT
Definition: rejctmap.h:59
@ R_1IL_CONFLICT
Definition: rejctmap.h:56
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:386
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:365

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES *  word)

set_word_fonts

Get the fonts for the word.

Definition at line 1915 of file control.cpp.

1915 {
1916 // Don't try to set the word fonts for an lstm word, as the configs
1917 // will be meaningless.
1918 if (word->chopped_word == nullptr) {
1919 return;
1920 }
1921 ASSERT_HOST(word->best_choice != nullptr);
1922
1923#ifndef DISABLED_LEGACY_ENGINE
1924 const int fontinfo_size = fontinfo_table_.size();
1925 if (fontinfo_size == 0) {
1926 return;
1927 }
1928 if (tessedit_font_id > 0) {
1929 if (tessedit_font_id >= fontinfo_size) {
1930 tprintf("Error, invalid font ID provided: must be below %d.\n"
1931 "Falling back to font auto-detection.\n", fontinfo_size);
1932 } else {
1933 word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1934 word->fontinfo2 = nullptr;
1935 word->fontinfo_id_count = INT8_MAX;
1936 word->fontinfo_id2_count = 0;
1937 return;
1938 }
1939 }
1940 std::vector<int> font_total_score(fontinfo_size);
1941
1942 // Compute the font scores for the word
1943 if (tessedit_debug_fonts) {
1944 tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1945 }
1946 for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1947 const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1948 if (choice == nullptr) {
1949 continue;
1950 }
1951 auto &fonts = choice->fonts();
1952 for (auto &f : fonts) {
1953 const int fontinfo_id = f.fontinfo_id;
1954 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1955 font_total_score[fontinfo_id] += f.score;
1956 }
1957 }
1958 }
1959 // Find the top and 2nd choice for the word.
1960 int score1 = 0, score2 = 0;
1961 int16_t font_id1 = -1, font_id2 = -1;
1962 for (int f = 0; f < fontinfo_size; ++f) {
1963 if (tessedit_debug_fonts && font_total_score[f] > 0) {
1964 tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1965 }
1966 if (font_total_score[f] > score1) {
1967 score2 = score1;
1968 font_id2 = font_id1;
1969 score1 = font_total_score[f];
1970 font_id1 = f;
1971 } else if (font_total_score[f] > score2) {
1972 score2 = font_total_score[f];
1973 font_id2 = f;
1974 }
1975 }
1976 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1977 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1978 // Each score has a limit of UINT16_MAX, so divide by that to get the number
1979 // of "votes" for that font, ie number of perfect scores.
1980 word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1981 word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1982 if (score1 > 0) {
1983 const FontInfo fi = fontinfo_table_.at(font_id1);
1984 if (tessedit_debug_fonts) {
1985 if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1986 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1987 word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
1988 word->fontinfo_id2_count);
1989 } else {
1990 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
1991 }
1992 }
1993 }
1994#endif // ndef DISABLED_LEGACY_ENGINE
1995}
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:434

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 530 of file tesseractclass.cpp.

530 {
531 // Set the white and blacklists (if any)
532 unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
533 tessedit_char_whitelist.c_str(),
534 tessedit_char_unblacklist.c_str());
535 if (lstm_recognizer_) {
536 UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset();
537 lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
538 tessedit_char_whitelist.c_str(),
539 tessedit_char_unblacklist.c_str());
540 }
541 // Black and white lists should apply to all loaded classifiers.
542 for (auto &sub_lang : sub_langs_) {
543 sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
544 tessedit_char_whitelist.c_str(),
545 tessedit_char_unblacklist.c_str());
546 if (sub_lang->lstm_recognizer_) {
547 UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset();
548 lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
549 tessedit_char_whitelist.c_str(),
550 tessedit_char_unblacklist.c_str());
551 }
552 }
553}
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect detector)

Definition at line 507 of file tesseractclass.cpp.

507 {
508 equ_detect_ = detector;
509 equ_detect_->SetLangTesseract(this);
510}
void SetLangTesseract(Tesseract *lang_tesseract)

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor ( int  factor,
Image  color 
)
inline

Definition at line 269 of file tesseractclass.h.

269 {
270 scaled_factor_ = factor;
271 scaled_color_ = color;
272 }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX *  target_word_box,
const char *  word_config,
PAGE_RES *  page_res,
std::vector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 146 of file control.cpp.

147 {
148 // Prepare all the words.
149 PAGE_RES_IT page_res_it(page_res);
150 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
151 if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
152 *target_word_box, word_config, 1)) {
153 words->push_back(WordData(page_res_it));
154 }
155 }
156 // Setup all the words for recognition with polygonal approximation.
157 for (unsigned w = 0; w < words->size(); ++w) {
158 SetupWordPassN(pass_n, &(*words)[w]);
159 if (w > 0) {
160 (*words)[w].prev_word = &(*words)[w - 1];
161 }
162 }
163}

◆ SetupApplyBoxes()

PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const std::vector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 197 of file applybox.cpp.

197 {
198 PreenXHeights(block_list);
199 // Strip all fuzzy space markers to simplify the PAGE_RES.
200 BLOCK_IT b_it(block_list);
201 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
202 BLOCK *block = b_it.data();
203 ROW_IT r_it(block->row_list());
204 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
205 ROW *row = r_it.data();
206 WERD_IT w_it(row->word_list());
207 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
208 WERD *word = w_it.data();
209 if (word->cblob_list()->empty()) {
210 delete w_it.extract();
211 } else {
212 word->set_flag(W_FUZZY_SP, false);
213 word->set_flag(W_FUZZY_NON, false);
214 }
215 }
216 }
217 }
218 auto *page_res = new PAGE_RES(false, block_list, nullptr);
219 PAGE_RES_IT pr_it(page_res);
220 WERD_RES *word_res;
221 while ((word_res = pr_it.word()) != nullptr) {
222 MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
223 pr_it.forward();
224 }
225 return page_res;
226}
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:174
void MaximallyChopWord(const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:231

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr,
TO_BLOCK_LIST *  to_blocks,
Image *  photo_mask_pix,
Image *  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. This is a somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 270 of file pagesegmain.cpp.

274 {
275 int vertical_x = 0;
276 int vertical_y = 1;
277 TabVector_LIST v_lines;
278 TabVector_LIST h_lines;
279 ICOORD bleft(0, 0);
280
281 ASSERT_HOST(pix_binary_ != nullptr);
282 if (tessedit_dump_pageseg_images) {
283 pixa_debug_.AddPix(pix_binary_, "PageSegInput");
284 }
285 // Leptonica is used to find the rule/separator lines in the input.
286 LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
287 &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
288 if (tessedit_dump_pageseg_images) {
289 pixa_debug_.AddPix(pix_binary_, "NoLines");
290 }
291 // Leptonica is used to find a mask of the photo regions in the input.
292 *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
293 if (tessedit_dump_pageseg_images) {
294 Image pix_no_image_ = nullptr;
295 if (*photo_mask_pix != nullptr) {
296 pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
297 } else {
298 pix_no_image_ = pix_binary_.clone();
299 }
300 pixa_debug_.AddPix(pix_no_image_, "NoImages");
301 pix_no_image_.destroy();
302 }
303 if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
304 v_lines.clear();
305 }
306
307 // The rest of the algorithm uses the usual connected components.
308 textord_.find_components(pix_binary_, blocks, to_blocks);
309
310 TO_BLOCK_IT to_block_it(to_blocks);
311 // There must be exactly one input block.
312 // TODO(rays) handle new textline finding with a UNLV zone file.
313 ASSERT_HOST(to_blocks->singleton());
314 TO_BLOCK *to_block = to_block_it.data();
315 TBOX blkbox = to_block->block->pdblk.bounding_box();
316 ColumnFinder *finder = nullptr;
317 int estimated_resolution = source_resolution_;
318 if (source_resolution_ == kMinCredibleResolution) {
319 // Try to estimate resolution from typical body text size.
320 int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
321 if (res > estimated_resolution && res < kMaxCredibleResolution) {
322 estimated_resolution = res;
323 tprintf("Estimating resolution as %d\n", estimated_resolution);
324 }
325 }
326
327 if (to_block->line_size >= 2) {
328 finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
329 blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
330 textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
331 vertical_y);
332
333 finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
334
335 #ifndef DISABLED_LEGACY_ENGINE
336 if (equ_detect_) {
337 equ_detect_->LabelSpecialText(to_block);
338 }
339 #endif
340
341 BLOBNBOX_CLIST osd_blobs;
342 // osd_orientation is the number of 90 degree rotations to make the
343 // characters upright. (See tesseract/osdetect.h for precise definition.)
344 // We want the text lines horizontal, (vertical text indicates vertical
345 // textlines) which may conflict (eg vertically written CJK).
346 int osd_orientation = 0;
347 bool vertical_text =
348 textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
349 if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
350 vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
351 &osd_blobs);
352 }
353
354 #ifndef DISABLED_LEGACY_ENGINE
355 if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
356 std::vector<int> osd_scripts;
357 if (osd_tess != this) {
358 // We are running osd as part of layout analysis, so constrain the
359 // scripts to those allowed by *this.
360 AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
361 for (auto &lang : sub_langs_) {
362 AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);
363 }
364 }
365 os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
366 if (pageseg_mode == PSM_OSD_ONLY) {
367 delete finder;
368 return nullptr;
369 }
370 osd_orientation = osr->best_result.orientation_id;
371 double osd_score = osr->orientations[osd_orientation];
372 double osd_margin = min_orientation_margin * 2;
373 for (int i = 0; i < 4; ++i) {
374 if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
375 osd_margin = osd_score - osr->orientations[i];
376 }
377 }
378 int best_script_id = osr->best_result.script_id;
379 const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
380 bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
381 best_script_id == osd_tess->unicharset.hiragana_sid() ||
382 best_script_id == osd_tess->unicharset.katakana_sid() ||
383 strcmp("Japanese", best_script_str) == 0 ||
384 strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
385 if (cjk) {
386 finder->set_cjk_script(true);
387 }
388 if (osd_margin < min_orientation_margin) {
389 // The margin is weak.
390 if (!cjk && !vertical_text && osd_orientation == 2) {
391 // upside down latin text is improbable with such a weak margin.
392 tprintf(
393 "OSD: Weak margin (%.2f), horiz textlines, not CJK: "
394 "Don't rotate.\n",
395 osd_margin);
396 osd_orientation = 0;
397 } else {
398 tprintf(
399 "OSD: Weak margin (%.2f) for %d blob text block, "
400 "but using orientation anyway: %d\n",
401 osd_margin, osd_blobs.length(), osd_orientation);
402 }
403 }
404 }
405 #endif // ndef DISABLED_LEGACY_ENGINE
406
407 osd_blobs.shallow_clear();
408 finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
409 }
410
411 return finder;
412}
constexpr int kResolutionEstimationFactor
Definition: publictypes.h:43
@ PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:164
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:189
int os_detect_blobs(const std::vector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:274
constexpr int kMaxCredibleResolution
Definition: publictypes.h:38
constexpr int kMinCredibleResolution
Definition: publictypes.h:36
int LabelSpecialText(TO_BLOCK *to_block) override
void AddPix(const Image pix, const char *caption)
Definition: debugpixa.h:32
static Image FindImages(Image pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:252
static void FindAndRemoveLines(int resolution, bool debug, Image pix, int *vertical_x, int *vertical_y, Image *pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:691
void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:211

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 436 of file tessedit.cpp.

436 {
437 // Note that we can get away with bitwise copying FontInfo in
438 // all_fonts, as it is a temporary structure and we avoid setting the
439 // delete callback.
440 UnicityTable<FontInfo> all_fonts;
441
442 // Create the universal ID table.
443 CollectFonts(get_fontinfo_table(), &all_fonts);
444 for (auto &sub_lang : sub_langs_) {
445 CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
446 }
447 // Assign ids from the table to each font table.
448 AssignIds(all_fonts, &get_fontinfo_table());
449 for (auto &sub_lang : sub_langs_) {
450 AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
451 }
452 font_table_size_ = all_fonts.size();
453}

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData *  word 
)

Definition at line 166 of file control.cpp.

166 {
167 if (pass_n == 1 || !word->word->done) {
168 if (pass_n == 1) {
169 word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
170 nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
171 poly_allow_detailed_fx, word->row, word->block);
172 } else if (pass_n == 2) {
173 // TODO(rays) Should we do this on pass1 too?
174 word->word->caps_height = 0.0;
175 if (word->word->x_height == 0.0f) {
176 word->word->x_height = word->row->x_height();
177 }
178 }
179 word->lang_words.truncate(0);
180 for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
181 // The sub_langs_.size() entry is for the master language.
182 Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
183 auto *word_res = new WERD_RES;
184 word_res->InitForRetryRecognition(*word->word);
185 word->lang_words.push_back(word_res);
186 // LSTM doesn't get setup for pass2.
187 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
188 word_res->SetupForRecognition(
189 lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
190 lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
191 lang_t->poly_allow_detailed_fx, word->row, word->block);
192 }
193 }
194 }
195}

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 251 of file tesseractclass.h.

251 {
252 return source_resolution_;
253 }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES *  word)

Definition at line 126 of file tfacepp.cpp.

126 {
127 // Find the biggest blob gap in the chopped_word.
128 int bestgap = -INT32_MAX;
129 int split_index = 0;
130 for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) {
131 TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
132 TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
133 int gap = blob_box.left() - prev_box.right();
134 if (gap > bestgap) {
135 bestgap = gap;
136 split_index = b;
137 }
138 }
139 ASSERT_HOST(split_index > 0);
140
141 WERD_RES *word2 = nullptr;
142 BlamerBundle *orig_bb = nullptr;
143 split_word(word, split_index, &word2, &orig_bb);
144
145 // Recognize the first part of the word.
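145 // Recognize the first part of the word.
146 recog_word_recursive(word);
147 // Recognize the second part of the word.
148 recog_word_recursive(word2);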
150 join_words(word, word2, orig_bb);
151}
void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:163
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:216

◆ split_word()

void tesseract::Tesseract::split_word ( WERD_RES *  word,
unsigned  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 163 of file tfacepp.cpp.

164 {
165 ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
166
167 // Save a copy of the blamer bundle so we can try to reconstruct it below.
168 BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
169
170 auto *word2 = new WERD_RES(*word);
171
172 // blow away the copied chopped_word, as we want to work with
173 // the blobs from the input chopped_word so seam_arrays can be merged.
174 TWERD *chopped = word->chopped_word;
175 auto *chopped2 = new TWERD;
176 chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
177 for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {
178 chopped2->blobs.push_back(chopped->blobs[i]);
179 }
180 chopped->blobs.resize(split_pt);
181 word->chopped_word = nullptr;
182 delete word2->chopped_word;
183 word2->chopped_word = nullptr;
184
185 const UNICHARSET &unicharset = *word->uch_set;
186 word->ClearResults();
187 word2->ClearResults();
188 word->chopped_word = chopped;
189 word2->chopped_word = chopped2;
190 word->SetupBasicsFromChoppedWord(unicharset);
191 word2->SetupBasicsFromChoppedWord(unicharset);
192
193 // Try to adjust the blamer bundle.
194 if (orig_bb != nullptr) {
195 // TODO(rays) Looks like a leak to me.
196 // orig_bb should take, rather than copy.
197 word->blamer_bundle = new BlamerBundle();
198 word2->blamer_bundle = new BlamerBundle();
199 orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
200 word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
201 word->blamer_bundle, word2->blamer_bundle);
202 }
203
204 *right_piece = word2;
205 *orig_blamer_bundle = orig_bb;
206}

◆ STRING_VAR_H() [1/17]

tesseract::Tesseract::STRING_VAR_H ( applybox_exposure_pattern  )

◆ STRING_VAR_H() [2/17]

tesseract::Tesseract::STRING_VAR_H ( chs_leading_punct  )

◆ STRING_VAR_H() [3/17]

tesseract::Tesseract::STRING_VAR_H ( chs_trailing_punct1  )

◆ STRING_VAR_H() [4/17]

tesseract::Tesseract::STRING_VAR_H ( chs_trailing_punct2  )

◆ STRING_VAR_H() [5/17]

tesseract::Tesseract::STRING_VAR_H ( conflict_set_I_l_1  )

◆ STRING_VAR_H() [6/17]

tesseract::Tesseract::STRING_VAR_H ( file_type  )

◆ STRING_VAR_H() [7/17]

tesseract::Tesseract::STRING_VAR_H ( numeric_punctuation  )

◆ STRING_VAR_H() [8/17]

tesseract::Tesseract::STRING_VAR_H ( ok_repeated_ch_non_alphanum_wds  )

◆ STRING_VAR_H() [9/17]

tesseract::Tesseract::STRING_VAR_H ( outlines_2  )

◆ STRING_VAR_H() [10/17]

tesseract::Tesseract::STRING_VAR_H ( outlines_odd  )

◆ STRING_VAR_H() [11/17]

tesseract::Tesseract::STRING_VAR_H ( page_separator  )

◆ STRING_VAR_H() [12/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_char_blacklist  )

◆ STRING_VAR_H() [13/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_char_unblacklist  )

◆ STRING_VAR_H() [14/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_char_whitelist  )

◆ STRING_VAR_H() [15/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_load_sublangs  )

◆ STRING_VAR_H() [16/17]

tesseract::Tesseract::STRING_VAR_H ( tessedit_write_params_to_file  )

◆ STRING_VAR_H() [17/17]

tesseract::Tesseract::STRING_VAR_H ( unrecognised_char  )

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES *  word )

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 108 of file superscript.cpp.

108 {
109 if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
110 return false;
111 }
112 int num_leading, num_trailing;
113 ScriptPos sp_leading, sp_trailing;
114 float leading_certainty, trailing_certainty;
115 float avg_certainty, unlikely_threshold;
116
117 // Calculate the number of whole suspicious characters at the edges.
118 GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
119 &sp_trailing, &trailing_certainty, &avg_certainty,
120 &unlikely_threshold);
121
122 const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
123 const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
124
125 int num_blobs = word->best_choice->length();
126
127 // Calculate the remainder (partial characters) at the edges.
128 // This accounts for us having classified the best version of
129 // a word as [speaker?'] when it was instead [speaker.^{21}]
130 // (that is we accidentally thought the 2 was attached to the period).
131 int num_remainder_leading = 0, num_remainder_trailing = 0;
132 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
133 int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
134 int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
135 int last_word_char = num_blobs - 1 - num_trailing;
136 float last_char_certainty = word->best_choice->certainty(last_word_char);
137 if (word->best_choice->unichar_id(last_word_char) != 0 &&
138 last_char_certainty <= unlikely_threshold) {
139 ScriptPos rpos;
140 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
141 &num_remainder_trailing);
142 if (num_trailing > 0 && rpos != sp_trailing) {
143 num_remainder_trailing = 0;
144 }
145 if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
146 trailing_certainty = last_char_certainty;
147 }
148 }
149 bool another_blob_available =
150 (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
151 int first_char_certainty = word->best_choice->certainty(num_leading);
152 if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
153 first_char_certainty <= unlikely_threshold) {
154 ScriptPos lpos;
155 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
156 nullptr, nullptr);
157 if (num_leading > 0 && lpos != sp_leading) {
158 num_remainder_leading = 0;
159 }
160 if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
161 leading_certainty = first_char_certainty;
162 }
163 }
164 }
165
166 // If nothing to do, bail now.
167 if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
168 return false;
169 }
170
171 if (superscript_debug >= 1) {
172 tprintf("Candidate for superscript detection: %s (",
173 word->best_choice->unichar_string().c_str());
174 if (num_leading || num_remainder_leading) {
175 tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
176 }
177 if (num_trailing || num_remainder_trailing) {
178 tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
179 }
180 tprintf(")\n");
181 }
182 if (superscript_debug >= 3) {
183 word->best_choice->print();
184 }
185 if (superscript_debug >= 2) {
186 tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty,
187 unlikely_threshold);
188 if (num_leading) {
189 tprintf("Orig. leading (min): %.2f ", leading_certainty);
190 }
191 if (num_trailing) {
192 tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
193 }
194 tprintf("\n");
195 }
196
197 // We've now calculated the number of rebuilt blobs we want to carve off.
198 // However, split_word() works from TBLOBs in chopped_word, so we need to
199 // convert to those.
200 int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
201 int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202
203 int retry_leading = 0;
204 int retry_trailing = 0;
205 bool is_good = false;
206 WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
207 num_chopped_trailing, trailing_certainty, sp_trailing,
208 word, &is_good, &retry_leading, &retry_trailing);
209 if (is_good) {
210 word->ConsumeWordResults(revised);
211 } else if (retry_leading || retry_trailing) {
212 int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
213 int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
214 WERD_RES *revised2 = TrySuperscriptSplits(
215 retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
216 trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
217 if (is_good) {
218 word->ConsumeWordResults(revised2);
219 }
220 delete revised2;
221 }
222 delete revised;
223 return is_good;
224}
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)

◆ terrible_word_crunch()

bool tesseract::Tesseract::terrible_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 450 of file docqual.cpp.

450 {
451 float rating_per_ch;
452 int adjusted_len;
453 int crunch_mode = 0;
454
455 if (word->best_choice->unichar_string().empty() ||
456 (strspn(word->best_choice->unichar_string().c_str(), " ") ==
457 word->best_choice->unichar_string().size())) {
458 crunch_mode = 1;
459 } else {
460 adjusted_len = word->reject_map.length();
461 if (adjusted_len > crunch_rating_max) {
462 adjusted_len = crunch_rating_max;
463 }
464 rating_per_ch = word->best_choice->rating() / adjusted_len;
465
466 if (rating_per_ch > crunch_terrible_rating) {
467 crunch_mode = 2;
468 } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
469 crunch_mode = 3;
470 } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
471 (garbage_level != G_OK)) {
472 crunch_mode = 4;
473 } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
474 crunch_mode = 5;
475 }
476 }
477 if (crunch_mode > 0) {
478 if (crunch_debug > 2) {
479 tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
480 word->best_choice->unichar_string().c_str());
481 }
482 return true;
483 } else {
484 return false;
485 }
486}

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES *  word )

Definition at line 64 of file tessbox.cpp.

64 {
65 return getDict().AcceptableResult(word);
66}
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:111

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE *  word_choice )

Definition at line 73 of file tessbox.cpp.

73 {
74 getDict().add_document_word(*word_choice);
75}
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:647

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES *  word
)

Definition at line 32 of file tessbox.cpp.

32 {
33 int saved_enable_assoc = 0;
34 int saved_chop_enable = 0;
35
36 if (word->word->flag(W_DONT_CHOP)) {
37 saved_enable_assoc = wordrec_enable_assoc;
38 saved_chop_enable = chop_enable;
39 wordrec_enable_assoc.set_value(false);
40 chop_enable.set_value(false);
41 }
42 if (pass_n == 1) {
43 set_pass1();
44 } else {
45 set_pass2();
46 }
47 recog_word(word);
48 if (word->best_choice == nullptr) {
49 word->SetupFake(*word->uch_set);
50 }
51 if (word->word->flag(W_DONT_CHOP)) {
52 wordrec_enable_assoc.set_value(saved_enable_assoc);
53 chop_enable.set_value(saved_chop_enable);
54 }
55}
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:37
void set_pass1()
Definition: tface.cpp:97
void set_pass2()
Definition: tface.cpp:108

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES *  word,
BLOCK *  block,
ROW *  row
)

Definition at line 1488 of file control.cpp.

1489 {
1490 bool accept_new_x_ht = false;
1491 WERD_RES new_x_ht_word(word->word);
1492 if (word->blamer_bundle != nullptr) {
1493 new_x_ht_word.blamer_bundle = new BlamerBundle();
1494 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1495 }
1496 new_x_ht_word.x_height = new_x_ht;
1497 new_x_ht_word.baseline_shift = baseline_shift;
1498 new_x_ht_word.caps_height = 0.0;
1499 new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1500 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1501 poly_allow_detailed_fx, row, block);
1502 match_word_pass_n(2, &new_x_ht_word, row, block);
1503 if (!new_x_ht_word.tess_failed) {
1504 int new_misfits = CountMisfitTops(&new_x_ht_word);
1505 if (debug_x_ht_level >= 1) {
1506 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1507 word->x_height, new_misfits, new_x_ht);
1508 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1509 word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1510 new_x_ht_word.best_choice->certainty());
1511 }
1512 // The misfits must improve and either the rating or certainty.
1513 accept_new_x_ht = new_misfits < original_misfits &&
1514 (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1515 new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1516 if (debug_x_ht_level >= 1) {
1517 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1518 }
1519 }
1520 if (accept_new_x_ht) {
1521 word->ConsumeWordResults(&new_x_ht_word);
1522 return true;
1523 }
1524 return false;
1525}
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:72
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1436

◆ textord()

const Textord & tesseract::Tesseract::textord ( ) const
inline

Definition at line 273 of file tesseractclass.h.

273 {
274 return textord_;
275 }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)
  • Counts up the labelled words and the blobs within.
  • Deletes all unused or emptied words, counting the unused ones.
  • Resets W_BOL and W_EOL flags correctly.
  • Builds the rebuild_word and rebuilds the box_word and the best_choice.

Definition at line 685 of file applybox.cpp.

685 {
686 int ok_blob_count = 0;
687 int bad_blob_count = 0;
688 int ok_word_count = 0;
689 int unlabelled_words = 0;
690 PAGE_RES_IT pr_it(page_res);
691 WERD_RES *word_res;
692 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
693 int ok_in_word = 0;
694 int blob_count = word_res->correct_text.size();
695 auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
696 word_choice->set_permuter(TOP_CHOICE_PERM);
697 for (int c = 0; c < blob_count; ++c) {
698 if (word_res->correct_text[c].length() > 0) {
699 ++ok_in_word;
700 }
701 // Since we only need a fake word_res->best_choice, the actual
702 // unichar_ids do not matter. Which is fortunate, since TidyUp()
703 // can be called while training Tesseract, at the stage where
704 // unicharset is not meaningful yet.
705 word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
706 1.0f, -1.0f);
707 }
708 if (ok_in_word > 0) {
709 ok_blob_count += ok_in_word;
710 bad_blob_count += word_res->correct_text.size() - ok_in_word;
711 word_res->LogNewRawChoice(word_choice);
712 word_res->LogNewCookedChoice(1, false, word_choice);
713 } else {
714 ++unlabelled_words;
715 if (applybox_debug > 0) {
716 tprintf("APPLY_BOXES: Unlabelled word at :");
717 word_res->word->bounding_box().print();
718 }
719 pr_it.DeleteCurrentWord();
720 delete word_choice;
721 }
722 }
723 pr_it.restart_page();
724 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
725 // Denormalize back to a BoxWord.
726 word_res->RebuildBestState();
727 word_res->SetupBoxWord();
728 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
729 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
730 }
731 if (applybox_debug > 0) {
732 tprintf(" Found %d good blobs.\n", ok_blob_count);
733 if (bad_blob_count > 0) {
734 tprintf(" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
735 }
736 if (unlabelled_words > 0) {
737 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
738 }
739 }
740}
@ TOP_CHOICE_PERM
Definition: ratngs.h:238

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 373 of file docqual.cpp.

373 {
374 WERD_RES *word;
375 GARBAGE_LEVEL garbage_level;
376 PAGE_RES_IT copy_it;
377 bool prev_potential_marked = false;
378 bool found_terrible_word = false;
379 bool ok_dict_word;
380
381 page_res_it.restart_page();
382 while (page_res_it.word() != nullptr) {
383 POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
384 if (pb != nullptr && !pb->IsText()) {
385 page_res_it.forward();
386 continue;
387 }
388 word = page_res_it.word();
389
390 if (crunch_early_convert_bad_unlv_chs) {
391 convert_bad_unlv_chs(word);
392 }
393
394 if (crunch_early_merge_tess_fails) {
395 word->merge_tess_fails();
396 }
397
398 if (word->reject_map.accept_count() != 0) {
399 found_terrible_word = false;
400 // Forget earlier potential crunches
401 prev_potential_marked = false;
402 } else {
403 ok_dict_word = safe_dict_word(word);
404 garbage_level = garbage_word(word, ok_dict_word);
405
406 if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
407 if (crunch_debug > 0) {
408 tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
409 }
410 word->unlv_crunch_mode = CR_KEEP_SPACE;
411 if (prev_potential_marked) {
412 while (copy_it.word() != word) {
413 if (crunch_debug > 0) {
414 tprintf("P1 CRUNCHING: \"%s\"\n",
415 copy_it.word()->best_choice->unichar_string().c_str());
416 }
417 copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
418 copy_it.forward();
419 }
420 prev_potential_marked = false;
421 }
422 found_terrible_word = true;
423 } else if ((garbage_level != G_NEVER_CRUNCH) &&
424 (potential_word_crunch(word, garbage_level, ok_dict_word))) {
425 if (found_terrible_word) {
426 if (crunch_debug > 0) {
427 tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
428 }
429 word->unlv_crunch_mode = CR_KEEP_SPACE;
430 } else if (!prev_potential_marked) {
431 copy_it = page_res_it;
432 prev_potential_marked = true;
433 if (crunch_debug > 1) {
434 tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
435 }
436 }
437 } else {
438 found_terrible_word = false;
439 // Forget earlier potential crunches
440 prev_potential_marked = false;
441 if (crunch_debug > 2) {
442 tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
443 }
444 }
445 }
446 page_res_it.forward();
447 }
448}
@ CR_KEEP_SPACE
Definition: pageres.h:160
GARBAGE_LEVEL
Definition: docqual.h:30
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:616
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:594
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:488
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:450

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 530 of file docqual.cpp.

530 {
531 WERD_RES *word;
532 PAGE_RES_IT copy_it;
533 bool deleting_from_bol = false;
534 bool marked_delete_point = false;
535 int16_t debug_delete_mode;
536 CRUNCH_MODE delete_mode;
537 int16_t x_debug_delete_mode;
538 CRUNCH_MODE x_delete_mode;
539
540 page_res_it.restart_page();
541 while (page_res_it.word() != nullptr) {
542 word = page_res_it.word();
543
544 delete_mode = word_deletable(word, debug_delete_mode);
545 if (delete_mode != CR_NONE) {
546 if (word->word->flag(W_BOL) || deleting_from_bol) {
547 if (crunch_debug > 0) {
548 tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
549 word->best_choice->unichar_string().c_str());
550 }
551 word->unlv_crunch_mode = delete_mode;
552 deleting_from_bol = true;
553 } else if (word->word->flag(W_EOL)) {
554 if (marked_delete_point) {
555 while (copy_it.word() != word) {
556 x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
557 if (crunch_debug > 0) {
558 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
559 copy_it.word()->best_choice->unichar_string().c_str());
560 }
561 copy_it.word()->unlv_crunch_mode = x_delete_mode;
562 copy_it.forward();
563 }
564 }
565 if (crunch_debug > 0) {
566 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
567 word->best_choice->unichar_string().c_str());
568 }
569 word->unlv_crunch_mode = delete_mode;
570 deleting_from_bol = false;
571 marked_delete_point = false;
572 } else {
573 if (!marked_delete_point) {
574 copy_it = page_res_it;
575 marked_delete_point = true;
576 }
577 }
578 } else {
579 deleting_from_bol = false;
580 // Forget earlier potential crunches
581 marked_delete_point = false;
582 }
583 /*
584 The following step has been left till now as the tess fails are used to
585 determine if the word is deletable.
586 */
587 if (!crunch_early_merge_tess_fails) {
588 word->merge_tess_fails();
589 }
590 page_res_it.forward();
591 }
592}
@ CR_NONE
Definition: pageres.h:160
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:825

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES *  word,
BLOCK *  block,
ROW *  row
)

Definition at line 1455 of file control.cpp.

1455 {
1456 int original_misfits = CountMisfitTops(word);
1457 if (original_misfits == 0) {
1458 return false;
1459 }
1460 float baseline_shift = 0.0f;
1461 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1462 if (baseline_shift != 0.0f) {
1463 // Try the shift on its own first.
1464 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1465 return false;
1466 }
1467 original_misfits = CountMisfitTops(word);
1468 if (original_misfits > 0) {
1469 float new_baseline_shift;
1470 // Now recompute the new x_height.
1471 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1472 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1473 // No test of return value here, as we are definitely making a change
1474 // to the word by shifting the baseline.
1475 TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1476 }
1477 }
1478 return true;
1479 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480 return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1481 } else {
1482 return false;
1483 }
1484}
const double kMinRefitXHeightFraction
Definition: control.cpp:51
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:105
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1488

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes ( const std::vector< TBOX > &  boxes,
const std::vector< std::string > &  texts,
BLOCK_LIST *  block_list,
DocumentData *  training_data
)

Definition at line 76 of file linerec.cpp.

77 {
78 auto box_count = boxes.size();
79 // Process all the text lines in this page, as defined by the boxes.
80 unsigned end_box = 0;
81 // Don't let \t, which marks newlines in the box file, get into the line
82 // content, as that makes the line unusable in training.
83 while (end_box < texts.size() && texts[end_box] == "\t") {
84 ++end_box;
85 }
86 for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
87 // Find the textline of boxes starting at start and their bounding box.
88 TBOX line_box = boxes[start_box];
89 std::string line_str = texts[start_box];
90 for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
91 line_box += boxes[end_box];
92 line_str += texts[end_box];
93 }
94 // Find the most overlapping block.
95 BLOCK *best_block = nullptr;
96 int best_overlap = 0;
97 BLOCK_IT b_it(block_list);
98 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
99 BLOCK *block = b_it.data();
100 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
101 continue; // Not a text block.
102 }
103 TBOX block_box = block->pdblk.bounding_box();
104 block_box.rotate(block->re_rotation());
105 if (block_box.major_overlap(line_box)) {
106 TBOX overlap_box = line_box.intersection(block_box);
107 if (overlap_box.area() > best_overlap) {
108 best_overlap = overlap_box.area();
109 best_block = block;
110 }
111 }
112 }
113 ImageData *imagedata = nullptr;
114 if (best_block == nullptr) {
115 tprintf("No block overlapping textline: %s\n", line_str.c_str());
116 } else {
117 imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
118 }
119 if (imagedata != nullptr) {
120 training_data->AddPageToDocument(imagedata);
121 }
122 // Don't let \t, which marks newlines in the box file, get into the line
123 // content, as that makes the line unusable in training.
124 while (end_box < texts.size() && texts[end_box] == "\t") {
125 ++end_box;
126 }
127 }
128}
ImageData * GetLineData(const TBOX &line_box, const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:133

◆ TrainLineRecognizer()

bool tesseract::Tesseract::TrainLineRecognizer ( const char *  input_imagename,
const std::string &  output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 41 of file linerec.cpp.

42 {
43 std::string lstmf_name = output_basename + ".lstmf";
44 DocumentData images(lstmf_name);
45 if (applybox_page > 0) {
46 // Load existing document for the previous pages.
47 if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
48 tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
49 return false;
50 }
51 }
52 std::vector<TBOX> boxes;
53 std::vector<std::string> texts;
54 // Get the boxes for this page, if there are any.
55 if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
56 boxes.empty()) {
57 tprintf("Failed to read boxes from %s\n", input_imagename);
58 return false;
59 }
60 TrainFromBoxes(boxes, texts, block_list, &images);
61 if (images.PagesSize() == 0) {
62 tprintf("Failed to read pages from %s\n", input_imagename);
63 return false;
64 }
65 images.Shuffle();
66 if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
67 tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
68 return false;
69 }
70 return true;
71}
void TrainFromBoxes(const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:76

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES *  word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in] leading_certainty: the (minimum) certainty had by the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging)
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in] trailing_certainty: the (minimum) certainty had by the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging)
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 369 of file superscript.cpp.

373 {
374 int num_chopped = word->chopped_word->NumBlobs();
375
376 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
377
378 // Chop apart the word into up to three pieces.
379
380 BlamerBundle *bb0 = nullptr;
381 BlamerBundle *bb1 = nullptr;
382 WERD_RES *prefix = nullptr;
383 WERD_RES *core = nullptr;
384 WERD_RES *suffix = nullptr;
385 if (num_chopped_leading > 0) {
386 prefix = new WERD_RES(*word);
387 split_word(prefix, num_chopped_leading, &core, &bb0);
388 } else {
389 core = new WERD_RES(*word);
390 }
391
392 if (num_chopped_trailing > 0) {
393 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
394 split_word(core, split_pt, &suffix, &bb1);
395 }
396
397 // Recognize the pieces in turn.
398 int saved_cp_multiplier = classify_class_pruner_multiplier;
399 int saved_im_multiplier = classify_integer_matcher_multiplier;
400 if (prefix) {
401 // Turn off Tesseract's y-position penalties for the leading superscript.
402 classify_class_pruner_multiplier.set_value(0);
403 classify_integer_matcher_multiplier.set_value(0);
404
405 // Adjust our expectations about the baseline for this prefix.
406 if (superscript_debug >= 3) {
407 tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
408 }
409 recog_word_recursive(prefix);
410 if (superscript_debug >= 2) {
411 tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
412 prefix->best_choice->unichar_string().c_str());
413 }
414
415 // Restore the normal y-position penalties.
416 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
417 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
418 }
419
420 if (superscript_debug >= 3) {
421 tprintf(" recognizing middle %d chopped blobs\n",
422 num_chopped - num_chopped_leading - num_chopped_trailing);
423 }
424
425 if (suffix) {
426 // Turn off Tesseract's y-position penalties for the trailing superscript.
427 classify_class_pruner_multiplier.set_value(0);
428 classify_integer_matcher_multiplier.set_value(0);
429
430 if (superscript_debug >= 3) {
431 tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
432 }
433 recog_word_recursive(suffix);
434 if (superscript_debug >= 2) {
435 tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
436 suffix->best_choice->unichar_string().c_str());
437 }
438
439 // Restore the normal y-position penalties.
440 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
441 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
442 }
443
444 // Evaluate whether we think the results are believably better
445 // than what we already had.
446 bool good_prefix =
447 !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
448 superscript_bettered_certainty * leading_certainty,
449 retry_rebuild_leading, nullptr);
450 bool good_suffix =
451 !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
452 superscript_bettered_certainty * trailing_certainty, nullptr,
453 retry_rebuild_trailing);
454
455 *is_good = good_prefix && good_suffix;
456 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
457 // None of it is any good. Quit now.
458 delete core;
459 delete prefix;
460 delete suffix;
461 delete bb1;
462 return nullptr;
463 }
464 recog_word_recursive(core);
465
466 // Now paste the results together into core.
467 if (suffix) {
468 suffix->SetAllScriptPositions(trailing_pos);
469 join_words(core, suffix, bb1);
470 }
471 if (prefix) {
472 prefix->SetAllScriptPositions(leading_pos);
473 join_words(prefix, core, bb0);
474 core = prefix;
475 prefix = nullptr;
476 }
477
478 if (superscript_debug >= 1) {
479 tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
480 core->best_choice->unichar_string().c_str());
481 }
482 return core;
483}
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:193
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs ( WERD_RES *  word )

Definition at line 98 of file docqual.cpp.

98 {
99 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
100 word->rebuild_word->blobs.empty()) {
101 using namespace std::placeholders; // for _1
102 word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
103 std::bind(acceptIfGoodQuality, word, _1));
104 }
105}

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 142 of file docqual.cpp.

143 {
144 WERD_RES *word;
145 ROW_RES *current_row;
146 BLOCK_RES *current_block;
147 int i;
148
149 page_res_it.restart_page();
150 while (page_res_it.word() != nullptr) {
151 check_debug_pt(page_res_it.word(), 100);
152 if (bland_unrej) {
153 word = page_res_it.word();
154 for (i = 0; i < word->reject_map.length(); i++) {
155 if (word->reject_map[i].accept_if_good_quality()) {
156 word->reject_map[i].setrej_quality_accept();
157 }
158 }
159 page_res_it.forward();
160 } else if ((page_res_it.row()->char_count > 0) &&
161 ((page_res_it.row()->rej_count /
162 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
163 word = page_res_it.word();
164 if (word->reject_map.quality_recoverable_rejects() &&
165 (tessedit_unrej_any_wd ||
166 acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
167 word->best_choice->unichar_lengths().c_str()) !=
168 AC_UNACCEPTABLE)) {
169 unrej_good_chs(word);
170 }
171 page_res_it.forward();
172 } else {
173 // Skip to end of dodgy row.
174 current_row = page_res_it.row();
175 while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
176 page_res_it.forward();
177 }
178 }
179 check_debug_pt(page_res_it.word(), 110);
180 }
181 page_res_it.restart_page();
182 page_res_it.page_res->char_count = 0;
183 page_res_it.page_res->rej_count = 0;
184 current_block = nullptr;
185 current_row = nullptr;
186 while (page_res_it.word() != nullptr) {
187 if (current_block != page_res_it.block()) {
188 current_block = page_res_it.block();
189 current_block->char_count = 0;
190 current_block->rej_count = 0;
191 }
192 if (current_row != page_res_it.row()) {
193 current_row = page_res_it.row();
194 current_row->char_count = 0;
195 current_row->rej_count = 0;
196 current_row->whole_word_rej_count = 0;
197 }
198 page_res_it.rej_stat_word();
199 page_res_it.forward();
200 }
201}
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:98

◆ word_adaptable()

bool tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uint16_t  mode 
)

Definition at line 34 of file adaptions.cpp.

35 {
36 if (tessedit_adaption_debug) {
37 tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
38 word->best_choice->unichar_string().c_str(), word->best_choice->rating(),
39 word->best_choice->certainty());
40 }
41
42 bool status = false;
43 std::bitset<16> flags(mode);
44
45 enum MODES {
46 ADAPTABLE_WERD,
47 ACCEPTABLE_WERD,
48 CHECK_DAWGS,
49 CHECK_SPACES,
50 CHECK_ONE_ELL_CONFLICT,
51 CHECK_AMBIG_WERD
52 };
53
54 /*
55 0: NO adaption
56 */
57 if (mode == 0) {
58 if (tessedit_adaption_debug) {
59 tprintf("adaption disabled\n");
60 }
61 return false;
62 }
63
64 if (flags[ADAPTABLE_WERD]) {
65 status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
66 if (tessedit_adaption_debug && !status) {
67 tprintf("tess_would_adapt bit is false\n");
68 }
69 }
70
71 if (flags[ACCEPTABLE_WERD]) {
72 status |= word->tess_accepted;
73 if (tessedit_adaption_debug && !status) {
74 tprintf("tess_accepted bit is false\n");
75 }
76 }
77
78 if (!status) { // If not set then
79 return false; // ignore other checks
80 }
81
82 if (flags[CHECK_DAWGS] && (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
83 (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
84 (word->best_choice->permuter() != USER_DAWG_PERM) &&
85 (word->best_choice->permuter() != NUMBER_PERM)) {
86 if (tessedit_adaption_debug) {
87 tprintf("word not in dawgs\n");
88 }
89 return false;
90 }
91
92 if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict(word, false)) {
93 if (tessedit_adaption_debug) {
94 tprintf("word has ell conflict\n");
95 }
96 return false;
97 }
98
99 if (flags[CHECK_SPACES] &&
100 (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
101 if (tessedit_adaption_debug) {
102 tprintf("word contains spaces\n");
103 }
104 return false;
105 }
106
107 if (flags[CHECK_AMBIG_WERD] && word->best_choice->dangerous_ambig_found()) {
108 if (tessedit_adaption_debug) {
109 tprintf("word is ambiguous\n");
110 }
111 return false;
112 }
113
114 if (tessedit_adaption_debug) {
115 tprintf("returning status %d\n", status);
116 }
117 return status;
118}

◆ word_blank_and_set_display()

bool tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_it)

word_blank_and_set_display() Word processor

Blank display of word then redisplay word according to current display mode settings

Definition at line 667 of file pgedit.cpp.

667 {
668 pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
669 return word_set_display(pr_it);
670}

◆ word_bln_display()

bool tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 677 of file pgedit.cpp.

677 {
678 WERD_RES *word_res = pr_it->word();
679 if (word_res->chopped_word == nullptr) {
680 // Setup word normalization parameters.
681 word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
682 classify_bln_numeric_mode, textord_use_cjk_fp_model,
683 poly_allow_detailed_fx, pr_it->row()->row, pr_it->block()->block);
684 }
685 bln_word_window_handle()->Clear();
686 display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, 1.0, 0.0f, -1000.0f, 1000.0f);
687 C_BLOB_IT it(word_res->word->cblob_list());
688 ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
689 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
690 it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN, bln_word_window_handle());
691 color = WERD::NextColor(color);
692 }
693 bln_word_window_handle()->Update();
694 return true;
695}
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:298

◆ word_blob_quality()

int16_t tesseract::Tesseract::word_blob_quality ( WERD_RES * word)

Definition at line 51 of file docqual.cpp.

51 {
52 int16_t match_count = 0;
53 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
54 !word->rebuild_word->blobs.empty()) {
55 using namespace std::placeholders; // for _1
56 word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
57 std::bind(countMatchingBlobs, match_count, _1));
58 }
59 return match_count;
60}

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality ( WERD_RES * word,
int16_t *  match_count,
int16_t *  accepted_match_count 
)

Definition at line 81 of file docqual.cpp.

82 {
83 *match_count = 0;
84 *accepted_match_count = 0;
85 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
86 !word->rebuild_word->blobs.empty()) {
87 using namespace std::placeholders; // for _1
88 word->bln_boxes->ProcessMatchedBlobs(
89 *word->rebuild_word,
90 std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
91 }
92}

◆ word_contains_non_1_digit()

bool tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 496 of file reject.cpp.

496 {
497 int16_t i;
498 int16_t offset;
499
500 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
501 if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
502 (word_lengths[i] != 1 || word[offset] != '1')) {
503 return true;
504 }
505 }
506 return false;
507}

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES * word,
int16_t &  delete_mode 
)

Definition at line 825 of file docqual.cpp.

825 {
826 int word_len = word->reject_map.length();
827 float rating_per_ch;
828 TBOX box; // BB of word
829
830 if (word->unlv_crunch_mode == CR_NONE) {
831 delete_mode = 0;
832 return CR_NONE;
833 }
834
835 if (word_len == 0) {
836 delete_mode = 1;
837 return CR_DELETE;
838 }
839
840 if (word->rebuild_word != nullptr) {
841 // Cube leaves rebuild_word nullptr.
842 box = word->rebuild_word->bounding_box();
843 if (box.height() < crunch_del_min_ht * kBlnXHeight) {
844 delete_mode = 4;
845 return CR_DELETE;
846 }
847
848 if (noise_outlines(word->rebuild_word)) {
849 delete_mode = 5;
850 return CR_DELETE;
851 }
852 }
853
854 if ((failure_count(word) * 1.5) > word_len) {
855 delete_mode = 2;
856 return CR_LOOSE_SPACE;
857 }
858
859 if (word->best_choice->certainty() < crunch_del_cert) {
860 delete_mode = 7;
861 return CR_LOOSE_SPACE;
862 }
863
864 rating_per_ch = word->best_choice->rating() / word_len;
865
866 if (rating_per_ch > crunch_del_rating) {
867 delete_mode = 8;
868 return CR_LOOSE_SPACE;
869 }
870
871 if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
872 delete_mode = 9;
873 return CR_LOOSE_SPACE;
874 }
875
876 if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
877 delete_mode = 10;
878 return CR_LOOSE_SPACE;
879 }
880
881 if (box.height() > crunch_del_max_ht * kBlnXHeight) {
882 delete_mode = 11;
883 return CR_LOOSE_SPACE;
884 }
885
886 if (box.width() < crunch_del_min_width * kBlnXHeight) {
887 delete_mode = 3;
888 return CR_LOOSE_SPACE;
889 }
890
891 delete_mode = 0;
892 return CR_NONE;
893}
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:907
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:895

◆ word_display()

bool tesseract::Tesseract::word_display ( PAGE_RES_IT * pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 702 of file pgedit.cpp.

702 {
703 WERD_RES *word_res = pr_it->word();
704 WERD *word = word_res->word;
705 TBOX word_bb; // word bounding box
706 int word_height; // ht of word BB
707 bool displayed_something = false;
708 float shift; // from bot left
709
710 if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
711# ifndef DISABLED_LEGACY_ENGINE
712 BoxWord *box_word = word_res->box_word;
713 WERD_CHOICE *best_choice = word_res->best_choice;
714 int length = box_word->length();
715 if (word_res->fontinfo == nullptr) {
716 return false;
717 }
718 const FontInfo &font_info = *word_res->fontinfo;
719 for (int i = 0; i < length; ++i) {
720 ScrollView::Color color = ScrollView::GREEN;
721 switch (color_mode) {
722 case CM_SUBSCRIPT:
723 if (best_choice->BlobPosition(i) == SP_SUBSCRIPT) {
724 color = ScrollView::RED;
725 }
726 break;
727 case CM_SUPERSCRIPT:
728 if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT) {
729 color = ScrollView::RED;
730 }
731 break;
732 case CM_ITALIC:
733 if (font_info.is_italic()) {
734 color = ScrollView::RED;
735 }
736 break;
737 case CM_BOLD:
738 if (font_info.is_bold()) {
739 color = ScrollView::RED;
740 }
741 break;
742 case CM_FIXEDPITCH:
743 if (font_info.is_fixed_pitch()) {
744 color = ScrollView::RED;
745 }
746 break;
747 case CM_SERIF:
748 if (font_info.is_serif()) {
749 color = ScrollView::RED;
750 }
751 break;
752 case CM_SMALLCAPS:
753 if (word_res->small_caps) {
754 color = ScrollView::RED;
755 }
756 break;
757 case CM_DROPCAPS:
758 if (best_choice->BlobPosition(i) == SP_DROPCAP) {
759 color = ScrollView::RED;
760 }
761 break;
762 // TODO(rays) underline is currently completely unsupported.
763 case CM_UNDERLINE:
764 default:
765 break;
766 }
767 image_win->Pen(color);
768 TBOX box = box_word->BlobBox(i);
769 image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
770 }
771 return true;
772# else
773 return false;
774# endif // ndef DISABLED_LEGACY_ENGINE
775 }
776 /*
777 Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
778 etc. are to keep the compiler happy.
779*/
780 // display bounding box
781 if (word->display_flag(DF_BOX)) {
782 word->bounding_box().plot(image_win,
783 static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
784 static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
785
786 auto c = static_cast<ScrollView::Color>((int32_t)editor_image_blob_bb_color);
787 image_win->Pen(c);
788 // cblob iterator
789 C_BLOB_IT c_it(word->cblob_list());
790 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
791 c_it.data()->bounding_box().plot(image_win);
792 }
793 displayed_something = true;
794 }
795
796 // display edge steps
797 if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
798 word->plot(image_win); // rainbow colors
799 displayed_something = true;
800 }
801
802 // display poly approx
803 if (word->display_flag(DF_POLYGONAL)) {
804 // need to convert
805 TWERD *tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
806 tword->plot(image_win);
807 delete tword;
808 displayed_something = true;
809 }
810
811 // Display correct text and blamer information.
812 std::string text;
813 std::string blame;
814 if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
815 text = word->text();
816 }
817 if (word->display_flag(DF_BLAMER) &&
818 !(word_res->blamer_bundle != nullptr &&
819 word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
820 text = "";
821 const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
822 if (blamer_bundle == nullptr) {
823 text += "NULL";
824 } else {
825 text = blamer_bundle->TruthString();
826 }
827 text += " -> ";
828 std::string best_choice_str;
829 if (word_res->best_choice == nullptr) {
830 best_choice_str = "NULL";
831 } else {
832 word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
833 }
834 text += best_choice_str;
835 IncorrectResultReason reason =
836 (blamer_bundle == nullptr) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
837 ASSERT_HOST(reason < IRR_NUM_REASONS);
838 blame += " [";
839 blame += BlamerBundle::IncorrectReasonName(reason);
840 blame += "]";
841 }
842 if (text.length() > 0) {
843 word_bb = word->bounding_box();
844 image_win->Pen(ScrollView::RED);
845 word_height = word_bb.height();
846 int text_height = 0.50 * word_height;
847 if (text_height > 20) {
848 text_height = 20;
849 }
850 image_win->TextAttributes("Arial", text_height, false, false, false);
851 shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
852 image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str());
853 if (blame.length() > 0) {
854 image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height,
855 blame.c_str());
856 }
857
858 displayed_something = true;
859 }
860
861 if (!displayed_something) { // display BBox anyway
862 word->bounding_box().plot(image_win,
863 static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
864 static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
865 }
866 return true;
867}
@ SP_DROPCAP
Definition: ratngs.h:254
@ IRR_CORRECT
Definition: blamer.h:58
@ IRR_PAGE_LAYOUT
Definition: blamer.h:77
int editor_image_word_bb_color
Definition: pgedit.cpp:126
int editor_image_blob_bb_color
Definition: pgedit.cpp:127
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:778
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:610
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:635
void Pen(Color color)
Definition: scrollview.cpp:710
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:576

◆ word_dumper()

bool tesseract::Tesseract::word_dumper ( PAGE_RES_IT * pr_it)

word_dumper()

Dump members to the debug window

Definition at line 877 of file pgedit.cpp.

877 {
878 if (pr_it->block()->block != nullptr) {
879 tprintf("\nBlock data...\n");
880 pr_it->block()->block->print(nullptr, false);
881 }
882 tprintf("\nRow data...\n");
883 pr_it->row()->row->print(nullptr);
884 tprintf("\nWord data...\n");
885 WERD_RES *word_res = pr_it->word();
886 word_res->word->print();
887 if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
888 word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
889 tprintf("Current blamer debug: %s\n", word_res->blamer_bundle->debug().c_str());
890 }
891 return true;
892}

◆ word_outline_errs()

int16_t tesseract::Tesseract::word_outline_errs ( WERD_RES * word)

Definition at line 62 of file docqual.cpp.

62 {
63 int16_t i = 0;
64 int16_t err_count = 0;
65
66 if (word->rebuild_word != nullptr) {
67 for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
68 TBLOB *blob = word->rebuild_word->blobs[b];
69 err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
70 i++;
71 }
72 }
73 return err_count;
74}
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:107

◆ word_set_display()

bool tesseract::Tesseract::word_set_display ( PAGE_RES_IT * pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 900 of file pgedit.cpp.

900 {
901 WERD *word = pr_it->word()->word;
902 word->set_display_flag(DF_BOX, word_display_mode[DF_BOX]);
903 word->set_display_flag(DF_TEXT, word_display_mode[DF_TEXT]);
904 word->set_display_flag(DF_POLYGONAL, word_display_mode[DF_POLYGONAL]);
905 word->set_display_flag(DF_EDGE_STEP, word_display_mode[DF_EDGE_STEP]);
906 word->set_display_flag(DF_BN_POLYGONAL, word_display_mode[DF_BN_POLYGONAL]);
907 word->set_display_flag(DF_BLAMER, word_display_mode[DF_BLAMER]);
908 return word_display(pr_it);
909}

◆ worst_noise_blob()

int16_t tesseract::Tesseract::worst_noise_blob ( WERD_RES * word_res,
float *  worst_noise_score 
)

Definition at line 685 of file fixspace.cpp.

685 {
686 float noise_score[512];
687 int min_noise_blob; // 1st contender
688 int max_noise_blob; // last contender
689 int non_noise_count;
690 int worst_noise_blob; // Worst blob
691 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
692 float non_noise_limit = kBlnXHeight * 0.8;
693
694 if (word_res->rebuild_word == nullptr) {
695 return -1; // Can't handle cube words.
696 }
697
698 // Normalised.
699 auto blob_count = word_res->box_word->length();
700 ASSERT_HOST(blob_count <= 512);
701 if (blob_count < 5) {
702 return -1; // too short to split
703 }
704
705 /* Get the noise scores for all blobs */
706
707#ifndef SECURE_NAMES
708 if (debug_fix_space_level > 5) {
709 tprintf("FP fixspace Noise metrics for \"%s\": ",
710 word_res->best_choice->unichar_string().c_str());
711 }
712#endif
713
714 for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
715 TBLOB *blob = word_res->rebuild_word->blobs[i];
716 if (word_res->reject_map[i].accepted()) {
717 noise_score[i] = non_noise_limit;
718 } else {
719 noise_score[i] = blob_noise_score(blob);
720 }
721
722 if (debug_fix_space_level > 5) {
723 tprintf("%1.1f ", noise_score[i]);
724 }
725 }
726 if (debug_fix_space_level > 5) {
727 tprintf("\n");
728 }
729
730 /* Now find the worst one which is far enough away from the end of the word */
731
732 non_noise_count = 0;
733 int i;
734 for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
735 if (noise_score[i] >= non_noise_limit) {
736 non_noise_count++;
737 }
738 }
739 if (non_noise_count < fixsp_non_noise_limit) {
740 return -1;
741 }
742
743 min_noise_blob = i;
744
745 non_noise_count = 0;
746 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
747 if (noise_score[i] >= non_noise_limit) {
748 non_noise_count++;
749 }
750 }
751 if (non_noise_count < fixsp_non_noise_limit) {
752 return -1;
753 }
754
755 max_noise_blob = i;
756
757 if (min_noise_blob > max_noise_blob) {
758 return -1;
759 }
760
761 *worst_noise_score = small_limit;
762 worst_noise_blob = -1;
763 for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
764 if (noise_score[i] < *worst_noise_score) {
765 worst_noise_blob = i;
766 *worst_noise_score = noise_score[i];
767 }
768 }
769 return worst_noise_blob;
770}

◆ write_results()

void tesseract::Tesseract::write_results ( PAGE_RES_IT & page_res_it,
char  newline_type,
bool  force_eol 
)

Definition at line 99 of file output.cpp.

101 { // override tilde crunch?
102 WERD_RES *word = page_res_it.word();
103 const UNICHARSET &uchset = *word->uch_set;
104 bool need_reject = false;
105 UNICHAR_ID space = uchset.unichar_to_id(" ");
106
107 if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
108 !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
109 if ((word->unlv_crunch_mode != CR_DELETE) &&
110 (!stats_.tilde_crunch_written ||
111 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
112 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
113 if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
114 !word->word->flag(W_FUZZY_SP)) {
115 stats_.last_char_was_tilde = false;
116 }
117 need_reject = true;
118 }
119 if ((need_reject && !stats_.last_char_was_tilde) ||
120 (force_eol && stats_.write_results_empty_block)) {
121 /* Write a reject char - mark as rejected unless zero_rejection mode */
122 stats_.last_char_was_tilde = true;
123 stats_.tilde_crunch_written = true;
124 stats_.last_char_was_newline = false;
125 stats_.write_results_empty_block = false;
126 }
127
128 if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
129 stats_.tilde_crunch_written = false;
130 stats_.last_char_was_newline = true;
131 stats_.last_char_was_tilde = false;
132 }
133
134 if (force_eol) {
135 stats_.write_results_empty_block = true;
136 }
137 return;
138 }
139
140 /* NORMAL PROCESSING of non tilde crunched words */
141
142 stats_.tilde_crunch_written = false;
143 if (newline_type) {
144 stats_.last_char_was_newline = true;
145 } else {
146 stats_.last_char_was_newline = false;
147 }
148 stats_.write_results_empty_block = force_eol; // about to write a real word
149
150 if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
151 !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
152 (word->best_choice->unichar_id(0) == space)) {
153 /* Prevent adjacent tilde across words - we know that adjacent tildes within
154 words have been removed */
155 word->MergeAdjacentBlobs(0);
156 }
157 if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
158 stats_.last_char_was_tilde = false;
159 } else {
160 if (word->reject_map.length() > 0) {
161 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
162 stats_.last_char_was_tilde = true;
163 } else {
164 stats_.last_char_was_tilde = false;
165 }
166 } else if (word->word->space() > 0) {
167 stats_.last_char_was_tilde = false;
168 }
169 /* else it is unchanged as there are no output chars */
170 }
171
172 ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
173
174 set_unlv_suspects(word);
175 check_debug_pt(word, 120);
176 if (tessedit_rejection_debug) {
177 tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
178 dict_word(*(word->best_choice)));
179 }
180 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
181 if (tessedit_zero_rejection) {
182 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
183 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
184 if (word->reject_map[i].rejected()) {
185 word->reject_map[i].setrej_minimal_rej_accept();
186 }
187 }
188 }
189 if (tessedit_minimal_rejection) {
190 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
191 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
192 if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
193 word->reject_map[i].setrej_minimal_rej_accept();
194 }
195 }
196 }
197 }
198}
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:270

The documentation for this class was generated from the following files: