tesseract v5.3.3.20231005
|
#include <tesseractclass.h>
Public Member Functions | ||||||||||
Tesseract () | ||||||||||
~Tesseract () override | ||||||||||
Dict & | getDict () override | |||||||||
void | Clear () | |||||||||
void | ResetAdaptiveClassifier () | |||||||||
void | ResetDocumentDictionary () | |||||||||
void | SetEquationDetect (EquationDetect *detector) | |||||||||
const FCOORD & | reskew () const | |||||||||
Image * | mutable_pix_binary () | |||||||||
Image | pix_binary () const | |||||||||
Image | pix_grey () const | |||||||||
void | set_pix_grey (Image grey_pix) | |||||||||
Image | pix_original () const | |||||||||
void | set_pix_original (Image original_pix) | |||||||||
Image | BestPix () const | |||||||||
void | set_pix_thresholds (Image thresholds) | |||||||||
int | source_resolution () const | |||||||||
void | set_source_resolution (int ppi) | |||||||||
int | ImageWidth () const | |||||||||
int | ImageHeight () const | |||||||||
Image | scaled_color () const | |||||||||
int | scaled_factor () const | |||||||||
void | SetScaledColor (int factor, Image color) | |||||||||
const Textord & | textord () const | |||||||||
Textord * | mutable_textord () | |||||||||
bool | right_to_left () const | |||||||||
int | num_sub_langs () const | |||||||||
Tesseract * | get_sub_lang (int index) const | |||||||||
bool | AnyTessLang () const | |||||||||
bool | AnyLSTMLang () const | |||||||||
void | SetBlackAndWhitelist () | |||||||||
void | PrepareForPageseg () | |||||||||
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) | |||||||||
int | SegmentPage (const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
void | SetupWordScripts (BLOCK_LIST *blocks) | |||||||||
int | AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) | |||||||||
ColumnFinder * | SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix, Image *music_mask_pix) | |||||||||
void | PrerecAllWordsPar (const std::vector< WordData > &words) | |||||||||
bool | TrainLineRecognizer (const char *input_imagename, const std::string &output_basename, BLOCK_LIST *block_list) | |||||||||
void | TrainFromBoxes (const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, BLOCK_LIST *block_list, DocumentData *training_data) | |||||||||
ImageData * | GetLineData (const TBOX &line_box, const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, int start_box, int end_box, const BLOCK &block) | |||||||||
ImageData * | GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const | |||||||||
void | LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words) | |||||||||
void | SearchWords (PointerVector< WERD_RES > *words) | |||||||||
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) | |||||||||
void | SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words) | |||||||||
void | SetupWordPassN (int pass_n, WordData *word) | |||||||||
bool | RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words) | |||||||||
bool | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) | |||||||||
void | rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config) | |||||||||
void | bigram_correction_pass (PAGE_RES *page_res) | |||||||||
void | blamer_pass (PAGE_RES *page_res) | |||||||||
void | script_pos_pass (PAGE_RES *page_res) | |||||||||
int | RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words) | |||||||||
bool | ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) | |||||||||
void | AssignDiacriticsToOverlappingBlobs (const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs) | |||||||||
void | AssignDiacriticsToNewBlobs (const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs) | |||||||||
bool | SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines) | |||||||||
float | ClassifyBlobPlusOutlines (const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) | |||||||||
float | ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2) | |||||||||
void | classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) | |||||||||
void | classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) | |||||||||
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) | |||||||||
void | fix_rep_char (PAGE_RES_IT *page_res_it) | |||||||||
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths) | |||||||||
void | match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block) | |||||||||
void | classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) | |||||||||
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) | |||||||||
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | recog_interactive (PAGE_RES_IT *pr_it) | |||||||||
void | set_word_fonts (WERD_RES *word) | |||||||||
void | font_recognition_pass (PAGE_RES *page_res) | |||||||||
void | dictionary_correction_pass (PAGE_RES *page_res) | |||||||||
bool | check_debug_pt (WERD_RES *word, int location) | |||||||||
bool | SubAndSuperscriptFix (WERD_RES *word_res) | |||||||||
void | GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold) | |||||||||
WERD_RES * | TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing) | |||||||||
bool | BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const | |||||||||
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) | |||||||||
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol) | |||||||||
void | set_unlv_suspects (WERD_RES *word) | |||||||||
UNICHAR_ID | get_rep_char (WERD_RES *word) | |||||||||
bool | acceptable_number_string (const char *s, const char *lengths) | |||||||||
int16_t | count_alphanums (const WERD_CHOICE &word) | |||||||||
int16_t | count_alphas (const WERD_CHOICE &word) | |||||||||
void | read_config_file (const char *filename, SetParamConstraint constraint) | |||||||||
int | init_tesseract (const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr) | |||||||||
int | init_tesseract (const std::string &datapath, const std::string &language, OcrEngineMode oem) | |||||||||
int | init_tesseract_internal (const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr) | |||||||||
void | SetupUniversalFontIds () | |||||||||
void | recognize_page (std::string &image_name) | |||||||||
void | end_tesseract () | |||||||||
bool | init_tesseract_lang_data (const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr) | |||||||||
void | ParseLanguageString (const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load) | |||||||||
SVMenuNode * | build_menu_new () | |||||||||
void | pgeditor_main (int width, int height, PAGE_RES *page_res) | |||||||||
void | process_image_event (const SVEvent &event) | |||||||||
bool | process_cmd_win_event (int32_t cmd_event, char *new_value) | |||||||||
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) | |||||||||
bool | word_display (PAGE_RES_IT *pr_it) | |||||||||
bool | word_bln_display (PAGE_RES_IT *pr_it) | |||||||||
bool | word_blank_and_set_display (PAGE_RES_IT *pr_its) | |||||||||
bool | word_set_display (PAGE_RES_IT *pr_it) | |||||||||
bool | word_dumper (PAGE_RES_IT *pr_it) | |||||||||
void | blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | make_reject_map (WERD_RES *word, ROW *row, int16_t pass) | |||||||||
bool | one_ell_conflict (WERD_RES *word_res, bool update_map) | |||||||||
int16_t | first_alphanum_index (const char *word, const char *word_lengths) | |||||||||
int16_t | first_alphanum_offset (const char *word, const char *word_lengths) | |||||||||
int16_t | alpha_count (const char *word, const char *word_lengths) | |||||||||
bool | word_contains_non_1_digit (const char *word, const char *word_lengths) | |||||||||
void | dont_allow_1Il (WERD_RES *word) | |||||||||
int16_t | count_alphanums (WERD_RES *word) | |||||||||
void | flip_0O (WERD_RES *word) | |||||||||
bool | non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
bool | non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
bool | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) | |||||||||
void | nn_match_word (WERD_RES *word, ROW *row) | |||||||||
void | nn_recover_rejects (WERD_RES *word, ROW *row) | |||||||||
void | set_done (WERD_RES *word, int16_t pass) | |||||||||
int16_t | safe_dict_word (const WERD_RES *werd_res) | |||||||||
void | flip_hyphens (WERD_RES *word) | |||||||||
void | reject_I_1_L (WERD_RES *word) | |||||||||
void | reject_edge_blobs (WERD_RES *word) | |||||||||
void | reject_mostly_rejects (WERD_RES *word) | |||||||||
bool | word_adaptable (WERD_RES *word, uint16_t mode) | |||||||||
void | recog_word_recursive (WERD_RES *word) | |||||||||
void | recog_word (WERD_RES *word) | |||||||||
void | split_and_recog_word (WERD_RES *word) | |||||||||
void | split_word (WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const | |||||||||
void | join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const | |||||||||
GARBAGE_LEVEL | garbage_word (WERD_RES *word, bool ok_dict_word) | |||||||||
bool | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word) | |||||||||
void | tilde_crunch (PAGE_RES_IT &page_res_it) | |||||||||
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) | |||||||||
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) | |||||||||
void | quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) | |||||||||
void | convert_bad_unlv_chs (WERD_RES *word_res) | |||||||||
void | tilde_delete (PAGE_RES_IT &page_res_it) | |||||||||
int16_t | word_blob_quality (WERD_RES *word) | |||||||||
void | word_char_quality (WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count) | |||||||||
void | unrej_good_chs (WERD_RES *word) | |||||||||
int16_t | count_outline_errs (char c, int16_t outline_count) | |||||||||
int16_t | word_outline_errs (WERD_RES *word) | |||||||||
bool | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) | |||||||||
CRUNCH_MODE | word_deletable (WERD_RES *word, int16_t &delete_mode) | |||||||||
int16_t | failure_count (WERD_RES *word) | |||||||||
bool | noise_outlines (TWERD *word) | |||||||||
void | tess_segment_pass_n (int pass_n, WERD_RES *word) | |||||||||
PAGE_RES * | ApplyBoxes (const char *filename, bool find_segmentation, BLOCK_LIST *block_list) | |||||||||
void | PreenXHeights (BLOCK_LIST *block_list) | |||||||||
PAGE_RES * | SetupApplyBoxes (const std::vector< TBOX > &boxes, BLOCK_LIST *block_list) | |||||||||
void | MaximallyChopWord (const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text) | |||||||||
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text) | |||||||||
void | ReSegmentByClassification (PAGE_RES *page_res) | |||||||||
bool | ConvertStringToUnichars (const char *utf8, std::vector< UNICHAR_ID > *class_ids) | |||||||||
bool | FindSegmentation (const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res) | |||||||||
void | SearchForText (const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation) | |||||||||
void | TidyUp (PAGE_RES *page_res) | |||||||||
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) | |||||||||
void | CorrectClassifyWords (PAGE_RES *page_res) | |||||||||
void | ApplyBoxTraining (const std::string &fontname, PAGE_RES *page_res) | |||||||||
int | CountMisfitTops (WERD_RES *word_res) | |||||||||
float | ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift) | |||||||||
BOOL_VAR_H (tessedit_resegment_from_boxes) | ||||||||||
BOOL_VAR_H (tessedit_resegment_from_line_boxes) | ||||||||||
BOOL_VAR_H (tessedit_train_from_boxes) | ||||||||||
BOOL_VAR_H (tessedit_make_boxes_from_boxes) | ||||||||||
BOOL_VAR_H (tessedit_train_line_recognizer) | ||||||||||
BOOL_VAR_H (tessedit_dump_pageseg_images) | ||||||||||
BOOL_VAR_H (tessedit_do_invert) | ||||||||||
double_VAR_H (invert_threshold) | ||||||||||
INT_VAR_H (tessedit_pageseg_mode) | ||||||||||
INT_VAR_H (thresholding_method) | ||||||||||
BOOL_VAR_H (thresholding_debug) | ||||||||||
double_VAR_H (thresholding_window_size) | ||||||||||
double_VAR_H (thresholding_kfactor) | ||||||||||
double_VAR_H (thresholding_tile_size) | ||||||||||
double_VAR_H (thresholding_smooth_kernel_size) | ||||||||||
double_VAR_H (thresholding_score_fraction) | ||||||||||
INT_VAR_H (tessedit_ocr_engine_mode) | ||||||||||
STRING_VAR_H (tessedit_char_blacklist) | ||||||||||
STRING_VAR_H (tessedit_char_whitelist) | ||||||||||
STRING_VAR_H (tessedit_char_unblacklist) | ||||||||||
BOOL_VAR_H (tessedit_ambigs_training) | ||||||||||
INT_VAR_H (pageseg_devanagari_split_strategy) | ||||||||||
INT_VAR_H (ocr_devanagari_split_strategy) | ||||||||||
STRING_VAR_H (tessedit_write_params_to_file) | ||||||||||
BOOL_VAR_H (tessedit_adaption_debug) | ||||||||||
INT_VAR_H (bidi_debug) | ||||||||||
INT_VAR_H (applybox_debug) | ||||||||||
INT_VAR_H (applybox_page) | ||||||||||
STRING_VAR_H (applybox_exposure_pattern) | ||||||||||
BOOL_VAR_H (applybox_learn_chars_and_char_frags_mode) | ||||||||||
BOOL_VAR_H (applybox_learn_ngrams_mode) | ||||||||||
BOOL_VAR_H (tessedit_display_outwords) | ||||||||||
BOOL_VAR_H (tessedit_dump_choices) | ||||||||||
BOOL_VAR_H (tessedit_timing_debug) | ||||||||||
BOOL_VAR_H (tessedit_fix_fuzzy_spaces) | ||||||||||
BOOL_VAR_H (tessedit_unrej_any_wd) | ||||||||||
BOOL_VAR_H (tessedit_fix_hyphens) | ||||||||||
BOOL_VAR_H (tessedit_enable_doc_dict) | ||||||||||
BOOL_VAR_H (tessedit_debug_fonts) | ||||||||||
INT_VAR_H (tessedit_font_id) | ||||||||||
BOOL_VAR_H (tessedit_debug_block_rejection) | ||||||||||
BOOL_VAR_H (tessedit_enable_bigram_correction) | ||||||||||
BOOL_VAR_H (tessedit_enable_dict_correction) | ||||||||||
INT_VAR_H (tessedit_bigram_debug) | ||||||||||
BOOL_VAR_H (enable_noise_removal) | ||||||||||
INT_VAR_H (debug_noise_removal) | ||||||||||
double_VAR_H (noise_cert_basechar) | ||||||||||
double_VAR_H (noise_cert_disjoint) | ||||||||||
double_VAR_H (noise_cert_punc) | ||||||||||
double_VAR_H (noise_cert_factor) | ||||||||||
INT_VAR_H (noise_maxperblob) | ||||||||||
INT_VAR_H (noise_maxperword) | ||||||||||
INT_VAR_H (debug_x_ht_level) | ||||||||||
STRING_VAR_H (chs_leading_punct) | ||||||||||
STRING_VAR_H (chs_trailing_punct1) | ||||||||||
STRING_VAR_H (chs_trailing_punct2) | ||||||||||
double_VAR_H (quality_rej_pc) | ||||||||||
double_VAR_H (quality_blob_pc) | ||||||||||
double_VAR_H (quality_outline_pc) | ||||||||||
double_VAR_H (quality_char_pc) | ||||||||||
INT_VAR_H (quality_min_initial_alphas_reqd) | ||||||||||
INT_VAR_H (tessedit_tess_adaption_mode) | ||||||||||
BOOL_VAR_H (tessedit_minimal_rej_pass1) | ||||||||||
BOOL_VAR_H (tessedit_test_adaption) | ||||||||||
BOOL_VAR_H (test_pt) | ||||||||||
double_VAR_H (test_pt_x) | ||||||||||
double_VAR_H (test_pt_y) | ||||||||||
INT_VAR_H (multilang_debug_level) | ||||||||||
INT_VAR_H (paragraph_debug_level) | ||||||||||
BOOL_VAR_H (paragraph_text_based) | ||||||||||
BOOL_VAR_H (lstm_use_matrix) | ||||||||||
STRING_VAR_H (outlines_odd) | ||||||||||
STRING_VAR_H (outlines_2) | ||||||||||
BOOL_VAR_H (tessedit_good_quality_unrej) | ||||||||||
BOOL_VAR_H (tessedit_use_reject_spaces) | ||||||||||
double_VAR_H (tessedit_reject_doc_percent) | ||||||||||
double_VAR_H (tessedit_reject_block_percent) | ||||||||||
double_VAR_H (tessedit_reject_row_percent) | ||||||||||
double_VAR_H (tessedit_whole_wd_rej_row_percent) | ||||||||||
BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds) | ||||||||||
BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds) | ||||||||||
BOOL_VAR_H (tessedit_dont_blkrej_good_wds) | ||||||||||
BOOL_VAR_H (tessedit_dont_rowrej_good_wds) | ||||||||||
INT_VAR_H (tessedit_preserve_min_wd_len) | ||||||||||
BOOL_VAR_H (tessedit_row_rej_good_docs) | ||||||||||
double_VAR_H (tessedit_good_doc_still_rowrej_wd) | ||||||||||
BOOL_VAR_H (tessedit_reject_bad_qual_wds) | ||||||||||
BOOL_VAR_H (tessedit_debug_doc_rejection) | ||||||||||
BOOL_VAR_H (tessedit_debug_quality_metrics) | ||||||||||
BOOL_VAR_H (bland_unrej) | ||||||||||
double_VAR_H (quality_rowrej_pc) | ||||||||||
BOOL_VAR_H (unlv_tilde_crunching) | ||||||||||
BOOL_VAR_H (hocr_font_info) | ||||||||||
BOOL_VAR_H (hocr_char_boxes) | ||||||||||
BOOL_VAR_H (crunch_early_merge_tess_fails) | ||||||||||
BOOL_VAR_H (crunch_early_convert_bad_unlv_chs) | ||||||||||
double_VAR_H (crunch_terrible_rating) | ||||||||||
BOOL_VAR_H (crunch_terrible_garbage) | ||||||||||
double_VAR_H (crunch_poor_garbage_cert) | ||||||||||
double_VAR_H (crunch_poor_garbage_rate) | ||||||||||
double_VAR_H (crunch_pot_poor_rate) | ||||||||||
double_VAR_H (crunch_pot_poor_cert) | ||||||||||
double_VAR_H (crunch_del_rating) | ||||||||||
double_VAR_H (crunch_del_cert) | ||||||||||
double_VAR_H (crunch_del_min_ht) | ||||||||||
double_VAR_H (crunch_del_max_ht) | ||||||||||
double_VAR_H (crunch_del_min_width) | ||||||||||
double_VAR_H (crunch_del_high_word) | ||||||||||
double_VAR_H (crunch_del_low_word) | ||||||||||
double_VAR_H (crunch_small_outlines_size) | ||||||||||
INT_VAR_H (crunch_rating_max) | ||||||||||
INT_VAR_H (crunch_pot_indicators) | ||||||||||
BOOL_VAR_H (crunch_leave_ok_strings) | ||||||||||
BOOL_VAR_H (crunch_accept_ok) | ||||||||||
BOOL_VAR_H (crunch_leave_accept_strings) | ||||||||||
BOOL_VAR_H (crunch_include_numerals) | ||||||||||
INT_VAR_H (crunch_leave_lc_strings) | ||||||||||
INT_VAR_H (crunch_leave_uc_strings) | ||||||||||
INT_VAR_H (crunch_long_repetitions) | ||||||||||
INT_VAR_H (crunch_debug) | ||||||||||
INT_VAR_H (fixsp_non_noise_limit) | ||||||||||
double_VAR_H (fixsp_small_outlines_size) | ||||||||||
BOOL_VAR_H (tessedit_prefer_joined_punct) | ||||||||||
INT_VAR_H (fixsp_done_mode) | ||||||||||
INT_VAR_H (debug_fix_space_level) | ||||||||||
STRING_VAR_H (numeric_punctuation) | ||||||||||
INT_VAR_H (x_ht_acceptance_tolerance) | ||||||||||
INT_VAR_H (x_ht_min_change) | ||||||||||
INT_VAR_H (superscript_debug) | ||||||||||
double_VAR_H (superscript_worse_certainty) | ||||||||||
double_VAR_H (superscript_bettered_certainty) | ||||||||||
double_VAR_H (superscript_scaledown_ratio) | ||||||||||
double_VAR_H (subscript_max_y_top) | ||||||||||
double_VAR_H (superscript_min_y_bottom) | ||||||||||
BOOL_VAR_H (tessedit_write_block_separators) | ||||||||||
BOOL_VAR_H (tessedit_write_rep_codes) | ||||||||||
BOOL_VAR_H (tessedit_write_unlv) | ||||||||||
BOOL_VAR_H (tessedit_create_txt) | ||||||||||
BOOL_VAR_H (tessedit_create_hocr) | ||||||||||
BOOL_VAR_H (tessedit_create_alto) | ||||||||||
BOOL_VAR_H (tessedit_create_lstmbox) | ||||||||||
BOOL_VAR_H (tessedit_create_tsv) | ||||||||||
BOOL_VAR_H (tessedit_create_wordstrbox) | ||||||||||
BOOL_VAR_H (tessedit_create_pdf) | ||||||||||
BOOL_VAR_H (textonly_pdf) | ||||||||||
INT_VAR_H (jpg_quality) | ||||||||||
INT_VAR_H (user_defined_dpi) | ||||||||||
INT_VAR_H (min_characters_to_try) | ||||||||||
STRING_VAR_H (unrecognised_char) | ||||||||||
INT_VAR_H (suspect_level) | ||||||||||
INT_VAR_H (suspect_short_words) | ||||||||||
BOOL_VAR_H (suspect_constrain_1Il) | ||||||||||
double_VAR_H (suspect_rating_per_ch) | ||||||||||
double_VAR_H (suspect_accept_rating) | ||||||||||
BOOL_VAR_H (tessedit_minimal_rejection) | ||||||||||
BOOL_VAR_H (tessedit_zero_rejection) | ||||||||||
BOOL_VAR_H (tessedit_word_for_word) | ||||||||||
BOOL_VAR_H (tessedit_zero_kelvin_rejection) | ||||||||||
INT_VAR_H (tessedit_reject_mode) | ||||||||||
BOOL_VAR_H (tessedit_rejection_debug) | ||||||||||
BOOL_VAR_H (tessedit_flip_0O) | ||||||||||
double_VAR_H (tessedit_lower_flip_hyphen) | ||||||||||
double_VAR_H (tessedit_upper_flip_hyphen) | ||||||||||
BOOL_VAR_H (rej_trust_doc_dawg) | ||||||||||
BOOL_VAR_H (rej_1Il_use_dict_word) | ||||||||||
BOOL_VAR_H (rej_1Il_trust_permuter_type) | ||||||||||
BOOL_VAR_H (rej_use_tess_accepted) | ||||||||||
BOOL_VAR_H (rej_use_tess_blanks) | ||||||||||
BOOL_VAR_H (rej_use_good_perm) | ||||||||||
BOOL_VAR_H (rej_use_sensible_wd) | ||||||||||
BOOL_VAR_H (rej_alphas_in_number_perm) | ||||||||||
double_VAR_H (rej_whole_of_mostly_reject_word_fract) | ||||||||||
INT_VAR_H (tessedit_image_border) | ||||||||||
STRING_VAR_H (ok_repeated_ch_non_alphanum_wds) | ||||||||||
STRING_VAR_H (conflict_set_I_l_1) | ||||||||||
INT_VAR_H (min_sane_x_ht_pixels) | ||||||||||
BOOL_VAR_H (tessedit_create_boxfile) | ||||||||||
INT_VAR_H (tessedit_page_number) | ||||||||||
BOOL_VAR_H (tessedit_write_images) | ||||||||||
BOOL_VAR_H (interactive_display_mode) | ||||||||||
STRING_VAR_H (file_type) | ||||||||||
BOOL_VAR_H (tessedit_override_permuter) | ||||||||||
STRING_VAR_H (tessedit_load_sublangs) | ||||||||||
BOOL_VAR_H (tessedit_use_primary_params_model) | ||||||||||
double_VAR_H (min_orientation_margin) | ||||||||||
BOOL_VAR_H (textord_tabfind_show_vlines) | ||||||||||
BOOL_VAR_H (textord_use_cjk_fp_model) | ||||||||||
BOOL_VAR_H (poly_allow_detailed_fx) | ||||||||||
BOOL_VAR_H (tessedit_init_config_only) | ||||||||||
BOOL_VAR_H (textord_equation_detect) | ||||||||||
BOOL_VAR_H (textord_tabfind_vertical_text) | ||||||||||
BOOL_VAR_H (textord_tabfind_force_vertical_text) | ||||||||||
double_VAR_H (textord_tabfind_vertical_text_ratio) | ||||||||||
double_VAR_H (textord_tabfind_aligned_gap_fraction) | ||||||||||
INT_VAR_H (tessedit_parallelize) | ||||||||||
BOOL_VAR_H (preserve_interword_spaces) | ||||||||||
STRING_VAR_H (page_separator) | ||||||||||
INT_VAR_H (lstm_choice_mode) | ||||||||||
INT_VAR_H (lstm_choice_iterations) | ||||||||||
double_VAR_H (lstm_rating_coefficient) | ||||||||||
BOOL_VAR_H (pageseg_apply_music_mask) | ||||||||||
FILE * | init_recog_training (const char *filename) | |||||||||
void | recog_training_segmented (const char *filename, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) | |||||||||
void | ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file) | |||||||||
eval_word_spacing() | ||||||||||
The basic measure is the number of characters in contextually confirmed words (i.e. words that are done). If all words are contextually confirmed, the evaluation is deemed perfect. Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred. The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space. Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 (all 5 chars in a numeric word plus 2 sides of the "1" joined).
The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2. | ||||||||||
bool | digit_or_numeric_punct (WERD_RES *word, int char_position) | |||||||||
int16_t | eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
fix_fuzzy_spaces() | ||||||||||
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
| ||||||||||
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) | |||||||||
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) | |||||||||
fix_sp_fp_word() | ||||||||||
Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words. | ||||||||||
int16_t | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) | |||||||||
int16_t | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) | |||||||||
float | blob_noise_score (TBLOB *blob) | |||||||||
void | break_noisiest_blob_word (WERD_RES_LIST &words) | |||||||||
transform_to_next_perm() | ||||||||||
Examines the current word list to find the smallest word gap size. Then walks the word list closing any gaps of this size by either inserting new combination words, or extending existing ones. The routine COULD be limited to stop it building words longer than N blobs. If there are no more gaps then it DELETES the entire list and returns the empty list to cause termination. | ||||||||||
void | dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) | |||||||||
bool | fixspace_thinks_word_done (WERD_RES *word) | |||||||||
process_selected_words() | ||||||||||
Walk the current block list applying the specified word processor function to each word that overlaps the selection_box. | ||||||||||
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) | |||||||||
tess_add_doc_word | ||||||||||
Add the given word to the document dictionary | ||||||||||
void | tess_add_doc_word (WERD_CHOICE *word_choice) | |||||||||
tess_acceptable_word | ||||||||||
| ||||||||||
bool | tess_acceptable_word (WERD_RES *word) | |||||||||
![]() | ||||||||||
BOOL_VAR_H (merge_fragments_in_matrix) | ||||||||||
BOOL_VAR_H (wordrec_enable_assoc) | ||||||||||
BOOL_VAR_H (force_word_assoc) | ||||||||||
INT_VAR_H (repair_unchopped_blobs) | ||||||||||
double_VAR_H (tessedit_certainty_threshold) | ||||||||||
INT_VAR_H (chop_debug) | ||||||||||
BOOL_VAR_H (chop_enable) | ||||||||||
BOOL_VAR_H (chop_vertical_creep) | ||||||||||
INT_VAR_H (chop_split_length) | ||||||||||
INT_VAR_H (chop_same_distance) | ||||||||||
INT_VAR_H (chop_min_outline_points) | ||||||||||
INT_VAR_H (chop_seam_pile_size) | ||||||||||
BOOL_VAR_H (chop_new_seam_pile) | ||||||||||
INT_VAR_H (chop_inside_angle) | ||||||||||
INT_VAR_H (chop_min_outline_area) | ||||||||||
double_VAR_H (chop_split_dist_knob) | ||||||||||
double_VAR_H (chop_overlap_knob) | ||||||||||
double_VAR_H (chop_center_knob) | ||||||||||
INT_VAR_H (chop_centered_maxwidth) | ||||||||||
double_VAR_H (chop_sharpness_knob) | ||||||||||
double_VAR_H (chop_width_change_knob) | ||||||||||
double_VAR_H (chop_ok_split) | ||||||||||
double_VAR_H (chop_good_split) | ||||||||||
INT_VAR_H (chop_x_y_weight) | ||||||||||
BOOL_VAR_H (assume_fixed_pitch_char_segment) | ||||||||||
INT_VAR_H (wordrec_debug_level) | ||||||||||
INT_VAR_H (wordrec_max_join_chunks) | ||||||||||
BOOL_VAR_H (wordrec_skip_no_truth_words) | ||||||||||
BOOL_VAR_H (wordrec_debug_blamer) | ||||||||||
BOOL_VAR_H (wordrec_run_blamer) | ||||||||||
INT_VAR_H (segsearch_debug_level) | ||||||||||
INT_VAR_H (segsearch_max_pain_points) | ||||||||||
INT_VAR_H (segsearch_max_futile_classifications) | ||||||||||
double_VAR_H (segsearch_max_char_wh_ratio) | ||||||||||
BOOL_VAR_H (save_alt_choices) | ||||||||||
Wordrec () | ||||||||||
~Wordrec () override=default | ||||||||||
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) | |||||||||
void | FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | |||||||||
void | CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | |||||||||
void | SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | |||||||||
void | InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | |||||||||
void | add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams) | |||||||||
void | choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) | |||||||||
void | combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue) | |||||||||
SEAM * | pick_good_seam (TBLOB *blob) | |||||||||
void | try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) | |||||||||
void | try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) | |||||||||
PRIORITY | grade_split_length (SPLIT *split) | |||||||||
PRIORITY | grade_sharpness (SPLIT *split) | |||||||||
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) | |||||||||
virtual BLOB_CHOICE_LIST * | classify_piece (const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle) | |||||||||
void | program_editup (const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict) | |||||||||
void | cc_recog (WERD_RES *word) | |||||||||
void | program_editdown (int32_t elasped_time) | |||||||||
void | set_pass1 () | |||||||||
void | set_pass2 () | |||||||||
int | end_recog () | |||||||||
BLOB_CHOICE_LIST * | call_matcher (TBLOB *blob) | |||||||||
int | dict_word (const WERD_CHOICE &word) | |||||||||
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const char *string, ScrollView::Color color, BlamerBundle *blamer_bundle) | |||||||||
PRIORITY | point_priority (EDGEPT *point) | |||||||||
void | add_point_to_list (PointHeap *point_heap, EDGEPT *point) | |||||||||
bool | is_inside_angle (EDGEPT *pt) | |||||||||
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) | |||||||||
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) | |||||||||
void | prioritize_points (TESSLINE *outline, PointHeap *points) | |||||||||
void | new_min_point (EDGEPT *local_min, PointHeap *points) | |||||||||
void | new_max_point (EDGEPT *local_max, PointHeap *points) | |||||||||
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) | |||||||||
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const std::vector< SEAM * > &seams) | |||||||||
SEAM * | chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const std::vector< SEAM * > &seams) | |||||||||
SEAM * | chop_overlapping_blob (const std::vector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, unsigned *blob_number) | |||||||||
SEAM * | improve_one_blob (const std::vector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, unsigned *blob_number) | |||||||||
SEAM * | chop_one_blob (const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number) | |||||||||
void | chop_word_main (WERD_RES *word) | |||||||||
void | improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending) | |||||||||
int | select_blob_to_split (const std::vector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment) | |||||||||
int | select_blob_to_split_from_fixpt (DANGERR *fixpt) | |||||||||
Public Member Functions inherited from tesseract::Classify | ||||||||||
Classify () | ||||||||||
~Classify () override | ||||||||||
virtual Dict & | getDict () | |||||||||
const ShapeTable * | shape_table () const | |||||||||
void | SetStaticClassifier (ShapeClassifier *static_classifier) | |||||||||
void | AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices) | |||||||||
bool | LargeSpeckle (const TBLOB &blob) | |||||||||
int | GetFontinfoId (ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId) | |||||||||
int | PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, std::vector< CP_RESULT_STRUCT > *results) | |||||||||
void | ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs) | |||||||||
void | PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) | |||||||||
void | WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) | |||||||||
ADAPT_TEMPLATES_STRUCT * | ReadAdaptedTemplates (TFile *File) | |||||||||
void | ConvertProto (PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class) | |||||||||
INT_TEMPLATES_STRUCT * | CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset) | |||||||||
void | LearnWord (const char *fontname, WERD_RES *word) | |||||||||
void | LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word) | |||||||||
void | InitAdaptiveClassifier (TessdataManager *mgr) | |||||||||
void | InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, ADAPT_TEMPLATES_STRUCT *Templates) | |||||||||
void | AmbigClassifier (const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) | |||||||||
void | MasterMatcher (INT_TEMPLATES_STRUCT *templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, const std::vector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results) | |||||||||
void | ExpandShapesAndApplyCorrections (ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results) | |||||||||
double | ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors) | |||||||||
void | ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) | |||||||||
void | AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results) | |||||||||
int | GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures) | |||||||||
void | DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results) | |||||||||
PROTO_ID | MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask) | |||||||||
int | MakeNewTemporaryConfig (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures) | |||||||||
void | MakePermanent (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob) | |||||||||
void | PrintAdaptiveMatchResults (const ADAPT_RESULTS &results) | |||||||||
void | RemoveExtraPuncs (ADAPT_RESULTS *Results) | |||||||||
void | RemoveBadMatches (ADAPT_RESULTS *Results) | |||||||||
void | SetAdaptiveThreshold (float Threshold) | |||||||||
void | ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features) | |||||||||
std::string | ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const | |||||||||
int | ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const | |||||||||
int | ShapeIDToClassID (int shape_id) const | |||||||||
UNICHAR_ID * | BaselineClassifier (TBLOB *Blob, const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results) | |||||||||
int | CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results) | |||||||||
int | CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, std::vector< UnicharRating > *results) | |||||||||
UNICHAR_ID * | GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass) | |||||||||
void | DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results) | |||||||||
void | AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES_STRUCT *adaptive_templates) | |||||||||
void | DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class) | |||||||||
bool | AdaptableWord (WERD_RES *word) | |||||||||
void | EndAdaptiveClassifier () | |||||||||
void | SettupPass1 () | |||||||||
void | SettupPass2 () | |||||||||
void | AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices) | |||||||||
void | ClassifyAsNoise (ADAPT_RESULTS *Results) | |||||||||
void | ResetAdaptiveClassifierInternal () | |||||||||
void | SwitchAdaptiveClassifier () | |||||||||
void | StartBackupAdaptiveClassifier () | |||||||||
int | GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array) | |||||||||
void | ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array) | |||||||||
bool | TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config) | |||||||||
void | UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob) | |||||||||
bool | AdaptiveClassifierIsFull () const | |||||||||
bool | AdaptiveClassifierIsEmpty () const | |||||||||
bool | LooksLikeGarbage (TBLOB *blob) | |||||||||
void | RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox) | |||||||||
void | ClearCharNormArray (uint8_t *char_norm_array) | |||||||||
void | ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array) | |||||||||
void | ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures) | |||||||||
INT_TEMPLATES_STRUCT * | ReadIntTemplates (TFile *fp) | |||||||||
void | WriteIntTemplates (FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset) | |||||||||
CLASS_ID | GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id) | |||||||||
void | ShowMatchDisplay () | |||||||||
UnicityTable< FontInfo > & | get_fontinfo_table () | |||||||||
const UnicityTable< FontInfo > & | get_fontinfo_table () const | |||||||||
UnicityTable< FontSet > & | get_fontset_table () | |||||||||
void | NormalizeOutlines (LIST Outlines, float *XScale, float *YScale) | |||||||||
FEATURE_SET | ExtractOutlineFeatures (TBLOB *Blob) | |||||||||
FEATURE_SET | ExtractPicoFeatures (TBLOB *Blob) | |||||||||
FEATURE_SET | ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) | |||||||||
FEATURE_SET | ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) | |||||||||
void | LearnBlob (const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) | |||||||||
bool | WriteTRFile (const char *filename) | |||||||||
BOOL_VAR_H (allow_blob_division) | ||||||||||
BOOL_VAR_H (prioritize_division) | ||||||||||
BOOL_VAR_H (classify_enable_learning) | ||||||||||
INT_VAR_H (classify_debug_level) | ||||||||||
INT_VAR_H (classify_norm_method) | ||||||||||
double_VAR_H (classify_char_norm_range) | ||||||||||
double_VAR_H (classify_max_rating_ratio) | ||||||||||
double_VAR_H (classify_max_certainty_margin) | ||||||||||
BOOL_VAR_H (tess_cn_matching) | ||||||||||
BOOL_VAR_H (tess_bn_matching) | ||||||||||
BOOL_VAR_H (classify_enable_adaptive_matcher) | ||||||||||
BOOL_VAR_H (classify_use_pre_adapted_templates) | ||||||||||
BOOL_VAR_H (classify_save_adapted_templates) | ||||||||||
BOOL_VAR_H (classify_enable_adaptive_debugger) | ||||||||||
BOOL_VAR_H (classify_nonlinear_norm) | ||||||||||
INT_VAR_H (matcher_debug_level) | ||||||||||
INT_VAR_H (matcher_debug_flags) | ||||||||||
INT_VAR_H (classify_learning_debug_level) | ||||||||||
double_VAR_H (matcher_good_threshold) | ||||||||||
double_VAR_H (matcher_reliable_adaptive_result) | ||||||||||
double_VAR_H (matcher_perfect_threshold) | ||||||||||
double_VAR_H (matcher_bad_match_pad) | ||||||||||
double_VAR_H (matcher_rating_margin) | ||||||||||
double_VAR_H (matcher_avg_noise_size) | ||||||||||
INT_VAR_H (matcher_permanent_classes_min) | ||||||||||
INT_VAR_H (matcher_min_examples_for_prototyping) | ||||||||||
INT_VAR_H (matcher_sufficient_examples_for_prototyping) | ||||||||||
double_VAR_H (matcher_clustering_max_angle_delta) | ||||||||||
double_VAR_H (classify_misfit_junk_penalty) | ||||||||||
double_VAR_H (rating_scale) | ||||||||||
double_VAR_H (tessedit_class_miss_scale) | ||||||||||
double_VAR_H (classify_adapted_pruning_factor) | ||||||||||
double_VAR_H (classify_adapted_pruning_threshold) | ||||||||||
INT_VAR_H (classify_adapt_proto_threshold) | ||||||||||
INT_VAR_H (classify_adapt_feature_threshold) | ||||||||||
BOOL_VAR_H (disable_character_fragments) | ||||||||||
double_VAR_H (classify_character_fragments_garbage_certainty_threshold) | ||||||||||
BOOL_VAR_H (classify_debug_character_fragments) | ||||||||||
BOOL_VAR_H (matcher_debug_separate_windows) | ||||||||||
STRING_VAR_H (classify_learn_debug_str) | ||||||||||
INT_VAR_H (classify_class_pruner_threshold) | ||||||||||
INT_VAR_H (classify_class_pruner_multiplier) | ||||||||||
INT_VAR_H (classify_cp_cutoff_strength) | ||||||||||
INT_VAR_H (classify_integer_matcher_multiplier) | ||||||||||
BOOL_VAR_H (classify_bln_numeric_mode) | ||||||||||
double_VAR_H (speckle_large_max_size) | ||||||||||
double_VAR_H (speckle_rating_penalty) | ||||||||||
float | ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) | |||||||||
void | FreeNormProtos () | |||||||||
NORM_PROTOS * | ReadNormProtos (TFile *fp) | |||||||||
Public Member Functions inherited from tesseract::CCUtil | ||||||||||
CCUtil () | ||||||||||
virtual | ~CCUtil () | |||||||||
void | main_setup (const std::string &argv0, const std::string &basename) | |||||||||
CCUtil::main_setup - set location of tessdata and name of image. More... | ||||||||||
ParamsVectors * | params () | |||||||||
INT_VAR_H (ambigs_debug_level) | ||||||||||
BOOL_VAR_H (use_ambigs_for_adaption) | ||||||||||
Additional Inherited Members | |
![]() | |
static void | SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info) |
static void | ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, std::vector< INT_FEATURE_STRUCT > *bl_features, std::vector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, std::vector< int > *outline_cn_counts) |
![]() | |
std::unique_ptr< LanguageModel > | language_model_ |
PRIORITY | pass2_ok_split |
WERD_CHOICE * | prev_word_best_choice_ |
void(Wordrec::* | fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
![]() | |
INT_TEMPLATES_STRUCT * | PreTrainedTemplates = nullptr |
ADAPT_TEMPLATES_STRUCT * | AdaptedTemplates = nullptr |
ADAPT_TEMPLATES_STRUCT * | BackupAdaptedTemplates = nullptr |
BIT_VECTOR | AllProtosOn = nullptr |
BIT_VECTOR | AllConfigsOn = nullptr |
BIT_VECTOR | AllConfigsOff = nullptr |
BIT_VECTOR | TempProtoMask = nullptr |
NORM_PROTOS * | NormProtos = nullptr |
UnicityTable< FontInfo > | fontinfo_table_ |
UnicityTable< FontSet > | fontset_table_ |
bool | EnableLearning = true |
![]() | |
std::string | datadir |
std::string | imagebasename |
std::string | lang |
std::string | language_data_path_prefix |
UNICHARSET | unicharset |
UnicharAmbigs | unichar_ambigs |
std::string | imagefile |
std::string | directory |
![]() | |
static const double | kDescenderFraction = 0.25 |
static const double | kXHeightFraction = 0.5 |
static const double | kAscenderFraction = 0.25 |
static const double | kXHeightCapRatio |
![]() | |
bool | SegSearchDone (int num_futile_classifications) |
void | UpdateSegSearchNodes (float rating_cert_scale, int starting_col, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle) |
void | ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, std::vector< SegSearchPending > &pending) |
void | InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, std::string &blamer_debug) |
![]() | |
IntegerMatcher | im_ |
FEATURE_DEFS_STRUCT | feature_defs_ |
ShapeTable * | shape_table_ = nullptr |
Definition at line 178 of file tesseractclass.h.
tesseract::Tesseract::Tesseract | ( | ) |
Definition at line 53 of file tesseractclass.cpp.
|
override |
Definition at line 469 of file tesseractclass.cpp.
bool tesseract::Tesseract::acceptable_number_string | ( | const char * | s, |
const char * | lengths | ||
) |
Definition at line 386 of file output.cpp.
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string | ( | const UNICHARSET & | char_set, |
const char * | s, | ||
const char * | lengths | ||
) |
Definition at line 1692 of file control.cpp.
int16_t tesseract::Tesseract::alpha_count | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 483 of file reject.cpp.
void tesseract::Tesseract::ambigs_classify_and_output | ( | const char * | label, |
PAGE_RES_IT * | pr_it, | ||
FILE * | output_file | ||
) |
Definition at line 203 of file recogtraining.cpp.
|
inline |
Definition at line 302 of file tesseractclass.h.
|
inline |
Definition at line 290 of file tesseractclass.h.
PAGE_RES * tesseract::Tesseract::ApplyBoxes | ( | const char * | filename, |
bool | find_segmentation, | ||
BLOCK_LIST * | block_list | ||
) |
Definition at line 110 of file applybox.cpp.
void tesseract::Tesseract::ApplyBoxTraining | ( | const std::string & | fontname, |
PAGE_RES * | page_res | ||
) |
Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.
Definition at line 751 of file applybox.cpp.
void tesseract::Tesseract::AssignDiacriticsToNewBlobs | ( | const std::vector< C_OUTLINE * > & | outlines, |
int | pass, | ||
WERD * | real_word, | ||
PAGE_RES_IT * | pr_it, | ||
std::vector< bool > * | word_wanted, | ||
std::vector< C_BLOB * > * | target_blobs | ||
) |
Definition at line 1036 of file control.cpp.
void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs | ( | const std::vector< C_OUTLINE * > & | outlines, |
int | pass, | ||
WERD * | real_word, | ||
PAGE_RES_IT * | pr_it, | ||
std::vector< bool > * | word_wanted, | ||
std::vector< bool > * | overlapped_any_blob, | ||
std::vector< C_BLOB * > * | target_blobs | ||
) |
Definition at line 981 of file control.cpp.
int tesseract::Tesseract::AutoPageSeg | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks, | ||
BLOBNBOX_LIST * | diacritic_blobs, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If diacritic_blobs is non-null, then diacritic/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.
If osd (orientation and script detection) is true, then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).
Definition at line 199 of file pagesegmain.cpp.
bool tesseract::Tesseract::BelievableSuperscript | ( | bool | debug, |
const WERD_RES & | word, | ||
float | certainty_threshold, | ||
int * | left_ok, | ||
int * | right_ok | ||
) | const |
Return whether this is believable superscript or subscript text.
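We insist that:
- there are no punctuation marks,
- there are no italics,
- no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
- each character is at least as certain as certainty_threshold.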
[in] | debug | If true, spew debug output |
[in] | word | The word whose best_choice we're evaluating |
[in] | certainty_threshold | If any of the characters have less certainty than this, reject. |
[out] | left_ok | How many left-side characters were ok? |
[out] | right_ok | How many right-side characters were ok? |
Definition at line 503 of file superscript.cpp.
|
inline |
Definition at line 238 of file tesseractclass.h.
void tesseract::Tesseract::bigram_correction_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 456 of file control.cpp.
void tesseract::Tesseract::blamer_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 683 of file control.cpp.
Definition at line 913 of file pgedit.cpp.
float tesseract::Tesseract::blob_noise_score | ( | TBLOB * | blob | ) |
Definition at line 772 of file fixspace.cpp.
tesseract::Tesseract::BOOL_VAR_H | ( | applybox_learn_chars_and_char_frags_mode | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | applybox_learn_ngrams_mode | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | bland_unrej | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_accept_ok | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_early_convert_bad_unlv_chs | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_early_merge_tess_fails | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_include_numerals | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_leave_accept_strings | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_leave_ok_strings | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | crunch_terrible_garbage | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | enable_noise_removal | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | hocr_char_boxes | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | hocr_font_info | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | interactive_display_mode | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | lstm_use_matrix | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | pageseg_apply_music_mask | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | paragraph_text_based | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | poly_allow_detailed_fx | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | preserve_interword_spaces | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_1Il_trust_permuter_type | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_1Il_use_dict_word | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_alphas_in_number_perm | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_trust_doc_dawg | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_good_perm | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_sensible_wd | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_tess_accepted | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | rej_use_tess_blanks | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | suspect_constrain_1Il | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_adaption_debug | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_ambigs_training | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_alto | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_boxfile | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_hocr | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_lstmbox | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_pdf | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_tsv | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_txt | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_create_wordstrbox | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_block_rejection | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_doc_rejection | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_fonts | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_debug_quality_metrics | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_display_outwords | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_do_invert | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dont_blkrej_good_wds | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dont_rowrej_good_wds | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dump_choices | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_dump_pageseg_images | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_enable_bigram_correction | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_enable_dict_correction | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_enable_doc_dict | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_fix_fuzzy_spaces | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_fix_hyphens | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_flip_0O | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_good_quality_unrej | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_init_config_only | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_make_boxes_from_boxes | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_minimal_rej_pass1 | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_minimal_rejection | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_override_permuter | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_prefer_joined_punct | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_preserve_blk_rej_perfect_wds | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_preserve_row_rej_perfect_wds | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_reject_bad_qual_wds | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_rejection_debug | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_resegment_from_boxes | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_resegment_from_line_boxes | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_row_rej_good_docs | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_test_adaption | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_timing_debug | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_train_from_boxes | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_train_line_recognizer | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_unrej_any_wd | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_use_primary_params_model | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_use_reject_spaces | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_word_for_word | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_block_separators | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_images | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_rep_codes | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_write_unlv | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_zero_kelvin_rejection | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | tessedit_zero_rejection | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | test_pt | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | textonly_pdf | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_equation_detect | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_tabfind_force_vertical_text | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_tabfind_show_vlines | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_tabfind_vertical_text | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | textord_use_cjk_fp_model | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | thresholding_debug | ) |
tesseract::Tesseract::BOOL_VAR_H | ( | unlv_tilde_crunching | ) |
void tesseract::Tesseract::break_noisiest_blob_word | ( | WERD_RES_LIST & | words | ) |
break_noisiest_blob_word(): Find the word containing the blob that looks most like noise, then break that word in two, deleting the noise blob.
Definition at line 621 of file fixspace.cpp.
SVMenuNode * tesseract::Tesseract::build_menu_new | ( | ) |
build_menu_new()
Construct the menu tree used by the command window.
Definition at line 275 of file pgedit.cpp.
bool tesseract::Tesseract::check_debug_pt | ( | WERD_RES * | word, |
int | location | ||
) |
Definition at line 1799 of file control.cpp.
void tesseract::Tesseract::classify_word_and_language | ( | int | pass_n, |
PAGE_RES_IT * | pr_it, | ||
WordData * | word_data | ||
) |
Definition at line 1302 of file control.cpp.
void tesseract::Tesseract::classify_word_pass1 | ( | const WordData & | word_data, |
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | out_words | ||
) |
classify_word_pass1
Baseline normalize the word and pass it to Tess.
Definition at line 1379 of file control.cpp.
void tesseract::Tesseract::classify_word_pass2 | ( | const WordData & | word_data, |
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | out_words | ||
) |
classify_word_pass2
Control what to do with the word in pass 2
Definition at line 1535 of file control.cpp.
float tesseract::Tesseract::ClassifyBlobAsWord | ( | int | pass_n, |
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
std::string & | best_str, | ||
float * | c2 | ||
) |
Definition at line 1252 of file control.cpp.
float tesseract::Tesseract::ClassifyBlobPlusOutlines | ( | const std::vector< bool > & | ok_outlines, |
const std::vector< C_OUTLINE * > & | outlines, | ||
int | pass_n, | ||
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
std::string & | best_str | ||
) |
Definition at line 1207 of file control.cpp.
void tesseract::Tesseract::Clear | ( | ) |
Definition at line 489 of file tesseractclass.cpp.
float tesseract::Tesseract::ComputeCompatibleXheight | ( | WERD_RES * | word_res, |
float * | baseline_shift | ||
) |
Definition at line 105 of file fixxht.cpp.
void tesseract::Tesseract::convert_bad_unlv_chs | ( | WERD_RES * | word_res | ) |
bool tesseract::Tesseract::ConvertStringToUnichars | ( | const char * | utf8, |
std::vector< UNICHAR_ID > * | class_ids | ||
) |
Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
Definition at line 520 of file applybox.cpp.
void tesseract::Tesseract::CorrectClassifyWords | ( | PAGE_RES * | page_res | ) |
Creates a fake best_choice entry in each WERD_RES with the correct text.
Definition at line 764 of file applybox.cpp.
int16_t tesseract::Tesseract::count_alphanums | ( | const WERD_CHOICE & | word | ) |
int16_t tesseract::Tesseract::count_alphanums | ( | WERD_RES * | word | ) |
Definition at line 542 of file reject.cpp.
int16_t tesseract::Tesseract::count_alphas | ( | const WERD_CHOICE & | word | ) |
int16_t tesseract::Tesseract::count_outline_errs | ( | char | c, |
int16_t | outline_count | ||
) |
Definition at line 107 of file docqual.cpp.
int tesseract::Tesseract::CountMisfitTops | ( | WERD_RES * | word_res | ) |
Definition at line 72 of file fixxht.cpp.
debug_word
Process the whole image, but load word_config_ for the selected word(s).
Definition at line 640 of file pgedit.cpp.
void tesseract::Tesseract::dictionary_correction_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 2057 of file control.cpp.
bool tesseract::Tesseract::digit_or_numeric_punct | ( | WERD_RES * | word, |
int | char_position | ||
) |
Definition at line 366 of file fixspace.cpp.
void tesseract::Tesseract::do_re_display | ( | bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) | word_painter | ) |
Redisplay page
Definition at line 325 of file pgedit.cpp.
void tesseract::Tesseract::doc_and_block_rejection | ( | PAGE_RES_IT & | page_res_it, |
bool | good_quality_doc | ||
) |
Definition at line 210 of file docqual.cpp.
void tesseract::Tesseract::dont_allow_1Il | ( | WERD_RES * | word | ) |
Definition at line 513 of file reject.cpp.
tesseract::Tesseract::double_VAR_H | ( | crunch_del_cert | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_high_word | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_low_word | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_max_ht | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_min_ht | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_min_width | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_del_rating | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_poor_garbage_cert | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_poor_garbage_rate | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_pot_poor_cert | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_pot_poor_rate | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_small_outlines_size | ) |
tesseract::Tesseract::double_VAR_H | ( | crunch_terrible_rating | ) |
tesseract::Tesseract::double_VAR_H | ( | fixsp_small_outlines_size | ) |
tesseract::Tesseract::double_VAR_H | ( | invert_threshold | ) |
tesseract::Tesseract::double_VAR_H | ( | lstm_rating_coefficient | ) |
tesseract::Tesseract::double_VAR_H | ( | min_orientation_margin | ) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_basechar | ) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_disjoint | ) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_factor | ) |
tesseract::Tesseract::double_VAR_H | ( | noise_cert_punc | ) |
tesseract::Tesseract::double_VAR_H | ( | quality_blob_pc | ) |
tesseract::Tesseract::double_VAR_H | ( | quality_char_pc | ) |
tesseract::Tesseract::double_VAR_H | ( | quality_outline_pc | ) |
tesseract::Tesseract::double_VAR_H | ( | quality_rej_pc | ) |
tesseract::Tesseract::double_VAR_H | ( | quality_rowrej_pc | ) |
tesseract::Tesseract::double_VAR_H | ( | rej_whole_of_mostly_reject_word_fract | ) |
tesseract::Tesseract::double_VAR_H | ( | subscript_max_y_top | ) |
tesseract::Tesseract::double_VAR_H | ( | superscript_bettered_certainty | ) |
tesseract::Tesseract::double_VAR_H | ( | superscript_min_y_bottom | ) |
tesseract::Tesseract::double_VAR_H | ( | superscript_scaledown_ratio | ) |
tesseract::Tesseract::double_VAR_H | ( | superscript_worse_certainty | ) |
tesseract::Tesseract::double_VAR_H | ( | suspect_accept_rating | ) |
tesseract::Tesseract::double_VAR_H | ( | suspect_rating_per_ch | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_good_doc_still_rowrej_wd | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_lower_flip_hyphen | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_reject_block_percent | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_reject_doc_percent | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_reject_row_percent | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_upper_flip_hyphen | ) |
tesseract::Tesseract::double_VAR_H | ( | tessedit_whole_wd_rej_row_percent | ) |
tesseract::Tesseract::double_VAR_H | ( | test_pt_x | ) |
tesseract::Tesseract::double_VAR_H | ( | test_pt_y | ) |
tesseract::Tesseract::double_VAR_H | ( | textord_tabfind_aligned_gap_fraction | ) |
tesseract::Tesseract::double_VAR_H | ( | textord_tabfind_vertical_text_ratio | ) |
tesseract::Tesseract::double_VAR_H | ( | thresholding_kfactor | ) |
tesseract::Tesseract::double_VAR_H | ( | thresholding_score_fraction | ) |
tesseract::Tesseract::double_VAR_H | ( | thresholding_smooth_kernel_size | ) |
tesseract::Tesseract::double_VAR_H | ( | thresholding_tile_size | ) |
tesseract::Tesseract::double_VAR_H | ( | thresholding_window_size | ) |
void tesseract::Tesseract::dump_words | ( | WERD_RES_LIST & | perm, |
int16_t | score, | ||
int16_t | mode, | ||
bool | improved | ||
) |
Definition at line 467 of file fixspace.cpp.
void tesseract::Tesseract::end_tesseract | ( | ) |
Definition at line 457 of file tessedit.cpp.
int16_t tesseract::Tesseract::eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Definition at line 260 of file fixspace.cpp.
int16_t tesseract::Tesseract::failure_count | ( | WERD_RES * | word | ) |
Definition at line 895 of file docqual.cpp.
bool tesseract::Tesseract::FindSegmentation | ( | const std::vector< UNICHAR_ID > & | target_text, |
WERD_RES * | word_res | ||
) |
Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.
Definition at line 545 of file applybox.cpp.
int16_t tesseract::Tesseract::first_alphanum_index | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 457 of file reject.cpp.
int16_t tesseract::Tesseract::first_alphanum_offset | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 470 of file reject.cpp.
void tesseract::Tesseract::fix_fuzzy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 171 of file fixspace.cpp.
void tesseract::Tesseract::fix_fuzzy_spaces | ( | ETEXT_DESC * | monitor, |
int32_t | word_count, | ||
PAGE_RES * | page_res | ||
) |
Definition at line 77 of file fixspace.cpp.
void tesseract::Tesseract::fix_noisy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 577 of file fixspace.cpp.
void tesseract::Tesseract::fix_rep_char | ( | PAGE_RES_IT * | page_res_it | ) |
fix_rep_char(): The word is a repeated character (a leader). Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of spaces in between blobs, and correct the classifications where some of the characters disagree with the majority.
Definition at line 1665 of file control.cpp.
Definition at line 545 of file fixspace.cpp.
bool tesseract::Tesseract::fixspace_thinks_word_done | ( | WERD_RES * | word | ) |
Definition at line 514 of file fixspace.cpp.
void tesseract::Tesseract::flip_0O | ( | WERD_RES * | word | ) |
Definition at line 660 of file reject.cpp.
void tesseract::Tesseract::flip_hyphens | ( | WERD_RES * | word | ) |
Definition at line 602 of file reject.cpp.
void tesseract::Tesseract::font_recognition_pass | ( | PAGE_RES * | page_res | ) |
font_recognition_pass
Smooth the fonts for the document.
Definition at line 2003 of file control.cpp.
int16_t tesseract::Tesseract::fp_eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
fp_eval_word_spacing() Evaluation function for fixed pitch word lists.
Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars.
Definition at line 837 of file fixspace.cpp.
GARBAGE_LEVEL tesseract::Tesseract::garbage_word | ( | WERD_RES * | word, |
bool | ok_dict_word | ||
) |
Definition at line 616 of file docqual.cpp.
UNICHAR_ID tesseract::Tesseract::get_rep_char | ( | WERD_RES * | word | ) |
Definition at line 247 of file output.cpp.
|
inline |
Definition at line 286 of file tesseractclass.h.
|
overridevirtual |
Reimplemented from tesseract::Classify.
Definition at line 480 of file tesseractclass.cpp.
ImageData * tesseract::Tesseract::GetLineData | ( | const TBOX & | line_box, |
const std::vector< TBOX > & | boxes, | ||
const std::vector< std::string > & | texts, | ||
int | start_box, | ||
int | end_box, | ||
const BLOCK & | block | ||
) |
Definition at line 133 of file linerec.cpp.
ImageData * tesseract::Tesseract::GetRectImage | ( | const TBOX & | box, |
const BLOCK & | block, | ||
int | padding, | ||
TBOX * | revised_box | ||
) | const |
Definition at line 165 of file linerec.cpp.
void tesseract::Tesseract::GetSubAndSuperscriptCandidates | ( | const WERD_RES * | word, |
int * | num_rebuilt_leading, | ||
ScriptPos * | leading_pos, | ||
float * | leading_certainty, | ||
int * | num_rebuilt_trailing, | ||
ScriptPos * | trailing_pos, | ||
float * | trailing_certainty, | ||
float * | avg_certainty, | ||
float * | unlikely_threshold | ||
) |
Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.
[in] | word | The word to examine. |
[out] | num_rebuilt_leading | the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified. |
[out] | leading_pos | "super" or "sub" (for debugging) |
[out] | leading_certainty | the worst certainty in the leading blobs. |
[out] | num_rebuilt_trailing | the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified. |
[out] | trailing_pos | "super" or "sub" (for debugging) |
[out] | trailing_certainty | the worst certainty in the trailing blobs. |
[out] | avg_certainty | the average certainty of "normal" blobs in the word. |
[out] | unlikely_threshold | the threshold (on certainty) we used to select "bad enough" outlier characters. |
Definition at line 250 of file superscript.cpp.
|
inline |
Definition at line 260 of file tesseractclass.h.
|
inline |
Definition at line 257 of file tesseractclass.h.
FILE * tesseract::Tesseract::init_recog_training | ( | const char * | filename | ) |
Definition at line 36 of file recogtraining.cpp.
int tesseract::Tesseract::init_tesseract | ( | const std::string & | arg0, |
const std::string & | textbase, | ||
const std::string & | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const std::vector< std::string > * | vars_vec, | ||
const std::vector< std::string > * | vars_values, | ||
bool | set_only_non_debug_params, | ||
TessdataManager * | mgr | ||
) |
Definition at line 288 of file tessedit.cpp.
|
inline |
Definition at line 500 of file tesseractclass.h.
int tesseract::Tesseract::init_tesseract_internal | ( | const std::string & | arg0, |
const std::string & | textbase, | ||
const std::string & | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const std::vector< std::string > * | vars_vec, | ||
const std::vector< std::string > * | vars_values, | ||
bool | set_only_non_debug_params, | ||
TessdataManager * | mgr | ||
) |
Definition at line 395 of file tessedit.cpp.
bool tesseract::Tesseract::init_tesseract_lang_data | ( | const std::string & | arg0, |
const std::string & | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const std::vector< std::string > * | vars_vec, | ||
const std::vector< std::string > * | vars_values, | ||
bool | set_only_non_debug_params, | ||
TessdataManager * | mgr | ||
) |
Definition at line 76 of file tessedit.cpp.
tesseract::Tesseract::INT_VAR_H | ( | applybox_debug | ) |
tesseract::Tesseract::INT_VAR_H | ( | applybox_page | ) |
tesseract::Tesseract::INT_VAR_H | ( | bidi_debug | ) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_debug | ) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_leave_lc_strings | ) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_leave_uc_strings | ) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_long_repetitions | ) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_pot_indicators | ) |
tesseract::Tesseract::INT_VAR_H | ( | crunch_rating_max | ) |
tesseract::Tesseract::INT_VAR_H | ( | debug_fix_space_level | ) |
tesseract::Tesseract::INT_VAR_H | ( | debug_noise_removal | ) |
tesseract::Tesseract::INT_VAR_H | ( | debug_x_ht_level | ) |
tesseract::Tesseract::INT_VAR_H | ( | fixsp_done_mode | ) |
tesseract::Tesseract::INT_VAR_H | ( | fixsp_non_noise_limit | ) |
tesseract::Tesseract::INT_VAR_H | ( | jpg_quality | ) |
tesseract::Tesseract::INT_VAR_H | ( | lstm_choice_iterations | ) |
tesseract::Tesseract::INT_VAR_H | ( | lstm_choice_mode | ) |
tesseract::Tesseract::INT_VAR_H | ( | min_characters_to_try | ) |
tesseract::Tesseract::INT_VAR_H | ( | min_sane_x_ht_pixels | ) |
tesseract::Tesseract::INT_VAR_H | ( | multilang_debug_level | ) |
tesseract::Tesseract::INT_VAR_H | ( | noise_maxperblob | ) |
tesseract::Tesseract::INT_VAR_H | ( | noise_maxperword | ) |
tesseract::Tesseract::INT_VAR_H | ( | ocr_devanagari_split_strategy | ) |
tesseract::Tesseract::INT_VAR_H | ( | pageseg_devanagari_split_strategy | ) |
tesseract::Tesseract::INT_VAR_H | ( | paragraph_debug_level | ) |
tesseract::Tesseract::INT_VAR_H | ( | quality_min_initial_alphas_reqd | ) |
tesseract::Tesseract::INT_VAR_H | ( | superscript_debug | ) |
tesseract::Tesseract::INT_VAR_H | ( | suspect_level | ) |
tesseract::Tesseract::INT_VAR_H | ( | suspect_short_words | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_bigram_debug | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_font_id | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_image_border | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_ocr_engine_mode | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_page_number | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_pageseg_mode | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_parallelize | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_preserve_min_wd_len | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_reject_mode | ) |
tesseract::Tesseract::INT_VAR_H | ( | tessedit_tess_adaption_mode | ) |
tesseract::Tesseract::INT_VAR_H | ( | thresholding_method | ) |
tesseract::Tesseract::INT_VAR_H | ( | user_defined_dpi | ) |
tesseract::Tesseract::INT_VAR_H | ( | x_ht_acceptance_tolerance | ) |
tesseract::Tesseract::INT_VAR_H | ( | x_ht_min_change | ) |
void tesseract::Tesseract::join_words | ( | WERD_RES * | word, |
WERD_RES * | word2, | ||
BlamerBundle * | orig_bb | ||
) | const |
Definition at line 216 of file tfacepp.cpp.
void tesseract::Tesseract::LSTMRecognizeWord | ( | const BLOCK & | block, |
ROW * | row, | ||
WERD_RES * | word, | ||
PointerVector< WERD_RES > * | words | ||
) |
Definition at line 230 of file linerec.cpp.
Definition at line 96 of file reject.cpp.
Definition at line 218 of file fixspace.cpp.
void tesseract::Tesseract::match_word_pass_n | ( | int | pass_n, |
WERD_RES * | word, | ||
ROW * | row, | ||
BLOCK * | block | ||
) |
match_word_pass_n
Baseline normalize the word and pass it to Tess.
Definition at line 1589 of file control.cpp.
void tesseract::Tesseract::MaximallyChopWord | ( | const std::vector< TBOX > & | boxes, |
BLOCK * | block, | ||
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.
Definition at line 231 of file applybox.cpp.
|
inline |
Definition at line 204 of file tesseractclass.h.
|
inline |
Definition at line 276 of file tesseractclass.h.
bool tesseract::Tesseract::noise_outlines | ( | TWERD * | word | ) |
Definition at line 907 of file docqual.cpp.
bool tesseract::Tesseract::non_0_digit | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
Definition at line 772 of file reject.cpp.
bool tesseract::Tesseract::non_O_upper | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
Definition at line 768 of file reject.cpp.
|
inline |
Definition at line 283 of file tesseractclass.h.
bool tesseract::Tesseract::one_ell_conflict | ( | WERD_RES * | word_res, |
bool | update_map | ||
) |
Definition at line 287 of file reject.cpp.
void tesseract::Tesseract::output_pass | ( | PAGE_RES_IT & | page_res_it, |
const TBOX * | target_word_box | ||
) |
Definition at line 39 of file output.cpp.
void tesseract::Tesseract::ParseLanguageString | ( | const std::string & | lang_str, |
std::vector< std::string > * | to_load, | ||
std::vector< std::string > * | not_to_load | ||
) |
Definition at line 244 of file tessedit.cpp.
void tesseract::Tesseract::pgeditor_main | ( | int | width, |
int | height, | ||
PAGE_RES * | page_res | ||
) |
Top level editor operation: Set up a new window and a corresponding event handler.
Definition at line 355 of file pgedit.cpp.
|
inline |
Definition at line 208 of file tesseractclass.h.
|
inline |
Definition at line 211 of file tesseractclass.h.
|
inline |
Definition at line 218 of file tesseractclass.h.
bool tesseract::Tesseract::potential_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level, | ||
bool | ok_dict_word | ||
) |
Definition at line 488 of file docqual.cpp.
void tesseract::Tesseract::PreenXHeights | ( | BLOCK_LIST * | block_list | ) |
Any row xheight that is significantly different from the median is set to the median.
Definition at line 174 of file applybox.cpp.
void tesseract::Tesseract::PrepareForPageseg | ( | ) |
Definition at line 557 of file tesseractclass.cpp.
void tesseract::Tesseract::PrepareForTessOCR | ( | BLOCK_LIST * | block_list, |
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Definition at line 587 of file tesseractclass.cpp.
void tesseract::Tesseract::PrerecAllWordsPar | ( | const std::vector< WordData > & | words | ) |
Definition at line 38 of file par_control.cpp.
bool tesseract::Tesseract::process_cmd_win_event | ( | int32_t | cmd_event, |
char * | new_value | ||
) |
Process a command returned from the command window (Just call the appropriate command handler)
Definition at line 391 of file pgedit.cpp.
void tesseract::Tesseract::process_image_event | ( | const SVEvent & | event | ) |
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
Definition at line 567 of file pgedit.cpp.
void tesseract::Tesseract::process_selected_words | ( | PAGE_RES * | page_res, |
TBOX & | selection_box, | ||
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) | word_processor | ||
) |
Definition at line 30 of file pagewalk.cpp.
bool tesseract::Tesseract::ProcessTargetWord | ( | const TBOX & | word_box, |
const TBOX & | target_word_box, | ||
const char * | word_config, | ||
int | pass | ||
) |
Definition at line 118 of file control.cpp.
void tesseract::Tesseract::quality_based_rejection | ( | PAGE_RES_IT & | page_res_it, |
bool | good_quality_doc | ||
) |
Definition at line 120 of file docqual.cpp.
void tesseract::Tesseract::read_config_file | ( | const char * | filename, |
SetParamConstraint | constraint | ||
) |
Definition at line 46 of file tessedit.cpp.
bool tesseract::Tesseract::ReassignDiacritics | ( | int | pass, |
PAGE_RES_IT * | pr_it, | ||
bool * | make_next_word_fuzzy | ||
) |
Definition at line 914 of file control.cpp.
bool tesseract::Tesseract::recog_all_words | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
int | dopasses | ||
) |
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s), otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification.) Returns false if we cancelled prematurely.
page_res | page structure |
monitor | progress monitor |
word_config | word_config file |
target_word_box | specifies just to extract a rectangle |
dopasses | 0 - all, 1 just pass 1, 2 passes 2 and higher |
Definition at line 287 of file control.cpp.
bool tesseract::Tesseract::recog_interactive | ( | PAGE_RES_IT * | pr_it | ) |
Recognize a single word in interactive mode.
pr_it | the page results iterator |
Definition at line 76 of file control.cpp.
Definition at line 62 of file control.cpp.
void tesseract::Tesseract::recog_training_segmented | ( | const char * | filename, |
PAGE_RES * | page_res, | ||
volatile ETEXT_DESC * | monitor, | ||
FILE * | output_file | ||
) |
Definition at line 86 of file recogtraining.cpp.
void tesseract::Tesseract::recog_word | ( | WERD_RES * | word | ) |
Definition at line 37 of file tfacepp.cpp.
void tesseract::Tesseract::recog_word_recursive | ( | WERD_RES * | word | ) |
Definition at line 94 of file tfacepp.cpp.
bool tesseract::Tesseract::RecogAllWordsPassN | ( | int | pass_n, |
ETEXT_DESC * | monitor, | ||
PAGE_RES_IT * | pr_it, | ||
std::vector< WordData > * | words | ||
) |
Definition at line 198 of file control.cpp.
void tesseract::Tesseract::recognize_page | ( | std::string & | image_name | ) |
void tesseract::Tesseract::reject_edge_blobs | ( | WERD_RES * | word | ) |
Definition at line 260 of file reject.cpp.
void tesseract::Tesseract::reject_I_1_L | ( | WERD_RES * | word | ) |
Definition at line 195 of file reject.cpp.
void tesseract::Tesseract::reject_mostly_rejects | ( | WERD_RES * | word | ) |
Definition at line 556 of file reject.cpp.
void tesseract::Tesseract::rejection_passes | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config | ||
) |
Definition at line 599 of file control.cpp.
Definition at line 565 of file reject.cpp.
void tesseract::Tesseract::ReportFailedBox | ( | int | boxfile_lineno, |
TBOX | box, | ||
const char * | box_ch, | ||
const char * | err_msg | ||
) |
Logs a bad box by line in the box file and box coords.
Definition at line 743 of file applybox.cpp.
void tesseract::Tesseract::ReportXhtFixResult | ( | bool | accept_new_word, |
float | new_x_ht, | ||
WERD_RES * | word, | ||
WERD_RES * | new_word | ||
) |
Definition at line 1436 of file control.cpp.
void tesseract::Tesseract::ReSegmentByClassification | ( | PAGE_RES * | page_res | ) |
Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.
Definition at line 495 of file applybox.cpp.
bool tesseract::Tesseract::ResegmentCharBox | ( | PAGE_RES * | page_res, |
const TBOX * | prev_box, | ||
const TBOX & | box, | ||
const TBOX * | next_box, | ||
const char * | correct_text | ||
) |
Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.
Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.
This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.
Definition at line 310 of file applybox.cpp.
bool tesseract::Tesseract::ResegmentWordBox | ( | BLOCK_LIST * | block_list, |
const TBOX & | box, | ||
const TBOX * | next_box, | ||
const char * | correct_text | ||
) |
Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.
Definition at line 414 of file applybox.cpp.
void tesseract::Tesseract::ResetAdaptiveClassifier | ( | ) |
Definition at line 513 of file tesseractclass.cpp.
void tesseract::Tesseract::ResetDocumentDictionary | ( | ) |
Definition at line 523 of file tesseractclass.cpp.
|
inline |
Definition at line 200 of file tesseractclass.h.
int tesseract::Tesseract::RetryWithLanguage | ( | const WordData & | word_data, |
WordRecognizer | recognizer, | ||
bool | debug, | ||
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | best_words | ||
) |
Definition at line 873 of file control.cpp.
|
inline |
Definition at line 280 of file tesseractclass.h.
int16_t tesseract::Tesseract::safe_dict_word | ( | const WERD_RES * | werd_res | ) |
Definition at line 593 of file reject.cpp.
|
inline |
Definition at line 263 of file tesseractclass.h.
|
inline |
Definition at line 266 of file tesseractclass.h.
void tesseract::Tesseract::script_pos_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 707 of file control.cpp.
void tesseract::Tesseract::SearchForText | ( | const std::vector< BLOB_CHOICE_LIST * > * | choices, |
int | choices_pos, | ||
unsigned | choices_length, | ||
const std::vector< UNICHAR_ID > & | target_text, | ||
unsigned | text_index, | ||
float | rating, | ||
std::vector< int > * | segmentation, | ||
float * | best_rating, | ||
std::vector< int > * | best_segmentation | ||
) |
Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).
choices | is an array of vectors of length choices_length, with each element representing a starting position in the word, and the vector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc. |
choices_pos | |
choices_length | |
target_text | |
text_index | |
rating | |
segmentation | |
best_rating | |
best_segmentation |
Definition at line 615 of file applybox.cpp.
void tesseract::Tesseract::SearchWords | ( | PointerVector< WERD_RES > * | words | ) |
Definition at line 264 of file linerec.cpp.
int tesseract::Tesseract::SegmentPage | ( | const char * | input_file, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.
Definition at line 101 of file pagesegmain.cpp.
bool tesseract::Tesseract::SelectGoodDiacriticOutlines | ( | int | pass, |
float | certainty_threshold, | ||
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
const std::vector< C_OUTLINE * > & | outlines, | ||
int | num_outlines, | ||
std::vector< bool > * | ok_outlines | ||
) |
Definition at line 1120 of file control.cpp.
void tesseract::Tesseract::set_done | ( | WERD_RES * | word, |
int16_t | pass | ||
) |
Definition at line 62 of file reject.cpp.
|
inline |
Definition at line 214 of file tesseractclass.h.
|
inline |
Definition at line 222 of file tesseractclass.h.
|
inline |
Definition at line 247 of file tesseractclass.h.
|
inline |
Definition at line 254 of file tesseractclass.h.
void tesseract::Tesseract::set_unlv_suspects | ( | WERD_RES * | word | ) |
Definition at line 270 of file output.cpp.
void tesseract::Tesseract::set_word_fonts | ( | WERD_RES * | word | ) |
set_word_fonts
Get the fonts for the word.
Definition at line 1915 of file control.cpp.
void tesseract::Tesseract::SetBlackAndWhitelist | ( | ) |
Definition at line 530 of file tesseractclass.cpp.
void tesseract::Tesseract::SetEquationDetect | ( | EquationDetect * | detector | ) |
Definition at line 507 of file tesseractclass.cpp.
|
inline |
Definition at line 269 of file tesseractclass.h.
void tesseract::Tesseract::SetupAllWordsPassN | ( | int | pass_n, |
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
PAGE_RES * | page_res, | ||
std::vector< WordData > * | words | ||
) |
If tesseract is to be run, sets the words up ready for it.
Definition at line 146 of file control.cpp.
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes | ( | const std::vector< TBOX > & | boxes, |
BLOCK_LIST * | block_list | ||
) |
Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.
Definition at line 197 of file applybox.cpp.
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr, | ||
TO_BLOCK_LIST * | to_blocks, | ||
Image * | photo_mask_pix, | ||
Image * | music_mask_pix | ||
) |
Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.
Definition at line 270 of file pagesegmain.cpp.
void tesseract::Tesseract::SetupUniversalFontIds | ( | ) |
Definition at line 436 of file tessedit.cpp.
void tesseract::Tesseract::SetupWordPassN | ( | int | pass_n, |
WordData * | word | ||
) |
Definition at line 166 of file control.cpp.
void tesseract::Tesseract::SetupWordScripts | ( | BLOCK_LIST * | blocks | ) |
|
inline |
Definition at line 251 of file tesseractclass.h.
void tesseract::Tesseract::split_and_recog_word | ( | WERD_RES * | word | ) |
Definition at line 126 of file tfacepp.cpp.
void tesseract::Tesseract::split_word | ( | WERD_RES * | word, |
unsigned | split_pt, | ||
WERD_RES ** | right_piece, | ||
BlamerBundle ** | orig_blamer_bundle | ||
) | const |
Definition at line 163 of file tfacepp.cpp.
tesseract::Tesseract::STRING_VAR_H | ( | applybox_exposure_pattern | ) |
tesseract::Tesseract::STRING_VAR_H | ( | chs_leading_punct | ) |
tesseract::Tesseract::STRING_VAR_H | ( | chs_trailing_punct1 | ) |
tesseract::Tesseract::STRING_VAR_H | ( | chs_trailing_punct2 | ) |
tesseract::Tesseract::STRING_VAR_H | ( | conflict_set_I_l_1 | ) |
tesseract::Tesseract::STRING_VAR_H | ( | file_type | ) |
tesseract::Tesseract::STRING_VAR_H | ( | numeric_punctuation | ) |
tesseract::Tesseract::STRING_VAR_H | ( | ok_repeated_ch_non_alphanum_wds | ) |
tesseract::Tesseract::STRING_VAR_H | ( | outlines_2 | ) |
tesseract::Tesseract::STRING_VAR_H | ( | outlines_odd | ) |
tesseract::Tesseract::STRING_VAR_H | ( | page_separator | ) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_char_blacklist | ) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_char_unblacklist | ) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_char_whitelist | ) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_load_sublangs | ) |
tesseract::Tesseract::STRING_VAR_H | ( | tessedit_write_params_to_file | ) |
tesseract::Tesseract::STRING_VAR_H | ( | unrecognised_char | ) |
bool tesseract::Tesseract::SubAndSuperscriptFix | ( | WERD_RES * | word | ) |
Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.
This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.
Definition at line 108 of file superscript.cpp.
bool tesseract::Tesseract::terrible_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level | ||
) |
Definition at line 450 of file docqual.cpp.
bool tesseract::Tesseract::tess_acceptable_word | ( | WERD_RES * | word | ) |
Definition at line 64 of file tessbox.cpp.
void tesseract::Tesseract::tess_add_doc_word | ( | WERD_CHOICE * | word_choice | ) |
Definition at line 73 of file tessbox.cpp.
void tesseract::Tesseract::tess_segment_pass_n | ( | int | pass_n, |
WERD_RES * | word | ||
) |
Definition at line 32 of file tessbox.cpp.
bool tesseract::Tesseract::TestNewNormalization | ( | int | original_misfits, |
float | baseline_shift, | ||
float | new_x_ht, | ||
WERD_RES * | word, | ||
BLOCK * | block, | ||
ROW * | row | ||
) |
Definition at line 1488 of file control.cpp.
|
inline |
Definition at line 273 of file tesseractclass.h.
void tesseract::Tesseract::TidyUp | ( | PAGE_RES * | page_res | ) |
Definition at line 685 of file applybox.cpp.
void tesseract::Tesseract::tilde_crunch | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 373 of file docqual.cpp.
void tesseract::Tesseract::tilde_delete | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 530 of file docqual.cpp.
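bool tesseract::Tesseract::TrainedXheightFix | ( | WERD_RES * | word, |
BLOCK * | block, | ||
ROW * | row | ||
) |
Definition at line 1455 of file control.cpp.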
void tesseract::Tesseract::TrainFromBoxes | ( | const std::vector< TBOX > & | boxes, |
const std::vector< std::string > & | texts, | ||
BLOCK_LIST * | block_list, | ||
DocumentData * | training_data | ||
) |
Definition at line 76 of file linerec.cpp.
bool tesseract::Tesseract::TrainLineRecognizer | ( | const char * | input_imagename, |
const std::string & | output_basename, | ||
BLOCK_LIST * | block_list | ||
) |
Definition at line 41 of file linerec.cpp.
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits | ( | int | num_chopped_leading, |
float | leading_certainty, | ||
ScriptPos | leading_pos, | ||
int | num_chopped_trailing, | ||
float | trailing_certainty, | ||
ScriptPos | trailing_pos, | ||
WERD_RES * | word, | ||
bool * | is_good, | ||
int * | retry_rebuild_leading, | ||
int * | retry_rebuild_trailing | ||
) |
Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.
[in] | num_chopped_leading | how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | leading_certainty | the (minimum) certainty of the characters in the original leading section. |
[in] | leading_pos | "super" or "sub" (for debugging) |
[in] | num_chopped_trailing | how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | trailing_certainty | the (minimum) certainty of the characters in the original trailing section. |
[in] | trailing_pos | "super" or "sub" (for debugging) |
[in] | word | the word to try to chop up. |
[out] | is_good | do we believe our result? |
[out] | retry_rebuild_leading,retry_rebuild_trailing | If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars. |
Definition at line 369 of file superscript.cpp.
void tesseract::Tesseract::unrej_good_chs | ( | WERD_RES * | word | ) |
Definition at line 98 of file docqual.cpp.
void tesseract::Tesseract::unrej_good_quality_words | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 142 of file docqual.cpp.
bool tesseract::Tesseract::word_adaptable | ( | WERD_RES * | word, |
uint16_t | mode | ||
) |
Definition at line 34 of file adaptions.cpp.
bool tesseract::Tesseract::word_blank_and_set_display | ( | PAGE_RES_IT * | pr_it | ) |
word_blank_and_set_display() Word processor
Blank display of word then redisplay word according to current display mode settings
Definition at line 667 of file pgedit.cpp.
bool tesseract::Tesseract::word_bln_display | ( | PAGE_RES_IT * | pr_it | ) |
Normalize word and display in word window
Definition at line 677 of file pgedit.cpp.
int16_t tesseract::Tesseract::word_blob_quality | ( | WERD_RES * | word | ) |
Definition at line 51 of file docqual.cpp.
void tesseract::Tesseract::word_char_quality | ( | WERD_RES * | word, |
int16_t * | match_count, | ||
int16_t * | accepted_match_count | ||
) |
Definition at line 81 of file docqual.cpp.
bool tesseract::Tesseract::word_contains_non_1_digit | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 496 of file reject.cpp.
CRUNCH_MODE tesseract::Tesseract::word_deletable | ( | WERD_RES * | word, |
int16_t & | delete_mode | ||
) |
Definition at line 825 of file docqual.cpp.
bool tesseract::Tesseract::word_display | ( | PAGE_RES_IT * | pr_it | ) |
word_display() Word processor
Display a word according to its display modes
Definition at line 702 of file pgedit.cpp.
bool tesseract::Tesseract::word_dumper | ( | PAGE_RES_IT * | pr_it | ) |
Dump members to the debug window
Definition at line 877 of file pgedit.cpp.
int16_t tesseract::Tesseract::word_outline_errs | ( | WERD_RES * | word | ) |
Definition at line 62 of file docqual.cpp.
bool tesseract::Tesseract::word_set_display | ( | PAGE_RES_IT * | pr_it | ) |
word_set_display() Word processor
Display word according to current display mode settings
Definition at line 900 of file pgedit.cpp.
int16_t tesseract::Tesseract::worst_noise_blob | ( | WERD_RES * | word_res, |
float * | worst_noise_score | ||
) |
Definition at line 685 of file fixspace.cpp.
void tesseract::Tesseract::write_results | ( | PAGE_RES_IT & | page_res_it, |
char | newline_type, | ||
bool | force_eol | ||
) |
Definition at line 99 of file output.cpp.