All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract ()
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
BOOL8 recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
BOOL8 check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
BOOL8 acceptable_number_string (const char *s, const char *lengths)
 
inT16 count_alphanums (const WERD_CHOICE &word)
 
inT16 count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
BOOL8 process_cmd_win_event (inT32 cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
BOOL8 word_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_bln_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
BOOL8 word_set_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, inT16 pass)
 
BOOL8 one_ell_conflict (WERD_RES *word_res, BOOL8 update_map)
 
inT16 first_alphanum_index (const char *word, const char *word_lengths)
 
inT16 first_alphanum_offset (const char *word, const char *word_lengths)
 
inT16 alpha_count (const char *word, const char *word_lengths)
 
BOOL8 word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
inT16 count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
BOOL8 non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, inT16 pass)
 
inT16 safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
BOOL8 word_adaptable (WERD_RES *word, uinT16 mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
inT16 fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
 
BOOL8 fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)
 
BOOL8 potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
inT16 word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
inT16 count_outline_errs (char c, inT16 outline_count)
 
inT16 word_outline_errs (WERD_RES *word)
 
BOOL8 terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, inT16 &delete_mode)
 
inT16 failure_count (WERD_RES *word)
 
BOOL8 noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
CubeRecoContext * GetCubeRecoContext ()
 
init_cube_objects

Instantiates Tesseract object's CubeRecoContext and TesseractCubeCombiner. Returns false if cube context could not be created or if load_combiner is true, but the combiner could not be loaded.

bool init_cube_objects (bool load_combiner, TessdataManager *tessdata_manager)
 
run_cube_combiner

Iterates through tesseract's results and calls cube on each word, combining the results with the existing tesseract result.

void run_cube_combiner (PAGE_RES *page_res)
 
cube_word_pass1

Recognizes a single word using (only) cube. Compatible with Tesseract's classify_word_pass1/classify_word_pass2.

void cube_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word)
 
cube_recognize_word

Cube recognizer to recognize a single word as with classify_word_pass1 but also returns the cube object in case the combiner is needed.

CubeObject * cube_recognize_word (BLOCK *block, WERD_RES *word)
 
cube_combine_word

Combines the cube and tesseract results for a single word, leaving the result in tess_word.

void cube_combine_word (CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
 
cube_recognize

Call cube on the current word, and write the result to word. Sets up a fake result and returns false if something goes wrong.

bool cube_recognize (CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
 
fill_werd_res

Fill Tesseract's word result fields with cube's.

void fill_werd_res (const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res)
 
extract_cube_state

Extract CharSamp objects and character bounding boxes from the CubeObject's state. The caller should free both structures.

bool extract_cube_state (CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
 
create_cube_box_word

Fill the given BoxWord with boxes from character bounding boxes. The char_boxes have local coordinates w.r.t. the word bounding box, i.e., the left-most character bbox of each word has (0,0) left-top coord, but the BoxWord must be defined in page coordinates.

bool create_cube_box_word (Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

BOOL8 digit_or_numeric_punct (WERD_RES *word, int char_position)
 
inT16 eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
inT16 worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
virtual ~Wordrec ()
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void WordSearch (WERD_RES *word_res)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, bool init_classifier, bool init_permute)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (inT32 elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (bool load_pre_trained_templates)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (FILE *File)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_dump_pageseg_images = false
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_redo_xheight = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
bool debug_acceptable_wds = false
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool tessedit_matcher_log = false
 
int tessedit_test_adaption_mode = 3
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
int cube_debug_level = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool docqual_excuse_outline_errs = false
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = true
 
bool hocr_font_info = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
bool crunch_pot_garbage = true
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = true
 
bool tessedit_create_hocr = false
 
bool tessedit_create_pdf = false
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_space_level = 100
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
bool tessedit_consistent_reps = true
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
int tessdata_manager_debug_level = 0
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = FALSE
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
bool include_page_breaks = false
 
char * page_separator = "\f"
 
bool textord_tabfind_vertical_horizontal_mix = true
 
int tessedit_ok_mode = 5
 
bool load_fixed_length_dawgs = true
 
int segment_debug = 0
 
bool permute_debug = 0
 
double bestrate_pruning_factor = 2.0
 
bool permute_script_word = 0
 
bool segment_segcost_rating = 0
 
double segment_reward_script = 0.95
 
bool permute_fixed_length_dawg = 0
 
bool permute_chartype_word = 0
 
double segment_reward_chartype = 0.97
 
double segment_reward_ngram_best_choice = 0.99
 
bool ngram_permuter_activated = false
 
bool permute_only_top = false
 
int language_model_fixed_length_choices_depth = 3
 
bool use_new_state_cost = FALSE
 
double heuristic_segcost_rating_base = 1.25
 
double heuristic_weight_rating = 1
 
double heuristic_weight_width = 1000.0
 
double heuristic_weight_seamcut = 0
 
double heuristic_max_char_wh_ratio = 2.0
 
bool enable_new_segsearch = false
 
double segsearch_max_fixed_pitch_char_wh_ratio = 2.0
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
TessdataManager tessdata_manager
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 170 of file tesseractclass.h.

Constructor & Destructor Documentation

tesseract::Tesseract::Tesseract ( )

Definition at line 57 of file tesseractclass.cpp.

59  "Take segmentation and labeling from box file",
60  this->params()),
62  "Conversion of word/line box file to char box file",
63  this->params()),
65  "Generate training data from boxed chars", this->params()),
67  "Generate more boxes from boxed chars", this->params()),
69  "Dump intermediate images made during page segmentation",
70  this->params()),
71  // The default for pageseg_mode is the old behaviour, so as not to
72  // upset anything that relies on that.
73  INT_MEMBER(
75  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
76  " 5=line, 6=word, 7=char"
77  " (Values from PageSegMode enum in publictypes.h)",
78  this->params()),
80  "Which OCR engine(s) to run (Tesseract, Cube, both)."
81  " Defaults to loading and running only Tesseract"
82  " (no Cube,no combiner)."
83  " Values from OcrEngineMode enum in tesseractclass.h)",
84  this->params()),
86  "Blacklist of chars not to recognize", this->params()),
88  "Whitelist of chars to recognize", this->params()),
90  "List of chars to override tessedit_char_blacklist",
91  this->params()),
93  "Perform training for ambiguities", this->params()),
96  "Whether to use the top-line splitting process for Devanagari "
97  "documents while performing page-segmentation.",
98  this->params()),
101  "Whether to use the top-line splitting process for Devanagari "
102  "documents while performing ocr.",
103  this->params()),
105  "Write all parameters to the given file.", this->params()),
107  "Generate and print debug"
108  " information for adaption",
109  this->params()),
110  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
111  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
112  INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
113  this->params()),
115  "Exposure value follows"
116  " this pattern in the image filename. The name of the image"
117  " files are expected to be in the form"
118  " [lang].[fontname].exp[num].tif",
119  this->params()),
121  "Learn both character fragments (as is done in the"
122  " special low exposure mode) as well as unfragmented"
123  " characters.",
124  this->params()),
126  "Each bounding box"
127  " is assumed to contain ngrams. Only learn the ngrams"
128  " whose outlines overlap horizontally.",
129  this->params()),
130  BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
131  this->params()),
132  BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
133  this->params()),
134  BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
135  this->params()),
137  "Try to improve fuzzy spaces", this->params()),
139  "Dont bother with word plausibility", this->params()),
140  BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
141  this->params()),
142  BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
143  this->params()),
145  "Add words to the document dictionary", this->params()),
146  BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
147  this->params()),
148  BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
149  this->params()),
151  "Enable correction based on the word bigram dictionary.",
152  this->params()),
154  "Enable single word correction based on the dictionary.",
155  this->params()),
157  "Amount of debug output for bigram correction.",
158  this->params()),
160  "Remove and conditionally reassign small outlines when they"
161  " confuse layout analysis, determining diacritics vs noise",
162  this->params()),
163  INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
164  this->params()),
165  // Worst (min) certainty, for which a diacritic is allowed to make the
166  // base
167  // character worse and still be included.
169  "Hingepoint for base char certainty", this->params()),
170  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
171  // to make the base character worse and still be included.
173  "Hingepoint for disjoint certainty", this->params()),
174  // Worst (min) certainty, for which a diacritic is allowed to make a new
175  // stand-alone blob.
177  "Threshold for new punc char certainty", this->params()),
178  // Factor of certainty margin for adding diacritics to not count as worse.
180  "Scaling on certainty diff from Hingepoint",
181  this->params()),
182  INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
183  this->params()),
184  INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
185  this->params()),
186  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
187  BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
188  this->params()),
189  STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
190  this->params()),
191  STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
192  this->params()),
193  STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
194  this->params()),
196  "good_quality_doc lte rejection limit", this->params()),
198  "good_quality_doc gte good blobs limit", this->params()),
200  "good_quality_doc lte outline error limit", this->params()),
202  "good_quality_doc gte good char limit", this->params()),
203  INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
204  this->params()),
206  "Adaptation decision algorithm for tess", this->params()),
208  "Do minimal rejection on pass 1 output", this->params()),
209  BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
210  this->params()),
211  BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
212  this->params()),
214  "Adaptation decision algorithm for tess", this->params()),
215  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
216  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
217  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
218  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
219  this->params()),
221  "Run paragraph detection on the post-text-recognition "
222  "(more accurate)",
223  this->params()),
224  INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
225  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
226  this->params()),
227  STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
228  this->params()),
230  "Allow outline errs in unrejection?", this->params()),
232  "Reduce rejection on good docs", this->params()),
233  BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
234  this->params()),
236  "%rej allowed before rej whole doc", this->params()),
238  "%rej allowed before rej whole block", this->params()),
240  "%rej allowed before rej whole row", this->params()),
242  "Number of row rejects in whole word rejects"
243  "which prevents whole row rejection",
244  this->params()),
246  "Only rej partially rejected words in block rejection",
247  this->params()),
249  "Only rej partially rejected words in row rejection",
250  this->params()),
252  "Use word segmentation quality metric", this->params()),
254  "Use word segmentation quality metric", this->params()),
256  "Only preserve wds longer than this", this->params()),
258  "Apply row rejection to good docs", this->params()),
260  "rej good doc wd if more than this fraction rejected",
261  this->params()),
263  "Reject all bad quality wds", this->params()),
264  BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
265  this->params()),
267  "Output data to debug file", this->params()),
268  BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs",
269  this->params()),
271  "good_quality_doc gte good char limit", this->params()),
273  "Mark v.bad words for tilde crunch", this->params()),
274  BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
275  this->params()),
276  BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
277  this->params()),
279  "Take out ~^ early?", this->params()),
280  double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
281  this->params()),
282  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
284  "crunch garbage cert lt this", this->params()),
286  "crunch garbage rating lt this", this->params()),
287  double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
288  this->params()),
289  double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
290  this->params()),
291  BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
292  this->params()),
293  double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
294  this->params()),
295  double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
296  this->params()),
297  double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
298  this->params()),
299  double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
300  this->params()),
302  "Del if word width lt xht x this", this->params()),
304  "Del if word gt xht x this above bl", this->params()),
306  "Del if word gt xht x this below bl", this->params()),
307  double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
308  this->params()),
309  INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
310  this->params()),
312  "How many potential indicators needed", this->params()),
313  BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
314  this->params()),
315  BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
316  this->params()),
318  "Dont pot crunch sensible strings", this->params()),
319  BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
320  this->params()),
322  "Dont crunch words with long lower case strings",
323  this->params()),
325  "Dont crunch words with long lower case strings",
326  this->params()),
328  "Crunch words with long repetitions", this->params()),
329  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
331  "How many non-noise blbs either side?", this->params()),
332  double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
333  this->params()),
335  "Reward punctation joins", this->params()),
336  INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
337  this->params()),
338  INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
339  this->params()),
341  "Punct. chs expected WITHIN numbers", this->params()),
343  "Max allowed deviation of blob top outside of font data",
344  this->params()),
346  "Min change in xht before actually trying it", this->params()),
348  "Debug level for sub & superscript fixer", this->params()),
351  "How many times worse "
352  "certainty does a superscript position glyph need to be for "
353  "us to try classifying it as a char with a different "
354  "baseline?",
355  this->params()),
358  "What reduction in "
359  "badness do we think sufficient to choose a superscript "
360  "over what we'd thought. For example, a value of 0.6 means "
361  "we want to reduce badness of certainty by at least 40%",
362  this->params()),
364  "A superscript scaled down more than this is unbelievably "
365  "small. For example, 0.3 means we expect the font size to "
366  "be no smaller than 30% of the text line font size.",
367  this->params()),
369  "Maximum top of a character measured as a multiple of "
370  "x-height above the baseline for us to reconsider whether "
371  "it's a subscript.",
372  this->params()),
374  "Minimum bottom of a character measured as a multiple of "
375  "x-height above the baseline for us to reconsider whether "
376  "it's a superscript.",
377  this->params()),
379  "Write block separators in output", this->params()),
380  BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
381  this->params()),
382  BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
383  this->params()),
384  BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
385  this->params()),
386  BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
387  this->params()),
388  BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
389  this->params()),
391  "Output char for unidentified blobs", this->params()),
392  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
394  "Min suspect level for rejecting spaces", this->params()),
396  "Dont Suspect dict wds longer than this", this->params()),
397  BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
398  this->params()),
399  double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
400  this->params()),
401  double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
402  this->params()),
404  "Only reject tess failures", this->params()),
405  BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
406  this->params()),
408  "Make output have exactly one word per WERD", this->params()),
410  "Dont reject ANYTHING AT ALL", this->params()),
412  "Force all rep chars the same", this->params()),
413  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
414  this->params()),
415  BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
416  this->params()),
417  BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
418  this->params()),
420  "Aspect ratio dot/hyphen test", this->params()),
422  "Aspect ratio dot/hyphen test", this->params()),
424  "Use DOC dawg in 11l conf. detector", this->params()),
425  BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
426  this->params()),
427  BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
428  this->params()),
429  BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
430  this->params()),
431  BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
432  this->params()),
433  BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
434  this->params()),
435  BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
436  this->params()),
437  BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
438  this->params()),
440  "if >this fract", this->params()),
441  INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
442  this->params()),
444  "Allow NN to unrej", this->params()),
445  STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
446  this->params()),
447  INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
448  this->params()),
449  BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
450  this->params()),
452  "-1 -> All pages"
453  " , else specifc page to process",
454  this->params()),
456  "Capture the image from the IPE", this->params()),
457  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
458  this->params()),
459  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
460  BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
461  this->params()),
463  "Debug level for"
464  " TessdataManager functions.",
465  this->params()),
467  "List of languages to load with this one", this->params()),
469  "In multilingual mode use params model of the"
470  " primary language",
471  this->params()),
473  "Min acceptable orientation margin", this->params()),
474  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
475  this->params()),
476  BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
477  this->params()),
479  "Allow feature extractors to see the original outline",
480  this->params()),
482  "Only initialize with the config file. Useful if the "
483  "instance is not going to be used for OCR but say only "
484  "for layout analysis.",
485  this->params()),
486  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
487  this->params()),
489  "Enable vertical detection", this->params()),
491  "Force using vertical text page mode", this->params()),
494  "Fraction of textlines deemed vertical to use vertical page "
495  "mode",
496  this->params()),
499  "Fraction of height used as a minimum gap for aligned blobs.",
500  this->params()),
501  INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
502  this->params()),
504  "Preserve multiple interword spaces", this->params()),
506  "Include page separator string in output text after each "
507  "image/page.",
508  this->params()),
510  "Page separator (default is form feed control character)",
511  this->params()),
512 
513  // The following parameters were deprecated and removed from their
514  // original
515  // locations. The parameters are temporarily kept here to give Tesseract
516  // users a chance to updated their [lang].traineddata and config files
517  // without introducing failures during Tesseract initialization.
518  // TODO(ocr-team): remove these parameters from the code once we are
519  // reasonably sure that Tesseract users have updated their data files.
520  //
521  // BEGIN DEPRECATED PARAMETERS
523  "find horizontal lines such as headers in vertical page mode",
524  this->params()),
525  INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
526  this->params()),
528  "Load fixed length dawgs"
529  " (e.g. for non-space delimited languages)",
530  this->params()),
531  INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
532  this->params()),
533  BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
534  this->params()),
536  "Multiplying factor of"
537  " current best rate to prune other hypotheses",
538  this->params()),
540  "Turn on word script consistency permuter", this->params()),
542  "incorporate segmentation cost in word rating?",
543  this->params()),
545  "Score multipler for script consistency within a word. "
546  "Being a 'reward' factor, it should be <= 1. "
547  "Smaller value implies bigger reward.",
548  this->params()),
550  "Turn on fixed-length phrasebook search permuter",
551  this->params()),
553  "Turn on character type (property) consistency permuter",
554  this->params()),
556  "Score multipler for char type consistency within a word. ",
557  this->params()),
559  "Score multipler for ngram permuter's best choice"
560  " (only used in the Han script path).",
561  this->params()),
563  "Activate character-level n-gram-based permuter",
564  this->params()),
565  BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
566  this->params()),
568  "Depth of blob choice lists to explore"
569  " when fixed length dawgs are on",
570  this->params()),
572  "use new state cost heuristics for segmentation state"
573  " evaluation",
574  this->params()),
576  "base factor for adding segmentation cost into word rating."
577  "It's a multiplying factor, the larger the value above 1, "
578  "the bigger the effect of segmentation cost.",
579  this->params()),
581  "weight associated with char rating in combined cost of"
582  "state",
583  this->params()),
585  "weight associated with width evidence in combined cost of"
586  " state",
587  this->params()),
589  "weight associated with seam cut in combined cost of state",
590  this->params()),
592  "max char width-to-height ratio allowed in segmentation",
593  this->params()),
595  "Enable new segmentation search path.", this->params()),
597  "Maximum character width-to-height ratio for"
598  " fixed-pitch fonts",
599  this->params()),
600  // END DEPRECATED PARAMETERS
601 
602  backup_config_file_(NULL),
603  pix_binary_(NULL),
604  cube_binary_(NULL),
605  pix_grey_(NULL),
606  pix_thresholds_(NULL),
607  source_resolution_(0),
608  textord_(this),
609  right_to_left_(false),
610  scaled_color_(NULL),
611  scaled_factor_(-1),
612  deskew_(1.0f, 0.0f),
613  reskew_(1.0f, 0.0f),
614  most_recently_used_(this),
615  font_table_size_(0),
616 #ifndef ANDROID_BUILD
617  cube_cntxt_(NULL),
618  tess_cube_combiner_(NULL),
619 #endif
620  equ_detect_(NULL) {
621 }
double superscript_scaledown_ratio
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:307
double segment_reward_ngram_best_choice
char * ok_repeated_ch_non_alphanum_wds
char * tessedit_write_params_to_file
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
double tessedit_whole_wd_rej_row_percent
double tessedit_reject_block_percent
double textord_tabfind_vertical_text_ratio
bool crunch_early_convert_bad_unlv_chs
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
bool textord_tabfind_vertical_horizontal_mix
bool tessedit_enable_bigram_correction
bool tessedit_resegment_from_line_boxes
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
double tessedit_reject_row_percent
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
double heuristic_segcost_rating_base
double rej_whole_of_mostly_reject_word_fract
ParamsVectors * params()
Definition: ccutil.h:65
bool tessedit_preserve_row_rej_perfect_wds
#define FALSE
Definition: capi.h:29
double tessedit_good_doc_still_rowrej_wd
double tessedit_reject_doc_percent
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
#define NULL
Definition: host.h:144
double superscript_worse_certainty
bool textord_tabfind_force_vertical_text
double segsearch_max_fixed_pitch_char_wh_ratio
bool applybox_learn_chars_and_char_frags_mode
bool tessedit_preserve_blk_rej_perfect_wds
double textord_tabfind_aligned_gap_fraction
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:313
double superscript_bettered_certainty
int language_model_fixed_length_choices_depth
tesseract::Tesseract::~Tesseract ( )

Definition at line 623 of file tesseractclass.cpp.

623  {
624  Clear();
625  end_tesseract();
626  sub_langs_.delete_data_pointers();
627 #ifndef ANDROID_BUILD
628  // Delete cube objects.
629  if (cube_cntxt_ != NULL) {
630  delete cube_cntxt_;
631  cube_cntxt_ = NULL;
632  }
633  if (tess_cube_combiner_ != NULL) {
634  delete tess_cube_combiner_;
635  tess_cube_combiner_ = NULL;
636  }
637 #endif
638 }
#define NULL
Definition: host.h:144

Member Function Documentation

BOOL8 tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 421 of file output.cpp.

422  {
423  BOOL8 prev_digit = FALSE;
424 
425  if (*lengths == 1 && *s == '(')
426  s++;
427 
428  if (*lengths == 1 &&
429  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
430  s++;
431 
432  for (; *s != '\0'; s += *(lengths++)) {
433  if (unicharset.get_isdigit(s, *lengths))
434  prev_digit = TRUE;
435  else if (prev_digit &&
436  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
437  prev_digit = FALSE;
438  else if (prev_digit && *lengths == 1 &&
439  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
440  return TRUE;
441  else if (prev_digit &&
442  *lengths == 1 && (*s == '%') &&
443  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
444  (*(s + *lengths + *(lengths + 1)) == '\0'))
445  return TRUE;
446  else
447  return FALSE;
448  }
449  return TRUE;
450 }
UNICHARSET unicharset
Definition: ccutil.h:72
unsigned char BOOL8
Definition: host.h:113
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1663 of file control.cpp.

1664  {
1665  int i = 0;
1666  int offset = 0;
1667  int leading_punct_count;
1668  int upper_count = 0;
1669  int hyphen_pos = -1;
1671 
1672  if (strlen (lengths) > 20)
1673  return word_type;
1674 
1675  /* Single Leading punctuation char*/
1676 
1677  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1678  offset += lengths[i++];
1679  leading_punct_count = i;
1680 
1681  /* Initial cap */
1682  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1683  offset += lengths[i++];
1684  upper_count++;
1685  }
1686  if (upper_count > 1) {
1687  word_type = AC_UPPER_CASE;
1688  } else {
1689  /* Lower case word, possibly with an initial cap */
1690  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1691  offset += lengths[i++];
1692  }
1693  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1694  goto not_a_word;
1695  /*
1696  Allow a single hyphen in a lower case word
1697  - dont trust upper case - I've seen several cases of "H" -> "I-I"
1698  */
1699  if (lengths[i] == 1 && s[offset] == '-') {
1700  hyphen_pos = i;
1701  offset += lengths[i++];
1702  if (s[offset] != '\0') {
1703  while ((s[offset] != '\0') &&
1704  char_set.get_islower(s + offset, lengths[i])) {
1705  offset += lengths[i++];
1706  }
1707  if (i < hyphen_pos + 3)
1708  goto not_a_word;
1709  }
1710  } else {
1711  /* Allow "'s" in NON hyphenated lower case words */
1712  if (lengths[i] == 1 && (s[offset] == '\'') &&
1713  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1714  offset += lengths[i++];
1715  offset += lengths[i++];
1716  }
1717  }
1718  if (upper_count > 0)
1719  word_type = AC_INITIAL_CAP;
1720  else
1721  word_type = AC_LOWER_CASE;
1722  }
1723 
1724  /* Up to two different, constrained trailing punctuation chars */
1725  if (lengths[i] == 1 && s[offset] != '\0' &&
1726  STRING(chs_trailing_punct1).contains(s[offset]))
1727  offset += lengths[i++];
1728  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1729  s[offset - lengths[i - 1]] != s[offset] &&
1730  STRING(chs_trailing_punct2).contains (s[offset]))
1731  offset += lengths[i++];
1732 
1733  if (s[offset] != '\0')
1734  word_type = AC_UNACCEPTABLE;
1735 
1736  not_a_word:
1737 
1738  if (word_type == AC_UNACCEPTABLE) {
1739  /* Look for abbreviation string */
1740  i = 0;
1741  offset = 0;
1742  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1743  word_type = AC_UC_ABBREV;
1744  while (s[offset] != '\0' &&
1745  char_set.get_isupper(s + offset, lengths[i]) &&
1746  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1747  offset += lengths[i++];
1748  offset += lengths[i++];
1749  }
1750  }
1751  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1752  word_type = AC_LC_ABBREV;
1753  while (s[offset] != '\0' &&
1754  char_set.get_islower(s + offset, lengths[i]) &&
1755  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1756  offset += lengths[i++];
1757  offset += lengths[i++];
1758  }
1759  }
1760  if (s[offset] != '\0')
1761  word_type = AC_UNACCEPTABLE;
1762  }
1763 
1764  return word_type;
1765 }
a.b.c.
Definition: control.h:40
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
A.B.C.
Definition: control.h:41
ALL upper case.
Definition: control.h:38
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
ALL lower case.
Definition: control.h:37
Unacceptable word.
Definition: control.h:36
ALL but initial lc.
Definition: control.h:39
Definition: strngs.h:44
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
inT16 tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 495 of file reject.cpp.

496  {
497  inT16 i;
498  inT16 offset;
499  inT16 count = 0;
500 
501  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
502  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
503  count++;
504  }
505  return count;
506 }
UNICHARSET unicharset
Definition: ccutil.h:72
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
short inT16
Definition: host.h:100
void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 203 of file recogtraining.cpp.

205  {
206  // Classify word.
207  fflush(stdout);
208  WordData word_data(*pr_it);
209  SetupWordPassN(1, &word_data);
210  classify_word_and_language(1, pr_it, &word_data);
211  WERD_RES* werd_res = word_data.word;
212  WERD_CHOICE *best_choice = werd_res->best_choice;
213  ASSERT_HOST(best_choice != NULL);
214 
215  // Compute the number of unichars in the label.
216  GenericVector<UNICHAR_ID> encoding;
217  if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
218  tprintf("Not outputting illegal unichar %s\n", label);
219  return;
220  }
221 
222  // Dump all paths through the ratings matrix (which is normally small).
223  int dim = werd_res->ratings->dimension();
224  const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
225  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
226  unicharset, label, output_file);
227  delete [] blob_choices;
228 }
MATRIX * ratings
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
int dimension() const
Definition: matrix.h:247
#define ASSERT_HOST(x)
Definition: errcode.h:84
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
WERD * word
Definition: pageres.h:175
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
#define NULL
Definition: host.h:144
bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 258 of file tesseractclass.h.

258  {
259  if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true;
260  for (int i = 0; i < sub_langs_.size(); ++i) {
261  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY)
262  return true;
263  }
264  return false;
265  }
PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING &  fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 117 of file applybox.cpp.

119  {
120  GenericVector<TBOX> boxes;
121  GenericVector<STRING> texts, full_texts;
122  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
123  NULL)) {
124  return NULL; // Can't do it.
125  }
126 
127  int box_count = boxes.size();
128  int box_failures = 0;
129  // Add an empty everything to the end.
130  boxes.push_back(TBOX());
131  texts.push_back(STRING());
132  full_texts.push_back(STRING());
133 
134  // In word mode, we use the boxes to make a word for each box, but
135  // in blob mode we use the existing words and maximally chop them first.
136  PAGE_RES* page_res = find_segmentation ?
137  NULL : SetupApplyBoxes(boxes, block_list);
138  clear_any_old_text(block_list);
139 
140  for (int i = 0; i < boxes.size() - 1; i++) {
141  bool foundit = false;
142  if (page_res != NULL) {
143  if (i == 0) {
144  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
145  full_texts[i].string());
146  } else {
147  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
148  boxes[i + 1], full_texts[i].string());
149  }
150  } else {
151  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
152  texts[i].string());
153  }
154  if (!foundit) {
155  box_failures++;
156  ReportFailedBox(i, boxes[i], texts[i].string(),
157  "FAILURE! Couldn't find a matching blob");
158  }
159  }
160 
161  if (page_res == NULL) {
162  // In word/line mode, we now maximally chop all the words and resegment
163  // them with the classifier.
164  page_res = SetupApplyBoxes(boxes, block_list);
165  ReSegmentByClassification(page_res);
166  }
167  if (applybox_debug > 0) {
168  tprintf("APPLY_BOXES:\n");
169  tprintf(" Boxes read from boxfile: %6d\n", box_count);
170  if (box_failures > 0)
171  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
172  }
173  TidyUp(page_res);
174  return page_res;
175 }
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:340
int size() const
Definition: genericvector.h:72
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:764
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:438
Definition: rect.h:30
Definition: strngs.h:44
#define NULL
Definition: host.h:144
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:51
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509
void tesseract::Tesseract::ApplyBoxTraining ( const STRING &  fontname,
PAGE_RES *  page_res 
)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 796 of file applybox.cpp.

796  {
797  PAGE_RES_IT pr_it(page_res);
798  int word_count = 0;
799  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
800  word_res = pr_it.forward()) {
801  LearnWord(fontname.string(), word_res);
802  ++word_count;
803  }
804  tprintf("Generated training data for %d words\n", word_count);
805 }
#define tprintf(...)
Definition: tprintf.h:31
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1029 of file control.cpp.

1032  {
1033  GenericVector<bool> blob_wanted;
1034  word_wanted->init_to_size(outlines.size(), false);
1035  target_blobs->init_to_size(outlines.size(), NULL);
1036  // Check for outlines that need to be turned into stand-alone blobs.
1037  for (int i = 0; i < outlines.size(); ++i) {
1038  if (outlines[i] == NULL) continue;
1039  // Get a set of adjacent outlines that don't overlap any existing blob.
1040  blob_wanted.init_to_size(outlines.size(), false);
1041  int num_blob_outlines = 0;
1042  TBOX total_ol_box(outlines[i]->bounding_box());
1043  while (i < outlines.size() && outlines[i] != NULL) {
1044  blob_wanted[i] = true;
1045  total_ol_box += outlines[i]->bounding_box();
1046  ++i;
1047  ++num_blob_outlines;
1048  }
1049  // Find the insertion point.
1050  C_BLOB_IT blob_it(real_word->cblob_list());
1051  while (!blob_it.at_last() &&
1052  blob_it.data_relative(1)->bounding_box().left() <=
1053  total_ol_box.left()) {
1054  blob_it.forward();
1055  }
1056  // Choose which combination of them we actually want and where to put
1057  // them.
1058  if (debug_noise_removal)
1059  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1060  C_BLOB* left_blob = blob_it.data();
1061  TBOX left_box = left_blob->bounding_box();
1062  C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
1063  if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
1064  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1065  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1066  outlines, num_blob_outlines,
1067  &blob_wanted)) {
1068  if (debug_noise_removal) tprintf("Added to left blob\n");
1069  for (int j = 0; j < blob_wanted.size(); ++j) {
1070  if (blob_wanted[j]) {
1071  (*word_wanted)[j] = true;
1072  (*target_blobs)[j] = left_blob;
1073  }
1074  }
1075  } else if (right_blob != NULL &&
1076  (!left_box.x_overlap(total_ol_box) ||
1077  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1079  right_blob, outlines,
1080  num_blob_outlines, &blob_wanted)) {
1081  if (debug_noise_removal) tprintf("Added to right blob\n");
1082  for (int j = 0; j < blob_wanted.size(); ++j) {
1083  if (blob_wanted[j]) {
1084  (*word_wanted)[j] = true;
1085  (*target_blobs)[j] = right_blob;
1086  }
1087  }
1088  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
1089  outlines, num_blob_outlines,
1090  &blob_wanted)) {
1091  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1092  for (int j = 0; j < blob_wanted.size(); ++j) {
1093  if (blob_wanted[j]) {
1094  (*word_wanted)[j] = true;
1095  (*target_blobs)[j] = NULL;
1096  }
1097  }
1098  }
1099  }
1100 }
int size() const
Definition: genericvector.h:72
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1105
#define tprintf(...)
Definition: tprintf.h:31
void init_to_size(int size, T t)
bool x_overlap(const TBOX &box) const
Definition: rect.h:391
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
#define NULL
Definition: host.h:144
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 976 of file control.cpp.

980  {
981  GenericVector<bool> blob_wanted;
982  word_wanted->init_to_size(outlines.size(), false);
983  overlapped_any_blob->init_to_size(outlines.size(), false);
984  target_blobs->init_to_size(outlines.size(), NULL);
985  // For each real blob, find the outlines that seriously overlap it.
986  // A single blob could be several merged characters, so there can be quite
987  // a few outlines overlapping, and the full engine needs to be used to chop
988  // and join to get a sensible result.
989  C_BLOB_IT blob_it(real_word->cblob_list());
990  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
991  C_BLOB* blob = blob_it.data();
992  TBOX blob_box = blob->bounding_box();
993  blob_wanted.init_to_size(outlines.size(), false);
994  int num_blob_outlines = 0;
995  for (int i = 0; i < outlines.size(); ++i) {
996  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
997  !(*word_wanted)[i]) {
998  blob_wanted[i] = true;
999  (*overlapped_any_blob)[i] = true;
1000  ++num_blob_outlines;
1001  }
1002  }
1003  if (debug_noise_removal) {
1004  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1005  blob_box.print();
1006  }
1007  // If any outlines overlap the blob, and not too many, classify the blob
1008  // (using the full engine, languages and all), and choose the maximal
1009  // combination of outlines that doesn't hurt the end-result classification
1010  // by too much. Mark them as wanted.
1011  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1012  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1013  outlines, num_blob_outlines,
1014  &blob_wanted)) {
1015  for (int i = 0; i < blob_wanted.size(); ++i) {
1016  if (blob_wanted[i]) {
1017  // Claim the outline and record where it is going.
1018  (*word_wanted)[i] = true;
1019  (*target_blobs)[i] = blob;
1020  }
1021  }
1022  }
1023  }
1024  }
1025 }
int size() const
Definition: genericvector.h:72
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1105
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
void init_to_size(int size, T t)
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
#define NULL
Definition: host.h:144
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 232 of file pagesegmain.cpp.

235  {
236  if (textord_debug_images) {
237  WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
238  }
239  Pix* photomask_pix = NULL;
240  Pix* musicmask_pix = NULL;
241  // The blocks made by the ColumnFinder. Moved to blocks before return.
242  BLOCK_LIST found_blocks;
243  TO_BLOCK_LIST temp_blocks;
244 
245  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
246  pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
247  &musicmask_pix);
248  int result = 0;
249  if (finder != NULL) {
250  TO_BLOCK_IT to_block_it(&temp_blocks);
251  TO_BLOCK* to_block = to_block_it.data();
252  if (musicmask_pix != NULL) {
253  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
254  // blocks separately. For now combine with photomask_pix.
255  pixOr(photomask_pix, photomask_pix, musicmask_pix);
256  }
257  if (equ_detect_) {
258  finder->SetEquationDetect(equ_detect_);
259  }
260  result = finder->FindBlocks(
261  pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
262  pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
263  if (result >= 0)
264  finder->GetDeskewVectors(&deskew_, &reskew_);
265  delete finder;
266  }
267  pixDestroy(&photomask_pix);
268  pixDestroy(&musicmask_pix);
269  if (result < 0) return result;
270 
271  blocks->clear();
272  BLOCK_IT block_it(blocks);
273  // Move the found blocks to the input/output blocks.
274  block_it.add_list_after(&found_blocks);
275 
276  if (textord_debug_images) {
277  // The debug image is no longer needed so delete it.
278  unlink(AlignedBlob::textord_debug_pix().string());
279  }
280  return result;
281 }
bool textord_debug_images
Definition: alignedblob.cpp:33
static const STRING & textord_debug_pix()
Definition: alignedblob.h:112
bool textord_debug_printable
Definition: alignedblob.cpp:34
#define NULL
Definition: host.h:144
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES &  word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output
[in] word: The word whose best_choice we're evaluating
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 520 of file superscript.cpp.

524  {
525  int initial_ok_run_count = 0;
526  int ok_run_count = 0;
527  float worst_certainty = 0.0f;
528  const WERD_CHOICE &wc = *word.best_choice;
529 
530  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
531  for (int i = 0; i < wc.length(); i++) {
532  TBLOB *blob = word.rebuild_word->blobs[i];
533  UNICHAR_ID unichar_id = wc.unichar_id(i);
534  float char_certainty = wc.certainty(i);
535  bool bad_certainty = char_certainty < certainty_threshold;
536  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
537  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
538  BLOB_CHOICE *choice = word.GetBlobChoice(i);
539  if (choice && fontinfo_table.size() > 0) {
540  // Get better information from the specific choice, if available.
541  int font_id1 = choice->fontinfo_id();
542  bool font1_is_italic = font_id1 >= 0
543  ? fontinfo_table.get(font_id1).is_italic() : false;
544  int font_id2 = choice->fontinfo_id2();
545  is_italic = font1_is_italic &&
546  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
547  }
548 
549  float height_fraction = 1.0f;
550  float char_height = blob->bounding_box().height();
551  float normal_height = char_height;
552  if (wc.unicharset()->top_bottom_useful()) {
553  int min_bot, max_bot, min_top, max_top;
554  wc.unicharset()->get_top_bottom(unichar_id,
555  &min_bot, &max_bot,
556  &min_top, &max_top);
557  float hi_height = max_top - max_bot;
558  float lo_height = min_top - min_bot;
559  normal_height = (hi_height + lo_height) / 2;
560  if (normal_height >= kBlnXHeight) {
561  // Only ding characters that we have decent information for because
562  // they're supposed to be normal sized, not tiny specks or dashes.
563  height_fraction = char_height / normal_height;
564  }
565  }
566  bool bad_height = height_fraction < superscript_scaledown_ratio;
567 
568  if (debug) {
569  if (is_italic) {
570  tprintf(" Rejecting: superscript is italic.\n");
571  }
572  if (is_punc) {
573  tprintf(" Rejecting: punctuation present.\n");
574  }
575  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
576  if (bad_certainty) {
577  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
578  "which is less than threshold %.2f\n", char_str,
579  char_certainty, certainty_threshold);
580  }
581  if (bad_height) {
582  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
583  "expected %.2f\n", char_str, char_height, normal_height);
584  }
585  }
586  if (bad_certainty || bad_height || is_punc || is_italic) {
587  if (ok_run_count == i) {
588  initial_ok_run_count = ok_run_count;
589  }
590  ok_run_count = 0;
591  } else {
592  ok_run_count++;
593  }
594  if (char_certainty < worst_certainty) {
595  worst_certainty = char_certainty;
596  }
597  }
598  bool all_ok = ok_run_count == wc.length();
599  if (all_ok && debug) {
600  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
601  }
602  if (!all_ok) {
603  if (left_ok) *left_ok = initial_ok_run_count;
604  if (right_ok) *right_ok = ok_run_count;
605  }
606  return all_ok;
607 }
double superscript_scaledown_ratio
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const FontInfo * fontinfo
Definition: pageres.h:288
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
inT16 fontinfo_id() const
Definition: ratngs.h:85
float certainty() const
Definition: ratngs.h:327
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool is_italic() const
Definition: fontinfo.h:111
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
int UNICHAR_ID
Definition: unichar.h:33
inT16 fontinfo_id2() const
Definition: ratngs.h:88
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
inT16 height() const
Definition: rect.h:104
const T & get(int id) const
Return the object from an id.
bool top_bottom_useful() const
Definition: unicharset.h:495
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
int size() const
Return the size used.
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
TBOX bounding_box() const
Definition: blobs.cpp:482
Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 212 of file tesseractclass.h.

212  {
213  return pix_grey_ != NULL ? pix_grey_ : pix_binary_;
214  }
#define NULL
Definition: host.h:144
void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES *  page_res)

Definition at line 442 of file control.cpp.

442  {
443  PAGE_RES_IT word_it(page_res);
444 
445  WERD_RES *w_prev = NULL;
446  WERD_RES *w = word_it.word();
447  while (1) {
448  w_prev = w;
449  while (word_it.forward() != NULL &&
450  (!word_it.word() || word_it.word()->part_of_combo)) {
451  // advance word_it, skipping over parts of combos
452  }
453  if (!word_it.word()) break;
454  w = word_it.word();
455  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
456  continue;
457  }
458  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
459  if (tessedit_bigram_debug) {
460  tprintf("Skipping because one of the words is W_REP_CHAR\n");
461  }
462  continue;
463  }
464  // Two words sharing the same language model, excellent!
465  GenericVector<WERD_CHOICE *> overrides_word1;
466  GenericVector<WERD_CHOICE *> overrides_word2;
467 
468  STRING orig_w1_str = w_prev->best_choice->unichar_string();
469  STRING orig_w2_str = w->best_choice->unichar_string();
470  WERD_CHOICE prev_best(w->uch_set);
471  {
472  int w1start, w1end;
473  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
474  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
475  }
476  WERD_CHOICE this_best(w->uch_set);
477  {
478  int w2start, w2end;
479  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
480  this_best = w->best_choice->shallow_copy(w2start, w2end);
481  }
482 
483  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
484  if (tessedit_bigram_debug) {
485  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
486  orig_w1_str.string(), orig_w2_str.string());
487  }
488  continue;
489  }
490  if (tessedit_bigram_debug > 2) {
491  tprintf("Examining alt choices for \"%s %s\".\n",
492  orig_w1_str.string(), orig_w2_str.string());
493  }
494  if (tessedit_bigram_debug > 1) {
495  if (!w_prev->best_choices.singleton()) {
496  w_prev->PrintBestChoices();
497  }
498  if (!w->best_choices.singleton()) {
499  w->PrintBestChoices();
500  }
501  }
502  float best_rating = 0.0;
503  int best_idx = 0;
504  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
505  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
506  WERD_CHOICE *p1 = prev_it.data();
507  WERD_CHOICE strip1(w->uch_set);
508  {
509  int p1start, p1end;
510  p1->GetNonSuperscriptSpan(&p1start, &p1end);
511  strip1 = p1->shallow_copy(p1start, p1end);
512  }
513  WERD_CHOICE_IT w_it(&w->best_choices);
514  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
515  WERD_CHOICE *p2 = w_it.data();
516  WERD_CHOICE strip2(w->uch_set);
517  {
518  int p2start, p2end;
519  p2->GetNonSuperscriptSpan(&p2start, &p2end);
520  strip2 = p2->shallow_copy(p2start, p2end);
521  }
522  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
523  overrides_word1.push_back(p1);
524  overrides_word2.push_back(p2);
525  if (overrides_word1.size() == 1 ||
526  p1->rating() + p2->rating() < best_rating) {
527  best_rating = p1->rating() + p2->rating();
528  best_idx = overrides_word1.size() - 1;
529  }
530  }
531  }
532  }
533  if (overrides_word1.size() >= 1) {
534  // Excellent, we have some bigram matches.
536  *overrides_word1[best_idx]) &&
538  *overrides_word2[best_idx])) {
539  if (tessedit_bigram_debug > 1) {
540  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
541  "model.\n", orig_w1_str.string(), orig_w2_str.string());
542  }
543  continue;
544  }
545  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
546  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
547  if (new_w1_str != orig_w1_str) {
548  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
549  }
550  if (new_w2_str != orig_w2_str) {
551  w->ReplaceBestChoice(overrides_word2[best_idx]);
552  }
553  if (tessedit_bigram_debug > 0) {
554  STRING choices_description;
555  int num_bigram_choices
556  = overrides_word1.size() * overrides_word2.size();
557  if (num_bigram_choices == 1) {
558  choices_description = "This was the unique bigram choice.";
559  } else {
560  if (tessedit_bigram_debug > 1) {
561  STRING bigrams_list;
562  const int kMaxChoicesToPrint = 20;
563  for (int i = 0; i < overrides_word1.size() &&
564  i < kMaxChoicesToPrint; i++) {
565  if (i > 0) { bigrams_list += ", "; }
566  WERD_CHOICE *p1 = overrides_word1[i];
567  WERD_CHOICE *p2 = overrides_word2[i];
568  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
569  if (i == kMaxChoicesToPrint) {
570  bigrams_list += " ...";
571  }
572  }
573  choices_description = "There were many choices: {";
574  choices_description += bigrams_list;
575  choices_description += "}";
576  } else {
577  choices_description.add_str_int("There were ", num_bigram_choices);
578  choices_description += " compatible bigrams.";
579  }
580  }
581  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
582  orig_w1_str.string(), orig_w2_str.string(),
583  new_w1_str.string(), new_w2_str.string(),
584  choices_description.string());
585  }
586  }
587  }
588 }
int size() const
Definition: genericvector.h:72
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
float rating() const
Definition: ratngs.h:324
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
WERD_CHOICE * best_choice
Definition: pageres.h:219
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:392
const STRING & unichar_string() const
Definition: ratngs.h:524
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:375
const UNICHARSET * uch_set
Definition: pageres.h:192
tesseract::Tesseract * tesseract
Definition: pageres.h:266
Dict & getDict()
Definition: classify.h:65
void PrintBestChoices() const
Definition: pageres.cpp:709
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738
WERD * word
Definition: pageres.h:175
void add_str_int(const char *str, int number)
Definition: strngs.cpp:376
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:791
void tesseract::Tesseract::blamer_pass ( PAGE_RES *  page_res)

Definition at line 686 of file control.cpp.

// Prints blamer summary information for a page: one line per blame reason
// (indexed 0..IRR_NUM_REASONS-1, using page_res->blame_reasons) and, when it
// is non-empty, every entry of page_res->misadaption_log.
// Returns immediately unless wordrec_run_blamer is set.
// NOTE(review): listing lines 692-693 (the per-word loop body) and 697 (the
// start of the tprintf call for each reason) are absent from this extract.
686  {
687  if (!wordrec_run_blamer) return;
688  PAGE_RES_IT page_res_it(page_res);
// Visit every word on the page, from the first word to the end.
689  for (page_res_it.restart_page(); page_res_it.word() != NULL;
690  page_res_it.forward()) {
691  WERD_RES *word = page_res_it.word();
694  }
695  tprintf("Blame reasons:\n");
// Report each reason together with its count for this page.
696  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
698  static_cast<IncorrectResultReason>(bl)),
699  page_res->blame_reasons[bl]);
700  }
// The misadaption log is only printed when it has at least one entry.
701  if (page_res->misadaption_log.length() > 0) {
702  tprintf("Misadaption log:\n");
703  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
704  tprintf("%s\n", page_res->misadaption_log[i].string());
705  }
706  }
707 }
int length() const
Definition: genericvector.h:79
#define tprintf(...)
Definition: tprintf.h:31
GenericVector< int > blame_reasons
Definition: pageres.h:68
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
bool wordrec_debug_blamer
Definition: wordrec.h:167
bool wordrec_run_blamer
Definition: wordrec.h:168
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void tesseract::Tesseract::blob_feature_display ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

Definition at line 960 of file pgedit.cpp.

// Builds a pseudo word from selection_box, sets it up for recognition,
// extracts features from its first chopped blob, and renders the baseline
// (bl) and cn feature sets in two ScrollView windows. The pseudo word and
// its iterator are deleted before returning. Does nothing when
// make_pseudo_word returns NULL.
// NOTE(review): listing lines 967-970, 975-976, 981 and 987 are absent from
// this extract (remaining SetupForRecognition arguments, the declarations of
// bl_features/cn_features, and one statement before each render loop).
961  {
962  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
963  if (it != NULL) {
964  WERD_RES* word_res = it->word();
// Take the x-height from the row that contains the pseudo word.
965  word_res->x_height = it->row()->row->x_height();
966  word_res->SetupForRecognition(unicharset, this, BestPix(),
971  it->row()->row, it->block()->block);
972  TWERD* bln_word = word_res->chopped_word;
// Only the first blob of the chopped word is examined.
973  TBLOB* bln_blob = bln_word->blobs[0];
974  INT_FX_RESULT_STRUCT fx_info;
977  Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
978  &cn_features, &fx_info, NULL);
979  // Display baseline features.
980  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
982  for (int f = 0; f < bl_features.size(); ++f)
983  RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
984  bl_win->Update();
985  // Display cn features.
986  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
988  for (int f = 0; f < cn_features.size(); ++f)
989  RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
990  cn_win->Update();
991 
// Remove the temporary word from the page results, then free the iterator.
992  it->DeleteCurrentWord();
993  delete it;
994  }
995 }
Definition: blobs.h:261
int size() const
Definition: genericvector.h:72
bool classify_bln_numeric_mode
Definition: classify.h:500
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:1104
TWERD * chopped_word
Definition: pageres.h:201
static void Update()
Definition: scrollview.cpp:715
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
UNICHARSET unicharset
Definition: ccutil.h:72
float x_height() const
Definition: ocrrow.h:61
void DeleteCurrentWord()
Definition: pageres.cpp:1449
float x_height
Definition: pageres.h:295
BLOCK * block
Definition: pageres.h:99
BLOCK_RES * block() const
Definition: pageres.h:739
bool classify_nonlinear_norm
Definition: classify.h:416
ROW_RES * row() const
Definition: pageres.h:736
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1770
Pix * BestPix() const
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
ROW * row
Definition: pageres.h:127
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define NULL
Definition: host.h:144
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1936
Definition: blobs.h:395
WERD_RES * word() const
Definition: pageres.h:733
float tesseract::Tesseract::blob_noise_score ( TBLOB *  blob)

Definition at line 761 of file fixspace.cpp.

// Computes a score for a blob from the size of its largest outline.
// The score starts as the largest of max(height, width) over the bounding
// boxes of the blob's outlines. It is doubled when the blob has more than 5
// outlines, and afterwards halved when the blob's bounding box has
// bottom > kBlnBaselineOffset * 4 or top < kBlnBaselineOffset / 2.
// All arithmetic is done in inT16; the result is returned as float.
761  {
762  TBOX box; // BB of outline
763  inT16 outline_count = 0;
764  inT16 max_dimension;
765  inT16 largest_outline_dimension = 0;
766 
// Walk the linked list of outlines, counting them and tracking the largest
// single dimension (height or width) seen so far.
767  for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
768  outline_count++;
769  box = ol->bounding_box();
770  if (box.height() > box.width()) {
771  max_dimension = box.height();
772  } else {
773  max_dimension = box.width();
774  }
775 
776  if (largest_outline_dimension < max_dimension)
777  largest_outline_dimension = max_dimension;
778  }
779 
780  if (outline_count > 5) {
781  // penalise LOTS of blobs
// (the quantity counted here is outlines within this blob)
782  largest_outline_dimension *= 2;
783  }
784 
// From here on, box is the bounding box of the whole blob.
785  box = blob->bounding_box();
786  if (box.bottom() > kBlnBaselineOffset * 4 ||
787  box.top() < kBlnBaselineOffset / 2) {
788  // Lax blob is if high or low
// (integer division: an odd value loses its remainder)
789  largest_outline_dimension /= 2;
790  }
791 
792  return largest_outline_dimension;
793 }
const int kBlnBaselineOffset
Definition: normalis.h:29
inT16 bottom() const
Definition: rect.h:61
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
Definition: rect.h:30
#define NULL
Definition: host.h:144
TBOX bounding_box() const
Definition: blobs.cpp:482
TESSLINE * outlines
Definition: blobs.h:377
inT16 top() const
Definition: rect.h:54
short inT16
Definition: host.h:100
void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 616 of file fixspace.cpp.

// Finds, across all words in the list, the blob reported by
// worst_noise_blob() with the lowest noise score, deletes that blob, and
// splits its word in two at that point. The blobs before the deleted blob go
// into a new WERD/WERD_RES inserted before the original word; the original
// word keeps the blobs after it and has its results cleared.
// If no word yields a blob index > -1, the list is cleared as a signal to
// the caller and the function returns.
616  {
617  WERD_RES_IT word_it(&words);
618  WERD_RES_IT worst_word_it;
// Scores are compared with ">" below, so a lower score replaces the current
// worst; 9999 is the starting value.
619  float worst_noise_score = 9999;
620  int worst_blob_index = -1; // Noisiest blob of noisiest wd
621  int blob_index; // of wds noisiest blob
622  float noise_score; // of wds noisiest blob
623  WERD_RES *word_res;
624  C_BLOB_IT blob_it;
625  C_BLOB_IT rej_cblob_it;
626  C_BLOB_LIST new_blob_list;
627  C_BLOB_IT new_blob_it;
628  C_BLOB_IT new_rej_cblob_it;
629  WERD *new_word;
630  inT16 start_of_noise_blob;
631  inT16 i;
632 
// Pass 1: remember the word (and blob index within it) with the lowest score.
633  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
634  blob_index = worst_noise_blob(word_it.data(), &noise_score);
635  if (blob_index > -1 && worst_noise_score > noise_score) {
636  worst_noise_score = noise_score;
637  worst_blob_index = blob_index;
638  worst_word_it = word_it;
639  }
640  }
641  if (worst_blob_index < 0) {
642  words.clear(); // signal termination
643  return;
644  }
645 
646  /* Now split the worst_word_it */
647 
648  word_res = worst_word_it.data();
649 
650  /* Move blobs before noise blob to a new bloblist */
651 
652  new_blob_it.set_to_list(&new_blob_list);
653  blob_it.set_to_list(word_res->word->cblob_list());
654  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
655  new_blob_it.add_after_then_move(blob_it.extract());
656  }
// blob_it now sits on the noise blob; record its left edge before deleting
// it so the rejected blobs can be partitioned by position below.
657  start_of_noise_blob = blob_it.data()->bounding_box().left();
658  delete blob_it.extract(); // throw out noise blob
659 
// The new word holds the leading blobs; it is no longer end-of-line, and
// the remaining (original) word is no longer beginning-of-line.
660  new_word = new WERD(&new_blob_list, word_res->word);
661  new_word->set_flag(W_EOL, FALSE);
662  word_res->word->set_flag(W_BOL, FALSE);
663  word_res->word->set_blanks(1); // After break
664 
// Move rejected blobs whose left edge is before the noise blob's left edge
// to the new word. The loop stops at the first one that is not.
665  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
666  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
667  for (;
668  (!rej_cblob_it.empty() &&
669  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
670  rej_cblob_it.forward()) {
671  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
672  }
673 
// Wrap the new word in a WERD_RES flagged as a combination and insert it
// ahead of the original word in the list.
674  WERD_RES* new_word_res = new WERD_RES(new_word);
675  new_word_res->combination = TRUE;
676  worst_word_it.add_before_then_move(new_word_res);
677 
678  word_res->ClearResults();
679 }
void ClearResults()
Definition: pageres.cpp:1140
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:681
Definition: werd.h:35
BOOL8 combination
Definition: pageres.h:315
Definition: werd.h:36
Definition: werd.h:60
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
short inT16
Definition: host.h:100
SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 257 of file pgedit.cpp.

// Builds the menu tree and returns its root node. The root is allocated
// with new and gets three sub-menus: "MODES", "DISPLAY" and "OTHER". Each
// entry pairs a label with a command-event id; some entries also pass a
// TRUE/FALSE third argument.
// NOTE(review): the root is returned as a raw pointer; who frees it is not
// visible here.
257  {
258  SVMenuNode* parent_menu;
259  SVMenuNode* root_menu_item = new SVMenuNode();
260 
261  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
262 
263  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
264  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
265  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
266  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
267  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
268  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
269  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
270  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
271 
// parent_menu is reused for the DISPLAY and OTHER sub-menus.
272  parent_menu = root_menu_item->AddChild("DISPLAY");
273 
// Of the three-argument entries below, only "Edge Steps" passes TRUE.
274  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
275  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
276  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
277  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
278  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
279  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
280  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
281  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
282  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
283  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
284  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
285  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
286  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
287  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
288  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
289 
290 
291  parent_menu = root_menu_item->AddChild("OTHER");
292 
293  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
294  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
295  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
296  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
297  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
298  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
299 
300  return root_menu_item;
301 }
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:59
BOOL8 tesseract::Tesseract::check_debug_pt ( WERD_RES *  word,
int  location 
)

Definition at line 1767 of file control.cpp.

// Debug hook: reports whether the word's bounding box contains the test
// point (test_pt_x, test_pt_y) and, if so, prints diagnostics for it.
//   word     - word to test and describe.
//   location - code identifying the call site; values 0,10,...,120 select
//              the label printed. A negative value returns TRUE on a hit
//              without printing or enabling the debug settings.
// Returns FALSE when test_pt is not set or the point is outside the word,
// TRUE otherwise.
// Side effect: tessedit_rejection_debug and debug_x_ht_level are reset to
// FALSE/0 on every call with test_pt set, and raised to TRUE/2 on a hit
// with location >= 0.
1767  {
1768  BOOL8 show_map_detail = FALSE;
1769  inT16 i;
1770 
1771  if (!test_pt)
1772  return FALSE;
1773 
1774  tessedit_rejection_debug.set_value (FALSE);
1775  debug_x_ht_level.set_value(0);
1776 
1777  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1778  if (location < 0)
1779  return TRUE; // For breakpoint use
1780  tessedit_rejection_debug.set_value (TRUE);
1781  debug_x_ht_level.set_value(2);
1782  tprintf ("\n\nTESTWD::");
// Print a label for the call site. There is no default case, so an
// unlisted location prints no label.
1783  switch (location) {
1784  case 0:
1785  tprintf ("classify_word_pass1 start\n");
1786  word->word->print();
1787  break;
1788  case 10:
1789  tprintf ("make_reject_map: initial map");
1790  break;
1791  case 20:
1792  tprintf ("make_reject_map: after NN");
1793  break;
1794  case 30:
1795  tprintf ("classify_word_pass2 - START");
1796  break;
1797  case 40:
1798  tprintf ("classify_word_pass2 - Pre Xht");
1799  break;
1800  case 50:
1801  tprintf ("classify_word_pass2 - END");
1802  show_map_detail = TRUE;
1803  break;
1804  case 60:
1805  tprintf ("fixspace");
1806  break;
1807  case 70:
1808  tprintf ("MM pass START");
1809  break;
1810  case 80:
1811  tprintf ("MM pass END");
1812  break;
1813  case 90:
1814  tprintf ("After Poor quality rejection");
1815  break;
1816  case 100:
1817  tprintf ("unrej_good_quality_words - START");
1818  break;
1819  case 110:
1820  tprintf ("unrej_good_quality_words - END");
1821  break;
1822  case 120:
1823  tprintf ("Write results pass");
1824  show_map_detail = TRUE;
1825  break;
1826  }
1827  if (word->best_choice != NULL) {
1828  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1829  word->reject_map.print(debug_fp);
1830  tprintf("\n");
// Per-character detail is printed only for locations 50 and 120.
// NOTE(review): i steps through the chars of unichar_string() and is also
// used to index reject_map; confirm the two have matching lengths when the
// string contains multi-byte characters.
1831  if (show_map_detail) {
1832  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
1833  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1834  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1835  word->reject_map[i].full_print(debug_fp);
1836  }
1837  }
1838  } else {
1839  tprintf("null best choice\n");
1840  }
1841  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1842  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1843  return TRUE;
1844  } else {
1845  return FALSE;
1846  }
1847 }
BOOL8 tess_accepted
Definition: pageres.h:280
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
const STRING & unichar_string() const
Definition: ratngs.h:524
void full_print(FILE *fp)
Definition: rejctmap.cpp:406
void print()
Definition: werd.cpp:266
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
FILE * debug_fp
Definition: tessvars.cpp:24
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
void print(FILE *fp)
Definition: rejctmap.cpp:394
const char * string() const
Definition: strngs.cpp:193
Definition: points.h:189
short inT16
Definition: host.h:100
void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT *  pr_it,
WordData *  word_data 
)

Definition at line 1268 of file control.cpp.

// Recognizes one word, trying the most recently used language first and
// then, if the result is not acceptable per WordsAcceptable(), the main
// language (this) and each sub-language in turn. The language that last
// improved the result becomes most_recently_used_. The best result is then
// either consumed into word_data->word (single, non-combination result) or
// spliced into the page results via pr_it->ReplaceCurrentWord().
// Words already marked done are left untouched.
// NOTE(review): listing lines 1271 (the rest of the recognizer selection)
// and 1277 (the opening of the debug-print condition) are absent from this
// extract.
1269  {
1270  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1272  // Best result so far.
1273  PointerVector<WERD_RES> best_words;
1274  // Points to the best result. May be word or in lang_words.
1275  WERD_RES* word = word_data->word;
1276  clock_t start_t = clock();
1278  tprintf("%s word with lang %s at:",
1279  word->done ? "Already done" : "Processing",
1280  most_recently_used_->lang.string());
1281  word->word->bounding_box().print();
1282  }
1283  if (word->done) {
1284  // If done on pass1, leave it as-is.
1285  if (!word->tess_failed)
1286  most_recently_used_ = word->tesseract;
1287  return;
1288  }
// sub indexes word_data->lang_words. Index sub_langs_.size() is the slot
// used for the main language (this); see its use in the retry below.
1289  int sub = sub_langs_.size();
1290  if (most_recently_used_ != this) {
1291  // Get the index of the most_recently_used_.
1292  for (sub = 0; sub < sub_langs_.size() &&
1293  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1294  }
1295  most_recently_used_->RetryWithLanguage(
1296  *word_data, recognizer, &word_data->lang_words[sub], &best_words);
1297  Tesseract* best_lang_tess = most_recently_used_;
1298  if (!WordsAcceptable(best_words)) {
1299  // Try all the other languages to see if they are any better.
1300  if (most_recently_used_ != this &&
1301  this->RetryWithLanguage(*word_data, recognizer,
1302  &word_data->lang_words[sub_langs_.size()],
1303  &best_words) > 0) {
1304  best_lang_tess = this;
1305  }
// Stop trying sub-languages as soon as the result becomes acceptable.
1306  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1307  ++i) {
1308  if (most_recently_used_ != sub_langs_[i] &&
1309  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
1310  &word_data->lang_words[i],
1311  &best_words) > 0) {
1312  best_lang_tess = sub_langs_[i];
1313  }
1314  }
1315  }
1316  most_recently_used_ = best_lang_tess;
1317  if (!best_words.empty()) {
1318  if (best_words.size() == 1 && !best_words[0]->combination) {
1319  // Move the best single result to the main word.
1320  word_data->word->ConsumeWordResults(best_words[0]);
1321  } else {
1322  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1323  word_data->word = best_words.back();
1324  pr_it->ReplaceCurrentWord(&best_words);
1325  }
1326  ASSERT_HOST(word_data->word->box_word != NULL);
1327  } else {
1328  tprintf("no best words!!\n");
1329  }
1330  clock_t ocr_t = clock();
// NOTE(review): `word` was captured at line 1275, before word_data->word may
// have been reassigned at line 1323 and the current word replaced; confirm
// that `word` and its best_choice are still valid here.
1331  if (tessedit_timing_debug) {
1332  tprintf("%s (ocr took %.2f sec)\n",
1333  word->best_choice->unichar_string().string(),
1334  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1335  }
1336 }
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:869
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1488
WERD_CHOICE * best_choice
Definition: pageres.h:219
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1321
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1344
tesseract::Tesseract * tesseract
Definition: pageres.h:266
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
BOOL8 tess_failed
Definition: pageres.h:272
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
STRING lang
Definition: ccutil.h:69
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::classify_word_pass1 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1344 of file control.cpp.

// Pass-1 recognition of a single word: runs match_word_pass_n(1, ...) on
// *in_word and, when the word did not fail and is not a repeated-character
// word, records whether it would adapt and, if word_adaptable() allows it,
// feeds it to LearnWord() for adaptive training.
// Also sets prev_word_best_choice_ from word_data.prev_word (NULL if there
// is no previous word).
// NOTE(review): listing lines 1353 (the condition guarding the cube-only
// path), 1370-1371 (the body of the blamer check) and 1376 (the statement
// guarded by the doc-dict condition) are absent from this extract.
1346  {
1347  ROW* row = word_data.row;
1348  BLOCK* block = word_data.block;
1349  prev_word_best_choice_ = word_data.prev_word != NULL
1350  ? word_data.prev_word->word->best_choice : NULL;
1351 #ifndef ANDROID_BUILD
1352  // If we only intend to run cube - run it and return.
1354  cube_word_pass1(block, row, *in_word);
1355  return;
1356  }
1357 #endif
1358  WERD_RES* word = *in_word;
1359  match_word_pass_n(1, word, row, block);
1360  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1361  word->tess_would_adapt = AdaptableWord(word);
1362  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1363 
1364  if (adapt_ok) {
1365  // Send word to adaptive classifier for training.
1366  word->BestChoiceToCorrectText();
1367  LearnWord(NULL, word);
1368  // Mark misadaptions if running blamer.
1369  if (word->blamer_bundle != NULL) {
1372  }
1373  }
1374 
1375  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1377  }
1378 }
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1549
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
BOOL8 tess_would_adapt
Definition: pageres.h:281
Definition: ocrrow.h:32
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
bool wordrec_debug_blamer
Definition: wordrec.h:167
Definition: ocrblock.h:30
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
WERD * word
Definition: pageres.h:175
BOOL8 tess_failed
Definition: pageres.h:272
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:850
#define NULL
Definition: host.h:144
bool IsAmbiguous()
Definition: pageres.cpp:443
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
void BestChoiceToCorrectText()
Definition: pageres.cpp:917
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79
void tesseract::Tesseract::classify_word_pass2 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1488 of file control.cpp.

// Pass-2 handling of a single word. For a word not already done, resets
// caps_height, fills in x_height from the row when it is 0, and runs
// match_word_pass_n(2, ...). Then applies SubAndSuperscriptFix() and, under
// conditions only partly visible here, TrainedXheightFix(). check_debug_pt()
// is called at locations 30, 40 and 50.
// NOTE(review): listing lines 1492-1493, 1496, 1505, 1518, 1524, 1527, 1535
// and 1538 are absent from this extract, so several conditions and
// statements (including the early-return tests) are incomplete below.
1490  {
1491  // Return if we do not want to run Tesseract.
1494  word_data.word->best_choice != NULL)
1495  return;
1497  return;
1498  }
1499  ROW* row = word_data.row;
1500  BLOCK* block = word_data.block;
1501  WERD_RES* word = *in_word;
1502  prev_word_best_choice_ = word_data.prev_word != NULL
1503  ? word_data.prev_word->word->best_choice : NULL;
1504 
1506  check_debug_pt(word, 30);
1507  if (!word->done) {
1508  word->caps_height = 0.0;
// Only take the row's x-height when the word has none of its own.
1509  if (word->x_height == 0.0f)
1510  word->x_height = row->x_height();
1511  match_word_pass_n(2, word, row, block);
1512  check_debug_pt(word, 40);
1513  }
1514 
1515  SubAndSuperscriptFix(word);
1516 
1517  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1519  block->classify_rotation().y() == 0.0f) {
1520  // Use the tops and bottoms since they are available.
1521  TrainedXheightFix(word, block, row);
1522  }
1523 
1525  }
1526 #ifndef GRAPHICS_DISABLED
// Debug display: plot the rebuilt word in fx_win (created on first use) and
// zoom to its bounding box.
1528  if (fx_win == NULL)
1529  create_fx_win();
1530  clear_fx_win();
1531  word->rebuild_word->plot(fx_win);
1532  TBOX wbox = word->rebuild_word->bounding_box();
1533  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1534  wbox.right(), wbox.bottom());
1536  }
1537 #endif
1539  check_debug_pt(word, 50);
1540 }
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1402
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1549
bool SubAndSuperscriptFix(WERD_RES *word_res)
static void Update()
Definition: scrollview.cpp:715
UNICHARSET unicharset
Definition: ccutil.h:72
void clear_fx_win()
Definition: drawfx.cpp:73
float caps_height
Definition: pageres.h:296
float x_height() const
Definition: ocrrow.h:61
inT16 right() const
Definition: rect.h:75
float x_height
Definition: pageres.h:295
Definition: ocrrow.h:32
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
void plot(ScrollView *window)
Definition: blobs.cpp:918
FCOORD classify_rotation() const
Definition: ocrblock.h:144
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
inT16 left() const
Definition: rect.h:68
TWERD * rebuild_word
Definition: pageres.h:244
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
bool script_has_xheight() const
Definition: unicharset.h:849
Definition: ocrblock.h:30
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:51
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:881
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
bool top_bottom_useful() const
Definition: unicharset.h:495
BOOL8 tess_failed
Definition: pageres.h:272
Definition: rect.h:30
float y() const
Definition: points.h:212
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
#define SUBLOC_NORM
Definition: errcode.h:59
void create_fx_win()
Definition: drawfx.cpp:60
float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str,
float *  c2 
)

Definition at line 1232 of file control.cpp.

// Classifies a single blob as if it were a word of its own.
// A temporary word is built from a deep copy of blob, inserted into the page
// results as a clone of the current word, classified, and then deleted.
//   pass_n   - pass number forwarded to classify_word_and_language().
//   pr_it    - iterator at the word the blob belongs to; its word iterator
//              is reset before returning.
//   blob     - blob to classify (deep-copied, not modified here).
//   best_str - out: unichar string of the temporary word's raw_choice.
//   c2       - out: certainty * certainty / rating of the raw_choice, or 0
//              when the rating is not positive.
// Returns the certainty of the raw_choice.
1233  {
1234  WERD* real_word = pr_it->word()->word;
// The temporary word inherits the BOL/EOL flags of the real word.
1235  WERD* word = real_word->ConstructFromSingleBlob(
1236  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1237  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1238  // Get a new iterator that points to the new word.
1239  PAGE_RES_IT it(pr_it->page_res);
1240  while (it.word() != word_res && it.word() != NULL) it.forward();
1241  ASSERT_HOST(it.word() == word_res);
1242  WordData wd(it);
1243  // Force full initialization.
// (setup always uses pass 1, whereas classification below uses pass_n)
1244  SetupWordPassN(1, &wd);
1245  classify_word_and_language(pass_n, &it, &wd);
1246  if (debug_noise_removal) {
1247  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1248  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1249  wd.word->raw_choice->max_x_height());
1250  }
1251  float cert = wd.word->raw_choice->certainty();
1252  float rat = wd.word->raw_choice->rating();
// Guard against division by a zero or negative rating.
1253  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1254  *best_str = wd.word->raw_choice->unichar_string();
// Remove the temporary word and resynchronize the caller's iterator.
1255  it.DeleteCurrentWord();
1256  pr_it->ResetWordIterator();
1257  return cert;
1258 }
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
void ResetWordIterator()
Definition: pageres.cpp:1532
#define tprintf(...)
Definition: tprintf.h:31
PAGE_RES * page_res
Definition: pageres.h:658
float x_height
Definition: pageres.h:295
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: werd.h:35
Definition: werd.h:36
Definition: werd.h:60
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
WERD * word
Definition: pageres.h:175
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1268
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
#define NULL
Definition: host.h:144
WERD_RES * word() const
Definition: pageres.h:733
float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str 
)

Definition at line 1190 of file control.cpp.

// Classifies blob together with the outlines selected by ok_outlines, then
// removes the added outlines again.
//   ok_outlines - ok_outlines[i] true means outlines[i] is to be added.
//   outlines    - candidate outlines, indexed in parallel with ok_outlines.
//   blob        - blob to extend, or NULL to build a temporary blob from the
//                 selected outlines alone.
//   best_str    - out: best string from ClassifyBlobAsWord().
// Returns the certainty from ClassifyBlobAsWord() when blob was supplied.
// When the blob was created here, returns -c2 instead (the negated
// certainty^2/rating value).
// NOTE(review): if blob is NULL and no entry of ok_outlines is true, blob
// stays NULL and ol_it is never attached to a list before use; confirm
// callers never do this.
1193  {
1194  C_OUTLINE_IT ol_it;
1195  C_OUTLINE* first_to_keep = NULL;
1196  if (blob != NULL) {
1197  // Add the required outlines to the blob.
1198  ol_it.set_to_list(blob->out_list());
// Remember the blob's original first outline; everything inserted before it
// is removed again after classification.
1199  first_to_keep = ol_it.data();
1200  }
1201  for (int i = 0; i < ok_outlines.size(); ++i) {
1202  if (ok_outlines[i]) {
1203  // This outline is to be added.
1204  if (blob == NULL) {
1205  blob = new C_BLOB(outlines[i]);
1206  ol_it.set_to_list(blob->out_list());
1207  } else {
1208  ol_it.add_before_stay_put(outlines[i]);
1209  }
1210  }
1211  }
1212  float c2;
1213  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1214  ol_it.move_to_first();
1215  if (first_to_keep == NULL) {
1216  // We created blob. Empty its outlines and delete it.
// The outlines are extracted rather than deleted, so deleting the blob
// does not delete them.
1217  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1218  delete blob;
1219  cert = -c2;
1220  } else {
1221  // Remove the outlines that we put in.
1222  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1223  ol_it.extract();
1224  }
1225  }
1226  return cert;
1227 }
int size() const
Definition: genericvector.h:72
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1232
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
#define NULL
Definition: host.h:144
void tesseract::Tesseract::Clear ( )

Definition at line 640 of file tesseractclass.cpp.

// Resets per-image state: calls pixDestroy on each of the held images,
// resets deskew_ and reskew_ to FCOORD(1.0f, 0.0f), clears splitter_, sets
// scaled_factor_ to -1, and then calls Clear() on every sub-language.
640  {
641  pixDestroy(&pix_binary_);
642  pixDestroy(&cube_binary_);
643  pixDestroy(&pix_grey_);
644  pixDestroy(&pix_thresholds_);
645  pixDestroy(&scaled_color_);
646  deskew_ = FCOORD(1.0f, 0.0f);
647  reskew_ = FCOORD(1.0f, 0.0f);
648  splitter_.Clear();
649  scaled_factor_ = -1;
// Apply the same reset to every sub-language instance.
650  for (int i = 0; i < sub_langs_.size(); ++i)
651  sub_langs_[i]->Clear();
652 }
Definition: points.h:189
float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES *  word_res,
float *  baseline_shift 
)

Definition at line 101 of file fixxht.cpp.

// Estimates an x-height for word_res by letting each alphanumeric blob vote,
// and writes a baseline shift to *baseline_shift.
// Returns the median x-height vote divided by denorm.y_scale() when that
// median differs from kBlnXHeight by at least x_ht_min_change. Otherwise
// returns word_res->x_height if a non-zero bottom shift was chosen, else 0.0f.
102  {
// top_stats collects x-height votes; shift_stats collects bottom-shift votes.
103  STATS top_stats(0, MAX_UINT8);
104  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
105  int bottom_shift = 0;
106  int num_blobs = word_res->rebuild_word->NumBlobs();
// Each pass starts from empty statistics; a later pass re-votes with the
// bottom_shift chosen by the previous one applied to every blob.
107  do {
108  top_stats.clear();
109  shift_stats.clear();
110  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
111  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
112  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
// Only alphabetic and digit classes take part in the vote.
113  if (unicharset.get_isalpha(class_id) ||
114  unicharset.get_isdigit(class_id)) {
115  int top = blob->bounding_box().top() + bottom_shift;
116  // Clip the top to the limit of normalized feature space.
117  if (top >= INT_FEAT_RANGE)
118  top = INT_FEAT_RANGE - 1;
119  int bottom = blob->bounding_box().bottom() + bottom_shift;
120  int min_bottom, max_bottom, min_top, max_top;
121  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
122  &min_top, &max_top);
123  // Chars with a wild top range would mess up the result so ignore them.
124  if (max_top - min_top > kMaxCharTopRange)
125  continue;
// Positive misfit_dist means top lies outside the tolerated
// [min_top, max_top] range by that many units.
126  int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
127  top - (max_top + x_ht_acceptance_tolerance));
128  int height = top - kBlnBaselineOffset;
129  if (debug_x_ht_level >= 2) {
130  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
131  unicharset.id_to_unichar(class_id),
132  height, min_bottom, max_bottom, min_top, max_top,
133  bottom, top);
134  }
135  // Use only chars that fit in the expected bottom range, and where
136  // the range of tops is sensibly near the xheight.
137  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
138  bottom - x_ht_acceptance_tolerance <= max_bottom &&
139  min_top > kBlnBaselineOffset &&
140  max_top - kBlnBaselineOffset >= kBlnXHeight &&
141  misfit_dist > 0) {
142  // Compute the x-height position using proportionality between the
143  // actual height and expected height.
144  int min_xht = DivRounded(height * kBlnXHeight,
145  max_top - kBlnBaselineOffset);
146  int max_xht = DivRounded(height * kBlnXHeight,
147  min_top - kBlnBaselineOffset);
148  if (debug_x_ht_level >= 2) {
149  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
150  }
151  // The range of expected heights gets a vote equal to the distance
152  // of the actual top from the expected top.
153  for (int y = min_xht; y <= max_xht; ++y)
154  top_stats.add(y, misfit_dist);
// Bottom is out of range and no shift has been applied yet: vote for a shift.
155  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
156  bottom - x_ht_acceptance_tolerance > max_bottom) &&
157  bottom_shift == 0) {
158  // Get the range of required bottom shift.
159  int min_shift = min_bottom - bottom;
160  int max_shift = max_bottom - bottom;
161  if (debug_x_ht_level >= 2) {
162  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
163  }
164  // The range of expected shifts gets a vote equal to the min distance
165  // of the actual bottom from the expected bottom, spread over the
166  // range of its acceptance.
167  int misfit_weight = abs(min_shift);
// The guard keeps the divisor strictly positive.
168  if (max_shift > min_shift)
169  misfit_weight /= max_shift - min_shift;
170  for (int y = min_shift; y <= max_shift; ++y)
171  shift_stats.add(y, misfit_weight);
172  } else {
173  if (bottom_shift == 0) {
174  // Things with bottoms that are already ok need to say so, on the
175  // 1st iteration only.
176  shift_stats.add(0, kBlnBaselineOffset);
177  }
178  if (debug_x_ht_level >= 2) {
179  tprintf(" already OK\n");
180  }
181  }
182  }
183  }
// Adopt the median shift only when shift votes outweigh x-height votes.
184  if (shift_stats.get_total() > top_stats.get_total()) {
185  bottom_shift = IntCastRounded(shift_stats.median());
186  if (debug_x_ht_level >= 2) {
187  tprintf("Applying bottom shift=%d\n", bottom_shift);
188  }
189  }
// Go round again only if a non-zero shift is in force and shift votes still
// outweigh x-height votes.
190  } while (bottom_shift != 0 &&
191  top_stats.get_total() < shift_stats.get_total());
192  // Baseline shift is opposite sign to the bottom shift.
193  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194  if (debug_x_ht_level >= 2) {
195  tprintf("baseline shift=%g\n", *baseline_shift);
196  }
// No x-height votes at all: fall back to the existing x-height when a shift
// was applied, otherwise 0.0f.
197  if (top_stats.get_total() == 0)
198  return bottom_shift != 0 ? word_res->x_height : 0.0f;
199  // The new xheight is just the median vote, which is then scaled out
200  // of BLN space back to pixel space to get the x-height in pixel space.
201  float new_xht = top_stats.median();
202  if (debug_x_ht_level >= 2) {
203  tprintf("Median xht=%f\n", new_xht);
204  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
205  new_xht, new_xht / word_res->denorm.y_scale());
206  }
207  // The xheight must change by at least x_ht_min_change to be used.
208  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
209  return new_xht / word_res->denorm.y_scale();
210  else
211  return bottom_shift != 0 ? word_res->x_height : 0.0f;
212 }
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
#define MAX(x, y)
Definition: ndminx.h:24
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
Definition: statistc.h:33
UNICHARSET unicharset
Definition: ccutil.h:72
float x_height
Definition: pageres.h:295
int NumBlobs() const
Definition: blobs.h:425
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
DENORM denorm
Definition: pageres.h:190
const int kBlnBaselineOffset
Definition: normalis.h:29
#define MAX_UINT8
Definition: host.h:121
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
int UNICHAR_ID
Definition: unichar.h:33
inT16 bottom() const
Definition: rect.h:61
int DivRounded(int a, int b)
Definition: helpers.h:166
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int kMaxCharTopRange
Definition: fixxht.cpp:66
int IntCastRounded(double x)
Definition: helpers.h:172
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
float y_scale() const
Definition: normalis.h:272
#define INT_FEAT_RANGE
Definition: float2int.h:27
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES * word_res)

Definition at line 663 of file docqual.cpp.

// Rewrites every '~' in word_res->best_choice to '-' and every '^' to ' '.
// Each rewritten position that was still accepted in reject_map is marked
// with setrej_unlv_rej().
663  {
664  int i;
// Look up the ids of the four characters involved in the word's own unicharset.
665  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
666  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
667  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
668  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
// The loop bound is the reject map length; the same index is used for best_choice.
669  for (i = 0; i < word_res->reject_map.length(); ++i) {
// '~' becomes '-'.
670  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
671  word_res->best_choice->set_unichar_id(unichar_dash, i);
672  if (word_res->reject_map[i].accepted ())
673  word_res->reject_map[i].setrej_unlv_rej ();
674  }
// '^' becomes ' '.
675  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
676  word_res->best_choice->set_unichar_id(unichar_space, i);
677  if (word_res->reject_map[i].accepted ())
678  word_res->reject_map[i].setrej_unlv_rej ();
679  }
680  }
681 }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
int UNICHAR_ID
Definition: unichar.h:33
bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns
false if an invalid UNICHAR_ID is encountered.

Definition at line 535 of file applybox.cpp.

// Walks utf8 one space-delimited token at a time, converting each token to a
// UNICHAR_ID and appending it to class_ids. Stops and returns false at the
// first token that maps to INVALID_UNICHAR_ID; ids appended before that point
// stay in class_ids.
536  {
// step is the number of bytes to advance past the current token and any
// spaces that follow it.
537  for (int step = 0; *utf8 != '\0'; utf8 += step) {
538  const char* next_space = strchr(utf8, ' ');
// No further space: the token runs to the end of the string.
539  if (next_space == NULL)
540  next_space = utf8 + strlen(utf8);
541  step = next_space - utf8;
542  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
543  if (class_id == INVALID_UNICHAR_ID) {
544  return false;
545  }
// Swallow the run of spaces after the token.
546  while (utf8[step] == ' ')
547  ++step;
548  class_ids->push_back(class_id);
549  }
550  return true;
551 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int push_back(T object)
UNICHARSET unicharset
Definition: ccutil.h:72
int UNICHAR_ID
Definition: unichar.h:33
#define NULL
Definition: host.h:144
void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES * page_res)

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 772 of file applybox.cpp.

// For every word in page_res, builds a WERD_CHOICE from the word's
// correct_text entries and logs it as both the raw and the cooked choice
// after clearing the word's existing choices.
772  {
773  PAGE_RES_IT pr_it(page_res);
774  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
775  word_res = pr_it.forward()) {
// Space for one unichar per correct_text entry is reserved up front.
776  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
777  word_res->correct_text.size());
778  for (int i = 0; i < word_res->correct_text.size(); ++i) {
779  // The part before the first space is the real ground truth, and the
780  // rest is the bounding box location and page number.
781  GenericVector<STRING> tokens;
782  word_res->correct_text[i].split(' ', &tokens);
// NOTE(review): tokens[0] is read without checking that split produced any
// token -- confirm correct_text entries are never empty.
783  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
// The blob count comes from best_state[i]; rating and certainty are both 0.
784  choice->append_unichar_id_space_allocated(char_id,
785  word_res->best_state[i],
786  0.0f, 0.0f);
787  }
788  word_res->ClearWordChoices();
789  word_res->LogNewRawChoice(choice);
790  word_res->LogNewCookedChoice(1, false, choice);
791  }
792 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
UNICHARSET unicharset
Definition: ccutil.h:72
int UNICHAR_ID
Definition: unichar.h:33
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
inT16 tesseract::Tesseract::count_alphanums ( const WERD_CHOICE & word)

Definition at line 410 of file output.cpp.

// Returns the number of positions in word whose unichar is alphabetic or a
// digit according to word.unicharset().
410  {
411  int count = 0;
412  for (int i = 0; i < word.length(); ++i) {
413  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
414  word.unicharset()->get_isdigit(word.unichar_id(i)))
415  count++;
416  }
// The int counter is converted to the inT16 return type here.
417  return count;
418 }
int length() const
Definition: ratngs.h:300
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
inT16 tesseract::Tesseract::count_alphanums ( WERD_RES * word)

Definition at line 558 of file reject.cpp.

// Returns the number of positions that are accepted in word_res->reject_map
// and whose best_choice unichar is alphabetic or a digit according to
// word_res->uch_set.
558  {
559  int count = 0;
560  const WERD_CHOICE *best_choice = word_res->best_choice;
// The loop bound is the reject map length; the same index is used for best_choice.
561  for (int i = 0; i < word_res->reject_map.length(); ++i) {
562  if ((word_res->reject_map[i].accepted()) &&
563  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
564  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
565  count++;
566  }
567  }
568  return count;
569 }
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int count(LIST var_list)
Definition: oldlist.cpp:108
inT16 tesseract::Tesseract::count_alphas ( const WERD_CHOICE & word)

Definition at line 400 of file output.cpp.

// Returns the number of positions in word whose unichar is alphabetic
// according to word.unicharset().
400  {
401  int count = 0;
402  for (int i = 0; i < word.length(); ++i) {
403  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
404  count++;
405  }
406  return count;
407 }
int length() const
Definition: ratngs.h:300
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
inT16 tesseract::Tesseract::count_outline_errs ( char  c,
inT16  outline_count 
)

Definition at line 128 of file docqual.cpp.

// Returns how far outline_count is from the number of outlines expected for
// character c: 0 when c is listed in outlines_odd, otherwise the absolute
// difference from 2 (c listed in outlines_2) or from 1 (any other c).
128  {
129  int expected_outline_count;
130 
// outlines_odd is checked first, so it wins if c appears in both lists.
131  if (STRING (outlines_odd).contains (c))
132  return 0; //Dont use this char
133  else if (STRING (outlines_2).contains (c))
134  expected_outline_count = 2;
135  else
136  expected_outline_count = 1;
137  return abs (outline_count - expected_outline_count);
138 }
Definition: strngs.h:44
int tesseract::Tesseract::CountMisfitTops ( WERD_RES * word_res)

Definition at line 69 of file fixxht.cpp.

// Returns the number of alphanumeric blobs in word_res->rebuild_word whose
// bounding-box top falls outside the [min_top, max_top] range reported by
// unicharset for the blob's class, widened by x_ht_acceptance_tolerance on
// both sides. Classes whose top range exceeds kMaxCharTopRange are skipped.
69  {
70  int bad_blobs = 0;
71  int num_blobs = word_res->rebuild_word->NumBlobs();
72  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
73  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
74  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
75  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
76  int top = blob->bounding_box().top();
// Clip the top to INT_FEAT_RANGE - 1.
77  if (top >= INT_FEAT_RANGE)
78  top = INT_FEAT_RANGE - 1;
79  int min_bottom, max_bottom, min_top, max_top;
80  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
81  &min_top, &max_top);
// A class with too wide a top range is not counted either way.
82  if (max_top - min_top > kMaxCharTopRange)
83  continue;
84  bool bad = top < min_top - x_ht_acceptance_tolerance ||
85  top > max_top + x_ht_acceptance_tolerance;
86  if (bad)
87  ++bad_blobs;
88  if (debug_x_ht_level >= 1) {
89  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
90  unicharset.id_to_unichar(class_id),
91  bad ? "Misfit" : "OK", top, min_top, max_top,
92  static_cast<int>(x_ht_acceptance_tolerance));
93  }
94  }
95  }
96  return bad_blobs;
97 }
Definition: blobs.h:261
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
int NumBlobs() const
Definition: blobs.h:425
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
int UNICHAR_ID
Definition: unichar.h:33
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int kMaxCharTopRange
Definition: fixxht.cpp:66
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define INT_FEAT_RANGE
Definition: float2int.h:27
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
bool tesseract::Tesseract::create_cube_box_word ( Boxa *  char_boxes,
int  num_chars,
TBOX  word_box,
BoxWord * box_word 
)

Definition at line 116 of file cube_control.cpp.

// Converts the first num_chars boxes of char_boxes to TBOXes via
// char_box_to_tbox() and inserts them into box_word in index order.
// Returns false (after an optional warning) when box_word is null, and true
// otherwise.
// NOTE(review): char_boxes is not null-checked and each boxaGetBox result is
// dereferenced directly -- confirm callers always pass a valid Boxa holding at
// least num_chars boxes.
119  {
120  if (!box_word) {
121  if (cube_debug_level > 0) {
122  tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
123  }
124  return false;
125  }
126 
127  // Find the x-coordinate of left-most char_box, which could be
128  // nonzero if the word image was padded before recognition took place.
// -1 marks "not set yet"; the first box always replaces it.
129  int x_offset = -1;
130  for (int i = 0; i < num_chars; ++i) {
131  Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
132  if (x_offset < 0 || char_box->x < x_offset) {
133  x_offset = char_box->x;
134  }
// Each box obtained with L_CLONE is handed back to boxDestroy.
135  boxDestroy(&char_box);
136  }
137 
// Second pass: convert each box using the offset found above.
138  for (int i = 0; i < num_chars; ++i) {
139  Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
140  TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
141  boxDestroy(&char_box);
142  box_word->InsertBox(i, tbox);
143  }
144  return true;
145 }
TBOX char_box_to_tbox(Box *char_box, TBOX word_box, int x_offset)
#define tprintf(...)
Definition: tprintf.h:31
Definition: rect.h:30
void tesseract::Tesseract::cube_combine_word ( CubeObject * cube_obj,
WERD_RES * cube_word,
WERD_RES * tess_word 
)

Definition at line 283 of file cube_control.cpp.

// Chooses between the tesseract result (tess_word) and the cube result
// (cube_word) using the probability returned by tess_cube_combiner_.
// When the probability reaches CombinerClassifierThresh() tess_word is kept,
// with its rating and certainty halved if both engines produced the same
// string. Otherwise cube_word takes over tesseract's rating and certainty and
// is consumed into tess_word.
284  {
285  float combiner_prob = tess_cube_combiner_->CombineResults(tess_word,
286  cube_obj);
287  // If combiner probability is greater than tess/cube combiner
288  // classifier threshold, i.e. tesseract wins, then just return the
289  // tesseract result unchanged, as the combiner knows nothing about how
290  // correct the answer is. If cube and tesseract agree, then improve the
291  // scores before returning.
292  WERD_CHOICE* tess_best = tess_word->best_choice;
293  WERD_CHOICE* cube_best = cube_word->best_choice;
294  if (cube_debug_level || classify_debug_level) {
295  tprintf("Combiner prob = %g vs threshold %g\n",
296  combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh());
297  }
298  if (combiner_prob >=
299  cube_cntxt_->Params()->CombinerClassifierThresh()) {
300  if (tess_best->unichar_string() == cube_best->unichar_string()) {
301  // Cube and tess agree, so improve the scores.
302  tess_best->set_rating(tess_best->rating() / 2);
303  tess_best->set_certainty(tess_best->certainty() / 2);
304  }
305  return;
306  }
307  // Cube wins.
308  // It is better for the language combiner to have all tesseract scores,
309  // so put them in the cube result.
310  cube_best->set_rating(tess_best->rating());
311  cube_best->set_certainty(tess_best->certainty());
312  if (cube_debug_level || classify_debug_level) {
313  tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n",
314  tess_best->unichar_string().string(),
315  cube_best->unichar_string().string());
316  }
// tess_word takes over the contents of cube_word.
317  tess_word->ConsumeWordResults(cube_word);
318 }
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
float rating() const
Definition: ratngs.h:324
double CombinerClassifierThresh() const
Definition: tuning_params.h:63
void set_certainty(float new_val)
Definition: ratngs.h:369
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const STRING & unichar_string() const
Definition: ratngs.h:524
float certainty() const
Definition: ratngs.h:327
TuningParams * Params() const
const char * string() const
Definition: strngs.cpp:193
void set_rating(float new_val)
Definition: ratngs.h:366
bool tesseract::Tesseract::cube_recognize ( CubeObject * cube_obj,
BLOCK * block,
WERD_RES * word 
)

Definition at line 326 of file cube_control.cpp.

327  {
328  // Run cube
329  WordAltList *cube_alt_list = cube_obj->RecognizeWord();
330  if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
331  if (cube_debug_level > 0) {
332  tprintf("Cube returned nothing for word at:");
333  word->word->bounding_box().print();
334  }
335  word->SetupFake(unicharset);
336  return false;
337  }
338 
339  // Get cube's best result and its probability, mapped to tesseract's
340  // certainty range
341  char_32 *cube_best_32 = cube_alt_list->Alt(0);
342  double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
343  float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
344  string cube_best_str;
345  CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);
346 
347  // Retrieve Cube's character bounding boxes and CharSamples,
348  // corresponding to the most recent call to RecognizeWord().
349  Boxa *char_boxes = NULL;
350  CharSamp **char_samples = NULL;;
351  int num_chars;
352  if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)
353  && cube_debug_level > 0) {
354  tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
355  "cube state.\n");
356  word->SetupFake(unicharset);
357  return false;
358  }
359 
360  // Convert cube's character bounding boxes to a BoxWord.
361  BoxWord cube_box_word;
362  TBOX tess_word_box = word->word->bounding_box();
363  if (word->denorm.block() != NULL)
364  tess_word_box.rotate(word->denorm.block()->re_rotation());
365  bool box_word_success = create_cube_box_word(char_boxes, num_chars,
366  tess_word_box,
367  &cube_box_word);
368  boxaDestroy(&char_boxes);
369  if (!box_word_success) {
370  if (cube_debug_level > 0) {
371  tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
372  "create cube BoxWord\n");
373  }
374  word->SetupFake(unicharset);
375  return false;
376  }
377 
378  // Fill tesseract result's fields with cube results
379  fill_werd_res(cube_box_word, cube_best_str.c_str(), word);
380 
381  // Create cube's best choice.
382  BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
383  for (int i = 0; i < num_chars; ++i) {
384  UNICHAR_ID uch_id =
385  cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
386  choices[i] = new BLOB_CHOICE(uch_id, -cube_certainty, cube_certainty,
387  -1, 0.0f, 0.0f, 0.0f, BCC_STATIC_CLASSIFIER);
388  }
389  word->FakeClassifyWord(num_chars, choices);
390  // within a word, cube recognizes the word in reading order.
392  delete [] choices;
393  delete [] char_samples;
394 
395  // Some sanity checks
396  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
397 
399  tprintf("Cube result: %s r=%g, c=%g\n",
400  word->best_choice->unichar_string().string(),
401  word->best_choice->rating(),
402  word->best_choice->certainty());
403  }
404  return true;
405 }
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:343
float rating() const
Definition: ratngs.h:324
inT32 length() const
Definition: rejctmap.h:237
void fill_werd_res(const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res)
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool create_cube_box_word(Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
bool set_unichars_in_script_order(bool in_script_order)
Definition: ratngs.h:514
FCOORD re_rotation() const
Definition: ocrblock.h:138
float certainty() const
Definition: ratngs.h:327
bool extract_cube_state(CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
DENORM denorm
Definition: pageres.h:190
int UnicharID(const char_32 *str) const
Definition: char_set.h:80
int UNICHAR_ID
Definition: unichar.h:33
WERD * word
Definition: pageres.h:175
static double Cost2Prob(int cost)
Definition: cube_utils.cpp:47
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:282
CharSet * CharacterSet() const
Definition: rect.h:30
signed int char_32
Definition: string_32.h:40
const BLOCK * block() const
Definition: normalis.h:275
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void rotate(const FCOORD &vec)
Definition: rect.h:189
CubeObject * tesseract::Tesseract::cube_recognize_word ( BLOCK * block,
WERD_RES * word 
)

Definition at line 246 of file cube_control.cpp.

// Builds a CubeObject for word's bounding box and runs cube_recognize() on it.
// Returns the new CubeObject on success; the caller receives the pointer
// created with new below. Returns NULL when there is no cube binary image or
// cube context, when the block is rotated, or when cube_recognize() fails.
// The first two cases set word up as a fake result here; cube_recognize()
// does the same on its own failure paths.
246  {
247  if (!cube_binary_ || !cube_cntxt_) {
// The message is only printed for a missing image, not a missing context.
248  if (cube_debug_level > 0 && !cube_binary_)
249  tprintf("Tesseract::run_cube(): NULL binary image.\n");
250  word->SetupFake(unicharset);
251  return NULL;
252  }
253  TBOX word_box = word->word->bounding_box();
// Any re_rotation other than exactly (1, 0) counts as rotated.
254  if (block != NULL && (block->re_rotation().x() != 1.0f ||
255  block->re_rotation().y() != 0.0f)) {
256  // TODO(rays) We have to rotate the bounding box to get the true coords.
257  // This will be achieved in the future via DENORM.
258  // In the mean time, cube can't process this word.
259  if (cube_debug_level > 0) {
260  tprintf("Cube can't process rotated word at:");
261  word_box.print();
262  }
263  word->SetupFake(unicharset);
264  return NULL;
265  }
// The y argument is measured from the image height down to the box top.
266  CubeObject* cube_obj = new tesseract::CubeObject(
267  cube_cntxt_, cube_binary_, word_box.left(),
268  pixGetHeight(cube_binary_) - word_box.top(),
269  word_box.width(), word_box.height());
// On failure the object is deleted here and nothing is returned.
270  if (!cube_recognize(cube_obj, block, word)) {
271  delete cube_obj;
272  return NULL;
273  }
274  return cube_obj;
275 }
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:343
float x() const
Definition: points.h:209
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
inT16 left() const
Definition: rect.h:68
FCOORD re_rotation() const
Definition: ocrblock.h:138
WERD * word
Definition: pageres.h:175
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
Definition: rect.h:30
float y() const
Definition: points.h:212
bool cube_recognize(CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
void tesseract::Tesseract::cube_word_pass1 ( BLOCK * block,
ROW * row,
WERD_RES * word 
)

Definition at line 235 of file cube_control.cpp.

// Runs cube_recognize_word() on word and immediately deletes the returned
// CubeObject. The row parameter is not used in this body.
235  {
236  CubeObject *cube_obj = cube_recognize_word(block, word);
// cube_recognize_word() may return NULL; delete on NULL is a no-op.
237  delete cube_obj;
238 }
CubeObject * cube_recognize_word(BLOCK *block, WERD_RES *word)
void tesseract::Tesseract::debug_word ( PAGE_RES * page_res,
const TBOX & selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 641 of file pgedit.cpp.

// Resets the adaptive classifier and then calls recog_all_words() on page_res
// with selection_box as the target word box and word_config_ as the word
// config.
641  {
642  ResetAdaptiveClassifier();
// No monitor is supplied and dopasses is 0.
643  recog_all_words(page_res, NULL, &selection_box, word_config_.string(), 0);
644 }
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES * page_res)

Definition at line 2015 of file control.cpp.

// For each word in page_res whose best choice fails valid_word(), replaces
// the best choice with the first entry of word->best_choices that passes
// valid_word(). Words with a single choice, or with no valid alternate, are
// left unchanged.
2015  {
2016  PAGE_RES_IT word_it(page_res);
2017  for (WERD_RES* word = word_it.word(); word != NULL;
2018  word = word_it.forward()) {
2019  if (word->best_choices.singleton())
2020  continue; // There are no alternates.
2021 
2022  WERD_CHOICE* best = word->best_choice;
2023  if (word->tesseract->getDict().valid_word(*best) != 0)
2024  continue; // The best choice is in the dictionary.
2025 
// Scan the choices in list order and stop at the first one that validates.
2026  WERD_CHOICE_IT choice_it(&word->best_choices);
2027  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2028  choice_it.forward()) {
2029  WERD_CHOICE* alternate = choice_it.data();
2030  if (word->tesseract->getDict().valid_word(*alternate)) {
2031  // The alternate choice is in the dictionary.
2032  if (tessedit_bigram_debug) {
2033  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2034  best->unichar_string().string(),
2035  alternate->unichar_string().string());
2036  }
2037  // Replace the 'best' choice with a better choice.
2038  word->ReplaceBestChoice(alternate);
2039  break;
2040  }
2041  }
2042  }
2043 }
#define tprintf(...)
Definition: tprintf.h:31
const STRING & unichar_string() const
Definition: ratngs.h:524
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
BOOL8 tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES * word,
int  char_position 
)

Definition at line 344 of file fixspace.cpp.

// Returns true when the unichar at char_position in word->best_choice is a
// digit, or when the choice's permuter is NUMBER_PERM and the first byte of
// that unichar is listed in numeric_punctuation.
344  {
345  int i;
346  int offset;
347 
// Empty-bodied loop: sums the byte lengths of the unichars before
// char_position, leaving i == char_position and offset at the byte where
// that unichar starts in unichar_string().
348  for (i = 0, offset = 0; i < char_position;
349  offset += word->best_choice->unichar_lengths()[i++]);
350  return (
351  word->uch_set->get_isdigit(
352  word->best_choice->unichar_string().string() + offset,
353  word->best_choice->unichar_lengths()[i]) ||
354  (word->best_choice->permuter() == NUMBER_PERM &&
355  STRING(numeric_punctuation).contains(
356  word->best_choice->unichar_string().string()[offset])));
357 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:531
const STRING & unichar_string() const
Definition: ratngs.h:524
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHARSET * uch_set
Definition: pageres.h:192
uinT8 permuter() const
Definition: ratngs.h:343
Definition: strngs.h:44
const char * string() const
Definition: strngs.cpp:193
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
void tesseract::Tesseract::do_re_display ( BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 308 of file pgedit.cpp.

// Clears image_win and repaints the page: optionally draws the binary image,
// then calls word_painter for every word of current_page_res, drawing the
// row baseline and block outline whenever a new row or block is entered and
// the matching display flag is set.
309  {
// Blocks are numbered from 1 as they are plotted.
310  int block_count = 1;
311 
312  image_win->Clear();
313  if (display_image != 0) {
314  image_win->Image(pix_binary_, 0, 0);
315  }
316 
317  image_win->Brush(ScrollView::NONE);
318  PAGE_RES_IT pr_it(current_page_res);
319  for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
// word_painter is a pointer to a Tesseract member function.
320  (this->*word_painter)(&pr_it);
// Plot each baseline and block only once, on the first word of the row/block.
321  if (display_baselines && pr_it.row() != pr_it.prev_row())
322  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
323  if (display_blocks && pr_it.block() != pr_it.prev_block())
324  pr_it.block()->block->plot(image_win, block_count++, ScrollView::RED);
325  }
326  image_win->Update();
327 }
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
static void Update()
Definition: scrollview.cpp:715
BOOL8 display_baselines
Definition: pgedit.cpp:126
BOOL8 display_blocks
Definition: pgedit.cpp:125
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:773
BOOL8 display_image
Definition: pgedit.cpp:124
void Clear()
Definition: scrollview.cpp:595
ScrollView * image_win
Definition: pgedit.cpp:107
void Brush(Color color)
Definition: scrollview.cpp:732
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT & page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 235 of file docqual.cpp.

237  {
238  inT16 block_no = 0;
239  inT16 row_no = 0;
240  BLOCK_RES *current_block;
241  ROW_RES *current_row;
242 
243  BOOL8 rej_word;
244  BOOL8 prev_word_rejected;
245  inT16 char_quality = 0;
246  inT16 accepted_char_quality;
247 
248  if (page_res_it.page_res->rej_count * 100.0 /
249  page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
250  reject_whole_page(page_res_it);
252  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
253  page_res_it.page_res->char_count,
254  page_res_it.page_res->rej_count);
255  }
256  } else {
258  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
259  page_res_it.page_res->char_count,
260  page_res_it.page_res->rej_count);
261  }
262 
263  /* Walk blocks testing for block rejection */
264 
265  page_res_it.restart_page();
266  WERD_RES* word;
267  while ((word = page_res_it.word()) != NULL) {
268  current_block = page_res_it.block();
269  block_no = current_block->block->index();
270  if (current_block->char_count > 0 &&
271  (current_block->rej_count * 100.0 / current_block->char_count) >
274  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
275  block_no, current_block->char_count,
276  current_block->rej_count);
277  }
278  prev_word_rejected = FALSE;
279  while ((word = page_res_it.word()) != NULL &&
280  (page_res_it.block() == current_block)) {
282  rej_word = word->reject_map.reject_count() > 0 ||
284  if (rej_word && tessedit_dont_blkrej_good_wds &&
287  *word->uch_set,
288  word->best_choice->unichar_string().string(),
289  word->best_choice->unichar_lengths().string()) !=
290  AC_UNACCEPTABLE) {
291  word_char_quality(word, page_res_it.row()->row,
292  &char_quality,
293  &accepted_char_quality);
294  rej_word = char_quality != word->reject_map.length();
295  }
296  } else {
297  rej_word = TRUE;
298  }
299  if (rej_word) {
300  /*
301  Reject spacing if both current and prev words are rejected.
302  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
303  generated more space errors.
304  */
306  prev_word_rejected &&
307  page_res_it.prev_row() == page_res_it.row() &&
308  word->word->space() == 1)
309  word->reject_spaces = TRUE;
311  }
312  prev_word_rejected = rej_word;
313  page_res_it.forward();
314  }
315  } else {
317  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
318  block_no, page_res_it.block()->char_count,
319  page_res_it.block()->rej_count);
320  }
321 
322  /* Walk rows in block testing for row rejection */
323  row_no = 0;
324  while (page_res_it.word() != NULL &&
325  page_res_it.block() == current_block) {
326  current_row = page_res_it.row();
327  row_no++;
328  /* Reject whole row if:
329  fraction of chars on row which are rejected exceed a limit AND
330  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
331  limit
332  */
333  if (current_row->char_count > 0 &&
334  (current_row->rej_count * 100.0 / current_row->char_count) >
336  (current_row->whole_word_rej_count * 100.0 /
337  current_row->rej_count) <
340  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
341  row_no, current_row->char_count,
342  current_row->rej_count);
343  }
344  prev_word_rejected = FALSE;
345  while ((word = page_res_it.word()) != NULL &&
346  page_res_it.row () == current_row) {
347  /* Preserve words on good docs unless they are mostly rejected*/
348  if (!tessedit_row_rej_good_docs && good_quality_doc) {
349  rej_word = word->reject_map.reject_count() /
350  static_cast<float>(word->reject_map.length()) >
353  /* Preserve perfect words anyway */
354  rej_word = word->reject_map.reject_count() > 0 ||
356  if (rej_word && tessedit_dont_rowrej_good_wds &&
359  word->best_choice->unichar_string().string(),
360  word->best_choice->unichar_lengths().string()) !=
361  AC_UNACCEPTABLE) {
362  word_char_quality(word, page_res_it.row()->row,
363  &char_quality,
364  &accepted_char_quality);
365  rej_word = char_quality != word->reject_map.length();
366  }
367  } else {
368  rej_word = TRUE;
369  }
370  if (rej_word) {
371  /*
372  Reject spacing if both current and prev words are rejected.
373  NOTE - this is NOT restricted to FUZZY spaces. - When tried
374  this generated more space errors.
375  */
377  prev_word_rejected &&
378  page_res_it.prev_row() == page_res_it.row() &&
379  word->word->space () == 1)
380  word->reject_spaces = TRUE;
382  }
383  prev_word_rejected = rej_word;
384  page_res_it.forward();
385  }
386  } else {
388  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
389  row_no, current_row->char_count, current_row->rej_count);
390  }
391  while (page_res_it.word() != NULL &&
392  page_res_it.row() == current_row)
393  page_res_it.forward();
394  }
395  }
396  }
397  }
398  }
399 }
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
inT32 char_count
Definition: pageres.h:60
#define tprintf(...)
Definition: tprintf.h:31
inT32 whole_word_rej_count
Definition: pageres.h:130
BOOL8 reject_spaces
Definition: pageres.h:317
PAGE_RES * page_res
Definition: pageres.h:658
const STRING & unichar_lengths() const
Definition: ratngs.h:531
unsigned char BOOL8
Definition: host.h:113
BLOCK * block
Definition: pageres.h:99
const STRING & unichar_string() const
Definition: ratngs.h:524
double tessedit_whole_wd_rej_row_percent
BLOCK_RES * block() const
Definition: pageres.h:739
double tessedit_reject_block_percent
WERD_RES * forward()
Definition: pageres.h:713
WERD_RES * restart_page()
Definition: pageres.h:680
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
inT32 char_count
Definition: pageres.h:100
void rej_word_block_rej()
Definition: rejctmap.cpp:506
const UNICHARSET * uch_set
Definition: pageres.h:192
ROW_RES * row() const
Definition: pageres.h:736
inT32 rej_count
Definition: pageres.h:61
void rej_word_row_rej()
Definition: rejctmap.cpp:515
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
inT32 rej_count
Definition: pageres.h:129
inT32 char_count
Definition: pageres.h:128
double tessedit_reject_row_percent
WERD * word
Definition: pageres.h:175
inT32 rej_count
Definition: pageres.h:101
Unacceptable word.
Definition: control.h:36
bool tessedit_preserve_row_rej_perfect_wds
#define FALSE
Definition: capi.h:29
double tessedit_good_doc_still_rowrej_wd
ROW * row
Definition: pageres.h:127
inT16 reject_count()
Definition: rejctmap.h:243
#define TRUE
Definition: capi.h:28
int index() const
Definition: pdblock.h:77
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:410
double tessedit_reject_doc_percent
uinT8 space()
Definition: werd.h:104
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
ROW_RES * prev_row() const
Definition: pageres.h:727
bool tessedit_preserve_blk_rej_perfect_wds
WERD_RES * word() const
Definition: pageres.h:733
short inT16
Definition: host.h:100
void tesseract::Tesseract::dont_allow_1Il ( WERD_RES * word)

Definition at line 526 of file reject.cpp.

526  {
527  int i = 0;
528  int offset;
529  int word_len = word->reject_map.length();
530  const char *s = word->best_choice->unichar_string().string();
531  const char *lengths = word->best_choice->unichar_lengths().string();
532  BOOL8 accepted_1Il = FALSE;
533 
534  for (i = 0, offset = 0; i < word_len;
535  offset += word->best_choice->unichar_lengths()[i++]) {
536  if (word->reject_map[i].accepted()) {
537  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
538  accepted_1Il = TRUE;
539  } else {
540  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
541  word->uch_set->get_isdigit(s + offset, lengths[i]))
542  return; // >=1 non 1Il ch accepted
543  }
544  }
545  }
546  if (!accepted_1Il)
547  return; //Nothing to worry about
548 
549  for (i = 0, offset = 0; i < word_len;
550  offset += word->best_choice->unichar_lengths()[i++]) {
551  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
552  word->reject_map[i].accepted())
553  word->reject_map[i].setrej_postNN_1Il();
554  }
555 }
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_lengths() const
Definition: ratngs.h:531
unsigned char BOOL8
Definition: host.h:113
const STRING & unichar_string() const
Definition: ratngs.h:524
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHARSET * uch_set
Definition: pageres.h:192
#define FALSE
Definition: capi.h:29
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define TRUE
Definition: capi.h:28
Definition: strngs.h:44
const char * string() const
Definition: strngs.cpp:193
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
inT16  score,
inT16  mode,
BOOL8  improved 
)

Definition at line 450 of file fixspace.cpp.

451  {
452  WERD_RES_IT word_res_it(&perm);
453 
454  if (debug_fix_space_level > 0) {
455  if (mode == 1) {
456  stats_.dump_words_str = "";
457  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
458  word_res_it.forward()) {
459  if (!word_res_it.data()->part_of_combo) {
460  stats_.dump_words_str +=
461  word_res_it.data()->best_choice->unichar_string();
462  stats_.dump_words_str += ' ';
463  }
464  }
465  }
466 
467  if (debug_fix_space_level > 1) {
468  switch (mode) {
469  case 1:
470  tprintf("EXTRACTED (%d): \"", score);
471  break;
472  case 2:
473  tprintf("TESTED (%d): \"", score);
474  break;
475  case 3:
476  tprintf("RETURNED (%d): \"", score);
477  break;
478  }
479 
480  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
481  word_res_it.forward()) {
482  if (!word_res_it.data()->part_of_combo) {
483  tprintf("%s/%1d ",
484  word_res_it.data()->best_choice->unichar_string().string(),
485  (int)word_res_it.data()->best_choice->permuter());
486  }
487  }
488  tprintf("\"\n");
489  } else if (improved) {
490  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
491  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
492  word_res_it.forward()) {
493  if (!word_res_it.data()->part_of_combo) {
494  tprintf("%s/%1d ",
495  word_res_it.data()->best_choice->unichar_string().string(),
496  (int)word_res_it.data()->best_choice->permuter());
497  }
498  }
499  tprintf("\"\n");
500  }
501  }
502 }
#define tprintf(...)
Definition: tprintf.h:31
CMD_EVENTS mode
Definition: pgedit.cpp:116
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::end_tesseract ( )

Definition at line 471 of file tessedit.cpp.

471  {
472  end_recog();
473 }
inT16 tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 240 of file fixspace.cpp.

240  {
241  WERD_RES_IT word_res_it(&word_res_list);
242  inT16 total_score = 0;
243  inT16 word_count = 0;
244  inT16 done_word_count = 0;
245  inT16 word_len;
246  inT16 i;
247  inT16 offset;
248  WERD_RES *word; // current word
249  inT16 prev_word_score = 0;
250  BOOL8 prev_word_done = FALSE;
251  BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
252  BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
253  BOOL8 current_char_1 = FALSE;
254  BOOL8 current_word_ok_so_far;
255  STRING punct_chars = "!\"`',.:;";
256  BOOL8 prev_char_punct = FALSE;
257  BOOL8 current_char_punct = FALSE;
258  BOOL8 word_done = FALSE;
259 
260  do {
261  word = word_res_it.data();
262  word_done = fixspace_thinks_word_done(word);
263  word_count++;
264  if (word->tess_failed) {
265  total_score += prev_word_score;
266  if (prev_word_done)
267  done_word_count++;
268  prev_word_score = 0;
269  prev_char_1 = FALSE;
270  prev_char_digit = FALSE;
271  prev_word_done = FALSE;
272  } else {
273  /*
274  Can we add the prev word score and potentially count this word?
275  Yes IF it didnt end in a 1 when the first char of this word is a digit
276  AND it didnt end in a digit when the first char of this word is a 1
277  */
278  word_len = word->reject_map.length();
279  current_word_ok_so_far = FALSE;
280  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
281  (prev_char_digit && (
282  (word_done &&
283  word->best_choice->unichar_lengths().string()[0] == 1 &&
284  word->best_choice->unichar_string()[0] == '1') ||
285  (!word_done && STRING(conflict_set_I_l_1).contains(
286  word->best_choice->unichar_string()[0])))))) {
287  total_score += prev_word_score;
288  if (prev_word_done)
289  done_word_count++;
290  current_word_ok_so_far = word_done;
291  }
292 
293  if (current_word_ok_so_far) {
294  prev_word_done = TRUE;
295  prev_word_score = word_len;
296  } else {
297  prev_word_done = FALSE;
298  prev_word_score = 0;
299  }
300 
301  /* Add 1 to total score for every joined 1 regardless of context and
302  rejtn */
303  for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
304  current_char_1 = word->best_choice->unichar_string()[i] == '1';
305  if (prev_char_1 || (current_char_1 && (i > 0)))
306  total_score++;
307  prev_char_1 = current_char_1;
308  }
309 
310  /* Add 1 to total score for every joined punctuation regardless of context
311  and rejtn */
313  for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
314  offset += word->best_choice->unichar_lengths()[i++]) {
315  current_char_punct =
316  punct_chars.contains(word->best_choice->unichar_string()[offset]);
317  if (prev_char_punct || (current_char_punct && i > 0))
318  total_score++;
319  prev_char_punct = current_char_punct;
320  }
321  }
322  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
323  for (i = 0, offset = 0; i < word_len - 1;
324  offset += word->best_choice->unichar_lengths()[i++]);
325  prev_char_1 =
326  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
327  || (!word_done && STRING(conflict_set_I_l_1).contains(
328  word->best_choice->unichar_string()[offset])));
329  }
330  /* Find next word */
331  do {
332  word_res_it.forward();
333  } while (word_res_it.data()->part_of_combo);
334  } while (!word_res_it.at_first());
335  total_score += prev_word_score;
336  if (prev_word_done)
337  done_word_count++;
338  if (done_word_count == word_count)
339  return PERFECT_WERDS;
340  else
341  return total_score;
342 }
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_lengths() const
Definition: ratngs.h:531
unsigned char BOOL8
Definition: host.h:113
const STRING & unichar_string() const
Definition: ratngs.h:524
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:504
#define FALSE
Definition: capi.h:29
BOOL8 tess_failed
Definition: pageres.h:272
#define PERFECT_WERDS
Definition: fixspace.cpp:33
#define TRUE
Definition: capi.h:28
Definition: strngs.h:44
const char * string() const
Definition: strngs.cpp:193
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:344
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
short inT16
Definition: host.h:100
bool tesseract::Tesseract::extract_cube_state ( CubeObject * cube_obj,
int *  num_chars,
Boxa **  char_boxes,
CharSamp ***  char_samples 
)

Definition at line 65 of file cube_control.cpp.

68  {
69  if (!cube_obj) {
70  if (cube_debug_level > 0) {
71  tprintf("Cube WARNING (extract_cube_state): Invalid cube object "
72  "passed to extract_cube_state\n");
73  }
74  return false;
75  }
76 
77  // Note that the CubeObject accessors return either the deslanted or
78  // regular objects search object or beam search object, whichever
79  // was used in the last call to Recognize()
80  CubeSearchObject* cube_search_obj = cube_obj->SrchObj();
81  if (!cube_search_obj) {
82  if (cube_debug_level > 0) {
83  tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
84  "cube's search object in extract_cube_state.\n");
85  }
86  return false;
87  }
88  BeamSearch *beam_search_obj = cube_obj->BeamObj();
89  if (!beam_search_obj) {
90  if (cube_debug_level > 0) {
91  tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
92  "cube's beam search object in extract_cube_state.\n");
93  }
94  return false;
95  }
96 
97  // Get the character samples and bounding boxes by backtracking
98  // through the beam search path
99  int best_node_index = beam_search_obj->BestPresortedNodeIndex();
100  *char_samples = beam_search_obj->BackTrack(
101  cube_search_obj, best_node_index, num_chars, NULL, char_boxes);
102  if (!*char_samples)
103  return false;
104  return true;
105 }
#define tprintf(...)
Definition: tprintf.h:31
#define NULL
Definition: host.h:144
inT16 tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 969 of file docqual.cpp.

969  {
970  const char *str = word->best_choice->unichar_string().string();
971  int tess_rejs = 0;
972 
973  for (; *str != '\0'; str++) {
974  if (*str == ' ')
975  tess_rejs++;
976  }
977  return tess_rejs;
978 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_string() const
Definition: ratngs.h:524
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::fill_werd_res ( const BoxWord & cube_box_word,
const char *  cube_best_str,
WERD_RES * tess_werd_res
)

Definition at line 413 of file cube_control.cpp.

415  {
416  delete tess_werd_res->box_word;
417  tess_werd_res->box_word = new BoxWord(cube_box_word);
418  tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
419  tess_werd_res->word);
420  // Fill text and remaining fields
421  tess_werd_res->word->set_text(cube_best_str);
422  tess_werd_res->tess_failed = FALSE;
423  tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res);
424  // There is no output word, so we can' call AdaptableWord, but then I don't
425  // think we need to. Fudge the result with accepted.
426  tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
427 
428  // Set word to done, i.e., ignore all of tesseract's tests for rejection
429  tess_werd_res->done = tess_werd_res->tess_accepted;
430 }
BOOL8 tess_accepted
Definition: pageres.h:280
tesseract::BoxWord * box_word
Definition: pageres.h:250
void set_text(const char *new_text)
Definition: werd.h:126
BOOL8 tess_would_adapt
Definition: pageres.h:281
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:95
DENORM denorm
Definition: pageres.h:190
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
BOOL8 tess_failed
Definition: pageres.h:272
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69
const BLOCK * block() const
Definition: normalis.h:275
bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES * word_res
)

Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.

Definition at line 559 of file applybox.cpp.

560  {
561  // Classify all required combinations of blobs and save results in choices.
562  int word_length = word_res->box_word->length();
563  GenericVector<BLOB_CHOICE_LIST*>* choices =
564  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
565  for (int i = 0; i < word_length; ++i) {
566  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
567  BLOB_CHOICE_LIST* match_result = classify_piece(
568  word_res->seam_array, i, i + j - 1, "Applybox",
569  word_res->chopped_word, word_res->blamer_bundle);
570  if (applybox_debug > 2) {
571  tprintf("%d+%d:", i, j);
572  print_ratings_list("Segment:", match_result, unicharset);
573  }
574  choices[i].push_back(match_result);
575  }
576  }
577  // Search the segmentation graph for the target text. Must be an exact
578  // match. Using wildcards makes it difficult to find the correct
579  // segmentation even when it is there.
580  word_res->best_state.clear();
581  GenericVector<int> search_segmentation;
582  float best_rating = 0.0f;
583  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
584  &search_segmentation, &best_rating, &word_res->best_state);
585  for (int i = 0; i < word_length; ++i)
586  choices[i].delete_data_pointers();
587  delete [] choices;
588  if (word_res->best_state.empty()) {
589  // Build the original segmentation and if it is the same length as the
590  // truth, assume it will do.
591  int blob_count = 1;
592  for (int s = 0; s < word_res->seam_array.size(); ++s) {
593  SEAM* seam = word_res->seam_array[s];
594  if (!seam->HasAnySplits()) {
595  word_res->best_state.push_back(blob_count);
596  blob_count = 1;
597  } else {
598  ++blob_count;
599  }
600  }
601  word_res->best_state.push_back(blob_count);
602  if (word_res->best_state.size() != target_text.size()) {
603  word_res->best_state.clear(); // No good. Original segmentation bad size.
604  return false;
605  }
606  }
607  word_res->correct_text.clear();
608  for (int i = 0; i < target_text.size(); ++i) {
609  word_res->correct_text.push_back(
610  STRING(unicharset.id_to_unichar(target_text[i])));
611  }
612  return true;
613 }
const int kMaxGroupSize
Definition: applybox.cpp:40
int size() const
Definition: genericvector.h:72
tesseract::BoxWord * box_word
Definition: pageres.h:250
bool HasAnySplits() const
Definition: seam.h:67
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
GenericVector< STRING > correct_text
Definition: pageres.h:259
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:57
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:629
bool empty() const
Definition: genericvector.h:84
const int length() const
Definition: boxword.h:85
Definition: strngs.h:44
GenericVector< int > best_state
Definition: pageres.h:255
Definition: seam.h:44
BlamerBundle * blamer_bundle
Definition: pageres.h:230
inT16 tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 469 of file reject.cpp.

470  {
471  inT16 i;
472  inT16 offset;
473 
474  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476  unicharset.get_isdigit(word + offset, word_lengths[i]))
477  return i;
478  }
479  return -1;
480 }
UNICHARSET unicharset
Definition: ccutil.h:72
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
short inT16
Definition: host.h:100
inT16 tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 482 of file reject.cpp.

483  {
484  inT16 i;
485  inT16 offset;
486 
487  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
488  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
489  unicharset.get_isdigit(word + offset, word_lengths[i]))
490  return offset;
491  }
492  return -1;
493 }
UNICHARSET unicharset
Definition: ccutil.h:72
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
short inT16
Definition: host.h:100
void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block
)

Definition at line 145 of file fixspace.cpp.

147  {
148  inT16 best_score;
149  WERD_RES_LIST current_perm;
150  inT16 current_score;
151  BOOL8 improved = FALSE;
152 
153  best_score = eval_word_spacing(best_perm); // default score
154  dump_words(best_perm, best_score, 1, improved);
155 
156  if (best_score != PERFECT_WERDS)
157  initialise_search(best_perm, current_perm);
158 
159  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
160  match_current_words(current_perm, row, block);
161  current_score = eval_word_spacing(current_perm);
162  dump_words(current_perm, current_score, 2, improved);
163  if (current_score > best_score) {
164  best_perm.clear();
165  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
166  best_score = current_score;
167  improved = TRUE;
168  }
169  if (current_score < PERFECT_WERDS)
170  transform_to_next_perm(current_perm);
171  }
172  dump_words(best_perm, best_score, 3, improved);
173 }
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:177
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:373
unsigned char BOOL8
Definition: host.h:113
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:240
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:630
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
#define FALSE
Definition: capi.h:29
#define PERFECT_WERDS
Definition: fixspace.cpp:33
#define TRUE
Definition: capi.h:28
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:450
short inT16
Definition: host.h:100
void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
inT32  word_count,
PAGE_RES * page_res
)

Definition at line 48 of file fixspace.cpp.

50  {
51  BLOCK_RES_IT block_res_it;
52  ROW_RES_IT row_res_it;
53  WERD_RES_IT word_res_it_from;
54  WERD_RES_IT word_res_it_to;
55  WERD_RES *word_res;
56  WERD_RES_LIST fuzzy_space_words;
57  inT16 new_length;
58  BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
59  inT32 word_index; // current word
60 
61  block_res_it.set_to_list(&page_res->block_res_list);
62  word_index = 0;
63  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
64  block_res_it.forward()) {
65  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
66  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
67  row_res_it.forward()) {
68  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
69  while (!word_res_it_from.at_last()) {
70  word_res = word_res_it_from.data();
71  while (!word_res_it_from.at_last() &&
72  !(word_res->combination ||
73  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
74  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
75  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
76  block_res_it.data()->block);
77  word_res = word_res_it_from.forward();
78  word_index++;
79  if (monitor != NULL) {
80  monitor->ocr_alive = TRUE;
81  monitor->progress = 90 + 5 * word_index / word_count;
82  if (monitor->deadline_exceeded() ||
83  (monitor->cancel != NULL &&
84  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
85  return;
86  }
87  }
88 
89  if (!word_res_it_from.at_last()) {
90  word_res_it_to = word_res_it_from;
91  prevent_null_wd_fixsp =
92  word_res->word->cblob_list()->empty();
93  if (check_debug_pt(word_res, 60))
94  debug_fix_space_level.set_value(10);
95  word_res_it_to.forward();
96  word_index++;
97  if (monitor != NULL) {
98  monitor->ocr_alive = TRUE;
99  monitor->progress = 90 + 5 * word_index / word_count;
100  if (monitor->deadline_exceeded() ||
101  (monitor->cancel != NULL &&
102  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
103  return;
104  }
105  while (!word_res_it_to.at_last () &&
106  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
107  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
108  if (check_debug_pt(word_res, 60))
109  debug_fix_space_level.set_value(10);
110  if (word_res->word->cblob_list()->empty())
111  prevent_null_wd_fixsp = TRUE;
112  word_res = word_res_it_to.forward();
113  }
114  if (check_debug_pt(word_res, 60))
115  debug_fix_space_level.set_value(10);
116  if (word_res->word->cblob_list()->empty())
117  prevent_null_wd_fixsp = TRUE;
118  if (prevent_null_wd_fixsp) {
119  word_res_it_from = word_res_it_to;
120  } else {
121  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
122  &word_res_it_to);
123  fix_fuzzy_space_list(fuzzy_space_words,
124  row_res_it.data()->row,
125  block_res_it.data()->block);
126  new_length = fuzzy_space_words.length();
127  word_res_it_from.add_list_before(&fuzzy_space_words);
128  for (;
129  !word_res_it_from.at_last() && new_length > 0;
130  new_length--) {
131  word_res_it_from.forward();
132  }
133  }
134  if (test_pt)
135  debug_fix_space_level.set_value(0);
136  }
137  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
138  block_res_it.data()->block);
139  // Last word in row
140  }
141  }
142  }
143 }
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
volatile inT8 ocr_alive
Definition: ocrclass.h:117
void * cancel_this
Definition: ocrclass.h:120
unsigned char BOOL8
Definition: host.h:113
BOOL8 combination
Definition: pageres.h:315
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:145
CANCEL_FUNC cancel
Definition: ocrclass.h:119
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
bool deadline_exceeded() const
Definition: ocrclass.h:144
inT16 progress
Definition: ocrclass.h:115
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:536
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102
void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block
)

Definition at line 570 of file fixspace.cpp.

571  {
572  inT16 best_score;
573  WERD_RES_IT best_perm_it(&best_perm);
574  WERD_RES_LIST current_perm;
575  WERD_RES_IT current_perm_it(&current_perm);
576  WERD_RES *old_word_res;
577  inT16 current_score;
578  BOOL8 improved = FALSE;
579 
580  best_score = fp_eval_word_spacing(best_perm); // default score
581 
582  dump_words(best_perm, best_score, 1, improved);
583 
584  old_word_res = best_perm_it.data();
585  // Even deep_copy doesn't copy the underlying WERD unless its combination
586  // flag is true!.
587  old_word_res->combination = TRUE; // Kludge to force deep copy
588  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
589  old_word_res->combination = FALSE; // Undo kludge
590 
591  break_noisiest_blob_word(current_perm);
592 
593  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
594  match_current_words(current_perm, row, block);
595  current_score = fp_eval_word_spacing(current_perm);
596  dump_words(current_perm, current_score, 2, improved);
597  if (current_score > best_score) {
598  best_perm.clear();
599  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
600  best_score = current_score;
601  improved = TRUE;
602  }
603  if (current_score < PERFECT_WERDS) {
604  break_noisiest_blob_word(current_perm);
605  }
606  }
607  dump_words(best_perm, best_score, 3, improved);
608 }
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:616
unsigned char BOOL8
Definition: host.h:113
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:630
BOOL8 combination
Definition: pageres.h:315
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:831
#define FALSE
Definition: capi.h:29
#define PERFECT_WERDS
Definition: fixspace.cpp:33
#define TRUE
Definition: capi.h:28
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:450
short inT16
Definition: host.h:100
void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT * page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1624 of file control.cpp.

1624  {
1625  WERD_RES *word_res = page_res_it->word();
1626  const WERD_CHOICE &word = *(word_res->best_choice);
1627 
1628  // Find the frequency of each unique character in the word.
1629  SortHelper<UNICHAR_ID> rep_ch(word.length());
1630  for (int i = 0; i < word.length(); ++i) {
1631  rep_ch.Add(word.unichar_id(i), 1);
1632  }
1633 
1634  // Find the most frequent result.
1635  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1636  int max_count = rep_ch.MaxCount(&maxch_id);
1637  // Find the best exemplar of a classifier result for maxch_id.
1638  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1639  if (best_choice == NULL) {
1640  tprintf("Failed to find a choice for %s, occurring %d times\n",
1641  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1642  return;
1643  }
1644  word_res->done = TRUE;
1645 
1646  // Measure the mean space.
1647  int gap_count = 0;
1648  WERD* werd = word_res->word;
1649  C_BLOB_IT blob_it(werd->cblob_list());
1650  C_BLOB* prev_blob = blob_it.data();
1651  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1652  C_BLOB* blob = blob_it.data();
1653  int gap = blob->bounding_box().left();
1654  gap -= prev_blob->bounding_box().right();
1655  ++gap_count;
1656  prev_blob = blob;
1657  }
1658  // Just correct existing classification.
1659  CorrectRepcharChoices(best_choice, word_res);
1660  word_res->reject_map.initialise(word.length());
1661 }
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
inT16 left() const
Definition: rect.h:68
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
int UNICHAR_ID
Definition: unichar.h:33
Definition: werd.h:60
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
void Add(T value, int count)
Definition: sorthelper.h:65
#define TRUE
Definition: capi.h:28
TBOX bounding_box() const
Definition: stepblob.cpp:250
void initialise(inT16 length)
Definition: rejctmap.cpp:318
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_RES * word() const
Definition: pageres.h:733
void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW * row,
BLOCK * block 
)

Definition at line 536 of file fixspace.cpp.

537  {
538  WERD_RES *word_res;
539  WERD_RES_LIST sub_word_list;
540  WERD_RES_IT sub_word_list_it(&sub_word_list);
541  inT16 blob_index;
542  inT16 new_length;
543  float junk;
544 
545  word_res = word_res_it.data();
546  if (word_res->word->flag(W_REP_CHAR) ||
547  word_res->combination ||
548  word_res->part_of_combo ||
549  !word_res->word->flag(W_DONT_CHOP))
550  return;
551 
552  blob_index = worst_noise_blob(word_res, &junk);
553  if (blob_index < 0)
554  return;
555 
556  if (debug_fix_space_level > 1) {
557  tprintf("FP fixspace working on \"%s\"\n",
558  word_res->best_choice->unichar_string().string());
559  }
560  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
561  sub_word_list_it.add_after_stay_put(word_res_it.extract());
562  fix_noisy_space_list(sub_word_list, row, block);
563  new_length = sub_word_list.length();
564  word_res_it.add_list_before(&sub_word_list);
565  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
566  word_res_it.forward();
567  }
568 }
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:681
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const STRING & unichar_string() const
Definition: ratngs.h:524
BOOL8 part_of_combo
Definition: pageres.h:316
BOOL8 combination
Definition: pageres.h:315
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:570
WERD * word
Definition: pageres.h:175
int c_blob_comparator(const void *blob1p, const void *blob2p)
Definition: genblob.cpp:30
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
const char * string() const
Definition: strngs.cpp:193
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
short inT16
Definition: host.h:100
BOOL8 tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES * word)

Definition at line 504 of file fixspace.cpp.

504  {
505  if (word->done)
506  return TRUE;
507 
508  /*
509  Use all the standard pass 2 conditions for mode 5 in set_done() in
510  reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
511  CARE WHETHER WE HAVE of/at on/an etc.
512  */
513  if (fixsp_done_mode > 0 &&
514  (word->tess_accepted ||
515  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
516  fixsp_done_mode == 3) &&
517  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
518  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
519  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
520  (word->best_choice->permuter() == USER_DAWG_PERM) ||
521  (word->best_choice->permuter() == NUMBER_PERM))) {
522  return TRUE;
523  } else {
524  return FALSE;
525  }
526 }
BOOL8 tess_accepted
Definition: pageres.h:280
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_string() const
Definition: ratngs.h:524
uinT8 permuter() const
Definition: ratngs.h:343
BOOL8 done
Definition: pageres.h:282
#define FALSE
Definition: capi.h:29
inT16 reject_count()
Definition: rejctmap.h:243
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::flip_0O ( WERD_RES * word)

Definition at line 673 of file reject.cpp.

673  {
674  WERD_CHOICE *best_choice = word_res->best_choice;
675  int i;
676  TBOX out_box;
677 
678  if (!tessedit_flip_0O)
679  return;
680 
681  int num_blobs = word_res->rebuild_word->NumBlobs();
682  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
683  TBLOB* blob = word_res->rebuild_word->blobs[i];
684  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
685  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
686  out_box = blob->bounding_box();
687  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
688  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
689  return; //Beware words with sub/superscripts
690  }
691  }
692  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
693  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
694  if (unichar_0 == INVALID_UNICHAR_ID ||
695  !word_res->uch_set->get_enabled(unichar_0) ||
696  unichar_O == INVALID_UNICHAR_ID ||
697  !word_res->uch_set->get_enabled(unichar_O)) {
698  return; // 0 or O are not present/enabled in unicharset
699  }
700  for (i = 1; i < best_choice->length(); ++i) {
701  if (best_choice->unichar_id(i) == unichar_0 ||
702  best_choice->unichar_id(i) == unichar_O) {
703  /* A0A */
704  if ((i+1) < best_choice->length() &&
705  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
707  best_choice->set_unichar_id(unichar_O, i);
708  }
709  /* A00A */
710  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
711  (i+1) < best_choice->length() &&
712  (best_choice->unichar_id(i+1) == unichar_0 ||
713  best_choice->unichar_id(i+1) == unichar_O) &&
714  (i+2) < best_choice->length() &&
715  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
716  best_choice->set_unichar_id(unichar_O, i);
717  i++;
718  }
719  /* AA0<non digit or end of word> */
720  if ((i > 1) &&
721  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
722  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
723  (((i+1) < best_choice->length() &&
724  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
725  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
726  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
727  (i == best_choice->length() - 1))) {
728  best_choice->set_unichar_id(unichar_O, i);
729  }
730  /* 9O9 */
731  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
732  (i+1) < best_choice->length() &&
733  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
734  best_choice->set_unichar_id(unichar_0, i);
735  }
736  /* 9OOO */
737  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
738  (i+2) < best_choice->length() &&
739  (best_choice->unichar_id(i+1) == unichar_0 ||
740  best_choice->unichar_id(i+1) == unichar_O) &&
741  (best_choice->unichar_id(i+2) == unichar_0 ||
742  best_choice->unichar_id(i+2) == unichar_O)) {
743  best_choice->set_unichar_id(unichar_0, i);
744  best_choice->set_unichar_id(unichar_0, i+1);
745  best_choice->set_unichar_id(unichar_0, i+2);
746  i += 2;
747  }
748  /* 9OO<non upper> */
749  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
750  (i+2) < best_choice->length() &&
751  (best_choice->unichar_id(i+1) == unichar_0 ||
752  best_choice->unichar_id(i+1) == unichar_O) &&
753  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
754  best_choice->set_unichar_id(unichar_0, i);
755  best_choice->set_unichar_id(unichar_0, i+1);
756  i++;
757  }
758  /* 9O<non upper> */
759  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
760  (i+1) < best_choice->length() &&
761  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
762  best_choice->set_unichar_id(unichar_0, i);
763  }
764  /* 9[.,]OOO.. */
765  if ((i > 1) &&
766  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
767  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
768  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
769  best_choice->unichar_id(i-2) == unichar_O)) {
770  if (best_choice->unichar_id(i-2) == unichar_O) {
771  best_choice->set_unichar_id(unichar_0, i-2);
772  }
773  while (i < best_choice->length() &&
774  (best_choice->unichar_id(i) == unichar_O ||
775  best_choice->unichar_id(i) == unichar_0)) {
776  best_choice->set_unichar_id(unichar_0, i);
777  i++;
778  }
779  i--;
780  }
781  }
782  }
783 }
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
int length() const
Definition: ratngs.h:300
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const int kBlnBaselineOffset
Definition: normalis.h:29
int UNICHAR_ID
Definition: unichar.h:33
inT16 bottom() const
Definition: rect.h:61
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:785
Definition: rect.h:30
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:789
void tesseract::Tesseract::flip_hyphens ( WERD_RES * word)

Definition at line 616 of file reject.cpp.

616  {
617  WERD_CHOICE *best_choice = word_res->best_choice;
618  int i;
619  int prev_right = -9999;
620  int next_left;
621  TBOX out_box;
622  float aspect_ratio;
623 
624  if (tessedit_lower_flip_hyphen <= 1)
625  return;
626 
627  int num_blobs = word_res->rebuild_word->NumBlobs();
628  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
629  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
630  TBLOB* blob = word_res->rebuild_word->blobs[i];
631  out_box = blob->bounding_box();
632  if (i + 1 == num_blobs)
633  next_left = 9999;
634  else
635  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
636  // Dont touch small or touching blobs - it is too dangerous.
637  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
638  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
639  aspect_ratio = out_box.width() / (float) out_box.height();
640  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
641  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
642  word_res->uch_set->contains_unichar_id(unichar_dash) &&
643  word_res->uch_set->get_enabled(unichar_dash)) {
644  /* Certain HYPHEN */
645  best_choice->set_unichar_id(unichar_dash, i);
646  if (word_res->reject_map[i].rejected())
647  word_res->reject_map[i].setrej_hyphen_accept();
648  }
649  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
650  word_res->reject_map[i].accepted())
651  //Suspected HYPHEN
652  word_res->reject_map[i].setrej_hyphen ();
653  }
654  else if (best_choice->unichar_id(i) == unichar_dash) {
655  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
656  (word_res->reject_map[i].rejected()))
657  word_res->reject_map[i].setrej_hyphen_accept();
658  //Certain HYPHEN
659 
660  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
661  (word_res->reject_map[i].accepted()))
662  //Suspected HYPHEN
663  word_res->reject_map[i].setrej_hyphen();
664  }
665  }
666  prev_right = out_box.right();
667  }
668 }
Definition: blobs.h:261
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
int length() const
Definition: ratngs.h:300
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int UNICHAR_ID
Definition: unichar.h:33
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
Definition: rect.h:30
TBOX bounding_box() const
Definition: blobs.cpp:482
void tesseract::Tesseract::font_recognition_pass ( PAGE_RES * page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 1958 of file control.cpp.

1958  {
1959  PAGE_RES_IT page_res_it(page_res);
1960  WERD_RES *word; // current word
1961  STATS doc_fonts(0, font_table_size_); // font counters
1962 
1963  // Gather font id statistics.
1964  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1965  page_res_it.forward()) {
1966  word = page_res_it.word();
1967  if (word->fontinfo != NULL) {
1968  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
1969  }
1970  if (word->fontinfo2 != NULL) {
1971  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
1972  }
1973  }
1974  inT16 doc_font; // modal font
1975  inT8 doc_font_count; // modal font
1976  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1977  if (doc_font_count == 0)
1978  return;
1979  // Get the modal font pointer.
1980  const FontInfo* modal_font = NULL;
1981  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1982  page_res_it.forward()) {
1983  word = page_res_it.word();
1984  if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
1985  modal_font = word->fontinfo;
1986  break;
1987  }
1988  if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
1989  modal_font = word->fontinfo2;
1990  break;
1991  }
1992  }
1993  ASSERT_HOST(modal_font != NULL);
1994 
1995  // Assign modal font to weak words.
1996  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1997  page_res_it.forward()) {
1998  word = page_res_it.word();
1999  int length = word->best_choice->length();
2000 
2001  int count = word->fontinfo_id_count;
2002  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2003  word->fontinfo = modal_font;
2004  // Counts only get 1 as it came from the doc.
2005  word->fontinfo_id_count = 1;
2006  word->italic = modal_font->is_italic() ? 1 : -1;
2007  word->bold = modal_font->is_bold() ? 1 : -1;
2008  }
2009  }
2010 }
bool is_bold() const
Definition: fontinfo.h:112
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
inT8 bold
Definition: pageres.h:286
Definition: statistc.h:33
const FontInfo * fontinfo
Definition: pageres.h:288
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT8 fontinfo_id_count
Definition: pageres.h:290
bool is_italic() const
Definition: fontinfo.h:111
const FontInfo * fontinfo2
Definition: pageres.h:289
inT8 fontinfo_id2_count
Definition: pageres.h:291
WERD * word
Definition: pageres.h:175
int count(LIST var_list)
Definition: oldlist.cpp:108
#define NULL
Definition: host.h:144
SIGNED char inT8
Definition: host.h:98
inT8 italic
Definition: pageres.h:285
short inT16
Definition: host.h:100
inT16 tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 831 of file fixspace.cpp.

831  {
832  WERD_RES_IT word_it(&word_res_list);
833  WERD_RES *word;
834  inT16 word_length;
835  inT16 score = 0;
836  inT16 i;
837  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
838 
839  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
840  word = word_it.data();
841  if (word->rebuild_word == NULL)
842  continue; // Can't handle cube words.
843  word_length = word->reject_map.length();
844  if (word->done ||
845  word->tess_accepted ||
846  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
847  word->best_choice->permuter() == FREQ_DAWG_PERM ||
848  word->best_choice->permuter() == USER_DAWG_PERM ||
849  safe_dict_word(word) > 0) {
850  int num_blobs = word->rebuild_word->NumBlobs();
851  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
852  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
853  TBLOB* blob = word->rebuild_word->blobs[i];
854  if (word->best_choice->unichar_id(i) == space ||
855  blob_noise_score(blob) < small_limit) {
856  score -= 1; // penalise possibly erroneous non-space
857  } else if (word->reject_map[i].accepted()) {
858  score++;
859  }
860  }
861  }
862  }
863  if (score < 0)
864  score = 0;
865  return score;
866 }
const int kBlnXHeight
Definition: normalis.h:28
BOOL8 tess_accepted
Definition: pageres.h:280
Definition: blobs.h:261
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
int NumBlobs() const
Definition: blobs.h:425
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
uinT8 permuter() const
Definition: ratngs.h:343
int UNICHAR_ID
Definition: unichar.h:33
BOOL8 done
Definition: pageres.h:282
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:761
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
#define NULL
Definition: host.h:144
short inT16
Definition: host.h:100
GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES * word,
BOOL8  ok_dict_word 
)

Definition at line 683 of file docqual.cpp.

683  {
684  enum STATES
685  {
686  JUNK,
687  FIRST_UPPER,
688  FIRST_LOWER,
689  FIRST_NUM,
690  SUBSEQUENT_UPPER,
691  SUBSEQUENT_LOWER,
692  SUBSEQUENT_NUM
693  };
694  const char *str = word->best_choice->unichar_string().string();
695  const char *lengths = word->best_choice->unichar_lengths().string();
696  STATES state = JUNK;
697  int len = 0;
698  int isolated_digits = 0;
699  int isolated_alphas = 0;
700  int bad_char_count = 0;
701  int tess_rejs = 0;
702  int dodgy_chars = 0;
703  int ok_chars;
704  UNICHAR_ID last_char = -1;
705  int alpha_repetition_count = 0;
706  int longest_alpha_repetition_count = 0;
707  int longest_lower_run_len = 0;
708  int lower_string_count = 0;
709  int longest_upper_run_len = 0;
710  int upper_string_count = 0;
711  int total_alpha_count = 0;
712  int total_digit_count = 0;
713 
714  for (; *str != '\0'; str += *(lengths++)) {
715  len++;
716  if (word->uch_set->get_isupper (str, *lengths)) {
717  total_alpha_count++;
718  switch (state) {
719  case SUBSEQUENT_UPPER:
720  case FIRST_UPPER:
721  state = SUBSEQUENT_UPPER;
722  upper_string_count++;
723  if (longest_upper_run_len < upper_string_count)
724  longest_upper_run_len = upper_string_count;
725  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
726  alpha_repetition_count++;
727  if (longest_alpha_repetition_count < alpha_repetition_count) {
728  longest_alpha_repetition_count = alpha_repetition_count;
729  }
730  }
731  else {
732  last_char = word->uch_set->unichar_to_id(str, *lengths);
733  alpha_repetition_count = 1;
734  }
735  break;
736  case FIRST_NUM:
737  isolated_digits++;
738  default:
739  state = FIRST_UPPER;
740  last_char = word->uch_set->unichar_to_id(str, *lengths);
741  alpha_repetition_count = 1;
742  upper_string_count = 1;
743  break;
744  }
745  }
746  else if (word->uch_set->get_islower (str, *lengths)) {
747  total_alpha_count++;
748  switch (state) {
749  case SUBSEQUENT_LOWER:
750  case FIRST_LOWER:
751  state = SUBSEQUENT_LOWER;
752  lower_string_count++;
753  if (longest_lower_run_len < lower_string_count)
754  longest_lower_run_len = lower_string_count;
755  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
756  alpha_repetition_count++;
757  if (longest_alpha_repetition_count < alpha_repetition_count) {
758  longest_alpha_repetition_count = alpha_repetition_count;
759  }
760  }
761  else {
762  last_char = word->uch_set->unichar_to_id(str, *lengths);
763  alpha_repetition_count = 1;
764  }
765  break;
766  case FIRST_NUM:
767  isolated_digits++;
768  default:
769  state = FIRST_LOWER;
770  last_char = word->uch_set->unichar_to_id(str, *lengths);
771  alpha_repetition_count = 1;
772  lower_string_count = 1;
773  break;
774  }
775  }
776  else if (word->uch_set->get_isdigit (str, *lengths)) {
777  total_digit_count++;
778  switch (state) {
779  case FIRST_NUM:
780  state = SUBSEQUENT_NUM;
781  case SUBSEQUENT_NUM:
782  break;
783  case FIRST_UPPER:
784  case FIRST_LOWER:
785  isolated_alphas++;
786  default:
787  state = FIRST_NUM;
788  break;
789  }
790  }
791  else {
792  if (*lengths == 1 && *str == ' ')
793  tess_rejs++;
794  else
795  bad_char_count++;
796  switch (state) {
797  case FIRST_NUM:
798  isolated_digits++;
799  break;
800  case FIRST_UPPER:
801  case FIRST_LOWER:
802  isolated_alphas++;
803  default:
804  break;
805  }
806  state = JUNK;
807  }
808  }
809 
810  switch (state) {
811  case FIRST_NUM:
812  isolated_digits++;
813  break;
814  case FIRST_UPPER:
815  case FIRST_LOWER:
816  isolated_alphas++;
817  default:
818  break;
819  }
820 
821  if (crunch_include_numerals) {
822  total_alpha_count += total_digit_count - isolated_digits;
823  }
824 
825  if (crunch_leave_ok_strings && len >= 4 &&
826  2 * (total_alpha_count - isolated_alphas) > len &&
827  longest_alpha_repetition_count < crunch_long_repetitions) {
828  if ((crunch_accept_ok &&
829  acceptable_word_string(*word->uch_set, str, lengths) !=
830  AC_UNACCEPTABLE) ||
831  longest_lower_run_len > crunch_leave_lc_strings ||
832  longest_upper_run_len > crunch_leave_uc_strings)
833  return G_NEVER_CRUNCH;
834  }
835  if (word->reject_map.length() > 1 &&
836  strpbrk(str, " ") == NULL &&
837  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
838  word->best_choice->permuter() == FREQ_DAWG_PERM ||
839  word->best_choice->permuter() == USER_DAWG_PERM ||
840  word->best_choice->permuter() == NUMBER_PERM ||
841  acceptable_word_string(*word->uch_set, str, lengths) !=
842  AC_UNACCEPTABLE || ok_dict_word))
843  return G_OK;
844 
845  ok_chars = len - bad_char_count - isolated_digits -
846  isolated_alphas - tess_rejs;
847 
848  if (crunch_debug > 3) {
849  tprintf("garbage_word: \"%s\"\n",
850  word->best_choice->unichar_string().string());
851  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
852  len,
853  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
854  }
855  if (bad_char_count == 0 &&
856  tess_rejs == 0 &&
857  (len > isolated_digits + isolated_alphas || len <= 2))
858  return G_OK;
859 
860  if (tess_rejs > ok_chars ||
861  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
862  return G_TERRIBLE;
863 
864  if (len > 4) {
865  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
866  isolated_alphas;
867  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
868  return G_DODGY;
869  else
870  return G_OK;
871  } else {
872  dodgy_chars = 2 * tess_rejs + bad_char_count;
873  if ((len == 4 && dodgy_chars > 2) ||
874  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
875  return G_DODGY;
876  else
877  return G_OK;
878  }
879 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
const STRING & unichar_lengths() const
Definition: ratngs.h:531
Definition: docqual.h:28
const STRING & unichar_string() const
Definition: ratngs.h:524
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHARSET * uch_set
Definition: pageres.h:192
uinT8 permuter() const
Definition: ratngs.h:343
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
int UNICHAR_ID
Definition: unichar.h:33
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
Unacceptable word.
Definition: control.h:36
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES * word)

Definition at line 285 of file output.cpp.

285  { // what char is repeated?
286  int i;
287  for (i = 0; ((i < word->reject_map.length()) &&
288  (word->reject_map[i].rejected())); ++i);
289 
290  if (i < word->reject_map.length()) {
291  return word->best_choice->unichar_id(i);
292  } else {
293  return word->uch_set->unichar_to_id(unrecognised_char.string());
294  }
295 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 254 of file tesseractclass.h.

254  {
255  return sub_langs_[index];
256  }
CubeRecoContext* tesseract::Tesseract::GetCubeRecoContext ( )
inline

Definition at line 1160 of file tesseractclass.h.

1160 { return cube_cntxt_; }
void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES * word,
int *  num_rebuilt_leading,
ScriptPos * leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos * trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: The number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging).
[out] leading_certainty: The worst certainty in the leading blobs.
[out] num_rebuilt_trailing: The number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging).
[out] trailing_certainty: The worst certainty in the trailing blobs.
[out] avg_certainty: The average certainty of "normal" blobs in the word.
[out] unlikely_threshold: The threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 253 of file superscript.cpp.

261  {
262  *avg_certainty = *unlikely_threshold = 0.0f;
263  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
264  *leading_certainty = *trailing_certainty = 0.0f;
265 
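266  int super_y_bottom =
267  kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
268  int sub_y_top =
269  kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;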
270 
271  // Step one: Get an average certainty for "normally placed" characters.
272 
273  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
274  *leading_pos = *trailing_pos = SP_NORMAL;
275  int leading_outliers = 0;
276  int trailing_outliers = 0;
277  int num_normal = 0;
278  float normal_certainty_total = 0.0f;
279  float worst_normal_certainty = 0.0f;
280  ScriptPos last_pos = SP_NORMAL;
281  int num_blobs = word->rebuild_word->NumBlobs();
282  for (int b = 0; b < num_blobs; ++b) {
283  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
284  ScriptPos pos = SP_NORMAL;
285  if (box.bottom() >= super_y_bottom) {
286  pos = SP_SUPERSCRIPT;
287  } else if (box.top() <= sub_y_top) {
288  pos = SP_SUBSCRIPT;
289  }
290  if (pos == SP_NORMAL) {
291  if (word->best_choice->unichar_id(b) != 0) {
292  float char_certainty = word->best_choice->certainty(b);
293  if (char_certainty < worst_normal_certainty) {
294  worst_normal_certainty = char_certainty;
295  }
296  num_normal++;
297  normal_certainty_total += char_certainty;
298  }
299  if (trailing_outliers == b) {
300  leading_outliers = trailing_outliers;
301  *leading_pos = last_pos;
302  }
303  trailing_outliers = 0;
304  } else {
305  if (last_pos == pos) {
306  trailing_outliers++;
307  } else {
308  trailing_outliers = 1;
309  }
310  }
311  last_pos = pos;
312  }
313  *trailing_pos = last_pos;
314  if (num_normal >= 3) { // throw out the worst as an outlier.
315  num_normal--;
316  normal_certainty_total -= worst_normal_certainty;
317  }
318  if (num_normal > 0) {
319  *avg_certainty = normal_certainty_total / num_normal;
320  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
321  }
322  if (num_normal == 0 ||
323  (leading_outliers == 0 && trailing_outliers == 0)) {
324  return;
325  }
326 
327  // Step two: Try to split off bits of the word that are both outliers
328  // and have much lower certainty than average
329  // Calculate num_leading and leading_certainty.
330  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
331  *num_rebuilt_leading < leading_outliers;
332  (*num_rebuilt_leading)++) {
333  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
334  if (char_certainty > *unlikely_threshold) {
335  break;
336  }
337  if (char_certainty < *leading_certainty) {
338  *leading_certainty = char_certainty;
339  }
340  }
341 
342  // Calculate num_trailing and trailing_certainty.
343  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
344  *num_rebuilt_trailing < trailing_outliers;
345  (*num_rebuilt_trailing)++) {
346  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
347  float char_certainty = word->best_choice->certainty(blob_idx);
348  if (char_certainty > *unlikely_threshold) {
349  break;
350  }
351  if (char_certainty < *trailing_certainty) {
352  *trailing_certainty = char_certainty;
353  }
354  }
355 }
const int kBlnXHeight
Definition: normalis.h:28
WERD_CHOICE * best_choice
Definition: pageres.h:219
int NumBlobs() const
Definition: blobs.h:425
float certainty() const
Definition: ratngs.h:327
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const int kBlnBaselineOffset
Definition: normalis.h:29
inT16 bottom() const
Definition: rect.h:61
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: rect.h:30
double superscript_worse_certainty
inT16 top() const
Definition: rect.h:54
int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 228 of file tesseractclass.h.

228  {
229  return pixGetHeight(pix_binary_);
230  }
int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 225 of file tesseractclass.h.

225  {
226  return pixGetWidth(pix_binary_);
227  }
bool tesseract::Tesseract::init_cube_objects ( bool  load_combiner,
TessdataManager *  tessdata_manager 
)

Definition at line 154 of file cube_control.cpp.

155  {
156  ASSERT_HOST(cube_cntxt_ == NULL);
157  ASSERT_HOST(tess_cube_combiner_ == NULL);
158 
159  // Create the cube context object
160  cube_cntxt_ = CubeRecoContext::Create(this, tessdata_manager, &unicharset);
161  if (cube_cntxt_ == NULL) {
162  if (cube_debug_level > 0) {
163  tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to "
164  "instantiate CubeRecoContext\n");
165  }
166  return false;
167  }
168 
169  // Create the combiner object and load the combiner net for target languages.
170  if (load_combiner) {
171  tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_);
172  if (!tess_cube_combiner_ || !tess_cube_combiner_->LoadCombinerNet()) {
173  delete cube_cntxt_;
174  cube_cntxt_ = NULL;
175  if (tess_cube_combiner_ != NULL) {
176  delete tess_cube_combiner_;
177  tess_cube_combiner_ = NULL;
178  }
179  if (cube_debug_level > 0)
180  tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n");
181  return false;
182  }
183  }
184  return true;
185 }
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
TessdataManager tessdata_manager
Definition: ccutil.h:71
#define ASSERT_HOST(x)
Definition: errcode.h:84
static CubeRecoContext * Create(Tesseract *tess_obj, TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
#define NULL
Definition: host.h:144
FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 36 of file recogtraining.cpp.

// Opens the output file used for recognition training and returns it.
//
// The output name is fname truncated at its last '.' (if any) with ".txt"
// appended; the file is opened through open_file() in "a+" mode and the
// resulting FILE* is returned as-is.
//
// NOTE(review): listing lines 37 and 41 are absent from this extract, so the
// condition that guards lines 38-42 and the statement announced by the
// "Explore all segmentations" comment are not visible here.
36  {
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40  // Explore all segmentations.
42  }
43 
44  STRING output_fname = fname;
// strrchr scans the whole string, so a '.' inside a directory component is
// matched when the file name itself has no extension.
45  const char *lastdot = strrchr(output_fname.string(), '.');
46  if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
47  output_fname += ".txt";
48  FILE *output_file = open_file(output_fname.string(), "a+");
49  return output_file;
50 }
Dict & getDict()
Definition: classify.h:65
bool stopper_no_acceptable_choices
Definition: dict.h:615
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 285 of file tessedit.cpp.

// Loads the language(s) named in `language`.
//
// The language string is split by ParseLanguageString into langs_to_load and
// langs_not_to_load. Walking langs_to_load in order, the first language whose
// init_tesseract_internal() call succeeds is loaded into this object; every
// later success is loaded into a freshly allocated Tesseract that is pushed
// onto sub_langs_. A failed secondary instance is deleted.
//
// Returns -1 when no language could be loaded as the primary, otherwise 0.
//
// NOTE(review): listing lines 319, 330, 350, 353, 356-357, 360, 364 and 369
// are absent from this extract, so some conditions and calls are not visible.
// NOTE(review): the calls below pass set_only_non_debug_params while the
// documented signature names the last parameter set_only_init_params --
// presumably declaration and definition use different parameter names; confirm.
290  {
291  GenericVector<STRING> langs_to_load;
292  GenericVector<STRING> langs_not_to_load;
293  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
294 
// Any sub-languages left over from an earlier initialisation are destroyed.
295  sub_langs_.delete_data_pointers();
296  sub_langs_.clear();
297  // Find the first loadable lang and load into this.
298  // Add any languages that this language requires
299  bool loaded_primary = false;
300  // Load the rest into sub_langs_.
// langs_to_load.size() is re-read on every iteration; the ParseLanguageString
// calls inside the loop receive the same vectors (presumably appending to them).
301  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
302  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
303  const char *lang_str = langs_to_load[lang_index].string();
304  Tesseract *tess_to_init;
305  if (!loaded_primary) {
306  tess_to_init = this;
307  } else {
308  tess_to_init = new Tesseract;
309  }
310 
311  int result = tess_to_init->init_tesseract_internal(
312  arg0, textbase, lang_str, oem, configs, configs_size,
313  vars_vec, vars_values, set_only_non_debug_params);
314 
315  if (!loaded_primary) {
316  if (result < 0) {
317  tprintf("Failed loading language '%s'\n", lang_str);
318  } else {
320  tprintf("Loaded language '%s' as main language\n", lang_str);
321  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
322  &langs_to_load, &langs_not_to_load);
323  loaded_primary = true;
324  }
325  } else {
326  if (result < 0) {
327  tprintf("Failed loading language '%s'\n", lang_str);
328  delete tess_to_init;
329  } else {
331  tprintf("Loaded language '%s' as secondary language\n", lang_str);
332  sub_langs_.push_back(tess_to_init);
333  // Add any languages that this language requires
334  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
335  &langs_to_load, &langs_not_to_load);
336  }
337  }
338  }
339  }
340  if (!loaded_primary) {
341  tprintf("Tesseract couldn't load any languages!\n");
342  return -1; // Couldn't load any language!
343  }
344  if (!sub_langs_.empty()) {
345  // In multilingual mode word ratings have to be directly comparable,
346  // so use the same language model weights for all languages:
347  // use the primary language's params model if
348  // tessedit_use_primary_params_model is set,
349  // otherwise use default language model weights.
351  for (int s = 0; s < sub_langs_.size(); ++s) {
352  sub_langs_[s]->language_model_->getParamsModel().Copy(
354  }
355  tprintf("Using params model of the primary language\n");
358  }
359  } else {
361  for (int s = 0; s < sub_langs_.size(); ++s) {
362  sub_langs_[s]->language_model_->getParamsModel().Clear();
363  }
365  tprintf("Using default language params\n");
366  }
367  }
368 
370  return 0;
371 }
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:249
LanguageModel * language_model_
Definition: wordrec.h:411
ParamsModel & getParamsModel()
void SetupUniversalFontIds()
Definition: tessedit.cpp:439
int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 487 of file tesseractclass.h.

489  {
490  return init_tesseract(datapath, NULL, language, oem,
491  NULL, 0, NULL, NULL, false);
492  }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:285
#define NULL
Definition: host.h:144
int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 389 of file tessedit.cpp.

// Initialises a single language on this instance.
//
// Calls init_tesseract_lang_data() with all arguments forwarded and returns
// -1 when it reports failure. On the visible success path it then calls
// program_editup(textbase, init_tesseract_classifier, init_dict) and
// returns 0.
//
// NOTE(review): listing lines 400-401, 407-408, 412-413 and 415 are absent
// from this extract. The condition for the early "return 0" at line 402 and
// the initialisers of init_tesseract_classifier / init_dict are therefore
// not visible here.
394  {
395  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
396  configs_size, vars_vec, vars_values,
397  set_only_non_debug_params)) {
398  return -1;
399  }
402  return 0;
403  }
404  // If only Cube will be used, skip loading Tesseract classifier's
405  // pre-trained templates.
406  bool init_tesseract_classifier =
409  // If only Cube will be used and if it has its own Unicharset,
410  // skip initializing permuter and loading Tesseract Dawgs.
411  bool init_dict =
414  program_editup(textbase, init_tesseract_classifier, init_dict);
416  return 0; //Normal exit
417 }
TessdataManager tessdata_manager
Definition: ccutil.h:71
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:83
void program_editup(const char *textbase, bool init_classifier, bool init_permute)
Definition: tface.cpp:46
bool SeekToStart(TessdataType tessdata_type)
bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 83 of file tessedit.cpp.

// Loads the language data for one language into this instance.
//
// Steps visible in this listing, in order:
//   - main_setup(arg0, textbase);
//   - lang is set to `language`, or "eng" when `language` is NULL;
//   - tessdata_manager.Init() on language_data_path_prefix + kTrainedDataSuffix;
//   - each of the configs_size entries of `configs` is read with
//     read_config_file();
//   - each (vars_vec[i], vars_values[i]) pair is applied with
//     ParamUtils::SetParam();
//   - if tessedit_write_params_to_file is non-empty, parameters are printed
//     to that file;
//   - a non-default `oem` overrides tessedit_ocr_engine_mode;
//   - the unicharset size is checked against MAX_NUM_CLASSES;
//   - universal and language ambiguities are loaded.
// Returns false on the visible failure paths and true otherwise.
//
// NOTE(review): a large number of listing lines (e.g. 94-96, 101, 106-111,
// 117, 143, 155, 162-163, 170-171, 184, 187-188, 191-192, 195, 204-206,
// 208-210, 218-219, 221-224, 229) are absent from this extract, so several
// conditions, calls and closing braces below have no visible counterpart.
88  {
89  // Set the basename, compute the data directory.
90  main_setup(arg0, textbase);
91 
92  // Set the language data path prefix
93  lang = language != NULL ? language : "eng";
97 
98  // Initialize TessdataManager.
99  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
100  if (!tessdata_manager.Init(tessdata_path.string(),
102  return false;
103  }
104 
105  // If a language specific config file (lang.config) exists, load it in.
112  tprintf("Loaded language config file\n");
113  }
114  }
115 
116  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
118  // Load tesseract variables from config files. This is done after loading
119  // language-specific variables from [lang].traineddata file, so that custom
120  // config files can override values in [lang].traineddata file.
121  for (int i = 0; i < configs_size; ++i) {
122  read_config_file(configs[i], set_params_constraint);
123  }
124 
125  // Set params specified in vars_vec (done after setting params from config
126  // files, so that params in vars_vec can override those from files).
// vars_values is indexed with the same i as vars_vec; nothing visible here
// checks that the two vectors have the same size.
127  if (vars_vec != NULL && vars_values != NULL) {
128  for (int i = 0; i < vars_vec->size(); ++i) {
129  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
130  (*vars_values)[i].string(),
131  set_params_constraint, this->params())) {
132  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
// A parameter that cannot be set terminates the process instead of
// returning false to the caller.
133  exit(1);
134  }
135  }
136  }
137 
138  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
139  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
140  if (params_file != NULL) {
141  ParamUtils::PrintParams(params_file, this->params());
142  fclose(params_file);
144  tprintf("Wrote parameters to %s\n",
145  tessedit_write_params_to_file.string());
146  }
147  } else {
148  tprintf("Failed to open %s for writing params.\n",
149  tessedit_write_params_to_file.string());
150  }
151  }
152 
153  // Determine which ocr engine(s) should be loaded and used for recognition.
154  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
156  tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
157  static_cast<int>(tessedit_ocr_engine_mode));
158  }
159 
160  // If we are only loading the config file (and so not planning on doing any
161  // recognition) then there's nothing else do here.
164  tprintf("Returning after loading config file\n");
165  }
166  return true;
167  }
168 
169  // Load the unicharset
172  return false;
173  }
174  if (unicharset.size() > MAX_NUM_CLASSES) {
175  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
176  return false;
177  }
178  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
179  right_to_left_ = unicharset.major_right_to_left();
180 
181  // Setup initial unichar ambigs table and read universal ambigs.
// encoder_unicharset is a copy of unicharset taken before LoadUniversal() is
// handed a mutable pointer to unicharset.
182  UNICHARSET encoder_unicharset;
183  encoder_unicharset.CopyFrom(unicharset);
185  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
186 
189  TFile ambigs_file;
190  ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
193  encoder_unicharset,
194  &ambigs_file,
196  if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
197  }
198 
199  // The various OcrEngineMode settings (see publictypes.h) determine which
200  // engine-specific data files need to be loaded. Currently everything needs
201  // the base tesseract data, which supplies other useful information, but
202  // alternative engines, such as cube and LSTM are optional.
203 #ifndef ANDROID_BUILD
207  tprintf("Loaded Cube w/out combiner\n");
211  tprintf("Loaded Cube with combiner\n");
212  }
213 #endif
214  // Init ParamsModel.
215  // Load pass1 and pass2 weights (for now these two sets are the same, but in
216  // the future separate sets of weights can be generated).
217  for (int p = ParamsModel::PTRAIN_PASS1;
220  static_cast<ParamsModel::PassEnum>(p));
225  return false;
226  }
227  }
228  }
230 
231  return true;
232 }
FILE * GetDataFilePtr() const
bool major_right_to_left() const
Definition: unicharset.cpp:931
int size() const
Definition: genericvector.h:72
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:66
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
SetParamConstraint
Definition: params.h:36
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
bool LoadFromFp(const char *lang, FILE *fp, inT64 end_offset)
char * tessedit_write_params_to_file
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180
inT64 GetEndOffset(TessdataType tessdata_type) const
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
TessdataManager tessdata_manager
Definition: ccutil.h:71
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:67
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:53
STRING datadir
Definition: ccutil.h:67
LanguageModel * language_model_
Definition: wordrec.h:411
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:74
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:52
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:98
STRING language_data_path_prefix
Definition: ccutil.h:70
ParamsVectors * params()
Definition: ccutil.h:65
bool SeekToStart(TessdataType tessdata_type)
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:53
ParamsModel & getParamsModel()
bool Init(const char *data_file_name, int debug_level)
STRING lang
Definition: ccutil.h:69
Definition: strngs.h:44
void SetPass(PassEnum pass)
Definition: params_model.h:72
#define NULL
Definition: host.h:144
bool use_ambigs_for_adaption
Definition: ccutil.h:93
int size() const
Definition: unicharset.h:297
const char * string() const
Definition: strngs.cpp:193
int ambigs_debug_level
Definition: ccutil.h:89
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language 
)

Definition at line 460 of file tessedit.cpp.

// Initialises language data only: calls init_tesseract_lang_data() with
// OEM_TESSERACT_ONLY, no config files, no variable overrides and the final
// flag false. Returns -1 when that call fails, otherwise 0.
//
// NOTE(review): listing lines 466-467 are absent from this extract, so the
// work done between the language-data load and the final return is not
// visible here.
462  {
463  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
464  NULL, 0, NULL, NULL, false))
465  return -1;
468  return 0;
469 }
TessdataManager tessdata_manager
Definition: ccutil.h:71
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186
Dict & getDict()
Definition: classify.h:65
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:83
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194
#define NULL
Definition: host.h:144
void tesseract::Tesseract::join_words ( WERD_RES *  word,
WERD_RES *  word2,
BlamerBundle *  orig_bb 
) const

Definition at line 240 of file tfacepp.cpp.

// Appends word2 onto word and then deletes word2.
//
// Merged into word, in this order: chopped and rebuilt blobs, the seam array
// (with one new SEAM inserted between the two words), blob widths and gaps,
// the ratings matrix, best_state, raw_choice and the best_choices list.
// When orig_bb is non-NULL it receives the combined blame information via
// JoinBlames() and replaces word->blamer_bundle (the old bundle is deleted).
// Finally the box word and reject map of word are rebuilt.
//
// NOTE(review): listing line 319 (the rest of the JoinBlames() argument
// list) is absent from this extract.
242  {
// Boxes of the last blob of word and the first blob of word2, read before the
// blob vectors are merged. Assumes both blob vectors are non-empty -- TODO
// confirm that callers guarantee this.
243  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
244  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
245  // Tack the word2 outputs onto the end of the word outputs.
246  word->chopped_word->blobs += word2->chopped_word->blobs;
247  word->rebuild_word->blobs += word2->rebuild_word->blobs;
248  word2->chopped_word->blobs.clear();
249  word2->rebuild_word->blobs.clear();
// The joining seam is placed midway between the two boxes horizontally and at
// the mean of their four top/bottom values vertically.
250  TPOINT split_pt;
251  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
252  split_pt.y = (prev_box.top() + prev_box.bottom() +
253  blob_box.top() + blob_box.bottom()) / 4;
254  // Move the word2 seams onto the end of the word1 seam_array.
255  // Since the seam list is one element short, an empty seam marking the
256  // end of the last blob in the first word is needed first.
257  word->seam_array.push_back(new SEAM(0.0f, split_pt));
258  word->seam_array += word2->seam_array;
259  word2->seam_array.truncate(0);
260  // Fix widths and gaps.
261  word->blob_widths += word2->blob_widths;
262  word->blob_gaps += word2->blob_gaps;
263  // Fix the ratings matrix.
264  int rat1 = word->ratings->dimension();
265  int rat2 = word2->ratings->dimension();
266  word->ratings->AttachOnCorner(word2->ratings);
267  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
268  word->best_state += word2->best_state;
269  // Append the word choices.
270  *word->raw_choice += *word2->raw_choice;
271 
272  // How many alt choices from each should we try to get?
273  const int kAltsPerPiece = 2;
274  // When do we start throwing away extra alt choices?
275  const int kTooManyAltChoices = 100;
276 
277  // Construct the cartesian product of the best_choices of word(1) and word2.
278  WERD_CHOICE_LIST joined_choices;
279  WERD_CHOICE_IT jc_it(&joined_choices);
280  WERD_CHOICE_IT bc1_it(&word->best_choices);
281  WERD_CHOICE_IT bc2_it(&word2->best_choices);
282  int num_word1_choices = word->best_choices.length();
283  int total_joined_choices = num_word1_choices;
284  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
285  // word2 choices, and put them in the joined_choices list. The 1st word2
286  // choice gets added to the original word1 choices in-place after we have
287  // finished with them.
288  int bc2_index = 1;
289  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
290  if (total_joined_choices >= kTooManyAltChoices &&
291  bc2_index > kAltsPerPiece)
292  break;
293  int bc1_index = 0;
294  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
295  ++bc1_index, bc1_it.forward()) {
296  if (total_joined_choices >= kTooManyAltChoices &&
297  bc1_index > kAltsPerPiece)
298  break;
299  WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
300  *wc += *bc2_it.data();
301  jc_it.add_after_then_move(wc);
302  ++total_joined_choices;
303  }
304  }
305  // Now that we've filled in as many alternates as we want, paste the best
306  // choice for word2 onto the original word alt_choices.
307  bc1_it.move_to_first();
308  bc2_it.move_to_first();
309  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
310  *bc1_it.data() += *bc2_it.data();
311  }
312  bc1_it.move_to_last();
313  bc1_it.add_list_after(&joined_choices);
314 
315  // Restore the pointer to original blamer bundle and combine blamer
316  // information recorded in the splits.
317  if (orig_bb != NULL) {
318  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
320  delete word->blamer_bundle;
321  word->blamer_bundle = orig_bb;
322  }
323  word->SetupBoxWord();
324  word->reject_map.initialise(word->box_word->length());
// word2 is deleted here; the caller's pointer to it is dangling afterwards.
325  delete word2;
326 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
void truncate(int size)
tesseract::BoxWord * box_word
Definition: pageres.h:250
MATRIX * ratings
Definition: pageres.h:215
int push_back(T object)
REJMAP reject_map
Definition: pageres.h:271
TWERD * chopped_word
Definition: pageres.h:201
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:264
T & back() const
inT16 y
Definition: blobs.h:72
inT16 right() const
Definition: rect.h:75
int dimension() const
Definition: matrix.h:247
#define ASSERT_HOST(x)
Definition: errcode.h:84
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:225
bool wordrec_debug_blamer
Definition: wordrec.h:167
inT16 left() const
Definition: rect.h:68
TWERD * rebuild_word
Definition: pageres.h:244
Definition: blobs.h:50
GenericVector< int > blob_gaps
Definition: pageres.h:208
WERD_CHOICE * raw_choice
Definition: pageres.h:224
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
inT16 x
Definition: blobs.h:71
inT16 bottom() const
Definition: rect.h:61
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int length() const
Definition: boxword.h:85
Definition: rect.h:30
void SetupBoxWord()
Definition: pageres.cpp:843
void initialise(inT16 length)
Definition: rejctmap.cpp:318
GenericVector< int > best_state
Definition: pageres.h:255
#define NULL
Definition: host.h:144
GenericVector< int > blob_widths
Definition: pageres.h:205
Definition: seam.h:44
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void tesseract::Tesseract::make_reject_map ( WERD_RES *  word,
ROW *  row,
inT16  pass 
)
void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW row,
BLOCK block 
)

Definition at line 196 of file fixspace.cpp.

// Classifies every word in `words` that has not been classified yet.
//
// For each WERD_RES that is not part_of_combo and whose box_word is still
// NULL, a WordData is built from (block, row, word) and passed to
// SetupWordPassN(2, ...) followed by classify_word_and_language(2, NULL, ...).
//
// NOTE(review): listing lines 202 and 210 are absent from this extract; the
// prev_word_best_choice_ update announced by the comment at lines 200-201 is
// therefore not visible here.
197  {
198  WERD_RES_IT word_it(&words);
199  WERD_RES *word;
200  // Since we are not using PAGE_RES to iterate over words, we need to update
201  // prev_word_best_choice_ before calling classify_word_pass2().
203  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
204  word = word_it.data();
205  if ((!word->part_of_combo) && (word->box_word == NULL)) {
206  WordData word_data(block, row, word);
207  SetupWordPassN(2, &word_data);
208  classify_word_and_language(2, NULL, &word_data);
209  }
211  }
212 }
tesseract::BoxWord * box_word
Definition: pageres.h:250
WERD_CHOICE * best_choice
Definition: pageres.h:219
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
BOOL8 part_of_combo
Definition: pageres.h:316
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
#define NULL
Definition: host.h:144
void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES *  word,
ROW *  row,
BLOCK *  block 
)

match_word_pass_n

Baseline normalize the word and pass it to Tess.

Definition at line 1549 of file control.cpp.

// Runs recognition pass `pass_n` on one word and post-processes the result.
//
// Returns immediately when word->tess_failed is already set. Otherwise calls
// tess_segment_pass_n(pass_n, word). If that did not fail and the word is not
// flagged W_REP_CHAR, it applies fix_quotes() and fix_hyphens(), reports a
// mismatch between best_choice length and box_word length, sets
// word->tess_accepted from tess_acceptable_word() and calls
// make_reject_map(word, row, pass_n). set_word_fonts(word) is then called
// in every non-early-return case, and raw_choice is asserted to be non-NULL.
//
// `block` is not referenced in the visible lines.
//
// NOTE(review): listing line 1557 is absent from this extract, so whatever
// precedes the fix_hyphens() call is not visible here.
1550  {
1551  if (word->tess_failed) return;
1552  tess_segment_pass_n(pass_n, word);
1553 
1554  if (!word->tess_failed) {
1555  if (!word->word->flag (W_REP_CHAR)) {
1556  word->fix_quotes();
1558  word->fix_hyphens();
1559  /* Dont trust fix_quotes! - though I think I've fixed the bug */
// The mismatch is only reported; processing continues regardless.
1560  if (word->best_choice->length() != word->box_word->length()) {
1561  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1562  " #Blobs=%d\n",
1563  word->best_choice->debug_string().string(),
1564  word->best_choice->length(),
1565  word->box_word->length());
1566 
1567  }
1568  word->tess_accepted = tess_acceptable_word(word);
1569 
1570  // Also sets word->done flag
1571  make_reject_map(word, row, pass_n);
1572  }
1573  }
1574  set_word_fonts(word);
1575 
1576  ASSERT_HOST(word->raw_choice != NULL);
1577 }
BOOL8 tess_accepted
Definition: pageres.h:280
tesseract::BoxWord * box_word
Definition: pageres.h:250
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
void fix_quotes()
Definition: pageres.cpp:1012
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING debug_string() const
Definition: ratngs.h:502
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:39
WERD * word
Definition: pageres.h:175
const int length() const
Definition: boxword.h:85
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1880
BOOL8 tess_failed
Definition: pageres.h:272
void fix_hyphens()
Definition: pageres.cpp:1041
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK *  block,
ROW *  row,
WERD_RES *  word_res 
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 253 of file applybox.cpp.

// Chops word_res as far as chop_one_blob() allows.
//
// If SetupForRecognition() fails, the chopped word is cloned to the rebuild
// word and the function returns. Otherwise one fake BLOB_CHOICE is created
// per blob, chop_one_blob() is called repeatedly until it returns NULL (each
// returned seam is inserted and a new fake choice is added for the right-hand
// piece), and finally CloneChoppedToRebuild() and FakeClassifyWord() are
// called with the accumulated choices.
//
// NOTE(review): listing lines 257-260 and 289 are absent from this extract.
// The remaining SetupForRecognition() arguments are not visible, and line 289
// presumably opens the conditional described by the comment at line 290,
// whose closing brace is at line 305 -- confirm against the full source.
255  {
256  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
261  row, block)) {
262  word_res->CloneChoppedToRebuild();
263  return;
264  }
265  if (chop_debug) {
266  tprintf("Maximally chopping word at:");
267  word_res->word->bounding_box().print();
268  }
269  GenericVector<BLOB_CHOICE*> blob_choices;
270  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
271  float rating = static_cast<float>(MAX_INT8);
272  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
273  // The rating and certainty are not quite arbitrary. Since
274  // select_blob_to_chop uses the worst certainty to choose, they all have
275  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
276  // in here, and then divide by e each time they are chopped, which
277  // should guarantee a set of unequal values for the whole tree of blobs
278  // produced, however much chopping is required. The chops are thus only
279  // limited by the ability of the chopper to find suitable chop points,
280  // and not by the value of the certainties.
281  BLOB_CHOICE* choice =
282  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
283  blob_choices.push_back(choice);
284  rating -= 0.125f;
285  }
286  const double e = exp(1.0); // The base of natural logs.
287  int blob_number;
288  int right_chop_index = 0;
290  // We only chop if the language is not fixed pitch like CJK.
291  SEAM* seam = NULL;
292  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
293  &blob_number)) != NULL) {
294  word_res->InsertSeam(blob_number, seam);
// The left piece keeps its BLOB_CHOICE with the rating divided by e; the right
// piece gets a new choice inserted directly after it.
295  BLOB_CHOICE* left_choice = blob_choices[blob_number];
296  rating = left_choice->rating() / e;
297  left_choice->set_rating(rating);
298  left_choice->set_certainty(-rating);
299  // combine confidence w/ serial #
300  BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
301  rating - 0.125f, -rating, -1,
302  0.0f, 0.0f, 0.0f, BCC_FAKE);
303  blob_choices.insert(right_choice, blob_number + 1);
304  }
305  }
306  word_res->CloneChoppedToRebuild();
307  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
308 }
int size() const
Definition: genericvector.h:72
bool classify_bln_numeric_mode
Definition: classify.h:500
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
int NumBlobs() const
Definition: blobs.h:425
float rating() const
Definition: ratngs.h:79
void insert(T t, int index)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
void CloneChoppedToRebuild()
Definition: pageres.cpp:828
void set_certainty(float newrat)
Definition: ratngs.h:150
Pix * BestPix() const
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
WERD * word
Definition: pageres.h:175
#define MAX_INT8
Definition: host.h:118
bool empty() const
Definition: genericvector.h:84
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define NULL
Definition: host.h:144
void set_rating(float newrat)
Definition: ratngs.h:147
Definition: seam.h:44
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:374
Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 191 of file tesseractclass.h.

191  {
192  Clear();
193  return &pix_binary_;
194  }
Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 244 of file tesseractclass.h.

244  {
245  return &textord_;
246  }
void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)
void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)
BOOL8 tesseract::Tesseract::noise_outlines ( TWERD *  word)

Definition at line 981 of file docqual.cpp.

981  {
982  TBOX box; // BB of outline
983  inT16 outline_count = 0;
984  inT16 small_outline_count = 0;
985  inT16 max_dimension;
986  float small_limit = kBlnXHeight * crunch_small_outlines_size;
987 
988  for (int b = 0; b < word->NumBlobs(); ++b) {
989  TBLOB* blob = word->blobs[b];
990  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
991  outline_count++;
992  box = ol->bounding_box();
993  if (box.height() > box.width())
994  max_dimension = box.height();
995  else
996  max_dimension = box.width();
997  if (max_dimension < small_limit)
998  small_outline_count++;
999  }
1000  }
1001  return small_outline_count >= outline_count;
1002 }
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
int NumBlobs() const
Definition: blobs.h:425
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: rect.h:30
#define NULL
Definition: host.h:144
TESSLINE * outlines
Definition: blobs.h:377
short inT16
Definition: host.h:100
BOOL8 tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 789 of file reject.cpp.

789  {
790  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
791 }
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
BOOL8 tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 785 of file reject.cpp.

785  {
786  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
787 }
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 251 of file tesseractclass.h.

251  {
252  return sub_langs_.size();
253  }
BOOL8 tesseract::Tesseract::one_ell_conflict ( WERD_RES word_res,
BOOL8  update_map 
)

Definition at line 292 of file reject.cpp.

292  {
293  const char *word;
294  const char *lengths;
295  inT16 word_len; //its length
296  inT16 first_alphanum_index_;
297  inT16 first_alphanum_offset_;
298  inT16 i;
299  inT16 offset;
300  BOOL8 non_conflict_set_char; //non conf set a/n?
301  BOOL8 conflict = FALSE;
302  BOOL8 allow_1s;
303  ACCEPTABLE_WERD_TYPE word_type;
304  BOOL8 dict_perm_type;
305  BOOL8 dict_word_ok;
306  int dict_word_type;
307 
308  word = word_res->best_choice->unichar_string().string ();
309  lengths = word_res->best_choice->unichar_lengths().string();
310  word_len = strlen (lengths);
311  /*
312  If there are no occurrences of the conflict set characters then the word
313  is OK.
314  */
315  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
316  return FALSE;
317 
318  /*
319  There is a conflict if there are NO other (confirmed) alphanumerics apart
320  from those in the conflict set.
321  */
322 
323  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
324  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
325  non_conflict_set_char =
326  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
327  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
328  !STRING (conflict_set_I_l_1).contains (word[offset]);
329  if (!non_conflict_set_char) {
330  if (update_map)
331  reject_I_1_L(word_res);
332  return TRUE;
333  }
334 
335  /*
336  If the word is accepted by a dawg permuter, and the first alpha character
337  is "I" or "l", check to see if the alternative is also a dawg word. If it
338  is, then there is a potential error otherwise the word is ok.
339  */
340 
341  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
342  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
344  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
345  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
346  dict_word_type = dict_word(*(word_res->best_choice));
347  dict_word_ok = (dict_word_type > 0) &&
348  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
349 
350  if ((rej_1Il_use_dict_word && dict_word_ok) ||
351  (rej_1Il_trust_permuter_type && dict_perm_type) ||
352  (dict_perm_type && dict_word_ok)) {
353  first_alphanum_index_ = first_alphanum_index (word, lengths);
354  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
355  if (lengths[first_alphanum_index_] == 1 &&
356  word[first_alphanum_offset_] == 'I') {
357  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
358  if (safe_dict_word(word_res) > 0) {
359  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
360  if (update_map)
361  word_res->reject_map[first_alphanum_index_].
362  setrej_1Il_conflict();
363  return TRUE;
364  }
365  else {
366  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
367  return FALSE;
368  }
369  }
370 
371  if (lengths[first_alphanum_index_] == 1 &&
372  word[first_alphanum_offset_] == 'l') {
373  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
374  if (safe_dict_word(word_res) > 0) {
375  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
376  if (update_map)
377  word_res->reject_map[first_alphanum_index_].
378  setrej_1Il_conflict();
379  return TRUE;
380  }
381  else {
382  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
383  return FALSE;
384  }
385  }
386  return FALSE;
387  }
388 
389  /*
390  NEW 1Il code. The old code relied on permuter types too much. In fact,
391  tess will use TOP_CHOICE permute for good things like "palette".
392  In this code the string is examined independently to see if it looks like
393  a well formed word.
394  */
395 
396  /*
397  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
398  dictionary word.
399  */
400  first_alphanum_index_ = first_alphanum_index (word, lengths);
401  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
402  if (lengths[first_alphanum_index_] == 1 &&
403  word[first_alphanum_offset_] == 'l') {
404  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405  if (safe_dict_word(word_res) > 0)
406  return FALSE;
407  else
408  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
409  }
410  else if (lengths[first_alphanum_index_] == 1 &&
411  word[first_alphanum_offset_] == 'I') {
412  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
413  if (safe_dict_word(word_res) > 0)
414  return FALSE;
415  else
416  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
417  }
418  /*
419  For strings containing digits:
420  If there are no alphas OR the numeric permuter liked the word,
421  reject any non 1 conflict chs
422  Else reject all conflict chs
423  */
424  if (word_contains_non_1_digit (word, lengths)) {
425  allow_1s = (alpha_count (word, lengths) == 0) ||
426  (word_res->best_choice->permuter () == NUMBER_PERM);
427 
428  inT16 offset;
429  conflict = FALSE;
430  for (i = 0, offset = 0; word[offset] != '\0';
431  offset += word_res->best_choice->unichar_lengths()[i++]) {
432  if ((!allow_1s || (word[offset] != '1')) &&
433  STRING (conflict_set_I_l_1).contains (word[offset])) {
434  if (update_map)
435  word_res->reject_map[i].setrej_1Il_conflict ();
436  conflict = TRUE;
437  }
438  }
439  return conflict;
440  }
441  /*
442  For anything else. See if it conforms to an acceptable word type. If so,
443  treat accordingly.
444  */
445  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
446  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
447  first_alphanum_index_ = first_alphanum_index (word, lengths);
448  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
449  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
450  if (update_map)
451  word_res->reject_map[first_alphanum_index_].
452  setrej_1Il_conflict ();
453  return TRUE;
454  }
455  else
456  return FALSE;
457  }
458  else if (word_type == AC_UPPER_CASE) {
459  return FALSE;
460  }
461  else {
462  if (update_map)
463  reject_I_1_L(word_res);
464  return TRUE;
465  }
466 }
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:482
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_lengths() const
Definition: ratngs.h:531
unsigned char BOOL8
Definition: host.h:113
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:124
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
const STRING & unichar_string() const
Definition: ratngs.h:524
ALL upper case.
Definition: control.h:38
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495
inT16 first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:469
const UNICHARSET * uch_set
Definition: pageres.h:192
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
uinT8 permuter() const
Definition: ratngs.h:343
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:509
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
ALL lower case.
Definition: control.h:37
#define FALSE
Definition: capi.h:29
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define TRUE
Definition: capi.h:28
ALL but initial lc.
Definition: control.h:39
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:191
const char * string() const
Definition: strngs.cpp:193
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
short inT16
Definition: host.h:100
void tesseract::Tesseract::output_pass ( PAGE_RES_IT page_res_it,
const TBOX target_word_box 
)

Definition at line 68 of file output.cpp.

70  {
71  BLOCK_RES *block_of_last_word;
72  BOOL8 force_eol; //During output
73  BLOCK *nextblock; //block of next word
74  WERD *nextword; //next word
75 
76  page_res_it.restart_page ();
77  block_of_last_word = NULL;
78  while (page_res_it.word () != NULL) {
79  check_debug_pt (page_res_it.word (), 120);
80 
81  if (target_word_box)
82  {
83 
84  TBOX current_word_box=page_res_it.word ()->word->bounding_box();
85  FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
86  if (!target_word_box->contains(center_pt))
87  {
88  page_res_it.forward ();
89  continue;
90  }
91 
92  }
94  block_of_last_word != page_res_it.block ()) {
95  block_of_last_word = page_res_it.block ();
96  }
97 
98  force_eol = (tessedit_write_block_separators &&
99  (page_res_it.block () != page_res_it.next_block ())) ||
100  (page_res_it.next_word () == NULL);
101 
102  if (page_res_it.next_word () != NULL)
103  nextword = page_res_it.next_word ()->word;
104  else
105  nextword = NULL;
106  if (page_res_it.next_block () != NULL)
107  nextblock = page_res_it.next_block ()->block;
108  else
109  nextblock = NULL;
110  //regardless of tilde crunching
111  write_results(page_res_it,
112  determine_newline_type(page_res_it.word()->word,
113  page_res_it.block()->block,
114  nextword, nextblock), force_eol);
115  page_res_it.forward();
116  }
117 }
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:247
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
inT16 right() const
Definition: rect.h:75
BLOCK * block
Definition: pageres.h:99
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
Definition: output.cpp:132
BLOCK_RES * block() const
Definition: pageres.h:739
WERD_RES * forward()
Definition: pageres.h:713
WERD_RES * restart_page()
Definition: pageres.h:680
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
inT16 left() const
Definition: rect.h:68
Definition: ocrblock.h:30
BLOCK_RES * next_block() const
Definition: pageres.h:748
Definition: werd.h:60
inT16 bottom() const
Definition: rect.h:61
WERD * word
Definition: pageres.h:175
Definition: rect.h:30
WERD_RES * next_word() const
Definition: pageres.h:742
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
Definition: points.h:189
WERD_RES * word() const
Definition: pageres.h:733
void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 249 of file tessedit.cpp.

251  {
252  STRING remains(lang_str);
253  while (remains.length() > 0) {
254  // Find the start of the lang code and which vector to add to.
255  const char* start = remains.string();
256  while (*start == '+')
257  ++start;
258  GenericVector<STRING>* target = to_load;
259  if (*start == '~') {
260  target = not_to_load;
261  ++start;
262  }
263  // Find the index of the end of the lang code in string start.
264  int end = strlen(start);
265  const char* plus = strchr(start, '+');
266  if (plus != NULL && plus - start < end)
267  end = plus - start;
268  STRING lang_code(start);
269  lang_code.truncate_at(end);
270  STRING next(start + end);
271  remains = next;
272  // Check whether lang_code is already in the target vector and add.
273  if (!IsStrInList(lang_code, *target)) {
275  tprintf("Adding language '%s' to list\n", lang_code.string());
276  target->push_back(lang_code);
277  }
278  }
279 }
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 337 of file pgedit.cpp.

337  {
338  current_page_res = page_res;
339  if (current_page_res->block_res_list.empty())
340  return;
341 
342  recog_done = false;
343  stillRunning = true;
344 
345  build_image_window(width, height);
348 #ifndef GRAPHICS_DISABLED
349  pe = new ParamsEditor(this, image_win);
350 #endif
351  PGEventHandler pgEventHandler(this);
352 
353  image_win->AddEventHandler(&pgEventHandler);
355 
356  SVMenuNode* svMenuRoot = build_menu_new();
357 
358  svMenuRoot->BuildMenu(image_win);
359  image_win->SetVisible(true);
360 
363 }
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
void turn_on_bit(uinT8 bit_num)
Definition: bits16.h:37
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
void AddMessageBox()
Definition: scrollview.cpp:584
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:121
bool recog_done
Definition: pgedit.cpp:118
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:418
BITS16 word_display_mode
Definition: pgedit.cpp:122
ScrollView * image_win
Definition: pgedit.cpp:107
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:946
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:449
ParamsEditor * pe
Definition: pgedit.cpp:108
bool stillRunning
Definition: pgedit.cpp:109
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:308
#define NULL
Definition: host.h:144
void build_image_window(int width, int height)
Definition: pgedit.cpp:193
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:257
void SetVisible(bool visible)
Definition: scrollview.cpp:555
Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 195 of file tesseractclass.h.

195  {
196  return pix_binary_;
197  }
Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 198 of file tesseractclass.h.

198  {
199  return pix_grey_;
200  }
BOOL8 tesseract::Tesseract::potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
BOOL8  ok_dict_word 
)

Definition at line 545 of file docqual.cpp.

547  {
548  float rating_per_ch;
549  int adjusted_len;
550  const char *str = word->best_choice->unichar_string().string();
551  const char *lengths = word->best_choice->unichar_lengths().string();
552  BOOL8 word_crunchable;
553  int poor_indicator_count = 0;
554 
555  word_crunchable = !crunch_leave_accept_strings ||
556  word->reject_map.length() < 3 ||
558  str, lengths) == AC_UNACCEPTABLE &&
559  !ok_dict_word);
560 
561  adjusted_len = word->reject_map.length();
562  if (adjusted_len > 10)
563  adjusted_len = 10;
564  rating_per_ch = word->best_choice->rating() / adjusted_len;
565 
566  if (rating_per_ch > crunch_pot_poor_rate) {
567  if (crunch_debug > 2) {
568  tprintf("Potential poor rating on \"%s\"\n",
569  word->best_choice->unichar_string().string());
570  }
571  poor_indicator_count++;
572  }
573 
574  if (word_crunchable &&
576  if (crunch_debug > 2) {
577  tprintf("Potential poor cert on \"%s\"\n",
578  word->best_choice->unichar_string().string());
579  }
580  poor_indicator_count++;
581  }
582 
583  if (garbage_level != G_OK) {
584  if (crunch_debug > 2) {
585  tprintf("Potential garbage on \"%s\"\n",
586  word->best_choice->unichar_string().string());
587  }
588  poor_indicator_count++;
589  }
590  return poor_indicator_count >= crunch_pot_indicators;
591 }
float rating() const
Definition: ratngs.h:324
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
const STRING & unichar_lengths() const
Definition: ratngs.h:531
Definition: docqual.h:28
unsigned char BOOL8
Definition: host.h:113
const STRING & unichar_string() const
Definition: ratngs.h:524
float certainty() const
Definition: ratngs.h:327
const UNICHARSET * uch_set
Definition: pageres.h:192
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
Unacceptable word.
Definition: control.h:36
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 193 of file applybox.cpp.

193  {
194  double median_xheight = MedianXHeight(block_list);
195  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
196  // Strip all fuzzy space markers to simplify the PAGE_RES.
197  BLOCK_IT b_it(block_list);
198  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
199  BLOCK* block = b_it.data();
200  ROW_IT r_it(block->row_list());
201  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
202  ROW* row = r_it.data();
203  float diff = fabs(row->x_height() - median_xheight);
204  if (diff > max_deviation) {
205  if (applybox_debug) {
206  tprintf("row xheight=%g, but median xheight = %g\n",
207  row->x_height(), median_xheight);
208  }
209  row->set_x_height(static_cast<float>(median_xheight));
210  }
211  }
212  }
213 }
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:43
#define tprintf(...)
Definition: tprintf.h:31
void set_x_height(float new_xheight)
Definition: ocrrow.h:64
float x_height() const
Definition: ocrrow.h:61
Definition: ocrrow.h:32
Definition: ocrblock.h:30
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 690 of file tesseractclass.cpp.

690  {
692  pixDestroy(&cube_binary_);
693  cube_binary_ = pixClone(pix_binary());
694  // Find the max splitter strategy over all langs.
695  ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
698  for (int i = 0; i < sub_langs_.size(); ++i) {
699  ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
701  static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
702  if (pageseg_strategy > max_pageseg_strategy)
703  max_pageseg_strategy = pageseg_strategy;
704  // Clone the cube image to all the sub langs too.
705  pixDestroy(&sub_langs_[i]->cube_binary_);
706  sub_langs_[i]->cube_binary_ = pixClone(pix_binary());
707  pixDestroy(&sub_langs_[i]->pix_binary_);
708  sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
709  }
710  // Perform shiro-rekha (top-line) splitting and replace the current image by
711  // the newly splitted image.
712  splitter_.set_orig_pix(pix_binary());
713  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
714  if (splitter_.Split(true)) {
715  ASSERT_HOST(splitter_.splitted_image());
716  pixDestroy(&pix_binary_);
717  pix_binary_ = pixClone(splitter_.splitted_image());
718  }
719 }
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg)
Pix * pix_binary() const
int inT32
Definition: host.h:102
void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract osd_tess,
OSResults osr 
)

Definition at line 726 of file tesseractclass.cpp.

727  {
728  // Find the max splitter strategy over all langs.
729  ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
731  static_cast<inT32>(ocr_devanagari_split_strategy));
732  for (int i = 0; i < sub_langs_.size(); ++i) {
733  ShiroRekhaSplitter::SplitStrategy ocr_strategy =
735  static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
736  if (ocr_strategy > max_ocr_strategy)
737  max_ocr_strategy = ocr_strategy;
738  }
739  // Utilize the segmentation information available.
740  splitter_.set_segmentation_block_list(block_list);
741  splitter_.set_ocr_split_strategy(max_ocr_strategy);
742  // Run the splitter for OCR
743  bool split_for_ocr = splitter_.Split(false);
744  // Restore pix_binary to the binarized original pix for future reference.
745  ASSERT_HOST(splitter_.orig_pix());
746  pixDestroy(&pix_binary_);
747  pix_binary_ = pixClone(splitter_.orig_pix());
748  // If the pageseg and ocr strategies are different, refresh the block list
749  // (from the last SegmentImage call) with blobs from the real image to be used
750  // for OCR.
751  if (splitter_.HasDifferentSplitStrategies()) {
752  BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
753  pixGetHeight(pix_binary_));
754  Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
755  splitter_.orig_pix();
756  extract_edges(pix_for_ocr, &block);
757  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
758  }
759  // The splitter isn't needed any more after this, so save memory by clearing.
760  splitter_.Clear();
761 }
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: ocrblock.h:30
void set_segmentation_block_list(BLOCK_LIST *block_list)
bool Split(bool split_for_pageseg)
#define TRUE
Definition: capi.h:28
void set_ocr_split_strategy(SplitStrategy strategy)
int inT32
Definition: host.h:102
void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > &  words)

Definition at line 36 of file par_control.cpp.

36  {
37  // Prepare all the blobs.
39  for (int w = 0; w < words.size(); ++w) {
40  if (words[w].word->ratings != NULL &&
41  words[w].word->ratings->get(0, 0) == NULL) {
42  for (int s = 0; s < words[w].lang_words.size(); ++s) {
43  Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
44  const WERD_RES& word = *words[w].lang_words[s];
45  for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
46  blobs.push_back(BlobData(b, sub, word));
47  }
48  }
49  }
50  }
51  // Pre-classify all the blobs.
52  if (tessedit_parallelize > 1) {
53  #pragma omp parallel for num_threads(10)
54  for (int b = 0; b < blobs.size(); ++b) {
55  *blobs[b].choices =
56  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
57  }
58  } else {
59  // TODO(AMD) parallelize this.
60  for (int b = 0; b < blobs.size(); ++b) {
61  *blobs[b].choices =
62  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
63  }
64  }
65 }
int size() const
Definition: genericvector.h:72
Definition: callcpp.h:34
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:201
int NumBlobs() const
Definition: blobs.h:425
#define NULL
Definition: host.h:144
T & get(int index) const
BOOL8 tesseract::Tesseract::process_cmd_win_event ( inT32  cmd_event,
char *  new_value 
)

Definition at line 397 of file pgedit.cpp.

400  {
401  char msg[160];
402  BOOL8 exit = FALSE;
403 
404  color_mode = CM_RAINBOW;
405 
406  // Run recognition on the full page if needed.
407  switch (cmd_event) {
408  case BLAMER_CMD_EVENT:
412  case SHOW_BOLD_CMD_EVENT:
418  if (!recog_done) {
420  recog_done = true;
421  }
422  break;
423  default:
424  break;
425  }
426 
427  switch (cmd_event) {
428  case NULL_CMD_EVENT:
429  break;
430 
432  case DUMP_WERD_CMD_EVENT:
435  case RECOG_WERDS:
436  case RECOG_PSEUDO:
437  case SHOW_BLOB_FEATURES:
438  mode =(CMD_EVENTS) cmd_event;
439  break;
442  word_config_ = image_win->ShowInputDialog("Config File Name");
443  break;
445  if (new_value[0] == 'T')
447  else
450  break;
451  case BLAMER_CMD_EVENT:
452  if (new_value[0] == 'T')
454  else
458  break;
460  if (new_value[0] == 'T')
462  else
465  break;
466  case POLYGONAL_CMD_EVENT:
467  if (new_value[0] == 'T')
469  else
472  break;
473  case BL_NORM_CMD_EVENT:
474  if (new_value[0] == 'T')
476  else
479  break;
480  case BITMAP_CMD_EVENT:
481  if (new_value[0] == 'T')
483  else
486  break;
489  break;
490  case IMAGE_CMD_EVENT:
491  display_image =(new_value[0] == 'T');
493  break;
494  case BLOCKS_CMD_EVENT:
495  display_blocks =(new_value[0] == 'T');
497  break;
498  case BASELINES_CMD_EVENT:
499  display_baselines =(new_value[0] == 'T');
501  break;
503  color_mode = CM_SUBSCRIPT;
505  break;
507  color_mode = CM_SUPERSCRIPT;
509  break;
511  color_mode = CM_ITALIC;
513  break;
514  case SHOW_BOLD_CMD_EVENT:
515  color_mode = CM_BOLD;
517  break;
519  color_mode = CM_UNDERLINE;
521  break;
523  color_mode = CM_FIXEDPITCH;
525  break;
527  color_mode = CM_SERIF;
529  break;
531  color_mode = CM_SMALLCAPS;
533  break;
535  color_mode = CM_DROPCAPS;
537  break;
538  case REFRESH_CMD_EVENT:
540  break;
541  case QUIT_CMD_EVENT:
542  exit = TRUE;
544  break;
545 
546  default:
547  sprintf(msg, "Unrecognised event " INT32FORMAT "(%s)",
548  cmd_event, new_value);
549  image_win->AddMessage(msg);
550  break;
551  }
552  return exit;
553 }
void turn_on_bit(uinT8 bit_num)
Definition: bits16.h:37
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
BOOL8 word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:761
bool recog_done
Definition: pgedit.cpp:118
#define INT32FORMAT
Definition: host.h:115
BOOL8 display_baselines
Definition: pgedit.cpp:126
unsigned char BOOL8
Definition: host.h:113
void AddMessage(const char *format,...)
Definition: scrollview.cpp:567
BOOL8 display_blocks
Definition: pgedit.cpp:125
Definition: werd.h:55
CMD_EVENTS mode
Definition: pgedit.cpp:116
BOOL8 display_image
Definition: pgedit.cpp:124
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287
Definition: werd.h:51
BITS16 word_display_mode
Definition: pgedit.cpp:122
ScrollView * image_win
Definition: pgedit.cpp:107
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:946
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:740
#define FALSE
Definition: capi.h:29
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:308
#define TRUE
Definition: capi.h:28
static void Exit()
Definition: scrollview.cpp:589
Definition: werd.h:50
#define NULL
Definition: host.h:144
void turn_off_bit(uinT8 bit_num)
Definition: bits16.h:42
void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

The user has done something in the image window (mouse down or up). Work out what it was and act on it. On DOWN, just remember where it happened. On UP, apply the operation defined by the current mode to each word in the selected area.

Definition at line 565 of file pgedit.cpp.

566  {
567  // The following variable should remain static, since it is used by
568  // debug editor, which uses a single Tesseract instance.
569  static ICOORD down;
570  ICOORD up;
571  TBOX selection_box;
572  char msg[80];
573 
574  switch(event.type) {
575 
576  case SVET_SELECTION:
577  if (event.type == SVET_SELECTION) {
578  down.set_x(event.x + event.x_size);
579  down.set_y(event.y + event.y_size);
580  if (mode == SHOW_POINT_CMD_EVENT)
581  show_point(current_page_res, event.x, event.y);
582  }
583 
584  up.set_x(event.x);
585  up.set_y(event.y);
586 
587  selection_box = TBOX(down, up);
588 
589  switch(mode) {
593  selection_box,
595  break;
596  case DUMP_WERD_CMD_EVENT:
598  selection_box,
600  break;
603  selection_box,
605  break;
607  debug_word(current_page_res, selection_box);
608  break;
610  break; // ignore up event
611 
612  case RECOG_WERDS:
613  image_win->AddMessage("Recogging selected words");
615  selection_box,
617  break;
618  case RECOG_PSEUDO:
619  image_win->AddMessage("Recogging selected blobs");
620  recog_pseudo_word(current_page_res, selection_box);
621  break;
622  case SHOW_BLOB_FEATURES:
623  blob_feature_display(current_page_res, selection_box);
624  break;
625 
626  default:
627  sprintf(msg, "Mode %d not yet implemented", mode);
628  image_win->AddMessage(msg);
629  break;
630  }
631  default:
632  break;
633  }
634 }
void set_x(inT16 xin)
rewrite function
Definition: points.h:61
BOOL8 word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:922
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:84
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:641
int y
Definition: scrollview.h:67
void AddMessage(const char *format,...)
Definition: scrollview.cpp:567
CMD_EVENTS mode
Definition: pgedit.cpp:116
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
ScrollView * image_win
Definition: pgedit.cpp:107
BOOL8 word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:717
int x_size
Definition: scrollview.h:68
BOOL8 word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:729
integer coordinate
Definition: points.h:30
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:68
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:960
void set_y(inT16 yin)
rewrite function
Definition: points.h:65
Definition: rect.h:30
SVEventType type
Definition: scrollview.h:64
void show_point(PAGE_RES *page_res, float x, float y)
Definition: pgedit.cpp:655
int y_size
Definition: scrollview.h:69
int x
Definition: scrollview.h:66
void tesseract::Tesseract::process_selected_words ( PAGE_RES page_res,
TBOX selection_box,
BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

33  {
34  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
35  page_res_it.forward()) {
36  WERD* word = page_res_it.word()->word;
37  if (word->bounding_box().overlap(selection_box)) {
38  if (!(this->*word_processor)(&page_res_it))
39  return;
40  }
41  }
42 }
TBOX bounding_box() const
Definition: werd.cpp:160
Definition: werd.h:60
#define NULL
Definition: host.h:144
bool overlap(const TBOX &box) const
Definition: rect.h:345
WERD_RES * word() const
Definition: pageres.h:733
bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 118 of file control.cpp.

121  {
122  if (word_config != NULL) {
123  if (word_box.major_overlap(target_word_box)) {
124  if (backup_config_file_ == NULL) {
125  backup_config_file_ = kBackUpConfigFile;
126  FILE* config_fp = fopen(backup_config_file_, "wb");
127  ParamUtils::PrintParams(config_fp, params());
128  fclose(config_fp);
129  ParamUtils::ReadParamsFile(word_config,
131  params());
132  }
133  } else {
134  if (backup_config_file_ != NULL) {
135  ParamUtils::ReadParamsFile(backup_config_file_,
137  params());
138  backup_config_file_ = NULL;
139  }
140  }
141  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
142  return false;
143  }
144  return true;
145 }
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180
const char *const kBackUpConfigFile
Definition: control.cpp:53
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
ParamsVectors * params()
Definition: ccutil.h:65
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
#define NULL
Definition: host.h:144
void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 140 of file docqual.cpp.

141  {
142  if ((tessedit_good_quality_unrej && good_quality_doc))
143  unrej_good_quality_words(page_res_it);
144  doc_and_block_rejection(page_res_it, good_quality_doc);
145  if (unlv_tilde_crunching) {
146  tilde_crunch(page_res_it);
147  tilde_delete(page_res_it);
148  }
149 }
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:421
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:163
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:235
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:593
void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 52 of file tessedit.cpp.

53  {
54  STRING path = datadir;
55  path += "configs/";
56  path += filename;
57  FILE* fp;
58  if ((fp = fopen(path.string(), "rb")) != NULL) {
59  fclose(fp);
60  } else {
61  path = datadir;
62  path += "tessconfigs/";
63  path += filename;
64  if ((fp = fopen(path.string(), "rb")) != NULL) {
65  fclose(fp);
66  } else {
67  path = filename;
68  }
69  }
70  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
71 }
STRING datadir
Definition: ccutil.h:67
ParamsVectors * params()
Definition: ccutil.h:65
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT *  pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 910 of file control.cpp.

911  {
912  *make_next_word_fuzzy = false;
913  WERD* real_word = pr_it->word()->word;
914  if (real_word->rej_cblob_list()->empty() ||
915  real_word->cblob_list()->empty() ||
916  real_word->rej_cblob_list()->length() > noise_maxperword)
917  return false;
918  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
919  // Get the noise outlines into a vector with matching bool map.
920  GenericVector<C_OUTLINE*> outlines;
921  real_word->GetNoiseOutlines(&outlines);
922  GenericVector<bool> word_wanted;
923  GenericVector<bool> overlapped_any_blob;
924  GenericVector<C_BLOB*> target_blobs;
925  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
926  &word_wanted, &overlapped_any_blob,
927  &target_blobs);
928  // Filter the outlines that overlapped any blob and put them into the word
929  // now. This simplifies the remaining task and also makes it more accurate
930  // as it has more completed blobs to work on.
931  GenericVector<bool> wanted;
932  GenericVector<C_BLOB*> wanted_blobs;
933  GenericVector<C_OUTLINE*> wanted_outlines;
934  int num_overlapped = 0;
935  int num_overlapped_used = 0;
936  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
937  if (overlapped_any_blob[i]) {
938  ++num_overlapped;
939  if (word_wanted[i]) ++num_overlapped_used;
940  wanted.push_back(word_wanted[i]);
941  wanted_blobs.push_back(target_blobs[i]);
942  wanted_outlines.push_back(outlines[i]);
943  outlines[i] = NULL;
944  }
945  }
946  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
947  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
948  &target_blobs);
949  int non_overlapped = 0;
950  int non_overlapped_used = 0;
951  for (int i = 0; i < word_wanted.size(); ++i) {
952  if (word_wanted[i]) ++non_overlapped_used;
953  if (outlines[i] != NULL) ++non_overlapped_used;
954  }
955  if (debug_noise_removal) {
956  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
957  num_overlapped_used, num_overlapped, non_overlapped_used,
958  non_overlapped);
959  real_word->bounding_box().print();
960  }
961  // Now we have decided which outlines we want, put them into the real_word.
962  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
963  make_next_word_fuzzy)) {
964  pr_it->MakeCurrentWordFuzzy();
965  }
966  // TODO(rays) Parts of combos have a deep copy of the real word, and need
967  // to have their noise outlines moved/assigned in the same way!!
968  return num_overlapped_used != 0 || non_overlapped_used != 0;
969 }
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1029
int size() const
Definition: genericvector.h:72
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:548
TBOX bounding_box() const
Definition: werd.cpp:160
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:119
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:976
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1482
Definition: werd.h:60
WERD * word
Definition: pageres.h:175
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:530
#define NULL
Definition: host.h:144
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_RES * word() const
Definition: pageres.h:733
bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if it is 1, just pass 1 is run; if it is 2, passes 2 and higher are run. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond, ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word config file, read for just the target word(s)
target_word_box: rectangle specifying the target word(s) to extract
dopasses: 0 - all passes, 1 - just pass 1, 2 - passes 2 and higher

Definition at line 287 of file control.cpp.

291  {
292  PAGE_RES_IT page_res_it(page_res);
293 
295  tessedit_test_adaption.set_value (TRUE);
296  tessedit_minimal_rejection.set_value (TRUE);
297  }
298 
299  if (dopasses==0 || dopasses==1) {
300  page_res_it.restart_page();
301  // ****************** Pass 1 *******************
302 
303  // If the adaptive classifier is full switch to one we prepared earlier,
304  // ie on the previous page. If the current adaptive classifier is non-empty,
305  // prepare a backup starting at this page, in case it fills up. Do all this
306  // independently for each language.
307  if (AdaptiveClassifierIsFull()) {
309  } else if (!AdaptiveClassifierIsEmpty()) {
311  }
312  // Now check the sub-langs as well.
313  for (int i = 0; i < sub_langs_.size(); ++i) {
314  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
315  sub_langs_[i]->SwitchAdaptiveClassifier();
316  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
317  sub_langs_[i]->StartBackupAdaptiveClassifier();
318  }
319  }
320  // Set up all words ready for recognition, so that if parallelism is on
321  // all the input and output classes are ready to run the classifier.
323  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
324  if (tessedit_parallelize) {
325  PrerecAllWordsPar(words);
326  }
327 
328  stats_.word_count = words.size();
329 
330  stats_.dict_words = 0;
331  stats_.doc_blob_quality = 0;
332  stats_.doc_outline_errs = 0;
333  stats_.doc_char_quality = 0;
334  stats_.good_char_count = 0;
335  stats_.doc_good_char_quality = 0;
336 
337  most_recently_used_ = this;
338  // Run pass 1 word recognition.
339  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
340  // Pass 1 post-processing.
341  for (page_res_it.restart_page(); page_res_it.word() != NULL;
342  page_res_it.forward()) {
343  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
344  fix_rep_char(&page_res_it);
345  continue;
346  }
347 
348  // Count dict words.
349  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
350  ++(stats_.dict_words);
351 
352  // Update misadaption log (we only need to do it on pass 1, since
353  // adaption only happens on this pass).
354  if (page_res_it.word()->blamer_bundle != NULL &&
355  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
356  page_res->misadaption_log.push_back(
357  page_res_it.word()->blamer_bundle->misadaption_debug());
358  }
359  }
360  }
361 
362  if (dopasses == 1) return true;
363 
364  // ****************** Pass 2 *******************
366  AnyTessLang()) {
367  page_res_it.restart_page();
369  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
370  if (tessedit_parallelize) {
371  PrerecAllWordsPar(words);
372  }
373  most_recently_used_ = this;
374  // Run pass 2 word recognition.
375  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
376  }
377 
378  // The next passes can only be run if tesseract has been used, as cube
379  // doesn't set all the necessary outputs in WERD_RES.
380  if (AnyTessLang()) {
381  // ****************** Pass 3 *******************
382  // Fix fuzzy spaces.
384 
387  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
388 
389  // ****************** Pass 4 *******************
392 
393  // ****************** Pass 5,6 *******************
394  rejection_passes(page_res, monitor, target_word_box, word_config);
395 
396 #ifndef ANDROID_BUILD
397  // ****************** Pass 7 *******************
398  // Cube combiner.
399  // If cube is loaded and its combiner is present, run it.
401  run_cube_combiner(page_res);
402  }
403 #endif
404 
405  // ****************** Pass 8 *******************
406  font_recognition_pass(page_res);
407 
408  // ****************** Pass 9 *******************
409  // Check the correctness of the final results.
410  blamer_pass(page_res);
411  script_pos_pass(page_res);
412  }
413 
414  // Write results pass.
416  // This is now redundant, but retained commented so show how to obtain
417  // bounding boxes and style information.
418 
419  // changed by jetsoft
420  // needed for dll to output memory structure
421  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
422  output_pass(page_res_it, target_word_box);
423  // end jetsoft
424  PageSegMode pageseg_mode = static_cast<PageSegMode>(
425  static_cast<int>(tessedit_pageseg_mode));
426  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
427 
428  // Remove empty words, as these mess up the result iterators.
429  for (page_res_it.restart_page(); page_res_it.word() != NULL;
430  page_res_it.forward()) {
431  WERD_RES* word = page_res_it.word();
432  if (word->best_choice == NULL || word->best_choice->length() == 0)
433  page_res_it.DeleteCurrentWord();
434  }
435 
436  if (monitor != NULL) {
437  monitor->progress = 100;
438  }
439  return true;
440 }
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
void run_cube_combiner(PAGE_RES *page_res)
int size() const
Definition: genericvector.h:72
bool right_to_left() const
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
int push_back(T object)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:590
bool AdaptiveClassifierIsFull() const
Definition: classify.h:284
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:686
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:710
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
bool tessedit_enable_bigram_correction
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:207
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2015
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1958
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:442
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:628
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:285
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:148
WERD * word
Definition: pageres.h:175
bool AnyTessLang() const
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1624
inT16 progress
Definition: ocrclass.h:115
#define TRUE
Definition: capi.h:28
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:36
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:359
#define NULL
Definition: host.h:144
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:644
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
BOOL8 tesseract::Tesseract::recog_interactive ( PAGE_RES_IT *  pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 84 of file control.cpp.

84  {
85  inT16 char_qual;
86  inT16 good_char_qual;
87 
88  WordData word_data(*pr_it);
89  SetupWordPassN(2, &word_data);
90  classify_word_and_language(2, pr_it, &word_data);
92  WERD_RES* word_res = pr_it->word();
93  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
94  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
95  "char_quality: %d; good_char_quality: %d\n",
96  word_res->reject_map.length(),
97  word_blob_quality(word_res, pr_it->row()->row),
98  word_outline_errs(word_res), char_qual, good_char_qual);
99  }
100  return TRUE;
101 }
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
inT32 length() const
Definition: rejctmap.h:237
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
ROW_RES * row() const
Definition: pageres.h:736
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
ROW * row
Definition: pageres.h:127
#define TRUE
Definition: capi.h:28
WERD_RES * word() const
Definition: pageres.h:733
short inT16
Definition: host.h:100
void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES page_res,
TBOX selection_box 
)

Definition at line 68 of file control.cpp.

69  {
70  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
71  if (it != NULL) {
73  it->DeleteCurrentWord();
74  delete it;
75  }
76 }
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:84
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
void DeleteCurrentWord()
Definition: pageres.cpp:1449
#define NULL
Definition: host.h:144
void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 79 of file recogtraining.cpp.

82  {
83  STRING box_fname = fname;
84  const char *lastdot = strrchr(box_fname.string(), '.');
85  if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
86  box_fname += ".box";
87  // read_next_box() will close box_file
88  FILE *box_file = open_file(box_fname.string(), "r");
89 
90  PAGE_RES_IT page_res_it;
91  page_res_it.page_res = page_res;
92  page_res_it.restart_page();
93  STRING label;
94 
95  // Process all the words on this page.
96  TBOX tbox; // tesseract-identified box
97  TBOX bbox; // box from the box file
98  bool keep_going;
99  int line_number = 0;
100  int examined_words = 0;
101  do {
102  keep_going = read_t(&page_res_it, &tbox);
103  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
104  &bbox);
105  // Align bottom left points of the TBOXes.
106  while (keep_going &&
107  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
108  if (bbox.bottom() < tbox.bottom()) {
109  page_res_it.forward();
110  keep_going = read_t(&page_res_it, &tbox);
111  } else {
112  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
113  &bbox);
114  }
115  }
116  while (keep_going &&
117  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
118  if (bbox.left() > tbox.left()) {
119  page_res_it.forward();
120  keep_going = read_t(&page_res_it, &tbox);
121  } else {
122  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
123  &bbox);
124  }
125  }
126  // OCR the word if top right points of the TBOXes are similar.
127  if (keep_going &&
128  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
129  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
130  ambigs_classify_and_output(label.string(), &page_res_it, output_file);
131  examined_words++;
132  }
133  page_res_it.forward();
134  } while (keep_going);
135  fclose(box_file);
136 
137  // Set up scripts on all of the words that did not get sent to
138  // ambigs_classify_and_output. They all should have, but if all the
139  // werd_res's don't get uch_sets, tesseract will crash when you try
140  // to iterate over them. :-(
141  int total_words = 0;
142  for (page_res_it.restart_page(); page_res_it.block() != NULL;
143  page_res_it.forward()) {
144  if (page_res_it.word()) {
145  if (page_res_it.word()->uch_set == NULL)
146  page_res_it.word()->SetupFake(unicharset);
147  total_words++;
148  }
149  }
150  if (examined_words < 0.85 * total_words) {
151  tprintf("TODO(antonova): clean up recog_training_segmented; "
152  " It examined only a small fraction of the ambigs image.\n");
153  }
154  tprintf("recog_training_segmented: examined %d / %d words.\n",
155  examined_words, total_words);
156 }
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:118
PAGE_RES * page_res
Definition: pageres.h:658
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox)
Definition: rect.h:30
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const inT16 kMaxBoxEdgeDiff
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
void tesseract::Tesseract::recog_word ( WERD_RES *  word)

Definition at line 46 of file tfacepp.cpp.

46  {
49  if (classify_debug_level) tprintf("No truth for word - skipping\n");
50  word->tess_failed = true;
51  return;
52  }
55  word->SetupBoxWord();
56  if (word->best_choice->length() != word->box_word->length()) {
57  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
58  "Strlen=%d; #Blobs=%d\n",
59  word->best_choice->debug_string().string(),
60  word->best_choice->length(), word->box_word->length());
61  }
62  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
63  // Check that the ratings matrix size matches the sum of all the
64  // segmentation states.
65  if (!word->StatesAllValid()) {
66  tprintf("Not all words have valid states relative to ratings matrix!!");
67  word->DebugWordChoices(true, NULL);
68  ASSERT_HOST(word->StatesAllValid());
69  }
71  /* Override the permuter type if a straight dictionary check disagrees. */
72  uinT8 perm_type = word->best_choice->permuter();
73  if ((perm_type != SYSTEM_DAWG_PERM) &&
74  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
75  uinT8 real_dict_perm_type = dict_word(*word->best_choice);
76  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
77  (real_dict_perm_type == FREQ_DAWG_PERM) ||
78  (real_dict_perm_type == USER_DAWG_PERM)) &&
80  word->best_choice->unichar_lengths().string()) > 0)) {
81  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
82  }
83  }
85  perm_type != word->best_choice->permuter()) {
86  tprintf("Permuter Type Flipped from %d to %d\n",
87  perm_type, word->best_choice->permuter());
88  }
89  }
90  // Factored out from control.cpp
91  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
92  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
93  static_cast<int>(strspn(word->best_choice->unichar_string().string(),
94  " ")) == word->best_choice->length()) {
95  word->tess_failed = true;
96  word->reject_map.initialise(word->box_word->length());
98  } else {
99  word->tess_failed = false;
100  }
101 }
bool StatesAllValid()
Definition: pageres.cpp:449
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:471
tesseract::BoxWord * box_word
Definition: pageres.h:250
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
const STRING & unichar_lengths() const
Definition: ratngs.h:531
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:124
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495
uinT8 permuter() const
Definition: ratngs.h:343
const STRING debug_string() const
Definition: ratngs.h:502
WERD_CHOICE * raw_choice
Definition: pageres.h:224
bool wordrec_skip_no_truth_words
Definition: wordrec.h:166
void rej_word_tess_failure()
Definition: rejctmap.cpp:425
bool empty() const
Definition: genericvector.h:84
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int length() const
Definition: boxword.h:85
BOOL8 tess_failed
Definition: pageres.h:272
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
void SetupBoxWord()
Definition: pageres.cpp:843
void initialise(inT16 length)
Definition: rejctmap.cpp:318
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
BlamerBundle * blamer_bundle
Definition: pageres.h:230
unsigned char uinT8
Definition: host.h:99
void tesseract::Tesseract::recog_word_recursive ( WERD_RES *  word)

Definition at line 110 of file tfacepp.cpp.

110  {
111  int word_length = word->chopped_word->NumBlobs(); // no of blobs
112  if (word_length > MAX_UNDIVIDED_LENGTH) {
113  return split_and_recog_word(word);
114  }
115  cc_recog(word);
116  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
117 
118  // Do sanity checks and minor fixes on best_choice.
119  if (word->best_choice->length() > word_length) {
120  word->best_choice->make_bad(); // should never happen
121  tprintf("recog_word: Discarded long string \"%s\""
122  " (%d characters vs %d blobs)\n",
123  word->best_choice->unichar_string().string(),
124  word->best_choice->length(), word_length);
125  tprintf("Word is at:");
126  word->word->bounding_box().print();
127  }
128  if (word->best_choice->length() < word_length) {
129  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
130  while (word->best_choice->length() < word_length) {
131  word->best_choice->append_unichar_id(space_id, 1, 0.0,
132  word->best_choice->certainty());
133  }
134  }
135 }
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:144
const STRING & unichar_string() const
Definition: ratngs.h:524
int NumBlobs() const
Definition: blobs.h:425
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:440
float certainty() const
Definition: ratngs.h:327
TWERD * rebuild_word
Definition: pageres.h:244
int UNICHAR_ID
Definition: unichar.h:33
WERD * word
Definition: pageres.h:175
void cc_recog(WERD_RES *word)
Definition: tface.cpp:109
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:35
const char * string() const
Definition: strngs.cpp:193
bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC *  monitor,
PAGE_RES_IT *  pr_it,
GenericVector< WordData > *  words 
)

Definition at line 207 of file control.cpp.

209  {
210  // TODO(rays) Before this loop can be parallelized (it would yield a massive
211  // speed-up) all remaining member globals need to be converted to local/heap
212  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
213  // added. The results will be significantly different with adaption on, and
214  // deterioration will need investigation.
215  pr_it->restart_page();
216  for (int w = 0; w < words->size(); ++w) {
217  WordData* word = &(*words)[w];
218  if (w > 0) word->prev_word = &(*words)[w - 1];
219  if (monitor != NULL) {
220  monitor->ocr_alive = TRUE;
221  if (pass_n == 1)
222  monitor->progress = 30 + 50 * w / words->size();
223  else
224  monitor->progress = 80 + 10 * w / words->size();
225  if (monitor->deadline_exceeded() ||
226  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
227  words->size()))) {
228  // Timeout. Fake out the rest of the words.
229  for (; w < words->size(); ++w) {
230  (*words)[w].word->SetupFake(unicharset);
231  }
232  return false;
233  }
234  }
235  if (word->word->tess_failed) {
236  int s;
237  for (s = 0; s < word->lang_words.size() &&
238  word->lang_words[s]->tess_failed; ++s) {}
239  // If all are failed, skip it. Image words are skipped by this test.
240  if (s > word->lang_words.size()) continue;
241  }
242  // Sync pr_it with the wth WordData.
243  while (pr_it->word() != NULL && pr_it->word() != word->word)
244  pr_it->forward();
245  ASSERT_HOST(pr_it->word() != NULL);
246  bool make_next_word_fuzzy = false;
247  if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
248  // Needs to be setup again to see the new outlines in the chopped_word.
249  SetupWordPassN(pass_n, word);
250  }
251 
252  classify_word_and_language(pass_n, pr_it, word);
254  tprintf("Pass%d: %s [%s]\n", pass_n,
255  word->word->best_choice->unichar_string().string(),
256  word->word->best_choice->debug_string().string());
257  }
258  pr_it->forward();
259  if (make_next_word_fuzzy && pr_it->word() != NULL) {
260  pr_it->MakeCurrentWordFuzzy();
261  }
262  }
263  return true;
264 }
int size() const
Definition: genericvector.h:72
volatile inT8 ocr_alive
Definition: ocrclass.h:117
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void * cancel_this
Definition: ocrclass.h:120
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:910
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD_RES * forward()
Definition: pageres.h:713
CANCEL_FUNC cancel
Definition: ocrclass.h:119
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1482
WERD_RES * restart_page()
Definition: pageres.h:680
bool deadline_exceeded() const
Definition: ocrclass.h:144
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
WERD * word
Definition: pageres.h:175
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
inT16 progress
Definition: ocrclass.h:115
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
WERD_RES * word() const
Definition: pageres.h:733
void tesseract::Tesseract::recognize_page ( STRING image_name)
void tesseract::Tesseract::reject_edge_blobs ( WERD_RES *  word)

Definition at line 263 of file reject.cpp.

263  {
264  TBOX word_box = word->word->bounding_box();
265  // Use the box_word as it is already denormed back to image coordinates.
266  int blobcount = word->box_word->length();
267 
268  if (word_box.left() < tessedit_image_border ||
269  word_box.bottom() < tessedit_image_border ||
270  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
271  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
272  ASSERT_HOST(word->reject_map.length() == blobcount);
273  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
274  TBOX blob_box = word->box_word->BlobBox(blobindex);
275  if (blob_box.left() < tessedit_image_border ||
276  blob_box.bottom() < tessedit_image_border ||
277  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
278  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
279  word->reject_map[blobindex].setrej_edge_char();
280  // Close to edge
281  }
282  }
283  }
284 }
tesseract::BoxWord * box_word
Definition: pageres.h:250
inT32 length() const
Definition: rejctmap.h:237
REJMAP reject_map
Definition: pageres.h:271
int ImageHeight() const
const TBOX & BlobBox(int index) const
Definition: boxword.h:88
TBOX bounding_box() const
Definition: werd.cpp:160
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
int ImageWidth() const
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
WERD * word
Definition: pageres.h:175
const int length() const
Definition: boxword.h:85
Definition: rect.h:30
inT16 top() const
Definition: rect.h:54
void tesseract::Tesseract::reject_I_1_L ( WERD_RES *  word)

Definition at line 191 of file reject.cpp.

191  {
192  inT16 i;
193  inT16 offset;
194 
195  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
196  offset += word->best_choice->unichar_lengths()[i], i += 1) {
198  contains (word->best_choice->unichar_string()[offset])) {
199  //rej 1Il conflict
200  word->reject_map[i].setrej_1Il_conflict ();
201  }
202  }
203 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_lengths() const
Definition: ratngs.h:531
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: strngs.h:44
short inT16
Definition: host.h:100
void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES *  word)

Definition at line 573 of file reject.cpp.

573  {
574  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
575 
576  if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
579 }
inT32 length() const
Definition: rejctmap.h:237
REJMAP reject_map
Definition: pageres.h:271
void rej_word_mostly_rej()
Definition: rejctmap.cpp:479
double rej_whole_of_mostly_reject_word_fract
inT16 reject_count()
Definition: rejctmap.h:243
void tesseract::Tesseract::rejection_passes ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config 
)

Definition at line 590 of file control.cpp.

593  {
594  PAGE_RES_IT page_res_it(page_res);
595  // ****************** Pass 5 *******************
596  // Gather statistics on rejects.
597  int word_index = 0;
598  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
600  WERD_RES* word = page_res_it.word();
601  word_index++;
602  if (monitor != NULL) {
603  monitor->ocr_alive = TRUE;
604  monitor->progress = 95 + 5 * word_index / stats_.word_count;
605  }
606  if (word->rebuild_word == NULL) {
607  // Word was not processed by tesseract.
608  page_res_it.forward();
609  continue;
610  }
611  check_debug_pt(word, 70);
612 
613  // changed by jetsoft
614  // specific to its needs to extract one word when need
615  if (target_word_box &&
617  *target_word_box, word_config, 4)) {
618  page_res_it.forward();
619  continue;
620  }
621  // end jetsoft
622 
623  page_res_it.rej_stat_word();
624  int chars_in_word = word->reject_map.length();
625  int rejects_in_word = word->reject_map.reject_count();
626 
627  int blob_quality = word_blob_quality(word, page_res_it.row()->row);
628  stats_.doc_blob_quality += blob_quality;
629  int outline_errs = word_outline_errs(word);
630  stats_.doc_outline_errs += outline_errs;
631  inT16 all_char_quality;
632  inT16 accepted_all_char_quality;
633  word_char_quality(word, page_res_it.row()->row,
634  &all_char_quality, &accepted_all_char_quality);
635  stats_.doc_char_quality += all_char_quality;
636  uinT8 permuter_type = word->best_choice->permuter();
637  if ((permuter_type == SYSTEM_DAWG_PERM) ||
638  (permuter_type == FREQ_DAWG_PERM) ||
639  (permuter_type == USER_DAWG_PERM)) {
640  stats_.good_char_count += chars_in_word - rejects_in_word;
641  stats_.doc_good_char_quality += accepted_all_char_quality;
642  }
643  check_debug_pt(word, 80);
645  (blob_quality == 0) && (outline_errs >= chars_in_word))
647  check_debug_pt(word, 90);
648  page_res_it.forward();
649  }
650 
652  tprintf
653  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
654  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
655  page_res->char_count, page_res->rej_count,
656  page_res->rej_count / static_cast<float>(page_res->char_count),
657  stats_.doc_blob_quality,
658  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
659  stats_.doc_outline_errs,
660  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
661  stats_.doc_char_quality,
662  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
663  stats_.doc_good_char_quality,
664  (stats_.good_char_count > 0) ?
665  (stats_.doc_good_char_quality /
666  static_cast<float>(stats_.good_char_count)) : 0.0);
667  }
668  BOOL8 good_quality_doc =
669  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
670  quality_rej_pc) &&
671  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
672  quality_blob_pc) &&
673  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
675  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
677 
678  // ****************** Pass 6 *******************
679  // Do whole document or whole block rejection pass
680  if (!tessedit_test_adaption) {
682  quality_based_rejection(page_res_it, good_quality_doc);
683  }
684 }
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
inT32 char_count
Definition: pageres.h:60
volatile inT8 ocr_alive
Definition: ocrclass.h:117
#define tprintf(...)
Definition: tprintf.h:31
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
#define LOC_MM_ADAPT
Definition: errcode.h:52
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:140
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118
TWERD * rebuild_word
Definition: pageres.h:244
uinT8 permuter() const
Definition: ratngs.h:343
inT32 rej_count
Definition: pageres.h:61
void rej_word_bad_quality()
Definition: rejctmap.cpp:488
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
WERD * word
Definition: pageres.h:175
inT16 progress
Definition: ocrclass.h:115
inT16 reject_count()
Definition: rejctmap.h:243
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
short inT16
Definition: host.h:100
unsigned char uinT8
Definition: host.h:99
BOOL8 tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES * word,
ROW * row 
)

Definition at line 582 of file reject.cpp.

582  {
583  inT16 char_quality;
584  inT16 accepted_char_quality;
585 
586  if (word->best_choice->unichar_lengths().length() <= 1)
587  return FALSE;
588 
590  contains(word->best_choice->unichar_string()[0]))
591  return FALSE;
592 
593  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
594  for (int i = 1; i < word->best_choice->length(); ++i) {
595  if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
596  }
597 
598  word_char_quality(word, row, &char_quality, &accepted_char_quality);
599 
600  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
601  (char_quality == accepted_char_quality))
602  return TRUE;
603  else
604  return FALSE;
605 }
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
char * ok_repeated_ch_non_alphanum_wds
const STRING & unichar_lengths() const
Definition: ratngs.h:531
inT32 length() const
Definition: strngs.cpp:188
const STRING & unichar_string() const
Definition: ratngs.h:524
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int UNICHAR_ID
Definition: unichar.h:33
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
Definition: strngs.h:44
short inT16
Definition: host.h:100
void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Logs a bad box by line in the box file and box coords.

Definition at line 764 of file applybox.cpp.

765  {
766  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
767  boxfile_lineno + 1, box_ch,
768  box.left(), box.bottom(), box.right(), box.top(), err_msg);
769 }
#define tprintf(...)
Definition: tprintf.h:31
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
inT16 top() const
Definition: rect.h:54
void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES * word,
WERD_RES * new_word 
)

Definition at line 1381 of file control.cpp.

1382  {
1383  tprintf("New XHT Match:%s = %s ",
1384  word->best_choice->unichar_string().string(),
1385  word->best_choice->debug_string().string());
1386  word->reject_map.print(debug_fp);
1387  tprintf(" -> %s = %s ",
1388  new_word->best_choice->unichar_string().string(),
1389  new_word->best_choice->debug_string().string());
1390  new_word->reject_map.print(debug_fp);
1391  tprintf(" %s->%s %s %s\n",
1392  word->guessed_x_ht ? "GUESS" : "CERT",
1393  new_word->guessed_x_ht ? "GUESS" : "CERT",
1394  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1395  accept_new_word ? "ACCEPTED" : "");
1396 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
const STRING & unichar_string() const
Definition: ratngs.h:524
const STRING debug_string() const
Definition: ratngs.h:502
BOOL8 guessed_x_ht
Definition: pageres.h:292
FILE * debug_fp
Definition: tessvars.cpp:24
void print(FILE *fp)
Definition: rejctmap.cpp:394
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES * page_res)

Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.

Definition at line 509 of file applybox.cpp.

509  {
510  PAGE_RES_IT pr_it(page_res);
511  WERD_RES* word_res;
512  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
513  WERD* word = word_res->word;
514  if (word->text() == NULL || word->text()[0] == '\0')
515  continue; // Ignore words that have no text.
516  // Convert the correct text to a vector of UNICHAR_ID
517  GenericVector<UNICHAR_ID> target_text;
518  if (!ConvertStringToUnichars(word->text(), &target_text)) {
519  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
520  word->text());
521  pr_it.DeleteCurrentWord();
522  continue;
523  }
524  if (!FindSegmentation(target_text, word_res)) {
525  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
526  word->text());
527  pr_it.DeleteCurrentWord();
528  continue;
529  }
530  }
531 }
#define tprintf(...)
Definition: tprintf.h:31
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:559
Definition: werd.h:60
const char * text() const
Definition: werd.h:125
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:535
bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES * page_res,
const TBOX * prev_box,
const TBOX & box,
const TBOX & next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 340 of file applybox.cpp.

342  {
343  if (applybox_debug > 1) {
344  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
345  }
346  PAGE_RES_IT page_res_it(page_res);
347  WERD_RES* word_res;
348  for (word_res = page_res_it.word(); word_res != NULL;
349  word_res = page_res_it.forward()) {
350  if (!word_res->box_word->bounding_box().major_overlap(box))
351  continue;
352  if (applybox_debug > 1) {
353  tprintf("Checking word box:");
354  word_res->box_word->bounding_box().print();
355  }
356  int word_len = word_res->box_word->length();
357  for (int i = 0; i < word_len; ++i) {
358  TBOX char_box = TBOX();
359  int blob_count = 0;
360  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
361  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
362  if (!blob_box.major_overlap(box))
363  break;
364  if (word_res->correct_text[i + blob_count].length() > 0)
365  break; // Blob is claimed already.
366  double current_box_miss_metric = BoxMissMetric(blob_box, box);
367  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
368  if (applybox_debug > 2) {
369  tprintf("Checking blob:");
370  blob_box.print();
371  tprintf("Current miss metric = %g, next = %g\n",
372  current_box_miss_metric, next_box_miss_metric);
373  }
374  if (current_box_miss_metric > next_box_miss_metric)
375  break; // Blob is a better match for next box.
376  char_box += blob_box;
377  }
378  if (blob_count > 0) {
379  if (applybox_debug > 1) {
380  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
381  }
382  if (!char_box.almost_equal(box, 3) &&
383  (box.x_gap(next_box) < -3 ||
384  (prev_box != NULL && prev_box->x_gap(box) < -3))) {
385  return false;
386  }
387  // We refine just the box_word, best_state and correct_text here.
388  // The rebuild_word is made in TidyUp.
389  // blob_count blobs are put together to match the box. Merge the
390  // box_word boxes, save the blob_count in the state and the text.
391  word_res->box_word->MergeBoxes(i, i + blob_count);
392  word_res->best_state[i] = blob_count;
393  word_res->correct_text[i] = correct_text;
394  if (applybox_debug > 2) {
395  tprintf("%d Blobs match: blob box:", blob_count);
396  word_res->box_word->BlobBox(i).print();
397  tprintf("Matches box:");
398  box.print();
399  tprintf("With next box:");
400  next_box.print();
401  }
402  // Eliminated best_state and correct_text entries for the consumed
403  // blobs.
404  for (int j = 1; j < blob_count; ++j) {
405  word_res->best_state.remove(i + 1);
406  word_res->correct_text.remove(i + 1);
407  }
408  // Assume that no box spans multiple source words, so we are done with
409  // this box.
410  if (applybox_debug > 1) {
411  tprintf("Best state = ");
412  for (int j = 0; j < word_res->best_state.size(); ++j) {
413  tprintf("%d ", word_res->best_state[j]);
414  }
415  tprintf("\n");
416  tprintf("Correct text = [[ ");
417  for (int j = 0; j < word_res->correct_text.size(); ++j) {
418  tprintf("%s ", word_res->correct_text[j].string());
419  }
420  tprintf("]]\n");
421  }
422  return true;
423  }
424  }
425  }
426  if (applybox_debug > 0) {
427  tprintf("FAIL!\n");
428  }
429  return false; // Failure.
430 }
int size() const
Definition: genericvector.h:72
tesseract::BoxWord * box_word
Definition: pageres.h:250
int length() const
Definition: genericvector.h:79
#define tprintf(...)
Definition: tprintf.h:31
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
const TBOX & BlobBox(int index) const
Definition: boxword.h:88
void print() const
Definition: rect.h:270
void MergeBoxes(int start, int end)
Definition: boxword.cpp:134
GenericVector< STRING > correct_text
Definition: pageres.h:259
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
WERD * word
Definition: pageres.h:175
void remove(int index)
const int length() const
Definition: boxword.h:85
int x_gap(const TBOX &box) const
Definition: rect.h:217
Definition: rect.h:30
const TBOX & bounding_box() const
Definition: boxword.h:82
GenericVector< int > best_state
Definition: pageres.h:255
#define NULL
Definition: host.h:144
bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX & box,
const TBOX & next_box,
const char *  correct_text 
)

Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an overlapping blob for a box.

Definition at line 438 of file applybox.cpp.

440  {
441  if (applybox_debug > 1) {
442  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
443  }
444  WERD* new_word = NULL;
445  BLOCK_IT b_it(block_list);
446  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
447  BLOCK* block = b_it.data();
448  if (!box.major_overlap(block->bounding_box()))
449  continue;
450  ROW_IT r_it(block->row_list());
451  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
452  ROW* row = r_it.data();
453  if (!box.major_overlap(row->bounding_box()))
454  continue;
455  WERD_IT w_it(row->word_list());
456  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
457  WERD* word = w_it.data();
458  if (applybox_debug > 2) {
459  tprintf("Checking word:");
460  word->bounding_box().print();
461  }
462  if (word->text() != NULL && word->text()[0] != '\0')
463  continue; // Ignore words that are already done.
464  if (!box.major_overlap(word->bounding_box()))
465  continue;
466  C_BLOB_IT blob_it(word->cblob_list());
467  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
468  blob_it.forward()) {
469  C_BLOB* blob = blob_it.data();
470  TBOX blob_box = blob->bounding_box();
471  if (!blob_box.major_overlap(box))
472  continue;
473  double current_box_miss_metric = BoxMissMetric(blob_box, box);
474  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
475  if (applybox_debug > 2) {
476  tprintf("Checking blob:");
477  blob_box.print();
478  tprintf("Current miss metric = %g, next = %g\n",
479  current_box_miss_metric, next_box_miss_metric);
480  }
481  if (current_box_miss_metric > next_box_miss_metric)
482  continue; // Blob is a better match for next box.
483  if (applybox_debug > 2) {
484  tprintf("Blob match: blob:");
485  blob_box.print();
486  tprintf("Matches box:");
487  box.print();
488  tprintf("With next box:");
489  next_box.print();
490  }
491  if (new_word == NULL) {
492  // Make a new word with a single blob.
493  new_word = word->shallow_copy();
494  new_word->set_text(correct_text);
495  w_it.add_to_end(new_word);
496  }
497  C_BLOB_IT new_blob_it(new_word->cblob_list());
498  new_blob_it.add_to_end(blob_it.extract());
499  }
500  }
501  }
502  }
503  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
504  return new_word != NULL;
505 }
void set_text(const char *new_text)
Definition: werd.h:126
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
Definition: ocrrow.h:32
Definition: ocrblock.h:30
TBOX bounding_box() const
Definition: ocrrow.h:85
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
Definition: werd.h:60
WERD * shallow_copy()
Definition: werd.cpp:352
const char * text() const
Definition: werd.h:125
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
#define NULL
Definition: host.h:144
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 660 of file tesseractclass.cpp.

660  {
662  for (int i = 0; i < sub_langs_.size(); ++i) {
663  sub_langs_[i]->ResetAdaptiveClassifierInternal();
664  }
665 }
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:613
void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 668 of file tesseractclass.cpp.

668  {
670  for (int i = 0; i < sub_langs_.size(); ++i) {
671  sub_langs_[i]->getDict().ResetDocumentDictionary();
672  }
673 }
Dict & getDict()
Definition: classify.h:65
void ResetDocumentDictionary()
Definition: dict.h:301
const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 187 of file tesseractclass.h.

187  {
188  return reskew_;
189  }
int tesseract::Tesseract::RetryWithLanguage ( const WordData & word_data,
WordRecognizer  recognizer,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 869 of file control.cpp.

872  {
873  bool debug = classify_debug_level || cube_debug_level;
874  if (debug) {
875  tprintf("Trying word using lang %s, oem %d\n",
876  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
877  }
878  // Run the recognizer on the word.
879  PointerVector<WERD_RES> new_words;
880  (this->*recognizer)(word_data, in_word, &new_words);
881  if (new_words.empty()) {
882  // Transfer input word to new_words, as the classifier must have put
883  // the result back in the input.
884  new_words.push_back(*in_word);
885  *in_word = NULL;
886  }
887  if (debug) {
888  for (int i = 0; i < new_words.size(); ++i)
889  new_words[i]->DebugTopChoice("Lang result");
890  }
891  // Initial version is a bit of a hack based on better certainty and rating
892  // (to reduce false positives from cube) or a dictionary vs non-dictionary
893  // word.
894  return SelectBestWords(classify_max_rating_ratio,
896  debug, &new_words, best_words);
897 }
#define tprintf(...)
Definition: tprintf.h:31
double classify_max_rating_ratio
Definition: classify.h:402
double classify_max_certainty_margin
Definition: classify.h:404
STRING lang
Definition: ccutil.h:69
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 248 of file tesseractclass.h.

248  {
249  return right_to_left_;
250  }
void tesseract::Tesseract::run_cube_combiner ( PAGE_RES * page_res)

Definition at line 193 of file cube_control.cpp.

193  {
194  if (page_res == NULL || tess_cube_combiner_ == NULL)
195  return;
196  PAGE_RES_IT page_res_it(page_res);
197  // Iterate through the word results and call cube on each word.
198  for (page_res_it.restart_page(); page_res_it.word () != NULL;
199  page_res_it.forward()) {
200  BLOCK* block = page_res_it.block()->block;
201  if (block->poly_block() != NULL && !block->poly_block()->IsText())
202  continue; // Don't deal with non-text blocks.
203  WERD_RES* word = page_res_it.word();
204  // Skip cube entirely if tesseract's certainty is greater than threshold.
205  int combiner_run_thresh = convert_prob_to_tess_certainty(
206  cube_cntxt_->Params()->CombinerRunThresh());
207  if (word->best_choice->certainty() >= combiner_run_thresh) {
208  continue;
209  }
210  // Use the same language as Tesseract used for the word.
211  Tesseract* lang_tess = word->tesseract;
212 
213  // Setup a trial WERD_RES in which to classify with cube.
214  WERD_RES cube_word;
215  cube_word.InitForRetryRecognition(*word);
216  cube_word.SetupForRecognition(lang_tess->unicharset, this, BestPix(),
218  NULL, false, false, false,
219  page_res_it.row()->row,
220  page_res_it.block()->block);
221  CubeObject *cube_obj = lang_tess->cube_recognize_word(
222  page_res_it.block()->block, &cube_word);
223  if (cube_obj != NULL)
224  lang_tess->cube_combine_word(cube_obj, &cube_word, word);
225  delete cube_obj;
226  }
227 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
bool IsText() const
Definition: polyblk.h:52
float certainty() const
Definition: ratngs.h:327
Definition: ocrblock.h:30
tesseract::Tesseract * tesseract
Definition: pageres.h:266
TuningParams * Params() const
Pix * BestPix() const
WERD * word
Definition: pageres.h:175
double CombinerRunThresh() const
Definition: tuning_params.h:62
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define NULL
Definition: host.h:144
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
bool tesseract::Tesseract::RunOldFixXht ( WERD_RES word,
BLOCK block,
ROW row 
)
inT16 tesseract::Tesseract::safe_dict_word ( const WERD_RES * werd_res)

Definition at line 607 of file reject.cpp.

607  {
608  const WERD_CHOICE &word = *werd_res->best_choice;
609  int dict_word_type = werd_res->tesseract->dict_word(word);
610  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
611 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:124
tesseract::Tesseract * tesseract
Definition: pageres.h:266
Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 231 of file tesseractclass.h.

231  {
232  return scaled_color_;
233  }
int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 234 of file tesseractclass.h.

234  {
235  return scaled_factor_;
236  }
void tesseract::Tesseract::script_pos_pass ( PAGE_RES * page_res)

Definition at line 710 of file control.cpp.

710  {
711  PAGE_RES_IT page_res_it(page_res);
712  for (page_res_it.restart_page(); page_res_it.word() != NULL;
713  page_res_it.forward()) {
714  WERD_RES* word = page_res_it.word();
715  if (word->word->flag(W_REP_CHAR)) {
716  page_res_it.forward();
717  continue;
718  }
719  float x_height = page_res_it.block()->block->x_height();
720  float word_x_height = word->x_height;
721  if (word_x_height < word->best_choice->min_x_height() ||
722  word_x_height > word->best_choice->max_x_height()) {
723  word_x_height = (word->best_choice->min_x_height() +
724  word->best_choice->max_x_height()) / 2.0f;
725  }
726  // Test for small caps. Word capheight must be close to block xheight,
727  // and word must contain no lower case letters, and at least one upper case.
728  double small_cap_xheight = x_height * kXHeightCapRatio;
729  double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
730  if (word->uch_set->script_has_xheight() &&
731  small_cap_xheight - small_cap_delta <= word_x_height &&
732  word_x_height <= small_cap_xheight + small_cap_delta) {
733  // Scan for upper/lower.
734  int num_upper = 0;
735  int num_lower = 0;
736  for (int i = 0; i < word->best_choice->length(); ++i) {
737  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
738  ++num_upper;
739  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
740  ++num_lower;
741  }
742  if (num_upper > 0 && num_lower == 0)
743  word->small_caps = true;
744  }
745  word->SetScriptPositions();
746  }
747 }
void SetScriptPositions()
Definition: pageres.cpp:853
static const double kXHeightCapRatio
Definition: ccstruct.h:37
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool small_caps
Definition: pageres.h:283
float x_height
Definition: pageres.h:295
float min_x_height() const
Definition: ratngs.h:333
bool script_has_xheight() const
Definition: unicharset.h:849
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
WERD * word
Definition: pageres.h:175
float max_x_height() const
Definition: ratngs.h:336
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
#define NULL
Definition: host.h:144
void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).

Parameters
choices is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
choices_pos
choices_length
target_text
text_index
rating
segmentation
best_rating
best_segmentation

Definition at line 629 of file applybox.cpp.

635  {
637  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
638  // Rating of matching choice or worst choice if no match.
639  float choice_rating = 0.0f;
640  // Find the corresponding best BLOB_CHOICE.
641  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
642  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
643  choice_it.forward()) {
644  BLOB_CHOICE* choice = choice_it.data();
645  choice_rating = choice->rating();
646  UNICHAR_ID class_id = choice->unichar_id();
647  if (class_id == target_text[text_index]) {
648  break;
649  }
650  // Search ambigs table.
651  if (class_id < table.size() && table[class_id] != NULL) {
652  AmbigSpec_IT spec_it(table[class_id]);
653  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
654  spec_it.forward()) {
655  const AmbigSpec *ambig_spec = spec_it.data();
656  // We'll only do 1-1.
657  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
658  ambig_spec->correct_ngram_id == target_text[text_index])
659  break;
660  }
661  if (!spec_it.cycled_list())
662  break; // Found an ambig.
663  }
664  }
665  if (choice_it.cycled_list())
666  continue; // No match.
667  segmentation->push_back(length);
668  if (choices_pos + length == choices_length &&
669  text_index + 1 == target_text.size()) {
670  // This is a complete match. If the rating is good record a new best.
671  if (applybox_debug > 2) {
672  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
673  rating + choice_rating, *best_rating, segmentation->size(),
674  best_segmentation->size());
675  }
676  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
677  *best_segmentation = *segmentation;
678  *best_rating = rating + choice_rating;
679  }
680  } else if (choices_pos + length < choices_length &&
681  text_index + 1 < target_text.size()) {
682  if (applybox_debug > 3) {
683  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
684  target_text[text_index],
685  unicharset.id_to_unichar(target_text[text_index]),
686  choice_it.data()->unichar_id() == target_text[text_index]
687  ? "Match" : "Ambig",
688  choices_pos, length);
689  }
690  SearchForText(choices, choices_pos + length, choices_length, target_text,
691  text_index + 1, rating + choice_rating, segmentation,
692  best_rating, best_segmentation);
693  if (applybox_debug > 3) {
694  tprintf("End recursion for %d=%s\n", target_text[text_index],
695  unicharset.id_to_unichar(target_text[text_index]));
696  }
697  }
698  segmentation->truncate(segmentation->size() - 1);
699  }
700 }
int size() const
Definition: genericvector.h:72
void truncate(int size)
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:142
float rating() const
Definition: ratngs.h:79
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
Dict & getDict()
Definition: classify.h:65
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:629
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:153
int UNICHAR_ID
Definition: unichar.h:33
bool empty() const
Definition: genericvector.h:84
#define NULL
Definition: host.h:144
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int tesseract::Tesseract::SegmentPage ( const STRING *  input_file,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.

Definition at line 109 of file pagesegmain.cpp.

110  {
111  ASSERT_HOST(pix_binary_ != NULL);
112  int width = pixGetWidth(pix_binary_);
113  int height = pixGetHeight(pix_binary_);
114  // Get page segmentation mode.
115  PageSegMode pageseg_mode = static_cast<PageSegMode>(
116  static_cast<int>(tessedit_pageseg_mode));
117  // If a UNLV zone file can be found, use that instead of segmentation.
118  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
119  input_file != NULL && input_file->length() > 0) {
120  STRING name = *input_file;
121  const char* lastdot = strrchr(name.string(), '.');
122  if (lastdot != NULL)
123  name[lastdot - name.string()] = '\0';
124  read_unlv_file(name, width, height, blocks);
125  }
126  if (blocks->empty()) {
127  // No UNLV file present. Work according to the PageSegMode.
128  // First make a single block covering the whole image.
129  BLOCK_IT block_it(blocks);
130  BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
132  block_it.add_to_end(block);
133  } else {
134  // UNLV file present. Use PSM_SINGLE_BLOCK.
135  pageseg_mode = PSM_SINGLE_BLOCK;
136  }
137  // The diacritic_blobs holds noise blobs that may be diacritics. They
138  // are separated out on areas of the image that seem noisy and short-circuit
139  // the layout process, going straight from the initial partition creation
140  // right through to after word segmentation, where they are added to the
141  // rej_cblobs list of the most appropriate word. From there classification
142  // will determine whether they are used.
143  BLOBNBOX_LIST diacritic_blobs;
144  int auto_page_seg_ret_val = 0;
145  TO_BLOCK_LIST to_blocks;
146  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
147  PSM_SPARSE(pageseg_mode)) {
148  auto_page_seg_ret_val = AutoPageSeg(
149  pageseg_mode, blocks, &to_blocks,
150  enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
151  if (pageseg_mode == PSM_OSD_ONLY)
152  return auto_page_seg_ret_val;
153  // To create blobs from the image region bounds uncomment this line:
154  // to_blocks.clear(); // Uncomment to go back to the old mode.
155  } else {
156  deskew_ = FCOORD(1.0f, 0.0f);
157  reskew_ = FCOORD(1.0f, 0.0f);
158  if (pageseg_mode == PSM_CIRCLE_WORD) {
159  Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
160  if (pixcleaned != NULL) {
161  pixDestroy(&pix_binary_);
162  pix_binary_ = pixcleaned;
163  }
164  }
165  }
166 
167  if (auto_page_seg_ret_val < 0) {
168  return -1;
169  }
170 
171  if (blocks->empty()) {
173  tprintf("Empty page\n");
174  return 0; // AutoPageSeg found an empty page.
175  }
176  bool splitting =
178  bool cjk_mode = textord_use_cjk_fp_model;
179 
180  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
181  pix_thresholds_, pix_grey_, splitting || cjk_mode,
182  &diacritic_blobs, blocks, &to_blocks);
183  return auto_page_seg_ret_val;
184 }
Treat the image as a single word in a circle.
Definition: publictypes.h:163
bool right_to_left() const
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:188
#define tprintf(...)
Definition: tprintf.h:31
inT32 length() const
Definition: strngs.cpp:188
#define ASSERT_HOST(x)
Definition: errcode.h:84
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:268
int textord_debug_tabfind
Definition: alignedblob.cpp:27
name_table name
Definition: ocrblock.h:30
Orientation and script detection only.
Definition: publictypes.h:152
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
#define TRUE
Definition: capi.h:28
Definition: strngs.h:44
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:185
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
#define NULL
Definition: host.h:144
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:179
const char * string() const
Definition: strngs.cpp:193
bool read_unlv_file(STRING name, inT32 xsize, inT32 ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:36
Definition: points.h:189
void set_right_to_left(bool value)
Definition: ocrblock.h:86
bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
const GenericVector< C_OUTLINE * > &  outlines,
int  num_outlines,
GenericVector< bool > *  ok_outlines 
)

Definition at line 1105 of file control.cpp.

1108  {
1109  STRING best_str;
1110  float target_cert = certainty_threshold;
1111  if (blob != NULL) {
1112  float target_c2;
1113  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1114  if (debug_noise_removal) {
1115  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1116  target_cert, target_c2);
1117  blob->bounding_box().print();
1118  }
1119  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1120  }
1121  GenericVector<bool> test_outlines = *ok_outlines;
1122  // Start with all the outlines in.
1123  STRING all_str;
1124  GenericVector<bool> best_outlines = *ok_outlines;
1125  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1126  pr_it, blob, &all_str);
1127  if (debug_noise_removal) {
1128  TBOX ol_box;
1129  for (int i = 0; i < test_outlines.size(); ++i) {
1130  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1131  }
1132  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1133  all_str.string(), best_cert, best_cert - target_cert);
1134  ol_box.print();
1135  }
1136  // Iteratively zero out the bit that improves the certainty the most, until
1137  // we get past the threshold, have zero bits, or fail to improve.
1138  int best_index = 0; // To zero out.
1139  while (num_outlines > 1 && best_index >= 0 &&
1140  (blob == NULL || best_cert < target_cert || blob != NULL)) {
1141  // Find the best bit to zero out.
1142  best_index = -1;
1143  for (int i = 0; i < outlines.size(); ++i) {
1144  if (test_outlines[i]) {
1145  test_outlines[i] = false;
1146  STRING str;
1147  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1148  pr_it, blob, &str);
1149  if (debug_noise_removal) {
1150  TBOX ol_box;
1151  for (int j = 0; j < outlines.size(); ++j) {
1152  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1153  tprintf("%d", test_outlines[j]);
1154  }
1155  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1156  cert, cert - target_cert);
1157  ol_box.print();
1158  }
1159  if (cert > best_cert) {
1160  best_cert = cert;
1161  best_index = i;
1162  best_outlines = test_outlines;
1163  }
1164  test_outlines[i] = true;
1165  }
1166  }
1167  if (best_index >= 0) {
1168  test_outlines[best_index] = false;
1169  --num_outlines;
1170  }
1171  }
1172  if (best_cert >= target_cert) {
1173  // Save the best combination.
1174  *ok_outlines = best_outlines;
1175  if (debug_noise_removal) {
1176  tprintf("%s noise combination ", blob ? "Adding" : "New");
1177  for (int i = 0; i < best_outlines.size(); ++i) {
1178  tprintf("%d", best_outlines[i]);
1179  }
1180  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1181  target_cert);
1182  }
1183  return true;
1184  }
1185  return false;
1186 }
int size() const
Definition: genericvector.h:72
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1190
#define tprintf(...)
Definition: tprintf.h:31
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1232
void print() const
Definition: rect.h:270
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::set_done ( WERD_RES word,
inT16  pass 
)
void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 201 of file tesseractclass.h.

201  {
202  pixDestroy(&pix_grey_);
203  pix_grey_ = grey_pix;
204  }
void tesseract::Tesseract::set_pix_thresholds ( Pix *  thresholds)
inline

Definition at line 215 of file tesseractclass.h.

215  {
216  pixDestroy(&pix_thresholds_);
217  pix_thresholds_ = thresholds;
218  }
void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 222 of file tesseractclass.h.

222  {
223  source_resolution_ = ppi;
224  }
void tesseract::Tesseract::set_unlv_suspects ( WERD_RES *  word)

Definition at line 307 of file output.cpp.

307  {
308  int len = word_res->reject_map.length();
309  const WERD_CHOICE &word = *(word_res->best_choice);
310  const UNICHARSET &uchset = *word.unicharset();
311  int i;
312  float rating_per_ch;
313 
314  if (suspect_level == 0) {
315  for (i = 0; i < len; i++) {
316  if (word_res->reject_map[i].rejected())
317  word_res->reject_map[i].setrej_minimal_rej_accept();
318  }
319  return;
320  }
321 
322  if (suspect_level >= 3)
323  return; //Use defaults
324 
325  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
326 
327  if (safe_dict_word(word_res) &&
328  (count_alphas(word) > suspect_short_words)) {
329  /* Unreject alphas in dictionary words */
330  for (i = 0; i < len; ++i) {
331  if (word_res->reject_map[i].rejected() &&
332  uchset.get_isalpha(word.unichar_id(i)))
333  word_res->reject_map[i].setrej_minimal_rej_accept();
334  }
335  }
336 
337  rating_per_ch = word.rating() / word_res->reject_map.length();
338 
339  if (rating_per_ch >= suspect_rating_per_ch)
340  return; //Dont touch bad ratings
341 
342  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
343  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
344  for (i = 0; i < len; ++i) {
345  if (word_res->reject_map[i].rejected() &&
346  (!uchset.eq(word.unichar_id(i), " ")))
347  word_res->reject_map[i].setrej_minimal_rej_accept();
348  }
349  }
350 
351  for (i = 0; i < len; i++) {
352  if (word_res->reject_map[i].rejected()) {
353  if (word_res->reject_map[i].flag(R_DOC_REJ))
354  word_res->reject_map[i].setrej_minimal_rej_accept();
355  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
356  word_res->reject_map[i].setrej_minimal_rej_accept();
357  if (word_res->reject_map[i].flag(R_ROW_REJ))
358  word_res->reject_map[i].setrej_minimal_rej_accept();
359  }
360  }
361 
362  if (suspect_level == 2)
363  return;
364 
365  if (!suspect_constrain_1Il ||
366  (word_res->reject_map.length() <= suspect_short_words)) {
367  for (i = 0; i < len; i++) {
368  if (word_res->reject_map[i].rejected()) {
369  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
370  word_res->reject_map[i].flag(R_POSTNN_1IL)))
371  word_res->reject_map[i].setrej_minimal_rej_accept();
372 
373  if (!suspect_constrain_1Il &&
374  word_res->reject_map[i].flag(R_MM_REJECT))
375  word_res->reject_map[i].setrej_minimal_rej_accept();
376  }
377  }
378  }
379 
380  if (acceptable_word_string(*word_res->uch_set,
381  word.unichar_string().string(),
382  word.unichar_lengths().string()) !=
383  AC_UNACCEPTABLE ||
385  word.unichar_lengths().string())) {
386  if (word_res->reject_map.length() > suspect_short_words) {
387  for (i = 0; i < len; i++) {
388  if (word_res->reject_map[i].rejected() &&
389  (!word_res->reject_map[i].perm_rejected() ||
390  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
391  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
392  word_res->reject_map[i].flag (R_MM_REJECT))) {
393  word_res->reject_map[i].setrej_minimal_rej_accept();
394  }
395  }
396  }
397  }
398 }
inT16 count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:400
float rating() const
Definition: ratngs.h:324
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
const STRING & unichar_lengths() const
Definition: ratngs.h:531
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
const STRING & unichar_string() const
Definition: ratngs.h:524
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
BOOL8 acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:421
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
Unacceptable word.
Definition: control.h:36
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::set_word_fonts ( WERD_RES *  word)

set_word_fonts

Get the fonts for the word.

Definition at line 1880 of file control.cpp.

1880  {
1881  // Don't try to set the word fonts for a cube word, as the configs
1882  // will be meaningless.
1883  if (word->chopped_word == NULL) return;
1884  ASSERT_HOST(word->best_choice != NULL);
1885 
1886  int fontinfo_size = get_fontinfo_table().size();
1887  if (fontinfo_size == 0) return;
1888  GenericVector<int> font_total_score;
1889  font_total_score.init_to_size(fontinfo_size, 0);
1890 
1891  word->italic = 0;
1892  word->bold = 0;
1893  // Compute the font scores for the word
1894  if (tessedit_debug_fonts) {
1895  tprintf("Examining fonts in %s\n",
1896  word->best_choice->debug_string().string());
1897  }
1898  for (int b = 0; b < word->best_choice->length(); ++b) {
1899  BLOB_CHOICE* choice = word->GetBlobChoice(b);
1900  if (choice == NULL) continue;
1901  const GenericVector<ScoredFont>& fonts = choice->fonts();
1902  for (int f = 0; f < fonts.size(); ++f) {
1903  int fontinfo_id = fonts[f].fontinfo_id;
1904  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1905  font_total_score[fontinfo_id] += fonts[f].score;
1906  }
1907  }
1908  }
1909  // Find the top and 2nd choice for the word.
1910  int score1 = 0, score2 = 0;
1911  inT16 font_id1 = -1, font_id2 = -1;
1912  for (int f = 0; f < fontinfo_size; ++f) {
1913  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1914  tprintf("Font %s, total score = %d\n",
1915  fontinfo_table_.get(f).name, font_total_score[f]);
1916  }
1917  if (font_total_score[f] > score1) {
1918  score2 = score1;
1919  font_id2 = font_id1;
1920  score1 = font_total_score[f];
1921  font_id1 = f;
1922  } else if (font_total_score[f] > score2) {
1923  score2 = font_total_score[f];
1924  font_id2 = f;
1925  }
1926  }
1927  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
1928  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
1929  // Each score has a limit of MAX_UINT16, so divide by that to get the number
1930  // of "votes" for that font, ie number of perfect scores.
1931  word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
1932  word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
1933  if (score1 > 0) {
1934  FontInfo fi = fontinfo_table_.get(font_id1);
1935  if (tessedit_debug_fonts) {
1936  if (word->fontinfo_id2_count > 0) {
1937  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1938  fi.name, word->fontinfo_id_count,
1939  fontinfo_table_.get(font_id2).name,
1940  word->fontinfo_id2_count);
1941  } else {
1942  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
1943  fi.name, word->fontinfo_id_count);
1944  }
1945  }
1946  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
1947  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
1948  }
1949 }
bool is_bold() const
Definition: fontinfo.h:112
int size() const
Definition: genericvector.h:72
#define MAX_UINT16
Definition: host.h:122
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * chopped_word
Definition: pageres.h:201
inT8 bold
Definition: pageres.h:286
#define tprintf(...)
Definition: tprintf.h:31
const FontInfo * fontinfo
Definition: pageres.h:288
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:488
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT8 fontinfo_id_count
Definition: pageres.h:290
void init_to_size(int size, T t)
bool is_italic() const
Definition: fontinfo.h:111
const STRING debug_string() const
Definition: ratngs.h:502
const FontInfo * fontinfo2
Definition: pageres.h:289
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
inT8 fontinfo_id2_count
Definition: pageres.h:291
#define MAX_INT8
Definition: host.h:118
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91
#define NULL
Definition: host.h:144
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
inT8 italic
Definition: pageres.h:285
const char * string() const
Definition: strngs.cpp:193
short inT16
Definition: host.h:100
void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 675 of file tesseractclass.cpp.

675  {
676  // Set the white and blacklists (if any)
678  tessedit_char_whitelist.string(),
679  tessedit_char_unblacklist.string());
680  // Black and white lists should apply to all loaded classifiers.
681  for (int i = 0; i < sub_langs_.size(); ++i) {
682  sub_langs_[i]->unicharset.set_black_and_whitelist(
684  tessedit_char_unblacklist.string());
685  }
686 }
UNICHARSET unicharset
Definition: ccutil.h:72
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:948
void tesseract::Tesseract::SetEquationDetect ( EquationDetect *  detector)

Definition at line 654 of file tesseractclass.cpp.

654  {
655  equ_detect_ = detector;
656  equ_detect_->SetLangTesseract(this);
657 }
void SetLangTesseract(Tesseract *lang_tesseract)
void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 237 of file tesseractclass.h.

237  {
238  scaled_factor_ = factor;
239  scaled_color_ = color;
240  }
void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX *  target_word_box,
const char *  word_config,
PAGE_RES *  page_res,
GenericVector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 148 of file control.cpp.

152  {
153  // Prepare all the words.
154  PAGE_RES_IT page_res_it(page_res);
155  for (page_res_it.restart_page(); page_res_it.word() != NULL;
156  page_res_it.forward()) {
157  if (target_word_box == NULL ||
158  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
159  *target_word_box, word_config, 1)) {
160  words->push_back(WordData(page_res_it));
161  }
162  }
163  // Setup all the words for recognition with polygonal approximation.
164  for (int w = 0; w < words->size(); ++w) {
165  SetupWordPassN(pass_n, &(*words)[w]);
166  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
167  }
168 }
int size() const
Definition: genericvector.h:72
int push_back(T object)
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
#define NULL
Definition: host.h:144
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 217 of file applybox.cpp.

218  {
219  PreenXHeights(block_list);
220  // Strip all fuzzy space markers to simplify the PAGE_RES.
221  BLOCK_IT b_it(block_list);
222  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
223  BLOCK* block = b_it.data();
224  ROW_IT r_it(block->row_list());
225  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
226  ROW* row = r_it.data();
227  WERD_IT w_it(row->word_list());
228  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
229  WERD* word = w_it.data();
230  if (word->cblob_list()->empty()) {
231  delete w_it.extract();
232  } else {
233  word->set_flag(W_FUZZY_SP, false);
234  word->set_flag(W_FUZZY_NON, false);
235  }
236  }
237  }
238  }
239  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
240  PAGE_RES_IT pr_it(page_res);
241  WERD_RES* word_res;
242  while ((word_res = pr_it.word()) != NULL) {
243  MaximallyChopWord(boxes, pr_it.block()->block,
244  pr_it.row()->row, word_res);
245  pr_it.forward();
246  }
247  return page_res;
248 }
Definition: ocrrow.h:32
Definition: ocrblock.h:30
Definition: werd.h:60
WERD * word
Definition: pageres.h:175
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:253
#define NULL
Definition: host.h:144
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:193
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the Leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder, which may be NULL, must be deleted after use.

Definition at line 309 of file pagesegmain.cpp.

312  {
313  int vertical_x = 0;
314  int vertical_y = 1;
315  TabVector_LIST v_lines;
316  TabVector_LIST h_lines;
317  ICOORD bleft(0, 0);
318 
319  ASSERT_HOST(pix_binary_ != NULL);
321  pixWrite("tessinput.png", pix_binary_, IFF_PNG);
322  }
323  // Leptonica is used to find the rule/separator lines in the input.
324  LineFinder::FindAndRemoveLines(source_resolution_,
325  textord_tabfind_show_vlines, pix_binary_,
326  &vertical_x, &vertical_y, music_mask_pix,
327  &v_lines, &h_lines);
329  pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
330  // Leptonica is used to find a mask of the photo regions in the input.
331  *photo_mask_pix = ImageFind::FindImages(pix_binary_);
333  pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
334  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
335 
336  // The rest of the algorithm uses the usual connected components.
337  textord_.find_components(pix_binary_, blocks, to_blocks);
338 
339  TO_BLOCK_IT to_block_it(to_blocks);
340  // There must be exactly one input block.
341  // TODO(rays) handle new textline finding with a UNLV zone file.
342  ASSERT_HOST(to_blocks->singleton());
343  TO_BLOCK* to_block = to_block_it.data();
344  TBOX blkbox = to_block->block->bounding_box();
345  ColumnFinder* finder = NULL;
346 
347  if (to_block->line_size >= 2) {
348  finder = new ColumnFinder(static_cast<int>(to_block->line_size),
349  blkbox.botleft(), blkbox.topright(),
350  source_resolution_, textord_use_cjk_fp_model,
352  &v_lines, &h_lines, vertical_x, vertical_y);
353 
354  finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
355 
356  if (equ_detect_) {
357  equ_detect_->LabelSpecialText(to_block);
358  }
359 
360  BLOBNBOX_CLIST osd_blobs;
361  // osd_orientation is the number of 90 degree rotations to make the
362  // characters upright. (See osdetect.h for precise definition.)
363  // We want the text lines horizontal, (vertical text indicates vertical
364  // textlines) which may conflict (eg vertically written CJK).
365  int osd_orientation = 0;
366  bool vertical_text = textord_tabfind_force_vertical_text ||
367  pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
368  if (!vertical_text && textord_tabfind_vertical_text &&
369  PSM_ORIENTATION_ENABLED(pageseg_mode)) {
370  vertical_text =
371  finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
372  to_block, &osd_blobs);
373  }
374  if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
375  GenericVector<int> osd_scripts;
376  if (osd_tess != this) {
377  // We are running osd as part of layout analysis, so constrain the
378  // scripts to those allowed by *this.
379  AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
380  for (int s = 0; s < sub_langs_.size(); ++s) {
381  AddAllScriptsConverted(sub_langs_[s]->unicharset,
382  osd_tess->unicharset, &osd_scripts);
383  }
384  }
385  os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
386  if (pageseg_mode == PSM_OSD_ONLY) {
387  delete finder;
388  return NULL;
389  }
390  osd_orientation = osr->best_result.orientation_id;
391  double osd_score = osr->orientations[osd_orientation];
392  double osd_margin = min_orientation_margin * 2;
393  for (int i = 0; i < 4; ++i) {
394  if (i != osd_orientation &&
395  osd_score - osr->orientations[i] < osd_margin) {
396  osd_margin = osd_score - osr->orientations[i];
397  }
398  }
399  int best_script_id = osr->best_result.script_id;
400  const char* best_script_str =
401  osd_tess->unicharset.get_script_from_script_id(best_script_id);
402  bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
403  best_script_id == osd_tess->unicharset.hiragana_sid() ||
404  best_script_id == osd_tess->unicharset.katakana_sid() ||
405  strcmp("Japanese", best_script_str) == 0 ||
406  strcmp("Korean", best_script_str) == 0 ||
407  strcmp("Hangul", best_script_str) == 0;
408  if (cjk) {
409  finder->set_cjk_script(true);
410  }
411  if (osd_margin < min_orientation_margin) {
412  // The margin is weak.
413  if (!cjk && !vertical_text && osd_orientation == 2) {
414  // upside down latin text is improbable with such a weak margin.
415  tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
416  "Don't rotate.\n", osd_margin);
417  osd_orientation = 0;
418  } else {
419  tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
420  "but using orientation anyway: %d\n",
421  osd_blobs.length(), osd_margin, osd_orientation);
422  }
423  }
424  }
425  osd_blobs.shallow_clear();
426  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
427  }
428 
429  return finder;
430 }
const ICOORD & botleft() const
Definition: rect.h:88
static Pix * FindImages(Pix *pix)
Definition: imagefind.cpp:65
int script_id
Definition: osdetect.h:42
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
static void FindAndRemoveLines(int resolution, bool debug, Pix *pix, int *vertical_x, int *vertical_y, Pix **pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:243
float orientations[4]
Definition: osdetect.h:74
#define ASSERT_HOST(x)
Definition: errcode.h:84
int orientation_id
Definition: osdetect.h:41
double textord_tabfind_vertical_text_ratio
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:182
Orientation and script detection only.
Definition: publictypes.h:152
int LabelSpecialText(TO_BLOCK *to_block)
integer coordinate
Definition: points.h:30
Definition: rect.h:30
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:274
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:208
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:185
#define NULL
Definition: host.h:144
const ICOORD & topright() const
Definition: rect.h:100
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:179
bool textord_tabfind_force_vertical_text
double textord_tabfind_aligned_gap_fraction
OSBestResult best_result
Definition: osdetect.h:79
void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 439 of file tessedit.cpp.

439  {
440  // Note that we can get away with bitwise copying FontInfo in
441  // all_fonts, as it is a temporary structure and we avoid setting the
442  // delete callback.
443  UnicityTable<FontInfo> all_fonts;
445 
446  // Create the universal ID table.
447  CollectFonts(get_fontinfo_table(), &all_fonts);
448  for (int i = 0; i < sub_langs_.size(); ++i) {
449  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
450  }
451  // Assign ids from the table to each font table.
452  AssignIds(all_fonts, &get_fontinfo_table());
453  for (int i = 0; i < sub_langs_.size(); ++i) {
454  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
455  }
456  font_table_size_ = all_fonts.size();
457 }
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
int size() const
Return the size used.
void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData *  word 
)

Definition at line 171 of file control.cpp.

171  {
172  if (pass_n == 1 || !word->word->done) {
173  if (pass_n == 1) {
174  word->word->SetupForRecognition(unicharset, this, BestPix(),
179  word->row, word->block);
180  } else if (pass_n == 2) {
181  // TODO(rays) Should we do this on pass1 too?
182  word->word->caps_height = 0.0;
183  if (word->word->x_height == 0.0f)
184  word->word->x_height = word->row->x_height();
185  }
186  word->lang_words.truncate(0);
187  for (int s = 0; s <= sub_langs_.size(); ++s) {
188  // The sub_langs_.size() entry is for the master language.
189  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
190  WERD_RES* word_res = new WERD_RES;
191  word_res->InitForRetryRecognition(*word->word);
192  word->lang_words.push_back(word_res);
193  // Cube doesn't get setup for pass2.
194  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
195  word_res->SetupForRecognition(
196  lang_t->unicharset, lang_t, BestPix(),
197  lang_t->tessedit_ocr_engine_mode, NULL,
198  lang_t->classify_bln_numeric_mode,
199  lang_t->textord_use_cjk_fp_model,
200  lang_t->poly_allow_detailed_fx, word->row, word->block);
201  }
202  }
203  }
204 }
bool classify_bln_numeric_mode
Definition: classify.h:500
UNICHARSET unicharset
Definition: ccutil.h:72
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
Pix * BestPix() const
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define NULL
Definition: host.h:144
void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)
int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 219 of file tesseractclass.h.

219  {
220  return source_resolution_;
221  }
void tesseract::Tesseract::split_and_recog_word ( WERD_RES *  word)

Definition at line 144 of file tfacepp.cpp.

144  {
145  // Find the biggest blob gap in the chopped_word.
146  int bestgap = -MAX_INT32;
147  int split_index = 0;
148  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
149  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
150  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
151  int gap = blob_box.left() - prev_box.right();
152  if (gap > bestgap) {
153  bestgap = gap;
154  split_index = b;
155  }
156  }
157  ASSERT_HOST(split_index > 0);
158 
159  WERD_RES *word2 = NULL;
160  BlamerBundle *orig_bb = NULL;
161  split_word(word, split_index, &word2, &orig_bb);
162 
163  // Recognize the first part of the word.
164  recog_word_recursive(word);
165  // Recognize the second part of the word.
166  recog_word_recursive(word2);
167 
168  join_words(word, word2, orig_bb);
169 }
TWERD * chopped_word
Definition: pageres.h:201
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:182
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
int NumBlobs() const
Definition: blobs.h:425
inT16 left() const
Definition: rect.h:68
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:240
#define MAX_INT32
Definition: host.h:120
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: rect.h:30
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
#define NULL
Definition: host.h:144
void tesseract::Tesseract::split_word ( WERD_RES *  word,
int  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 182 of file tfacepp.cpp.

185  {
186  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
187 
188  // Save a copy of the blamer bundle so we can try to reconstruct it below.
189  BlamerBundle *orig_bb =
190  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
191 
192  WERD_RES *word2 = new WERD_RES(*word);
193 
194  // blow away the copied chopped_word, as we want to work with
195  // the blobs from the input chopped_word so seam_arrays can be merged.
196  TWERD *chopped = word->chopped_word;
197  TWERD *chopped2 = new TWERD;
198  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
199  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
200  chopped2->blobs.push_back(chopped->blobs[i]);
201  }
202  chopped->blobs.truncate(split_pt);
203  word->chopped_word = NULL;
204  delete word2->chopped_word;
205  word2->chopped_word = NULL;
206 
207  const UNICHARSET &unicharset = *word->uch_set;
208  word->ClearResults();
209  word2->ClearResults();
210  word->chopped_word = chopped;
211  word2->chopped_word = chopped2;
212  word->SetupBasicsFromChoppedWord(unicharset);
213  word2->SetupBasicsFromChoppedWord(unicharset);
214 
215  // Try to adjust the blamer bundle.
216  if (orig_bb != NULL) {
217  // TODO(rays) Looks like a leak to me.
218  // orig_bb should take, rather than copy.
219  word->blamer_bundle = new BlamerBundle();
220  word2->blamer_bundle = new BlamerBundle();
221  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
222  word2->chopped_word->blobs[0]->bounding_box().left(),
224  word->blamer_bundle, word2->blamer_bundle);
225  }
226 
227  *right_piece = word2;
228  *orig_blamer_bundle = orig_bb;
229 }
void truncate(int size)
void ClearResults()
Definition: pageres.cpp:1140
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:201
T & back() const
UNICHARSET unicharset
Definition: ccutil.h:72
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:334
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool wordrec_debug_blamer
Definition: wordrec.h:167
int NumBlobs() const
Definition: blobs.h:425
const UNICHARSET * uch_set
Definition: pageres.h:192
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void reserve(int size)
#define NULL
Definition: host.h:144
Definition: blobs.h:395
TBOX bounding_box() const
Definition: blobs.cpp:482
BlamerBundle * blamer_bundle
Definition: pageres.h:230
bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES *  word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 101 of file superscript.cpp.

101  {
102  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
103  !word->best_choice) {
104  return false;
105  }
106  int num_leading, num_trailing;
107  ScriptPos sp_leading, sp_trailing;
108  float leading_certainty, trailing_certainty;
109  float avg_certainty, unlikely_threshold;
110 
111  // Calculate the number of whole suspicious characters at the edges.
113  word, &num_leading, &sp_leading, &leading_certainty,
114  &num_trailing, &sp_trailing, &trailing_certainty,
115  &avg_certainty, &unlikely_threshold);
116 
117  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
118  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
119 
120  int num_blobs = word->best_choice->length();
121 
122  // Calculate the remainder (partial characters) at the edges.
123  // This accounts for us having classified the best version of
124  // a word as [speaker?'] when it was instead [speaker.^{21}]
125  // (that is we accidentally thought the 2 was attached to the period).
126  int num_remainder_leading = 0, num_remainder_trailing = 0;
127  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
128  int super_y_bottom =
130  int sub_y_top =
132  int last_word_char = num_blobs - 1 - num_trailing;
133  float last_char_certainty = word->best_choice->certainty(last_word_char);
134  if (word->best_choice->unichar_id(last_word_char) != 0 &&
135  last_char_certainty <= unlikely_threshold) {
136  ScriptPos rpos;
137  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
138  NULL, NULL, &rpos, &num_remainder_trailing);
139  if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
140  if (num_remainder_trailing > 0 &&
141  last_char_certainty < trailing_certainty) {
142  trailing_certainty = last_char_certainty;
143  }
144  }
145  bool another_blob_available = (num_remainder_trailing == 0) ||
146  num_leading + num_trailing + 1 < num_blobs;
147  int first_char_certainty = word->best_choice->certainty(num_leading);
148  if (another_blob_available &&
149  word->best_choice->unichar_id(num_leading) != 0 &&
150  first_char_certainty <= unlikely_threshold) {
151  ScriptPos lpos;
152  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
153  &lpos, &num_remainder_leading, NULL, NULL);
154  if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
155  if (num_remainder_leading > 0 &&
156  first_char_certainty < leading_certainty) {
157  leading_certainty = first_char_certainty;
158  }
159  }
160  }
161 
162  // If nothing to do, bail now.
163  if (num_leading + num_trailing +
164  num_remainder_leading + num_remainder_trailing == 0) {
165  return false;
166  }
167 
168  if (superscript_debug >= 1) {
169  tprintf("Candidate for superscript detection: %s (",
170  word->best_choice->unichar_string().string());
171  if (num_leading || num_remainder_leading) {
172  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
173  leading_pos);
174  }
175  if (num_trailing || num_remainder_trailing) {
176  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
177  trailing_pos);
178  }
179  tprintf(")\n");
180  }
181  if (superscript_debug >= 3) {
182  word->best_choice->print();
183  }
184  if (superscript_debug >= 2) {
185  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
186  avg_certainty, unlikely_threshold);
187  if (num_leading)
188  tprintf("Orig. leading (min): %.2f ", leading_certainty);
189  if (num_trailing)
190  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
191  tprintf("\n");
192  }
193 
194  // We've now calculated the number of rebuilt blobs we want to carve off.
195  // However, split_word() works from TBLOBs in chopped_word, so we need to
196  // convert to those.
197  int num_chopped_leading =
198  LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
199  int num_chopped_trailing =
200  TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
201 
202  int retry_leading = 0;
203  int retry_trailing = 0;
204  bool is_good = false;
205  WERD_RES *revised = TrySuperscriptSplits(
206  num_chopped_leading, leading_certainty, sp_leading,
207  num_chopped_trailing, trailing_certainty, sp_trailing,
208  word, &is_good, &retry_leading, &retry_trailing);
209  if (is_good) {
210  word->ConsumeWordResults(revised);
211  } else if (retry_leading || retry_trailing) {
212  int retry_chopped_leading =
213  LeadingUnicharsToChopped(revised, retry_leading);
214  int retry_chopped_trailing =
215  TrailingUnicharsToChopped(revised, retry_trailing);
216  WERD_RES *revised2 = TrySuperscriptSplits(
217  retry_chopped_leading, leading_certainty, sp_leading,
218  retry_chopped_trailing, trailing_certainty, sp_trailing,
219  revised, &is_good, &retry_leading, &retry_trailing);
220  if (is_good) {
221  word->ConsumeWordResults(revised2);
222  }
223  delete revised2;
224  }
225  delete revised;
226  return is_good;
227 }
const int kBlnXHeight
Definition: normalis.h:28
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
const STRING & unichar_string() const
Definition: ratngs.h:524
void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, ScriptPos *trailing_pos, int *num_trailing_outliers)
Definition: superscript.cpp:46
float certainty() const
Definition: ratngs.h:327
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const int kBlnBaselineOffset
Definition: normalis.h:29
WERD * word
Definition: pageres.h:175
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
BOOL8 tess_failed
Definition: pageres.h:272
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void print() const
Definition: ratngs.h:563
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
BOOL8 tesseract::Tesseract::terrible_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 507 of file docqual.cpp.

508  {
509  float rating_per_ch;
510  int adjusted_len;
511  int crunch_mode = 0;
512 
513  if ((word->best_choice->unichar_string().length () == 0) ||
514  (strspn (word->best_choice->unichar_string().string(), " ") ==
515  word->best_choice->unichar_string().length ()))
516  crunch_mode = 1;
517  else {
518  adjusted_len = word->reject_map.length ();
519  if (adjusted_len > crunch_rating_max)
520  adjusted_len = crunch_rating_max;
521  rating_per_ch = word->best_choice->rating () / adjusted_len;
522 
523  if (rating_per_ch > crunch_terrible_rating)
524  crunch_mode = 2;
525  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
526  crunch_mode = 3;
527  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
528  (garbage_level != G_OK))
529  crunch_mode = 4;
530  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
531  (garbage_level != G_OK))
532  crunch_mode = 5;
533  }
534  if (crunch_mode > 0) {
535  if (crunch_debug > 2) {
536  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
537  crunch_mode, word->best_choice->unichar_string().string());
538  }
539  return TRUE;
540  }
541  else
542  return FALSE;
543 }
float rating() const
Definition: ratngs.h:324
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
Definition: docqual.h:28
inT32 length() const
Definition: strngs.cpp:188
const STRING & unichar_string() const
Definition: ratngs.h:524
float certainty() const
Definition: ratngs.h:327
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
const char * string() const
Definition: strngs.cpp:193
bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES *  word)

Definition at line 69 of file tessbox.cpp.

69  {
70  return getDict().AcceptableResult(word);
71 }
Dict & getDict()
Definition: classify.h:65
bool AcceptableResult(WERD_RES *word)
Definition: stopper.cpp:111
void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE *  word_choice)

Definition at line 79 of file tessbox.cpp.

79  {
80  getDict().add_document_word(*word_choice);
81 }
Dict & getDict()
Definition: classify.h:65
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:567
void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES *  word 
)

Definition at line 39 of file tessbox.cpp.

39  {
40  int saved_enable_assoc = 0;
41  int saved_chop_enable = 0;
42 
43  if (word->word->flag(W_DONT_CHOP)) {
44  saved_enable_assoc = wordrec_enable_assoc;
45  saved_chop_enable = chop_enable;
46  wordrec_enable_assoc.set_value(0);
47  chop_enable.set_value(0);
48  }
49  if (pass_n == 1)
50  set_pass1();
51  else
52  set_pass2();
53  recog_word(word);
54  if (word->best_choice == NULL)
55  word->SetupFake(*word->uch_set);
56  if (word->word->flag(W_DONT_CHOP)) {
57  wordrec_enable_assoc.set_value(saved_enable_assoc);
58  chop_enable.set_value(saved_chop_enable);
59  }
60 }
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:343
void set_pass1()
Definition: tface.cpp:85
WERD_CHOICE * best_choice
Definition: pageres.h:219
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:46
bool wordrec_enable_assoc
Definition: wordrec.h:130
const UNICHARSET * uch_set
Definition: pageres.h:192
WERD * word
Definition: pageres.h:175
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
#define NULL
Definition: host.h:144
void set_pass2()
Definition: tface.cpp:97
bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)

Definition at line 1437 of file control.cpp.

1439  {
1440  bool accept_new_x_ht = false;
1441  WERD_RES new_x_ht_word(word->word);
1442  if (word->blamer_bundle != NULL) {
1443  new_x_ht_word.blamer_bundle = new BlamerBundle();
1444  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1445  }
1446  new_x_ht_word.x_height = new_x_ht;
1447  new_x_ht_word.baseline_shift = baseline_shift;
1448  new_x_ht_word.caps_height = 0.0;
1449  new_x_ht_word.SetupForRecognition(
1452  poly_allow_detailed_fx, row, block);
1453  match_word_pass_n(2, &new_x_ht_word, row, block);
1454  if (!new_x_ht_word.tess_failed) {
1455  int new_misfits = CountMisfitTops(&new_x_ht_word);
1456  if (debug_x_ht_level >= 1) {
1457  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1458  original_misfits, word->x_height,
1459  new_misfits, new_x_ht);
1460  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1461  word->best_choice->rating(), word->best_choice->certainty(),
1462  new_x_ht_word.best_choice->rating(),
1463  new_x_ht_word.best_choice->certainty());
1464  }
1465  // The misfits must improve and either the rating or certainty.
1466  accept_new_x_ht = new_misfits < original_misfits &&
1467  (new_x_ht_word.best_choice->certainty() >
1468  word->best_choice->certainty() ||
1469  new_x_ht_word.best_choice->rating() <
1470  word->best_choice->rating());
1471  if (debug_x_ht_level >= 1) {
1472  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1473  }
1474  }
1475  if (accept_new_x_ht) {
1476  word->ConsumeWordResults(&new_x_ht_word);
1477  return true;
1478  }
1479  return false;
1480 }
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1549
bool classify_bln_numeric_mode
Definition: classify.h:500
float rating() const
Definition: ratngs.h:324
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1381
float x_height
Definition: pageres.h:295
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
float certainty() const
Definition: ratngs.h:327
Pix * BestPix() const
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
BlamerBundle * blamer_bundle
Definition: pageres.h:230
const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 241 of file tesseractclass.h.

241  {
242  return textord_;
243  }
void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)
  • Counts up the labelled words and the blobs within.
  • Deletes all unused or emptied words, counting the unused ones.
  • Resets W_BOL and W_EOL flags correctly.
  • Builds the rebuild_word and rebuilds the box_word and the best_choice.

Definition at line 706 of file applybox.cpp.

706  {
707  int ok_blob_count = 0;
708  int bad_blob_count = 0;
709  int ok_word_count = 0;
710  int unlabelled_words = 0;
711  PAGE_RES_IT pr_it(page_res);
712  WERD_RES* word_res;
713  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
714  int ok_in_word = 0;
715  int blob_count = word_res->correct_text.size();
716  WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
717  word_choice->set_permuter(TOP_CHOICE_PERM);
718  for (int c = 0; c < blob_count; ++c) {
719  if (word_res->correct_text[c].length() > 0) {
720  ++ok_in_word;
721  }
722  // Since we only need a fake word_res->best_choice, the actual
723  // unichar_ids do not matter. Which is fortunate, since TidyUp()
724  // can be called while training Tesseract, at the stage where
725  // unicharset is not meaningful yet.
727  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
728  }
729  if (ok_in_word > 0) {
730  ok_blob_count += ok_in_word;
731  bad_blob_count += word_res->correct_text.size() - ok_in_word;
732  word_res->LogNewRawChoice(word_choice);
733  word_res->LogNewCookedChoice(1, false, word_choice);
734  } else {
735  ++unlabelled_words;
736  if (applybox_debug > 0) {
737  tprintf("APPLY_BOXES: Unlabelled word at :");
738  word_res->word->bounding_box().print();
739  }
740  pr_it.DeleteCurrentWord();
741  delete word_choice;
742  }
743  }
744  pr_it.restart_page();
745  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
746  // Denormalize back to a BoxWord.
747  word_res->RebuildBestState();
748  word_res->SetupBoxWord();
749  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
750  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
751  }
752  if (applybox_debug > 0) {
753  tprintf(" Found %d good blobs.\n", ok_blob_count);
754  if (bad_blob_count > 0) {
755  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
756  bad_blob_count, ok_word_count);
757  }
758  if (unlabelled_words > 0)
759  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
760  }
761 }
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
int size() const
Definition: genericvector.h:72
void RebuildBestState()
Definition: pageres.cpp:800
int length() const
Definition: genericvector.h:79
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
TBOX bounding_box() const
Definition: werd.cpp:160
GenericVector< STRING > correct_text
Definition: pageres.h:259
Definition: werd.h:35
Definition: werd.h:36
const UNICHARSET * uch_set
Definition: pageres.h:192
WERD * word
Definition: pageres.h:175
void SetupBoxWord()
Definition: pageres.cpp:843
GenericVector< int > best_state
Definition: pageres.h:255
#define NULL
Definition: host.h:144
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 421 of file docqual.cpp.

421  {
422  WERD_RES *word;
423  GARBAGE_LEVEL garbage_level;
424  PAGE_RES_IT copy_it;
425  BOOL8 prev_potential_marked = FALSE;
426  BOOL8 found_terrible_word = FALSE;
427  BOOL8 ok_dict_word;
428 
429  page_res_it.restart_page();
430  while (page_res_it.word() != NULL) {
431  POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
432  if (pb != NULL && !pb->IsText()) {
433  page_res_it.forward();
434  continue;
435  }
436  word = page_res_it.word();
437 
439  convert_bad_unlv_chs(word);
440 
442  word->merge_tess_fails();
443 
444  if (word->reject_map.accept_count () != 0) {
445  found_terrible_word = FALSE;
446  //Forget earlier potential crunches
447  prev_potential_marked = FALSE;
448  }
449  else {
450  ok_dict_word = safe_dict_word(word);
451  garbage_level = garbage_word (word, ok_dict_word);
452 
453  if ((garbage_level != G_NEVER_CRUNCH) &&
454  (terrible_word_crunch (word, garbage_level))) {
455  if (crunch_debug > 0) {
456  tprintf ("T CRUNCHING: \"%s\"\n",
457  word->best_choice->unichar_string().string());
458  }
460  if (prev_potential_marked) {
461  while (copy_it.word () != word) {
462  if (crunch_debug > 0) {
463  tprintf ("P1 CRUNCHING: \"%s\"\n",
464  copy_it.word()->best_choice->unichar_string().string());
465  }
466  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
467  copy_it.forward ();
468  }
469  prev_potential_marked = FALSE;
470  }
471  found_terrible_word = TRUE;
472  }
473  else if ((garbage_level != G_NEVER_CRUNCH) &&
474  (potential_word_crunch (word,
475  garbage_level, ok_dict_word))) {
476  if (found_terrible_word) {
477  if (crunch_debug > 0) {
478  tprintf ("P2 CRUNCHING: \"%s\"\n",
479  word->best_choice->unichar_string().string());
480  }
482  }
483  else if (!prev_potential_marked) {
484  copy_it = page_res_it;
485  prev_potential_marked = TRUE;
486  if (crunch_debug > 1) {
487  tprintf ("P3 CRUNCHING: \"%s\"\n",
488  word->best_choice->unichar_string().string());
489  }
490  }
491  }
492  else {
493  found_terrible_word = FALSE;
494  //Forget earlier potential crunches
495  prev_potential_marked = FALSE;
496  if (crunch_debug > 2) {
497  tprintf ("NO CRUNCH: \"%s\"\n",
498  word->best_choice->unichar_string().string());
499  }
500  }
501  }
502  page_res_it.forward ();
503  }
504 }
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:663
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:545
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
unsigned char BOOL8
Definition: host.h:113
bool IsText() const
Definition: polyblk.h:52
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
BLOCK * block
Definition: pageres.h:99
const STRING & unichar_string() const
Definition: ratngs.h:524
BLOCK_RES * block() const
Definition: pageres.h:739
WERD_RES * forward()
Definition: pageres.h:713
bool crunch_early_convert_bad_unlv_chs
WERD_RES * restart_page()
Definition: pageres.h:680
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:683
inT16 accept_count()
Definition: rejctmap.cpp:331
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:507
#define FALSE
Definition: capi.h:29
GARBAGE_LEVEL
Definition: docqual.h:25
#define TRUE
Definition: capi.h:28
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void merge_tess_fails()
Definition: pageres.cpp:1061
WERD_RES * word() const
Definition: pageres.h:733
void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 593 of file docqual.cpp.

593  {
594  WERD_RES *word;
595  PAGE_RES_IT copy_it;
596  BOOL8 deleting_from_bol = FALSE;
597  BOOL8 marked_delete_point = FALSE;
598  inT16 debug_delete_mode;
599  CRUNCH_MODE delete_mode;
600  inT16 x_debug_delete_mode;
601  CRUNCH_MODE x_delete_mode;
602 
603  page_res_it.restart_page();
604  while (page_res_it.word() != NULL) {
605  word = page_res_it.word();
606 
607  delete_mode = word_deletable (word, debug_delete_mode);
608  if (delete_mode != CR_NONE) {
609  if (word->word->flag (W_BOL) || deleting_from_bol) {
610  if (crunch_debug > 0) {
611  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
612  debug_delete_mode,
613  word->best_choice->unichar_string().string());
614  }
615  word->unlv_crunch_mode = delete_mode;
616  deleting_from_bol = TRUE;
617  } else if (word->word->flag(W_EOL)) {
618  if (marked_delete_point) {
619  while (copy_it.word() != word) {
620  x_delete_mode = word_deletable (copy_it.word (),
621  x_debug_delete_mode);
622  if (crunch_debug > 0) {
623  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
624  x_debug_delete_mode,
625  copy_it.word()->best_choice->unichar_string().string());
626  }
627  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
628  copy_it.forward ();
629  }
630  }
631  if (crunch_debug > 0) {
632  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
633  debug_delete_mode,
634  word->best_choice->unichar_string().string());
635  }
636  word->unlv_crunch_mode = delete_mode;
637  deleting_from_bol = FALSE;
638  marked_delete_point = FALSE;
639  }
640  else {
641  if (!marked_delete_point) {
642  copy_it = page_res_it;
643  marked_delete_point = TRUE;
644  }
645  }
646  }
647  else {
648  deleting_from_bol = FALSE;
649  //Forget earlier potential crunches
650  marked_delete_point = FALSE;
651  }
652  /*
653  The following step has been left till now as the tess fails are used to
654  determine if the word is deletable.
655  */
657  word->merge_tess_fails();
658  page_res_it.forward ();
659  }
660 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
unsigned char BOOL8
Definition: host.h:113
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: werd.h:35
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:898
WERD_RES * forward()
Definition: pageres.h:713
WERD_RES * restart_page()
Definition: pageres.h:680
Definition: werd.h:36
CRUNCH_MODE
Definition: pageres.h:145
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void merge_tess_fails()
Definition: pageres.cpp:1061
WERD_RES * word() const
Definition: pageres.h:733
short inT16
Definition: host.h:100
bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)

Definition at line 1402 of file control.cpp.

1402  {
1403  bool accept_new_x_ht = false;
1404  int original_misfits = CountMisfitTops(word);
1405  if (original_misfits == 0)
1406  return false;
1407  float baseline_shift = 0.0f;
1408  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1409  if (baseline_shift != 0.0f) {
1410  // Try the shift on its own first.
1411  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1412  word, block, row))
1413  return false;
1414  original_misfits = CountMisfitTops(word);
1415  if (original_misfits > 0) {
1416  float new_baseline_shift;
1417  // Now recompute the new x_height.
1418  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1419  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1420  // No test of return value here, as we are definitely making a change
1421  // to the word by shifting the baseline.
1422  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1423  word, block, row);
1424  }
1425  }
1426  return true;
1427  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1428  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1429  word, block, row);
1430  } else {
1431  return false;
1432  }
1433 }
const double kMinRefitXHeightFraction
Definition: control.cpp:58
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101
float x_height
Definition: pageres.h:295
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1437
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES *  word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript).
[in] leading_certainty: the (minimum) certainty of the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging).
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript).
[in] trailing_certainty: the (minimum) certainty of the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging).
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: if non-zero and !is_good, the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 382 of file superscript.cpp.

388  {
389  int num_chopped = word->chopped_word->NumBlobs();
390 
391  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
392 
393  // Chop apart the word into up to three pieces.
394 
395  BlamerBundle *bb0 = NULL;
396  BlamerBundle *bb1 = NULL;
397  WERD_RES *prefix = NULL;
398  WERD_RES *core = NULL;
399  WERD_RES *suffix = NULL;
400  if (num_chopped_leading > 0) {
401  prefix = new WERD_RES(*word);
402  split_word(prefix, num_chopped_leading, &core, &bb0);
403  } else {
404  core = new WERD_RES(*word);
405  }
406 
407  if (num_chopped_trailing > 0) {
408  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
409  split_word(core, split_pt, &suffix, &bb1);
410  }
411 
412  // Recognize the pieces in turn.
413  int saved_cp_multiplier = classify_class_pruner_multiplier;
414  int saved_im_multiplier = classify_integer_matcher_multiplier;
415  if (prefix) {
416  // Turn off Tesseract's y-position penalties for the leading superscript.
419 
420  // Adjust our expectations about the baseline for this prefix.
421  if (superscript_debug >= 3) {
422  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
423  }
424  recog_word_recursive(prefix);
425  if (superscript_debug >= 2) {
426  tprintf(" The leading bits look like %s %s\n",
427  ScriptPosToString(leading_pos),
428  prefix->best_choice->unichar_string().string());
429  }
430 
431  // Restore the normal y-position penalties.
432  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
433  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
434  }
435 
436  if (superscript_debug >= 3) {
437  tprintf(" recognizing middle %d chopped blobs\n",
438  num_chopped - num_chopped_leading - num_chopped_trailing);
439  }
440 
441  if (suffix) {
442  // Turn off Tesseract's y-position penalties for the trailing superscript.
445 
446  if (superscript_debug >= 3) {
447  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
448  }
449  recog_word_recursive(suffix);
450  if (superscript_debug >= 2) {
451  tprintf(" The trailing bits look like %s %s\n",
452  ScriptPosToString(trailing_pos),
453  suffix->best_choice->unichar_string().string());
454  }
455 
456  // Restore the normal y-position penalties.
457  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
458  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
459  }
460 
461  // Evaluate whether we think the results are believably better
462  // than what we already had.
463  bool good_prefix = !prefix || BelievableSuperscript(
464  superscript_debug >= 1, *prefix,
465  superscript_bettered_certainty * leading_certainty,
466  retry_rebuild_leading, NULL);
467  bool good_suffix = !suffix || BelievableSuperscript(
468  superscript_debug >= 1, *suffix,
469  superscript_bettered_certainty * trailing_certainty,
470  NULL, retry_rebuild_trailing);
471 
472  *is_good = good_prefix && good_suffix;
473  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
474  // None of it is any good. Quit now.
475  delete core;
476  delete prefix;
477  delete suffix;
478  return NULL;
479  }
480  recog_word_recursive(core);
481 
482  // Now paste the results together into core.
483  if (suffix) {
484  suffix->SetAllScriptPositions(trailing_pos);
485  join_words(core, suffix, bb1);
486  }
487  if (prefix) {
488  prefix->SetAllScriptPositions(leading_pos);
489  join_words(prefix, core, bb0);
490  core = prefix;
491  prefix = NULL;
492  }
493 
494  if (superscript_debug >= 1) {
495  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
496  core->best_choice->unichar_string().string());
497  }
498  return core;
499 }
int classify_integer_matcher_multiplier
Definition: classify.h:469
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:182
const STRING & unichar_string() const
Definition: ratngs.h:524
int NumBlobs() const
Definition: blobs.h:425
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:180
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:240
int classify_class_pruner_multiplier
Definition: classify.h:465
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:860
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
double superscript_bettered_certainty
void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word,
ROW * row 
)

Definition at line 117 of file docqual.cpp.

117  {
118  if (word->bln_boxes == NULL ||
119  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
120  return;
121 
122  DocQualCallbacks cb(word);
124  *word->rebuild_word,
126 }
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
TWERD * rebuild_word
Definition: pageres.h:244
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool empty() const
Definition: genericvector.h:84
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
#define NULL
Definition: host.h:144
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:49
void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 163 of file docqual.cpp.

164  {
165  WERD_RES *word;
166  ROW_RES *current_row;
167  BLOCK_RES *current_block;
168  int i;
169 
170  page_res_it.restart_page ();
171  while (page_res_it.word () != NULL) {
172  check_debug_pt (page_res_it.word (), 100);
173  if (bland_unrej) {
174  word = page_res_it.word ();
175  for (i = 0; i < word->reject_map.length (); i++) {
176  if (word->reject_map[i].accept_if_good_quality ())
177  word->reject_map[i].setrej_quality_accept ();
178  }
179  page_res_it.forward ();
180  }
181  else if ((page_res_it.row ()->char_count > 0) &&
182  ((page_res_it.row ()->rej_count /
183  (float) page_res_it.row ()->char_count) <=
185  word = page_res_it.word ();
189  word->best_choice->unichar_string().string(),
191  != AC_UNACCEPTABLE)) {
192  unrej_good_chs(word, page_res_it.row ()->row);
193  }
194  page_res_it.forward ();
195  }
196  else {
197  /* Skip to end of dodgy row */
198  current_row = page_res_it.row ();
199  while ((page_res_it.word () != NULL) &&
200  (page_res_it.row () == current_row))
201  page_res_it.forward ();
202  }
203  check_debug_pt (page_res_it.word (), 110);
204  }
205  page_res_it.restart_page ();
206  page_res_it.page_res->char_count = 0;
207  page_res_it.page_res->rej_count = 0;
208  current_block = NULL;
209  current_row = NULL;
210  while (page_res_it.word () != NULL) {
211  if (current_block != page_res_it.block ()) {
212  current_block = page_res_it.block ();
213  current_block->char_count = 0;
214  current_block->rej_count = 0;
215  }
216  if (current_row != page_res_it.row ()) {
217  current_row = page_res_it.row ();
218  current_row->char_count = 0;
219  current_row->rej_count = 0;
220  current_row->whole_word_rej_count = 0;
221  }
222  page_res_it.rej_stat_word ();
223  page_res_it.forward ();
224  }
225 }
void rej_stat_word()
Definition: pageres.cpp:1673
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
inT32 char_count
Definition: pageres.h:60
inT32 whole_word_rej_count
Definition: pageres.h:130
PAGE_RES * page_res
Definition: pageres.h:658
const STRING & unichar_lengths() const
Definition: ratngs.h:531
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:354
const STRING & unichar_string() const
Definition: ratngs.h:524
BLOCK_RES * block() const
Definition: pageres.h:739
WERD_RES * forward()
Definition: pageres.h:713
WERD_RES * restart_page()
Definition: pageres.h:680
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
inT32 char_count
Definition: pageres.h:100
const UNICHARSET * uch_set
Definition: pageres.h:192
ROW_RES * row() const
Definition: pageres.h:736
inT32 rej_count
Definition: pageres.h:61
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
inT32 rej_count
Definition: pageres.h:129
inT32 char_count
Definition: pageres.h:128
inT32 rej_count
Definition: pageres.h:101
Unacceptable word.
Definition: control.h:36
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:117
ROW * row
Definition: pageres.h:127
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
WERD_RES * word() const
Definition: pageres.h:733
BOOL8 tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uinT16  mode 
)

Definition at line 45 of file adaptions.cpp.

47  {
49  tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
50  word->best_choice == NULL ? "" :
52  word->best_choice->rating(), word->best_choice->certainty());
53  }
54 
55  BOOL8 status = FALSE;
56  BITS16 flags(mode);
57 
58  enum MODES
59  {
60  ADAPTABLE_WERD,
61  ACCEPTABLE_WERD,
62  CHECK_DAWGS,
63  CHECK_SPACES,
64  CHECK_ONE_ELL_CONFLICT,
65  CHECK_AMBIG_WERD
66  };
67 
68  /*
69  0: NO adaption
70  */
71  if (mode == 0) {
72  if (tessedit_adaption_debug) tprintf("adaption disabled\n");
73  return FALSE;
74  }
75 
76  if (flags.bit (ADAPTABLE_WERD)) {
77  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
78  if (tessedit_adaption_debug && !status) {
79  tprintf("tess_would_adapt bit is false\n");
80  }
81  }
82 
83  if (flags.bit (ACCEPTABLE_WERD)) {
84  status |= word->tess_accepted;
85  if (tessedit_adaption_debug && !status) {
86  tprintf("tess_accepted bit is false\n");
87  }
88  }
89 
90  if (!status) { // If not set then
91  return FALSE; // ignore other checks
92  }
93 
94  if (flags.bit (CHECK_DAWGS) &&
95  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
96  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
97  (word->best_choice->permuter () != USER_DAWG_PERM) &&
98  (word->best_choice->permuter () != NUMBER_PERM)) {
99  if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
100  return FALSE;
101  }
102 
103  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
104  if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
105  return FALSE;
106  }
107 
108  if (flags.bit (CHECK_SPACES) &&
109  (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
110  if (tessedit_adaption_debug) tprintf("word contains spaces\n");
111  return FALSE;
112  }
113 
114  if (flags.bit (CHECK_AMBIG_WERD) &&
116  if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
117  return FALSE;
118  }
119 
121  tprintf("returning status %d\n", status);
122  }
123  return status;
124 }
BOOL8 tess_accepted
Definition: pageres.h:280
float rating() const
Definition: ratngs.h:324
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
Definition: reject.cpp:292
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
unsigned char BOOL8
Definition: host.h:113
CMD_EVENTS mode
Definition: pgedit.cpp:116
BOOL8 tess_would_adapt
Definition: pageres.h:281
bool dangerous_ambig_found() const
Definition: ratngs.h:360
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: bits16.h:25
float certainty() const
Definition: ratngs.h:327
uinT8 permuter() const
Definition: ratngs.h:343
#define FALSE
Definition: capi.h:29
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
BOOL8 tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_its)

Definition at line 717 of file pgedit.cpp.

717  {
718  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
720  return word_set_display(pr_it);
721 }
ScrollView * image_win
Definition: pgedit.cpp:107
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:946
BOOL8 tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 729 of file pgedit.cpp.

729  {
730  WERD_RES* word_res = pr_it->word();
731  if (word_res->chopped_word == NULL) {
732  // Setup word normalization parameters.
733  word_res->SetupForRecognition(unicharset, this, BestPix(),
738  pr_it->row()->row, pr_it->block()->block);
739  }
742  1.0, 0.0f, -1000.0f, 1000.0f);
743  C_BLOB_IT it(word_res->word->cblob_list());
745  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
746  it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
748  color = WERD::NextColor(color);
749  }
751  return TRUE;
752 }
bool classify_bln_numeric_mode
Definition: classify.h:500
TWERD * chopped_word
Definition: pageres.h:201
static void Update()
Definition: scrollview.cpp:715
UNICHARSET unicharset
Definition: ccutil.h:72
BLOCK * block
Definition: pageres.h:99
BLOCK_RES * block() const
Definition: pageres.h:739
ScrollView * bln_word_window_handle()
Definition: pgedit.cpp:172
void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor, float y_offset, float minx, float maxx)
Definition: pgedit.cpp:210
void Clear()
Definition: scrollview.cpp:595
ROW_RES * row() const
Definition: pageres.h:736
DENORM denorm
Definition: pageres.h:190
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:306
Pix * BestPix() const
WERD * word
Definition: pageres.h:175
ROW * row
Definition: pageres.h:127
#define TRUE
Definition: capi.h:28
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define NULL
Definition: host.h:144
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_RES * word() const
Definition: pageres.h:733
inT16 tesseract::Tesseract::word_blob_quality ( WERD_RES * word,
ROW * row 
)

Definition at line 65 of file docqual.cpp.

65  {
66  if (word->bln_boxes == NULL ||
67  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
68  return 0;
69 
70  DocQualCallbacks cb(word);
72  *word->rebuild_word,
74  return cb.match_count;
75 }
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
TWERD * rebuild_word
Definition: pageres.h:244
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool empty() const
Definition: genericvector.h:84
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
#define NULL
Definition: host.h:144
void CountMatchingBlobs(int index)
Definition: docqual.cpp:39
void tesseract::Tesseract::word_char_quality ( WERD_RES * word,
ROW * row,
inT16 * match_count,
inT16 * accepted_match_count 
)

Definition at line 97 of file docqual.cpp.

100  {
101  if (word->bln_boxes == NULL ||
102  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
103  return;
104 
105  DocQualCallbacks cb(word);
107  *word->rebuild_word,
109  *match_count = cb.match_count;
110  *accepted_match_count = cb.accepted_match_count;
111 }
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
TWERD * rebuild_word
Definition: pageres.h:244
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool empty() const
Definition: genericvector.h:84
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:43
#define NULL
Definition: host.h:144
BOOL8 tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 509 of file reject.cpp.

510  {
511  inT16 i;
512  inT16 offset;
513 
514  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
515  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
516  (word_lengths[i] != 1 || word[offset] != '1'))
517  return TRUE;
518  }
519  return FALSE;
520 }
UNICHARSET unicharset
Definition: ccutil.h:72
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
short inT16
Definition: host.h:100
CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES * word,
inT16 delete_mode 
)

Definition at line 898 of file docqual.cpp.

898  {
899  int word_len = word->reject_map.length ();
900  float rating_per_ch;
901  TBOX box; //BB of word
902 
903  if (word->unlv_crunch_mode == CR_NONE) {
904  delete_mode = 0;
905  return CR_NONE;
906  }
907 
908  if (word_len == 0) {
909  delete_mode = 1;
910  return CR_DELETE;
911  }
912 
913  if (word->rebuild_word != NULL) {
914  // Cube leaves rebuild_word NULL.
915  box = word->rebuild_word->bounding_box();
916  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
917  delete_mode = 4;
918  return CR_DELETE;
919  }
920 
921  if (noise_outlines(word->rebuild_word)) {
922  delete_mode = 5;
923  return CR_DELETE;
924  }
925  }
926 
927  if ((failure_count (word) * 1.5) > word_len) {
928  delete_mode = 2;
929  return CR_LOOSE_SPACE;
930  }
931 
932  if (word->best_choice->certainty () < crunch_del_cert) {
933  delete_mode = 7;
934  return CR_LOOSE_SPACE;
935  }
936 
937  rating_per_ch = word->best_choice->rating () / word_len;
938 
939  if (rating_per_ch > crunch_del_rating) {
940  delete_mode = 8;
941  return CR_LOOSE_SPACE;
942  }
943 
945  delete_mode = 9;
946  return CR_LOOSE_SPACE;
947  }
948 
949  if (box.bottom () >
951  delete_mode = 10;
952  return CR_LOOSE_SPACE;
953  }
954 
955  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
956  delete_mode = 11;
957  return CR_LOOSE_SPACE;
958  }
959 
960  if (box.width () < crunch_del_min_width * kBlnXHeight) {
961  delete_mode = 3;
962  return CR_LOOSE_SPACE;
963  }
964 
965  delete_mode = 0;
966  return CR_NONE;
967 }
const int kBlnXHeight
Definition: normalis.h:28
float rating() const
Definition: ratngs.h:324
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:969
float certainty() const
Definition: ratngs.h:327
TWERD * rebuild_word
Definition: pageres.h:244
const int kBlnBaselineOffset
Definition: normalis.h:29
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:881
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:981
Definition: rect.h:30
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
BOOL8 tesseract::Tesseract::word_display ( PAGE_RES_IT * pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 761 of file pgedit.cpp.

761  {
762  WERD_RES* word_res = pr_it->word();
763  WERD* word = word_res->word;
764  TBOX word_bb; // word bounding box
765  int word_height; // ht of word BB
766  BOOL8 displayed_something = FALSE;
767  float shift; // from bot left
768  C_BLOB_IT c_it; // cblob iterator
769 
770  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
771  BoxWord* box_word = word_res->box_word;
772  WERD_CHOICE* best_choice = word_res->best_choice;
773  int length = box_word->length();
774  if (word_res->fontinfo == NULL) return false;
775  const FontInfo& font_info = *word_res->fontinfo;
776  for (int i = 0; i < length; ++i) {
778  switch (color_mode) {
779  case CM_SUBSCRIPT:
780  if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
781  color = ScrollView::RED;
782  break;
783  case CM_SUPERSCRIPT:
784  if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
785  color = ScrollView::RED;
786  break;
787  case CM_ITALIC:
788  if (font_info.is_italic())
789  color = ScrollView::RED;
790  break;
791  case CM_BOLD:
792  if (font_info.is_bold())
793  color = ScrollView::RED;
794  break;
795  case CM_FIXEDPITCH:
796  if (font_info.is_fixed_pitch())
797  color = ScrollView::RED;
798  break;
799  case CM_SERIF:
800  if (font_info.is_serif())
801  color = ScrollView::RED;
802  break;
803  case CM_SMALLCAPS:
804  if (word_res->small_caps)
805  color = ScrollView::RED;
806  break;
807  case CM_DROPCAPS:
808  if (best_choice->BlobPosition(i) == SP_DROPCAP)
809  color = ScrollView::RED;
810  break;
811  // TODO(rays) underline is currently completely unsupported.
812  case CM_UNDERLINE:
813  default:
814  break;
815  }
816  image_win->Pen(color);
817  TBOX box = box_word->BlobBox(i);
818  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
819  }
820  return true;
821  }
822  /*
823  Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color)
824  etc. are to keep the compiler happy.
825  */
826  // display bounding box
827  if (word->display_flag(DF_BOX)) {
828  word->bounding_box().plot(image_win,
832  editor_image_word_bb_color));
833 
836  image_win->Pen(c);
837  c_it.set_to_list(word->cblob_list());
838  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
839  c_it.data()->bounding_box().plot(image_win);
840  displayed_something = TRUE;
841  }
842 
843  // display edge steps
844  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
845  word->plot(image_win); // rainbow colors
846  displayed_something = TRUE;
847  }
848 
849  // display poly approx
850  if (word->display_flag(DF_POLYGONAL)) {
851  // need to convert
853  tword->plot(image_win);
854  delete tword;
855  displayed_something = TRUE;
856  }
857 
858  // Display correct text and blamer information.
859  STRING text;
860  STRING blame;
861  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
862  text = word->text();
863  }
864  if (word->display_flag(DF_BLAMER) &&
865  !(word_res->blamer_bundle != NULL &&
867  text = "";
868  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
869  if (blamer_bundle == NULL) {
870  text += "NULL";
871  } else {
872  text = blamer_bundle->TruthString();
873  }
874  text += " -> ";
875  STRING best_choice_str;
876  if (word_res->best_choice == NULL) {
877  best_choice_str = "NULL";
878  } else {
879  word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
880  }
881  text += best_choice_str;
882  IncorrectResultReason reason = (blamer_bundle == NULL) ?
883  IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
884  ASSERT_HOST(reason < IRR_NUM_REASONS)
885  blame += " [";
886  blame += BlamerBundle::IncorrectReasonName(reason);
887  blame += "]";
888  }
889  if (text.length() > 0) {
890  word_bb = word->bounding_box();
892  word_height = word_bb.height();
893  int text_height = 0.50 * word_height;
894  if (text_height > 20) text_height = 20;
895  image_win->TextAttributes("Arial", text_height, false, false, false);
896  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
897  image_win->Text(word_bb.left() + shift,
898  word_bb.bottom() + 0.25 * word_height, text.string());
899  if (blame.length() > 0) {
900  image_win->Text(word_bb.left() + shift,
901  word_bb.bottom() + 0.25 * word_height - text_height,
902  blame.string());
903  }
904 
905  displayed_something = TRUE;
906  }
907 
908  if (!displayed_something) // display BBox anyway
909  word->bounding_box().plot(image_win,
910  (ScrollView::Color)((inT32) editor_image_word_bb_color),
912  editor_image_word_bb_color));
913  return TRUE;
914 }
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:319
bool is_bold() const
Definition: fontinfo.h:112
IncorrectResultReason
Definition: blamer.h:37
void Pen(Color color)
Definition: scrollview.cpp:726
tesseract::BoxWord * box_word
Definition: pageres.h:250
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:658
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:641
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
const FontInfo * fontinfo
Definition: pageres.h:288
Definition: werd.h:55
bool small_caps
Definition: pageres.h:283
inT16 right() const
Definition: rect.h:75
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
#define ASSERT_HOST(x)
Definition: errcode.h:84
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:793
Definition: werd.h:51
void plot(ScrollView *window)
Definition: blobs.cpp:918
STRING TruthString() const
Definition: blamer.h:100
ScrollView * image_win
Definition: pgedit.cpp:107
inT16 left() const
Definition: rect.h:68
BOOL8 display_flag(uinT8 flag) const
Definition: werd.h:131
bool is_fixed_pitch() const
Definition: fontinfo.h:113
bool is_italic() const
Definition: fontinfo.h:111
Definition: werd.h:60
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
const char * text() const
Definition: werd.h:125
inT16 bottom() const
Definition: rect.h:61
WERD * word
Definition: pageres.h:175
int editor_image_word_bb_color
Definition: pgedit.cpp:136
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
#define FALSE
Definition: capi.h:29
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:297
bool is_serif() const
Definition: fontinfo.h:114
Definition: rect.h:30
#define TRUE
Definition: capi.h:28
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
int editor_image_blob_bb_color
Definition: pgedit.cpp:138
Definition: strngs.h:44
Definition: werd.h:50
#define NULL
Definition: host.h:144
Definition: blobs.h:395
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
BlamerBundle * blamer_bundle
Definition: pageres.h:230
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_RES * word() const
Definition: pageres.h:733
int inT32
Definition: host.h:102
void plot(ScrollView *fd) const
Definition: rect.h:278
BOOL8 tesseract::Tesseract::word_dumper ( PAGE_RES_IT * pr_it)

word_dumper()

Dump members to the debug window

Definition at line 922 of file pgedit.cpp.

922  {
923  if (pr_it->block()->block != NULL) {
924  tprintf("\nBlock data...\n");
925  pr_it->block()->block->print(NULL, FALSE);
926  }
927  tprintf("\nRow data...\n");
928  pr_it->row()->row->print(NULL);
929  tprintf("\nWord data...\n");
930  WERD_RES* word_res = pr_it->word();
931  word_res->word->print();
932  if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
934  tprintf("Current blamer debug: %s\n",
935  word_res->blamer_bundle->debug().string());
936  }
937  return TRUE;
938 }
#define tprintf(...)
Definition: tprintf.h:31
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
void print(FILE *fp, BOOL8 dump)
dump whole table
Definition: ocrblock.cpp:196
BLOCK * block
Definition: pageres.h:99
bool wordrec_debug_blamer
Definition: wordrec.h:167
BLOCK_RES * block() const
Definition: pageres.h:739
ROW_RES * row() const
Definition: pageres.h:736
void print()
Definition: werd.cpp:266
void print(FILE *fp)
Definition: ocrrow.cpp:167
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
ROW * row
Definition: pageres.h:127
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
BlamerBundle * blamer_bundle
Definition: pageres.h:230
WERD_RES * word() const
Definition: pageres.h:733
const STRING & debug() const
Definition: blamer.h:116
inT16 tesseract::Tesseract::word_outline_errs ( WERD_RES * word)

Definition at line 77 of file docqual.cpp.

77  {
78  inT16 i = 0;
79  inT16 err_count = 0;
80 
81  if (word->rebuild_word != NULL) {
82  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
83  TBLOB* blob = word->rebuild_word->blobs[b];
84  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
85  blob->NumOutlines());
86  i++;
87  }
88  }
89  return err_count;
90 }
Definition: blobs.h:261
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_string() const
Definition: ratngs.h:524
int NumBlobs() const
Definition: blobs.h:425
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:128
TWERD * rebuild_word
Definition: pageres.h:244
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
#define NULL
Definition: host.h:144
int NumOutlines() const
Definition: blobs.cpp:469
short inT16
Definition: host.h:100
BOOL8 tesseract::Tesseract::word_set_display ( PAGE_RES_IT * pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 946 of file pgedit.cpp.

946  {
947  WERD* word = pr_it->word()->word;
955  return word_display(pr_it);
956 }
BOOL8 bit(uinT8 bit_num) const
Definition: bits16.h:56
BOOL8 word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:761
void set_display_flag(uinT8 flag, BOOL8 value)
Definition: werd.h:132
Definition: werd.h:55
Definition: werd.h:51
BITS16 word_display_mode
Definition: pgedit.cpp:122
Definition: werd.h:60
WERD * word
Definition: pageres.h:175
Definition: werd.h:50
WERD_RES * word() const
Definition: pageres.h:733
inT16 tesseract::Tesseract::worst_noise_blob ( WERD_RES * word_res,
float *  worst_noise_score 
)

Definition at line 681 of file fixspace.cpp.

682  {
683  float noise_score[512];
684  int i;
685  int min_noise_blob; // 1st contender
686  int max_noise_blob; // last contender
687  int non_noise_count;
688  int worst_noise_blob; // Worst blob
689  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
690  float non_noise_limit = kBlnXHeight * 0.8;
691 
692  if (word_res->rebuild_word == NULL)
693  return -1; // Can't handle cube words.
694 
695  // Normalised.
696  int blob_count = word_res->box_word->length();
697  ASSERT_HOST(blob_count <= 512);
698  if (blob_count < 5)
699  return -1; // too short to split
700 
701  /* Get the noise scores for all blobs */
702 
703  #ifndef SECURE_NAMES
704  if (debug_fix_space_level > 5)
705  tprintf("FP fixspace Noise metrics for \"%s\": ",
706  word_res->best_choice->unichar_string().string());
707  #endif
708 
709  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
710  TBLOB* blob = word_res->rebuild_word->blobs[i];
711  if (word_res->reject_map[i].accepted())
712  noise_score[i] = non_noise_limit;
713  else
714  noise_score[i] = blob_noise_score(blob);
715 
716  if (debug_fix_space_level > 5)
717  tprintf("%1.1f ", noise_score[i]);
718  }
719  if (debug_fix_space_level > 5)
720  tprintf("\n");
721 
722  /* Now find the worst one which is far enough away from the end of the word */
723 
724  non_noise_count = 0;
725  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
726  if (noise_score[i] >= non_noise_limit) {
727  non_noise_count++;
728  }
729  }
730  if (non_noise_count < fixsp_non_noise_limit)
731  return -1;
732 
733  min_noise_blob = i;
734 
735  non_noise_count = 0;
736  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
737  i--) {
738  if (noise_score[i] >= non_noise_limit) {
739  non_noise_count++;
740  }
741  }
742  if (non_noise_count < fixsp_non_noise_limit)
743  return -1;
744 
745  max_noise_blob = i;
746 
747  if (min_noise_blob > max_noise_blob)
748  return -1;
749 
750  *worst_noise_score = small_limit;
751  worst_noise_blob = -1;
752  for (i = min_noise_blob; i <= max_noise_blob; i++) {
753  if (noise_score[i] < *worst_noise_score) {
754  worst_noise_blob = i;
755  *worst_noise_score = noise_score[i];
756  }
757  }
758  return worst_noise_blob;
759 }
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
tesseract::BoxWord * box_word
Definition: pageres.h:250
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:681
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
int NumBlobs() const
Definition: blobs.h:425
TWERD * rebuild_word
Definition: pageres.h:244
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:761
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int length() const
Definition: boxword.h:85
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::write_results ( PAGE_RES_IT page_res_it,
char  newline_type,
BOOL8  force_eol 
)

Definition at line 132 of file output.cpp.

134  { // override tilde crunch?
135  WERD_RES *word = page_res_it.word();
136  const UNICHARSET &uchset = *word->uch_set;
137  int i;
138  BOOL8 need_reject = FALSE;
139  UNICHAR_ID space = uchset.unichar_to_id(" ");
140 
141  if ((word->unlv_crunch_mode != CR_NONE ||
142  word->best_choice->length() == 0) &&
144  if ((word->unlv_crunch_mode != CR_DELETE) &&
145  (!stats_.tilde_crunch_written ||
146  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
147  (word->word->space () > 0) &&
148  !word->word->flag (W_FUZZY_NON) &&
149  !word->word->flag (W_FUZZY_SP)))) {
150  if (!word->word->flag (W_BOL) &&
151  (word->word->space () > 0) &&
152  !word->word->flag (W_FUZZY_NON) &&
153  !word->word->flag (W_FUZZY_SP)) {
154  stats_.last_char_was_tilde = false;
155  }
156  need_reject = TRUE;
157  }
158  if ((need_reject && !stats_.last_char_was_tilde) ||
159  (force_eol && stats_.write_results_empty_block)) {
160  /* Write a reject char - mark as rejected unless zero_rejection mode */
161  stats_.last_char_was_tilde = TRUE;
162  stats_.tilde_crunch_written = true;
163  stats_.last_char_was_newline = false;
164  stats_.write_results_empty_block = false;
165  }
166 
167  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
168  stats_.tilde_crunch_written = false;
169  stats_.last_char_was_newline = true;
170  stats_.last_char_was_tilde = false;
171  }
172 
173  if (force_eol)
174  stats_.write_results_empty_block = true;
175  return;
176  }
177 
178  /* NORMAL PROCESSING of non tilde crunched words */
179 
180  stats_.tilde_crunch_written = false;
181  if (newline_type)
182  stats_.last_char_was_newline = true;
183  else
184  stats_.last_char_was_newline = false;
185  stats_.write_results_empty_block = force_eol; // about to write a real word
186 
187  if (unlv_tilde_crunching &&
188  stats_.last_char_was_tilde &&
189  (word->word->space() == 0) &&
191  (word->best_choice->unichar_id(0) == space)) {
192  /* Prevent adjacent tilde across words - we know that adjacent tildes within
193  words have been removed */
194  word->MergeAdjacentBlobs(0);
195  }
196  if (newline_type ||
198  stats_.last_char_was_tilde = false;
199  else {
200  if (word->reject_map.length () > 0) {
201  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
202  stats_.last_char_was_tilde = true;
203  else
204  stats_.last_char_was_tilde = false;
205  }
206  else if (word->word->space () > 0)
207  stats_.last_char_was_tilde = false;
208  /* else it is unchanged as there are no output chars */
209  }
210 
211  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
212 
213  set_unlv_suspects(word);
214  check_debug_pt (word, 120);
216  tprintf ("Dict word: \"%s\": %d\n",
217  word->best_choice->debug_string().string(),
218  dict_word(*(word->best_choice)));
219  }
220  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
222  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
223  for (i = 0; i < word->best_choice->length(); ++i) {
224  if (word->reject_map[i].rejected())
225  word->reject_map[i].setrej_minimal_rej_accept();
226  }
227  }
229  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
230  for (i = 0; i < word->best_choice->length(); ++i) {
231  if ((word->best_choice->unichar_id(i) != space) &&
232  word->reject_map[i].rejected())
233  word->reject_map[i].setrej_minimal_rej_accept();
234  }
235  }
236  }
237 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
unsigned char BOOL8
Definition: host.h:113
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:124
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: werd.h:35
Definition: werd.h:36
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
const STRING debug_string() const
Definition: ratngs.h:502
int UNICHAR_ID
Definition: unichar.h:33
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:307
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
uinT8 space()
Definition: werd.h:104
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:968
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
const char * string() const
Definition: strngs.cpp:193
WERD_RES * word() const
Definition: pageres.h:733

Member Data Documentation

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 817 of file tesseractclass.h.

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image" " filename. The names of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"

Definition at line 822 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."

Definition at line 826 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."

Definition at line 829 of file tesseractclass.h.

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 818 of file tesseractclass.h.

double tesseract::Tesseract::bestrate_pruning_factor = 2.0

"Multiplying factor of" " current best rate to prune other hypotheses"

Definition at line 1103 of file tesseractclass.h.

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 816 of file tesseractclass.h.

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 929 of file tesseractclass.h.

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 869 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 870 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 871 of file tesseractclass.h.

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1043 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 958 of file tesseractclass.h.

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 967 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 947 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 952 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 953 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 949 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 948 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 950 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 946 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 937 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 936 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 961 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Dont pot crunch sensible strings"

Definition at line 960 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Dont crunch words with long lower case strings"

Definition at line 963 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Dont touch sensible strings"

Definition at line 957 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Dont crunch words with long upper case strings"

Definition at line 965 of file tesseractclass.h.

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 966 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 941 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 942 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 945 of file tesseractclass.h.

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 956 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 944 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 943 of file tesseractclass.h.

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 955 of file tesseractclass.h.

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 954 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 939 of file tesseractclass.h.

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 938 of file tesseractclass.h.

int tesseract::Tesseract::cube_debug_level = 1

"Print cube debug info."

Definition at line 893 of file tesseractclass.h.

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 868 of file tesseractclass.h.

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 973 of file tesseractclass.h.

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 852 of file tesseractclass.h.

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 867 of file tesseractclass.h.

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 897 of file tesseractclass.h.

bool tesseract::Tesseract::enable_new_segsearch = false

"Enable new segmentation search path."

Definition at line 1143 of file tesseractclass.h.

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"

Definition at line 851 of file tesseractclass.h.

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1050 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 972 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blobs either side?"

Definition at line 969 of file tesseractclass.h.

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 970 of file tesseractclass.h.

double tesseract::Tesseract::heuristic_max_char_wh_ratio = 2.0

"max char width-to-height ratio allowed in segmentation"

Definition at line 1141 of file tesseractclass.h.

double tesseract::Tesseract::heuristic_segcost_rating_base = 1.25

"base factor for adding segmentation cost into word rating." " It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost."

Definition at line 1132 of file tesseractclass.h.

double tesseract::Tesseract::heuristic_weight_rating = 1

"weight associated with char rating in combined cost of state"

Definition at line 1134 of file tesseractclass.h.

double tesseract::Tesseract::heuristic_weight_seamcut = 0

"weight associated with seam cut in combined cost of state"

Definition at line 1139 of file tesseractclass.h.

double tesseract::Tesseract::heuristic_weight_width = 1000.0

"weight associated with width evidence in combined cost of" " state"

Definition at line 1137 of file tesseractclass.h.

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 935 of file tesseractclass.h.

bool tesseract::Tesseract::include_page_breaks = false

"Include page separator string in output text after each " "image/page."

Definition at line 1083 of file tesseractclass.h.

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1049 of file tesseractclass.h.

int tesseract::Tesseract::language_model_fixed_length_choices_depth = 3

"Depth of blob choice lists to explore" " when fixed length dawgs are on"

Definition at line 1126 of file tesseractclass.h.

bool tesseract::Tesseract::load_fixed_length_dawgs = true

"Load fixed length" " dawgs (e.g. for non-space delimited languages)"

Definition at line 1099 of file tesseractclass.h.

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1061 of file tesseractclass.h.

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq than this"

Definition at line 1044 of file tesseractclass.h.

bool tesseract::Tesseract::ngram_permuter_activated = false

"Activate character-level n-gram-based permuter"

Definition at line 1122 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 855 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 858 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 864 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 861 of file tesseractclass.h.

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 865 of file tesseractclass.h.

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 866 of file tesseractclass.h.

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 975 of file tesseractclass.h.

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 811 of file tesseractclass.h.

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1042 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 895 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 894 of file tesseractclass.h.

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1085 of file tesseractclass.h.

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 807 of file tesseractclass.h.

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 889 of file tesseractclass.h.

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition " "(more accurate)"

Definition at line 892 of file tesseractclass.h.

bool tesseract::Tesseract::permute_chartype_word = 0

"Turn on character type (property) consistency permuter"

Definition at line 1115 of file tesseractclass.h.

bool tesseract::Tesseract::permute_debug = 0

"char permutation debug"

Definition at line 1101 of file tesseractclass.h.

bool tesseract::Tesseract::permute_fixed_length_dawg = 0

"Turn on fixed-length phrasebook search permuter"

Definition at line 1113 of file tesseractclass.h.

bool tesseract::Tesseract::permute_only_top = false

"Run only the top choice permuter"

Definition at line 1123 of file tesseractclass.h.

bool tesseract::Tesseract::permute_script_word = 0

"Turn on word script consistency permuter"

Definition at line 1105 of file tesseractclass.h.

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1065 of file tesseractclass.h.

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1080 of file tesseractclass.h.

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 873 of file tesseractclass.h.

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 876 of file tesseractclass.h.

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 877 of file tesseractclass.h.

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 875 of file tesseractclass.h.

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 872 of file tesseractclass.h.

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 931 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Dont double check"

Definition at line 1033 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1032 of file tesseractclass.h.

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1038 of file tesseractclass.h.

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conf. detector"

Definition at line 1031 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1036 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1037 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1034 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1035 of file tesseractclass.h.

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1039 of file tesseractclass.h.

int tesseract::Tesseract::segment_debug = 0

"Debug the whole segmentation process"

Definition at line 1100 of file tesseractclass.h.

double tesseract::Tesseract::segment_reward_chartype = 0.97

"Score multiplier for char type consistency within a word. "

Definition at line 1117 of file tesseractclass.h.

double tesseract::Tesseract::segment_reward_ngram_best_choice = 0.99

"Score multiplier for ngram permuter's best choice" " (only used in the Han script path)."

Definition at line 1120 of file tesseractclass.h.

double tesseract::Tesseract::segment_reward_script = 0.95

"Score multiplier for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."

Definition at line 1111 of file tesseractclass.h.

bool tesseract::Tesseract::segment_segcost_rating = 0

"incorporate segmentation cost in word rating?"

Definition at line 1107 of file tesseractclass.h.

double tesseract::Tesseract::segsearch_max_fixed_pitch_char_wh_ratio = 2.0

"Maximum character width-to-height ratio for" " fixed pitch fonts"

Definition at line 1146 of file tesseractclass.h.

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."

Definition at line 994 of file tesseractclass.h.

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"

Definition at line 986 of file tesseractclass.h.

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 979 of file tesseractclass.h.

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."

Definition at line 998 of file tesseractclass.h.

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."

Definition at line 990 of file tesseractclass.h.

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"

Definition at line 982 of file tesseractclass.h.

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1016 of file tesseractclass.h.

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1014 of file tesseractclass.h.

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1009 of file tesseractclass.h.

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Dont touch bad rating limit"

Definition at line 1015 of file tesseractclass.h.

int tesseract::Tesseract::suspect_short_words = 2

"Dont Suspect dict wds longer than this"

Definition at line 1013 of file tesseractclass.h.

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 1011 of file tesseractclass.h.

int tesseract::Tesseract::tessdata_manager_debug_level = 0

"Debug level for TessdataManager functions."

Definition at line 1053 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 815 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 803 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram " "correction."

Definition at line 848 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 797 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 801 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 799 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 1023 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1045 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 1005 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1006 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_txt = true

"Write .txt output file"

Definition at line 1004 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 842 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 926 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 841 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 928 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 830 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 915 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 917 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 831 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 787 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 844 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 846 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 840 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 834 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 837 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1026 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 923 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 899 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1040 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."

Definition at line 1068 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1055 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1028 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 785 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 883 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 881 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1017 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY

"Which OCR engine(s) to run (Tesseract, Cube, both). Defaults to loading and running only Tesseract (no Cube, no combiner). (Values from OcrEngineMode enum in tesseractclass.h)"

Definition at line 795 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ok_mode = 5

"Acceptance decision algorithm"

Definition at line 1097 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1051 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1047 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block, 5=line, 6=word, 7=char (Values from PageSegMode enum in publictypes.h)"

Definition at line 791 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1078 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 971 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 911 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 919 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 913 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 838 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 925 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 904 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 902 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1024 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 906 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Adaption debug"

Definition at line 1025 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 779 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 781 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 921 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 879 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 882 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 885 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 832 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 783 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 836 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1030 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1057 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 900 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects which prevents whole row rejection"

Definition at line 909 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1020 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 1000 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1048 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 813 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 1002 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 1003 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1022 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1018 of file tesseractclass.h.

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 886 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 887 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 888 of file tesseractclass.h.

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1069 of file tesseractclass.h.

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1077 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1072 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1062 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_vertical_horizontal_mix = true

"find horizontal lines such as headers in vertical page mode"

Definition at line 1096 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1070 of file tesseractclass.h.

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page mode"

Definition at line 1075 of file tesseractclass.h.

bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE

"Use CJK fixed pitch model"

Definition at line 1063 of file tesseractclass.h.

bool tesseract::Tesseract::unlv_tilde_crunching = true

"Mark v.bad words for tilde crunch"

Definition at line 933 of file tesseractclass.h.

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1008 of file tesseractclass.h.

bool tesseract::Tesseract::use_new_state_cost = FALSE

"use new state cost heuristics for segmentation state evaluation"

Definition at line 1128 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 977 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 978 of file tesseractclass.h.


The documentation for this class was generated from the following files: