All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::Wordrec Class Reference

#include <wordrec.h>

Inheritance diagram for tesseract::Wordrec:
tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Tesseract

Public Member Functions

 Wordrec ()
 
virtual ~Wordrec ()
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void WordSearch (WERD_RES *word_res)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
program_editup

Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models.

void program_editup (const char *textbase, bool init_classifier, bool init_permute)
 
cc_recog

Recognize a word.

void cc_recog (WERD_RES *word)
 
program_editdown

This function holds any necessary post-processing for the Wise Owl program.

void program_editdown (inT32 elasped_time)
 
set_pass1

Get ready to do some pass 1 stuff.

void set_pass1 ()
 
set_pass2

Get ready to do some pass 2 stuff.

void set_pass2 ()
 
end_recog

Cleanup and exit the recog program.

int end_recog ()
 
call_matcher

Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification.

BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
dict_word()

Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary.

int dict_word (const WERD_CHOICE &word)
 
classify_blob

Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.

Parameters
blob: Current blob
string: The string to display in ScrollView
color: The colour to use when displayed with ScrollView
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
point_priority

Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT.

PRIORITY point_priority (EDGEPT *point)
 
add_point_to_list

Add an edge point to a POINT_GROUP containing a list of other points.

void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
angle_change

Return the change in angle (degrees) of the line segments between points one and two, and two and three.

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
pick_close_point

Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point.

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
prioritize_points

Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order.

void prioritize_points (TESSLINE *outline, PointHeap *points)
 
new_min_point

Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to NULL.

void new_min_point (EDGEPT *local_min, PointHeap *points)
 
new_max_point

Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to NULL.

void new_max_point (EDGEPT *local_max, PointHeap *points)
 
vertical_projection_point

For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list.

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
improve_one_blob

Finds the best place to chop, based on the worst blob, fixpt, or next to a fragment, according to the input. Returns the SEAM corresponding to the chop point, if any is found, and the index in the ratings_matrix of the chopped blob. Note that blob_choices is just a copy of the pointers in the leading diagonal of the ratings MATRIX. Although the blob is chopped, the returned SEAM is yet to be inserted into word->seam_array and the resulting blobs are unclassified, so this function can be used by ApplyBox as well as during recognition.

SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
chop_one_blob

Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing chopper.

SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
chop_word_main

Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. The results are returned in the WERD_RES.

void chop_word_main (WERD_RES *word)
 
improve_by_chopping

Repeatedly chops the worst blob, classifying the new blobs, fixing up all the data, and incrementally runs the segmentation search until a good word is found, or no more chops can be found.

void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (bool load_pre_trained_templates)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (FILE *File)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
TessdataManager tessdata_manager
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Protected Member Functions

bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 123 of file wordrec.h.

Constructor & Destructor Documentation

tesseract::Wordrec::Wordrec ( )

Definition at line 26 of file wordrec.cpp.

26  :
27  // control parameters
29  "Merge the fragments in the ratings matrix and delete them"
30  " after merging", params()),
31  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
32  params()),
33  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
34  params()),
36  "force associator to run regardless of what enable_assoc is."
37  "This is used for CJK where component grouping is necessary.",
38  CCUtil::params()),
39  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
40  params()),
42  "Use information from fragments to guide chopping process",
43  params()),
44  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
45  params()),
46  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
47  params()),
48  INT_MEMBER(chop_debug, 0, "Chop debug",
49  params()),
50  BOOL_MEMBER(chop_enable, 1, "Chop enable",
51  params()),
52  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
53  params()),
54  INT_MEMBER(chop_split_length, 10000, "Split Length",
55  params()),
56  INT_MEMBER(chop_same_distance, 2, "Same distance",
57  params()),
58  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
59  params()),
60  INT_MEMBER(chop_seam_pile_size, 150, "Max number of seams in seam_pile",
61  params()),
62  BOOL_MEMBER(chop_new_seam_pile, 1, "Use new seam_pile", params()),
63  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
64  params()),
65  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
66  params()),
67  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
68  params()),
69  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
70  params()),
71  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
72  params()),
73  INT_MEMBER(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
74  "above which we don't care that a chop is not near the center.",
75  params()),
76  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
77  params()),
78  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
79  params()),
80  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
81  params()),
82  double_MEMBER(chop_good_split, 50.0, "Good split limit",
83  params()),
84  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
85  params()),
86  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
87  params()),
89  "include fixed-pitch heuristics in char segmentation",
90  params()),
92  "Debug level for wordrec", params()),
94  "Max number of broken pieces to associate", params()),
96  "Only run OCR for words that had truth recorded in BlamerBundle",
97  params()),
99  "Print blamer debug messages", params()),
101  "Try to set the blame for errors", params()),
103  "SegSearch debug level", params()),
105  "Maximum number of pain points stored in the queue",
106  params()),
108  "Maximum number of pain point classifications per chunk that"
109  "did not result in finding a better word choice.",
110  params()),
112  "Maximum character width-to-height ratio", params()),
114  "Save alternative paths found during chopping"
115  " and segmentation search",
116  params()) {
118  language_model_ = new LanguageModel(&get_fontinfo_table(),
119  &(getDict()));
121 }
int repair_unchopped_blobs
Definition: wordrec.h:137
int chop_min_outline_points
Definition: wordrec.h:144
bool fragments_guide_chopper
Definition: wordrec.h:136
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:420
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
double chop_ok_split
Definition: wordrec.h:156
int chop_same_distance
Definition: wordrec.h:143
double chop_overlap_knob
Definition: wordrec.h:150
double chop_width_change_knob
Definition: wordrec.h:155
double chop_sharpness_knob
Definition: wordrec.h:154
double tessedit_certainty_threshold
Definition: wordrec.h:138
double chop_good_split
Definition: wordrec.h:157
double wordrec_worst_state
Definition: wordrec.h:134
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
bool chop_new_seam_pile
Definition: wordrec.h:146
int chop_seam_pile_size
Definition: wordrec.h:145
int segment_adjust_debug
Definition: wordrec.h:159
bool wordrec_no_block
Definition: wordrec.h:129
bool wordrec_enable_assoc
Definition: wordrec.h:130
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
int wordrec_max_join_chunks
Definition: wordrec.h:164
bool wordrec_debug_blamer
Definition: wordrec.h:167
bool save_alt_choices
Definition: wordrec.h:178
bool force_word_assoc
Definition: wordrec.h:133
bool wordrec_run_blamer
Definition: wordrec.h:168
LanguageModel * language_model_
Definition: wordrec.h:411
double chop_center_knob
Definition: wordrec.h:151
int chop_centered_maxwidth
Definition: wordrec.h:153
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
Dict & getDict()
Definition: classify.h:65
bool wordrec_skip_no_truth_words
Definition: wordrec.h:166
double chop_split_dist_knob
Definition: wordrec.h:149
int segsearch_max_pain_points
Definition: wordrec.h:171
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
ParamsVectors * params()
Definition: ccutil.h:65
bool chop_vertical_creep
Definition: wordrec.h:141
#define FALSE
Definition: capi.h:29
int segsearch_max_futile_classifications
Definition: wordrec.h:173
#define TRUE
Definition: capi.h:28
int chop_min_outline_area
Definition: wordrec.h:148
bool merge_fragments_in_matrix
Definition: wordrec.h:128
int wordrec_debug_level
Definition: wordrec.h:162
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
int segsearch_debug_level
Definition: wordrec.h:169
#define NULL
Definition: host.h:144
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
tesseract::Wordrec::~Wordrec ( )
virtual

Definition at line 123 of file wordrec.cpp.

123  {
124  delete language_model_;
125 }
LanguageModel * language_model_
Definition: wordrec.h:411

Member Function Documentation

void tesseract::Wordrec::add_point_to_list ( PointHeap * point_heap,
EDGEPT * point 
)

Definition at line 64 of file chop.cpp.

64  {
65  if (point_heap->size() < MAX_NUM_POINTS - 2) {
66  PointPair pair(point_priority(point), point);
67  point_heap->Push(&pair);
68  }
69 
70 #ifndef GRAPHICS_DISABLED
71  if (chop_debug > 2)
72  mark_outline(point);
73 #endif
74 }
#define MAX_NUM_POINTS
Definition: chop.h:39
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
void Push(Pair *entry)
Definition: genericheap.h:95
void mark_outline(EDGEPT *edgept)
Definition: plotedges.cpp:95
void tesseract::Wordrec::add_seam_to_queue ( float  new_priority,
SEAM * new_seam,
SeamQueue * seams 
)

Definition at line 64 of file findseam.cpp.

65  {
66  if (new_seam == NULL) return;
67  if (chop_debug) {
68  tprintf("Pushing new seam with priority %g :", new_priority);
69  new_seam->Print("seam: ");
70  }
71  if (seams->size() >= MAX_NUM_SEAMS) {
72  SeamPair old_pair(0, NULL);
73  if (seams->PopWorst(&old_pair) && old_pair.key() <= new_priority) {
74  if (chop_debug) {
75  tprintf("Old seam staying with priority %g\n", old_pair.key());
76  }
77  delete new_seam;
78  seams->Push(&old_pair);
79  return;
80  } else if (chop_debug) {
81  tprintf("New seam with priority %g beats old worst seam with %g\n",
82  new_priority, old_pair.key());
83  }
84  }
85  SeamPair new_pair(new_priority, new_seam);
86  seams->Push(&new_pair);
87 }
#define tprintf(...)
Definition: tprintf.h:31
void Print(const char *label) const
Definition: seam.cpp:160
#define MAX_NUM_SEAMS
Definition: findseam.cpp:46
void Push(Pair *entry)
Definition: genericheap.h:95
bool PopWorst(Pair *entry)
Definition: genericheap.h:138
#define NULL
Definition: host.h:144
int tesseract::Wordrec::angle_change ( EDGEPT * point1,
EDGEPT * point2,
EDGEPT * point3 
)

Definition at line 88 of file chop.cpp.

88  {
89  VECTOR vector1;
90  VECTOR vector2;
91 
92  int angle;
93  float length;
94 
95  /* Compute angle */
96  vector1.x = point2->pos.x - point1->pos.x;
97  vector1.y = point2->pos.y - point1->pos.y;
98  vector2.x = point3->pos.x - point2->pos.x;
99  vector2.y = point3->pos.y - point2->pos.y;
100  /* Use cross product */
101  length = (float)sqrt((float)LENGTH(vector1) * LENGTH(vector2));
102  if ((int) length == 0)
103  return (0);
104  angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) /
105  length) / PI * 180.0 + 0.5));
106 
107  /* Use dot product */
108  if (SCALAR (vector1, vector2) < 0)
109  angle = 180 - angle;
110  /* Adjust angle */
111  if (angle > 180)
112  angle -= 360;
113  if (angle <= -180)
114  angle += 360;
115  return (angle);
116 }
inT16 y
Definition: blobs.h:72
#define CROSS(a, b)
Definition: vecfuncs.h:52
Definition: blobs.h:50
inT16 x
Definition: blobs.h:71
TPOINT pos
Definition: blobs.h:163
#define PI
Definition: const.h:19
#define SCALAR(a, b)
Definition: vecfuncs.h:61
#define LENGTH(a)
Definition: vecfuncs.h:70
SEAM * tesseract::Wordrec::attempt_blob_chop ( TWERD * word,
TBLOB * blob,
inT32  blob_number,
bool  italic_blob,
const GenericVector< SEAM * > &  seams 
)

Definition at line 170 of file chopper.cpp.

172  {
175  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
176  // Insert it into the word.
177  word->blobs.insert(other_blob, blob_number + 1);
178 
179  SEAM *seam = NULL;
180  if (prioritize_division) {
181  TPOINT location;
182  if (divisible_blob(blob, italic_blob, &location)) {
183  seam = new SEAM(0.0f, location);
184  }
185  }
186  if (seam == NULL)
187  seam = pick_good_seam(blob);
188  if (chop_debug) {
189  if (seam != NULL)
190  seam->Print("Good seam picked=");
191  else
192  tprintf("\n** no seam picked *** \n");
193  }
194  if (seam) {
195  seam->ApplySeam(italic_blob, blob, other_blob);
196  }
197 
198  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
199  seams, seam);
200  if (seam == NULL) {
204  // If the blob can simply be divided into outlines, then do that.
205  TPOINT location;
206  if (divisible_blob(blob, italic_blob, &location)) {
207  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
208  word->blobs.insert(other_blob, blob_number + 1);
209  seam = new SEAM(0.0f, location);
210  seam->ApplySeam(italic_blob, blob, other_blob);
211  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
212  seams, seam);
213  }
214  }
215  }
216  if (seam != NULL) {
217  // Make sure this seam doesn't get chopped again.
218  seam->Finalize();
219  }
220  return seam;
221 }
Definition: blobs.h:261
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:83
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:934
int repair_unchopped_blobs
Definition: wordrec.h:137
#define tprintf(...)
Definition: tprintf.h:31
void Print(const char *label) const
Definition: seam.cpp:160
bool prioritize_division
Definition: classify.h:387
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:124
void insert(T t, int index)
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:216
Definition: blobs.h:50
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:352
#define NULL
Definition: host.h:144
void Finalize()
Definition: seam.h:116
Definition: seam.h:44
TESSLINE * outlines
Definition: blobs.h:377
bool allow_blob_division
Definition: classify.h:382
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:123
BLOB_CHOICE_LIST * tesseract::Wordrec::call_matcher ( TBLOB * blob)

Definition at line 134 of file tface.cpp.

134  {
135  // Rotate the blob for classification if necessary.
136  TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded();
137  if (rotated_blob == NULL) {
138  rotated_blob = tessblob;
139  }
140  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result
141  AdaptiveClassifier(rotated_blob, ratings);
142  if (rotated_blob != tessblob) {
143  delete rotated_blob;
144  }
145  return ratings;
146 }
Definition: blobs.h:261
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:363
#define NULL
Definition: host.h:144
void tesseract::Wordrec::CallFillLattice ( const MATRIX &  ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET &  unicharset,
BlamerBundle *  blamer_bundle 
)
inline

Definition at line 195 of file wordrec.h.

198  {
199  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
200  }
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:420
void tesseract::Wordrec::cc_recog ( WERD_RES * word)

Definition at line 109 of file tface.cpp.

109  {
111  chop_word_main(word);
112  word->DebugWordChoices(getDict().stopper_debug_level >= 1,
113  getDict().word_to_debug.string());
114  ASSERT_HOST(word->StatesAllValid());
115 }
bool StatesAllValid()
Definition: pageres.cpp:449
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:471
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:394
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: werd.h:36
Dict & getDict()
Definition: classify.h:65
WERD * word
Definition: pageres.h:175
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void tesseract::Wordrec::choose_best_seam ( SeamQueue *  seam_queue,
const SPLIT *  split,
PRIORITY  priority,
SEAM **  seam_result,
TBLOB *  blob,
SeamPile *  seam_pile 
)

Definition at line 103 of file findseam.cpp.

105  {
106  SEAM *seam;
107  char str[80];
108  float my_priority;
109  /* Add seam of split */
110  my_priority = priority;
111  if (split != NULL) {
112  TPOINT split_point = split->point1->pos;
113  split_point += split->point2->pos;
114  split_point /= 2;
115  seam = new SEAM(my_priority, split_point, *split);
116  if (chop_debug > 1) seam->Print("Partial priority ");
117  add_seam_to_queue(my_priority, seam, seam_queue);
118 
119  if (my_priority > chop_good_split)
120  return;
121  }
122 
123  TBOX bbox = blob->bounding_box();
124  /* Queue loop */
125  while (!seam_queue->empty()) {
126  SeamPair seam_pair;
127  seam_queue->Pop(&seam_pair);
128  seam = seam_pair.extract_data();
129  /* Set full priority */
130  my_priority = seam->FullPriority(bbox.left(), bbox.right(),
133  if (chop_debug) {
134  sprintf (str, "Full my_priority %0.0f, ", my_priority);
135  seam->Print(str);
136  }
137 
138  if ((*seam_result == NULL || (*seam_result)->priority() > my_priority) &&
139  my_priority < chop_ok_split) {
140  /* No crossing */
141  if (seam->IsHealthy(*blob, chop_min_outline_points,
143  delete *seam_result;
144  *seam_result = new SEAM(*seam);
145  (*seam_result)->set_priority(my_priority);
146  } else {
147  delete seam;
148  seam = NULL;
149  my_priority = BAD_PRIORITY;
150  }
151  }
152 
153  if (my_priority < chop_good_split) {
154  if (seam)
155  delete seam;
156  return; /* Made good answer */
157  }
158 
159  if (seam) {
160  /* Combine with others */
161  if (seam_pile->size() < chop_seam_pile_size) {
162  combine_seam(*seam_pile, seam, seam_queue);
163  SeamDecPair pair(seam_pair.key(), seam);
164  seam_pile->Push(&pair);
165  } else if (chop_new_seam_pile &&
166  seam_pile->size() == chop_seam_pile_size &&
167  seam_pile->PeekTop().key() > seam_pair.key()) {
168  combine_seam(*seam_pile, seam, seam_queue);
169  SeamDecPair pair;
170  seam_pile->Pop(&pair); // pop the worst.
171  // Replace the seam in pair (deleting the old one) with
172  // the new seam and score, then push back into the heap.
173  pair.set_key(seam_pair.key());
174  pair.set_data(seam);
175  seam_pile->Push(&pair);
176  } else {
177  delete seam;
178  }
179  }
180 
181  my_priority = seam_queue->empty() ? NO_FULL_PRIORITY
182  : seam_queue->PeekTop().key();
183  if ((my_priority > chop_ok_split) ||
184  (my_priority > chop_good_split && split))
185  return;
186  }
187 }
#define NO_FULL_PRIORITY
Definition: findseam.cpp:49
bool Pop(Pair *entry)
Definition: genericheap.h:116
int chop_min_outline_points
Definition: wordrec.h:144
float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth, double center_knob, double width_change_knob) const
Definition: seam.cpp:245
double chop_ok_split
Definition: wordrec.h:156
void Print(const char *label) const
Definition: seam.cpp:160
double chop_overlap_knob
Definition: wordrec.h:150
double chop_width_change_knob
Definition: wordrec.h:155
bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const
Definition: seam.cpp:72
double chop_good_split
Definition: wordrec.h:157
bool chop_new_seam_pile
Definition: wordrec.h:146
const Key & key() const
Definition: kdpair.h:116
int chop_seam_pile_size
Definition: wordrec.h:145
inT16 right() const
Definition: rect.h:75
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:64
EDGEPT * point2
Definition: split.h:104
inT16 left() const
Definition: rect.h:68
double chop_center_knob
Definition: wordrec.h:151
int chop_centered_maxwidth
Definition: wordrec.h:153
void set_key(const Key &new_key)
Definition: kdpair.h:119
Definition: blobs.h:50
void set_data(Data *new_data)
Definition: kdpair.h:126
void Push(Pair *entry)
Definition: genericheap.h:95
const Pair & PeekTop() const
Definition: genericheap.h:108
bool empty() const
Definition: genericheap.h:68
TPOINT pos
Definition: blobs.h:163
Definition: rect.h:30
int chop_min_outline_area
Definition: wordrec.h:148
#define NULL
Definition: host.h:144
Definition: seam.h:44
TBOX bounding_box() const
Definition: blobs.cpp:482
#define BAD_PRIORITY
Definition: findseam.cpp:51
Data * extract_data()
Definition: kdpair.h:131
EDGEPT * point1
Definition: split.h:103
void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
Definition: findseam.cpp:197
SEAM * tesseract::Wordrec::chop_numbered_blob ( TWERD *  word,
inT32  blob_number,
bool  italic_blob,
const GenericVector< SEAM * > &  seams 
)

Definition at line 224 of file chopper.cpp.

226  {
227  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
228  italic_blob, seams);
229 }
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:170
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
SEAM * tesseract::Wordrec::chop_one_blob ( const GenericVector< TBOX > &  boxes,
const GenericVector< BLOB_CHOICE * > &  blob_choices,
WERD_RES *  word_res,
int *  blob_number 
)

Definition at line 374 of file chopper.cpp.

377  {
378  if (prioritize_division) {
379  return chop_overlapping_blob(boxes, true, word_res, blob_number);
380  } else {
381  return improve_one_blob(blob_choices, NULL, false, true, word_res,
382  blob_number);
383  }
384 }
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:330
bool prioritize_division
Definition: classify.h:387
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:232
#define NULL
Definition: host.h:144
SEAM * tesseract::Wordrec::chop_overlapping_blob ( const GenericVector< TBOX > &  boxes,
bool  italic_blob,
WERD_RES *  word_res,
int *  blob_number 
)

Definition at line 232 of file chopper.cpp.

234  {
235  TWERD *word = word_res->chopped_word;
236  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
237  TBLOB *blob = word->blobs[*blob_number];
238  TPOINT topleft, botright;
239  topleft.x = blob->bounding_box().left();
240  topleft.y = blob->bounding_box().top();
241  botright.x = blob->bounding_box().right();
242  botright.y = blob->bounding_box().bottom();
243 
244  TPOINT original_topleft, original_botright;
245  word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
246  word_res->denorm.DenormTransform(NULL, botright, &original_botright);
247 
248  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
249  original_botright.x, original_topleft.y);
250 
251  bool almost_equal_box = false;
252  int num_overlap = 0;
253  for (int i = 0; i < boxes.size(); i++) {
254  if (original_box.overlap_fraction(boxes[i]) > 0.125)
255  num_overlap++;
256  if (original_box.almost_equal(boxes[i], 3))
257  almost_equal_box = true;
258  }
259 
260  TPOINT location;
261  if (divisible_blob(blob, italic_blob, &location) ||
262  (!almost_equal_box && num_overlap > 1)) {
263  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
264  italic_blob, word_res->seam_array);
265  if (seam != NULL)
266  return seam;
267  }
268  }
269 
270  *blob_number = -1;
271  return NULL;
272 }
Definition: blobs.h:261
int size() const
Definition: genericvector.h:72
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:934
TWERD * chopped_word
Definition: pageres.h:201
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
inT16 y
Definition: blobs.h:72
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:389
inT16 right() const
Definition: rect.h:75
int NumBlobs() const
Definition: blobs.h:425
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:50
DENORM denorm
Definition: pageres.h:190
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
inT16 x
Definition: blobs.h:71
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:170
inT16 bottom() const
Definition: rect.h:61
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: rect.h:30
#define NULL
Definition: host.h:144
Definition: blobs.h:395
Definition: seam.h:44
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
void tesseract::Wordrec::chop_word_main ( WERD_RES * word)

Definition at line 394 of file chopper.cpp.

394  {
395  int num_blobs = word->chopped_word->NumBlobs();
396  if (word->ratings == NULL) {
397  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
398  }
399  if (word->ratings->get(0, 0) == NULL) {
400  // Run initial classification.
401  for (int b = 0; b < num_blobs; ++b) {
402  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
403  "Initial:", word->chopped_word,
404  word->blamer_bundle);
405  word->ratings->put(b, b, choices);
406  }
407  } else {
408  // Blobs have been pre-classified. Set matrix cell for all blob choices
409  for (int col = 0; col < word->ratings->dimension(); ++col) {
410  for (int row = col; row < word->ratings->dimension() &&
411  row < col + word->ratings->bandwidth(); ++row) {
412  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
413  if (choices != NULL) {
414  BLOB_CHOICE_IT bc_it(choices);
415  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
416  bc_it.data()->set_matrix_cell(col, row);
417  }
418  }
419  }
420  }
421  }
422 
423  // Run Segmentation Search.
424  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
425  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
426 
427  if (word->best_choice == NULL) {
428  // SegSearch found no valid paths, so just use the leading diagonal.
429  word->FakeWordFromRatings();
430  }
431  word->RebuildBestState();
432  // If we finished without a hyphen at the end of the word, let the next word
433  // be found in the dictionary.
434  if (word->word->flag(W_EOL) &&
435  !getDict().has_hyphen_end(*word->best_choice)) {
436  getDict().reset_hyphen_vars(true);
437  }
438 
439  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
440  CallFillLattice(*word->ratings, word->best_choices,
441  *word->uch_set, word->blamer_bundle);
442  }
443  if (wordrec_debug_level > 0) {
444  tprintf("Final Ratings Matrix:\n");
445  word->ratings->print(getDict().getUnicharset());
446  }
447  word->FilterWordChoices(getDict().stopper_debug_level);
448 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
void RebuildBestState()
Definition: pageres.cpp:800
MATRIX * ratings
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:219
T get(int column, int row) const
Definition: matrix.h:171
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
void put(int column, int row, const T &thing)
Definition: matrix.h:166
int dimension() const
Definition: matrix.h:247
int wordrec_max_join_chunks
Definition: wordrec.h:164
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37
int NumBlobs() const
Definition: blobs.h:425
Definition: werd.h:36
const UNICHARSET * uch_set
Definition: pageres.h:192
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:504
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:57
Dict & getDict()
Definition: classify.h:65
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:195
WERD * word
Definition: pageres.h:175
int bandwidth() const
Definition: matrix.h:249
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
int wordrec_debug_level
Definition: wordrec.h:162
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: matrix.h:289
#define NULL
Definition: host.h:144
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
void FakeWordFromRatings()
Definition: pageres.cpp:892
BlamerBundle * blamer_bundle
Definition: pageres.h:230
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob ( TBLOB *  blob,
const char *  string,
C_COL  color,
BlamerBundle *  blamer_bundle 
)

Definition at line 56 of file wordclass.cpp.

58  {
59 #ifndef GRAPHICS_DISABLED
61  display_blob(blob, color);
62 #endif
63  // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.
64  BLOB_CHOICE_LIST* choices = call_matcher(blob);
65  // If a blob with the same bounding box as one of the truth character
66  // bounding boxes is not classified as the corresponding truth character
67  // blame character classifier for incorrect answer.
68  if (blamer_bundle != NULL) {
69  blamer_bundle->BlameClassifier(getDict().getUnicharset(),
70  blob->bounding_box(),
71  *choices,
73  }
74  #ifndef GRAPHICS_DISABLED
75  if (classify_debug_level && string)
76  print_ratings_list(string, choices, getDict().getUnicharset());
77 
80 #endif
81 
82  return choices;
83 }
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:257
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
bool wordrec_debug_blamer
Definition: wordrec.h:167
bool wordrec_display_all_blobs
Definition: render.cpp:49
bool wordrec_blob_pause
Definition: render.cpp:53
Dict & getDict()
Definition: classify.h:65
char window_wait(ScrollView *win)
Definition: callcpp.cpp:111
ScrollView * blob_window
Definition: render.cpp:43
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:64
BLOB_CHOICE_LIST * call_matcher(TBLOB *blob)
Definition: tface.cpp:134
#define NULL
Definition: host.h:144
TBOX bounding_box() const
Definition: blobs.cpp:482
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_piece ( const GenericVector< SEAM * > &  seams,
inT16  start,
inT16  end,
const char *  description,
TWERD *  word,
BlamerBundle *  blamer_bundle 
)
virtual

Definition at line 57 of file pieces.cpp.

62  {
63  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
64  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
65  White, blamer_bundle);
66  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
67  BLOB_CHOICE_IT bc_it(choices);
68  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
69  bc_it.data()->set_matrix_cell(start, end);
70  }
71 
72  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
73 
74  return (choices);
75 }
Definition: callcpp.h:34
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:216
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:194
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:56
void tesseract::Wordrec::combine_seam ( const SeamPile &  seam_pile,
const SEAM *  seam,
SeamQueue *  seam_queue 
)

Definition at line 197 of file findseam.cpp.

198  {
199  for (int x = 0; x < seam_pile.size(); ++x) {
200  const SEAM *this_one = seam_pile.get(x).data();
201  if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) {
202  SEAM *new_one = new SEAM(*seam);
203  new_one->CombineWith(*this_one);
204  if (chop_debug > 1) new_one->Print("Combo priority ");
205  add_seam_to_queue(new_one->priority(), new_one, seam_queue);
206  }
207  }
208 }
double chop_ok_split
Definition: wordrec.h:156
void Print(const char *label) const
Definition: seam.cpp:160
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:64
bool CombineableWith(const SEAM &other, int max_x_dist, float max_total_priority) const
Definition: seam.cpp:46
const Pair & get(int index) const
Definition: genericheap.h:87
void CombineWith(const SEAM &other)
Definition: seam.cpp:60
#define SPLIT_CLOSENESS
Definition: findseam.cpp:44
float priority() const
Definition: seam.h:65
Definition: seam.h:44
int tesseract::Wordrec::dict_word ( const WERD_CHOICE & word)

Definition at line 124 of file tface.cpp.

124  {
125  return getDict().valid_word(word);
126 }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
Dict & getDict()
Definition: classify.h:65
void tesseract::Wordrec::DoSegSearch ( WERD_RES * word_res)

Definition at line 31 of file segsearch.cpp.

31  {
32  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
33  // Run Segmentation Search.
34  SegSearch(word_res, &best_choice_bundle, NULL);
35 }
MATRIX * ratings
Definition: pageres.h:215
int dimension() const
Definition: matrix.h:247
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37
#define NULL
Definition: host.h:144
int tesseract::Wordrec::end_recog ( )

Definition at line 61 of file tface.cpp.

61  {
62  program_editdown (0);
63 
64  return (0);
65 }
void program_editdown(inT32 elasped_time)
Definition: tface.cpp:74
void tesseract::Wordrec::fill_filtered_fragment_list ( BLOB_CHOICE_LIST *  choices,
int  fragment_pos,
int  num_frag_parts,
BLOB_CHOICE_LIST *  filtered_choices 
)

Definition at line 106 of file pieces.cpp.

109  {
110  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
111  BLOB_CHOICE_IT choices_it(choices);
112 
113  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
114  choices_it.forward()) {
115  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
116  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
117 
118  if (frag != NULL && frag->get_pos() == fragment_pos &&
119  frag->get_total() == num_frag_parts) {
120  // Recover the unichar_id of the unichar that this fragment is
121  // a part of
122  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
123  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
124  b->set_unichar_id(original_unichar);
125  filtered_choices_it.add_to_end(b);
126  }
127  }
128 
129  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
130 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
UNICHARSET unicharset
Definition: ccutil.h:72
int get_total() const
Definition: unicharset.h:66
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int UNICHAR_ID
Definition: unichar.h:33
const char * get_unichar() const
Definition: unicharset.h:64
#define NULL
Definition: host.h:144
int get_pos() const
Definition: unicharset.h:65
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
void tesseract::Wordrec::FillLattice ( const MATRIX &  ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET &  unicharset,
BlamerBundle *  blamer_bundle 
)
void tesseract::Wordrec::get_fragment_lists ( inT16  current_frag,
inT16  current_row,
inT16  start,
inT16  num_frag_parts,
inT16  num_blobs,
MATRIX *  ratings,
BLOB_CHOICE_LIST *  choice_lists 
)

Definition at line 283 of file pieces.cpp.

286  {
287  if (current_frag == num_frag_parts) {
288  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
289  choice_lists, ratings);
290  return;
291  }
292 
293  for (inT16 x = current_row; x < num_blobs; x++) {
294  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
295  if (choices == NULL)
296  continue;
297 
298  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
299  &choice_lists[current_frag]);
300  if (!choice_lists[current_frag].empty()) {
301  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
302  num_blobs, ratings, choice_lists);
303  choice_lists[current_frag].clear();
304  }
305  }
306 }
T get(int column, int row) const
Definition: matrix.h:171
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:106
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:283
void merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:139
#define NULL
Definition: host.h:144
short inT16
Definition: host.h:100
PRIORITY tesseract::Wordrec::grade_sharpness ( register SPLIT * split)

Definition at line 74 of file gradechop.cpp.

74  {
75  register PRIORITY grade;
76 
77  grade = point_priority (split->point1) + point_priority (split->point2);
78 
79  if (grade < -360.0)
80  grade = 0;
81  else
82  grade += 360.0;
83 
84  grade *= chop_sharpness_knob; /* Values 0 to -360 */
85 
86  return (grade);
87 }
double chop_sharpness_knob
Definition: wordrec.h:154
float PRIORITY
Definition: seam.h:42
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
PRIORITY tesseract::Wordrec::grade_split_length ( register SPLIT * split)

Definition at line 51 of file gradechop.cpp.

51  {
52  register PRIORITY grade;
53  register float split_length;
54 
55  split_length =
56  split->point1->WeightedDistance(*split->point2, chop_x_y_weight);
57 
58  if (split_length <= 0)
59  grade = 0;
60  else
61  grade = sqrt (split_length) * chop_split_dist_knob;
62 
63  return (MAX (0.0, grade));
64 }
#define MAX(x, y)
Definition: ndminx.h:24
float PRIORITY
Definition: seam.h:42
double chop_split_dist_knob
Definition: wordrec.h:149
void tesseract::Wordrec::improve_by_chopping ( float  rating_cert_scale,
WERD_RES *  word,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle,
LMPainPoints *  pain_points,
GenericVector< SegSearchPending > *  pending 
)

Definition at line 457 of file chopper.cpp.

462  {
463  int blob_number;
464  do { // improvement loop.
465  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
466  // one to chop.
467  GenericVector<BLOB_CHOICE*> blob_choices;
468  int num_blobs = word->ratings->dimension();
469  for (int i = 0; i < num_blobs; ++i) {
470  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
471  if (choices == NULL || choices->empty()) {
472  blob_choices.push_back(NULL);
473  } else {
474  BLOB_CHOICE_IT bc_it(choices);
475  blob_choices.push_back(bc_it.data());
476  }
477  }
478  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
479  false, false, word, &blob_number);
480  if (seam == NULL) break;
481  // A chop has been made. We have to correct all the data structures to
482  // take into account the extra bottom-level blob.
483  // Put the seam into the seam_array and correct everything else on the
484  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
485  // states in WERD_CHOICEs, and blob widths.
486  word->InsertSeam(blob_number, seam);
487  // Insert a new entry in the beam array.
488  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
489  // Fixpts are outdated, but will get recalculated.
490  best_choice_bundle->fixpt.clear();
491  // Remap existing pain points.
492  pain_points->RemapForSplit(blob_number);
493  // Insert a new pending at the chop point.
494  pending->insert(SegSearchPending(), blob_number);
495 
496  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
497  // as that updates the pending correctly and adds new pain points.
498  MATRIX_COORD pain_point(blob_number, blob_number);
499  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
500  pain_points, blamer_bundle);
501  pain_point.col = blob_number + 1;
502  pain_point.row = blob_number + 1;
503  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
504  pain_points, blamer_bundle);
506  // N-gram evaluation depends on the number of blobs in a chunk, so we
507  // have to re-evaluate everything in the word.
508  ResetNGramSearch(word, best_choice_bundle, pending);
509  blob_number = 0;
510  }
511  // Run language model incrementally. (Except with the n-gram model on.)
512  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
513  word, pain_points, best_choice_bundle, blamer_bundle);
514  } while (!language_model_->AcceptableChoiceFound() &&
515  word->ratings->dimension() < kMaxNumChunks);
516 
517  // If after running only the chopper best_choice is incorrect and no blame
518  // has been yet set, blame the classifier if best_choice is classifier's
519  // top choice and is a dictionary word (i.e. language model could not have
520  // helped). Otherwise blame the tradeoff between the classifier and
521  // the old language model (permuters).
522  if (word->blamer_bundle != NULL &&
524  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
525  bool valid_permuter = word->best_choice != NULL &&
528  getDict().getUnicharset(),
529  valid_permuter,
531  }
532 }
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
MATRIX * ratings
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:219
int push_back(T object)
T get(int column, int row) const
Definition: matrix.h:171
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:330
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
int dimension() const
Definition: matrix.h:247
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:369
bool wordrec_debug_blamer
Definition: wordrec.h:167
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
void insert(T t, int index)
LanguageModel * language_model_
Definition: wordrec.h:411
uinT8 permuter() const
Definition: ratngs.h:343
Dict & getDict()
Definition: classify.h:65
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:325
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
#define NULL
Definition: host.h:144
Definition: seam.h:44
BlamerBundle * blamer_bundle
Definition: pageres.h:230
SEAM * tesseract::Wordrec::improve_one_blob ( const GenericVector< BLOB_CHOICE * > &  blob_choices,
DANGERR *  fixpt,
bool  split_next_to_fragment,
bool  italic_blob,
WERD_RES *  word,
int *  blob_number 
)

Definition at line 330 of file chopper.cpp.

335  {
336  float rating_ceiling = MAX_FLOAT32;
337  SEAM *seam = NULL;
338  do {
339  *blob_number = select_blob_to_split_from_fixpt(fixpt);
340  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
341  bool split_point_from_dict = (*blob_number != -1);
342  if (split_point_from_dict) {
343  fixpt->clear();
344  } else {
345  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
346  split_next_to_fragment);
347  }
348  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
349  if (*blob_number == -1)
350  return NULL;
351 
352  // TODO(rays) it may eventually help to allow italic_blob to be true.
353  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
354  word->seam_array);
355  if (seam != NULL)
356  return seam; // Success!
357  if (blob_choices[*blob_number] == NULL)
358  return NULL;
359  if (!split_point_from_dict) {
360  // We chopped the worst rated blob, try something else next time.
361  rating_ceiling = blob_choices[*blob_number]->rating();
362  }
363  } while (true);
364  return seam;
365 }
int select_blob_to_split(const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:541
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:224
#define MAX_FLOAT32
Definition: host.h:124
#define NULL
Definition: host.h:144
Definition: seam.h:44
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:633
void tesseract::Wordrec::InitBlamerForSegSearch ( WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle,
STRING *  blamer_debug 
)
protected

Definition at line 342 of file segsearch.cpp.

345  {
346  pain_points->Clear(); // Clear pain points heap.
348  pain_points, &LMPainPoints::GenerateForBlamer,
349  static_cast<double>(segsearch_max_char_wh_ratio), word_res);
350  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
351  getDict().WildcardID(), wordrec_debug_blamer,
352  blamer_debug, pp_cb);
353  delete pp_cb;
354 }
MATRIX * ratings
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
bool wordrec_debug_blamer
Definition: wordrec.h:167
Dict & getDict()
Definition: classify.h:65
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:473
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
void tesseract::Wordrec::InitialSegSearch ( WERD_RES *  word_res,
LMPainPoints *  pain_points,
GenericVector< SegSearchPending > *  pending,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 150 of file segsearch.cpp.

153  {
154  if (segsearch_debug_level > 0) {
155  tprintf("Starting SegSearch on ratings matrix%s:\n",
156  wordrec_enable_assoc ? " (with assoc)" : "");
157  word_res->ratings->print(getDict().getUnicharset());
158  }
159 
160  pain_points->GenerateInitial(word_res);
161 
162  // Compute scaling factor that will help us recover blob outline length
163  // from classifier rating and certainty for the blob.
164  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
165 
168  segsearch_max_char_wh_ratio, rating_cert_scale);
169 
170  // Initialize blamer-related information: map character boxes recorded in
171  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
172  // ratings matrix. We expect this step to succeed, since when running the
173  // chopper we checked that the correct chops are present.
174  if (blamer_bundle != NULL) {
175  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
177  }
178 
179  // pending[col] tells whether there is update work to do to combine
180  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
181  // As the language model state is updated, pending entries are modified to
182  // minimize duplication of work. It is important that during the update the
183  // children are considered in the non-decreasing order of their column, since
184  // this guarantees that all the parents would be up to date before an update
185  // of a child is done.
186  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
187 
188  // Search the ratings matrix for the initial best path.
189  (*pending)[0].SetColumnClassified();
190  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
191  pain_points, best_choice_bundle, blamer_bundle);
192 }
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
MATRIX * ratings
Definition: pageres.h:215
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
int dimension() const
Definition: matrix.h:247
bool wordrec_enable_assoc
Definition: wordrec.h:130
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:407
bool wordrec_debug_blamer
Definition: wordrec.h:167
LanguageModel * language_model_
Definition: wordrec.h:411
void init_to_size(int size, T t)
Dict & getDict()
Definition: classify.h:65
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
int segsearch_debug_level
Definition: wordrec.h:169
#define NULL
Definition: host.h:144
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
double certainty_scale
Definition: dict.h:601
bool tesseract::Wordrec::is_inside_angle ( EDGEPT *  pt)

Definition at line 78 of file chop.cpp.

78  {
79  return angle_change(pt->prev, pt, pt->next) < chop_inside_angle;
80 }
EDGEPT * prev
Definition: blobs.h:170
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:88
EDGEPT * next
Definition: blobs.h:169
void tesseract::Wordrec::merge_and_put_fragment_lists ( inT16  row,
inT16  column,
inT16  num_frag_parts,
BLOB_CHOICE_LIST *  choice_lists,
MATRIX *  ratings 
)

Definition at line 139 of file pieces.cpp.

142  {
143  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
144 
145  for (int i = 0; i < num_frag_parts; i++) {
146  choice_lists_it[i].set_to_list(&choice_lists[i]);
147  choice_lists_it[i].mark_cycle_pt();
148  }
149 
150  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
151  if (merged_choice == NULL)
152  merged_choice = new BLOB_CHOICE_LIST;
153 
154  bool end_of_list = false;
155  BLOB_CHOICE_IT merged_choice_it(merged_choice);
156  while (!end_of_list) {
157  // Find the maximum unichar_id among the entries the iterators
158  // are currently pointing at
159  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
160  for (int i = 0; i < num_frag_parts; i++) {
161  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
162  if (max_unichar_id < unichar_id) {
163  max_unichar_id = unichar_id;
164  }
165  }
166 
167  // Move each iterator forward until it reaches an entry whose
168  // unichar_id is greater than or equal to max_unichar_id
169  for (int i = 0; i < num_frag_parts; i++) {
170  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
171  while (!choice_lists_it[i].cycled_list() &&
172  unichar_id < max_unichar_id) {
173  choice_lists_it[i].forward();
174  unichar_id = choice_lists_it[i].data()->unichar_id();
175  }
176  if (choice_lists_it[i].cycled_list()) {
177  end_of_list = true;
178  break;
179  }
180  }
181 
182  if (end_of_list)
183  break;
184 
185  // Checks if the fragments are parts of the same character
186  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
187  bool same_unichar = true;
188  for (int i = 1; i < num_frag_parts; i++) {
189  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
190  if (unichar_id != first_unichar_id) {
191  same_unichar = false;
192  break;
193  }
194  }
195 
196  if (same_unichar) {
197  // Add the merged character to the result
198  UNICHAR_ID merged_unichar_id = first_unichar_id;
199  GenericVector<ScoredFont> merged_fonts =
200  choice_lists_it[0].data()->fonts();
201  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
202  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
203  float positive_yshift = 0, negative_yshift = 0;
204  int merged_script_id = choice_lists_it[0].data()->script_id();
205  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
206 
207  float merged_rating = 0, merged_certainty = 0;
208  for (int i = 0; i < num_frag_parts; i++) {
209  float rating = choice_lists_it[i].data()->rating();
210  float certainty = choice_lists_it[i].data()->certainty();
211 
212  if (i == 0 || certainty < merged_certainty)
213  merged_certainty = certainty;
214  merged_rating += rating;
215 
216  choice_lists_it[i].forward();
217  if (choice_lists_it[i].cycled_list())
218  end_of_list = true;
219  IntersectRange(choice_lists_it[i].data()->min_xheight(),
220  choice_lists_it[i].data()->max_xheight(),
221  &merged_min_xheight, &merged_max_xheight);
222  float yshift = choice_lists_it[i].data()->yshift();
223  if (yshift > positive_yshift) positive_yshift = yshift;
224  if (yshift < negative_yshift) negative_yshift = yshift;
225  // Use the min font rating over the parts.
226  // TODO(rays) font lists are unsorted. Need to be faster?
227  const GenericVector<ScoredFont>& frag_fonts =
228  choice_lists_it[i].data()->fonts();
229  for (int f = 0; f < frag_fonts.size(); ++f) {
230  int merged_f = 0;
231  for (merged_f = 0; merged_f < merged_fonts.size() &&
232  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
233  ++merged_f) {}
234  if (merged_f == merged_fonts.size()) {
235  merged_fonts.push_back(frag_fonts[f]);
236  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
237  merged_fonts[merged_f].score = frag_fonts[f].score;
238  }
239  }
240  }
241 
242  float merged_yshift = positive_yshift != 0
243  ? (negative_yshift != 0 ? 0 : positive_yshift)
244  : negative_yshift;
245  BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id,
246  merged_rating,
247  merged_certainty,
248  merged_script_id,
249  merged_min_xheight,
250  merged_max_xheight,
251  merged_yshift,
252  classifier);
253  choice->set_fonts(merged_fonts);
254  merged_choice_it.add_to_end(choice);
255  }
256  }
257 
259  print_ratings_list("Merged Fragments", merged_choice,
260  unicharset);
261 
262  if (merged_choice->empty())
263  delete merged_choice;
264  else
265  ratings->put(row, column, merged_choice);
266 
267  delete [] choice_lists_it;
268 }
int size() const
Definition: genericvector.h:72
int push_back(T object)
T get(int column, int row) const
Definition: matrix.h:171
UNICHARSET unicharset
Definition: ccutil.h:72
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:146
void put(int column, int row, const T &thing)
Definition: matrix.h:166
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
int UNICHAR_ID
Definition: unichar.h:33
BlobChoiceClassifier
Definition: ratngs.h:40
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:94
#define NULL
Definition: host.h:144
void tesseract::Wordrec::merge_fragments ( MATRIX *  ratings,
inT16  num_blobs 
)

Definition at line 315 of file pieces.cpp.

315  {
316  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
317  for (inT16 start = 0; start < num_blobs; start++) {
318  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
319  frag_parts++) {
320  get_fragment_lists(0, start, start, frag_parts, num_blobs,
321  ratings, choice_lists);
322  }
323  }
324 
325  // Delete fragments from the rating matrix
326  for (inT16 x = 0; x < num_blobs; x++) {
327  for (inT16 y = x; y < num_blobs; y++) {
328  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
329  if (choices != NULL) {
330  BLOB_CHOICE_IT choices_it(choices);
331  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
332  choices_it.forward()) {
333  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
334  const CHAR_FRAGMENT *frag =
335  unicharset.get_fragment(choice_unichar_id);
336  if (frag != NULL)
337  delete choices_it.extract();
338  }
339  }
340  }
341  }
342 }
T get(int column, int row) const
Definition: matrix.h:171
UNICHARSET unicharset
Definition: ccutil.h:72
static const int kMaxChunks
Definition: unicharset.h:49
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int UNICHAR_ID
Definition: unichar.h:33
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:283
#define NULL
Definition: host.h:144
short inT16
Definition: host.h:100
bool tesseract::Wordrec::near_point ( EDGEPT *  point,
EDGEPT *  line_pt_0,
EDGEPT *  line_pt_1,
EDGEPT **  near_pt 
)

Definition at line 49 of file outlines.cpp.

51  {
52  TPOINT p;
53 
54  float slope;
55  float intercept;
56 
57  float x0 = line_pt_0->pos.x;
58  float x1 = line_pt_1->pos.x;
59  float y0 = line_pt_0->pos.y;
60  float y1 = line_pt_1->pos.y;
61 
62  if (x0 == x1) {
63  /* Handle vertical line */
64  p.x = (inT16) x0;
65  p.y = point->pos.y;
66  }
67  else {
68  /* Slope and intercept */
69  slope = (y0 - y1) / (x0 - x1);
70  intercept = y1 - x1 * slope;
71 
72  /* Find perpendicular */
73  p.x = (inT16) ((point->pos.x + (point->pos.y - intercept) * slope) /
74  (slope * slope + 1));
75  p.y = (inT16) (slope * p.x + intercept);
76  }
77 
78  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
79  (!same_point (p, line_pt_0->pos)) && (!same_point (p, line_pt_1->pos))) {
80  /* Intersection on line */
81  *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
82  return true;
83  } else { /* Intersection not on line */
84  *near_pt = closest(point, line_pt_0, line_pt_1);
85  return false;
86  }
87 }
inT16 y
Definition: blobs.h:72
#define is_on_line(p, p0, p1)
Definition: outlines.h:120
#define closest(test_p, p1, p2)
Definition: outlines.h:71
#define same_point(p1, p2)
Definition: outlines.h:49
Definition: blobs.h:50
inT16 x
Definition: blobs.h:71
EDGEPT * make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev)
Definition: split.cpp:142
TPOINT pos
Definition: blobs.h:163
short inT16
Definition: host.h:100
void tesseract::Wordrec::new_max_point ( EDGEPT *  local_max,
PointHeap *  points 
)

Definition at line 245 of file chop.cpp.

245  {
246  inT16 dir;
247 
248  dir = direction (local_max);
249 
250  if (dir > 0) {
251  add_point_to_list(points, local_max);
252  return;
253  }
254 
255  if (dir == 0 && point_priority (local_max) < 0) {
256  add_point_to_list(points, local_max);
257  return;
258  }
259 }
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:64
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
short inT16
Definition: host.h:100
void tesseract::Wordrec::new_min_point ( EDGEPT *  local_min,
PointHeap *  points 
)

Definition at line 221 of file chop.cpp.

221  {
222  inT16 dir;
223 
224  dir = direction (local_min);
225 
226  if (dir < 0) {
227  add_point_to_list(points, local_min);
228  return;
229  }
230 
231  if (dir == 0 && point_priority (local_min) < 0) {
232  add_point_to_list(points, local_min);
233  return;
234  }
235 }
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:64
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
short inT16
Definition: host.h:100
EDGEPT * tesseract::Wordrec::pick_close_point ( EDGEPT *  critical_point,
EDGEPT *  vertical_point,
int *  best_dist 
)

Definition at line 124 of file chop.cpp.

126  {
127  EDGEPT *best_point = NULL;
128  int this_distance;
129  int found_better;
130 
131  do {
132  found_better = FALSE;
133 
134  this_distance = edgept_dist (critical_point, vertical_point);
135  if (this_distance <= *best_dist) {
136 
137  if (!(same_point (critical_point->pos, vertical_point->pos) ||
138  same_point (critical_point->pos, vertical_point->next->pos) ||
139  (best_point && same_point (best_point->pos, vertical_point->pos)) ||
140  is_exterior_point (critical_point, vertical_point))) {
141  *best_dist = this_distance;
142  best_point = vertical_point;
144  found_better = TRUE;
145  }
146  }
147  vertical_point = vertical_point->next;
148  }
149  while (found_better == TRUE);
150 
151  return (best_point);
152 }
#define is_exterior_point(edge, point)
Definition: outlines.h:97
#define edgept_dist(p1, p2)
Definition: outlines.h:87
#define same_point(p1, p2)
Definition: outlines.h:49
EDGEPT * next
Definition: blobs.h:169
bool chop_vertical_creep
Definition: wordrec.h:141
#define FALSE
Definition: capi.h:29
TPOINT pos
Definition: blobs.h:163
Definition: blobs.h:76
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
SEAM * tesseract::Wordrec::pick_good_seam ( TBLOB *  blob)

Definition at line 216 of file findseam.cpp.

216  {
217  SeamPile seam_pile(chop_seam_pile_size);
218  EDGEPT *points[MAX_NUM_POINTS];
219  EDGEPT_CLIST new_points;
220  SEAM *seam = NULL;
221  TESSLINE *outline;
222  inT16 num_points = 0;
223 
224 #ifndef GRAPHICS_DISABLED
225  if (chop_debug > 2)
226  wordrec_display_splits.set_value(true);
227 
228  draw_blob_edges(blob);
229 #endif
230 
231  PointHeap point_heap(MAX_NUM_POINTS);
232  for (outline = blob->outlines; outline; outline = outline->next)
233  prioritize_points(outline, &point_heap);
234 
235  while (!point_heap.empty() && num_points < MAX_NUM_POINTS) {
236  points[num_points++] = point_heap.PeekTop().data;
237  point_heap.Pop(NULL);
238  }
239 
240  /* Initialize queue */
241  SeamQueue seam_queue(MAX_NUM_SEAMS);
242 
243  try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob);
244  try_vertical_splits(points, num_points, &new_points,
245  &seam_queue, &seam_pile, &seam, blob);
246 
247  if (seam == NULL) {
248  choose_best_seam(&seam_queue, NULL, BAD_PRIORITY, &seam, blob, &seam_pile);
249  } else if (seam->priority() > chop_good_split) {
250  choose_best_seam(&seam_queue, NULL, seam->priority(), &seam, blob,
251  &seam_pile);
252  }
253 
254  EDGEPT_C_IT it(&new_points);
255  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
256  EDGEPT *inserted_point = it.data();
257  if (seam == NULL || !seam->UsesPoint(inserted_point)) {
258  for (outline = blob->outlines; outline; outline = outline->next) {
259  if (outline->loop == inserted_point) {
260  outline->loop = outline->loop->next;
261  }
262  }
263  remove_edgept(inserted_point);
264  }
265  }
266 
267  if (seam) {
268  if (seam->priority() > chop_ok_split) {
269  delete seam;
270  seam = NULL;
271  }
272 #ifndef GRAPHICS_DISABLED
273  else if (wordrec_display_splits) {
274  seam->Mark(edge_window);
275  if (chop_debug > 2) {
278  }
279  }
280 #endif
281  }
282 
283  if (chop_debug)
284  wordrec_display_splits.set_value(false);
285 
286  return (seam);
287 }
#define MAX_NUM_POINTS
Definition: chop.h:39
bool wordrec_display_splits
Definition: split.cpp:44
double chop_ok_split
Definition: wordrec.h:156
void remove_edgept(EDGEPT *point)
Definition: split.cpp:203
void Mark(ScrollView *window) const
Definition: seam.cpp:186
TESSLINE * next
Definition: blobs.h:258
double chop_good_split
Definition: wordrec.h:157
int chop_seam_pile_size
Definition: wordrec.h:145
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:103
#define update_edge_window()
Definition: plotedges.h:45
void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:297
#define MAX_NUM_SEAMS
Definition: findseam.cpp:46
EDGEPT * next
Definition: blobs.h:169
float priority() const
Definition: seam.h:65
bool UsesPoint(const EDGEPT *point) const
Definition: seam.h:88
void draw_blob_edges(TBLOB *blob)
Definition: plotedges.cpp:77
void prioritize_points(TESSLINE *outline, PointHeap *points)
Definition: chop.cpp:162
Definition: blobs.h:76
void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:335
#define NULL
Definition: host.h:144
ScrollView * edge_window
Definition: plotedges.cpp:43
Definition: seam.h:44
TESSLINE * outlines
Definition: blobs.h:377
#define BAD_PRIORITY
Definition: findseam.cpp:51
EDGEPT * loop
Definition: blobs.h:257
#define edge_window_wait()
Definition: plotedges.h:57
short inT16
Definition: host.h:100
PRIORITY tesseract::Wordrec::point_priority ( EDGEPT *  point)

Definition at line 54 of file chop.cpp.

54  {
55  return (PRIORITY)angle_change(point->prev, point, point->next);
56 }
EDGEPT * prev
Definition: blobs.h:170
float PRIORITY
Definition: seam.h:42
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:88
EDGEPT * next
Definition: blobs.h:169
void tesseract::Wordrec::prioritize_points ( TESSLINE *  outline,
PointHeap *  points 
)

Definition at line 162 of file chop.cpp.

162  {
163  EDGEPT *this_point;
164  EDGEPT *local_min = NULL;
165  EDGEPT *local_max = NULL;
166 
167  this_point = outline->loop;
168  local_min = this_point;
169  local_max = this_point;
170  do {
171  if (this_point->vec.y < 0) {
172  /* Look for minima */
173  if (local_max != NULL)
174  new_max_point(local_max, points);
175  else if (is_inside_angle (this_point))
176  add_point_to_list(points, this_point);
177  local_max = NULL;
178  local_min = this_point->next;
179  }
180  else if (this_point->vec.y > 0) {
181  /* Look for maxima */
182  if (local_min != NULL)
183  new_min_point(local_min, points);
184  else if (is_inside_angle (this_point))
185  add_point_to_list(points, this_point);
186  local_min = NULL;
187  local_max = this_point->next;
188  }
189  else {
190  /* Flat area */
191  if (local_max != NULL) {
192  if (local_max->prev->vec.y != 0) {
193  new_max_point(local_max, points);
194  }
195  local_max = this_point->next;
196  local_min = NULL;
197  }
198  else {
199  if (local_min->prev->vec.y != 0) {
200  new_min_point(local_min, points);
201  }
202  local_min = this_point->next;
203  local_max = NULL;
204  }
205  }
206 
207  /* Next point */
208  this_point = this_point->next;
209  }
210  while (this_point != outline->loop);
211 }
EDGEPT * prev
Definition: blobs.h:170
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:64
void new_max_point(EDGEPT *local_max, PointHeap *points)
Definition: chop.cpp:245
inT16 y
Definition: blobs.h:72
bool is_inside_angle(EDGEPT *pt)
Definition: chop.cpp:78
VECTOR vec
Definition: blobs.h:164
EDGEPT * next
Definition: blobs.h:169
void new_min_point(EDGEPT *local_min, PointHeap *points)
Definition: chop.cpp:221
Definition: blobs.h:76
#define NULL
Definition: host.h:144
EDGEPT * loop
Definition: blobs.h:257
void tesseract::Wordrec::ProcessSegSearchPainPoint ( float  pain_point_priority,
const MATRIX_COORD &  pain_point,
const char *  pain_point_type,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 262 of file segsearch.cpp.

266  {
267  if (segsearch_debug_level > 0) {
268  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
269  pain_point_type, pain_point_priority,
270  pain_point.col, pain_point.row);
271  }
272  ASSERT_HOST(pain_points != NULL);
273  MATRIX *ratings = word_res->ratings;
274  // Classify blob [pain_point.col pain_point.row]
275  if (!pain_point.Valid(*ratings)) {
276  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
277  }
278  ASSERT_HOST(pain_point.Valid(*ratings));
279  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
280  pain_point.col, pain_point.row,
281  pain_point_type,
282  word_res->chopped_word,
283  blamer_bundle);
284  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
285  if (lst == NULL) {
286  ratings->put(pain_point.col, pain_point.row, classified);
287  } else {
288  // We cannot delete old BLOB_CHOICEs, since they might contain
289  // ViterbiStateEntries that are parents of other "active" entries.
290  // Thus if the matrix cell already contains classifications we add
291  // the new ones to the beginning of the list.
292  BLOB_CHOICE_IT it(lst);
293  it.add_list_before(classified);
294  delete classified; // safe to delete, since empty after add_list_before()
295  classified = NULL;
296  }
297 
298  if (segsearch_debug_level > 0) {
299  print_ratings_list("Updated ratings matrix with a new entry:",
300  ratings->get(pain_point.col, pain_point.row),
301  getDict().getUnicharset());
302  ratings->print(getDict().getUnicharset());
303  }
304 
305  // Insert initial "pain points" to join the newly classified blob
306  // with its left and right neighbors.
307  if (classified != NULL && !classified->empty()) {
308  if (pain_point.col > 0) {
309  pain_points->GeneratePainPoint(
310  pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
311  true, segsearch_max_char_wh_ratio, word_res);
312  }
313  if (pain_point.row + 1 < ratings->dimension()) {
314  pain_points->GeneratePainPoint(
315  pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
316  true, segsearch_max_char_wh_ratio, word_res);
317  }
318  }
319  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
320 }
MATRIX * ratings
Definition: pageres.h:215
T get(int column, int row) const
Definition: matrix.h:171
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
void put(int column, int row, const T &thing)
Definition: matrix.h:166
int dimension() const
Definition: matrix.h:247
#define ASSERT_HOST(x)
Definition: errcode.h:84
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:57
bool Valid(const MATRIX &m) const
Definition: matrix.h:327
Dict & getDict()
Definition: classify.h:65
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
Definition: matrix.h:289
int segsearch_debug_level
Definition: wordrec.h:169
#define NULL
Definition: host.h:144
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
void tesseract::Wordrec::program_editdown ( inT32  elasped_time)

Definition at line 74 of file tface.cpp.

74  {
76  getDict().End();
77 }
void End()
Definition: dict.cpp:310
Dict & getDict()
Definition: classify.h:65
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:456
void tesseract::Wordrec::program_editup ( const char *  textbase,
bool  init_classifier,
bool  init_permute 
)

Definition at line 46 of file tface.cpp.

48  {
49  if (textbase != NULL) imagefile = textbase;
51  InitAdaptiveClassifier(init_classifier);
52  if (init_dict) getDict().Load(Dict::GlobalDawgCache());
54 }
double chop_ok_split
Definition: wordrec.h:156
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
STRING imagefile
Definition: ccutil.h:74
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186
PRIORITY pass2_ok_split
Definition: wordrec.h:412
Dict & getDict()
Definition: classify.h:65
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:507
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194
#define NULL
Definition: host.h:144
void InitAdaptiveClassifier(bool load_pre_trained_templates)
Definition: adaptmatch.cpp:527
void tesseract::Wordrec::ResetNGramSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
GenericVector< SegSearchPending > *  pending 
)
protected

Definition at line 325 of file segsearch.cpp.

327  {
328  // TODO(rays) More refactoring required here.
329  // Delete existing viterbi states.
330  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
331  best_choice_bundle->beam[col]->Clear();
332  }
333  // Reset best_choice_bundle.
334  word_res->ClearWordChoices();
335  best_choice_bundle->best_vse = NULL;
336  // Clear out all existing pendings and add a new one for the first column.
337  (*pending)[0].SetColumnClassified();
338  for (int i = 1; i < pending->size(); ++i)
339  (*pending)[i].Clear();
340 }
int size() const
Definition: genericvector.h:72
void ClearWordChoices()
Definition: pageres.cpp:1173
#define NULL
Definition: host.h:144
void tesseract::Wordrec::SaveAltChoices ( const LIST &  best_choices,
WERD_RES *  word 
)
void tesseract::Wordrec::SegSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 37 of file segsearch.cpp.

39  {
40  LMPainPoints pain_points(segsearch_max_pain_points,
44  // Compute scaling factor that will help us recover blob outline length
45  // from classifier rating and certainty for the blob.
46  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
48  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
49  blamer_bundle);
50 
51  if (!SegSearchDone(0)) { // find a better choice
52  if (chop_enable && word_res->chopped_word != NULL) {
53  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
54  blamer_bundle, &pain_points, &pending);
55  }
56  if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
57 
58  if (blamer_bundle != NULL &&
59  !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
60  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
61  }
62  }
63  // Keep trying to find a better path by fixing the "pain points".
64 
65  MATRIX_COORD pain_point;
66  float pain_point_priority;
67  int num_futile_classifications = 0;
68  STRING blamer_debug;
69  while (wordrec_enable_assoc &&
70  (!SegSearchDone(num_futile_classifications) ||
71  (blamer_bundle != NULL &&
72  blamer_bundle->GuidedSegsearchStillGoing()))) {
73  // Get the next valid "pain point".
74  bool found_nothing = true;
75  LMPainPointsType pp_type;
76  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
77  LM_PPTYPE_NUM) {
78  if (!pain_point.Valid(*word_res->ratings)) {
79  word_res->ratings->IncreaseBandSize(
80  pain_point.row - pain_point.col + 1);
81  }
82  if (pain_point.Valid(*word_res->ratings) &&
83  !word_res->ratings->Classified(pain_point.col, pain_point.row,
84  getDict().WildcardID())) {
85  found_nothing = false;
86  break;
87  }
88  }
89  if (found_nothing) {
90  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
91  break;
92  }
93  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
95  &pending, word_res, &pain_points, blamer_bundle);
96 
97  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
98  word_res, &pain_points, best_choice_bundle,
99  blamer_bundle);
100  if (!best_choice_bundle->updated) ++num_futile_classifications;
101 
102  if (segsearch_debug_level > 0) {
103  tprintf("num_futile_classifications %d\n", num_futile_classifications);
104  }
105 
106  best_choice_bundle->updated = false; // reset updated
107 
108  // See if it's time to terminate SegSearch or time for starting a guided
109  // search for the true path to find the blame for the incorrect best_choice.
110  if (SegSearchDone(num_futile_classifications) &&
111  blamer_bundle != NULL &&
112  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
113  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
114  &blamer_debug);
115  }
116  } // end while loop exploring alternative paths
117  if (blamer_bundle != NULL) {
118  blamer_bundle->FinishSegSearch(word_res->best_choice,
119  wordrec_debug_blamer, &blamer_debug);
120  }
121 
122  if (segsearch_debug_level > 0) {
123  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
125  }
126 }
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:342
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
static void PrintSeams(const char *label, const GenericVector< SEAM * > &seams)
Definition: seam.cpp:173
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:461
MATRIX * ratings
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * chopped_word
Definition: pageres.h:201
#define tprintf(...)
Definition: tprintf.h:31
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
bool wordrec_enable_assoc
Definition: wordrec.h:130
static const char * PainPointDescription(LMPainPointsType type)
bool wordrec_debug_blamer
Definition: wordrec.h:167
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
LanguageModel * language_model_
Definition: wordrec.h:411
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:506
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:150
bool Valid(const MATRIX &m) const
Definition: matrix.h:327
Dict & getDict()
Definition: classify.h:65
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
int segsearch_max_pain_points
Definition: wordrec.h:171
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:310
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:36
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:457
class STRING
Definition: strngs.h:44
int segsearch_debug_level
Definition: wordrec.h:169
#define NULL
Definition: host.h:144
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:426
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
double certainty_scale
Definition: dict.h:601
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
bool tesseract::Wordrec::SegSearchDone ( int  num_futile_classifications)
inline protected

Definition at line 426 of file wordrec.h.

426  {
428  num_futile_classifications >=
430  }
LanguageModel * language_model_
Definition: wordrec.h:411
int segsearch_max_futile_classifications
Definition: wordrec.h:173
int tesseract::Wordrec::select_blob_to_split ( const GenericVector< BLOB_CHOICE * > &  blob_choices,
float  rating_ceiling,
bool  split_next_to_fragment 
)

Definition at line 541 of file chopper.cpp.

543  {
544  BLOB_CHOICE *blob_choice;
545  int x;
546  float worst = -MAX_FLOAT32;
547  int worst_index = -1;
548  float worst_near_fragment = -MAX_FLOAT32;
549  int worst_index_near_fragment = -1;
550  const CHAR_FRAGMENT **fragments = NULL;
551 
552  if (chop_debug) {
553  if (rating_ceiling < MAX_FLOAT32)
554  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
555  else
556  tprintf("rating_ceiling = No Limit\n");
557  }
558 
559  if (split_next_to_fragment && blob_choices.size() > 0) {
560  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
561  if (blob_choices[0] != NULL) {
562  fragments[0] = getDict().getUnicharset().get_fragment(
563  blob_choices[0]->unichar_id());
564  } else {
565  fragments[0] = NULL;
566  }
567  }
568 
569  for (x = 0; x < blob_choices.size(); ++x) {
570  if (blob_choices[x] == NULL) {
571  if (fragments != NULL) {
572  delete[] fragments;
573  }
574  return x;
575  } else {
576  blob_choice = blob_choices[x];
577  // Populate fragments for the following position.
578  if (split_next_to_fragment && x+1 < blob_choices.size()) {
579  if (blob_choices[x + 1] != NULL) {
580  fragments[x + 1] = getDict().getUnicharset().get_fragment(
581  blob_choices[x + 1]->unichar_id());
582  } else {
583  fragments[x + 1] = NULL;
584  }
585  }
586  if (blob_choice->rating() < rating_ceiling &&
587  blob_choice->certainty() < tessedit_certainty_threshold) {
588  // Update worst and worst_index.
589  if (blob_choice->rating() > worst) {
590  worst_index = x;
591  worst = blob_choice->rating();
592  }
593  if (split_next_to_fragment) {
594  // Update worst_near_fragment and worst_index_near_fragment.
595  bool expand_following_fragment =
596  (x + 1 < blob_choices.size() &&
597  fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
598  bool expand_preceding_fragment =
599  (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
600  if ((expand_following_fragment || expand_preceding_fragment) &&
601  blob_choice->rating() > worst_near_fragment) {
602  worst_index_near_fragment = x;
603  worst_near_fragment = blob_choice->rating();
604  if (chop_debug) {
605  tprintf("worst_index_near_fragment=%d"
606  " expand_following_fragment=%d"
607  " expand_preceding_fragment=%d\n",
608  worst_index_near_fragment,
609  expand_following_fragment,
610  expand_preceding_fragment);
611  }
612  }
613  }
614  }
615  }
616  }
617  if (fragments != NULL) {
618  delete[] fragments;
619  }
620  // TODO(daria): maybe a threshold of badness for
621  // worst_near_fragment would be useful.
622  return worst_index_near_fragment != -1 ?
623  worst_index_near_fragment : worst_index;
624 }
int size() const
Definition: genericvector.h:72
int length() const
Definition: genericvector.h:79
#define tprintf(...)
Definition: tprintf.h:31
double tessedit_certainty_threshold
Definition: wordrec.h:138
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
float rating() const
Definition: ratngs.h:79
Dict & getDict()
Definition: classify.h:65
bool is_ending() const
Definition: unicharset.h:102
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define MAX_FLOAT32
Definition: host.h:124
#define NULL
Definition: host.h:144
float certainty() const
Definition: ratngs.h:82
bool is_beginning() const
Definition: unicharset.h:99
int tesseract::Wordrec::select_blob_to_split_from_fixpt ( DANGERR *  fixpt)

Definition at line 633 of file chopper.cpp.

633  {
634  if (!fixpt)
635  return -1;
636  for (int i = 0; i < fixpt->size(); i++) {
637  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
638  (*fixpt)[i].dangerous &&
639  (*fixpt)[i].correct_is_ngram) {
640  return (*fixpt)[i].begin;
641  }
642  }
643  return -1;
644 }
int size() const
Definition: genericvector.h:72
void tesseract::Wordrec::set_pass1 ( )

Definition at line 85 of file tface.cpp.

85  {
86  chop_ok_split.set_value(70.0);
88  SettupPass1();
89 }
double chop_ok_split
Definition: wordrec.h:156
LanguageModel * language_model_
Definition: wordrec.h:411
ParamsModel & getParamsModel()
void SetPass(PassEnum pass)
Definition: params_model.h:72
void tesseract::Wordrec::set_pass2 ( )

Definition at line 97 of file tface.cpp.

void tesseract::Wordrec::try_point_pairs ( EDGEPT *  points[MAX_NUM_POINTS],
inT16  num_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 297 of file findseam.cpp.

302  {
303  inT16 x;
304  inT16 y;
305  PRIORITY priority;
306 
307  for (x = 0; x < num_points; x++) {
308  for (y = x + 1; y < num_points; y++) {
309  if (points[y] &&
310  points[x]->WeightedDistance(*points[y], chop_x_y_weight) <
312  points[x] != points[y]->next && points[y] != points[x]->next &&
313  !is_exterior_point(points[x], points[y]) &&
314  !is_exterior_point(points[y], points[x])) {
315  SPLIT split(points[x], points[y]);
316  priority = partial_split_priority(&split);
317 
318  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
319  }
320  }
321  }
322 }
#define partial_split_priority(split)
Definition: gradechop.h:46
#define is_exterior_point(edge, point)
Definition: outlines.h:97
float PRIORITY
Definition: seam.h:42
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:103
struct SPLIT
Definition: split.h:37
short inT16
Definition: host.h:100
void tesseract::Wordrec::try_vertical_splits ( EDGEPT *  points[MAX_NUM_POINTS],
inT16  num_points,
EDGEPT_CLIST *  new_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 335 of file findseam.cpp.

341  {
342  EDGEPT *vertical_point = NULL;
343  inT16 x;
344  PRIORITY priority;
345  TESSLINE *outline;
346 
347  for (x = 0; x < num_points; x++) {
348  vertical_point = NULL;
349  for (outline = blob->outlines; outline; outline = outline->next) {
350  vertical_projection_point(points[x], outline->loop,
351  &vertical_point, new_points);
352  }
353 
354  if (vertical_point && points[x] != vertical_point->next &&
355  vertical_point != points[x]->next &&
356  points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) <
358  SPLIT split(points[x], vertical_point);
359  priority = partial_split_priority(&split);
360  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
361  }
362  }
363 }
#define partial_split_priority(split)
Definition: gradechop.h:46
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:99
TESSLINE * next
Definition: blobs.h:258
float PRIORITY
Definition: seam.h:42
void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
Definition: chop.cpp:274
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:103
EDGEPT * next
Definition: blobs.h:169
struct EDGEPT
Definition: blobs.h:76
#define NULL
Definition: host.h:144
TESSLINE * outlines
Definition: blobs.h:377
EDGEPT * loop
Definition: blobs.h:257
struct SPLIT
Definition: split.h:37
short inT16
Definition: host.h:100
void tesseract::Wordrec::UpdateSegSearchNodes ( float  rating_cert_scale,
int  starting_col,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 194 of file segsearch.cpp.

201  {
202  MATRIX *ratings = word_res->ratings;
203  ASSERT_HOST(ratings->dimension() == pending->size());
204  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
205  for (int col = starting_col; col < ratings->dimension(); ++col) {
206  if (!(*pending)[col].WorkToDo()) continue;
207  int first_row = col;
208  int last_row = MIN(ratings->dimension() - 1,
209  col + ratings->bandwidth() - 1);
210  if ((*pending)[col].SingleRow() >= 0) {
211  first_row = last_row = (*pending)[col].SingleRow();
212  }
213  if (segsearch_debug_level > 0) {
214  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
215  col, first_row, last_row,
216  (*pending)[col].IsRowJustClassified(MAX_INT32));
217  }
218  // Iterate over the pending list for this column.
219  for (int row = first_row; row <= last_row; ++row) {
220  // Update language model state of this child+parent pair.
221  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
222  LanguageModelState *parent_node =
223  col == 0 ? NULL : best_choice_bundle->beam[col - 1];
224  if (current_node != NULL &&
225  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
226  col, row, current_node, parent_node,
227  pain_points, word_res,
228  best_choice_bundle, blamer_bundle) &&
229  row + 1 < ratings->dimension()) {
230  // Since the language model state of this entry changed, process all
231  // the child column.
232  (*pending)[row + 1].RevisitWholeColumn();
233  if (segsearch_debug_level > 0) {
234  tprintf("Added child col=%d to pending\n", row + 1);
235  }
236  } // end if UpdateState.
237  } // end for row.
238  } // end for col.
239  if (best_choice_bundle->best_vse != NULL) {
240  ASSERT_HOST(word_res->StatesAllValid());
241  if (best_choice_bundle->best_vse->updated) {
242  pain_points->GenerateFromPath(rating_cert_scale,
243  best_choice_bundle->best_vse, word_res);
244  if (!best_choice_bundle->fixpt.empty()) {
245  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
246  best_choice_bundle->best_vse, word_res);
247  }
248  }
249  }
250  // The segsearch is completed. Reset all updated flags on all VSEs and reset
251  // all pendings.
252  for (int col = 0; col < pending->size(); ++col) {
253  (*pending)[col].Clear();
254  ViterbiStateEntry_IT
255  vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
256  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
257  vse_it.data()->updated = false;
258  }
259  }
260 }
int size() const
Definition: genericvector.h:72
MATRIX * ratings
Definition: pageres.h:215
T get(int column, int row) const
Definition: matrix.h:171
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
int dimension() const
Definition: matrix.h:247
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
LanguageModel * language_model_
Definition: wordrec.h:411
#define MAX_INT32
Definition: host.h:120
int bandwidth() const
Definition: matrix.h:249
Definition: matrix.h:289
int segsearch_debug_level
Definition: wordrec.h:169
#define NULL
Definition: host.h:144
void tesseract::Wordrec::vertical_projection_point ( EDGEPT *  split_point,
EDGEPT *  target_point,
EDGEPT **  best_point,
EDGEPT_CLIST *  new_points 
)

Definition at line 274 of file chop.cpp.

276  {
277  EDGEPT *p; /* Iterator */
278  EDGEPT *this_edgept; /* Iterator */
279  EDGEPT_C_IT new_point_it(new_points);
280  int x = split_point->pos.x; /* X value of vertical */
281  int best_dist = LARGE_DISTANCE;/* Best point found */
282 
283  if (*best_point != NULL)
284  best_dist = edgept_dist(split_point, *best_point);
285 
286  p = target_point;
287  /* Look at each edge point */
288  do {
289  if (((p->pos.x <= x && x <= p->next->pos.x) ||
290  (p->next->pos.x <= x && x <= p->pos.x)) &&
291  !same_point(split_point->pos, p->pos) &&
292  !same_point(split_point->pos, p->next->pos) &&
293  !p->IsChopPt() &&
294  (*best_point == NULL || !same_point((*best_point)->pos, p->pos))) {
295 
296  if (near_point(split_point, p, p->next, &this_edgept)) {
297  new_point_it.add_before_then_move(this_edgept);
298  }
299 
300  if (*best_point == NULL)
301  best_dist = edgept_dist (split_point, this_edgept);
302 
303  this_edgept =
304  pick_close_point(split_point, this_edgept, &best_dist);
305  if (this_edgept)
306  *best_point = this_edgept;
307  }
308 
309  p = p->next;
310  }
311  while (p != target_point);
312 }
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
Definition: outlines.cpp:49
bool IsChopPt() const
Definition: blobs.h:159
EDGEPT * pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
Definition: chop.cpp:124
#define edgept_dist(p1, p2)
Definition: outlines.h:87
#define same_point(p1, p2)
Definition: outlines.h:49
EDGEPT * next
Definition: blobs.h:169
inT16 x
Definition: blobs.h:71
TPOINT pos
Definition: blobs.h:163
struct EDGEPT
Definition: blobs.h:76
#define NULL
Definition: host.h:144
#define LARGE_DISTANCE
Definition: outlines.h:36
void tesseract::Wordrec::WordSearch ( WERD_RES *  word_res)

Definition at line 130 of file segsearch.cpp.

130  {
131  LMPainPoints pain_points(segsearch_max_pain_points,
136  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
137  // Run Segmentation Search.
138  InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
139  if (segsearch_debug_level > 0) {
140  tprintf("Ending ratings matrix%s:\n",
141  wordrec_enable_assoc ? " (with assoc)" : "");
142  word_res->ratings->print(getDict().getUnicharset());
143  }
144 }
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
MATRIX * ratings
Definition: pageres.h:215
#define tprintf(...)
Definition: tprintf.h:31
int dimension() const
Definition: matrix.h:247
bool wordrec_enable_assoc
Definition: wordrec.h:130
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:150
Dict & getDict()
Definition: classify.h:65
int segsearch_max_pain_points
Definition: wordrec.h:171
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
int segsearch_debug_level
Definition: wordrec.h:169
#define NULL
Definition: host.h:144
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175

Member Data Documentation

bool tesseract::Wordrec::assume_fixed_pitch_char_segment = FALSE

"include fixed-pitch heuristics in char segmentation"

Definition at line 161 of file wordrec.h.

GenericVector<int> tesseract::Wordrec::blame_reasons_

Definition at line 418 of file wordrec.h.

double tesseract::Wordrec::chop_center_knob = 0.15

"Split center adjustment"

Definition at line 151 of file wordrec.h.

int tesseract::Wordrec::chop_centered_maxwidth = 90

"Width of (smaller) chopped blobs above which we don't care that a chop is not near the center."

Definition at line 153 of file wordrec.h.

int tesseract::Wordrec::chop_debug = 0

"Chop debug"

Definition at line 139 of file wordrec.h.

bool tesseract::Wordrec::chop_enable = 1

"Chop enable"

Definition at line 140 of file wordrec.h.

double tesseract::Wordrec::chop_good_split = 50.0

"Good split limit"

Definition at line 157 of file wordrec.h.

int tesseract::Wordrec::chop_inside_angle = -50

"Min Inside Angle Bend"

Definition at line 147 of file wordrec.h.

int tesseract::Wordrec::chop_min_outline_area = 2000

"Min Outline Area"

Definition at line 148 of file wordrec.h.

int tesseract::Wordrec::chop_min_outline_points = 6

"Min Number of Points on Outline"

Definition at line 144 of file wordrec.h.

bool tesseract::Wordrec::chop_new_seam_pile = 1

"Use new seam_pile"

Definition at line 146 of file wordrec.h.

double tesseract::Wordrec::chop_ok_split = 100.0

"OK split limit"

Definition at line 156 of file wordrec.h.

double tesseract::Wordrec::chop_overlap_knob = 0.9

"Split overlap adjustment"

Definition at line 150 of file wordrec.h.

int tesseract::Wordrec::chop_same_distance = 2

"Same distance"

Definition at line 143 of file wordrec.h.

int tesseract::Wordrec::chop_seam_pile_size = 150

"Max number of seams in seam_pile"

Definition at line 145 of file wordrec.h.

double tesseract::Wordrec::chop_sharpness_knob = 0.06

"Split sharpness adjustment"

Definition at line 154 of file wordrec.h.

double tesseract::Wordrec::chop_split_dist_knob = 0.5

"Split length adjustment"

Definition at line 149 of file wordrec.h.

int tesseract::Wordrec::chop_split_length = 10000

"Split Length"

Definition at line 142 of file wordrec.h.

bool tesseract::Wordrec::chop_vertical_creep = 0

"Vertical creep"

Definition at line 141 of file wordrec.h.

double tesseract::Wordrec::chop_width_change_knob = 5.0

"Width change adjustment"

Definition at line 155 of file wordrec.h.

int tesseract::Wordrec::chop_x_y_weight = 3

"X / Y length weight"

Definition at line 158 of file wordrec.h.

void(Wordrec::* tesseract::Wordrec::fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 420 of file wordrec.h.

bool tesseract::Wordrec::force_word_assoc = FALSE

"Force the associator to run regardless of what enable_assoc is. This is used for CJK, where component grouping is necessary."

Definition at line 133 of file wordrec.h.

bool tesseract::Wordrec::fragments_guide_chopper = FALSE

"Use information from fragments to guide chopping process"

Definition at line 136 of file wordrec.h.

LanguageModel* tesseract::Wordrec::language_model_

Definition at line 411 of file wordrec.h.

bool tesseract::Wordrec::merge_fragments_in_matrix = TRUE

"Merge the fragments in the ratings matrix and delete them after merging"

Definition at line 128 of file wordrec.h.

PRIORITY tesseract::Wordrec::pass2_ok_split

Definition at line 412 of file wordrec.h.

WERD_CHOICE* tesseract::Wordrec::prev_word_best_choice_

Definition at line 416 of file wordrec.h.

int tesseract::Wordrec::repair_unchopped_blobs = 1

"Fix blobs that aren't chopped"

Definition at line 137 of file wordrec.h.

bool tesseract::Wordrec::save_alt_choices = true

"Save alternative paths found during chopping and segmentation search"

Definition at line 178 of file wordrec.h.

int tesseract::Wordrec::segment_adjust_debug = 0

"Segmentation adjustment debug"

Definition at line 159 of file wordrec.h.

int tesseract::Wordrec::segsearch_debug_level = 0

"SegSearch debug level"

Definition at line 169 of file wordrec.h.

double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0

"Maximum character width-to-height ratio"

Definition at line 175 of file wordrec.h.

int tesseract::Wordrec::segsearch_max_futile_classifications = 10

"Maximum number of pain point classifications per word."

Definition at line 173 of file wordrec.h.

int tesseract::Wordrec::segsearch_max_pain_points = 2000

"Maximum number of pain points stored in the queue"

Definition at line 171 of file wordrec.h.

double tesseract::Wordrec::tessedit_certainty_threshold = -2.25

"Good blob limit"

Definition at line 138 of file wordrec.h.

bool tesseract::Wordrec::wordrec_debug_blamer = false

"Print blamer debug messages"

Definition at line 167 of file wordrec.h.

int tesseract::Wordrec::wordrec_debug_level = 0

"Debug level for wordrec"

Definition at line 162 of file wordrec.h.

bool tesseract::Wordrec::wordrec_enable_assoc = TRUE

"Associator Enable"

Definition at line 130 of file wordrec.h.

int tesseract::Wordrec::wordrec_max_join_chunks = 4

"Max number of broken pieces to associate"

Definition at line 164 of file wordrec.h.

bool tesseract::Wordrec::wordrec_no_block = FALSE

"Don't output block information"

Definition at line 129 of file wordrec.h.

bool tesseract::Wordrec::wordrec_run_blamer = false

"Try to set the blame for errors"

Definition at line 168 of file wordrec.h.

bool tesseract::Wordrec::wordrec_skip_no_truth_words = false

"Only run OCR for words that had truth recorded in BlamerBundle"

Definition at line 166 of file wordrec.h.

double tesseract::Wordrec::wordrec_worst_state = 1

"Worst segmentation state"

Definition at line 134 of file wordrec.h.


The documentation for this class was generated from the following files: