All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract Namespace Reference

Classes

class  AlignedBlob
 
struct  AlignedBlobParams
 
class  AltList
 
class  AmbigSpec
 
struct  AssociateStats
 
class  AssociateUtils
 
class  BaselineBlock
 
class  BaselineDetect
 
class  BaselineRow
 
class  BBGrid
 
class  BeamSearch
 
struct  BestChoiceBundle
 Bundle together all the things pertaining to the best choice/state. More...
 
struct  Bigram
 
class  BitVector
 
struct  BlobData
 
class  BlobGrid
 
struct  BlockGroup
 
class  Bmp8
 
class  BoolParam
 
class  BoxChar
 
struct  BoxCharPtrSort
 
class  BoxWord
 
class  CachedFile
 
class  CCNonTextDetect
 
class  CCStruct
 
class  CCUtil
 
class  CCUtilMutex
 
class  CharAltList
 
struct  CharBigram
 
class  CharBigrams
 
struct  CharBigramTable
 
class  CharClassifier
 
class  CharClassifierFactory
 
class  CharSamp
 
class  CharSampEnum
 
class  CharSampSet
 
class  CharSet
 
class  ChoiceIterator
 
class  Classify
 
class  ClassPruner
 
struct  Cluster
 
class  ColPartition
 
class  ColPartitionGrid
 
class  ColPartitionSet
 
class  ColSegment
 
class  ColumnFinder
 
class  ConComp
 
class  ConCompPt
 
class  ConvNetCharClassifier
 
class  CubeClassifier
 
class  CubeLineObject
 
class  CubeLineSegmenter
 
class  CubeObject
 
class  CubeRecoContext
 
class  CubeSearchObject
 
class  CubeTessClassifier
 
class  CubeTuningParams
 
class  CubeUtils
 
class  CUtil
 
class  Dawg
 
struct  DawgArgs
 
class  DawgCache
 
struct  DawgLoader
 
struct  DawgPosition
 
class  DawgPositionVector
 
class  DetLineFit
 
class  Dict
 
struct  DocQualCallbacks
 
class  DocumentCache
 
class  DocumentData
 
class  DoubleParam
 
class  DoublePtr
 
class  DPPoint
 
class  EquationDetect
 
class  EquationDetectBase
 
class  ErrorCounter
 
class  FeatureBase
 
class  FeatureBmp
 
class  FeatureChebyshev
 
class  FeatureHybrid
 
class  File
 
struct  FloatWordFeature
 
struct  FontInfo
 
class  FontInfoTable
 
struct  FontPairSizeInfo
 
struct  FontSet
 
struct  FontSpacingInfo
 
class  FontUtils
 
class  FRAGMENT
 
class  GenericHeap
 
struct  GeometricClassifierState
 
class  GridBase
 
class  GridSearch
 
class  HybridNeuralNetCharClassifier
 
class  IcuErrorCode
 
class  ImageData
 
class  ImageFind
 
class  ImageThresholder
 
class  IndexMap
 
class  IndexMapBiDi
 
class  InputBuffer
 
class  InputFileBuffer
 
struct  Interval
 
class  IntFeatureDist
 
class  IntFeatureMap
 
class  IntFeatureSpace
 
class  IntGrid
 
class  IntParam
 
struct  KDPair
 
struct  KDPairDec
 
struct  KDPairInc
 
class  KDPtrPair
 
struct  KDPtrPairDec
 
struct  KDPtrPairInc
 
class  KDVector
 
class  LangModEdge
 
class  LangModel
 
class  LanguageModel
 
struct  LanguageModelDawgInfo
 
struct  LanguageModelNgramInfo
 
struct  LanguageModelState
 Struct to store information maintained by various language model components. More...
 
class  LigatureTable
 
class  LineFinder
 
struct  LineHypothesis
 
struct  LMConsistencyInfo
 
class  LMPainPoints
 
class  LTRResultIterator
 
class  MasterTrainer
 
class  MutableIterator
 
class  NeuralNet
 
class  Neuron
 
struct  NodeChild
 
class  ObjectCache
 
class  OutputBuffer
 
class  PageIterator
 
struct  PairSizeInfo
 
class  PangoFontInfo
 
class  ParagraphModelSmearer
 
class  ParagraphTheory
 
class  Param
 
class  ParamsModel
 
class  ParamsTrainingBundle
 
struct  ParamsTrainingHypothesis
 
struct  ParamsVectors
 
class  ParamUtils
 
class  PixelHistogram
 
class  PointerVector
 
struct  PtrHash
 
class  ResultIterator
 
class  RowInfo
 
class  RowScratchRegisters
 
class  SampleIterator
 
struct  ScoredFont
 
class  SearchColumn
 
class  SearchNode
 
class  SearchNodeHashTable
 
class  SearchObject
 
class  SegSearchPending
 
class  Shape
 
class  ShapeClassifier
 
struct  ShapeDist
 
struct  ShapeQueueEntry
 
struct  ShapeRating
 
class  ShapeTable
 
class  ShiroRekhaSplitter
 
class  SimpleClusterer
 
struct  SpacingProperties
 
class  SquishedDawg
 
class  StringParam
 
class  StringRenderer
 
class  StrokeWidth
 
class  StructuredTable
 
class  TabConstraint
 
class  TabEventHandler
 
class  TabFind
 
class  TableFinder
 
class  TableRecognizer
 
class  TabVector
 
struct  TESS_CHAR
 
class  TessBaseAPI
 
class  TessBoxTextRenderer
 
class  TessClassifier
 
class  TessdataManager
 
class  Tesseract
 
class  TesseractCubeCombiner
 
struct  TesseractStats
 
class  TessHOcrRenderer
 
class  TessLangModEdge
 
class  TessLangModel
 
class  TessPDFRenderer
 
class  TessResultRenderer
 
class  TessTextRenderer
 
class  TessUnlvRenderer
 
class  TextlineProjection
 
class  Textord
 
class  TFile
 
class  TrainingSample
 
class  TrainingSampleSet
 
class  TRand
 
class  Trie
 
class  TuningParams
 
class  UnicharAmbigs
 
struct  UnicharAndFonts
 
class  UnicharIdArrayUtils
 
struct  UnicharRating
 
class  UnicodeSpanSkipper
 
struct  ViterbiStateEntry
 
class  WordAltList
 
struct  WordData
 
class  WordFeature
 
class  WordListLangModel
 
class  Wordrec
 
class  WordSizeModel
 
class  WordUnigrams
 
class  WordWithBox
 
class  WorkingPartSet
 

Typedefs

typedef int(Dict::* DictFunc )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
typedef double(Dict::* ProbabilityInContextFunc )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
typedef float(Dict::* ParamsModelClassifyFunc )(const char *lang, void *path)
 
typedef void(Wordrec::* FillLatticeFunc )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
typedef TessCallback4< const
UNICHARSET &, int,
PageIterator *, Pix * > 
TruthCallback
 
typedef GenericVectorEqEq
< const ParagraphModel * > 
SetOfModels
 
typedef void(Tesseract::* WordRecognizer )(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
typedef GenericVector
< ParamsTrainingHypothesis > 
ParamsTrainingHypothesisList
 
typedef GenericVector< UNICHAR_ID > UnicharIdVector
 
typedef GenericVector
< AmbigSpec_LIST * > 
UnicharAmbigsVector
 
typedef bool(* FileReader )(const STRING &filename, GenericVector< char > *data)
 
typedef bool(* FileWriter )(const GenericVector< char > &data, const STRING &filename)
 
typedef KDPairInc< int, int > IntKDPair
 
typedef GenericHeap
< ShapeQueueEntry > 
ShapeQueue
 
typedef signed int char_32
 
typedef basic_string< char_32 > string_32
 
typedef GenericVector< NodeChild > NodeChildVector
 
typedef GenericVector< int > SuccessorList
 
typedef GenericVector
< SuccessorList * > 
SuccessorListsVector
 
typedef GenericVector< Dawg * > DawgVector
 
typedef GridSearch< BLOBNBOX,
BLOBNBOX_CLIST, BLOBNBOX_C_IT > 
BlobGridSearch
 
typedef GridSearch
< ColPartition,
ColPartition_CLIST,
ColPartition_C_IT > 
ColPartitionGridSearch
 
typedef GenericVector
< ColPartitionSet * > 
PartSetVector
 
typedef TessResultCallback1
< bool, int > 
WidthCallback
 
typedef BBGrid< ColSegment,
ColSegment_CLIST,
ColSegment_C_IT > 
ColSegmentGrid
 
typedef GridSearch< ColSegment,
ColSegment_CLIST,
ColSegment_C_IT > 
ColSegmentGridSearch
 
typedef BBGrid< WordWithBox,
WordWithBox_CLIST,
WordWithBox_C_IT > 
WordGrid
 
typedef GridSearch
< WordWithBox,
WordWithBox_CLIST,
WordWithBox_C_IT > 
WordSearch
 
typedef hash_map< string,
string, StringHash > 
LigHash
 
typedef GenericHeap
< MatrixCoordPair > 
PainPointHeap
 
typedef unsigned char LanguageModelFlagsType
 Used for expressing various language model flags. More...
 

Enumerations

enum  LineType { LT_START = 'S', LT_BODY = 'C', LT_UNKNOWN = 'U', LT_MULTIPLE = 'M' }
 
enum  CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT }
 
enum  NormalizationMode { NM_BASELINE = -3, NM_CHAR_ISOTROPIC = -2, NM_CHAR_ANISOTROPIC = -1 }
 
enum  kParamsTrainingFeatureType {
  PTRAIN_DIGITS_SHORT, PTRAIN_DIGITS_MED, PTRAIN_DIGITS_LONG, PTRAIN_NUM_SHORT,
  PTRAIN_NUM_MED, PTRAIN_NUM_LONG, PTRAIN_DOC_SHORT, PTRAIN_DOC_MED,
  PTRAIN_DOC_LONG, PTRAIN_DICT_SHORT, PTRAIN_DICT_MED, PTRAIN_DICT_LONG,
  PTRAIN_FREQ_SHORT, PTRAIN_FREQ_MED, PTRAIN_FREQ_LONG, PTRAIN_SHAPE_COST_PER_CHAR,
  PTRAIN_NGRAM_COST_PER_CHAR, PTRAIN_NUM_BAD_PUNC, PTRAIN_NUM_BAD_CASE, PTRAIN_XHEIGHT_CONSISTENCY,
  PTRAIN_NUM_BAD_CHAR_TYPE, PTRAIN_NUM_BAD_SPACING, PTRAIN_NUM_BAD_FONT, PTRAIN_RATING_PER_CHAR,
  PTRAIN_NUM_FEATURE_TYPES
}
 
enum  Orientation { ORIENTATION_PAGE_UP = 0, ORIENTATION_PAGE_RIGHT = 1, ORIENTATION_PAGE_DOWN = 2, ORIENTATION_PAGE_LEFT = 3 }
 
enum  WritingDirection { WRITING_DIRECTION_LEFT_TO_RIGHT = 0, WRITING_DIRECTION_RIGHT_TO_LEFT = 1, WRITING_DIRECTION_TOP_TO_BOTTOM = 2 }
 
enum  TextlineOrder { TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, TEXTLINE_ORDER_TOP_TO_BOTTOM = 2 }
 
enum  PageSegMode {
  PSM_OSD_ONLY, PSM_AUTO_OSD, PSM_AUTO_ONLY, PSM_AUTO,
  PSM_SINGLE_COLUMN, PSM_SINGLE_BLOCK_VERT_TEXT, PSM_SINGLE_BLOCK, PSM_SINGLE_LINE,
  PSM_SINGLE_WORD, PSM_CIRCLE_WORD, PSM_SINGLE_CHAR, PSM_SPARSE_TEXT,
  PSM_SPARSE_TEXT_OSD, PSM_RAW_LINE, PSM_COUNT
}
 
enum  PageIteratorLevel {
  RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD,
  RIL_SYMBOL
}
 
enum  ParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, JUSTIFICATION_CENTER, JUSTIFICATION_RIGHT }
 
enum  OcrEngineMode { OEM_TESSERACT_ONLY, OEM_CUBE_ONLY, OEM_TESSERACT_CUBE_COMBINED, OEM_DEFAULT }
 
enum  ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP }
 
enum  AmbigType {
  NOT_AMBIG, REPLACE_AMBIG, DEFINITE_AMBIG, SIMILAR_AMBIG,
  CASE_AMBIG, AMBIG_TYPE_COUNT
}
 
enum  SetParamConstraint { SET_PARAM_CONSTRAINT_NONE, SET_PARAM_CONSTRAINT_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_INIT_ONLY }
 
enum  TessdataType {
  TESSDATA_LANG_CONFIG, TESSDATA_UNICHARSET, TESSDATA_AMBIGS, TESSDATA_INTTEMP,
  TESSDATA_PFFMTABLE, TESSDATA_NORMPROTO, TESSDATA_PUNC_DAWG, TESSDATA_SYSTEM_DAWG,
  TESSDATA_NUMBER_DAWG, TESSDATA_FREQ_DAWG, TESSDATA_FIXED_LENGTH_DAWGS, TESSDATA_CUBE_UNICHARSET,
  TESSDATA_CUBE_SYSTEM_DAWG, TESSDATA_SHAPE_TABLE, TESSDATA_BIGRAM_DAWG, TESSDATA_UNAMBIG_DAWG,
  TESSDATA_PARAMS_MODEL, TESSDATA_NUM_ENTRIES
}
 
enum  CharSegmentationType { CST_FRAGMENT, CST_WHOLE, CST_IMPROPER, CST_NGRAM }
 
enum  CountTypes {
  CT_UNICHAR_TOP_OK, CT_UNICHAR_TOP1_ERR, CT_UNICHAR_TOP2_ERR, CT_UNICHAR_TOPN_ERR,
  CT_UNICHAR_TOPTOP_ERR, CT_OK_MULTI_UNICHAR, CT_OK_JOINED, CT_OK_BROKEN,
  CT_REJECT, CT_FONT_ATTR_ERR, CT_OK_MULTI_FONT, CT_NUM_RESULTS,
  CT_RANK, CT_REJECTED_JUNK, CT_ACCEPTED_JUNK, CT_SIZE
}
 
enum  DawgType {
  DAWG_TYPE_PUNCTUATION, DAWG_TYPE_WORD, DAWG_TYPE_NUMBER, DAWG_TYPE_PATTERN,
  DAWG_TYPE_COUNT
}
 
enum  XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT }
 
enum  ColumnSpanningType {
  CST_NOISE, CST_FLOWING, CST_HEADING, CST_PULLOUT,
  CST_COUNT
}
 
enum  NeighbourPartitionType {
  NPT_HTEXT, NPT_VTEXT, NPT_WEAK_HTEXT, NPT_WEAK_VTEXT,
  NPT_IMAGE, NPT_COUNT
}
 
enum  LeftOrRight { LR_LEFT, LR_RIGHT }
 
enum  PartitionFindResult { PFR_OK, PFR_SKEW, PFR_NOISE }
 
enum  ColSegType {
  COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED,
  COL_COUNT
}
 
enum  TabAlignment {
  TA_LEFT_ALIGNED, TA_LEFT_RAGGED, TA_CENTER_JUSTIFIED, TA_RIGHT_ALIGNED,
  TA_RIGHT_RAGGED, TA_SEPARATOR, TA_COUNT
}
 
enum  LMPainPointsType {
  LM_PPTYPE_BLAMER, LM_PPTYPE_AMBIG, LM_PPTYPE_PATH, LM_PPTYPE_SHAPE,
  LM_PPTYPE_NUM
}
 

Functions

int CubeAPITest (Boxa *boxa_blocks, Pixa *pixa_blocks, Boxa *boxa_words, Pixa *pixa_words, const FCOORD &reskew, Pix *page_pix, PAGE_RES *page_res)
 
TBLOB * make_tesseract_blob (float baseline, float xheight, float descender, float ascender, bool numeric_mode, Pix *pix)
 
STRING HOcrEscape (const char *text)
 
double prec (double x)
 
long dist2 (int x1, int y1, int x2, int y2)
 
void GetWordBaseline (int writing_direction, int ppi, int height, int word_x1, int word_y1, int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, int line_y2, double *x0, double *y0, double *length)
 
void AffineMatrix (int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, double *a, double *b, double *c, double *d)
 
void ClipBaseline (int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1, int *line_x2, int *line_y2)
 
bool IsTextOrEquationType (PolyBlockType type)
 
bool IsLeftIndented (const EquationDetect::IndentType type)
 
bool IsRightIndented (const EquationDetect::IndentType type)
 
STRING RtlEmbed (const STRING &word, bool rtlify)
 
bool IsLatinLetter (int ch)
 
bool IsDigitLike (int ch)
 
bool IsOpeningPunct (int ch)
 
bool IsTerminalPunct (int ch)
 
const char * SkipChars (const char *str, const char *toskip)
 
const char * SkipChars (const char *str, bool(*skip)(int))
 
const char * SkipOne (const char *str, const char *toskip)
 
bool LikelyListNumeral (const STRING &word)
 
bool LikelyListMark (const STRING &word)
 
bool AsciiLikelyListItem (const STRING &word)
 
int UnicodeFor (const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
 
bool LikelyListMarkUnicode (int ch)
 
bool UniLikelyListItem (const UNICHARSET *u, const WERD_CHOICE *werd)
 
void LeftWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
 
void RightWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
 
int ClosestCluster (const GenericVector< Cluster > &clusters, int value)
 
void CalculateTabStops (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
 
void MarkRowsWithModel (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
 
void GeometricClassifyThreeTabStopTextBlock (int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
 
void GeometricClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
 
bool ValidFirstLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
 
bool ValidBodyLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
 
bool CrownCompatible (const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
 
void DiscardUnusedModels (const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
 
void DowngradeWeakestToCrowns (int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
 
void RecomputeMarginsAndClearHypotheses (GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
 
int InterwordSpace (const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
 
bool FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
 
bool FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after)
 
bool TextSupportsBreak (const RowScratchRegisters &before, const RowScratchRegisters &after)
 
bool LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after)
 
bool LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j)
 
ParagraphModel InternalParagraphModelByOutline (const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
 
ParagraphModel ParagraphModelByOutline (int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
 
bool RowsFitModel (const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
 
void MarkStrongEvidence (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
 
void ModelStrongEvidence (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
 
void StrongEvidenceClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
 
void SeparateSimpleLeaderLines (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
 
void ConvertHypothesizedModelRunsToParagraphs (int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory)
 
bool RowIsStranded (const GenericVector< RowScratchRegisters > &rows, int row)
 
void LeftoverSegments (const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
 
void CanonicalizeDetectionResults (GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
 
void DetectParagraphs (int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
 
void InitializeTextAndBoxesPreRecognition (const MutableIterator &it, RowInfo *info)
 
void InitializeRowInfo (bool after_recognition, const MutableIterator &it, RowInfo *info)
 
void DetectParagraphs (int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models)
 
bool StrongModel (const ParagraphModel *model)
 
bool read_t (PAGE_RES_IT *page_res_it, TBOX *tbox)
 
void YOutlierPieces (WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, ScriptPos *trailing_pos, int *num_trailing_outliers)
 
bool CompareFontInfo (const FontInfo &fi1, const FontInfo &fi2)
 
bool CompareFontSet (const FontSet &fs1, const FontSet &fs2)
 
void FontInfoDeleteCallback (FontInfo f)
 
void FontSetDeleteCallback (FontSet fs)
 
bool read_info (FILE *f, FontInfo *fi, bool swap)
 
bool write_info (FILE *f, const FontInfo &fi)
 
bool read_spacing_info (FILE *f, FontInfo *fi, bool swap)
 
bool write_spacing_info (FILE *f, const FontInfo &fi)
 
bool read_set (FILE *f, FontSet *fs, bool swap)
 
bool write_set (FILE *f, const FontSet &fs)
 
int OtsuThreshold (Pix *src_pix, int left, int top, int width, int height, int **thresholds, int **hi_values)
 
void HistogramRect (Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
 
int OtsuStats (const int *histogram, int *H_out, int *omega0_out)
 
int ParamsTrainingFeatureByName (const char *name)
 
bool PSM_OSD_ENABLED (int pageseg_mode)
 
bool PSM_ORIENTATION_ENABLED (int pageseg_mode)
 
bool PSM_COL_FIND_ENABLED (int pageseg_mode)
 
bool PSM_SPARSE (int pageseg_mode)
 
bool PSM_BLOCK_FIND_ENABLED (int pageseg_mode)
 
bool PSM_LINE_FIND_ENABLED (int pageseg_mode)
 
bool PSM_WORD_FIND_ENABLED (int pageseg_mode)
 
const char * ScriptPosToString (enum ScriptPos script_pos)
 
 ELISTIZE (AmbigSpec)
 
 ELISTIZEH (AmbigSpec)
 
bool LoadDataFromFile (const STRING &filename, GenericVector< char > *data)
 
bool SaveDataToFile (const GenericVector< char > &data, const STRING &filename)
 
template<typename T >
bool cmp_eq (T const &t1, T const &t2)
 
template<typename T >
int sort_cmp (const void *t1, const void *t2)
 
template<typename T >
int sort_ptr_cmp (const void *t1, const void *t2)
 
void ExtractFontName (const STRING &filename, STRING *fontname)
 
TrainingSample * BlobToTrainingSample (const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
 
uinT8 NormalizeDirection (uinT8 dir, const FCOORD &unnormed_pos, const DENORM &denorm, const DENORM *root_denorm)
 
void ClearFeatureSpaceWindow (NORM_METHOD norm_method, ScrollView *window)
 
void CallWithUTF8 (TessCallback1< const char * > *cb, const WERD_CHOICE *wc)
 
Pix * GridReducedPix (const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom)
 
Pix * TraceOutlineOnReducedPix (C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left, int *bottom)
 
Pix * TraceBlockOnReducedPix (BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom)
 
template<class BBC >
int SortByBoxLeft (const void *void1, const void *void2)
 
template<class BBC >
int SortRightToLeft (const void *void1, const void *void2)
 
template<class BBC >
int SortByBoxBottom (const void *void1, const void *void2)
 
template<typename T >
void DeleteObject (T *object)
 
void SetBlobStrokeWidth (Pix *pix, BLOBNBOX *blob)
 
void assign_blobs_to_blocks2 (Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
 
void ParseCommandLineFlags (const char *usage, int *argc, char ***argv, const bool remove_flags)
 
ShapeTable * LoadShapeTable (const STRING &file_prefix)
 
void WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
 
MasterTrainer * LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
 
Pix * DegradeImage (Pix *input, int exposure, TRand *randomizer, float *rotation)
 
void UTF8ToUTF32 (const char *utf8_str, GenericVector< char32 > *str32)
 
void UTF32ToUTF8 (const GenericVector< char32 > &str32, STRING *utf8_str)
 
bool is_hyphen_punc (const char32 ch)
 
bool is_single_quote (const char32 ch)
 
bool is_double_quote (const char32 ch)
 
STRING NormalizeUTF8String (const char *str8)
 
void NormalizeChar32 (char32 ch, GenericVector< char32 > *str)
 
char32 OCRNormalize (char32 ch)
 
bool IsOCREquivalent (char32 ch1, char32 ch2)
 
bool IsValidCodepoint (const char32 ch)
 
bool IsWhitespace (const char32 ch)
 
bool IsUTF8Whitespace (const char *text)
 
int SpanUTF8Whitespace (const char *text)
 
int SpanUTF8NotWhitespace (const char *text)
 
bool IsInterchangeValid (const char32 ch)
 
bool IsInterchangeValid7BitAscii (const char32 ch)
 
char32 FullwidthToHalfwidth (const char32 ch)
 
Pix * CairoARGB32ToPixFormat (cairo_surface_t *surface)
 
void ExtractFontProperties (const string &utf8_text, StringRenderer *render, const string &output_base)
 
bool MakeIndividualGlyphs (Pix *pix, const vector< BoxChar * > &vbox, const int input_tiff_page)
 
void SetupBasicProperties (bool report_errors, UNICHARSET *unicharset)
 
void SetPropertiesForInputFile (const string &script_dir, const string &input_unicharset_file, const string &output_unicharset_file, const string &output_xheights_file)
 
 ELISTIZE (ViterbiStateEntry)
 
 ELISTIZEH (ViterbiStateEntry)
 
template<class BLOB_CHOICE >
int SortByUnicharID (const void *void1, const void *void2)
 
template<class BLOB_CHOICE >
int SortByRating (const void *void1, const void *void2)
 
convert_prob_to_tess_certainty

Normalize a probability in the range [0.0, 1.0] to a tesseract certainty in the range [-20.0, 0.0]

char_box_to_tbox

Create a TBOX from a character bounding box. If nonzero, the x_offset accounts for any additional padding of the word box that should be taken into account.

TBOX char_box_to_tbox (Box *char_box, TBOX word_box, int x_offset)
 

Variables

const int kMinRectSize = 10
 
const char kTesseractReject = '~'
 
const char kUNLVReject = '~'
 
const char kUNLVSuspect = '^'
 
const char * kInputFile = "noname.tif"
 
const char * kOldVarsFile = "failed_vars.txt"
 
const int kMaxIntSize = 22
 
const int kMinCredibleResolution = 70
 Minimum believable resolution. More...
 
const int kMaxCredibleResolution = 2400
 
const int kNumbersPerBlob = 5
 
const int kBytesPerNumber = 5
 
const int kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1
 
const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1
 
const int kBytesPer64BitNumber = 20
 
const int kMaxBytesPerLine
 
const int kUniChs []
 
const int kLatinChs []
 
const int kBasicBufSize = 2048
 
const int kCharWidth = 2
 
const float kMathDigitDensityTh1 = 0.25
 
const float kMathDigitDensityTh2 = 0.1
 
const float kMathItalicDensityTh = 0.5
 
const float kUnclearDensityTh = 0.25
 
const int kSeedBlobsCountTh = 10
 
const int kLeftIndentAlignmentCountTh = 1
 
const int kMaxCharTopRange = 48
 
const int kDefaultResolution = 300
 Default resolution used if input is not believable. More...
 
const int kMaxCircleErosions = 8
 
const ParagraphModel * kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F)
 
const ParagraphModel * kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F)
 
const inT16 kMaxBoxEdgeDiff = 2
 
const int kBoxClipTolerance = 2
 
const int kNumEndPoints = 3
 
const int kMinPointsForErrorCount = 16
 
const int kMaxRealDistance = 2.0
 
const int kFeaturePadding = 2
 
const int kImagePadding = 4
 
const int kNumPagesPerMiniBatch = 100
 
const int kHistogramSize = 256
 
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1)
 
CCUtilMutex tprintfMutex
 
const char * kUTF8LineSeparator = "\u2028"
 
const char * kUTF8ParagraphSeparator = "\u2029"
 
const char * kLRM = "\u200E"
 
const char * kRLM = "\u200F"
 
const char * kRLE = "\u202A"
 
const char * kPDF = "\u202C"
 
const char * kHyphenLikeUTF8 []
 
const char * kApostropheLikeUTF8 []
 
const char kUniversalAmbigsFile []
 
const int ksizeofUniversalAmbigsFile = sizeof(kUniversalAmbigsFile)
 
const double kRatingEpsilon = 1.0 / 32
 
const int kMaxOffsetDist = 32
 
const double kMinPCLengthIncrease = 1.0 / 1024
 
const int kMinClusteredShapes = 1
 
const int kMaxUnicharsPerCluster = 2000
 
const float kFontMergeDistance = 0.025
 
const float kInfiniteDist = 999.0f
 
const int kRandomizingCenter = 128
 
const int kTestChar = -1
 
const int kSquareLimit = 25
 
const int kPrime1 = 17
 
const int kPrime2 = 13
 
const int kMinOutlierSamples = 5
 
const int kStateCnt = 4
 
const int kNumLiteralCnt = 5
 
const int case_state_table [6][4]
 
const char kDoNotReverse [] = "RRP_DO_NO_REVERSE"
 
const char kReverseIfHasRTL [] = "RRP_REVERSE_IF_HAS_RTL"
 
const char kForceReverse [] = "RRP_FORCE_REVERSE"
 
const char *const RTLReversePolicyNames []
 
const double kAlignedFraction = 0.03125
 
const double kRaggedFraction = 2.5
 
const double kAlignedGapFraction = 0.75
 
const double kRaggedGapFraction = 1.0
 
const int kVLineAlignment = 3
 
const int kVLineGutter = 1
 
const int kVLineSearchSize = 150
 
const int kMinRaggedTabs = 5
 
const int kMinAlignedTabs = 4
 
const int kVLineMinLength = 500
 
const double kMinTabGradient = 4.0
 
const int kMaxSkewFactor = 15
 
const char * kTextordDebugPix = "psdebug_pix"
 
const double kMaxSmallNeighboursPerPix = 1.0 / 32
 
const int kMaxLargeOverlapsWithSmall = 3
 
const int kMaxMediumOverlapsWithSmall = 12
 
const int kMaxLargeOverlapsWithMedium = 12
 
const int kOriginalNoiseMultiple = 8
 
const int kNoisePadding = 4
 
const double kPhotoOffsetFraction = 0.375
 
const double kMinGoodTextPARatio = 1.5
 
const int kMinColumnWidth = 100
 
const int kMaxIncompatibleColumnCount = 2
 
const double kMarginOverlapFraction = 0.25
 
const double kHorizontalGapMergeFraction = 0.5
 
const double kMinNonNoiseFraction = 0.5
 
const double kMinGutterWidthGrid = 0.5
 
const double kMaxDistToPartSizeRatio = 1.5
 
bool textord_tabfind_show_initial_partitions = false
 
bool textord_tabfind_show_reject_blobs = false
 
int textord_tabfind_show_partitions = 0
 
bool textord_tabfind_show_columns = false
 
bool textord_tabfind_show_blocks = false
 
bool textord_tabfind_find_tables = true
 
const int kMaxPartnerDepth = 4
 
const double kMaxSpacingDrift = 1.0 / 72
 
const double kMaxTopSpacingFraction = 0.25
 
const double kMaxSameBlockLineSpacing = 3
 
const double kMaxSizeRatio = 1.5
 
const double kMaxLeaderGapFractionOfMax = 0.25
 
const double kMaxLeaderGapFractionOfMin = 0.5
 
const int kMinLeaderCount = 5
 
const int kLeaderCutCost = 8
 
const int kMinStrongTextValue = 6
 
const int kMinChainTextValue = 3
 
const int kHorzStrongTextlineCount = 8
 
const int kHorzStrongTextlineHeight = 10
 
const int kHorzStrongTextlineAspect = 5
 
const double kMaxBaselineError = 0.4375
 
const double kMinBaselineCoverage = 0.5
 
const int kMaxRMSColorNoise = 128
 
const int kMaxColorDistance = 900
 
const int kRGBRMSColors = 4
 
bool textord_tabfind_show_color_fit = false
 
const int kMaxPadFactor = 6
 
const int kMaxNeighbourDistFactor = 4
 
const int kMaxCaptionLines = 7
 
const double kMinCaptionGapRatio = 2.0
 
const double kMinCaptionGapHeightRatio = 0.5
 
const double kBigPartSizeRatio = 1.75
 
const double kStrokeWidthFractionTolerance = 0.25
 
const double kStrokeWidthConstantTolerance = 2.0
 
const double kTinyEnoughTextlineOverlapFraction = 0.25
 
const double kMaxPartitionSpacing = 1.75
 
const int kSmoothDecisionMargin = 4
 
const double kMinRectangularFraction = 0.125
 
const double kMaxRectangularFraction = 0.75
 
const double kMaxRectangularGradient = 0.1
 
const int kMinImageFindSize = 100
 
const double kRMSFitScaling = 8.0
 
const int kMinColorDifference = 16
 
const int kThinLineFraction = 20
 Denominator of resolution makes max pixel width to allow thin lines. More...
 
const int kMinLineLengthFraction = 4
 Denominator of resolution makes min pixels to demand line lengths to be. More...
 
const int kCrackSpacing = 100
 Spacing of cracks across the page to break up tall vertical lines. More...
 
const int kLineFindGridSize = 50
 Grid size used by line finder. Not very critical. More...
 
const int kMinThickLineWidth = 12
 
const int kMaxLineResidue = 6
 
const double kThickLengthMultiple = 0.75
 
const double kMaxNonLineDensity = 0.25
 
const double kMaxStaveHeight = 1.0
 
const double kMinMusicPixelFraction = 0.75
 
int textord_tabfind_show_strokewidths = 0
 
bool textord_tabfind_only_strokewidths = false
 
const double kStrokeWidthTolerance = 1.5
 
const double kStrokeWidthFractionCJK = 0.25
 
const double kStrokeWidthCJK = 2.0
 
const int kCJKRadius = 2
 
const double kCJKBrokenDistanceFraction = 0.25
 
const int kCJKMaxComponents = 8
 
const double kCJKAspectRatio = 1.25
 
const double kCJKAspectRatioIncrease = 1.0625
 
const int kMaxCJKSizeRatio = 5
 
const double kBrokenCJKIterationFraction = 0.125
 
const double kDiacriticXPadRatio = 7.0
 
const double kDiacriticYPadRatio = 1.75
 
const double kMinDiacriticSizeRatio = 1.0625
 
const double kMaxDiacriticDistanceRatio = 1.25
 
const double kMaxDiacriticGapToBaseCharHeight = 1.0
 
const int kSearchRadius = 2
 
const int kLineTrapLongest = 4
 
const int kLineTrapShortest = 2
 
const int kMostlyOneDirRatio = 3
 
const double kLineResidueAspectRatio = 8.0
 
const int kLineResiduePadRatio = 3
 
const double kLineResidueSizeRatio = 1.75
 
const float kSizeRatioToReject = 2.0
 
const int kMaxLargeOverlaps = 3
 
const double kNeighbourSearchFactor = 2.5
 
const double kNoiseOverlapGrowthFactor = 4.0
 
const double kNoiseOverlapAreaFactor = 1.0 / 512
 
const double kShapePerimeterRatio = 3.0
 
const int kTabRadiusFactor = 5
 
const int kMinVerticalSearch = 3
 
const int kMaxVerticalSearch = 12
 
const int kMaxRaggedSearch = 25
 
const int kMinLinesInColumn = 10
 
const double kMinFractionalLinesInColumn = 0.125
 
const double kMinGutterWidthAbsolute = 0.02
 
const double kMaxGutterWidthAbsolute = 2.00
 
const int kRaggedGutterMultiple = 5
 
const double kLineFragmentAspectRatio = 10.0
 
const double kSmoothFactor = 0.25
 
const double kCharVerticalOverlapFraction = 0.375
 
const double kMaxHorizontalGap = 3.0
 
const int kMinEvaluatedTabs = 3
 
const int kMaxTextLineBlobRatio = 5
 
const int kMinTextLineBlobRatio = 3
 
const double kMinImageArea = 0.5
 
const double kCosMaxSkewAngle = 0.866025
 
bool textord_tabfind_show_initialtabs = false
 
bool textord_tabfind_show_finaltabs = false
 
const int kColumnWidthFactor = 20
 
const int kMaxVerticalSpacing = 500
 
const int kMaxBlobWidth = 500
 
const double kSplitPartitionSize = 2.0
 
const double kAllowTextHeight = 0.5
 
const double kAllowTextWidth = 0.6
 
const double kAllowTextArea = 0.8
 
const double kAllowBlobHeight = 0.3
 
const double kAllowBlobWidth = 0.4
 
const double kAllowBlobArea = 0.05
 
const int kMinBoxesInTextPartition = 10
 
const int kMaxBoxesInDataPartition = 20
 
const double kMaxGapInTextPartition = 4.0
 
const double kMinMaxGapInTextPartition = 0.5
 
const double kMaxBlobOverlapFactor = 4.0
 
const double kMaxTableCellXheight = 2.0
 
const int kMaxColumnHeaderDistance = 4
 
const double kTableColumnThreshold = 3.0
 
const int kRulingVerticalMargin = 3
 
const double kMinOverlapWithTable = 0.6
 
const int kSideSpaceMargin = 10
 
const double kSmallTableProjectionThreshold = 0.35
 
const double kLargeTableProjectionThreshold = 0.45
 
const int kLargeTableRowCount = 6
 
const int kMinRowsInTable = 3
 
const double kRequiredFullJustifiedSpacing = 4.0
 
const int kAdjacentLeaderSearchPadding = 2
 
const double kParagraphEndingPreviousLineRatio = 1.3
 
const double kMaxParagraphEndingLeftSpaceMultiple = 3.0
 
const double kMinParagraphEndingTextToWhitespaceRatio = 3.0
 
const double kMaxXProjectionGapFactor = 2.0
 
const double kStrokeWidthFractionalTolerance = 0.25
 
bool textord_dump_table_images = false
 
bool textord_show_tables = false
 
bool textord_tablefind_show_mark = false
 
bool textord_tablefind_show_stats = false
 
bool textord_tablefind_recognize_tables = false
 
const double kHorizontalSpacing = 0.30
 
const double kVerticalSpacing = -0.2
 
const int kCellSplitRowThreshold = 0
 
const int kCellSplitColumnThreshold = 0
 
const int kLinedTableMinVerticalLines = 3
 
const int kLinedTableMinHorizontalLines = 3
 
const double kRequiredColumns = 0.7
 
const double kMarginFactor = 1.1
 
const double kMaxRowSize = 2.5
 
const double kGoodRowNumberOfColumnsSmall [] = { 2, 2, 2, 2, 2, 3, 3 }
 
const int kGoodRowNumberOfColumnsSmallSize
 
const double kGoodRowNumberOfColumnsLarge = 0.7
 
const double kMinFilledArea = 0.35
 
const int kGutterMultiple = 4
 
const int kGutterToNeighbourRatio = 3
 
const int kSimilarVectorDist = 10
 
const int kSimilarRaggedDist = 50
 
const int kMaxFillinMultiple = 11
 
const double kMinGutterFraction = 0.5
 
const double kLineCountReciprocal = 4.0
 
const double kMinAlignedGutter = 0.25
 
const double kMinRaggedGutter = 1.5
 
double textord_tabvector_vertical_gap_fraction = 0.5
 
double textord_tabvector_vertical_box_ratio = 0.5
 
const char * kAlignmentNames []
 
const int kMaxLineLength = 1024
 
const float kRotationRange = 0.02f
 
const int kExposureFactor = 16
 
const int kSaltnPepper = 5
 
const int kMinRampSize = 1000
 
const int kMinLigature = 0xfb00
 
const int kMaxLigature = 0xfb17
 

Detailed Description

The box file is assumed to contain box definitions, one per line, of the following format for blob-level boxes:

*   <UTF8 str> <left> <bottom> <right> <top> <page id>
* 

and for word/line-level boxes:

*   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
* 

NOTES: The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.

<page id> is 0-based, and the page number is used for multipage input (tiff).

In the blob-level form, each line represents a recognizable unit, which may be several UTF-8 bytes, but there is a bounding box around each recognizable unit, and no classifier is needed to train in this mode (bootstrapping.)

In the word/line-level form, the line begins with the literal "WordStr", and the bounding box bounds either a whole line or a whole word. The recognizable units in the word/line are listed after the # at the end of the line and are space delimited, ignoring any original spaces on the line. Eg.

* word -> #w o r d
* multi word line -> #m u l t i w o r d l i n e
* 

The recognizable units must be space-delimited in order to allow multiple unicodes to be used for a single recognizable unit, eg Hindi.

In this mode, the classifier must have been pre-trained with the desired character set, or it will not be able to find the character segmentations.

Make a word from the selected blobs and run Tess on them.

Parameters
page_res  recognise blobs
selection_box  within this box

fp_eval_word_spacing() Evaluation function for fixed pitch word lists.

Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars

build_menu()

Construct the menu tree used by the command window

process_cmd_win_event()

Process a command returned from the command window (Just call the appropriate command handler)

word_blank_and_set_display() Word processor

Blank display of word then redisplay word according to current display mode settings


Public Function Prototypes


Include Files and Type Defines

Typedef Documentation

typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> tesseract::BlobGridSearch

Definition at line 31 of file blobgrid.h.

typedef signed int tesseract::char_32

Definition at line 40 of file string_32.h.

typedef GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> tesseract::ColPartitionGridSearch

Definition at line 913 of file colpartition.h.

typedef BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGrid

Definition at line 118 of file tablefind.h.

typedef GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGridSearch

Definition at line 121 of file tablefind.h.

Definition at line 50 of file dict.h.

typedef int(Dict::* tesseract::DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 81 of file baseapi.h.

typedef bool(* tesseract::FileReader)(const STRING &filename, GenericVector< char > *data)

Definition at line 349 of file genericvector.h.

typedef bool(* tesseract::FileWriter)(const GenericVector< char > &data, const STRING &filename)

Definition at line 352 of file genericvector.h.

typedef void(Wordrec::* tesseract::FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 90 of file baseapi.h.

typedef KDPairInc<int, int> tesseract::IntKDPair

Definition at line 179 of file kdpair.h.

typedef unsigned char tesseract::LanguageModelFlagsType

Used for expressing various language model flags.

Definition at line 37 of file lm_state.h.

typedef hash_map<string, string, StringHash> tesseract::LigHash

Definition at line 32 of file ligature_table.h.

Definition at line 67 of file dawg.h.

typedef float(Dict::* tesseract::ParamsModelClassifyFunc)(const char *lang, void *path)

Definition at line 88 of file baseapi.h.

typedef double(Dict::* tesseract::ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Definition at line 83 of file baseapi.h.

Definition at line 94 of file paragraphs_internal.h.

Definition at line 156 of file shapetable.h.

typedef basic_string<char_32> tesseract::string_32

Definition at line 41 of file string_32.h.

Definition at line 68 of file dawg.h.

Definition at line 69 of file dawg.h.

Definition at line 95 of file baseapi.h.

typedef GenericVector<AmbigSpec_LIST *> tesseract::UnicharAmbigsVector

Definition at line 142 of file ambigs.h.

Definition at line 34 of file ambigs.h.

Definition at line 36 of file tabfind.h.

typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> tesseract::WordGrid

Definition at line 65 of file textord.h.

typedef void(Tesseract::* tesseract::WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)

Definition at line 166 of file tesseractclass.h.

typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> tesseract::WordSearch

Definition at line 66 of file textord.h.

Enumeration Type Documentation

Enumerator
NOT_AMBIG 
REPLACE_AMBIG 
DEFINITE_AMBIG 
SIMILAR_AMBIG 
CASE_AMBIG 
AMBIG_TYPE_COUNT 

Definition at line 44 of file ambigs.h.

44  {
45  NOT_AMBIG, // the ngram pair is not ambiguous
46  REPLACE_AMBIG, // ocred ngram should always be substituted with correct
47  DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
48  SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
49  CASE_AMBIG, // this is a case ambiguity (1-1)
50 
51  AMBIG_TYPE_COUNT // number of enum entries
52 };
Enumerator
CST_FRAGMENT 
CST_WHOLE 
CST_IMPROPER 
CST_NGRAM 

Definition at line 54 of file classify.h.

54  {
55  CST_FRAGMENT, // A partial character.
56  CST_WHOLE, // A correctly segmented character.
57  CST_IMPROPER, // More than one but less than 2 characters.
58  CST_NGRAM // Multiple characters.
59 };
Enumerator
ACTION_1_CMD_EVENT 
RECOG_WERDS 
RECOG_PSEUDO 
ACTION_2_CMD_EVENT 

Definition at line 477 of file tessedit.cpp.

Enumerator
COL_UNKNOWN 
COL_TEXT 
COL_TABLE 
COL_MIXED 
COL_COUNT 

Definition at line 30 of file tablefind.h.

Enumerator
CST_NOISE 
CST_FLOWING 
CST_HEADING 
CST_PULLOUT 
CST_COUNT 

Definition at line 47 of file colpartition.h.

47  {
48  CST_NOISE, // Strictly between columns.
49  CST_FLOWING, // Strictly within a single column.
50  CST_HEADING, // Spans multiple columns.
51  CST_PULLOUT, // Touches multiple columns, but doesn't span them.
52  CST_COUNT // Number of entries.
53 };
Enumerator
CT_UNICHAR_TOP_OK 
CT_UNICHAR_TOP1_ERR 
CT_UNICHAR_TOP2_ERR 
CT_UNICHAR_TOPN_ERR 
CT_UNICHAR_TOPTOP_ERR 
CT_OK_MULTI_UNICHAR 
CT_OK_JOINED 
CT_OK_BROKEN 
CT_REJECT 
CT_FONT_ATTR_ERR 
CT_OK_MULTI_FONT 
CT_NUM_RESULTS 
CT_RANK 
CT_REJECTED_JUNK 
CT_ACCEPTED_JUNK 
CT_SIZE 

Definition at line 69 of file errorcounter.h.

69  {
70  CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id.
71  // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
72  // kRatingEpsilon from the first result in each group. The real top choice
73  // is measured using TOPTOP.
74  CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id.
75  CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id.
76  CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id.
77  CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct.
78  CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others.
79  CT_OK_JOINED, // Top shape id is correct but marked joined.
80  CT_OK_BROKEN, // Top shape id is correct but marked broken.
81  CT_REJECT, // Classifier hates this.
82  CT_FONT_ATTR_ERR, // Top unichar OK, but font attributes incorrect.
83  CT_OK_MULTI_FONT, // CT_FONT_ATTR_OK but there are multiple font attrs.
84  CT_NUM_RESULTS, // Number of answers produced.
85  CT_RANK, // Rank of correct answer.
86  CT_REJECTED_JUNK, // Junk that was correctly rejected.
87  CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise.
88 
89  CT_SIZE // Number of types for array sizing.
90 };
Enumerator
DAWG_TYPE_PUNCTUATION 
DAWG_TYPE_WORD 
DAWG_TYPE_NUMBER 
DAWG_TYPE_PATTERN 
DAWG_TYPE_COUNT 

Definition at line 71 of file dawg.h.

Enumerator
PTRAIN_DIGITS_SHORT 
PTRAIN_DIGITS_MED 
PTRAIN_DIGITS_LONG 
PTRAIN_NUM_SHORT 
PTRAIN_NUM_MED 
PTRAIN_NUM_LONG 
PTRAIN_DOC_SHORT 
PTRAIN_DOC_MED 
PTRAIN_DOC_LONG 
PTRAIN_DICT_SHORT 
PTRAIN_DICT_MED 
PTRAIN_DICT_LONG 
PTRAIN_FREQ_SHORT 
PTRAIN_FREQ_MED 
PTRAIN_FREQ_LONG 
PTRAIN_SHAPE_COST_PER_CHAR 
PTRAIN_NGRAM_COST_PER_CHAR 
PTRAIN_NUM_BAD_PUNC 
PTRAIN_NUM_BAD_CASE 
PTRAIN_XHEIGHT_CONSISTENCY 
PTRAIN_NUM_BAD_CHAR_TYPE 
PTRAIN_NUM_BAD_SPACING 
PTRAIN_NUM_BAD_FONT 
PTRAIN_RATING_PER_CHAR 
PTRAIN_NUM_FEATURE_TYPES 

Definition at line 39 of file params_training_featdef.h.

39  {
40  // Digits
42  PTRAIN_DIGITS_MED, // 1
43  PTRAIN_DIGITS_LONG, // 2
44  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
45  PTRAIN_NUM_SHORT, // 3
46  PTRAIN_NUM_MED, // 4
47  PTRAIN_NUM_LONG, // 5
48  // Document word (DOC_DAWG_PERM)
49  PTRAIN_DOC_SHORT, // 6
50  PTRAIN_DOC_MED, // 7
51  PTRAIN_DOC_LONG, // 8
52  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
53  PTRAIN_DICT_SHORT, // 9
54  PTRAIN_DICT_MED, // 10
55  PTRAIN_DICT_LONG, // 11
56  // Frequent word (FREQ_DAWG_PERM)
57  PTRAIN_FREQ_SHORT, // 12
58  PTRAIN_FREQ_MED, // 13
59  PTRAIN_FREQ_LONG, // 14
62  PTRAIN_NUM_BAD_PUNC, // 17
63  PTRAIN_NUM_BAD_CASE, // 18
67  PTRAIN_NUM_BAD_FONT, // 22
69 
71 };
Enumerator
LR_LEFT 
LR_RIGHT 

Definition at line 39 of file strokewidth.h.

Enumerator
LT_START 
LT_BODY 
LT_UNKNOWN 
LT_MULTIPLE 

Definition at line 54 of file paragraphs_internal.h.

54  {
55  LT_START = 'S', // First line of a paragraph.
56  LT_BODY = 'C', // Continuation line of a paragraph.
57  LT_UNKNOWN = 'U', // No clues.
58  LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
59 };
Enumerator
LM_PPTYPE_BLAMER 
LM_PPTYPE_AMBIG 
LM_PPTYPE_PATH 
LM_PPTYPE_SHAPE 
LM_PPTYPE_NUM 

Definition at line 37 of file lm_pain_points.h.

Enumerator
NPT_HTEXT 
NPT_VTEXT 
NPT_WEAK_HTEXT 
NPT_WEAK_VTEXT 
NPT_IMAGE 
NPT_COUNT 

Definition at line 1558 of file colpartitiongrid.cpp.

1558  {
1559  NPT_HTEXT, // Definite horizontal text.
1560  NPT_VTEXT, // Definite vertical text.
1561  NPT_WEAK_HTEXT, // Weakly horizontal text. Counts as HTEXT for HTEXT, but
1562  // image for image and VTEXT.
1563  NPT_WEAK_VTEXT, // Weakly vertical text. Counts as VTEXT for VTEXT, but
1564  // image for image and HTEXT.
1565  NPT_IMAGE, // Definite non-text.
1566  NPT_COUNT // Number of array elements.
1567 };
Enumerator
NM_BASELINE 
NM_CHAR_ISOTROPIC 
NM_CHAR_ANISOTROPIC 

Definition at line 44 of file normalis.h.

44  {
45  NM_BASELINE = -3, // The original BL normalization mode.
46  NM_CHAR_ISOTROPIC = -2, // Character normalization but isotropic.
47  NM_CHAR_ANISOTROPIC = -1 // The original CN normalization mode.
48 };

When Tesseract/Cube is initialized we can choose to instantiate/load/run only the Tesseract part, only the Cube part or both along with the combiner. The preference of which engine to use is stored in tessedit_ocr_engine_mode.

ATTENTION: When modifying this enum, please make sure to make the appropriate changes to all the enums mirroring it (e.g. OCREngine in cityblock/workflow/detection/detection_storage.proto). Such enums will mention the connection to OcrEngineMode in the comments.

Enumerator
OEM_TESSERACT_ONLY 
OEM_CUBE_ONLY 
OEM_TESSERACT_CUBE_COMBINED 
OEM_DEFAULT 

Definition at line 256 of file publictypes.h.

256  {
257  OEM_TESSERACT_ONLY, // Run Tesseract only - fastest
258  OEM_CUBE_ONLY, // Run Cube only - better accuracy, but slower
259  OEM_TESSERACT_CUBE_COMBINED, // Run both and combine results - best accuracy
260  OEM_DEFAULT // Specify this mode when calling init_*(),
261  // to indicate that any of the above modes
262  // should be automatically inferred from the
263  // variables in the language-specific config,
264  // command-line configs, or if not specified
265  // in any of the above should be set to the
266  // default OEM_TESSERACT_ONLY.
267 };

+------------------+  Orientation Example:
| 1 Aaaa Aaaa Aaaa |  ====================
| Aaa aa aaa aa    |  To left is a diagram of some (1) English and
| aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.
|                2 |
|   #######  c c C |  Upright Latin characters are represented as A and a.
|   #######  c c c |  '<' represents a latin character rotated
| < #######  c c c |      anti-clockwise 90 degrees.
| < #######  c   c |
| < #######  .   c |  Upright Chinese characters are represented C and c.
| 3 #######      c |
+------------------+  NOTA BENE: enum values here should match goodoc.proto

If you orient your head so that "up" aligns with Orientation, then the characters will appear "right side up" and readable.

In the example above, both the English and Chinese paragraphs are oriented so their "up" is the top of the page (page up). The photo credit is read with one's head turned leftward ("up" is to page left).

The values of this enum match the convention of Tesseract's osdetect.h

Enumerator
ORIENTATION_PAGE_UP 
ORIENTATION_PAGE_RIGHT 
ORIENTATION_PAGE_DOWN 
ORIENTATION_PAGE_LEFT 

Definition at line 108 of file publictypes.h.

enum of the elements of the page hierarchy, used in ResultIterator to provide functions that operate on each level without having to have 5x as many functions.

Enumerator
RIL_BLOCK 
RIL_PARA 
RIL_TEXTLINE 
RIL_WORD 
RIL_SYMBOL 

Definition at line 207 of file publictypes.h.

207  {
208  RIL_BLOCK, // Block of text/image/separator line.
209  RIL_PARA, // Paragraph within a block.
210  RIL_TEXTLINE, // Line within a paragraph.
211  RIL_WORD, // Word within a textline.
212  RIL_SYMBOL // Symbol/character within a word.
213 };
Definition: capi.h:76
Definition: capi.h:76
Definition: capi.h:76

Possible modes for page layout analysis. These must be kept in order of decreasing amount of layout analysis to be done, except for OSD_ONLY, so that the inequality test macros below work.

Enumerator
PSM_OSD_ONLY 

Orientation and script detection only.

PSM_AUTO_OSD 

Automatic page segmentation with orientation and script detection. (OSD)

PSM_AUTO_ONLY 

Automatic page segmentation, but no OSD, or OCR.

PSM_AUTO 

Fully automatic page segmentation, but no OSD.

PSM_SINGLE_COLUMN 

Assume a single column of text of variable sizes.

PSM_SINGLE_BLOCK_VERT_TEXT 

Assume a single uniform block of vertically aligned text.

PSM_SINGLE_BLOCK 

Assume a single uniform block of text. (Default.)

PSM_SINGLE_LINE 

Treat the image as a single text line.

PSM_SINGLE_WORD 

Treat the image as a single word.

PSM_CIRCLE_WORD 

Treat the image as a single word in a circle.

PSM_SINGLE_CHAR 

Treat the image as a single character.

PSM_SPARSE_TEXT 

Find as much text as possible in no particular order.

PSM_SPARSE_TEXT_OSD 

Sparse text with orientation and script detection.

PSM_RAW_LINE 

Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

PSM_COUNT 

Number of enum entries.

Definition at line 151 of file publictypes.h.

JUSTIFICATION_UNKNOWN The alignment is not clearly one of the other options. This could happen for example if there are only one or two lines of text or the text looks like source code or poetry.

NOTA BENE: Fully justified paragraphs (text aligned to both left and right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text is written with a left-to-right script and with JUSTIFICATION_RIGHT if their text is written in a right-to-left script.

Interpretation for text read in vertical lines: "Left" is wherever the starting reading position is.

JUSTIFICATION_LEFT Each line, except possibly the first, is flush to the same left tab stop.

JUSTIFICATION_CENTER The text lines of the paragraph are centered about a line going down through the middle of the text lines.

JUSTIFICATION_RIGHT Each line, except possibly the first, is flush to the same right tab stop.

Enumerator
JUSTIFICATION_UNKNOWN 
JUSTIFICATION_LEFT 
JUSTIFICATION_CENTER 
JUSTIFICATION_RIGHT 

Definition at line 239 of file publictypes.h.

Enumerator
PFR_OK 
PFR_SKEW 
PFR_NOISE 

Definition at line 46 of file strokewidth.h.

46  {
47  PFR_OK, // Everything is OK.
48  PFR_SKEW, // Skew was detected and rotated.
49  PFR_NOISE // Noise was detected and removed.
50 };
Enumerator
SP_NORMAL 
SP_SUBSCRIPT 
SP_SUPERSCRIPT 
SP_DROPCAP 

Definition at line 260 of file ratngs.h.

Enumerator
SET_PARAM_CONSTRAINT_NONE 
SET_PARAM_CONSTRAINT_DEBUG_ONLY 
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY 
SET_PARAM_CONSTRAINT_NON_INIT_ONLY 

Definition at line 36 of file params.h.

Enumerator
TA_LEFT_ALIGNED 
TA_LEFT_RAGGED 
TA_CENTER_JUSTIFIED 
TA_RIGHT_ALIGNED 
TA_RIGHT_RAGGED 
TA_SEPARATOR 
TA_COUNT 

Definition at line 43 of file tabvector.h.

Enumerator
TESSDATA_LANG_CONFIG 
TESSDATA_UNICHARSET 
TESSDATA_AMBIGS 
TESSDATA_INTTEMP 
TESSDATA_PFFMTABLE 
TESSDATA_NORMPROTO 
TESSDATA_PUNC_DAWG 
TESSDATA_SYSTEM_DAWG 
TESSDATA_NUMBER_DAWG 
TESSDATA_FREQ_DAWG 
TESSDATA_FIXED_LENGTH_DAWGS 
TESSDATA_CUBE_UNICHARSET 
TESSDATA_CUBE_SYSTEM_DAWG 
TESSDATA_SHAPE_TABLE 
TESSDATA_BIGRAM_DAWG 
TESSDATA_UNAMBIG_DAWG 
TESSDATA_PARAMS_MODEL 
TESSDATA_NUM_ENTRIES 

Definition at line 53 of file tessdatamanager.h.

53  {
56  TESSDATA_AMBIGS, // 2
57  TESSDATA_INTTEMP, // 3
58  TESSDATA_PFFMTABLE, // 4
59  TESSDATA_NORMPROTO, // 5
60  TESSDATA_PUNC_DAWG, // 6
63  TESSDATA_FREQ_DAWG, // 9
64  TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
71 
73 };

The text lines are read in the given sequence.

In English, the order is top-to-bottom. In Chinese, vertical text lines are read right-to-left. Mongolian is written in vertical columns top to bottom like Chinese, but the lines are ordered left-to-right.

Note that only some combinations make sense. For example, WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM

Enumerator
TEXTLINE_ORDER_LEFT_TO_RIGHT 
TEXTLINE_ORDER_RIGHT_TO_LEFT 
TEXTLINE_ORDER_TOP_TO_BOTTOM 

Definition at line 140 of file publictypes.h.

The grapheme clusters within a line of text are laid out logically in this direction, judged when looking at the text line rotated so that its Orientation is "page up".

For English text, the writing direction is left-to-right. For the Chinese text in the above example, the writing direction is top-to-bottom.

Enumerator
WRITING_DIRECTION_LEFT_TO_RIGHT 
WRITING_DIRECTION_RIGHT_TO_LEFT 
WRITING_DIRECTION_TOP_TO_BOTTOM 

Definition at line 123 of file publictypes.h.

Enumerator
XH_GOOD 
XH_SUBNORMAL 
XH_INCONSISTENT 

Definition at line 75 of file dict.h.

Function Documentation

void tesseract::AffineMatrix ( int  writing_direction,
int  line_x1,
int  line_y1,
int  line_x2,
int  line_y2,
double *  a,
double *  b,
double *  c,
double *  d 
)

Definition at line 246 of file pdfrenderer.cpp.

248  {
249  double theta = atan2(static_cast<double>(line_y1 - line_y2),
250  static_cast<double>(line_x2 - line_x1));
251  *a = cos(theta);
252  *b = sin(theta);
253  *c = -sin(theta);
254  *d = cos(theta);
255  switch(writing_direction) {
257  *a = -*a;
258  *b = -*b;
259  break;
261  // TODO(jbreiden) Consider using the vertical PDF writing mode.
262  break;
263  default:
264  break;
265  }
266 }
bool tesseract::AsciiLikelyListItem ( const STRING word)

Definition at line 267 of file paragraphs.cpp.

267  {
268  return LikelyListMark(word) || LikelyListNumeral(word);
269 }
bool LikelyListNumeral(const STRING &word)
Definition: paragraphs.cpp:228
bool LikelyListMark(const STRING &word)
Definition: paragraphs.cpp:262
void tesseract::assign_blobs_to_blocks2 ( Pix *  pix,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  port_blocks 
)

Definition at line 157 of file tordmain.cpp.

159  { // output list
160  BLOCK *block; // current block
161  BLOBNBOX *newblob; // created blob
162  C_BLOB *blob; // current blob
163  BLOCK_IT block_it = blocks;
164  C_BLOB_IT blob_it; // iterator
165  BLOBNBOX_IT port_box_it; // iterator
166  // destination iterator
167  TO_BLOCK_IT port_block_it = port_blocks;
168  TO_BLOCK *port_block; // created block
169 
170  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
171  block = block_it.data();
172  port_block = new TO_BLOCK(block);
173 
174  // Convert the good outlines to block->blob_list
175  port_box_it.set_to_list(&port_block->blobs);
176  blob_it.set_to_list(block->blob_list());
177  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
178  blob = blob_it.extract();
179  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
180  SetBlobStrokeWidth(pix, newblob);
181  port_box_it.add_after_then_move(newblob);
182  }
183 
184  // Put the rejected outlines in block->noise_blobs, which allows them to
185  // be reconsidered and sorted back into rows and recover outlines mistakenly
186  // rejected.
187  port_box_it.set_to_list(&port_block->noise_blobs);
188  blob_it.set_to_list(block->reject_blobs());
189  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
190  blob = blob_it.extract();
191  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
192  SetBlobStrokeWidth(pix, newblob);
193  port_box_it.add_after_then_move(newblob);
194  }
195 
196  port_block_it.add_after_then_move(port_block);
197  }
198 }
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
Definition: ocrblock.h:30
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:770
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:135
void SetBlobStrokeWidth(Pix *pix, BLOBNBOX *blob)
Definition: tordmain.cpp:58
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
TrainingSample * tesseract::BlobToTrainingSample ( const TBLOB blob,
bool  nonlinear_norm,
INT_FX_RESULT_STRUCT fx_info,
GenericVector< INT_FEATURE_STRUCT > *  bl_features 
)

Definition at line 81 of file intfx.cpp.

83  {
85  Classify::ExtractFeatures(blob, nonlinear_norm, bl_features,
86  &cn_features, fx_info, NULL);
87  // TODO(rays) Use blob->PreciseBoundingBox() instead.
88  TBOX box = blob.bounding_box();
90  int num_features = fx_info->NumCN;
91  if (num_features > 0) {
92  sample = TrainingSample::CopyFromFeatures(*fx_info, box, &cn_features[0],
93  num_features);
94  }
95  if (sample != NULL) {
96  // Set the bounding box (in original image coordinates) in the sample.
97  TPOINT topleft, botright;
98  topleft.x = box.left();
99  topleft.y = box.top();
100  botright.x = box.right();
101  botright.y = box.bottom();
102  TPOINT original_topleft, original_botright;
103  blob.denorm().DenormTransform(NULL, topleft, &original_topleft);
104  blob.denorm().DenormTransform(NULL, botright, &original_botright);
105  sample->set_bounding_box(TBOX(original_topleft.x, original_botright.y,
106  original_botright.x, original_topleft.y));
107  }
108  return sample;
109 }
void set_bounding_box(const TBOX &box)
inT16 y
Definition: blobs.h:72
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:389
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:50
inT16 x
Definition: blobs.h:71
const DENORM & denorm() const
Definition: blobs.h:340
inT16 bottom() const
Definition: rect.h:61
Definition: cluster.h:32
Definition: rect.h:30
#define NULL
Definition: host.h:144
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
Pix* tesseract::CairoARGB32ToPixFormat ( cairo_surface_t *  surface)

Definition at line 78 of file stringrenderer.cpp.

78  {
79  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
80  printf("Unexpected surface format %d\n",
81  cairo_image_surface_get_format(surface));
82  return NULL;
83  }
84  const int width = cairo_image_surface_get_width(surface);
85  const int height = cairo_image_surface_get_height(surface);
86  Pix* pix = pixCreate(width, height, 32);
87  int byte_stride = cairo_image_surface_get_stride(surface);
88 
89  for (int i = 0; i < height; ++i) {
90  memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
91  cairo_image_surface_get_data(surface) + i * byte_stride,
92  byte_stride - ((i == height - 1) ? 1 : 0));
93  }
94  return pix;
95 }
#define NULL
Definition: host.h:144
void tesseract::CalculateTabStops ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
int  tolerance,
GenericVector< Cluster > *  left_tabs,
GenericVector< Cluster > *  right_tabs 
)

Definition at line 691 of file paragraphs.cpp.

695  {
696  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
697  return;
698  // First pass: toss all left and right indents into clusterers.
699  SimpleClusterer initial_lefts(tolerance);
700  SimpleClusterer initial_rights(tolerance);
701  GenericVector<Cluster> initial_left_tabs;
702  GenericVector<Cluster> initial_right_tabs;
703  for (int i = row_start; i < row_end; i++) {
704  initial_lefts.Add((*rows)[i].lindent_);
705  initial_rights.Add((*rows)[i].rindent_);
706  }
707  initial_lefts.GetClusters(&initial_left_tabs);
708  initial_rights.GetClusters(&initial_right_tabs);
709 
710  // Second pass: cluster only lines that are not "stray"
711  // An example of a stray line is a page number -- a line whose start
712  // and end tab-stops are far outside the typical start and end tab-stops
713  // for the block.
714  // Put another way, we only cluster data from lines whose start or end
715  // tab stop is frequent.
716  SimpleClusterer lefts(tolerance);
717  SimpleClusterer rights(tolerance);
718 
719  // Outlier elimination. We might want to switch this to test outlier-ness
720  // based on how strange a position an outlier is in instead of or in addition
721  // to how rare it is. These outliers get re-added if we end up having too
722  // few tab stops, to work with, however.
723  int infrequent_enough_to_ignore = 0;
724  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
725  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
726 
727  for (int i = row_start; i < row_end; i++) {
728  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
729  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
730  if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
731  initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
732  lefts.Add((*rows)[i].lindent_);
733  rights.Add((*rows)[i].rindent_);
734  }
735  }
736  lefts.GetClusters(left_tabs);
737  rights.GetClusters(right_tabs);
738 
739  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
740  (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
741  // One side is really ragged, and the other only has one tab stop,
742  // so those "insignificant outliers" are probably important, actually.
743  // This often happens on a page of an index. Add back in the ones
744  // we omitted in the first pass.
745  for (int i = row_start; i < row_end; i++) {
746  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
747  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
748  if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
749  initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
750  lefts.Add((*rows)[i].lindent_);
751  rights.Add((*rows)[i].rindent_);
752  }
753  }
754  }
755  lefts.GetClusters(left_tabs);
756  rights.GetClusters(right_tabs);
757 
758  // If one side is almost a two-indent aligned side, and the other clearly
759  // isn't, try to prune out the least frequent tab stop from that side.
760  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
761  int to_prune = -1;
762  for (int i = left_tabs->size() - 1; i >= 0; i--) {
763  if (to_prune < 0 ||
764  (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
765  to_prune = i;
766  }
767  }
768  if (to_prune >= 0 &&
769  (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
770  left_tabs->remove(to_prune);
771  }
772  }
773  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
774  int to_prune = -1;
775  for (int i = right_tabs->size() - 1; i >= 0; i--) {
776  if (to_prune < 0 ||
777  (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
778  to_prune = i;
779  }
780  }
781  if (to_prune >= 0 &&
782  (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
783  right_tabs->remove(to_prune);
784  }
785  }
786 }
int size() const
Definition: genericvector.h:72
void remove(int index)
int count(LIST var_list)
Definition: oldlist.cpp:108
int ClosestCluster(const GenericVector< Cluster > &clusters, int value)
Definition: paragraphs.cpp:665
void tesseract::CallWithUTF8 ( TessCallback1< const char * > *  cb,
const WERD_CHOICE wc 
)

Definition at line 112 of file dawg.cpp.

112  {
113  STRING s;
114  wc->string_and_lengths(&s, NULL);
115  cb->Run(s.string());
116 }
virtual void Run(A1)=0
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::CanonicalizeDetectionResults ( GenericVector< PARA * > *  row_owners,
PARA_LIST *  paragraphs 
)

Definition at line 2232 of file paragraphs.cpp.

2234  {
2235  GenericVector<PARA *> &rows = *row_owners;
2236  paragraphs->clear();
2237  PARA_IT out(paragraphs);
2238  PARA *formerly_null = NULL;
2239  for (int i = 0; i < rows.size(); i++) {
2240  if (rows[i] == NULL) {
2241  if (i == 0 || rows[i - 1] != formerly_null) {
2242  rows[i] = formerly_null = new PARA();
2243  } else {
2244  rows[i] = formerly_null;
2245  continue;
2246  }
2247  } else if (i > 0 && rows[i - 1] == rows[i]) {
2248  continue;
2249  }
2250  out.add_after_then_move(rows[i]);
2251  }
2252 }
int size() const
Definition: genericvector.h:72
Definition: ocrpara.h:29
#define NULL
Definition: host.h:144
TBOX tesseract::char_box_to_tbox ( Box *  char_box,
TBOX  word_box,
int  x_offset 
)

Definition at line 42 of file cube_control.cpp.

42  {
43  l_int32 left;
44  l_int32 top;
45  l_int32 width;
46  l_int32 height;
47  l_int32 right;
48  l_int32 bottom;
49 
50  boxGetGeometry(char_box, &left, &top, &width, &height);
51  left += word_box.left() - x_offset;
52  right = left + width;
53  top = word_box.bottom() + word_box.height() - top;
54  bottom = top - height;
55  return TBOX(left, bottom, right, top);
56 }
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
inT16 height() const
Definition: rect.h:104
Definition: rect.h:30
void tesseract::ClearFeatureSpaceWindow ( NORM_METHOD  norm_method,
ScrollView *  window 
)

Clears the given window and draws the featurespace guides for the appropriate normalization method.

Definition at line 1104 of file intproto.cpp.

1104  {
1105  window->Clear();
1106 
1107  window->Pen(ScrollView::GREY);
1108  // Draw the feature space limit rectangle.
1109  window->Rectangle(0, 0, INT_MAX_X, INT_MAX_Y);
1110  if (norm_method == baseline) {
1111  window->SetCursor(0, INT_DESCENDER);
1112  window->DrawTo(INT_MAX_X, INT_DESCENDER);
1113  window->SetCursor(0, INT_BASELINE);
1114  window->DrawTo(INT_MAX_X, INT_BASELINE);
1115  window->SetCursor(0, INT_XHEIGHT);
1116  window->DrawTo(INT_MAX_X, INT_XHEIGHT);
1117  window->SetCursor(0, INT_CAPHEIGHT);
1118  window->DrawTo(INT_MAX_X, INT_CAPHEIGHT);
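1119  } else {
1120  window->Rectangle(INT_XCENTER - INT_XRADIUS, INT_YCENTER - INT_YRADIUS,
1121  INT_XCENTER + INT_XRADIUS, INT_YCENTER + INT_YRADIUS);
1122  }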
1123 }
void Pen(Color color)
Definition: scrollview.cpp:726
#define INT_YCENTER
Definition: intproto.cpp:62
void DrawTo(int x, int y)
Definition: scrollview.cpp:531
#define INT_YRADIUS
Definition: intproto.cpp:64
#define INT_MAX_X
Definition: intproto.cpp:67
#define INT_XRADIUS
Definition: intproto.cpp:63
void Clear()
Definition: scrollview.cpp:595
#define INT_XCENTER
Definition: intproto.cpp:61
#define INT_CAPHEIGHT
Definition: intproto.cpp:59
#define INT_DESCENDER
Definition: intproto.cpp:56
void SetCursor(int x, int y)
Definition: scrollview.cpp:525
#define INT_BASELINE
Definition: intproto.cpp:57
#define INT_MAX_Y
Definition: intproto.cpp:68
#define INT_XHEIGHT
Definition: intproto.cpp:58
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
void tesseract::ClipBaseline ( int  ppi,
int  x1,
int  y1,
int  x2,
int  y2,
int *  line_x1,
int *  line_y1,
int *  line_x2,
int *  line_y2 
)

Definition at line 275 of file pdfrenderer.cpp.

277  {
278  *line_x1 = x1;
279  *line_y1 = y1;
280  *line_x2 = x2;
281  *line_y2 = y2;
282  double rise = abs(y2 - y1) * 72 / ppi;
283  double run = abs(x2 - x1) * 72 / ppi;
284  if (rise < 2.0 && 2.0 < run)
285  *line_y1 = *line_y2 = (y1 + y2) / 2;
286 }
int tesseract::ClosestCluster ( const GenericVector< Cluster > &  clusters,
int  value 
)

Definition at line 665 of file paragraphs.cpp.

665  {
666  int best_index = 0;
667  for (int i = 0; i < clusters.size(); i++) {
668  if (abs(value - clusters[i].center) <
669  abs(value - clusters[best_index].center))
670  best_index = i;
671  }
672  return best_index;
673 }
int size() const
Definition: genericvector.h:72
template<typename T >
bool tesseract::cmp_eq ( T const &  t1,
T const &  t2 
)

Definition at line 382 of file genericvector.h.

382  {
383  return t1 == t2;
384 }
bool tesseract::CompareFontInfo ( const FontInfo &  fi1,
const FontInfo &  fi2 
)

Definition at line 120 of file fontinfo.cpp.

120  {
121  // The font properties are required to be the same for two font with the same
122  // name, so there is no need to test them.
123  // Consequently, querying the table with only its font name as information is
124  // enough to retrieve its properties.
125  return strcmp(fi1.name, fi2.name) == 0;
126 }
bool tesseract::CompareFontSet ( const FontSet &  fs1,
const FontSet &  fs2 
)

Definition at line 128 of file fontinfo.cpp.

128  {
129  if (fs1.size != fs2.size)
130  return false;
131  for (int i = 0; i < fs1.size; ++i) {
132  if (fs1.configs[i] != fs2.configs[i])
133  return false;
134  }
135  return true;
136 }
void tesseract::ConvertHypothesizedModelRunsToParagraphs ( int  debug_level,
const GenericVector< RowScratchRegisters > &  rows,
GenericVector< PARA * > *  row_owners,
ParagraphTheory *  theory 
)

Definition at line 2041 of file paragraphs.cpp.

2045  {
2046  int end = rows.size();
2047  int start;
2048  for (; end > 0; end = start) {
2049  start = end - 1;
2050  const ParagraphModel *model = NULL;
2051  // TODO(eger): Be smarter about dealing with multiple hypotheses.
2052  bool single_line_paragraph = false;
2053  SetOfModels models;
2054  rows[start].NonNullHypotheses(&models);
2055  if (models.size() > 0) {
2056  model = models[0];
2057  if (rows[start].GetLineType(model) != LT_BODY)
2058  single_line_paragraph = true;
2059  }
2060  if (model && !single_line_paragraph) {
2061  // walk back looking for more body lines and then a start line.
2062  while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2063  // do nothing
2064  }
2065  if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2066  model = NULL;
2067  }
2068  }
2069  if (model == NULL) {
2070  continue;
2071  }
2072  // rows[start, end) should be a paragraph.
2073  PARA *p = new PARA();
2074  if (model == kCrownLeft || model == kCrownRight) {
2075  p->is_very_first_or_continuation = true;
2076  // Crown paragraph.
2077  // If we can find an existing ParagraphModel that fits, use it,
2078  // else create a new one.
2079  for (int row = end; row < rows.size(); row++) {
2080  if ((*row_owners)[row] &&
2081  (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2082  (start == 0 ||
2083  ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2084  model = (*row_owners)[row]->model;
2085  break;
2086  }
2087  }
2088  if (model == kCrownLeft) {
2089  // No subsequent model fits, so cons one up.
2090  model = theory->AddModel(ParagraphModel(
2091  JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
2092  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2093  } else if (model == kCrownRight) {
2094  // No subsequent model fits, so cons one up.
2095  model = theory->AddModel(ParagraphModel(
2096  JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_,
2097  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2098  }
2099  }
2100  rows[start].SetUnknown();
2101  rows[start].AddStartLine(model);
2102  for (int i = start + 1; i < end; i++) {
2103  rows[i].SetUnknown();
2104  rows[i].AddBodyLine(model);
2105  }
2106  p->model = model;
2107  p->has_drop_cap = rows[start].ri_->has_drop_cap;
2108  p->is_list_item =
2109  model->justification() == JUSTIFICATION_RIGHT
2110  ? rows[start].ri_->rword_indicates_list_item
2111  : rows[start].ri_->lword_indicates_list_item;
2112  for (int row = start; row < end; row++) {
2113  if ((*row_owners)[row] != NULL) {
2114  tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2115  "more than once!\n");
2116  }
2117  (*row_owners)[row] = p;
2118  }
2119  }
2120 }
int size() const
Definition: genericvector.h:72
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
#define tprintf(...)
Definition: tprintf.h:31
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47
bool is_list_item
Definition: ocrpara.h:38
GenericVectorEqEq< const ParagraphModel * > SetOfModels
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: ocrpara.h:29
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45
bool is_very_first_or_continuation
Definition: ocrpara.h:43
#define NULL
Definition: host.h:144
const ParagraphModel * model
Definition: ocrpara.h:36
bool has_drop_cap
Definition: ocrpara.h:46
bool tesseract::CrownCompatible ( const GenericVector< RowScratchRegisters > *  rows,
int  a,
int  b,
const ParagraphModel *  model 
)

Definition at line 1288 of file paragraphs.cpp.

1289  {
1290  if (model != kCrownRight && model != kCrownLeft) {
1291  tprintf("CrownCompatible() should only be called with crown models!\n");
1292  return false;
1293  }
1294  RowScratchRegisters &row_a = (*rows)[a];
1295  RowScratchRegisters &row_b = (*rows)[b];
1296  if (model == kCrownRight) {
1297  return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1298  row_b.rindent_ + row_b.rmargin_,
1299  Epsilon(row_a.ri_->average_interword_space));
1300  }
1301  return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1302  row_b.lindent_ + row_b.lmargin_,
1303  Epsilon(row_a.ri_->average_interword_space));
1304 }
#define tprintf(...)
Definition: tprintf.h:31
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:148
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45
int tesseract::CubeAPITest ( Boxa *  boxa_blocks,
Pixa *  pixa_blocks,
Boxa *  boxa_words,
Pixa *  pixa_words,
const FCOORD &  reskew,
Pix *  page_pix,
PAGE_RES *  page_res 
)

Placeholder for call to Cube and test that the input data is correct. reskew is the direction of baselines in the skewed image in normalized (cos theta, sin theta) form, so (0.866, 0.5) would represent a 30 degree anticlockwise skew.

Definition at line 757 of file baseapi.cpp.

760  {
761  int block_count = boxaGetCount(boxa_blocks);
762  ASSERT_HOST(block_count == pixaGetCount(pixa_blocks));
763  // Write each block to the current directory as junk_write_display.nnn.png.
764  for (int i = 0; i < block_count; ++i) {
765  Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE);
766  pixDisplayWrite(pix, 1);
767  }
768  int word_count = boxaGetCount(boxa_words);
769  ASSERT_HOST(word_count == pixaGetCount(pixa_words));
770  int pr_word = 0;
771  PAGE_RES_IT page_res_it(page_res);
772  for (page_res_it.restart_page(); page_res_it.word () != NULL;
773  page_res_it.forward(), ++pr_word) {
774  WERD_RES *word = page_res_it.word();
775  WERD_CHOICE* choice = word->best_choice;
776  // Write the first 100 words to files names wordims/<wordstring>.tif.
777  if (pr_word < 100) {
778  STRING filename("wordims/");
779  if (choice != NULL) {
780  filename += choice->unichar_string();
781  } else {
782  char numbuf[32];
783  filename += "unclassified";
784  snprintf(numbuf, 32, "%03d", pr_word);
785  filename += numbuf;
786  }
787  filename += ".tif";
788  Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE);
789  pixWrite(filename.string(), pix, IFF_TIFF_G4);
790  }
791  }
792  ASSERT_HOST(pr_word == word_count);
793  return 0;
794 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
WERD * word
Definition: pageres.h:175
Definition: strngs.h:44
#define NULL
Definition: host.h:144
struct Pix * tesseract::DegradeImage ( Pix *  input,
int  exposure,
TRand *  randomizer,
float *  rotation 
)

Definition at line 65 of file degradeimage.cpp.

66  {
67  Pix* pix = pixConvertTo8(input, false);
68  pixDestroy(&input);
69  input = pix;
70  int width = pixGetWidth(input);
71  int height = pixGetHeight(input);
72  if (exposure >= 2) {
73  // An erosion simulates the spreading darkening of a dark copy.
74  // This is backwards to binary morphology,
75  // see http://www.leptonica.com/grayscale-morphology.html
76  pix = input;
77  input = pixErodeGray(pix, 3, 3);
78  pixDestroy(&pix);
79  }
80  // A convolution is essential to any mode as no scanner produces an
81  // image as sharp as the electronic image.
82  pix = pixBlockconv(input, 1, 1);
83  pixDestroy(&input);
84  // A small random rotation helps to make the edges jaggy in a realistic way.
85  if (rotation != NULL) {
86  float radians_clockwise = 0.0f;
87  if (*rotation) {
88  radians_clockwise = *rotation;
89  } else if (randomizer != NULL) {
90  radians_clockwise = randomizer->SignedRand(kRotationRange);
91  }
92 
93  input = pixRotate(pix, radians_clockwise,
94  L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
95  0, 0);
96  // Rotate the boxes to match.
97  *rotation = radians_clockwise;
98  pixDestroy(&pix);
99  } else {
100  input = pix;
101  }
102 
103  if (exposure >= 3 || exposure == 1) {
104  // Erosion after the convolution is not as heavy as before, so it is
105  // good for level 1 and in addition as a level 3.
106  // This is backwards to binary morphology,
107  // see http://www.leptonica.com/grayscale-morphology.html
108  pix = input;
109  input = pixErodeGray(pix, 3, 3);
110  pixDestroy(&pix);
111  }
112  // The convolution really needed to be 2x2 to be realistic enough, but
113  // we only have 3x3, so we have to bias the image darker or lose thin
114  // strokes.
115  int erosion_offset = 0;
116  // For light and 0 exposure, there is no dilation, so compensate for the
117  // convolution with a big darkening bias which is undone for lighter
118  // exposures.
119  if (exposure <= 0)
120  erosion_offset = -3 * kExposureFactor;
121  // Add in a general offset of the greyscales for the exposure level so
122  // a threshold of 128 gives a reasonable binary result.
123  erosion_offset -= exposure * kExposureFactor;
124  // Add a gradual fade over the page and a small amount of salt and pepper
125  // noise to simulate noise in the sensor/paper fibres and varying
126  // illumination.
127  l_uint32* data = pixGetData(input);
128  for (int y = 0; y < height; ++y) {
129  for (int x = 0; x < width; ++x) {
130  int pixel = GET_DATA_BYTE(data, x);
131  if (randomizer != NULL)
132  pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper;
133  if (height + width > kMinRampSize)
134  pixel -= (2*x + y) * 32 / (height + width);
135  pixel += erosion_offset;
136  if (pixel < 0)
137  pixel = 0;
138  if (pixel > 255)
139  pixel = 255;
140  SET_DATA_BYTE(data, x, pixel);
141  }
142  data += input->wpl;
143  }
144  return input;
145 }
const int kExposureFactor
const float kRotationRange
const int kSaltnPepper
const int kMinRampSize
#define NULL
Definition: host.h:144
template<typename T >
void tesseract::DeleteObject ( T *  object)

Definition at line 165 of file tablefind.cpp.

165  {
166  delete object;
167 }
void tesseract::DetectParagraphs ( int  debug_level,
GenericVector< RowInfo > *  row_infos,
GenericVector< PARA * > *  row_owners,
PARA_LIST *  paragraphs,
GenericVector< ParagraphModel * > *  models 
)

Definition at line 2264 of file paragraphs.cpp.

2268  {
2269  GenericVector<RowScratchRegisters> rows;
2270  ParagraphTheory theory(models);
2271 
2272  // Initialize row_owners to be a bunch of NULL pointers.
2273  row_owners->init_to_size(row_infos->size(), NULL);
2274 
2275  // Set up row scratch registers for the main algorithm.
2276  rows.init_to_size(row_infos->size(), RowScratchRegisters());
2277  for (int i = 0; i < row_infos->size(); i++) {
2278  rows[i].Init((*row_infos)[i]);
2279  }
2280 
2281  // Pass 1:
2282  // Detect sequences of lines that all contain leader dots (.....)
2283  // These are likely Tables of Contents. If there are three text lines in
2284  // a row with leader dots, it's pretty safe to say the middle one should
2285  // be a paragraph of its own.
2286  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2287 
2288  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2289 
2290  GenericVector<Interval> leftovers;
2291  LeftoverSegments(rows, &leftovers, 0, rows.size());
2292  for (int i = 0; i < leftovers.size(); i++) {
2293  // Pass 2a:
2294  // Find any strongly evidenced start-of-paragraph lines. If they're
2295  // followed by two lines that look like body lines, make a paragraph
2296  // model for that and see if that model applies throughout the text
2297  // (that is, "smear" it).
2298  StrongEvidenceClassify(debug_level, &rows,
2299  leftovers[i].begin, leftovers[i].end, &theory);
2300 
2301  // Pass 2b:
2302  // If we had any luck in pass 2a, we got part of the page and didn't
2303  // know how to classify a few runs of rows. Take the segments that
2304  // didn't find a model and reprocess them individually.
2305  GenericVector<Interval> leftovers2;
2306  LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2307  bool pass2a_was_useful = leftovers2.size() > 1 ||
2308  (leftovers2.size() == 1 &&
2309  (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
2310  if (pass2a_was_useful) {
2311  for (int j = 0; j < leftovers2.size(); j++) {
2312  StrongEvidenceClassify(debug_level, &rows,
2313  leftovers2[j].begin, leftovers2[j].end,
2314  &theory);
2315  }
2316  }
2317  }
2318 
2319  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2320 
2321  // Pass 3:
2322  // These are the dregs for which we didn't have enough strong textual
2323  // and geometric clues to form matching models for. Let's see if
2324  // the geometric clues are simple enough that we could just use those.
2325  LeftoverSegments(rows, &leftovers, 0, rows.size());
2326  for (int i = 0; i < leftovers.size(); i++) {
2327  GeometricClassify(debug_level, &rows,
2328  leftovers[i].begin, leftovers[i].end, &theory);
2329  }
2330 
2331  // Undo any flush models for which there's little evidence.
2332  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2333 
2334  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2335 
2336  // Pass 4:
2337  // Take everything that's still not marked up well and clear all markings.
2338  LeftoverSegments(rows, &leftovers, 0, rows.size());
2339  for (int i = 0; i < leftovers.size(); i++) {
2340  for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2341  rows[j].SetUnknown();
2342  }
2343  }
2344 
2345  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2346 
2347  // Convert all of the unique hypothesis runs to PARAs.
2348  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2349  &theory);
2350 
2351  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2352 
2353  // Finally, clean up any dangling NULL row paragraph parents.
2354  CanonicalizeDetectionResults(row_owners, paragraphs);
2355 }
int size() const
Definition: genericvector.h:72
void ConvertHypothesizedModelRunsToParagraphs(int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory)
void LeftoverSegments(const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
void init_to_size(int size, T t)
void SeparateSimpleLeaderLines(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
#define NULL
Definition: host.h:144
void GeometricClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void StrongEvidenceClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void tesseract::DetectParagraphs ( int  debug_level,
bool  after_text_recognition,
const MutableIterator *  block_start,
GenericVector< ParagraphModel * > *  models 
)

Definition at line 2509 of file paragraphs.cpp.

2512  {
2513  // Clear out any preconceived notions.
2514  if (block_start->Empty(RIL_TEXTLINE)) {
2515  return;
2516  }
2517  BLOCK *block = block_start->PageResIt()->block()->block;
2518  block->para_list()->clear();
2519  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
2520 
2521  // Convert the Tesseract structures to RowInfos
2522  // for the paragraph detection algorithm.
2523  MutableIterator row(*block_start);
2524  if (row.Empty(RIL_TEXTLINE))
2525  return; // end of input already.
2526 
2527  GenericVector<RowInfo> row_infos;
2528  do {
2529  if (!row.PageResIt()->row())
2530  continue; // empty row.
2531  row.PageResIt()->row()->row->set_para(NULL);
2532  row_infos.push_back(RowInfo());
2533  RowInfo &ri = row_infos.back();
2534  InitializeRowInfo(after_text_recognition, row, &ri);
2535  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
2536  row.Next(RIL_TEXTLINE));
2537 
2538  // If we're called before text recognition, we might not have
2539  // tight block bounding boxes, so trim by the minimum on each side.
2540  if (row_infos.size() > 0) {
2541  int min_lmargin = row_infos[0].pix_ldistance;
2542  int min_rmargin = row_infos[0].pix_rdistance;
2543  for (int i = 1; i < row_infos.size(); i++) {
2544  if (row_infos[i].pix_ldistance < min_lmargin)
2545  min_lmargin = row_infos[i].pix_ldistance;
2546  if (row_infos[i].pix_rdistance < min_rmargin)
2547  min_rmargin = row_infos[i].pix_rdistance;
2548  }
2549  if (min_lmargin > 0 || min_rmargin > 0) {
2550  for (int i = 0; i < row_infos.size(); i++) {
2551  row_infos[i].pix_ldistance -= min_lmargin;
2552  row_infos[i].pix_rdistance -= min_rmargin;
2553  }
2554  }
2555  }
2556 
2557  // Run the paragraph detection algorithm.
2558  GenericVector<PARA *> row_owners;
2559  GenericVector<PARA *> the_paragraphs;
2560  if (!is_image_block) {
2561  DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
2562  models);
2563  } else {
2564  row_owners.init_to_size(row_infos.size(), NULL);
2565  CanonicalizeDetectionResults(&row_owners, block->para_list());
2566  }
2567 
2568  // Now stitch in the row_owners into the rows.
2569  row = *block_start;
2570  for (int i = 0; i < row_owners.size(); i++) {
2571  while (!row.PageResIt()->row())
2572  row.Next(RIL_TEXTLINE);
2573  row.PageResIt()->row()->row->set_para(row_owners[i]);
2574  row.Next(RIL_TEXTLINE);
2575  }
2576 }
int size() const
Definition: genericvector.h:72
int push_back(T object)
void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info)
T & back() const
bool IsText() const
Definition: polyblk.h:52
Definition: capi.h:76
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
PARA_LIST * para_list()
Definition: ocrblock.h:128
Definition: ocrblock.h:30
void init_to_size(int size, T t)
#define NULL
Definition: host.h:144
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void DetectParagraphs(int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models)
void tesseract::DiscardUnusedModels ( const GenericVector< RowScratchRegisters > &  rows,
ParagraphTheory *  theory 
)

Definition at line 1455 of file paragraphs.cpp.

1456  {
1457  SetOfModels used_models;
1458  for (int i = 0; i < rows.size(); i++) {
1459  rows[i].StrongHypotheses(&used_models);
1460  }
1461  theory->DiscardUnusedModels(used_models);
1462 }
int size() const
Definition: genericvector.h:72
GenericVectorEqEq< const ParagraphModel * > SetOfModels
long tesseract::dist2 ( int  x1,
int  y1,
int  x2,
int  y2 
)

Definition at line 192 of file pdfrenderer.cpp.

192  {
193  return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
194 }
void tesseract::DowngradeWeakestToCrowns ( int  debug_level,
ParagraphTheory *  theory,
GenericVector< RowScratchRegisters > *  rows 
)

Definition at line 1488 of file paragraphs.cpp.

1490  {
1491  int start;
1492  for (int end = rows->size(); end > 0; end = start) {
1493  // Search back for a body line of a unique type.
1494  const ParagraphModel *model = NULL;
1495  while (end > 0 &&
1496  (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
1497  end--;
1498  }
1499  if (end == 0) break;
1500  start = end - 1;
1501  while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1502  start--; // walk back to the first line that is not the same body type.
1503  }
1504  if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1505  StrongModel(model) &&
1506  NearlyEqual(model->first_indent(), model->body_indent(),
1507  model->tolerance())) {
1508  start--;
1509  }
1510  start++;
1511  // Now rows[start, end) is a sequence of unique body hypotheses of model.
1512  if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
1513  continue;
1514  if (!StrongModel(model)) {
1515  while (start > 0 &&
1516  CrownCompatible(rows, start - 1, start, model))
1517  start--;
1518  }
1519  if (start == 0 ||
1520  (!StrongModel(model)) ||
1521  (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1522  // crownify rows[start, end)
1523  const ParagraphModel *crown_model = model;
1524  if (StrongModel(model)) {
1525  if (model->justification() == JUSTIFICATION_LEFT)
1526  crown_model = kCrownLeft;
1527  else
1528  crown_model = kCrownRight;
1529  }
1530  (*rows)[start].SetUnknown();
1531  (*rows)[start].AddStartLine(crown_model);
1532  for (int row = start + 1; row < end; row++) {
1533  (*rows)[row].SetUnknown();
1534  (*rows)[row].AddBodyLine(crown_model);
1535  }
1536  }
1537  }
1538  DiscardUnusedModels(*rows, theory);
1539 }
int size() const
Definition: genericvector.h:72
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
int body_indent() const
Definition: ocrpara.h:169
int tolerance() const
Definition: ocrpara.h:170
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47
bool StrongModel(const ParagraphModel *model)
int first_indent() const
Definition: ocrpara.h:168
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:148
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45
#define NULL
Definition: host.h:144
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
tesseract::ELISTIZE ( ViterbiStateEntry  )
tesseract::ELISTIZE ( AmbigSpec  )
tesseract::ELISTIZEH ( AmbigSpec  )
tesseract::ELISTIZEH ( ViterbiStateEntry  )
void tesseract::ExtractFontName ( const STRING &  filename,
STRING *  fontname 
)

Public Code

Definition at line 46 of file blobclass.cpp.

46  {
47  *fontname = classify_font_name;
48  if (*fontname == kUnknownFontName) {
49  // filename is expected to be of the form [lang].[fontname].exp[num]
50  // The [lang], [fontname] and [num] fields should not have '.' characters.
51  const char *basename = strrchr(filename.string(), '/');
52  const char *firstdot = strchr(basename ? basename : filename.string(), '.');
53  const char *lastdot = strrchr(filename.string(), '.');
54  if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
55  ++firstdot;
56  *fontname = firstdot;
57  fontname->truncate_at(lastdot - firstdot);
58  }
59  }
60 }
void truncate_at(inT32 index)
Definition: strngs.cpp:264
#define NULL
Definition: host.h:144
char * classify_font_name
Definition: blobclass.cpp:35
const char * string() const
Definition: strngs.cpp:193
void tesseract::ExtractFontProperties ( const string &  utf8_text,
StringRenderer *  render,
const string &  output_base 
)

Definition at line 212 of file text2image.cpp.

214  {
215  map<string, SpacingProperties> spacing_map;
216  map<string, SpacingProperties>::iterator spacing_map_it0;
217  map<string, SpacingProperties>::iterator spacing_map_it1;
218  int x_bearing, x_advance;
219  int len = utf8_text.length();
220  int offset = 0;
221  const char* text = utf8_text.c_str();
222  while (offset < len) {
223  offset += render->RenderToImage(text + offset, strlen(text + offset), NULL);
224  const vector<BoxChar*> &boxes = render->GetBoxes();
225 
226  // If the page break split a bigram, correct the offset so we try the bigram
227  // on the next iteration.
228  if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) &&
229  IsWhitespaceBox(boxes[boxes.size() - 2])) {
230  if (boxes.size() > 3) {
231  tprintf("WARNING: Adjusting to bad page break after '%s%s'\n",
232  boxes[boxes.size() - 4]->ch().c_str(),
233  boxes[boxes.size() - 3]->ch().c_str());
234  }
235  offset -= boxes[boxes.size() - 1]->ch().size();
236  }
237 
238  for (int b = 0; b < boxes.size(); b += 2) {
239  while (b < boxes.size() && IsWhitespaceBox(boxes[b])) ++b;
240  if (b + 1 >= boxes.size()) break;
241  const string &ch0 = boxes[b]->ch();
242  // We encountered a ligature. This happens in at least two scenarios:
243  // One is when the rendered bigram forms a grapheme cluster (eg. the
244  // second character in the bigram is a combining vowel), in which case we
245  // correctly output only one bounding box.
246  // A second far less frequent case is when caused some fonts like 'DejaVu
247  // Sans Ultra-Light' force Pango to render a ligatured character even if
248  // the input consists of the separated characters. NOTE(ranjith): As per
249  // behdad@ this is not currently controllable at the level of the Pango
250  // API.
251  // Safeguard against these cases here by just skipping the bigram.
252  if (IsWhitespaceBox(boxes[b+1])) {
253  continue;
254  }
255  int xgap = (boxes[b+1]->box()->x -
256  (boxes[b]->box()->x + boxes[b]->box()->w));
257  spacing_map_it0 = spacing_map.find(ch0);
258  int ok_count = 0;
259  if (spacing_map_it0 == spacing_map.end() &&
260  render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) {
261  spacing_map[ch0] = SpacingProperties(
262  x_bearing, x_advance - x_bearing - boxes[b]->box()->w);
263  spacing_map_it0 = spacing_map.find(ch0);
264  ++ok_count;
265  }
266  const string &ch1 = boxes[b+1]->ch();
267  tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str());
268  spacing_map_it1 = spacing_map.find(ch1);
269  if (spacing_map_it1 == spacing_map.end() &&
270  render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) {
271  spacing_map[ch1] = SpacingProperties(
272  x_bearing, x_advance - x_bearing - boxes[b+1]->box()->w);
273  spacing_map_it1 = spacing_map.find(ch1);
274  ++ok_count;
275  }
276  if (ok_count == 2 && xgap != (spacing_map_it0->second.x_gap_after +
277  spacing_map_it1->second.x_gap_before)) {
278  spacing_map_it0->second.kerned_x_gaps[ch1] = xgap;
279  }
280  }
281  render->ClearBoxes();
282  }
283  string output_string;
284  const int kBufSize = 1024;
285  char buf[kBufSize];
286  snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size()));
287  output_string.append(buf);
288  map<string, SpacingProperties>::const_iterator spacing_map_it;
289  for (spacing_map_it = spacing_map.begin();
290  spacing_map_it != spacing_map.end(); ++spacing_map_it) {
291  snprintf(buf, kBufSize,
292  "%s %d %d %d", spacing_map_it->first.c_str(),
293  spacing_map_it->second.x_gap_before,
294  spacing_map_it->second.x_gap_after,
295  static_cast<int>(spacing_map_it->second.kerned_x_gaps.size()));
296  output_string.append(buf);
297  map<string, int>::const_iterator kern_it;
298  for (kern_it = spacing_map_it->second.kerned_x_gaps.begin();
299  kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) {
300  snprintf(buf, kBufSize,
301  " %s %d", kern_it->first.c_str(), kern_it->second);
302  output_string.append(buf);
303  }
304  output_string.append("\n");
305  }
306  File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo");
307 }
const PangoFontInfo & font() const
bool GetSpacingProperties(const string &utf8_char, int *x_bearing, int *x_advance) const
#define tprintf(...)
Definition: tprintf.h:31
const vector< BoxChar * > & GetBoxes() const
#define tlog(level,...)
Definition: tlog.h:33
#define NULL
Definition: host.h:144
int RenderToImage(const char *text, int text_length, Pix **pix)
bool tesseract::FirstWordWouldHaveFit ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after,
tesseract::ParagraphJustification  justification 
)

Definition at line 1621 of file paragraphs.cpp.

1623  {
1624  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1625  return true;
1626 
1627  if (justification == JUSTIFICATION_UNKNOWN) {
1628  tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1629  }
1630  int available_space;
1631  if (justification == JUSTIFICATION_CENTER) {
1632  available_space = before.lindent_ + before.rindent_;
1633  } else {
1634  available_space = before.OffsideIndent(justification);
1635  }
1636  available_space -= before.ri_->average_interword_space;
1637 
1638  if (before.ri_->ltr)
1639  return after.ri_->lword_box.width() < available_space;
1640  return after.ri_->rword_box.width() < available_space;
1641 }
#define tprintf(...)
Definition: tprintf.h:31
bool tesseract::FirstWordWouldHaveFit ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after 
)

Definition at line 1646 of file paragraphs.cpp.

1647  {
1648  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1649  return true;
1650 
1651  int available_space = before.lindent_;
1652  if (before.rindent_ > available_space)
1653  available_space = before.rindent_;
1654  available_space -= before.ri_->average_interword_space;
1655 
1656  if (before.ri_->ltr)
1657  return after.ri_->lword_box.width() < available_space;
1658  return after.ri_->rword_box.width() < available_space;
1659 }
void tesseract::FontInfoDeleteCallback ( FontInfo  f)

Definition at line 139 of file fontinfo.cpp.

139  {
140  if (f.spacing_vec != NULL) {
141  f.spacing_vec->delete_data_pointers();
142  delete f.spacing_vec;
143  }
144  delete[] f.name;
145 }
#define NULL
Definition: host.h:144
void tesseract::FontSetDeleteCallback ( FontSet  fs)

Definition at line 146 of file fontinfo.cpp.

146  {
147  delete[] fs.configs;
148 }
char32 tesseract::FullwidthToHalfwidth ( const char32  ch)

Definition at line 239 of file normstrngs.cpp.

239  {
240  // Return unchanged if not in the fullwidth-halfwidth Unicode block.
241  if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
242  if (ch != 0x3000) return ch;
243  }
244  // Special case for fullwidth left and right "white parentheses".
245  if (ch == 0xFF5F) return 0x2985;
246  if (ch == 0xFF60) return 0x2986;
247  // Construct a full-to-half width transliterator.
248  IcuErrorCode error_code;
249  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
250  const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
251  "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
252  error_code.assertSuccess();
253  error_code.reset();
254 
255  fulltohalf->transliterate(uch_str);
256  delete fulltohalf;
257  ASSERT_HOST(uch_str.length() != 0);
258  return uch_str[0];
259 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:170
void tesseract::GeometricClassify ( int  debug_level,
GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
ParagraphTheory *  theory 
)

Definition at line 1077 of file paragraphs.cpp.

1080  {
1081  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1082  return;
1083  if (debug_level > 1) {
1084  tprintf("###############################################\n");
1085  tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
1086  row_start, row_end);
1087  tprintf("###############################################\n");
1088  }
1089  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1090 
1091  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1092  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1093  s.Fail(2, "Too much variety for simple outline classification.");
1094  return;
1095  }
1096  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1097  s.Fail(1, "Not enough variety for simple outline classification.");
1098  return;
1099  }
1100  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1101  GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1102  return;
1103  }
1104 
1105  // At this point, we know that one side has at least two tab stops, and the
1106  // other side has one or two tab stops.
1107  // Left to determine:
1108  // (1) Which is the body indent and which is the first line indent?
1109  // (2) Is the text fully justified?
1110 
1111  // If one side happens to have three or more tab stops, assume that side
1112  // is opposite of the aligned side.
1113  if (s.right_tabs.size() > 2) {
1114  s.AssumeLeftJustification();
1115  } else if (s.left_tabs.size() > 2) {
1116  s.AssumeRightJustification();
1117  } else if (s.ltr) { // guess based on script direction
1118  s.AssumeLeftJustification();
1119  } else {
1120  s.AssumeRightJustification();
1121  }
1122 
1123  if (s.AlignTabs().size() == 2) {
1124  // For each tab stop on the aligned side, how many of them appear
1125  // to be paragraph start lines? [first lines]
1126  int firsts[2] = {0, 0};
1127  // Count the first line as a likely paragraph start line.
1128  firsts[s.AlignsideTabIndex(s.row_start)]++;
1129  // For each line, if the first word would have fit on the previous
1130  // line count it as a likely paragraph start line.
1131  bool jam_packed = true;
1132  for (int i = s.row_start + 1; i < s.row_end; i++) {
1133  if (s.FirstWordWouldHaveFit(i - 1, i)) {
1134  firsts[s.AlignsideTabIndex(i)]++;
1135  jam_packed = false;
1136  }
1137  }
1138  // Make an extra accounting for the last line of the paragraph just
1139  // in case it's the only short line in the block. That is, take its
1140  // first word as typical and see if this looks like the *last* line
1141  // of a paragraph. If so, mark the *other* indent as probably a first.
1142  if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1143  firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1144  }
1145 
1146  int percent0firsts, percent1firsts;
1147  percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1148  percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1149 
1150  // TODO(eger): Tune these constants if necessary.
1151  if ((percent0firsts < 20 && 30 < percent1firsts) ||
1152  percent0firsts + 30 < percent1firsts) {
1153  s.first_indent = s.AlignTabs()[1].center;
1154  s.body_indent = s.AlignTabs()[0].center;
1155  } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1156  percent1firsts + 30 < percent0firsts) {
1157  s.first_indent = s.AlignTabs()[0].center;
1158  s.body_indent = s.AlignTabs()[1].center;
1159  } else {
1160  // Ambiguous! Probably lineated (poetry)
1161  if (debug_level > 1) {
1162  tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1163  s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1164  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1165  s.AlignTabs()[0].center, percent0firsts);
1166  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1167  s.AlignTabs()[1].center, percent1firsts);
1168  s.PrintRows();
1169  }
1170  return;
1171  }
1172  } else {
1173  // There's only one tab stop for the "aligned to" side.
1174  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1175  }
1176 
1177  // At this point, we have our model.
1178  const ParagraphModel *model = theory->AddModel(s.Model());
1179 
1180  // Now all we have to do is figure out if the text is fully justified or not.
1181  // eop_threshold: default to fully justified unless we see evidence below.
1182  // See description on MarkRowsWithModel()
1183  s.eop_threshold =
1184  (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1185  // If the text is not fully justified, re-set the eop_threshold to 0.
1186  if (s.AlignTabs().size() == 2) {
1187  // Paragraphs with a paragraph-start indent.
1188  for (int i = s.row_start; i < s.row_end - 1; i++) {
1189  if (ValidFirstLine(s.rows, i + 1, model) &&
1190  !NearlyEqual(s.OffsideTabs()[0].center,
1191  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1192  // We found a non-end-of-paragraph short line: not fully justified.
1193  s.eop_threshold = 0;
1194  break;
1195  }
1196  }
1197  } else {
1198  // Paragraphs with no paragraph-start indent.
1199  for (int i = s.row_start; i < s.row_end - 1; i++) {
1200  if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1201  !NearlyEqual(s.OffsideTabs()[0].center,
1202  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1203  // We found a non-end-of-paragraph short line: not fully justified.
1204  s.eop_threshold = 0;
1205  break;
1206  }
1207  }
1208  }
1209  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1210 }
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
Definition: paragraphs.cpp:985
#define tprintf(...)
Definition: tprintf.h:31
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
void MarkRowsWithModel(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
Definition: paragraphs.cpp:807
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:148
void tesseract::GeometricClassifyThreeTabStopTextBlock ( int  debug_level,
GeometricClassifierState &  s,
ParagraphTheory *  theory 
)

Definition at line 985 of file paragraphs.cpp.

988  {
989  int num_rows = s.row_end - s.row_start;
990  int num_full_rows = 0;
991  int last_row_full = 0;
992  for (int i = s.row_start; i < s.row_end; i++) {
993  if (s.IsFullRow(i)) {
994  num_full_rows++;
995  if (i == s.row_end - 1) last_row_full++;
996  }
997  }
998 
999  if (num_full_rows < 0.7 * num_rows) {
1000  s.Fail(1, "Not enough full lines to know which lines start paras.");
1001  return;
1002  }
1003 
1004  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1005  s.eop_threshold = 0;
1006 
1007  if (s.ltr) {
1008  s.AssumeLeftJustification();
1009  } else {
1010  s.AssumeRightJustification();
1011  }
1012 
1013  if (debug_level > 0) {
1014  tprintf("# Not enough variety for clear outline classification. "
1015  "Guessing these are %s aligned based on script.\n",
1016  s.ltr ? "left" : "right");
1017  s.PrintRows();
1018  }
1019 
1020  if (s.AlignTabs().size() == 2) { // case A1 or A2
1021  s.first_indent = s.AlignTabs()[1].center;
1022  s.body_indent = s.AlignTabs()[0].center;
1023  } else { // case B1 or B2
1024  if (num_rows - 1 == num_full_rows - last_row_full) {
1025  // case B2
1026  const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1027  (*s.rows)[s.row_start].AddStartLine(model);
1028  for (int i = s.row_start + 1; i < s.row_end; i++) {
1029  (*s.rows)[i].AddBodyLine(model);
1030  }
1031  return;
1032  } else {
1033  // case B1
1034  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1035  s.eop_threshold = (s.OffsideTabs()[0].center +
1036  s.OffsideTabs()[1].center) / 2;
1037  }
1038  }
1039  const ParagraphModel *model = theory->AddModel(s.Model());
1040  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
1041  s.ltr, s.eop_threshold);
1042  return;
1043 }
#define tprintf(...)
Definition: tprintf.h:31
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47
int first_indent() const
Definition: ocrpara.h:168
void MarkRowsWithModel(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
Definition: paragraphs.cpp:807
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45
void tesseract::GetWordBaseline ( int  writing_direction,
int  ppi,
int  height,
int  word_x1,
int  word_y1,
int  word_x2,
int  word_y2,
int  line_x1,
int  line_y1,
int  line_x2,
int  line_y2,
double *  x0,
double *  y0,
double *  length 
)

Definition at line 204 of file pdfrenderer.cpp.

207  {
208  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
209  Swap(&word_x1, &word_x2);
210  Swap(&word_y1, &word_y2);
211  }
212  double word_length;
213  double x, y;
214  {
215  int px = word_x1;
216  int py = word_y1;
217  double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
218  if (l2 == 0) {
219  x = line_x1;
220  y = line_y1;
221  } else {
222  double t = ((px - line_x2) * (line_x2 - line_x1) +
223  (py - line_y2) * (line_y2 - line_y1)) / l2;
224  x = line_x2 + t * (line_x2 - line_x1);
225  y = line_y2 + t * (line_y2 - line_y1);
226  }
227  word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
228  word_x2, word_y2)));
229  word_length = word_length * 72.0 / ppi;
230  x = x * 72 / ppi;
231  y = height - (y * 72.0 / ppi);
232  }
233  *x0 = x;
234  *y0 = y;
235  *length = word_length;
236 }
long dist2(int x1, int y1, int x2, int y2)
void Swap(T *p1, T *p2)
Definition: helpers.h:90
Pix* tesseract::GridReducedPix ( const TBOX box,
int  gridsize,
ICOORD  bleft,
int *  left,
int *  bottom 
)

Definition at line 212 of file bbgrid.cpp.

213  {
214  // Compute grid bounds of the outline and pad all round by 1.
215  int grid_left = (box.left() - bleft.x()) / gridsize - 1;
216  int grid_bottom = (box.bottom() - bleft.y()) / gridsize - 1;
217  int grid_right = (box.right() - bleft.x()) / gridsize + 1;
218  int grid_top = (box.top() - bleft.y()) / gridsize + 1;
219  *left = grid_left;
220  *bottom = grid_bottom;
221  return pixCreate(grid_right - grid_left + 1,
222  grid_top - grid_bottom + 1,
223  1);
224 }
inT16 right() const
Definition: rect.h:75
inT16 y() const
access function
Definition: points.h:56
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
inT16 x() const
access function
Definition: points.h:52
inT16 top() const
Definition: rect.h:54
void tesseract::HistogramRect ( Pix *  src_pix,
int  channel,
int  left,
int  top,
int  width,
int  height,
int *  histogram 
)

Definition at line 157 of file otsuthr.cpp.

159  {
160  PERF_COUNT_START("HistogramRect")
161  int num_channels = pixGetDepth(src_pix) / 8;
162  channel = ClipToRange(channel, 0, num_channels - 1);
163  int bottom = top + height;
164  memset(histogram, 0, sizeof(*histogram) * kHistogramSize);
165  int src_wpl = pixGetWpl(src_pix);
166  l_uint32* srcdata = pixGetData(src_pix);
167  for (int y = top; y < bottom; ++y) {
168  const l_uint32* linedata = srcdata + y * src_wpl;
169  for (int x = 0; x < width; ++x) {
170  int pixel = GET_DATA_BYTE(const_cast<void*>(
171  reinterpret_cast<const void *>(linedata)),
172  (x + left) * num_channels + channel);
173  ++histogram[pixel];
174  }
175  }
176  PERF_COUNT_END
177 }
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
#define PERF_COUNT_START(FUNCT_NAME)
#define PERF_COUNT_END
const int kHistogramSize
Definition: otsuthr.h:27
STRING tesseract::HOcrEscape ( const char *  text)

Escape a char string - replace the characters &, <, >, " and ' with their HTML codes.

Definition at line 2644 of file baseapi.cpp.

2644  {
2645  STRING ret;
2646  const char *ptr;
2647  for (ptr = text; *ptr; ptr++) {
2648  switch (*ptr) {
2649  case '<': ret += "&lt;"; break;
2650  case '>': ret += "&gt;"; break;
2651  case '&': ret += "&amp;"; break;
2652  case '"': ret += "&quot;"; break;
2653  case '\'': ret += "&#39;"; break;
2654  default: ret += *ptr;
2655  }
2656  }
2657  return ret;
2658 }
Definition: strngs.h:44
void tesseract::InitializeRowInfo ( bool  after_recognition,
const MutableIterator &  it,
RowInfo *  info 
)

Definition at line 2411 of file paragraphs.cpp.

2413  {
2414  if (it.PageResIt()->row() != NULL) {
2415  ROW *row = it.PageResIt()->row()->row;
2416  info->pix_ldistance = row->lmargin();
2417  info->pix_rdistance = row->rmargin();
2418  info->average_interword_space =
2419  row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
2420  info->pix_xheight = row->x_height();
2421  info->has_leaders = false;
2422  info->has_drop_cap = row->has_drop_cap();
2423  info->ltr = true; // set below depending on word scripts
2424  } else {
2425  info->pix_ldistance = info->pix_rdistance = 0;
2426  info->average_interword_space = 1;
2427  info->pix_xheight = 1.0;
2428  info->has_leaders = false;
2429  info->has_drop_cap = false;
2430  info->ltr = true;
2431  }
2432 
2433  info->num_words = 0;
2434  info->lword_indicates_list_item = false;
2435  info->lword_likely_starts_idea = false;
2436  info->lword_likely_ends_idea = false;
2437  info->rword_indicates_list_item = false;
2438  info->rword_likely_starts_idea = false;
2439  info->rword_likely_ends_idea = false;
2440  info->has_leaders = false;
2441  info->ltr = 1;
2442 
2443  if (!after_recognition) {
2444  InitializeTextAndBoxesPreRecognition(it, info);
2445  return;
2446  }
2447  info->text = "";
2448  char *text = it.GetUTF8Text(RIL_TEXTLINE);
2449  int trailing_ws_idx = strlen(text); // strip trailing space
2450  while (trailing_ws_idx > 0 &&
2451  // isspace() only takes ASCII
2452  ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2453  isspace(text[trailing_ws_idx - 1]))
2454  trailing_ws_idx--;
2455  if (trailing_ws_idx > 0) {
2456  int lspaces = info->pix_ldistance / info->average_interword_space;
2457  for (int i = 0; i < lspaces; i++)
2458  info->text += ' ';
2459  for (int i = 0; i < trailing_ws_idx; i++)
2460  info->text += text[i];
2461  }
2462  delete []text;
2463 
2464  if (info->text.size() == 0) {
2465  return;
2466  }
2467 
2468  PAGE_RES_IT page_res_it = *it.PageResIt();
2469  GenericVector<WERD_RES *> werds;
2470  WERD_RES *word_res = page_res_it.restart_row();
2471  ROW_RES *this_row = page_res_it.row();
2472  int num_leaders = 0;
2473  int ltr = 0;
2474  int rtl = 0;
2475  do {
2476  if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2477  werds.push_back(word_res);
2478  ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2479  rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2480  if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
2481  }
2482  word_res = page_res_it.forward();
2483  } while (page_res_it.row() == this_row);
2484  info->ltr = ltr >= rtl;
2485  info->has_leaders = num_leaders > 3;
2486  info->num_words = werds.size();
2487  if (werds.size() > 0) {
2488  WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2489  info->lword_text = lword->best_choice->unichar_string().string();
2490  info->rword_text = rword->best_choice->unichar_string().string();
2491  info->lword_box = lword->word->bounding_box();
2492  info->rword_box = rword->word->bounding_box();
2493  LeftWordAttributes(lword->uch_set, lword->best_choice,
2494  info->lword_text,
2495  &info->lword_indicates_list_item,
2496  &info->lword_likely_starts_idea,
2497  &info->lword_likely_ends_idea);
2498  RightWordAttributes(rword->uch_set, rword->best_choice,
2499  info->rword_text,
2500  &info->rword_indicates_list_item,
2501  &info->rword_likely_starts_idea,
2502  &info->rword_likely_ends_idea);
2503  }
2504 }
int size() const
Definition: genericvector.h:72
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info)
inT32 space() const
Definition: ocrrow.h:76
#define MAX(x, y)
Definition: ndminx.h:24
WERD_CHOICE * best_choice
Definition: pageres.h:219
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:394
int push_back(T object)
bool has_drop_cap() const
Definition: ocrrow.h:108
inT16 rmargin() const
Definition: ocrrow.h:101
float x_height() const
Definition: ocrrow.h:61
TBOX bounding_box() const
Definition: werd.cpp:160
inT32 length() const
Definition: strngs.cpp:188
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:441
bool AnyRtlCharsInWord() const
Definition: pageres.h:372
Definition: ocrrow.h:32
const STRING & unichar_string() const
Definition: ratngs.h:524
WERD_RES * forward()
Definition: pageres.h:713
inT16 lmargin() const
Definition: ocrrow.h:98
const UNICHARSET * uch_set
Definition: pageres.h:192
ROW_RES * row() const
Definition: pageres.h:736
WERD_RES * restart_row()
Definition: pageres.cpp:1636
WERD * word
Definition: pageres.h:175
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool AnyLtrCharsInWord() const
Definition: pageres.h:389
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::InitializeTextAndBoxesPreRecognition ( const MutableIterator &  it,
RowInfo *  info 
)

Definition at line 2359 of file paragraphs.cpp.

2360  {
2361  // Set up text, lword_text, and rword_text (mostly for debug printing).
2362  STRING fake_text;
2363  PageIterator pit(static_cast<const PageIterator&>(it));
2364  bool first_word = true;
2365  if (!pit.Empty(RIL_WORD)) {
2366  do {
2367  fake_text += "x";
2368  if (first_word) info->lword_text += "x";
2369  info->rword_text += "x";
2370  if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2371  !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2372  fake_text += " ";
2373  info->rword_text = "";
2374  first_word = false;
2375  }
2376  } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
2377  pit.Next(RIL_SYMBOL));
2378  }
2379  if (fake_text.size() == 0) return;
2380 
2381  int lspaces = info->pix_ldistance / info->average_interword_space;
2382  for (int i = 0; i < lspaces; i++) {
2383  info->text += ' ';
2384  }
2385  info->text += fake_text;
2386 
2387  // Set up lword_box, rword_box, and num_words.
2388  PAGE_RES_IT page_res_it = *it.PageResIt();
2389  WERD_RES *word_res = page_res_it.restart_row();
2390  ROW_RES *this_row = page_res_it.row();
2391 
2392  WERD_RES *lword = NULL;
2393  WERD_RES *rword = NULL;
2394  info->num_words = 0;
2395  do {
2396  if (word_res) {
2397  if (!lword) lword = word_res;
2398  if (rword != word_res) info->num_words++;
2399  rword = word_res;
2400  }
2401  word_res = page_res_it.forward();
2402  } while (page_res_it.row() == this_row);
2403 
2404  if (lword) info->lword_box = lword->word->bounding_box();
2405  if (rword) info->rword_box = rword->word->bounding_box();
2406 }
TBOX bounding_box() const
Definition: werd.cpp:160
WERD_RES * forward()
Definition: pageres.h:713
Definition: capi.h:76
ROW_RES * row() const
Definition: pageres.h:736
WERD_RES * restart_row()
Definition: pageres.cpp:1636
inT32 size() const
Definition: strngs.h:66
WERD * word
Definition: pageres.h:175
Definition: strngs.h:44
#define NULL
Definition: host.h:144
ParagraphModel tesseract::InternalParagraphModelByOutline ( const GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
int  tolerance,
bool *  consistent 
)

Definition at line 1692 of file paragraphs.cpp.

1694  {
1695  int ltr_line_count = 0;
1696  for (int i = start; i < end; i++) {
1697  ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1698  }
1699  bool ltr = (ltr_line_count >= (end - start) / 2);
1700 
1701  *consistent = true;
1702  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1703  return ParagraphModel();
1704 
1705  // Ensure the caller only passed us a region with a common rmargin and
1706  // lmargin.
1707  int lmargin = (*rows)[start].lmargin_;
1708  int rmargin = (*rows)[start].rmargin_;
1709  int lmin, lmax, rmin, rmax, cmin, cmax;
1710  lmin = lmax = (*rows)[start + 1].lindent_;
1711  rmin = rmax = (*rows)[start + 1].rindent_;
1712  cmin = cmax = 0;
1713  for (int i = start + 1; i < end; i++) {
1714  if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1715  tprintf("Margins don't match! Software error.\n");
1716  *consistent = false;
1717  return ParagraphModel();
1718  }
1719  UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1720  UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1721  UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1722  }
1723  int ldiff = lmax - lmin;
1724  int rdiff = rmax - rmin;
1725  int cdiff = cmax - cmin;
1726  if (rdiff > tolerance && ldiff > tolerance) {
1727  if (cdiff < tolerance * 2) {
1728  if (end - start < 3)
1729  return ParagraphModel();
1730  return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1731  }
1732  *consistent = false;
1733  return ParagraphModel();
1734  }
1735  if (end - start < 3) // Don't return a model for two line paras.
1736  return ParagraphModel();
1737 
1738  // These booleans keep us from saying something is aligned left when the body
1739  // left variance is too large.
1740  bool body_admits_left_alignment = ldiff < tolerance;
1741  bool body_admits_right_alignment = rdiff < tolerance;
1742 
1743  ParagraphModel left_model =
1744  ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1745  (lmin + lmax) / 2, tolerance);
1746  ParagraphModel right_model =
1747  ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1748  (rmin + rmax) / 2, tolerance);
1749 
1750  // These booleans keep us from having an indent on the "wrong side" for the
1751  // first line.
1752  bool text_admits_left_alignment = ltr || left_model.is_flush();
1753  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1754 
1755  // At least one of the edges is less than tolerance in variance.
1756  // If the other is obviously ragged, it can't be the one aligned to.
1757  // [Note the last line is included in this raggedness.]
1758  if (tolerance < rdiff) {
1759  if (body_admits_left_alignment && text_admits_left_alignment)
1760  return left_model;
1761  *consistent = false;
1762  return ParagraphModel();
1763  }
1764  if (tolerance < ldiff) {
1765  if (body_admits_right_alignment && text_admits_right_alignment)
1766  return right_model;
1767  *consistent = false;
1768  return ParagraphModel();
1769  }
1770 
1771  // At this point, we know the body text doesn't vary much on either side.
1772 
1773  // If the first line juts out oddly in one direction or the other,
1774  // that likely indicates the side aligned to.
1775  int first_left = (*rows)[start].lindent_;
1776  int first_right = (*rows)[start].rindent_;
1777 
1778  if (ltr && body_admits_left_alignment &&
1779  (first_left < lmin || first_left > lmax))
1780  return left_model;
1781  if (!ltr && body_admits_right_alignment &&
1782  (first_right < rmin || first_right > rmax))
1783  return right_model;
1784 
1785  *consistent = false;
1786  return ParagraphModel();
1787 }
#define tprintf(...)
Definition: tprintf.h:31
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:125
bool is_flush() const
Definition: ocrpara.h:171
int tesseract::InterwordSpace ( const GenericVector< RowScratchRegisters > &  rows,
int  row_start,
int  row_end 
)

Definition at line 1598 of file paragraphs.cpp.

1599  {
1600  if (row_end < row_start + 1) return 1;
1601  int word_height = (rows[row_start].ri_->lword_box.height() +
1602  rows[row_end - 1].ri_->lword_box.height()) / 2;
1603  int word_width = (rows[row_start].ri_->lword_box.width() +
1604  rows[row_end - 1].ri_->lword_box.width()) / 2;
1605  STATS spacing_widths(0, 5 + word_width);
1606  for (int i = row_start; i < row_end; i++) {
1607  if (rows[i].ri_->num_words > 1) {
1608  spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1609  }
1610  }
1611  int minimum_reasonable_space = word_height / 3;
1612  if (minimum_reasonable_space < 2)
1613  minimum_reasonable_space = 2;
1614  int median = spacing_widths.median();
1615  return (median > minimum_reasonable_space)
1616  ? median : minimum_reasonable_space;
1617 }
Definition: statistc.h:33
bool tesseract::is_double_quote ( const char32  ch)

Definition at line 97 of file normstrngs.cpp.

97  {
98  static const int kNumDoubleQuoteUnicodes = 8;
99  static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
100  '"',
101  0x201C, // left double quotation mark (English, others)
102  0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
103  0x201F, // double high-reversed-9 quotation mark (PropList.txt)
104  0x2033, // double prime
105  0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
106  0x301E, // close double prime (East Asian languages written horizontally)
107  0xFF02, // fullwidth quotation mark
108  };
109  for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
110  if (kDoubleQuoteUnicodes[i] == ch)
111  return true;
112  }
113  return false;
114 }
signed int char32
Definition: normstrngs.h:27
bool tesseract::is_hyphen_punc ( const char32  ch)

Definition at line 58 of file normstrngs.cpp.

58  {
59  static const int kNumHyphenPuncUnicodes = 13;
60  static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
61  '-',
62  0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
63  0x207b, // superscript minus
64  0x208b, // subscript minus
65  0x2212, // minus sign
66  0xfe58, // small em dash
67  0xfe63, // small hyphen-minus
68  0xff0d, // fullwidth hyphen-minus
69  };
70  for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
71  if (kHyphenPuncUnicodes[i] == ch)
72  return true;
73  }
74  return false;
75 }
signed int char32
Definition: normstrngs.h:27
bool tesseract::is_single_quote ( const char32  ch)

Definition at line 77 of file normstrngs.cpp.

77  {
78  static const int kNumSingleQuoteUnicodes = 8;
79  static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
80  '\'',
81  '`',
82  0x2018, // left single quotation mark (English, others)
83  0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
84  // We may have to introduce a comma set with 0x201a
85  0x201B, // single high-reversed-9 quotation mark (PropList.txt)
86  0x2032, // prime
87  0x300C, // left corner bracket (East Asian languages)
88  0xFF07, // fullwidth apostrophe
89  };
90  for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
91  if (kSingleQuoteUnicodes[i] == ch)
92  return true;
93  }
94  return false;
95 }
signed int char32
Definition: normstrngs.h:27
bool tesseract::IsDigitLike ( int  ch)

Definition at line 197 of file paragraphs.cpp.

197  {
198  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
199 }
bool tesseract::IsInterchangeValid ( const char32  ch)

Definition at line 208 of file normstrngs.cpp.

208  {
209  return IsValidCodepoint(ch) &&
210  !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
211  !(ch >= 0xFFFE && ch <= 0xFFFF) &&
212  !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
213  !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
214  !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
215  !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
216  !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
217  !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
218  !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
219  !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
220  !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
221  !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
222  !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
223  !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
224  !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
225  !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
226  !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
227  !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
228  (!u_isISOControl(static_cast<UChar32>(ch)) ||
229  ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
230 }
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:170
bool tesseract::IsInterchangeValid7BitAscii ( const char32  ch)

Definition at line 232 of file normstrngs.cpp.

232  {
233  return IsValidCodepoint(ch) &&
234  ch <= 128 &&
235  (!u_isISOControl(static_cast<UChar32>(ch)) ||
236  ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
237 }
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:170
bool tesseract::IsLatinLetter ( int  ch)

Definition at line 193 of file paragraphs.cpp.

193  {
194  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
195 }
bool tesseract::IsLeftIndented ( const EquationDetect::IndentType  type)
inline

Definition at line 95 of file equationdetect.cpp.

95  {
96  return type == EquationDetect::LEFT_INDENT ||
97  type == EquationDetect::BOTH_INDENT;
98 }
bool tesseract::IsOCREquivalent ( char32  ch1,
char32  ch2 
)

Definition at line 166 of file normstrngs.cpp.

166  {
167  return OCRNormalize(ch1) == OCRNormalize(ch2);
168 }
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:156
bool tesseract::IsOpeningPunct ( int  ch)

Definition at line 201 of file paragraphs.cpp.

201  {
202  return strchr("'\"({[", ch) != NULL;
203 }
#define NULL
Definition: host.h:144
bool tesseract::IsRightIndented ( const EquationDetect::IndentType  type)
inline

Definition at line 100 of file equationdetect.cpp.

100  {
101  return type == EquationDetect::RIGHT_INDENT ||
102  type == EquationDetect::BOTH_INDENT;
103 }
bool tesseract::IsTerminalPunct ( int  ch)

Definition at line 205 of file paragraphs.cpp.

205  {
206  return strchr(":'\".?!]})", ch) != NULL;
207 }
#define NULL
Definition: host.h:144
bool tesseract::IsTextOrEquationType ( PolyBlockType  type)
inline

Definition at line 91 of file equationdetect.cpp.

91  {
92  return PTIsTextType(type) || type == PT_EQUATION;
93 }
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:70
bool tesseract::IsUTF8Whitespace ( const char *  text)

Definition at line 182 of file normstrngs.cpp.

182  {
183  return SpanUTF8Whitespace(text) == strlen(text);
184 }
int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:186
bool tesseract::IsValidCodepoint ( const char32  ch)

Definition at line 170 of file normstrngs.cpp.

170  {
171  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
172  return (static_cast<uinT32>(ch) < 0xD800)
173  || (ch >= 0xE000 && ch <= 0x10FFFF);
174 }
bool tesseract::IsWhitespace ( const char32  ch)

Definition at line 176 of file normstrngs.cpp.

176  {
178  "Invalid Unicode codepoint: 0x%x\n", ch);
179  return u_isUWhiteSpace(static_cast<UChar32>(ch));
180 }
#define ASSERT_HOST_MSG(x, msg...)
Definition: errcode.h:98
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:170
void tesseract::LeftoverSegments ( const GenericVector< RowScratchRegisters > &  rows,
GenericVector< Interval > *  to_fix,
int  row_start,
int  row_end 
)

Definition at line 2181 of file paragraphs.cpp.

2183  {
2184  to_fix->clear();
2185  for (int i = row_start; i < row_end; i++) {
2186  bool needs_fixing = false;
2187 
2188  SetOfModels models;
2189  SetOfModels models_w_crowns;
2190  rows[i].StrongHypotheses(&models);
2191  rows[i].NonNullHypotheses(&models_w_crowns);
2192  if (models.empty() && models_w_crowns.size() > 0) {
2193  // Crown paragraph. Is it followed by a modeled line?
2194  for (int end = i + 1; end < rows.size(); end++) {
2195  SetOfModels end_models;
2196  SetOfModels strong_end_models;
2197  rows[end].NonNullHypotheses(&end_models);
2198  rows[end].StrongHypotheses(&strong_end_models);
2199  if (end_models.size() == 0) {
2200  needs_fixing = true;
2201  break;
2202  } else if (strong_end_models.size() > 0) {
2203  needs_fixing = false;
2204  break;
2205  }
2206  }
2207  } else if (models.empty() && rows[i].ri_->num_words > 0) {
2208  // No models at all.
2209  needs_fixing = true;
2210  }
2211 
2212  if (!needs_fixing && !models.empty()) {
2213  needs_fixing = RowIsStranded(rows, i);
2214  }
2215 
2216  if (needs_fixing) {
2217  if (!to_fix->empty() && to_fix->back().end == i - 1)
2218  to_fix->back().end = i;
2219  else
2220  to_fix->push_back(Interval(i, i));
2221  }
2222  }
2223  // Convert inclusive intervals to half-open intervals.
2224  for (int i = 0; i < to_fix->size(); i++) {
2225  (*to_fix)[i].end = (*to_fix)[i].end + 1;
2226  }
2227 }
int size() const
Definition: genericvector.h:72
int push_back(T object)
T & back() const
GenericVectorEqEq< const ParagraphModel * > SetOfModels
bool RowIsStranded(const GenericVector< RowScratchRegisters > &rows, int row)
bool empty() const
Definition: genericvector.h:84
void tesseract::LeftWordAttributes ( const UNICHARSET unicharset,
const WERD_CHOICE werd,
const STRING utf8,
bool *  is_list,
bool *  starts_idea,
bool *  ends_idea 
)

Definition at line 394 of file paragraphs.cpp.

396  {
397  *is_list = false;
398  *starts_idea = false;
399  *ends_idea = false;
400  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty
401  *ends_idea = true;
402  return;
403  }
404 
405  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
406  if (UniLikelyListItem(unicharset, werd)) {
407  *is_list = true;
408  *starts_idea = true;
409  *ends_idea = true;
410  }
411  if (unicharset->get_isupper(werd->unichar_id(0))) {
412  *starts_idea = true;
413  }
414  if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
415  *starts_idea = true;
416  *ends_idea = true;
417  }
418  } else { // Assume utf8 is mostly ASCII
419  if (AsciiLikelyListItem(utf8)) {
420  *is_list = true;
421  *starts_idea = true;
422  }
423  int start_letter = utf8[0];
424  if (IsOpeningPunct(start_letter)) {
425  *starts_idea = true;
426  }
427  if (IsTerminalPunct(start_letter)) {
428  *ends_idea = true;
429  }
430  if (start_letter >= 'A' && start_letter <= 'Z') {
431  *starts_idea = true;
432  }
433  }
434 }
bool IsOpeningPunct(int ch)
Definition: paragraphs.cpp:201
int length() const
Definition: ratngs.h:300
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool IsTerminalPunct(int ch)
Definition: paragraphs.cpp:205
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
inT32 size() const
Definition: strngs.h:66
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
#define NULL
Definition: host.h:144
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:267
bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd)
Definition: paragraphs.cpp:357
bool tesseract::LikelyListMark ( const STRING word)

Definition at line 262 of file paragraphs.cpp.

262  {
263  const char *kListMarks = "0Oo*.,+.";
264  return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
265 }
inT32 size() const
Definition: strngs.h:66
#define NULL
Definition: host.h:144
bool tesseract::LikelyListMarkUnicode ( int  ch)

Definition at line 328 of file paragraphs.cpp.

328  {
329  if (ch < 0x80) {
330  STRING single_ch;
331  single_ch += ch;
332  return LikelyListMark(single_ch);
333  }
334  switch (ch) {
335  // TODO(eger) expand this list of unicodes as needed.
336  case 0x00B0: // degree sign
337  case 0x2022: // bullet
338  case 0x25E6: // white bullet
339  case 0x00B7: // middle dot
340  case 0x25A1: // white square
341  case 0x25A0: // black square
342  case 0x25AA: // black small square
343  case 0x2B1D: // black very small square
344  case 0x25BA: // black right-pointing pointer
345  case 0x25CF: // black circle
346  case 0x25CB: // white circle
347  return true;
348  default:
349  break; // fall through
350  }
351  return false;
352 }
bool LikelyListMark(const STRING &word)
Definition: paragraphs.cpp:262
Definition: strngs.h:44
bool tesseract::LikelyListNumeral ( const STRING word)

Definition at line 228 of file paragraphs.cpp.

228  {
229  const char *kRomans = "ivxlmdIVXLMD";
230  const char *kDigits = "012345789";
231  const char *kOpen = "[{(";
232  const char *kSep = ":;-.,";
233  const char *kClose = "]})";
234 
235  int num_segments = 0;
236  const char *pos = word.string();
237  while (*pos != '\0' && num_segments < 3) {
238  // skip up to two open parens.
239  const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
240  const char *numeral_end = SkipChars(numeral_start, kRomans);
241  if (numeral_end != numeral_start) {
242  // Got Roman Numeral. Great.
243  } else {
244  numeral_end = SkipChars(numeral_start, kDigits);
245  if (numeral_end == numeral_start) {
246  // If there's a single latin letter, we can use that.
247  numeral_end = SkipChars(numeral_start, IsLatinLetter);
248  if (numeral_end - numeral_start != 1)
249  break;
250  }
251  }
252  // We got some sort of numeral.
253  num_segments++;
254  // Skip any trailing parens or punctuation.
255  pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
256  if (pos == numeral_end)
257  break;
258  }
259  return *pos == '\0';
260 }
const char * SkipOne(const char *str, const char *toskip)
Definition: paragraphs.cpp:220
const char * string() const
Definition: strngs.cpp:193
const char * SkipChars(const char *str, bool(*skip)(int))
Definition: paragraphs.cpp:215
bool IsLatinLetter(int ch)
Definition: paragraphs.cpp:193
bool tesseract::LikelyParagraphStart ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after 
)

Definition at line 1672 of file paragraphs.cpp.

1673  {
1674  return before.ri_->num_words == 0 ||
1675  (FirstWordWouldHaveFit(before, after) &&
1676  TextSupportsBreak(before, after));
1677 }
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool tesseract::LikelyParagraphStart ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after,
tesseract::ParagraphJustification  j 
)

Definition at line 1679 of file paragraphs.cpp.

1681  {
1682  return before.ri_->num_words == 0 ||
1683  (FirstWordWouldHaveFit(before, after, j) &&
1684  TextSupportsBreak(before, after));
1685 }
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool tesseract::LoadDataFromFile ( const STRING filename,
GenericVector< char > *  data 
)
inline

Definition at line 356 of file genericvector.h.

357  {
358  FILE* fp = fopen(filename.string(), "rb");
359  if (fp == NULL) return false;
360  fseek(fp, 0, SEEK_END);
361  size_t size = ftell(fp);
362  fseek(fp, 0, SEEK_SET);
363  // Pad with a 0, just in case we treat the result as a string.
364  data->init_to_size(size + 1, 0);
365  bool result = fread(&(*data)[0], 1, size, fp) == size;
366  fclose(fp);
367  return result;
368 }
void init_to_size(int size, T t)
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
ShapeTable * tesseract::LoadShapeTable ( const STRING file_prefix)

Definition at line 118 of file commontraining.cpp.

118  {
119  ShapeTable* shape_table = NULL;
120  STRING shape_table_file = file_prefix;
121  shape_table_file += kShapeTableFileSuffix;
122  FILE* shape_fp = fopen(shape_table_file.string(), "rb");
123  if (shape_fp != NULL) {
124  shape_table = new ShapeTable;
125  if (!shape_table->DeSerialize(false, shape_fp)) {
126  delete shape_table;
127  shape_table = NULL;
128  tprintf("Error: Failed to read shape table %s\n",
129  shape_table_file.string());
130  } else {
131  int num_shapes = shape_table->NumShapes();
132  tprintf("Read shape table %s of %d shapes\n",
133  shape_table_file.string(), num_shapes);
134  }
135  fclose(shape_fp);
136  } else {
137  tprintf("Warning: No shape table file present: %s\n",
138  shape_table_file.string());
139  }
140  return shape_table;
141 }
#define tprintf(...)
Definition: tprintf.h:31
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
int NumShapes() const
Definition: shapetable.h:278
MasterTrainer * tesseract::LoadTrainingData ( int  argc,
const char *const *  argv,
bool  replication,
ShapeTable **  shape_table,
STRING file_prefix 
)

Creates a MasterTrainer and loads the training data into it: Initializes feature_defs and IntegerFX. Loads the shape_table if shape_table != NULL. Loads initial unicharset from -U command-line option. If FLAGS_T is set, loads the majority of data from there, else:

  • Loads font info from -F option.
  • Loads xheights from -X option.
  • Loads samples from .tr files in remaining command-line args.
  • Deletes outliers and computes canonical samples.
  • If FLAGS_output_trainer is set, saves the trainer for future use. Computes canonical and cloud features. If shape_table is not NULL but the shape table failed to load, makes a fake flat one, as shape clustering was not run.

Definition at line 175 of file commontraining.cpp.

178  {
180  InitIntegerFX();
181  *file_prefix = "";
182  if (!FLAGS_D.empty()) {
183  *file_prefix += FLAGS_D.c_str();
184  *file_prefix += "/";
185  }
186  // If we are shape clustering (NULL shape_table) or we successfully load
187  // a shape_table written by a previous shape clustering, then
188  // shape_analysis will be true, meaning that the MasterTrainer will replace
189  // some members of the unicharset with their fragments.
190  bool shape_analysis = false;
191  if (shape_table != NULL) {
192  *shape_table = LoadShapeTable(*file_prefix);
193  if (*shape_table != NULL)
194  shape_analysis = true;
195  } else {
196  shape_analysis = true;
197  }
198  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
199  shape_analysis,
200  replication,
201  FLAGS_debug_level);
202  IntFeatureSpace fs;
204  if (FLAGS_T.empty()) {
205  trainer->LoadUnicharset(FLAGS_U.c_str());
206  // Get basic font information from font_properties.
207  if (!FLAGS_F.empty()) {
208  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
209  delete trainer;
210  return NULL;
211  }
212  }
213  if (!FLAGS_X.empty()) {
214  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
215  delete trainer;
216  return NULL;
217  }
218  }
219  trainer->SetFeatureSpace(fs);
220  const char* page_name;
221  // Load training data from .tr files on the command line.
222  while ((page_name = GetNextFilename(argc, argv)) != NULL) {
223  tprintf("Reading %s ...\n", page_name);
224  trainer->ReadTrainingSamples(page_name, feature_defs, false);
225 
226  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
227  // read font spacing information in to fontinfo_table.
228  int pagename_len = strlen(page_name);
229  char *fontinfo_file_name = new char[pagename_len + 7];
230  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
231  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
232  trainer->AddSpacingInfo(fontinfo_file_name);
233  delete[] fontinfo_file_name;
234 
235  // Load the images into memory if required by the classifier.
236  if (FLAGS_load_images) {
237  STRING image_name = page_name;
238  // Chop off the tr and replace with tif. Extension must be tif!
239  image_name.truncate_at(image_name.length() - 2);
240  image_name += "tif";
241  trainer->LoadPageImages(image_name.string());
242  }
243  }
244  trainer->PostLoadCleanup();
245  // Write the master trainer if required.
246  if (!FLAGS_output_trainer.empty()) {
247  FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
248  if (fp == NULL) {
249  tprintf("Can't create saved trainer data!\n");
250  } else {
251  trainer->Serialize(fp);
252  fclose(fp);
253  }
254  }
255  } else {
256  bool success = false;
257  tprintf("Loading master trainer from file:%s\n",
258  FLAGS_T.c_str());
259  FILE* fp = fopen(FLAGS_T.c_str(), "rb");
260  if (fp == NULL) {
261  tprintf("Can't read file %s to initialize master trainer\n",
262  FLAGS_T.c_str());
263  } else {
264  success = trainer->DeSerialize(false, fp);
265  fclose(fp);
266  }
267  if (!success) {
268  tprintf("Deserialize of master trainer failed!\n");
269  delete trainer;
270  return NULL;
271  }
272  trainer->SetFeatureSpace(fs);
273  }
274  trainer->PreTrainingSetup();
275  if (!FLAGS_O.empty() &&
276  !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
277  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
278  delete trainer;
279  return NULL;
280  }
281  if (shape_table != NULL) {
282  // If we previously failed to load a shapetable, then shape clustering
283  // wasn't run so make a flat one now.
284  if (*shape_table == NULL) {
285  *shape_table = new ShapeTable;
286  trainer->SetupFlatShapeTable(*shape_table);
287  tprintf("Flat shape table summary: %s\n",
288  (*shape_table)->SummaryStr().string());
289  }
290  (*shape_table)->set_unicharset(trainer->unicharset());
291  }
292  return trainer;
293 }
void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
#define tprintf(...)
Definition: tprintf.h:31
const int kBoostXYBuckets
const int kBoostDirBuckets
inT32 length() const
Definition: strngs.cpp:188
ShapeTable * LoadShapeTable(const STRING &file_prefix)
const char * GetNextFilename(int argc, const char *const *argv)
FEATURE_DEFS_STRUCT feature_defs
void truncate_at(inT32 index)
Definition: strngs.cpp:264
void InitIntegerFX()
Definition: intfx.cpp:55
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
const char * c_str() const
Definition: strngs.cpp:204
TBLOB* tesseract::make_tesseract_blob ( float  baseline,
float  xheight,
float  descender,
float  ascender,
bool  numeric_mode,
Pix *  pix 
)

Returns a TBLOB * made from the whole pix. The caller must free it later with delete.

Definition at line 2338 of file baseapi.cpp.

2340  {
2341  TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
2342 
2343  // Normalize TBLOB
2344  ROW *row =
2345  TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
2346  TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
2347  delete row;
2348  return tblob;
2349 }
Definition: blobs.h:261
Definition: ocrrow.h:32
bool tesseract::MakeIndividualGlyphs ( Pix *  pix,
const vector< BoxChar * > &  vbox,
const int  input_tiff_page 
)

Definition at line 309 of file text2image.cpp.

311  {
312  // If checks fail, return false without exiting text2image
313  if (!pix) {
314  tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is NULL\n");
315  return false;
316  } else if (FLAGS_glyph_resized_size <= 0) {
317  tprintf("ERROR: --glyph_resized_size must be positive\n");
318  return false;
319  } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) {
320  tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n");
321  return false;
322  }
323 
324  const int n_boxes = vbox.size();
325  int n_boxes_saved = 0;
326  int current_tiff_page = 0;
327  int y_previous = 0;
328  static int glyph_count = 0;
329  for (int i = 0; i < n_boxes; i++) {
330  // Get one bounding box
331  Box* b = vbox[i]->mutable_box();
332  if (!b) continue;
333  const int x = b->x;
334  const int y = b->y;
335  const int w = b->w;
336  const int h = b->h;
337  // Check present tiff page (for multipage tiff)
338  if (y < y_previous-pixGetHeight(pix)/10) {
339  tprintf("ERROR: Wrap-around encountered, at i=%d\n", i);
340  current_tiff_page++;
341  }
342  if (current_tiff_page < input_tiff_page) continue;
343  else if (current_tiff_page > input_tiff_page) break;
344  // Check box validity
345  if (x < 0 || y < 0 ||
346  (x+w-1) >= pixGetWidth(pix) ||
347  (y+h-1) >= pixGetHeight(pix)) {
348  tprintf("ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d"
349  " (x=%d, y=%d, w=%d, h=%d\n)", i, x, y, w, h);
350  continue;
351  } else if (w < FLAGS_glyph_num_border_pixels_to_pad &&
352  h < FLAGS_glyph_num_border_pixels_to_pad) {
353  tprintf("ERROR: Input image too small to be a character, at i=%d\n", i);
354  continue;
355  }
356  // Crop the boxed character
357  Pix* pix_glyph = pixClipRectangle(pix, b, NULL);
358  if (!pix_glyph) {
359  tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i);
360  continue;
361  }
362  // Resize to square
363  Pix* pix_glyph_sq = pixScaleToSize(pix_glyph,
364  FLAGS_glyph_resized_size,
365  FLAGS_glyph_resized_size);
366  if (!pix_glyph_sq) {
367  tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i);
368  continue;
369  }
370  // Zero-pad
371  Pix* pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq,
372  FLAGS_glyph_num_border_pixels_to_pad,
373  0);
374  if (!pix_glyph_sq_pad) {
375  tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n",
376  i);
377  continue;
378  }
379  // Write out
380  Pix* pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false);
381  char filename[1024];
382  snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(),
383  glyph_count++);
384  if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) {
385  tprintf("ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s,"
386  " at i=%d\n", filename, i);
387  continue;
388  }
389 
390  pixDestroy(&pix_glyph);
391  pixDestroy(&pix_glyph_sq);
392  pixDestroy(&pix_glyph_sq_pad);
393  pixDestroy(&pix_glyph_sq_pad_8);
394  n_boxes_saved++;
395  y_previous = y;
396  }
397  if (n_boxes_saved == 0) {
398  return false;
399  } else {
400  tprintf("Total number of characters saved = %d\n", n_boxes_saved);
401  return true;
402  }
403 }
#define tprintf(...)
Definition: tprintf.h:31
#define NULL
Definition: host.h:144
void tesseract::MarkRowsWithModel ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
const ParagraphModel model,
bool  ltr,
int  eop_threshold 
)

Definition at line 807 of file paragraphs.cpp.

811  {
812  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
813  return;
814  for (int row = row_start; row < row_end; row++) {
815  bool valid_first = ValidFirstLine(rows, row, model);
816  bool valid_body = ValidBodyLine(rows, row, model);
817  if (valid_first && !valid_body) {
818  (*rows)[row].AddStartLine(model);
819  } else if (valid_body && !valid_first) {
820  (*rows)[row].AddBodyLine(model);
821  } else if (valid_body && valid_first) {
822  bool after_eop = (row == row_start);
823  if (row > row_start) {
824  if (eop_threshold > 0) {
825  if (model->justification() == JUSTIFICATION_LEFT) {
826  after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
827  } else {
828  after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
829  }
830  } else {
831  after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
832  model->justification());
833  }
834  }
835  if (after_eop) {
836  (*rows)[row].AddStartLine(model);
837  } else {
838  (*rows)[row].AddBodyLine(model);
839  }
840  } else {
841  // Do nothing. Stray row.
842  }
843  }
844 }
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
void tesseract::MarkStrongEvidence ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end 
)

Definition at line 1830 of file paragraphs.cpp.

1831  {
1832  // Record patently obvious body text.
1833  for (int i = row_start + 1; i < row_end; i++) {
1834  const RowScratchRegisters &prev = (*rows)[i - 1];
1835  RowScratchRegisters &curr = (*rows)[i];
1836  tesseract::ParagraphJustification typical_justification =
1837  prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1838  if (!curr.ri_->rword_likely_starts_idea &&
1839  !curr.ri_->lword_likely_starts_idea &&
1840  !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1841  curr.SetBodyLine();
1842  }
1843  }
1844 
1845  // Record patently obvious start paragraph lines.
1846  //
1847  // It's an extremely good signal of the start of a paragraph that
1848  // the first word would have fit on the end of the previous line.
1849  // However, applying just that signal would have us mark random
1850  // start lines of lineated text (poetry and source code) and some
1851  // centered headings as paragraph start lines. Therefore, we use
1852  // a second qualification for a paragraph start: Not only should
1853  // the first word of this line have fit on the previous line,
1854  // but also, this line should go full to the right of the block,
1855  // disallowing a subsequent word from having fit on this line.
1856 
1857  // First row:
1858  {
1859  RowScratchRegisters &curr = (*rows)[row_start];
1860  RowScratchRegisters &next = (*rows)[row_start + 1];
1862  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1863  if (curr.GetLineType() == LT_UNKNOWN &&
1864  !FirstWordWouldHaveFit(curr, next, j) &&
1865  (curr.ri_->lword_likely_starts_idea ||
1866  curr.ri_->rword_likely_starts_idea)) {
1867  curr.SetStartLine();
1868  }
1869  }
1870  // Middle rows
1871  for (int i = row_start + 1; i < row_end - 1; i++) {
1872  RowScratchRegisters &prev = (*rows)[i - 1];
1873  RowScratchRegisters &curr = (*rows)[i];
1874  RowScratchRegisters &next = (*rows)[i + 1];
1876  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1877  if (curr.GetLineType() == LT_UNKNOWN &&
1878  !FirstWordWouldHaveFit(curr, next, j) &&
1879  LikelyParagraphStart(prev, curr, j)) {
1880  curr.SetStartLine();
1881  }
1882  }
1883  // Last row
1884  { // the short circuit at the top means we have at least two lines.
1885  RowScratchRegisters &prev = (*rows)[row_end - 2];
1886  RowScratchRegisters &curr = (*rows)[row_end - 1];
1888  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1889  if (curr.GetLineType() == LT_UNKNOWN &&
1890  !FirstWordWouldHaveFit(curr, curr, j) &&
1891  LikelyParagraphStart(prev, curr, j)) {
1892  curr.SetStartLine();
1893  }
1894  }
1895 }
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
ParagraphJustification
Definition: publictypes.h:239
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j)
void tesseract::ModelStrongEvidence ( int  debug_level,
GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
bool  allow_flush_models,
ParagraphTheory *  theory 
)

Definition at line 1900 of file paragraphs.cpp.

1904  {
1905  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1906  return;
1907 
1908  int start = row_start;
1909  while (start < row_end) {
1910  while (start < row_end && (*rows)[start].GetLineType() != LT_START)
1911  start++;
1912  if (start >= row_end - 1)
1913  break;
1914 
1915  int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1916  int end = start;
1917  ParagraphModel last_model;
1918  bool next_consistent;
1919  do {
1920  ++end;
1921  // rows[row, end) was consistent.
1922  // If rows[row, end + 1) is not consistent,
1923  // just model rows[row, end)
1924  if (end < row_end - 1) {
1925  RowScratchRegisters &next = (*rows)[end];
1926  LineType lt = next.GetLineType();
1927  next_consistent = lt == LT_BODY ||
1928  (lt == LT_UNKNOWN &&
1929  !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1930  } else {
1931  next_consistent = false;
1932  }
1933  if (next_consistent) {
1935  rows, start, end + 1, tolerance, &next_consistent);
1936  if (((*rows)[start].ri_->ltr &&
1937  last_model.justification() == JUSTIFICATION_LEFT &&
1938  next_model.justification() != JUSTIFICATION_LEFT) ||
1939  (!(*rows)[start].ri_->ltr &&
1940  last_model.justification() == JUSTIFICATION_RIGHT &&
1941  next_model.justification() != JUSTIFICATION_RIGHT)) {
1942  next_consistent = false;
1943  }
1944  last_model = next_model;
1945  } else {
1946  next_consistent = false;
1947  }
1948  } while (next_consistent && end < row_end);
1949  // At this point, rows[start, end) looked like it could have been a
1950  // single paragraph. If we can make a good ParagraphModel for it,
1951  // do so and mark this sequence with that model.
1952  if (end > start + 1) {
1953  // emit a new paragraph if we have more than one line.
1954  const ParagraphModel *model = NULL;
1956  debug_level, rows, start, end,
1957  Epsilon(InterwordSpace(*rows, start, end)));
1958  if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
1959  // couldn't create a good model, oh well.
1960  } else if (new_model.is_flush()) {
1961  if (end == start + 2) {
1962  // It's very likely we just got two paragraph starts in a row.
1963  end = start + 1;
1964  } else if (start == row_start) {
1965  // Mark this as a Crown.
1966  if (new_model.justification() == JUSTIFICATION_LEFT) {
1967  model = kCrownLeft;
1968  } else {
1969  model = kCrownRight;
1970  }
1971  } else if (allow_flush_models) {
1972  model = theory->AddModel(new_model);
1973  }
1974  } else {
1975  model = theory->AddModel(new_model);
1976  }
1977  if (model) {
1978  (*rows)[start].AddStartLine(model);
1979  for (int i = start + 1; i < end; i++) {
1980  (*rows)[i].AddBodyLine(model);
1981  }
1982  }
1983  }
1984  start = end;
1985  }
1986 }
ParagraphModel InternalParagraphModelByOutline(const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
ParagraphModel ParagraphModelByOutline(int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45
bool is_flush() const
Definition: ocrpara.h:171
#define NULL
Definition: host.h:144
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
void tesseract::NormalizeChar32 ( char32  ch,
GenericVector< char32 > *  str 
)

Definition at line 131 of file normstrngs.cpp.

131  {
132  IcuErrorCode error_code;
133  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
134  NULL, "nfkc", UNORM2_COMPOSE, error_code);
135  error_code.assertSuccess();
136  error_code.reset();
137 
138  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
139  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
140  error_code.assertSuccess();
141 
142  str->clear();
143  for (int i = 0; i < norm_str.length(); ++i) {
144  // If any spaces were added by NFKC, pretend normalization is a nop.
145  if (norm_str[i] == ' ') {
146  str->clear();
147  str->push_back(ch);
148  break;
149  } else {
150  str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
151  }
152  }
153 }
int push_back(T object)
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:156
#define NULL
Definition: host.h:144
uinT8 tesseract::NormalizeDirection ( uinT8  dir,
const FCOORD unnormed_pos,
const DENORM denorm,
const DENORM root_denorm 
)

Definition at line 171 of file intfx.cpp.

172  {
173  // Convert direction to a vector.
174  FCOORD unnormed_end;
175  unnormed_end.from_direction(dir);
176  unnormed_end += unnormed_pos;
177  FCOORD normed_pos, normed_end;
178  denorm.NormTransform(root_denorm, unnormed_pos, &normed_pos);
179  denorm.NormTransform(root_denorm, unnormed_end, &normed_end);
180  normed_end -= normed_pos;
181  return normed_end.to_direction();
182 }
uinT8 to_direction() const
Definition: points.cpp:111
void from_direction(uinT8 direction)
Definition: points.cpp:115
void NormTransform(const DENORM *first_norm, const TPOINT &pt, TPOINT *transformed) const
Definition: normalis.cpp:334
Definition: points.h:189
STRING tesseract::NormalizeUTF8String ( const char *  str8)

Definition at line 116 of file normstrngs.cpp.

116  {
117  GenericVector<char32> str32, out_str32, norm_str;
118  UTF8ToUTF32(str8, &str32);
119  for (int i = 0; i < str32.length(); ++i) {
120  norm_str.clear();
121  NormalizeChar32(str32[i], &norm_str);
122  for (int j = 0; j < norm_str.length(); ++j) {
123  out_str32.push_back(norm_str[j]);
124  }
125  }
126  STRING out_str8;
127  UTF32ToUTF8(out_str32, &out_str8);
128  return out_str8;
129 }
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
Definition: normstrngs.cpp:31
int length() const
Definition: genericvector.h:79
int push_back(T object)
void NormalizeChar32(char32 ch, GenericVector< char32 > *str)
Definition: normstrngs.cpp:131
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
Definition: normstrngs.cpp:45
Definition: strngs.h:44
char32 tesseract::OCRNormalize ( char32  ch)

Definition at line 156 of file normstrngs.cpp.

156  {
157  if (is_hyphen_punc(ch))
158  return '-';
159  else if (is_single_quote(ch))
160  return '\'';
161  else if (is_double_quote(ch))
162  return '"';
163  return ch;
164 }
bool is_double_quote(const char32 ch)
Definition: normstrngs.cpp:97
bool is_hyphen_punc(const char32 ch)
Definition: normstrngs.cpp:58
bool is_single_quote(const char32 ch)
Definition: normstrngs.cpp:77
int tesseract::OtsuStats ( const int *  histogram,
int *  H_out,
int *  omega0_out 
)

Definition at line 182 of file otsuthr.cpp.

182  {
183  int H = 0;
184  double mu_T = 0.0;
185  for (int i = 0; i < kHistogramSize; ++i) {
186  H += histogram[i];
187  mu_T += static_cast<double>(i) * histogram[i];
188  }
189 
190  // Now maximize sig_sq_B over t.
191  // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
192  int best_t = -1;
193  int omega_0, omega_1;
194  int best_omega_0 = 0;
195  double best_sig_sq_B = 0.0;
196  double mu_0, mu_1, mu_t;
197  omega_0 = 0;
198  mu_t = 0.0;
199  for (int t = 0; t < kHistogramSize - 1; ++t) {
200  omega_0 += histogram[t];
201  mu_t += t * static_cast<double>(histogram[t]);
202  if (omega_0 == 0)
203  continue;
204  omega_1 = H - omega_0;
205  if (omega_1 == 0)
206  break;
207  mu_0 = mu_t / omega_0;
208  mu_1 = (mu_T - mu_t) / omega_1;
209  double sig_sq_B = mu_1 - mu_0;
210  sig_sq_B *= sig_sq_B * omega_0 * omega_1;
211  if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
212  best_sig_sq_B = sig_sq_B;
213  best_t = t;
214  best_omega_0 = omega_0;
215  }
216  }
217  if (H_out != NULL) *H_out = H;
218  if (omega0_out != NULL) *omega0_out = best_omega_0;
219  return best_t;
220 }
const int kHistogramSize
Definition: otsuthr.h:27
#define NULL
Definition: host.h:144
int tesseract::OtsuThreshold ( Pix *  src_pix,
int  left,
int  top,
int  width,
int  height,
int **  thresholds,
int **  hi_values 
)

Definition at line 39 of file otsuthr.cpp.

40  {
41  int num_channels = pixGetDepth(src_pix) / 8;
42  // Of all channels with no good hi_value, keep the best so we can always
43  // produce at least one answer.
44  PERF_COUNT_START("OtsuThreshold")
45  int best_hi_value = 1;
46  int best_hi_index = 0;
47  bool any_good_hivalue = false;
48  double best_hi_dist = 0.0;
49  *thresholds = new int[num_channels];
50  *hi_values = new int[num_channels];
51  // all of channel 0 then all of channel 1...
52  int *histogramAllChannels = new int[kHistogramSize * num_channels];
53 
54  // only use opencl if compiled w/ OpenCL and selected device is opencl
55 #ifdef USE_OPENCL
56  // Calculate Histogram on GPU
57  OpenclDevice od;
58  if (od.selectedDeviceIsOpenCL() &&
59  (num_channels == 1 || num_channels == 4) && top == 0 && left == 0 ) {
60  od.HistogramRectOCL(
61  (const unsigned char*)pixGetData(src_pix),
62  num_channels,
63  pixGetWpl(src_pix) * 4,
64  left,
65  top,
66  width,
67  height,
68  kHistogramSize,
69  histogramAllChannels);
70 
71  // Calculate Threshold from Histogram on cpu
72  for (int ch = 0; ch < num_channels; ++ch) {
73  (*thresholds)[ch] = -1;
74  (*hi_values)[ch] = -1;
75  int *histogram = &histogramAllChannels[kHistogramSize * ch];
76  int H;
77  int best_omega_0;
78  int best_t = OtsuStats(histogram, &H, &best_omega_0);
79  if (best_omega_0 == 0 || best_omega_0 == H) {
80  // This channel is empty.
81  continue;
82  }
83  // To be a convincing foreground we must have a small fraction of H
84  // or to be a convincing background we must have a large fraction of H.
85  // In between we assume this channel contains no thresholding information.
86  int hi_value = best_omega_0 < H * 0.5;
87  (*thresholds)[ch] = best_t;
88  if (best_omega_0 > H * 0.75) {
89  any_good_hivalue = true;
90  (*hi_values)[ch] = 0;
91  } else if (best_omega_0 < H * 0.25) {
92  any_good_hivalue = true;
93  (*hi_values)[ch] = 1;
94  } else {
95  // In case all channels are like this, keep the best of the bad lot.
96  double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
97  if (hi_dist > best_hi_dist) {
98  best_hi_dist = hi_dist;
99  best_hi_value = hi_value;
100  best_hi_index = ch;
101  }
102  }
103  }
104  } else {
105 #endif
106  for (int ch = 0; ch < num_channels; ++ch) {
107  (*thresholds)[ch] = -1;
108  (*hi_values)[ch] = -1;
109  // Compute the histogram of the image rectangle.
110  int histogram[kHistogramSize];
111  HistogramRect(src_pix, ch, left, top, width, height, histogram);
112  int H;
113  int best_omega_0;
114  int best_t = OtsuStats(histogram, &H, &best_omega_0);
115  if (best_omega_0 == 0 || best_omega_0 == H) {
116  // This channel is empty.
117  continue;
118  }
119  // To be a convincing foreground we must have a small fraction of H
120  // or to be a convincing background we must have a large fraction of H.
121  // In between we assume this channel contains no thresholding information.
122  int hi_value = best_omega_0 < H * 0.5;
123  (*thresholds)[ch] = best_t;
124  if (best_omega_0 > H * 0.75) {
125  any_good_hivalue = true;
126  (*hi_values)[ch] = 0;
127  } else if (best_omega_0 < H * 0.25) {
128  any_good_hivalue = true;
129  (*hi_values)[ch] = 1;
130  } else {
131  // In case all channels are like this, keep the best of the bad lot.
132  double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
133  if (hi_dist > best_hi_dist) {
134  best_hi_dist = hi_dist;
135  best_hi_value = hi_value;
136  best_hi_index = ch;
137  }
138  }
139  }
140 #ifdef USE_OPENCL
141  }
142 #endif // USE_OPENCL
143  delete[] histogramAllChannels;
144 
145  if (!any_good_hivalue) {
146  // Use the best of the ones that were not good enough.
147  (*hi_values)[best_hi_index] = best_hi_value;
148  }
149  PERF_COUNT_END
150  return num_channels;
151 }
#define PERF_COUNT_START(FUNCT_NAME)
#define PERF_COUNT_END
int OtsuStats(const int *histogram, int *H_out, int *omega0_out)
Definition: otsuthr.cpp:182
const int kHistogramSize
Definition: otsuthr.h:27
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:157
ParagraphModel tesseract::ParagraphModelByOutline ( int  debug_level,
const GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
int  tolerance 
)

Definition at line 1793 of file paragraphs.cpp.

1796  {
1797  bool unused_consistent;
1798  ParagraphModel retval = InternalParagraphModelByOutline(
1799  rows, start, end, tolerance, &unused_consistent);
1800  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1801  tprintf("Could not determine a model for this paragraph:\n");
1802  PrintRowRange(*rows, start, end);
1803  }
1804  return retval;
1805 }
ParagraphModel InternalParagraphModelByOutline(const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
#define tprintf(...)
Definition: tprintf.h:31
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
int tesseract::ParamsTrainingFeatureByName ( const char *  name)

Definition at line 26 of file params_training_featdef.cpp.

26  {
27  if (name == NULL)
28  return -1;
29  int array_size = sizeof(kParamsTrainingFeatureTypeName) /
30  sizeof(kParamsTrainingFeatureTypeName[0]);
31  for (int i = 0; i < array_size; i++) {
32  if (kParamsTrainingFeatureTypeName[i] == NULL)
33  continue;
34  if (strcmp(name, kParamsTrainingFeatureTypeName[i]) == 0)
35  return i;
36  }
37  return -1;
38 }
name_table name
#define NULL
Definition: host.h:144
void tesseract::ParseCommandLineFlags ( const char *  usage,
int *  argc,
char ***  argv,
const bool  remove_flags 
)

Definition at line 312 of file commandlineflags.cpp.

314  {
315  InitGoogle(usage, argc, argv, remove_flags);
316 }
double tesseract::prec ( double  x)

Definition at line 184 of file pdfrenderer.cpp.

184  {
185  double kPrecision = 1000.0;
186  double a = round(x * kPrecision) / kPrecision;
187  if (a == -0)
188  return 0;
189  return a;
190 }
#define round(x)
Definition: mathfix.h:34
bool tesseract::PSM_BLOCK_FIND_ENABLED ( int  pageseg_mode)
inline

Definition at line 191 of file publictypes.h.

191  {
192  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
193 }
bool tesseract::PSM_COL_FIND_ENABLED ( int  pageseg_mode)
inline

Definition at line 185 of file publictypes.h.

185  {
186  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
187 }
Definition: capi.h:73
bool tesseract::PSM_LINE_FIND_ENABLED ( int  pageseg_mode)
inline

Definition at line 194 of file publictypes.h.

194  {
195  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
196 }
bool tesseract::PSM_ORIENTATION_ENABLED ( int  pageseg_mode)
inline

Definition at line 182 of file publictypes.h.

182  {
183  return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
184 }
Definition: capi.h:73
bool tesseract::PSM_OSD_ENABLED ( int  pageseg_mode)
inline

Inline functions that act on a PageSegMode to determine whether components of layout analysis are enabled. These functions depend critically on the order of the elements of PageSegMode. NOTE that the argument is an int for compatibility with INT_PARAM.

Definition at line 179 of file publictypes.h.

179  {
180  return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
181 }
bool tesseract::PSM_SPARSE ( int  pageseg_mode)
inline

Definition at line 188 of file publictypes.h.

188  {
189  return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
190 }
bool tesseract::PSM_WORD_FIND_ENABLED ( int  pageseg_mode)
inline

Definition at line 197 of file publictypes.h.

197  {
198  return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
199  pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
200 }
bool tesseract::read_info ( FILE *  f,
FontInfo *  fi,
bool  swap 
)

Definition at line 152 of file fontinfo.cpp.

152  {
153  inT32 size;
154  if (fread(&size, sizeof(size), 1, f) != 1) return false;
155  if (swap)
156  Reverse32(&size);
157  char* font_name = new char[size + 1];
158  fi->name = font_name;
159  if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
160  return false;
161  font_name[size] = '\0';
162  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
163  if (swap)
164  Reverse32(&fi->properties);
165  return true;
166 }
void Reverse32(void *ptr)
Definition: helpers.h:193
int inT32
Definition: host.h:102
bool tesseract::read_set ( FILE *  f,
FontSet *  fs,
bool  swap 
)

Definition at line 240 of file fontinfo.cpp.

240  {
241  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
242  if (swap)
243  Reverse32(&fs->size);
244  fs->configs = new int[fs->size];
245  for (int i = 0; i < fs->size; ++i) {
246  if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
247  if (swap)
248  Reverse32(&fs->configs[i]);
249  }
250  return true;
251 }
void Reverse32(void *ptr)
Definition: helpers.h:193
bool tesseract::read_spacing_info ( FILE *  f,
FontInfo *  fi,
bool  swap 
)

Definition at line 177 of file fontinfo.cpp.

177  {
178  inT32 vec_size, kern_size;
179  if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
180  if (swap) Reverse32(&vec_size);
181  ASSERT_HOST(vec_size >= 0);
182  if (vec_size == 0) return true;
183  fi->init_spacing(vec_size);
184  for (int i = 0; i < vec_size; ++i) {
185  FontSpacingInfo *fs = new FontSpacingInfo();
186  if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
187  fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
188  fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
189  delete fs;
190  return false;
191  }
192  if (swap) {
193  ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
194  ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
195  Reverse32(&kern_size);
196  }
197  if (kern_size < 0) { // indication of a NULL entry in fi->spacing_vec
198  delete fs;
199  continue;
200  }
201  if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
202  !fs->kerned_x_gaps.DeSerialize(swap, f))) {
203  delete fs;
204  return false;
205  }
206  fi->add_spacing(i, fs);
207  }
208  return true;
209 }
void Reverse32(void *ptr)
Definition: helpers.h:193
#define ASSERT_HOST(x)
Definition: errcode.h:84
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
int inT32
Definition: host.h:102
bool tesseract::read_t ( PAGE_RES_IT page_res_it,
TBOX tbox 
)

Definition at line 53 of file recogtraining.cpp.

53  {
54  while (page_res_it->block() != NULL && page_res_it->word() == NULL)
55  page_res_it->forward();
56 
57  if (page_res_it->word() != NULL) {
58  *tbox = page_res_it->word()->word->bounding_box();
59 
60  // If tbox->left() is negative, the training image has vertical text and
61  // all the coordinates of bounding boxes of page_res are rotated by 90
62  // degrees in a counterclockwise direction. We need to rotate the TBOX back
63  // in order to compare with the TBOXes of box files.
64  if (tbox->left() < 0) {
65  tbox->rotate(FCOORD(0.0, -1.0));
66  }
67 
68  return true;
69  } else {
70  return false;
71  }
72 }
TBOX bounding_box() const
Definition: werd.cpp:160
BLOCK_RES * block() const
Definition: pageres.h:739
WERD_RES * forward()
Definition: pageres.h:713
inT16 left() const
Definition: rect.h:68
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
Definition: points.h:189
WERD_RES * word() const
Definition: pageres.h:733
void rotate(const FCOORD &vec)
Definition: rect.h:189
void tesseract::RecomputeMarginsAndClearHypotheses ( GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
int  percentile 
)

Definition at line 1558 of file paragraphs.cpp.

1560  {
1561  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1562  return;
1563 
1564  int lmin, lmax, rmin, rmax;
1565  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1566  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1567  for (int i = start; i < end; i++) {
1568  RowScratchRegisters &sr = (*rows)[i];
1569  sr.SetUnknown();
1570  if (sr.ri_->num_words == 0)
1571  continue;
1572  UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1573  UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1574  }
1575  STATS lefts(lmin, lmax + 1);
1576  STATS rights(rmin, rmax + 1);
1577  for (int i = start; i < end; i++) {
1578  RowScratchRegisters &sr = (*rows)[i];
1579  if (sr.ri_->num_words == 0)
1580  continue;
1581  lefts.add(sr.lmargin_ + sr.lindent_, 1);
1582  rights.add(sr.rmargin_ + sr.rindent_, 1);
1583  }
1584  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1585  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1586  for (int i = start; i < end; i++) {
1587  RowScratchRegisters &sr = (*rows)[i];
1588  int ldelta = ignorable_left - sr.lmargin_;
1589  sr.lmargin_ += ldelta;
1590  sr.lindent_ -= ldelta;
1591  int rdelta = ignorable_right - sr.rmargin_;
1592  sr.rmargin_ += rdelta;
1593  sr.rindent_ -= rdelta;
1594  }
1595 }
Definition: statistc.h:33
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:125
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
void tesseract::RightWordAttributes ( const UNICHARSET unicharset,
const WERD_CHOICE werd,
const STRING utf8,
bool *  is_list,
bool *  starts_idea,
bool *  ends_idea 
)

Definition at line 441 of file paragraphs.cpp.

443  {
444  *is_list = false;
445  *starts_idea = false;
446  *ends_idea = false;
447  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty
448  *ends_idea = true;
449  return;
450  }
451 
452  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
453  if (UniLikelyListItem(unicharset, werd)) {
454  *is_list = true;
455  *starts_idea = true;
456  }
457  UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
458  if (unicharset->get_ispunctuation(last_letter)) {
459  *ends_idea = true;
460  }
461  } else { // Assume utf8 is mostly ASCII
462  if (AsciiLikelyListItem(utf8)) {
463  *is_list = true;
464  *starts_idea = true;
465  }
466  int last_letter = utf8[utf8.size() - 1];
467  if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
468  *ends_idea = true;
469  }
470  }
471 }
bool IsOpeningPunct(int ch)
Definition: paragraphs.cpp:201
int length() const
Definition: ratngs.h:300
bool IsTerminalPunct(int ch)
Definition: paragraphs.cpp:205
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
inT32 size() const
Definition: strngs.h:66
int UNICHAR_ID
Definition: unichar.h:33
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
#define NULL
Definition: host.h:144
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:267
bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd)
Definition: paragraphs.cpp:357
bool tesseract::RowIsStranded ( const GenericVector< RowScratchRegisters > &  rows,
int  row 
)

Definition at line 2139 of file paragraphs.cpp.

2139  {
2140  SetOfModels row_models;
2141  rows[row].StrongHypotheses(&row_models);
2142 
2143  for (int m = 0; m < row_models.size(); m++) {
2144  bool all_starts = rows[row].GetLineType();
2145  int run_length = 1;
2146  bool continues = true;
2147  for (int i = row - 1; i >= 0 && continues; i--) {
2148  SetOfModels models;
2149  rows[i].NonNullHypotheses(&models);
2150  switch (rows[i].GetLineType(row_models[m])) {
2151  case LT_START: run_length++; break;
2152  case LT_MULTIPLE: // explicit fall-through
2153  case LT_BODY: run_length++; all_starts = false; break;
2154  case LT_UNKNOWN: // explicit fall-through
2155  default: continues = false;
2156  }
2157  }
2158  continues = true;
2159  for (int i = row + 1; i < rows.size() && continues; i++) {
2160  SetOfModels models;
2161  rows[i].NonNullHypotheses(&models);
2162  switch (rows[i].GetLineType(row_models[m])) {
2163  case LT_START: run_length++; break;
2164  case LT_MULTIPLE: // explicit fall-through
2165  case LT_BODY: run_length++; all_starts = false; break;
2166  case LT_UNKNOWN: // explicit fall-through
2167  default: continues = false;
2168  }
2169  }
2170  if (run_length > 2 || (!all_starts && run_length > 1)) return false;
2171  }
2172  return true;
2173 }
int size() const
Definition: genericvector.h:72
GenericVectorEqEq< const ParagraphModel * > SetOfModels
bool tesseract::RowsFitModel ( const GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
const ParagraphModel model 
)

Definition at line 1808 of file paragraphs.cpp.

1809  {
1810  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1811  return false;
1812  if (!ValidFirstLine(rows, start, model)) return false;
1813  for (int i = start + 1 ; i < end; i++) {
1814  if (!ValidBodyLine(rows, i, model)) return false;
1815  }
1816  return true;
1817 }
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
STRING tesseract::RtlEmbed ( const STRING word,
bool  rtlify 
)

Definition at line 121 of file paragraphs.cpp.

121  {
122  if (rtlify)
123  return STRING(kRLE) + word + STRING(kPDF);
124  return word;
125 }
const char * kRLE
Definition: unicodes.cpp:29
const char * kPDF
Definition: unicodes.cpp:30
Definition: strngs.h:44
bool tesseract::SaveDataToFile ( const GenericVector< char > &  data,
const STRING filename 
)
inline

Definition at line 371 of file genericvector.h.

372  {
373  FILE* fp = fopen(filename.string(), "wb");
374  if (fp == NULL) return false;
375  bool result =
376  static_cast<int>(fwrite(&data[0], 1, data.size(), fp)) == data.size();
377  fclose(fp);
378  return result;
379 }
int size() const
Definition: genericvector.h:72
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
const char * tesseract::ScriptPosToString ( enum ScriptPos  script_pos)

Definition at line 180 of file ratngs.cpp.

180  {
181  switch (script_pos) {
182  case SP_NORMAL: return "NORM";
183  case SP_SUBSCRIPT: return "SUB";
184  case SP_SUPERSCRIPT: return "SUPER";
185  case SP_DROPCAP: return "DROPC";
186  }
187  return "SP_UNKNOWN";
188 }
void tesseract::SeparateSimpleLeaderLines ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
ParagraphTheory *  theory 
)

Definition at line 2025 of file paragraphs.cpp.

2027  {
2028  for (int i = row_start + 1; i < row_end - 1; i++) {
2029  if ((*rows)[i - 1].ri_->has_leaders &&
2030  (*rows)[i].ri_->has_leaders &&
2031  (*rows)[i + 1].ri_->has_leaders) {
2032  const ParagraphModel *model = theory->AddModel(
2033  ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2034  (*rows)[i].AddStartLine(model);
2035  }
2036  }
2037 }
void tesseract::SetBlobStrokeWidth ( Pix *  pix,
BLOBNBOX blob 
)

Definition at line 58 of file tordmain.cpp.

58  {
59  // Cut the blob rectangle into a Pix.
60  int pix_height = pixGetHeight(pix);
61  const TBOX& box = blob->bounding_box();
62  int width = box.width();
63  int height = box.height();
64  Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
65  width, height);
66  Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
67  boxDestroy(&blob_pix_box);
68  Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
69  pixDestroy(&pix_blob);
70  // Compute the stroke widths.
71  uinT32* data = pixGetData(dist_pix);
72  int wpl = pixGetWpl(dist_pix);
73  // Horizontal width of stroke.
74  STATS h_stats(0, width + 1);
75  for (int y = 0; y < height; ++y) {
76  uinT32* pixels = data + y*wpl;
77  int prev_pixel = 0;
78  int pixel = GET_DATA_BYTE(pixels, 0);
79  for (int x = 1; x < width; ++x) {
80  int next_pixel = GET_DATA_BYTE(pixels, x);
81  // We are looking for a pixel that is equal to its vertical neighbours,
82  // yet greater than its left neighbour.
83  if (prev_pixel < pixel &&
84  (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
85  (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
86  if (pixel > next_pixel) {
87  // Single local max, so an odd width.
88  h_stats.add(pixel * 2 - 1, 1);
89  } else if (pixel == next_pixel && x + 1 < width &&
90  pixel > GET_DATA_BYTE(pixels, x + 1)) {
91  // Double local max, so an even width.
92  h_stats.add(pixel * 2, 1);
93  }
94  }
95  prev_pixel = pixel;
96  pixel = next_pixel;
97  }
98  }
99  // Vertical width of stroke.
100  STATS v_stats(0, height + 1);
101  for (int x = 0; x < width; ++x) {
102  int prev_pixel = 0;
103  int pixel = GET_DATA_BYTE(data, x);
104  for (int y = 1; y < height; ++y) {
105  uinT32* pixels = data + y*wpl;
106  int next_pixel = GET_DATA_BYTE(pixels, x);
107  // We are looking for a pixel that is equal to its horizontal neighbours,
108  // yet greater than its upper neighbour.
109  if (prev_pixel < pixel &&
110  (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
111  (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
112  if (pixel > next_pixel) {
113  // Single local max, so an odd width.
114  v_stats.add(pixel * 2 - 1, 1);
115  } else if (pixel == next_pixel && y + 1 < height &&
116  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
117  // Double local max, so an even width.
118  v_stats.add(pixel * 2, 1);
119  }
120  }
121  prev_pixel = pixel;
122  pixel = next_pixel;
123  }
124  }
125  pixDestroy(&dist_pix);
126  // Store the horizontal and vertical width in the blob, keeping both
127  // widths if there is enough information, otherwise only the one with
128  // the most samples.
129  // If there are insufficient samples, store zero, rather than using
130  // 2*area/perimeter, as the numbers that gives do not match the numbers
131  // from the distance method.
132  if (h_stats.get_total() >= (width + height) / 4) {
133  blob->set_horz_stroke_width(h_stats.ile(0.5f));
134  if (v_stats.get_total() >= (width + height) / 4)
135  blob->set_vert_stroke_width(v_stats.ile(0.5f));
136  else
137  blob->set_vert_stroke_width(0.0f);
138  } else {
139  if (v_stats.get_total() >= (width + height) / 4 ||
140  v_stats.get_total() > h_stats.get_total()) {
141  blob->set_horz_stroke_width(0.0f);
142  blob->set_vert_stroke_width(v_stats.ile(0.5f));
143  } else {
144  blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
145  : 0.0f);
146  blob->set_vert_stroke_width(0.0f);
147  }
148  }
149 }
void set_horz_stroke_width(float width)
Definition: blobbox.h:325
Definition: statistc.h:33
unsigned int uinT32
Definition: host.h:103
inT16 left() const
Definition: rect.h:68
void set_vert_stroke_width(float width)
Definition: blobbox.h:331
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
Definition: rect.h:30
#define NULL
Definition: host.h:144
const TBOX & bounding_box() const
Definition: blobbox.h:215
inT16 top() const
Definition: rect.h:54
void tesseract::SetPropertiesForInputFile ( const string &  script_dir,
const string &  input_unicharset_file,
const string &  output_unicharset_file,
const string &  output_xheights_file 
)

Definition at line 148 of file unicharset_training_utils.cpp.

151  {
152  UNICHARSET unicharset;
153 
154  // Load the input unicharset
155  unicharset.load_from_file(input_unicharset_file.c_str());
156  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
157  input_unicharset_file.c_str());
158 
159  // Set unichar properties
160  tprintf("Setting unichar properties\n");
161  SetupBasicProperties(true, &unicharset);
162  string xheights_str;
163  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
164  // Load the unicharset for the script if available.
165  string filename = script_dir + "/" +
166  unicharset.get_script_from_script_id(s) + ".unicharset";
167  UNICHARSET script_set;
168  if (script_set.load_from_file(filename.c_str())) {
169  unicharset.SetPropertiesFromOther(script_set);
170  }
171  // Load the xheights for the script if available.
172  filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
173  ".xheights";
174  string script_heights;
175  if (File::ReadFileToString(filename, &script_heights))
176  xheights_str += script_heights;
177  }
178  if (!output_xheights_file.empty())
179  File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
180  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
181  if (unicharset.PropertiesIncomplete(c)) {
182  tprintf("Warning: properties incomplete for index %d = %s\n",
183  c, unicharset.id_to_unichar(c));
184  }
185  }
186 
187  // Write the output unicharset
188  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
189  unicharset.save_to_file(output_unicharset_file.c_str());
190 }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
#define tprintf(...)
Definition: tprintf.h:31
void SetupBasicProperties(bool report_errors, UNICHARSET *unicharset)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
int get_script_table_size() const
Definition: unicharset.h:797
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:802
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:604
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:503
int size() const
Definition: unicharset.h:297
void tesseract::SetupBasicProperties ( bool  report_errors,
UNICHARSET unicharset 
)

Definition at line 40 of file unicharset_training_utils.cpp.

40  {
41  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
42  // Convert any custom ligatures.
43  const char* unichar_str = unicharset->id_to_unichar(unichar_id);
44  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
45  if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
46  unichar_str = UNICHARSET::kCustomLigatures[i][0];
47  break;
48  }
49  }
50 
51  // Convert the unichar to UTF32 representation
52  GenericVector<char32> uni_vector;
53  tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
54 
55  // Assume that if the property is true for any character in the string,
56  // then it holds for the whole "character".
57  bool unichar_isalpha = false;
58  bool unichar_islower = false;
59  bool unichar_isupper = false;
60  bool unichar_isdigit = false;
61  bool unichar_ispunct = false;
62 
63  for (int i = 0; i < uni_vector.size(); ++i) {
64  if (u_isalpha(uni_vector[i]))
65  unichar_isalpha = true;
66  if (u_islower(uni_vector[i]))
67  unichar_islower = true;
68  if (u_isupper(uni_vector[i]))
69  unichar_isupper = true;
70  if (u_isdigit(uni_vector[i]))
71  unichar_isdigit = true;
72  if (u_ispunct(uni_vector[i]))
73  unichar_ispunct = true;
74  }
75 
76  unicharset->set_isalpha(unichar_id, unichar_isalpha);
77  unicharset->set_islower(unichar_id, unichar_islower);
78  unicharset->set_isupper(unichar_id, unichar_isupper);
79  unicharset->set_isdigit(unichar_id, unichar_isdigit);
80  unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
81 
83  unicharset->set_script(unichar_id, uscript_getName(
84  uscript_getScript(uni_vector[0], err)));
85 
86  const int num_code_points = uni_vector.size();
87  // Obtain the lower/upper case if needed and record it in the properties.
88  unicharset->set_other_case(unichar_id, unichar_id);
89  if (unichar_islower || unichar_isupper) {
90  GenericVector<char32> other_case(num_code_points, 0);
91  for (int i = 0; i < num_code_points; ++i) {
92  // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
93  // However since they deal with UChars (so need a conversion function
94  // from char32 or UTF8string) and require a meaningful locale string,
95  // for now u_tolower()/u_toupper() are used.
96  other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
97  u_tolower(uni_vector[i]);
98  }
99  STRING other_case_uch;
100  tesseract::UTF32ToUTF8(other_case, &other_case_uch);
101  UNICHAR_ID other_case_id =
102  unicharset->unichar_to_id(other_case_uch.c_str());
103  if (other_case_id != INVALID_UNICHAR_ID) {
104  unicharset->set_other_case(unichar_id, other_case_id);
105  } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
106  tprintf("Other case %s of %s is not in unicharset\n",
107  other_case_uch.c_str(), unichar_str);
108  }
109  }
110 
111  // Set RTL property and obtain mirror unichar ID from ICU.
112  GenericVector<char32> mirrors(num_code_points, 0);
113  for (int i = 0; i < num_code_points; ++i) {
114  mirrors[i] = u_charMirror(uni_vector[i]);
115  if (i == 0) { // set directionality to that of the 1st code point
116  unicharset->set_direction(unichar_id,
117  static_cast<UNICHARSET::Direction>(
118  u_charDirection(uni_vector[i])));
119  }
120  }
121  STRING mirror_uch;
122  tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
123  UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
124  if (mirror_uch_id != INVALID_UNICHAR_ID) {
125  unicharset->set_mirror(unichar_id, mirror_uch_id);
126  } else if (report_errors) {
127  tprintf("Mirror %s of %s is not in unicharset\n",
128  mirror_uch.c_str(), unichar_str);
129  }
130 
131  // Record normalized version of this unichar.
132  STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
133  if (unichar_id != 0 && normed_str.length() > 0) {
134  unicharset->set_normed(unichar_id, normed_str.c_str());
135  } else {
136  unicharset->set_normed(unichar_id, unichar_str);
137  }
138  ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
139  }
140  unicharset->post_load_setup();
141 }
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:399
int size() const
Definition: genericvector.h:72
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
Definition: normstrngs.cpp:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:394
#define tprintf(...)
Definition: tprintf.h:31
inT32 length() const
Definition: strngs.cpp:188
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:440
#define ASSERT_HOST(x)
Definition: errcode.h:84
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:409
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:404
int UNICHAR_ID
Definition: unichar.h:33
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:425
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:389
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:430
STRING NormalizeUTF8String(const char *str8)
Definition: normstrngs.cpp:116
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
Definition: normstrngs.cpp:45
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:420
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:435
Definition: strngs.h:44
#define NULL
Definition: host.h:144
static const char * kCustomLigatures[][2]
Definition: unicharset.h:144
int size() const
Definition: unicharset.h:297
void post_load_setup()
Definition: unicharset.cpp:867
const char * c_str() const
Definition: strngs.cpp:204
const char* tesseract::SkipChars ( const char *  str,
const char *  toskip 
)

Definition at line 210 of file paragraphs.cpp.

210  {
211  while (*str != '\0' && strchr(toskip, *str)) { str++; }
212  return str;
213 }
const char* tesseract::SkipChars ( const char *  str,
bool(*)(int)  skip 
)

Definition at line 215 of file paragraphs.cpp.

215  {
216  while (*str != '\0' && skip(*str)) { str++; }
217  return str;
218 }
const char* tesseract::SkipOne ( const char *  str,
const char *  toskip 
)

Definition at line 220 of file paragraphs.cpp.

220  {
221  if (*str != '\0' && strchr(toskip, *str)) return str + 1;
222  return str;
223 }
template<typename T >
int tesseract::sort_cmp ( const void *  t1,
const void *  t2 
)

Definition at line 391 of file genericvector.h.

391  {
392  const T* a = static_cast<const T *> (t1);
393  const T* b = static_cast<const T *> (t2);
394  if (*a < *b) {
395  return -1;
396  } else if (*b < *a) {
397  return 1;
398  } else {
399  return 0;
400  }
401 }
template<typename T >
int tesseract::sort_ptr_cmp ( const void *  t1,
const void *  t2 
)

Definition at line 408 of file genericvector.h.

408  {
409  const T* a = *reinterpret_cast<T * const *>(t1);
410  const T* b = *reinterpret_cast<T * const *>(t2);
411  if (*a < *b) {
412  return -1;
413  } else if (*b < *a) {
414  return 1;
415  } else {
416  return 0;
417  }
418 }
template<class BBC >
int tesseract::SortByBoxBottom ( const void *  void1,
const void *  void2 
)

Definition at line 408 of file bbgrid.h.

408  {
409  // The void*s are actually doubly indirected, so get rid of one level.
410  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
411  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
412  int result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
413  if (result != 0)
414  return result;
415  result = p1->bounding_box().top() - p2->bounding_box().top();
416  if (result != 0)
417  return result;
418  result = p1->bounding_box().left() - p2->bounding_box().left();
419  if (result != 0)
420  return result;
421  return p1->bounding_box().right() - p2->bounding_box().right();
422 }
template<class BBC >
int tesseract::SortByBoxLeft ( const void *  void1,
const void *  void2 
)

Definition at line 372 of file bbgrid.h.

372  {
373  // The void*s are actually doubly indirected, so get rid of one level.
374  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
375  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
376  int result = p1->bounding_box().left() - p2->bounding_box().left();
377  if (result != 0)
378  return result;
379  result = p1->bounding_box().right() - p2->bounding_box().right();
380  if (result != 0)
381  return result;
382  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
383  if (result != 0)
384  return result;
385  return p1->bounding_box().top() - p2->bounding_box().top();
386 }
template<class BLOB_CHOICE >
int tesseract::SortByRating ( const void *  void1,
const void *  void2 
)

Definition at line 86 of file pieces.cpp.

86  {
87  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
88  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
89 
90  if (p1->rating() < p2->rating())
91  return 1;
92  return -1;
93 }
float rating() const
Definition: ratngs.h:79
template<class BLOB_CHOICE >
int tesseract::SortByUnicharID ( const void *  void1,
const void *  void2 
)

Definition at line 78 of file pieces.cpp.

78  {
79  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
80  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
81 
82  return p1->unichar_id() - p2->unichar_id();
83 }
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
template<class BBC >
int tesseract::SortRightToLeft ( const void *  void1,
const void *  void2 
)

Definition at line 390 of file bbgrid.h.

390  {
391  // The void*s are actually doubly indirected, so get rid of one level.
392  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
393  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
394  int result = p2->bounding_box().right() - p1->bounding_box().right();
395  if (result != 0)
396  return result;
397  result = p2->bounding_box().left() - p1->bounding_box().left();
398  if (result != 0)
399  return result;
400  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
401  if (result != 0)
402  return result;
403  return p1->bounding_box().top() - p2->bounding_box().top();
404 }
int tesseract::SpanUTF8NotWhitespace ( const char *  text)

Definition at line 197 of file normstrngs.cpp.

197  {
198  int n_notwhite = 0;
199  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
200  it != UNICHAR::end(text, strlen(text));
201  ++it) {
202  if (IsWhitespace(*it)) break;
203  n_notwhite += it.utf8_len();
204  }
205  return n_notwhite;
206 }
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:176
int tesseract::SpanUTF8Whitespace ( const char *  text)

Definition at line 186 of file normstrngs.cpp.

186  {
187  int n_white = 0;
188  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
189  it != UNICHAR::end(text, strlen(text));
190  ++it) {
191  if (!IsWhitespace(*it)) break;
192  n_white += it.utf8_len();
193  }
194  return n_white;
195 }
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:176
void tesseract::StrongEvidenceClassify ( int  debug_level,
GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
ParagraphTheory *  theory 
)

Definition at line 1995 of file paragraphs.cpp.

1998  {
1999  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2000  return;
2001 
2002  if (debug_level > 1) {
2003  tprintf("#############################################\n");
2004  tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2005  tprintf("#############################################\n");
2006  }
2007 
2008  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2009  MarkStrongEvidence(rows, row_start, row_end);
2010 
2011  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2012 
2013  // Create paragraph models.
2014  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2015 
2016  DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2017 
2018  // At this point, some rows are marked up as paragraphs with model numbers,
2019  // and some rows are marked up as either LT_START or LT_BODY. Now let's
2020  // smear any good paragraph hypotheses forward and backward.
2021  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2022  smearer.Smear();
2023 }
void ModelStrongEvidence(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
#define tprintf(...)
Definition: tprintf.h:31
void MarkStrongEvidence(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool tesseract::StrongModel ( const ParagraphModel model)
inline

Definition at line 75 of file paragraphs_internal.h.

75  {
76  return model != NULL && model != kCrownLeft && model != kCrownRight;
77 }
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45
#define NULL
Definition: host.h:144
bool tesseract::TextSupportsBreak ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after 
)

Definition at line 1661 of file paragraphs.cpp.

1662  {
1663  if (before.ri_->ltr) {
1664  return before.ri_->rword_likely_ends_idea &&
1665  after.ri_->lword_likely_starts_idea;
1666  } else {
1667  return before.ri_->lword_likely_ends_idea &&
1668  after.ri_->rword_likely_starts_idea;
1669  }
1670 }
Pix * tesseract::TraceBlockOnReducedPix ( BLOCK block,
int  gridsize,
ICOORD  bleft,
int *  left,
int *  bottom 
)

Definition at line 258 of file bbgrid.cpp.

259  {
260  TBOX box = block->bounding_box();
261  Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom);
262  int wpl = pixGetWpl(pix);
263  l_uint32* data = pixGetData(pix);
264  ICOORDELT_IT it(block->poly_block()->points());
265  for (it.mark_cycle_pt(); !it.cycled_list();) {
266  ICOORD pos = *it.data();
267  it.forward();
268  ICOORD next_pos = *it.data();
269  ICOORD line_vector = next_pos - pos;
270  int major, minor;
271  ICOORD major_step, minor_step;
272  line_vector.setup_render(&major_step, &minor_step, &major, &minor);
273  int accumulator = major / 2;
274  while (pos != next_pos) {
275  int grid_x = (pos.x() - bleft.x()) / gridsize - *left;
276  int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;
277  SET_DATA_BIT(data + grid_y * wpl, grid_x);
278  pos += major_step;
279  accumulator += minor;
280  if (accumulator >= major) {
281  accumulator -= major;
282  pos += minor_step;
283  }
284  }
285  }
286  return pix;
287 }
inT16 y() const
access function
Definition: points.h:56
Pix * GridReducedPix(const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom)
Definition: bbgrid.cpp:212
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
ICOORDELT_LIST * points()
Definition: polyblk.h:42
integer coordinate
Definition: points.h:30
inT16 x() const
access function
Definition: points.h:52
Definition: rect.h:30
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void setup_render(ICOORD *major_step, ICOORD *minor_step, int *major, int *minor) const
Definition: points.cpp:86
Pix * tesseract::TraceOutlineOnReducedPix ( C_OUTLINE outline,
int  gridsize,
ICOORD  bleft,
int *  left,
int *  bottom 
)

Definition at line 232 of file bbgrid.cpp.

233  {
234  TBOX box = outline->bounding_box();
235  Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom);
236  int wpl = pixGetWpl(pix);
237  l_uint32* data = pixGetData(pix);
238  int length = outline->pathlength();
239  ICOORD pos = outline->start_pos();
240  for (int i = 0; i < length; ++i) {
241  int grid_x = (pos.x() - bleft.x()) / gridsize - *left;
242  int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;
243  SET_DATA_BIT(data + grid_y * wpl, grid_x);
244  pos += outline->step(i);
245  }
246  return pix;
247 }
const ICOORD & start_pos() const
Definition: coutln.h:146
inT32 pathlength() const
Definition: coutln.h:133
inT16 y() const
access function
Definition: points.h:56
Pix * GridReducedPix(const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom)
Definition: bbgrid.cpp:212
const TBOX & bounding_box() const
Definition: coutln.h:111
integer coordinate
Definition: points.h:30
inT16 x() const
access function
Definition: points.h:52
Definition: rect.h:30
ICOORD step(int index) const
Definition: coutln.h:142
int tesseract::UnicodeFor ( const UNICHARSET u,
const WERD_CHOICE werd,
int  pos 
)

Definition at line 274 of file paragraphs.cpp.

274  {
275  if (!u || !werd || pos > werd->length())
276  return 0;
277  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
278 }
int length() const
Definition: ratngs.h:300
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
int first_uni() const
Definition: unichar.cpp:97
bool tesseract::UniLikelyListItem ( const UNICHARSET u,
const WERD_CHOICE werd 
)

Definition at line 357 of file paragraphs.cpp.

357  {
358  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
359  return true;
360 
361  UnicodeSpanSkipper m(u, werd);
362  int num_segments = 0;
363  int pos = 0;
364  while (pos < werd->length() && num_segments < 3) {
365  int numeral_start = m.SkipPunc(pos);
366  if (numeral_start > pos + 1) break;
367  int numeral_end = m.SkipRomans(numeral_start);
368  if (numeral_end == numeral_start) {
369  numeral_end = m.SkipDigits(numeral_start);
370  if (numeral_end == numeral_start) {
371  // If there's a single latin letter, we can use that.
372  numeral_end = m.SkipAlpha(numeral_start);
373  if (numeral_end - numeral_start != 1)
374  break;
375  }
376  }
377  // We got some sort of numeral.
378  num_segments++;
379  // Skip any trailing punctuation.
380  pos = m.SkipPunc(numeral_end);
381  if (pos == numeral_end)
382  break;
383  }
384  return pos == werd->length();
385 }
int length() const
Definition: ratngs.h:300
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:274
bool LikelyListMarkUnicode(int ch)
Definition: paragraphs.cpp:328
void tesseract::UTF32ToUTF8 ( const GenericVector< char32 > &  str32,
STRING utf8_str 
)

Definition at line 45 of file normstrngs.cpp.

45  {
46  utf8_str->ensure(str32.length());
47  utf8_str->assign("", 0);
48  for (int i = 0; i < str32.length(); ++i) {
49  UNICHAR uni_ch(str32[i]);
50  char *utf8 = uni_ch.utf8_str();
51  if (utf8 != NULL) {
52  (*utf8_str) += utf8;
53  delete[] utf8;
54  }
55  }
56 }
int length() const
Definition: genericvector.h:79
void assign(const char *cstr, int len)
Definition: strngs.cpp:417
#define NULL
Definition: host.h:144
void ensure(inT32 min_capacity)
Definition: strngs.h:112
void tesseract::UTF8ToUTF32 ( const char *  utf8_str,
GenericVector< char32 > *  str32 
)

Definition at line 31 of file normstrngs.cpp.

31  {
32  str32->clear();
33  str32->reserve(strlen(utf8_str));
34  int len = strlen(utf8_str);
35  int step = 0;
36  for (int ch = 0; ch < len; ch += step) {
37  step = UNICHAR::utf8_step(utf8_str + ch);
38  if (step > 0) {
39  UNICHAR uni_ch(utf8_str + ch, step);
40  (*str32) += uni_ch.first_uni();
41  }
42  }
43 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
void reserve(int size)
bool tesseract::ValidBodyLine ( const GenericVector< RowScratchRegisters > *  rows,
int  row,
const ParagraphModel model 
)

Definition at line 1277 of file paragraphs.cpp.

1278  {
1279  if (!StrongModel(model)) {
1280  tprintf("ValidBodyLine() should only be called with strong models!\n");
1281  }
1282  return StrongModel(model) &&
1283  model->ValidBodyLine(
1284  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1285  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1286 }
#define tprintf(...)
Definition: tprintf.h:31
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:63
bool StrongModel(const ParagraphModel *model)
bool tesseract::ValidFirstLine ( const GenericVector< RowScratchRegisters > *  rows,
int  row,
const ParagraphModel model 
)

Definition at line 1266 of file paragraphs.cpp.

1267  {
1268  if (!StrongModel(model)) {
1269  tprintf("ValidFirstLine() should only be called with strong models!\n");
1270  }
1271  return StrongModel(model) &&
1272  model->ValidFirstLine(
1273  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1274  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1275 }
#define tprintf(...)
Definition: tprintf.h:31
bool StrongModel(const ParagraphModel *model)
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:46
bool tesseract::write_info ( FILE *  f,
const FontInfo &  fi 
)

Definition at line 168 of file fontinfo.cpp.

168  {
169  inT32 size = strlen(fi.name);
170  if (fwrite(&size, sizeof(size), 1, f) != 1) return false;
171  if (static_cast<int>(fwrite(fi.name, sizeof(*fi.name), size, f)) != size)
172  return false;
173  if (fwrite(&fi.properties, sizeof(fi.properties), 1, f) != 1) return false;
174  return true;
175 }
int inT32
Definition: host.h:102
bool tesseract::write_set ( FILE *  f,
const FontSet &  fs 
)

Definition at line 253 of file fontinfo.cpp.

253  {
254  if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1) return false;
255  for (int i = 0; i < fs.size; ++i) {
256  if (fwrite(&fs.configs[i], sizeof(fs.configs[i]), 1, f) != 1) return false;
257  }
258  return true;
259 }
bool tesseract::write_spacing_info ( FILE *  f,
const FontInfo &  fi 
)

Definition at line 211 of file fontinfo.cpp.

211  {
212  inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
213  if (fwrite(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
214  inT16 x_gap_invalid = -1;
215  for (int i = 0; i < vec_size; ++i) {
216  FontSpacingInfo *fs = fi.spacing_vec->get(i);
217  inT32 kern_size = (fs == NULL) ? -1 : fs->kerned_x_gaps.size();
218  if (fs == NULL) {
219  // Valid to have the identical fwrites. Writing invalid x-gaps.
220  if (fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
221  fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
222  fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
223  return false;
224  }
225  } else {
226  if (fwrite(&(fs->x_gap_before), sizeof(fs->x_gap_before), 1, f) != 1 ||
227  fwrite(&(fs->x_gap_after), sizeof(fs->x_gap_after), 1, f) != 1 ||
228  fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
229  return false;
230  }
231  }
232  if (kern_size > 0 && (!fs->kerned_unichar_ids.Serialize(f) ||
233  !fs->kerned_x_gaps.Serialize(f))) {
234  return false;
235  }
236  }
237  return true;
238 }
#define NULL
Definition: host.h:144
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102
void tesseract::WriteShapeTable ( const STRING file_prefix,
const ShapeTable shape_table 
)

Definition at line 144 of file commontraining.cpp.

144  {
145  STRING shape_table_file = file_prefix;
146  shape_table_file += kShapeTableFileSuffix;
147  FILE* fp = fopen(shape_table_file.string(), "wb");
148  if (fp != NULL) {
149  if (!shape_table.Serialize(fp)) {
150  fprintf(stderr, "Error writing shape table: %s\n",
151  shape_table_file.string());
152  }
153  fclose(fp);
154  } else {
155  fprintf(stderr, "Error creating shape table: %s\n",
156  shape_table_file.string());
157  }
158 }
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::YOutlierPieces ( WERD_RES word,
int  rebuilt_blob_index,
int  super_y_bottom,
int  sub_y_top,
ScriptPos *  leading_pos,
int *  num_leading_outliers,
ScriptPos *  trailing_pos,
int *  num_trailing_outliers 
)

Given a recognized blob, see if a contiguous collection of sub-pieces (chopped blobs) starting at its left might qualify as being a subscript or superscript letter based only on y position. Also do this for the right side.

Definition at line 46 of file superscript.cpp.

49  {
50  ScriptPos sp_unused1, sp_unused2;
51  int unused1, unused2;
52  if (!leading_pos) leading_pos = &sp_unused1;
53  if (!num_leading_outliers) num_leading_outliers = &unused1;
54  if (!trailing_pos) trailing_pos = &sp_unused2;
55  if (!num_trailing_outliers) num_trailing_outliers = &unused2;
56 
57  *num_leading_outliers = *num_trailing_outliers = 0;
58  *leading_pos = *trailing_pos = SP_NORMAL;
59 
60  int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
61  int num_chopped_pieces = word->best_state[rebuilt_blob_index];
62  ScriptPos last_pos = SP_NORMAL;
63  int trailing_outliers = 0;
64  for (int i = 0; i < num_chopped_pieces; i++) {
65  TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
66  ScriptPos pos = SP_NORMAL;
67  if (box.bottom() >= super_y_bottom) {
68  pos = SP_SUPERSCRIPT;
69  } else if (box.top() <= sub_y_top) {
70  pos = SP_SUBSCRIPT;
71  }
72  if (pos == SP_NORMAL) {
73  if (trailing_outliers == i) {
74  *num_leading_outliers = trailing_outliers;
75  *leading_pos = last_pos;
76  }
77  trailing_outliers = 0;
78  } else {
79  if (pos == last_pos) {
80  trailing_outliers++;
81  } else {
82  trailing_outliers = 1;
83  }
84  }
85  last_pos = pos;
86  }
87  *num_trailing_outliers = trailing_outliers;
88  *trailing_pos = last_pos;
89 }
TWERD * chopped_word
Definition: pageres.h:201
inT16 bottom() const
Definition: rect.h:61
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: rect.h:30
GenericVector< int > best_state
Definition: pageres.h:255
inT16 top() const
Definition: rect.h:54

Variable Documentation

const int tesseract::case_state_table[6][4]
Initial value:
= { {
0, 1, 5, 4
},
{
0, 3, 2, 4
},
{
0, -1, 2, -1
},
{
0, 3, -1, 4
},
{
0, -1, -1, 4
},
{
5, -1, 2, -1
},
}

Definition at line 35 of file context.cpp.

const int tesseract::kAdjacentLeaderSearchPadding = 2

Definition at line 125 of file tablefind.cpp.

const double tesseract::kAlignedFraction = 0.03125

Definition at line 39 of file alignedblob.cpp.

const double tesseract::kAlignedGapFraction = 0.75

Definition at line 43 of file alignedblob.cpp.

const char* tesseract::kAlignmentNames[]
Initial value:
= {
"Left Aligned",
"Left Ragged",
"Center",
"Right Aligned",
"Right Ragged",
"Separator"
}

Definition at line 515 of file tabvector.cpp.

const double tesseract::kAllowBlobArea = 0.05

Definition at line 61 of file tablefind.cpp.

const double tesseract::kAllowBlobHeight = 0.3

Definition at line 59 of file tablefind.cpp.

const double tesseract::kAllowBlobWidth = 0.4

Definition at line 60 of file tablefind.cpp.

const double tesseract::kAllowTextArea = 0.8

Definition at line 54 of file tablefind.cpp.

const double tesseract::kAllowTextHeight = 0.5

Definition at line 52 of file tablefind.cpp.

const double tesseract::kAllowTextWidth = 0.6

Definition at line 53 of file tablefind.cpp.

const char * tesseract::kApostropheLikeUTF8
Initial value:
= {
"'",
"`",
"\u2018",
"\u2019",
"\u2032",
NULL,
}
#define NULL
Definition: host.h:144

Definition at line 48 of file unicodes.cpp.

const int tesseract::kBasicBufSize = 2048

Definition at line 155 of file pdfrenderer.cpp.

const double tesseract::kBigPartSizeRatio = 1.75

Definition at line 51 of file colpartitiongrid.cpp.

const int tesseract::kBoxClipTolerance = 2

Definition at line 31 of file boxword.cpp.

const double tesseract::kBrokenCJKIterationFraction = 0.125

Definition at line 71 of file strokewidth.cpp.

const int tesseract::kBytesPer64BitNumber = 20

Max bytes in the decimal representation of inT64.

Definition at line 1566 of file baseapi.cpp.

const int tesseract::kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1

Multiplier for max expected text length: assumes (kBytesPerNumber + space) * kNumbersPerBlob plus the newline. Add to this the original UTF8 characters, and one kMaxBytesPerLine for safety.

Definition at line 1563 of file baseapi.cpp.

const int tesseract::kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1

Definition at line 1564 of file baseapi.cpp.

const int tesseract::kBytesPerNumber = 5

The number of bytes taken by each number. Since we use inT16 for ICOORD, assume only 5 digits max.

Definition at line 1557 of file baseapi.cpp.

const int tesseract::kCellSplitColumnThreshold = 0

Definition at line 40 of file tablerecog.cpp.

const int tesseract::kCellSplitRowThreshold = 0

Definition at line 39 of file tablerecog.cpp.

const double tesseract::kCharVerticalOverlapFraction = 0.375

Definition at line 62 of file tabfind.cpp.

const int tesseract::kCharWidth = 2

Definition at line 158 of file pdfrenderer.cpp.

const double tesseract::kCJKAspectRatio = 1.25

Definition at line 65 of file strokewidth.cpp.

const double tesseract::kCJKAspectRatioIncrease = 1.0625

Definition at line 67 of file strokewidth.cpp.

const double tesseract::kCJKBrokenDistanceFraction = 0.25

Definition at line 61 of file strokewidth.cpp.

const int tesseract::kCJKMaxComponents = 8

Definition at line 63 of file strokewidth.cpp.

const int tesseract::kCJKRadius = 2

Definition at line 59 of file strokewidth.cpp.

const int tesseract::kColumnWidthFactor = 20

Pixel resolution of column width estimates.

Definition at line 42 of file tabfind.h.

const double tesseract::kCosMaxSkewAngle = 0.866025

Definition at line 81 of file tabfind.cpp.

const int tesseract::kCrackSpacing = 100

Spacing of cracks across the page to break up tall vertical lines.

Definition at line 45 of file linefind.cpp.

const ParagraphModel * tesseract::kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F)

Definition at line 45 of file paragraphs.cpp.

const ParagraphModel * tesseract::kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F)

Definition at line 47 of file paragraphs.cpp.

const int tesseract::kDefaultResolution = 300

Default resolution used if input is not believable.

Definition at line 60 of file pagesegmain.cpp.

const double tesseract::kDiacriticXPadRatio = 7.0

Definition at line 74 of file strokewidth.cpp.

const double tesseract::kDiacriticYPadRatio = 1.75

Definition at line 77 of file strokewidth.cpp.

const char tesseract::kDoNotReverse[] = "RRP_DO_NO_REVERSE"

Definition at line 44 of file trie.cpp.

const int tesseract::kExposureFactor = 16

Definition at line 32 of file degradeimage.cpp.

const int tesseract::kFeaturePadding = 2

Definition at line 34 of file imagedata.h.

const float tesseract::kFontMergeDistance = 0.025

Definition at line 52 of file mastertrainer.cpp.

const char tesseract::kForceReverse[] = "RRP_FORCE_REVERSE"

Definition at line 46 of file trie.cpp.

const double tesseract::kGoodRowNumberOfColumnsLarge = 0.7

Definition at line 58 of file tablerecog.cpp.

const double tesseract::kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 }

Definition at line 54 of file tablerecog.cpp.

const int tesseract::kGoodRowNumberOfColumnsSmallSize
Initial value:
=
sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1
const double kGoodRowNumberOfColumnsSmall[]
Definition: tablerecog.cpp:54

Definition at line 55 of file tablerecog.cpp.

const int tesseract::kGutterMultiple = 4

Definition at line 38 of file tabvector.cpp.

const int tesseract::kGutterToNeighbourRatio = 3

Definition at line 40 of file tabvector.cpp.

const int tesseract::kHistogramSize = 256

Definition at line 27 of file otsuthr.h.

const double tesseract::kHorizontalGapMergeFraction = 0.5

Definition at line 57 of file colfind.cpp.

const double tesseract::kHorizontalSpacing = 0.30

Definition at line 33 of file tablerecog.cpp.

const int tesseract::kHorzStrongTextlineAspect = 5

Definition at line 74 of file colpartition.cpp.

const int tesseract::kHorzStrongTextlineCount = 8

Definition at line 70 of file colpartition.cpp.

const int tesseract::kHorzStrongTextlineHeight = 10

Definition at line 72 of file colpartition.cpp.

const char * tesseract::kHyphenLikeUTF8
Initial value:
= {
"-",
"\u05BE",
"\u2010",
"\u2011",
"\u2012",
"\u2013",
"\u2014",
"\u2015",
"\u2212",
"\uFE58",
"\uFE63",
"\uFF0D",
NULL,
}
#define NULL
Definition: host.h:144

The following are confusable internal word punctuation symbols which we normalize to the first variant when matching in dawgs.

Definition at line 32 of file unicodes.cpp.

const int tesseract::kImagePadding = 4

Definition at line 36 of file imagedata.h.

const float tesseract::kInfiniteDist = 999.0f

Definition at line 911 of file mastertrainer.cpp.

const char* tesseract::kInputFile = "noname.tif"

Filename used for input image file, from which to derive a name to search for a possible UNLV zone file, if none is specified by SetInputName.

Definition at line 97 of file baseapi.cpp.

const double tesseract::kLargeTableProjectionThreshold = 0.45

Definition at line 110 of file tablefind.cpp.

const int tesseract::kLargeTableRowCount = 6

Definition at line 112 of file tablefind.cpp.

const int tesseract::kLatinChs[]
Initial value:
= {
0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
}

Latin chars corresponding to the unicode chars above.

Definition at line 1627 of file baseapi.cpp.

const int tesseract::kLeaderCutCost = 8

Definition at line 64 of file colpartition.cpp.

const int tesseract::kLeftIndentAlignmentCountTh = 1

Definition at line 88 of file equationdetect.cpp.

const double tesseract::kLineCountReciprocal = 4.0

Definition at line 51 of file tabvector.cpp.

const int tesseract::kLinedTableMinHorizontalLines = 3

Definition at line 43 of file tablerecog.cpp.

const int tesseract::kLinedTableMinVerticalLines = 3

Definition at line 42 of file tablerecog.cpp.

const int tesseract::kLineFindGridSize = 50

Grid size used by line finder. Not very critical.

Definition at line 47 of file linefind.cpp.

const double tesseract::kLineFragmentAspectRatio = 10.0

Definition at line 56 of file tabfind.cpp.

const double tesseract::kLineResidueAspectRatio = 8.0

Definition at line 100 of file strokewidth.cpp.

const int tesseract::kLineResiduePadRatio = 3

Definition at line 102 of file strokewidth.cpp.

const double tesseract::kLineResidueSizeRatio = 1.75

Definition at line 104 of file strokewidth.cpp.

const int tesseract::kLineTrapLongest = 4

Definition at line 93 of file strokewidth.cpp.

const int tesseract::kLineTrapShortest = 2

Definition at line 95 of file strokewidth.cpp.

const char * tesseract::kLRM = "\u200E"

Definition at line 27 of file unicodes.cpp.

const double tesseract::kMarginFactor = 1.1

Definition at line 48 of file tablerecog.cpp.

const double tesseract::kMarginOverlapFraction = 0.25

Definition at line 54 of file colfind.cpp.

const float tesseract::kMathDigitDensityTh1 = 0.25

Definition at line 83 of file equationdetect.cpp.

const float tesseract::kMathDigitDensityTh2 = 0.1

Definition at line 84 of file equationdetect.cpp.

const float tesseract::kMathItalicDensityTh = 0.5

Definition at line 85 of file equationdetect.cpp.

const int tesseract::kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1)

Definition at line 40 of file ambigs.cpp.

const double tesseract::kMaxBaselineError = 0.4375

Definition at line 77 of file colpartition.cpp.

const double tesseract::kMaxBlobOverlapFactor = 4.0

Definition at line 80 of file tablefind.cpp.

const int tesseract::kMaxBlobWidth = 500

Definition at line 43 of file tablefind.cpp.

const inT16 tesseract::kMaxBoxEdgeDiff = 2

Definition at line 32 of file recogtraining.cpp.

const int tesseract::kMaxBoxesInDataPartition = 20

Definition at line 69 of file tablefind.cpp.

const int tesseract::kMaxBytesPerLine
Initial value:
= kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN
const int kBytesPer64BitNumber
Definition: baseapi.cpp:1566
const int kNumbersPerBlob
Definition: baseapi.cpp:1552
#define UNICHAR_LEN
Definition: unichar.h:30

A maximal single box could occupy kNumbersPerBlob numbers at kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a space plus the newline and the maximum length of a UNICHAR. Test against this on each iteration for safety.

Definition at line 1573 of file baseapi.cpp.

const int tesseract::kMaxCaptionLines = 7

Definition at line 43 of file colpartitiongrid.cpp.

const int tesseract::kMaxCharTopRange = 48

Definition at line 66 of file fixxht.cpp.

const int tesseract::kMaxCircleErosions = 8

Definition at line 62 of file pagesegmain.cpp.

const int tesseract::kMaxCJKSizeRatio = 5

Definition at line 69 of file strokewidth.cpp.

const int tesseract::kMaxColorDistance = 900

Definition at line 84 of file colpartition.cpp.

const int tesseract::kMaxColumnHeaderDistance = 4

Definition at line 88 of file tablefind.cpp.

const int tesseract::kMaxCredibleResolution = 2400

Maximum believable resolution.

Definition at line 110 of file baseapi.cpp.

const double tesseract::kMaxDiacriticDistanceRatio = 1.25

Definition at line 83 of file strokewidth.cpp.

const double tesseract::kMaxDiacriticGapToBaseCharHeight = 1.0

Definition at line 86 of file strokewidth.cpp.

const double tesseract::kMaxDistToPartSizeRatio = 1.5

Definition at line 64 of file colfind.cpp.

const int tesseract::kMaxFillinMultiple = 11

Definition at line 47 of file tabvector.cpp.

const double tesseract::kMaxGapInTextPartition = 4.0

Definition at line 72 of file tablefind.cpp.

const double tesseract::kMaxGutterWidthAbsolute = 2.00

Definition at line 51 of file tabfind.cpp.

const double tesseract::kMaxHorizontalGap = 3.0

Definition at line 64 of file tabfind.cpp.

const int tesseract::kMaxIncompatibleColumnCount = 2

Definition at line 52 of file colfind.cpp.

const int tesseract::kMaxIntSize = 22

Max string length of an int.

Definition at line 103 of file baseapi.cpp.

const int tesseract::kMaxLargeOverlaps = 3

Definition at line 109 of file strokewidth.cpp.

const int tesseract::kMaxLargeOverlapsWithMedium = 12

Definition at line 44 of file ccnontextdetect.cpp.

const int tesseract::kMaxLargeOverlapsWithSmall = 3

Definition at line 35 of file ccnontextdetect.cpp.

const double tesseract::kMaxLeaderGapFractionOfMax = 0.25

Definition at line 58 of file colpartition.cpp.

const double tesseract::kMaxLeaderGapFractionOfMin = 0.5

Definition at line 60 of file colpartition.cpp.

const int tesseract::kMaxLigature = 0xfb17

Definition at line 46 of file ligature_table.cpp.

const int tesseract::kMaxLineLength = 1024

Definition at line 290 of file boxchar.cpp.

const int tesseract::kMaxLineResidue = 6

Definition at line 53 of file linefind.cpp.

const int tesseract::kMaxMediumOverlapsWithSmall = 12

Definition at line 40 of file ccnontextdetect.cpp.

const int tesseract::kMaxNeighbourDistFactor = 4

Definition at line 37 of file colpartitiongrid.cpp.

const double tesseract::kMaxNonLineDensity = 0.25

Definition at line 58 of file linefind.cpp.

const int tesseract::kMaxOffsetDist = 32

Definition at line 32 of file intfeaturemap.cpp.

const int tesseract::kMaxPadFactor = 6

Definition at line 34 of file colpartitiongrid.cpp.

const double tesseract::kMaxParagraphEndingLeftSpaceMultiple = 3.0

Definition at line 134 of file tablefind.cpp.

const double tesseract::kMaxPartitionSpacing = 1.75

Definition at line 70 of file colpartitiongrid.cpp.

const int tesseract::kMaxPartnerDepth = 4

Definition at line 46 of file colpartition.cpp.

const int tesseract::kMaxRaggedSearch = 25

Definition at line 39 of file tabfind.cpp.

const int tesseract::kMaxRealDistance = 2.0

Definition at line 37 of file detlinefit.cpp.

const double tesseract::kMaxRectangularFraction = 0.75

Definition at line 46 of file imagefind.cpp.

const double tesseract::kMaxRectangularGradient = 0.1

Definition at line 49 of file imagefind.cpp.

const int tesseract::kMaxRMSColorNoise = 128

Definition at line 81 of file colpartition.cpp.

const double tesseract::kMaxRowSize = 2.5

Definition at line 51 of file tablerecog.cpp.

const double tesseract::kMaxSameBlockLineSpacing = 3

Definition at line 54 of file colpartition.cpp.

const double tesseract::kMaxSizeRatio = 1.5

Definition at line 56 of file colpartition.cpp.

const int tesseract::kMaxSkewFactor = 15

Definition at line 65 of file alignedblob.cpp.

const double tesseract::kMaxSmallNeighboursPerPix = 1.0 / 32

Definition at line 32 of file ccnontextdetect.cpp.

const double tesseract::kMaxSpacingDrift = 1.0 / 72

Definition at line 48 of file colpartition.cpp.

const double tesseract::kMaxStaveHeight = 1.0

Definition at line 60 of file linefind.cpp.

const double tesseract::kMaxTableCellXheight = 2.0

Definition at line 84 of file tablefind.cpp.

const int tesseract::kMaxTextLineBlobRatio = 5

Definition at line 72 of file tabfind.cpp.

const double tesseract::kMaxTopSpacingFraction = 0.25

Definition at line 51 of file colpartition.cpp.

const int tesseract::kMaxUnicharsPerCluster = 2000

Definition at line 50 of file mastertrainer.cpp.

const int tesseract::kMaxVerticalSearch = 12

Definition at line 38 of file tabfind.cpp.

const int tesseract::kMaxVerticalSpacing = 500

Definition at line 41 of file tablefind.cpp.

const double tesseract::kMaxXProjectionGapFactor = 2.0

Definition at line 144 of file tablefind.cpp.

const double tesseract::kMinAlignedGutter = 0.25

Definition at line 53 of file tabvector.cpp.

const int tesseract::kMinAlignedTabs = 4

Definition at line 55 of file alignedblob.cpp.

const double tesseract::kMinBaselineCoverage = 0.5

Definition at line 79 of file colpartition.cpp.

const int tesseract::kMinBoxesInTextPartition = 10

Definition at line 66 of file tablefind.cpp.

const double tesseract::kMinCaptionGapHeightRatio = 0.5

Definition at line 47 of file colpartitiongrid.cpp.

const double tesseract::kMinCaptionGapRatio = 2.0

Definition at line 45 of file colpartitiongrid.cpp.

const int tesseract::kMinChainTextValue = 3

Definition at line 68 of file colpartition.cpp.

const int tesseract::kMinClusteredShapes = 1

Definition at line 48 of file mastertrainer.cpp.

const int tesseract::kMinColorDifference = 16

Definition at line 55 of file imagefind.cpp.

const int tesseract::kMinColumnWidth = 100

Definition at line 49 of file colfind.cpp.

const int tesseract::kMinCredibleResolution = 70

Minimum believable resolution. Used as a default if there is no other information, as it is safer to under-estimate than over-estimate.

Definition at line 108 of file baseapi.cpp.

const double tesseract::kMinDiacriticSizeRatio = 1.0625

Definition at line 80 of file strokewidth.cpp.

const int tesseract::kMinEvaluatedTabs = 3

Definition at line 69 of file tabfind.cpp.

const double tesseract::kMinFilledArea = 0.35

Definition at line 61 of file tablerecog.cpp.

const double tesseract::kMinFractionalLinesInColumn = 0.125

Definition at line 45 of file tabfind.cpp.

const double tesseract::kMinGoodTextPARatio = 1.5

Definition at line 60 of file ccnontextdetect.cpp.

const double tesseract::kMinGutterFraction = 0.5

Definition at line 49 of file tabvector.cpp.

const double tesseract::kMinGutterWidthAbsolute = 0.02

Definition at line 49 of file tabfind.cpp.

const double tesseract::kMinGutterWidthGrid = 0.5

Definition at line 61 of file colfind.cpp.

const double tesseract::kMinImageArea = 0.5

Definition at line 77 of file tabfind.cpp.

const int tesseract::kMinImageFindSize = 100

Definition at line 51 of file imagefind.cpp.

const int tesseract::kMinLeaderCount = 5

Definition at line 62 of file colpartition.cpp.

const int tesseract::kMinLigature = 0xfb00

Definition at line 45 of file ligature_table.cpp.

const int tesseract::kMinLineLengthFraction = 4

Denominator applied to the resolution to obtain the minimum length, in pixels, that a line is required to have.

Definition at line 43 of file linefind.cpp.

const int tesseract::kMinLinesInColumn = 10

Definition at line 41 of file tabfind.cpp.

const double tesseract::kMinMaxGapInTextPartition = 0.5

Definition at line 76 of file tablefind.cpp.

const double tesseract::kMinMusicPixelFraction = 0.75

Definition at line 62 of file linefind.cpp.

const double tesseract::kMinNonNoiseFraction = 0.5

Definition at line 59 of file colfind.cpp.

const int tesseract::kMinOutlierSamples = 5

Definition at line 37 of file trainingsampleset.cpp.

const double tesseract::kMinOverlapWithTable = 0.6

Definition at line 100 of file tablefind.cpp.

const double tesseract::kMinParagraphEndingTextToWhitespaceRatio = 3.0

Definition at line 140 of file tablefind.cpp.

const double tesseract::kMinPCLengthIncrease = 1.0 / 1024

Definition at line 33 of file intfeaturemap.cpp.

const int tesseract::kMinPointsForErrorCount = 16

Definition at line 34 of file detlinefit.cpp.

const double tesseract::kMinRaggedGutter = 1.5

Definition at line 55 of file tabvector.cpp.

const int tesseract::kMinRaggedTabs = 5

Definition at line 53 of file alignedblob.cpp.

const int tesseract::kMinRampSize = 1000

Definition at line 36 of file degradeimage.cpp.

const double tesseract::kMinRectangularFraction = 0.125

Definition at line 44 of file imagefind.cpp.

const int tesseract::kMinRectSize = 10

Minimum sensible image size to be worth running tesseract.

Definition at line 86 of file baseapi.cpp.

const int tesseract::kMinRowsInTable = 3

Definition at line 115 of file tablefind.cpp.

const int tesseract::kMinStrongTextValue = 6

Definition at line 66 of file colpartition.cpp.

const double tesseract::kMinTabGradient = 4.0

Definition at line 61 of file alignedblob.cpp.

const int tesseract::kMinTextLineBlobRatio = 3

Definition at line 75 of file tabfind.cpp.

const int tesseract::kMinThickLineWidth = 12

Definition at line 49 of file linefind.cpp.

const int tesseract::kMinVerticalSearch = 3

Definition at line 37 of file tabfind.cpp.

const int tesseract::kMostlyOneDirRatio = 3

Definition at line 98 of file strokewidth.cpp.

const double tesseract::kNeighbourSearchFactor = 2.5

Definition at line 111 of file strokewidth.cpp.

const double tesseract::kNoiseOverlapAreaFactor = 1.0 / 512

Definition at line 116 of file strokewidth.cpp.

const double tesseract::kNoiseOverlapGrowthFactor = 4.0

Definition at line 113 of file strokewidth.cpp.

const int tesseract::kNoisePadding = 4

Definition at line 51 of file ccnontextdetect.cpp.

const int tesseract::kNumbersPerBlob = 5

The 5 numbers output for each box (the usual 4 and a page number).

Definition at line 1552 of file baseapi.cpp.

const int tesseract::kNumEndPoints = 3

Definition at line 28 of file detlinefit.cpp.

const int tesseract::kNumLiteralCnt = 5

Definition at line 36 of file tess_lang_model.h.

const int tesseract::kNumPagesPerMiniBatch = 100

Definition at line 38 of file imagedata.h.

const char* tesseract::kOldVarsFile = "failed_vars.txt"

Temp file used for storing current parameters before applying retry values.

Definition at line 101 of file baseapi.cpp.

const int tesseract::kOriginalNoiseMultiple = 8

Definition at line 47 of file ccnontextdetect.cpp.

const double tesseract::kParagraphEndingPreviousLineRatio = 1.3

Definition at line 130 of file tablefind.cpp.

const char * tesseract::kPDF = "\u202C"

Definition at line 30 of file unicodes.cpp.

const double tesseract::kPhotoOffsetFraction = 0.375

Definition at line 54 of file ccnontextdetect.cpp.

const int tesseract::kPrime1 = 17

Definition at line 34 of file trainingsampleset.cpp.

const int tesseract::kPrime2 = 13

Definition at line 35 of file trainingsampleset.cpp.

const double tesseract::kRaggedFraction = 2.5

Definition at line 41 of file alignedblob.cpp.

const double tesseract::kRaggedGapFraction = 1.0

Definition at line 45 of file alignedblob.cpp.

const int tesseract::kRaggedGutterMultiple = 5

Definition at line 53 of file tabfind.cpp.

const int tesseract::kRandomizingCenter = 128

Definition at line 35 of file trainingsample.cpp.

const double tesseract::kRatingEpsilon = 1.0 / 32

Definition at line 31 of file errorcounter.cpp.

const double tesseract::kRequiredColumns = 0.7

Definition at line 46 of file tablerecog.cpp.

const double tesseract::kRequiredFullJustifiedSpacing = 4.0

Definition at line 120 of file tablefind.cpp.

const char tesseract::kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"

Definition at line 45 of file trie.cpp.

const int tesseract::kRGBRMSColors = 4

Definition at line 36 of file colpartition.h.

const char * tesseract::kRLE = "\u202A"

Definition at line 29 of file unicodes.cpp.

const char * tesseract::kRLM = "\u200F"

Definition at line 28 of file unicodes.cpp.

const double tesseract::kRMSFitScaling = 8.0

Definition at line 53 of file imagefind.cpp.

const float tesseract::kRotationRange = 0.02f

Definition at line 30 of file degradeimage.cpp.

const int tesseract::kRulingVerticalMargin = 3

Definition at line 96 of file tablefind.cpp.

const int tesseract::kSaltnPepper = 5

Definition at line 34 of file degradeimage.cpp.

const int tesseract::kSearchRadius = 2

Definition at line 88 of file strokewidth.cpp.

const int tesseract::kSeedBlobsCountTh = 10

Definition at line 87 of file equationdetect.cpp.

const double tesseract::kShapePerimeterRatio = 3.0

Definition at line 118 of file strokewidth.cpp.

const int tesseract::kSideSpaceMargin = 10

Definition at line 105 of file tablefind.cpp.

const int tesseract::kSimilarRaggedDist = 50

Definition at line 45 of file tabvector.cpp.

const int tesseract::kSimilarVectorDist = 10

Definition at line 42 of file tabvector.cpp.

const int tesseract::ksizeofUniversalAmbigsFile = sizeof(kUniversalAmbigsFile)

Definition at line 24 of file universalambigs.h.

const float tesseract::kSizeRatioToReject = 2.0

Definition at line 106 of file strokewidth.cpp.

const double tesseract::kSmallTableProjectionThreshold = 0.35

Definition at line 109 of file tablefind.cpp.

const int tesseract::kSmoothDecisionMargin = 4

Definition at line 73 of file colpartitiongrid.cpp.

const double tesseract::kSmoothFactor = 0.25

Definition at line 58 of file tabfind.cpp.

const double tesseract::kSplitPartitionSize = 2.0

Definition at line 47 of file tablefind.cpp.

const int tesseract::kSquareLimit = 25

Definition at line 32 of file trainingsampleset.cpp.

const int tesseract::kStateCnt = 4

Definition at line 35 of file tess_lang_model.h.

const double tesseract::kStrokeWidthCJK = 2.0

Definition at line 56 of file strokewidth.cpp.

const double tesseract::kStrokeWidthConstantTolerance = 2.0

Definition at line 55 of file colpartitiongrid.cpp.

const double tesseract::kStrokeWidthFractionalTolerance = 0.25

Definition at line 148 of file tablefind.cpp.

const double tesseract::kStrokeWidthFractionCJK = 0.25

Definition at line 55 of file strokewidth.cpp.

const double tesseract::kStrokeWidthFractionTolerance = 0.25

Allowed proportional change in stroke width to be the same font.

Definition at line 53 of file colpartitiongrid.cpp.

const double tesseract::kStrokeWidthTolerance = 1.5

Allowed constant change in stroke width to be the same font. Really 1.5 pixels.

Definition at line 53 of file strokewidth.cpp.

const double tesseract::kTableColumnThreshold = 3.0

Definition at line 92 of file tablefind.cpp.

const int tesseract::kTabRadiusFactor = 5

Definition at line 35 of file tabfind.cpp.

const char tesseract::kTesseractReject = '~'

Character returned when Tesseract could not recognize the input as anything.

Definition at line 88 of file baseapi.cpp.

const int tesseract::kTestChar = -1

Definition at line 30 of file trainingsampleset.cpp.

const char* tesseract::kTextordDebugPix = "psdebug_pix"

Definition at line 68 of file alignedblob.cpp.

const double tesseract::kThickLengthMultiple = 0.75

Definition at line 56 of file linefind.cpp.

const int tesseract::kThinLineFraction = 20

Denominator applied to the resolution to obtain the maximum width, in pixels, allowed for a thin line.

Definition at line 41 of file linefind.cpp.

const double tesseract::kTinyEnoughTextlineOverlapFraction = 0.25

Definition at line 57 of file colpartitiongrid.cpp.

const float tesseract::kUnclearDensityTh = 0.25

Definition at line 86 of file equationdetect.cpp.

const int tesseract::kUniChs[]
Initial value:
= {
0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
}

Conversion table for non-Latin characters. Maps characters outside the Latin set to characters in the Latin set. TODO(rays): incorporate this translation into unicharset.

Definition at line 1623 of file baseapi.cpp.

const char tesseract::kUniversalAmbigsFile

Definition at line 23 of file universalambigs.h.

const char tesseract::kUNLVReject = '~'

Character used by UNLV error counter as a reject.

Definition at line 90 of file baseapi.cpp.

const char tesseract::kUNLVSuspect = '^'

Character used by UNLV as a suspect marker.

Definition at line 92 of file baseapi.cpp.

const char * tesseract::kUTF8LineSeparator = "\u2028"

Definition at line 25 of file unicodes.cpp.

const char * tesseract::kUTF8ParagraphSeparator = "\u2029"

Definition at line 26 of file unicodes.cpp.

const double tesseract::kVerticalSpacing = -0.2

Definition at line 36 of file tablerecog.cpp.

const int tesseract::kVLineAlignment = 3

Definition at line 47 of file alignedblob.cpp.

const int tesseract::kVLineGutter = 1

Definition at line 49 of file alignedblob.cpp.

const int tesseract::kVLineMinLength = 500

Definition at line 57 of file alignedblob.cpp.

const int tesseract::kVLineSearchSize = 150

Definition at line 51 of file alignedblob.cpp.

const char* const tesseract::RTLReversePolicyNames[]
Initial value:
= {
kDoNotReverse,
kReverseIfHasRTL,
kForceReverse
}
const char kDoNotReverse[]
Definition: trie.cpp:44
const char kReverseIfHasRTL[]
Definition: trie.cpp:45
const char kForceReverse[]
Definition: trie.cpp:46

Definition at line 48 of file trie.cpp.

bool tesseract::textord_dump_table_images = false

"Paint table detection output"

Definition at line 151 of file tablefind.cpp.

bool tesseract::textord_show_tables = false

"Show table regions"

Definition at line 152 of file tablefind.cpp.

bool tesseract::textord_tabfind_find_tables = true

"run table detection"

Definition at line 74 of file colfind.cpp.

bool tesseract::textord_tabfind_only_strokewidths = false

"Only run stroke widths"

Definition at line 45 of file strokewidth.cpp.

bool tesseract::textord_tabfind_show_blocks = false

"Show final block bounds"

Definition at line 73 of file colfind.cpp.

bool tesseract::textord_tabfind_show_color_fit = false

"Show stroke widths"

Definition at line 30 of file colpartitiongrid.cpp.

bool tesseract::textord_tabfind_show_columns = false

"Show column bounds"

Definition at line 72 of file colfind.cpp.

bool tesseract::textord_tabfind_show_finaltabs = false

"Show tab vectors"

Definition at line 84 of file tabfind.cpp.

bool tesseract::textord_tabfind_show_initial_partitions = false

"Show partition bounds"

Definition at line 67 of file colfind.cpp.

bool tesseract::textord_tabfind_show_initialtabs = false

"Show tab candidates"

Definition at line 83 of file tabfind.cpp.

int tesseract::textord_tabfind_show_partitions = 0

"Show partition bounds, waiting if >1"

Definition at line 71 of file colfind.cpp.

bool tesseract::textord_tabfind_show_reject_blobs = false

"Show blobs rejected as noise"

Definition at line 69 of file colfind.cpp.

int tesseract::textord_tabfind_show_strokewidths = 0

"Show stroke widths"

Definition at line 44 of file strokewidth.cpp.

bool tesseract::textord_tablefind_recognize_tables = false

"Enables the table recognizer for table layout and filtering."

Definition at line 158 of file tablefind.cpp.

bool tesseract::textord_tablefind_show_mark = false

"Debug table marking steps in detail"

Definition at line 154 of file tablefind.cpp.

bool tesseract::textord_tablefind_show_stats = false

"Show page stats used in table finding"

Definition at line 156 of file tablefind.cpp.

double tesseract::textord_tabvector_vertical_box_ratio = 0.5

"Fraction of box matches required to declare a line vertical"

Definition at line 61 of file tabvector.cpp.

double tesseract::textord_tabvector_vertical_gap_fraction = 0.5

"Max fraction of mean blob width allowed for vertical gaps in vertical text"

Definition at line 58 of file tabvector.cpp.

CCUtilMutex tesseract::tprintfMutex

Definition at line 51 of file ccutil.cpp.