tesseract v5.3.3.20231005
lm_consistency.h
Go to the documentation of this file.
1
2// File: lm_consistency.h
3// Description: Struct for recording consistency of the paths representing
4// OCR hypotheses.
5// Author: Rika Antonova
6//
7// (C) Copyright 2012, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
20#ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_
21#define TESSERACT_WORDREC_LM_CONSISTENCY_H_
22
23#include <cstdint> // for INT16_MAX
24#include "dawg.h" // for EDGE_REF, NO_EDGE
25#include "dict.h" // for XH_GOOD, XH_INCONSISTENT, XHeightConsistencyEnum
26
27class BLOB_CHOICE;
28
29namespace tesseract {
30
// Printable names for the x-height consistency states, listed in the order
// XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT.
// NOTE(review): presumably indexed by an XHeightConsistencyEnum value, in
// which case this order must mirror the enum in dict.h -- confirm.
31static const char *const XHeightConsistencyEnumName[] = {
32 "XH_GOOD",
33 "XH_SUBNORMAL",
34 "XH_INCONSISTENT",
35};
36
37// Struct for keeping track of the consistency of the path.
40
41 // How much do characters have to be shifted away from normal parameters
42 // before we say they're not normal?
43 static const int kShiftThresh = 1;
44
45 // How much shifting from subscript to superscript and back
46 // before we declare shenanigans?
47 static const int kMaxEntropy = 1;
48
49 // Script positions - order important for entropy calculation.
// kNORM is used as an index into the per-position arrays (e.g. xht_lo[kNORM]),
// and kNumPos is the bound of the loop that initializes those arrays.
50 static const int kSUB = 0, kNORM = 1, kSUP = 2;
51 static const int kNumPos = 3;
52
// Constructs a consistency record.
//   parent_info: record to copy, or nullptr to start from scratch.
// With nullptr the counters, flags and x-height statistics are set to their
// starting values; otherwise every field is copied from *parent_info.
// NOTE(review): the embedded line numbers skip 60, 63, 67 and 78, so this
// listing may be missing some member initializations -- compare with the
// real header before relying on it.
53 explicit LMConsistencyInfo(const LMConsistencyInfo *parent_info) {
54 if (parent_info == nullptr) {
55 // Initialize from scratch.
56 num_alphas = 0;
57 num_digits = 0;
58 num_punc = 0;
59 num_other = 0;
61 punc_ref = NO_EDGE;
62 invalid_punc = false;
64 num_lower = 0;
65 script_id = 0;
66 inconsistent_script = false;
68 inconsistent_font = false;
69 // Initialize XHeight stats.
// Each position starts with zero counts and the range [0, 256].
70 for (int i = 0; i < kNumPos; i++) {
71 xht_count[i] = 0;
72 xht_count_punc[i] = 0;
73 xht_lo[i] = 0;
74 xht_hi[i] = 256; // kBlnCellHeight
75 }
76 xht_sp = -1; // This invalid value indicates that there was no parent.
77 xpos_entropy = 0;
79 } else {
80 // Copy parent info
// Member-wise copy through the assignment operator.
81 *this = *parent_info;
82 }
83 }
// Returns num_punc when invalid_punc is set, otherwise 0.
84 inline int NumInconsistentPunc() const {
85 return invalid_punc ? num_punc : 0;
86 }
// Const accessor returning an int.
// NOTE(review): the body (line 88 of the listing) is missing here, so the
// returned expression cannot be documented from this text.
87 inline int NumInconsistentCase() const {
89 }
// Returns a sum that starts with NumInconsistentPunc() + num_other.
// NOTE(review): the remainder of the expression (line 92 of the listing) is
// missing here.
90 inline int NumInconsistentChartype() const {
91 return (NumInconsistentPunc() + num_other +
93 }
// Returns true only if NumInconsistentPunc() and NumInconsistentCase() are
// both 0 and the remaining conditions hold.
// NOTE(review): those remaining conditions (lines 96-97 of the listing) are
// missing here.
94 inline bool Consistent() const {
95 return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
98 }
// Const accessor returning an int.
// NOTE(review): the body (line 100 of the listing) is missing here.
99 inline int NumInconsistentSpaces() const {
101 }
// Const accessor returning an int; BodyMinXHeight() and BodyMaxXHeight() treat
// a nonzero result as "x-height is inconsistent".
// NOTE(review): the body (line 103 of the listing) is missing here.
102 inline int InconsistentXHeight() const {
104 }
// Declaration only; no definition appears in this header.
// NOTE(review): presumably folds blob choice b into the xht_* statistics,
// with is_punc marking punctuation -- confirm against the implementation.
105 void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc);
// Returns xht_lo[kNORM], or 0.0f when InconsistentXHeight() is nonzero.
106 float BodyMinXHeight() const {
107 if (InconsistentXHeight()) {
108 return 0.0f;
109 }
110 return xht_lo[kNORM];
111 }
// Returns xht_hi[kNORM], or INT16_MAX converted to float when
// InconsistentXHeight() is nonzero.
112 float BodyMaxXHeight() const {
113 if (InconsistentXHeight()) {
114 return static_cast<float>(INT16_MAX);
115 }
116 return xht_hi[kNORM];
117 }
118
130 // Metrics clumped by position.
135 int16_t xht_sp;
140};
141
142} // namespace tesseract
143
144#endif // TESSERACT_WORDREC_LM_CONSISTENCY_H_
int64_t EDGE_REF
Definition: dawg.h:49
XHeightConsistencyEnum
Definition: dict.h:81
@ XH_GOOD
Definition: dict.h:81
@ XH_INCONSISTENT
Definition: dict.h:81
LMConsistencyInfo(const LMConsistencyInfo *parent_info)
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
static const int kShiftThresh
int16_t xht_count_punc[kNumPos]
XHeightConsistencyEnum xht_decision
static const int kMaxEntropy