// Boolean variable declared through BOOL_VAR with a default of false.
// Per its own help text, turning it on makes the analysis assume that
// word-delimiter spaces have variable width even though the characters are
// fixed pitch. It is consulted inside is_good_pitch().
30static BOOL_VAR(textord_space_size_is_variable,
false,
31 "If true, word delimiter spaces are assumed to have "
32 "variable width, even though characters have fixed pitch.");
// Relative tolerance (10%) applied to a pitch value. is_good_pitch() rejects
// boxes whose width or height reaches pitch * (1.0 + kFPTolerance) and tests
// |real_pitch - pitch| < pitch * kFPTolerance; the row pitch estimation uses
// the same bound, kFPTolerance * estimated_pitch_.
35static const float kFPTolerance = 0.1f;
// Fraction of all measured pitches used as a cut-off for the number of
// "good" pitches: the row output code compares good_pitches_.size() against
// all_pitches_.size() * kFixedPitchThreshold.
39static const float kFixedPitchThreshold = 0.35f;
53 values_.push_back(
value);
58 std::sort(values_.begin(), values_.end());
62 float ile(
double frac) {
66 if (values_.empty()) {
70 return values_.back();
72 if (frac <= 0.0 || values_.size() == 1) {
75 int index =
static_cast<int>((values_.size() - 1) * frac);
76 float reminder = (values_.size() - 1) * frac - index;
78 return values_[index] * (1.0f - reminder) + values_[index + 1] * reminder;
89 if (values_.empty()) {
96 return values_.empty();
100 return values_.size();
// State flag, initially false.
// NOTE(review): presumably set once the samples have been sorted; the code
// that writes it is not visible here -- confirm.
104 bool finalized_ =
false;
// Collected sample values. Entries are appended with push_back() and the
// vector is sorted with std::sort() before the percentile lookup indexes it.
105 std::vector<float> values_;
122 std::sort(values_.begin(), values_.end(), float_pair_compare);
135 values_.push_back(
value);
141 unsigned start = 0, end = values_.size();
144 while (start < values_.size() && values_[start].x <
x * (1 - r)) {
147 while (end > 0 && values_[end - 1].
x >
x * (1 + r)) {
155 end = values_.size();
161 for (
auto i = start;
i < end;
i++) {
162 rc += values_[
i].vote *
x * values_[
i].y / values_[
i].x;
163 vote += values_[
i].vote;
166 return vote == 0 ? 0.0f : rc / vote;
170 static bool float_pair_compare(
const float_pair f_a,
const float_pair f_b) {
171 return f_a.x < f_b.x;
// Samples of the correlation. Each float_pair is accessed through its x, y
// and vote fields; the vector is sorted by x using float_pair_compare.
175 std::vector<struct float_pair> values_;
193 , merge_to_prev_(false)
194 , delete_flag_(false) {}
207 int gap = real_body_.
x_gap(
next.real_body_);
208 if (gap > max_gap_) {
213 real_body_ +=
next.real_body_;
215 num_blobs_ +=
next.num_blobs_;
244 return merge_to_prev_;
247 merge_to_prev_ = flag;
// Default constructor: value-initializes the five statistics accumulators and
// the character list. Members such as estimated_pitch_, height_ and real_row_
// are covered by their in-class initializers instead.
290 FPRow() : all_pitches_(), all_gaps_(), good_pitches_(), good_gaps_(), heights_(), characters_() {}
329 return good_pitches_.
size();
337 return estimated_pitch_;
341 estimated_pitch_ = v;
349 if (good_pitches_.
size() < 2) {
352 return height_ / good_pitches_.
median();
360 return characters_.size();
363 return &characters_[
i];
367 return characters_[
i].box();
371 return characters_[
i].real_body();
375 return !(characters_[
i].box() == characters_[
i].real_body());
379 return (characters_[
i].
box().left() + characters_[
i].
box().right()) / 2.0;
383 return characters_[
i].is_final();
387 characters_[
i].set_final(
true);
407 static float x_overlap_fraction(
const TBOX &box1,
const TBOX &box2) {
408 if (std::min(box1.
width(), box2.
width()) == 0) {
411 return -box1.
x_gap(box2) /
static_cast<float>(std::min(box1.
width(), box2.
width()));
414 static bool mostly_overlap(
const TBOX &box1,
const TBOX &box2) {
415 return x_overlap_fraction(box1, box2) > 0.9;
418 static bool significant_overlap(
const TBOX &box1,
const TBOX &box2) {
419 if (std::min(box1.width(), box2.width()) == 0) {
422 int overlap = -box1.x_gap(box2);
423 return overlap > 1 || x_overlap_fraction(box1, box2) > 0.1;
426 static float box_pitch(
const TBOX &ref,
const TBOX &
box) {
431 static bool is_good_pitch(
float pitch,
const TBOX &box1,
const TBOX &box2) {
433 if (box1.width() >=
pitch * (1.0 + kFPTolerance) ||
434 box2.width() >=
pitch * (1.0 + kFPTolerance) ||
435 box1.height() >=
pitch * (1.0 + kFPTolerance) ||
436 box2.height() >=
pitch * (1.0 + kFPTolerance)) {
440 const float real_pitch = box_pitch(box1, box2);
441 if (std::fabs(real_pitch -
pitch) <
pitch * kFPTolerance) {
445 if (textord_space_size_is_variable) {
448 if (real_pitch >
pitch && real_pitch <
pitch * 2.0 && real_pitch - box1.x_gap(box2) <
pitch) {
455 static bool is_interesting_blob(
const BLOBNBOX *blob) {
456 return !blob->joined_to_prev() && blob->flow() !=
BTFT_LEADER;
462 for (
unsigned i = 0;
i < characters_.size(); ++
i) {
463 if (!characters_[
i].delete_flag()) {
465 characters_[index] = characters_[
i];
470 characters_.resize(index);
// Pitch estimate for this row, written by set_estimated_pitch(). The value
// 0.0f acts as "no estimate": users test `> 0.0f` / `== 0.0f` before relying
// on it.
474 float estimated_pitch_ = 0.0f;
// Representative character height; assigned from heights_.ile(0.875).
476 float height_ = 0.0f;
// Statistics over every measured pitch. Its median is the fallback pitch when
// fewer than two good pitches are available.
480 SimpleStats all_pitches_;
// Statistics over every measured gap; ile(0.125) is the fallback gap.
482 SimpleStats all_gaps_;
// Statistics over the pitches accepted as "good"; the median is the preferred
// pitch value.
485 SimpleStats good_pitches_;
// Statistics over the gaps that go with good pitches; ile(0.125) is the
// preferred gap value.
488 SimpleStats good_gaps_;
// Height samples from which height_ is derived.
490 SimpleStats heights_;
// Characters of the row. Overlapping blobs are merged into the last entry
// while the row is built, and entries whose delete flag is set are compacted
// away later.
492 std::vector<FPChar> characters_;
// The TO_ROW this object analyses; its xheight and char_cells are used when
// results are written out. Null until assigned.
493 TO_ROW *real_row_ =
nullptr;
505 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
506 if (is_interesting_blob(blob_it.data())) {
508 fp_char.
Init(blob_it.data());
510 if (!characters_.empty() && significant_overlap(fp_char.
box(), characters_.back().box())) {
511 characters_.back().Merge(fp_char);
513 characters_.push_back(fp_char);
515 TBOX bound = blob_it.data()->bounding_box();
522 height_ = heights_.
ile(0.875);
526 if (good_pitches_.
empty()) {
532 pitch_ = good_pitches_.
median();
538 std::min(good_gaps_.
ile(0.125), std::max(pitch_ - height_, 0.0f));
541 if (good_pitches_.
size() < all_pitches_.
size() * kFixedPitchThreshold) {
550 }
else if (good_pitches_.
size() > all_pitches_.
size() * 0.75) {
564 std::max(pitch_ * 0.25 + good_gaps_.
minimum(),
static_cast<double>(good_gaps_.
ile(0.875)));
567 static_cast<int>(real_row_->
xheight));
577 static_cast<int>(real_row_->
xheight));
581 ICOORDELT_IT cell_it = &real_row_->
char_cells;
583 cell_it.add_after_then_move(cell);
594 cell_it.add_after_then_move(cell);
595 while (right + pitch_ <
box(
i).left()) {
598 cell_it.add_after_then_move(cell);
603 cell_it.add_after_then_move(cell);
608 cell_it.add_after_then_move(cell);
616 good_pitches_.
Clear();
617 all_pitches_.
Clear();
626 bool prev_was_good =
is_good(0);
632 int32_t
pitch = cx1 - cx0;
638 if (
pitch > height_ * 0.5) {
649 (prev_was_good && std::fabs(estimated_pitch_ -
pitch) < kFPTolerance * estimated_pitch_)) {
655 prev_was_good =
true;
657 prev_was_good =
false;
669 height_ = heights_.
ile(0.875);
670 if (all_pitches_.
empty()) {
673 }
else if (good_pitches_.
size() < 2) {
676 pitch_ = all_pitches_.
median();
678 gap_ = all_gaps_.
ile(0.125);
680 pitch_ = good_pitches_.
median();
682 gap_ = good_gaps_.
ile(0.125);
689 "Row %d: pitch_decision=%d, fixed_pitch=%f, max_nonspace=%d, "
690 "space_size=%f, space_threshold=%d, xheight=%f\n",
708 if (estimated_pitch_ > 0.0f) {
710 if (is_good_pitch(estimated_pitch_,
box(
i - 2),
box(
i - 1)) &&
711 is_good_pitch(estimated_pitch_,
box(
i - 1),
box(
i))) {
727 bool changed =
false;
728 if (
num_chars() <= 1 || estimated_pitch_ == 0.0f) {
737 bool intersecting =
false;
738 bool not_intersecting =
false;
744 bool skipped_whitespaces =
false;
745 float c1 =
center_x(
i + 1) - 1.5 * estimated_pitch_;
746 while (c1 >
box(
i).right()) {
747 skipped_whitespaces =
true;
748 c1 -= estimated_pitch_;
750 TBOX ibody(c1,
box(
i).bottom(), c1 + estimated_pitch_,
box(
i).top());
756 while (j >= 0 && !
is_final(j) && mostly_overlap(ibody,
box(j)) &&
762 if (j >= 0 && significant_overlap(ibody,
box(j))) {
769 not_intersecting =
true;
775 if (!skipped_whitespaces) {
781 if (
box(
i).width() <= estimated_pitch_ * 0.5) {
788 for (
int k =
i; k > j + 1; k--) {
799 bool skipped_whitespaces =
false;
800 float c1 =
center_x(
i - 1) + 1.5 * estimated_pitch_;
801 while (c1 <
box(
i).left()) {
802 skipped_whitespaces =
true;
803 c1 += estimated_pitch_;
805 TBOX ibody(c1 - estimated_pitch_,
box(
i).bottom(), c1,
box(
i).top());
815 if (j <
num_chars() && significant_overlap(ibody,
box(j))) {
820 not_intersecting =
true;
823 if (!skipped_whitespaces) {
826 if (
box(
i).width() <= estimated_pitch_ * 0.5) {
833 for (
size_t k =
i + 1; k < j; k++) {
843 if (intersecting && !not_intersecting) {
857 for (
size_t j = 0; j <
num_chars(); ++j) {
884 TBOX ibody(cx - 0.5 * row_pitch, 0, cx + 0.5 * row_pitch, 1);
889 if (x_overlap_fraction(ibody,
box(
i - 1)) > 0.1) {
894 merged +=
box(
i - 1);
895 if (merged.
width() < row_pitch) {
904 if (x_overlap_fraction(ibody,
box(
i + 1)) > 0.1) {
909 merged +=
box(
i + 1);
910 if (merged.
width() < row_pitch) {
927 bool good_pitch =
false;
928 bool bad_pitch =
false;
930 if (is_good_pitch(row_pitch,
box(
i - 1),
box(
i))) {
937 if (is_good_pitch(row_pitch,
box(
i),
box(
i + 1))) {
943 if (good_pitch && !bad_pitch) {
945 }
else if (!good_pitch && bad_pitch) {
957 for (
auto &row : rows_) {
968 if (rows_.empty() || rows_.size() <= num_bad_rows_ + num_tall_rows_ + 1) {
975 for (
auto &row : rows_) {
976 row.MergeFragments();
981 for (
auto &row : rows_) {
982 row.FinalizeLargeChars();
987 bool changed =
false;
988 for (
auto &row : rows_) {
989 if (row.Pass2Analyze()) {
997 for (
auto &row : rows_) {
998 row.OutputEstimations();
1004 tprintf(
"FPAnalyzer: final result\n");
1005 for (
size_t i = 0;
i < rows_.size();
i++) {
1006 rows_[
i].DebugOutputResult(
i);
1011 return rows_.size();
1018 return max_chars_per_row_ + 100;
// One FPRow per TO_ROW found in the input blocks; filled by the constructor.
1023 std::vector<FPRow> rows_;
// Row counters. The sum num_bad_rows_ + num_tall_rows_ + 1 is compared with
// rows_.size() to decide whether enough usable rows remain.
// NOTE(review): the statements that increment these two counters are not
// visible in this excerpt -- confirm how they are maintained.
1024 unsigned num_tall_rows_;
1025 unsigned num_bad_rows_;
// Initialized to 0 in the constructor.
// NOTE(review): presumably counts rows with num_chars() <= 1 -- confirm.
1027 unsigned num_empty_rows_;
// Largest num_chars() seen over all rows while they were being added.
1028 unsigned max_chars_per_row_;
1035 , num_empty_rows_(0)
1036 , max_chars_per_row_(0) {
1037 TO_BLOCK_IT block_it(port_blocks);
1039 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
1047 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
1048 TO_ROW_IT row_it = block_it.data()->get_rows();
1049 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1051 row.
Init(row_it.data());
1052 rows_.push_back(row);
1053 size_t num_chars = rows_.back().num_chars();
1054 if (num_chars <= 1) {
1057 if (num_chars > max_chars_per_row_) {
1058 max_chars_per_row_ = num_chars;
1069 pitch_height_stats.
Clear();
1070 for (
auto &row : rows_) {
1071 row.EstimatePitch(pass1);
1072 if (row.good_pitches()) {
1073 pitch_height_stats.
Add(row.height() + row.gap(), row.pitch(), row.good_pitches());
1074 if (row.height_pitch_ratio() > 1.1) {
1082 pitch_height_stats.
Finish();
1083 for (
auto &row : rows_) {
1084 if (row.good_pitches() >= 5) {
1087 row.set_estimated_pitch(row.pitch());
1088 }
else if (row.num_chars() > 1) {
1089 float estimated_pitch = pitch_height_stats.
EstimateYFor(row.height() + row.gap(), 0.1f);
1094 if (estimated_pitch > row.pitch() || row.pitch() > row.height() * 2.0) {
1095 row.set_estimated_pitch(estimated_pitch);
1097 row.set_estimated_pitch(row.pitch());
1120 tprintf(
"Page doesn't seem to contain fixed pitch rows\n");
1125 unsigned iteration = 0;
1134 tprintf(
"compute_fixed_pitch_cjk finished after %u iteration (limit=%u)\n", iteration,
#define BOOL_VAR(name, val, comment)
void tprintf(const char *format,...)
void find_repeated_chars(TO_BLOCK *block, bool testing_on)
bool textord_debug_pitch_test
void compute_fixed_pitch_cjk(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
const TBOX & bounding_box() const
ICOORDELT_LIST char_cells
BLOBNBOX_LIST * blob_list()
PITCH_TYPE pitch_decision
TDimension height() const
int x_gap(const TBOX &box) const
TBOX bounding_union(const TBOX &box) const
void Add(float x, float y, int v)
~LocalCorrelation()=default
float EstimateYFor(float x, float r)
bool merge_to_prev() const
void Merge(const FPChar &next)
void set_delete_flag(bool flag)
void set_merge_to_prev(bool flag)
const Alignment & alignment() const
void set_box(const TBOX &box)
void set_alignment(Alignment alignment)
const TBOX & real_body() const
void Init(BLOBNBOX *blob)
void set_final(bool flag)
void FinalizeLargeChars()
bool is_box_modified(int i)
void DebugOutputResult(int row_index)
FPChar * character(int i)
void clear_alignment(int i)
void EstimatePitch(bool pass1)
const TBOX & real_body(int i)
float height_pitch_ratio()
void set_estimated_pitch(float v)
void EstimatePitch(bool pass1)
FPAnalyzer(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
void FinalizeLargeChars()