tesseract v5.3.3.20231005
tesseract::CHAR_FRAGMENT Class Reference

#include <unicharset.h>

Public Member Functions

void set_all (const char *unichar, int pos, int total, bool natural)
 
void set_unichar (const char *uch)
 
void set_pos (int p)
 
void set_total (int t)
 
const char * get_unichar () const
 
int get_pos () const
 
int get_total () const
 
std::string to_string () const
 
bool equals (const char *other_unichar, int other_pos, int other_total) const
 
bool equals (const CHAR_FRAGMENT *other) const
 
bool is_continuation_of (const CHAR_FRAGMENT *fragment) const
 
bool is_beginning () const
 
bool is_ending () const
 
bool is_natural () const
 
void set_natural (bool value)
 

Static Public Member Functions

static std::string to_string (const char *unichar, int pos, int total, bool natural)
 
static CHAR_FRAGMENT * parse_from_string (const char *str)
 

Static Public Attributes

static const int kMinLen = 6
 
static const int kMaxLen = 3 + UNICHAR_LEN + 2
 
static const int kMaxChunks = 5
 

Detailed Description

Definition at line 50 of file unicharset.h.

Member Function Documentation

◆ equals() [1/2]

bool tesseract::CHAR_FRAGMENT::equals ( const char *  other_unichar,
int  other_pos,
int  other_total 
) const
inline

Definition at line 97 of file unicharset.h.

98 {
99 return (strcmp(this->unichar, other_unichar) == 0 &&
100 this->pos == other_pos && this->total == other_total);
101 }

◆ equals() [2/2]

bool tesseract::CHAR_FRAGMENT::equals ( const CHAR_FRAGMENT *  other) const
inline

Definition at line 102 of file unicharset.h.

102 {
103 return this->equals(other->get_unichar(), other->get_pos(),
104 other->get_total());
105 }
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:97

◆ get_pos()

int tesseract::CHAR_FRAGMENT::get_pos ( ) const
inline

Definition at line 79 of file unicharset.h.

79 {
80 return this->pos;
81 }

◆ get_total()

int tesseract::CHAR_FRAGMENT::get_total ( ) const
inline

Definition at line 82 of file unicharset.h.

82 {
83 return this->total;
84 }

◆ get_unichar()

const char * tesseract::CHAR_FRAGMENT::get_unichar ( ) const
inline

Definition at line 76 of file unicharset.h.

76 {
77 return this->unichar;
78 }

◆ is_beginning()

bool tesseract::CHAR_FRAGMENT::is_beginning ( ) const
inline

Definition at line 116 of file unicharset.h.

116 {
117 return this->pos == 0;
118 }

◆ is_continuation_of()

bool tesseract::CHAR_FRAGMENT::is_continuation_of ( const CHAR_FRAGMENT *  fragment) const
inline

Definition at line 109 of file unicharset.h.

109 {
110 return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
111 this->total == fragment->get_total() &&
112 this->pos == fragment->get_pos() + 1);
113 }

◆ is_ending()

bool tesseract::CHAR_FRAGMENT::is_ending ( ) const
inline

Definition at line 121 of file unicharset.h.

121 {
122 return this->pos == this->total - 1;
123 }

◆ is_natural()

bool tesseract::CHAR_FRAGMENT::is_natural ( ) const
inline

Definition at line 128 of file unicharset.h.

128 {
129 return natural;
130 }

◆ parse_from_string()

CHAR_FRAGMENT * tesseract::CHAR_FRAGMENT::parse_from_string ( const char *  str)
static

Definition at line 1103 of file unicharset.cpp.

1103 {
1104 const char *ptr = string;
1105 int len = strlen(string);
1106 if (len < kMinLen || *ptr != kSeparator) {
1107 return nullptr; // this string cannot represent a fragment
1108 }
1109 ptr++; // move to the next character
1110 int step = 0;
1111 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1112 step += UNICHAR::utf8_step(ptr + step);
1113 }
1114 if (step == 0 || step > UNICHAR_LEN) {
1115 return nullptr; // no character for unichar or the character is too long
1116 }
1117 char unichar[UNICHAR_LEN + 1];
1118 strncpy(unichar, ptr, step);
1119 unichar[step] = '\0'; // null terminate unichar
1120 ptr += step; // move to the next fragment separator
1121 int pos = 0;
1122 int total = 0;
1123 bool natural = false;
1124 char *end_ptr = nullptr;
1125 for (int i = 0; i < 2; i++) {
1126 if (ptr > string + len || *ptr != kSeparator) {
1127 if (i == 1 && *ptr == kNaturalFlag) {
1128 natural = true;
1129 } else {
1130 return nullptr; // Failed to parse fragment representation.
1131 }
1132 }
1133 ptr++; // move to the next character
1134 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1135 : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1136 ptr = end_ptr;
1137 }
1138 if (ptr != string + len) {
1139 return nullptr; // malformed fragment representation
1140 }
1141 auto *fragment = new CHAR_FRAGMENT();
1142 fragment->set_all(unichar, pos, total, natural);
1143 return fragment;
1144 }
#define UNICHAR_LEN
Definition: unichar.h:31
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143
static const int kMinLen
Definition: unicharset.h:53

◆ set_all()

void tesseract::CHAR_FRAGMENT::set_all ( const char *  unichar,
int  pos,
int  total,
bool  natural 
)
inline

Definition at line 60 of file unicharset.h.

60 {
61 set_unichar(unichar);
62 set_pos(pos);
63 set_total(total);
64 set_natural(natural);
65 }
void set_unichar(const char *uch)
Definition: unicharset.h:66
void set_natural(bool value)
Definition: unicharset.h:131

◆ set_natural()

void tesseract::CHAR_FRAGMENT::set_natural ( bool  value)
inline

Definition at line 131 of file unicharset.h.

131 {
132 natural = value;
133 }
int value

◆ set_pos()

void tesseract::CHAR_FRAGMENT::set_pos ( int  p)
inline

Definition at line 70 of file unicharset.h.

70 {
71 this->pos = p;
72 }
const char * p

◆ set_total()

void tesseract::CHAR_FRAGMENT::set_total ( int  t)
inline

Definition at line 73 of file unicharset.h.

73 {
74 this->total = t;
75 }

◆ set_unichar()

void tesseract::CHAR_FRAGMENT::set_unichar ( const char *  uch)
inline

Definition at line 66 of file unicharset.h.

66 {
67 strncpy(this->unichar, uch, sizeof(this->unichar));
68 this->unichar[UNICHAR_LEN] = '\0';
69 }

◆ to_string() [1/2]

std::string tesseract::CHAR_FRAGMENT::to_string ( ) const
inline

Definition at line 91 of file unicharset.h.

91 {
92 return to_string(unichar, pos, total, natural);
93 }
std::string to_string() const
Definition: unicharset.h:91

◆ to_string() [2/2]

std::string tesseract::CHAR_FRAGMENT::to_string ( const char *  unichar,
int  pos,
int  total,
bool  natural 
)
static

Definition at line 1088 of file unicharset.cpp.

1089 {
1090 if (total == 1) {
1091 return std::string(unichar);
1092 }
1093 std::string result;
1094 result += kSeparator;
1095 result += unichar;
1096 char buffer[kMaxLen];
1097 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1098 natural ? kNaturalFlag : kSeparator, total);
1099 result += buffer;
1100 return result;
1101 }
static const int kMaxLen
Definition: unicharset.h:55

Member Data Documentation

◆ kMaxChunks

const int tesseract::CHAR_FRAGMENT::kMaxChunks = 5
static

Definition at line 57 of file unicharset.h.

◆ kMaxLen

const int tesseract::CHAR_FRAGMENT::kMaxLen = 3 + UNICHAR_LEN + 2
static

Definition at line 55 of file unicharset.h.

◆ kMinLen

const int tesseract::CHAR_FRAGMENT::kMinLen = 6
static

Definition at line 53 of file unicharset.h.


The documentation for this class was generated from the following files: