tesseract v5.3.3.20231005
UnicodeText Class Reference

#include <unicodetext.h>

Classes

class  const_iterator
 
class  const_reverse_iterator
 

Public Types

typedef char32 value_type
 

Public Member Functions

 UnicodeText ()
 
 UnicodeText (const UnicodeText &src)
 
 UnicodeText (const const_iterator &first, const const_iterator &last)
 
UnicodeText & operator= (const UnicodeText &src)
 
UnicodeText & Copy (const UnicodeText &src)
 
UnicodeText & assign (const UnicodeText &src)
 
UnicodeText & PointTo (const UnicodeText &src)
 
UnicodeText & PointTo (const const_iterator &first, const const_iterator &last)
 
 ~UnicodeText ()
 
void clear ()
 
bool empty () const
 
void push_back (char32 codepoint)
 
template<typename ForwardIterator >
UnicodeText & append (ForwardIterator first, const ForwardIterator last)
 
UnicodeText & append (const const_iterator &first, const const_iterator &last)
 
UnicodeText & append (const UnicodeText &source)
 
int size () const
 
const_iterator begin () const
 
const_iterator end () const
 
const_reverse_iterator rbegin () const
 
const_reverse_iterator rend () const
 
const_iterator find (const UnicodeText &look, const_iterator start_pos) const
 
const_iterator find (const UnicodeText &look) const
 
bool HasReplacementChar () const
 
const char * utf8_data () const
 
int utf8_length () const
 
int utf8_capacity () const
 
UnicodeText & CopyUTF8 (const char *utf8_buffer, int byte_length)
 
UnicodeText & TakeOwnershipOfUTF8 (char *utf8_buffer, int byte_length, int byte_capacity)
 
UnicodeText & PointToUTF8 (const char *utf8_buffer, int byte_length)
 
const_iterator MakeIterator (const char *p) const
 
string DebugString () const
 

Static Public Member Functions

static string UTF8Substring (const const_iterator &first, const const_iterator &last)
 

Friends

class const_iterator
 
class UnicodeTextUtils
 
bool operator== (const UnicodeText &lhs, const UnicodeText &rhs)
 
bool operator!= (const UnicodeText &lhs, const UnicodeText &rhs)
 

Detailed Description

Copyright 2010 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Definition at line 116 of file unicodetext.h.

Member Typedef Documentation

◆ value_type

Definition at line 120 of file unicodetext.h.

Constructor & Destructor Documentation

◆ UnicodeText() [1/3]

UnicodeText::UnicodeText ( )

Definition at line 188 of file unicodetext.cc.

188{}

◆ UnicodeText() [2/3]

UnicodeText::UnicodeText ( const UnicodeText & src)

Definition at line 191 of file unicodetext.cc.

191 {
192 Copy(src);
193}
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216

◆ UnicodeText() [3/3]

UnicodeText::UnicodeText ( const const_iterator & first,
const const_iterator & last 
)

Definition at line 196 of file unicodetext.cc.

197 {
198 CHECK(first <= last) << " Incompatible iterators";
199 repr_.append(first.it_, last.it_ - first.it_);
200}
#define CHECK(condition)
Definition: include_gunit.h:76
LIST last(LIST var_list)
Definition: oldlist.cpp:153

◆ ~UnicodeText()

UnicodeText::~UnicodeText ( )

Definition at line 355 of file unicodetext.cc.

355{}

Member Function Documentation

◆ append() [1/3]

UnicodeText & UnicodeText::append ( const const_iterator & first,
const const_iterator & last 
)

Definition at line 288 of file unicodetext.cc.

288 {
289 CHECK(first <= last) << " Incompatible iterators";
290 repr_.append(first.it_, last.it_ - first.it_);
291 return *this;
292}

◆ append() [2/3]

UnicodeText & UnicodeText::append ( const UnicodeText & source)

Definition at line 283 of file unicodetext.cc.

283 {
284 repr_.append(u.repr_.data_, u.repr_.size_);
285 return *this;
286}

◆ append() [3/3]

template<typename ForwardIterator >
UnicodeText & UnicodeText::append ( ForwardIterator  first,
const ForwardIterator  last 
)
inline

Definition at line 163 of file unicodetext.h.

163 {
164 while (first != last) {
165 push_back(*first++);
166 }
167 return *this;
168 }
void push_back(char32 codepoint)
Definition: unicodetext.cc:357

◆ assign()

UnicodeText & UnicodeText::assign ( const UnicodeText & src)
inline

Definition at line 134 of file unicodetext.h.

134 {
135 return Copy(src);
136 }

◆ begin()

UnicodeText::const_iterator UnicodeText::begin ( ) const

Definition at line 408 of file unicodetext.cc.

408 {
409 return const_iterator(repr_.data_);
410}
friend class const_iterator
Definition: unicodetext.h:348

◆ clear()

void UnicodeText::clear ( )

Definition at line 350 of file unicodetext.cc.

350 {
351 repr_.clear();
352}

◆ Copy()

UnicodeText & UnicodeText::Copy ( const UnicodeText & src)

Definition at line 216 of file unicodetext.cc.

216 {
217 repr_.Copy(src.repr_.data_, src.repr_.size_);
218 return *this;
219}

◆ CopyUTF8()

UnicodeText & UnicodeText::CopyUTF8 ( const char *  utf8_buffer,
int  byte_length 
)

Definition at line 221 of file unicodetext.cc.

221 {
222 repr_.Copy(buffer, byte_length);
223 if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
224 LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
225 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
226 }
227 return *this;
228}
@ LOG
@ WARNING
Definition: log.h:28
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33

◆ DebugString()

string UnicodeText::DebugString ( ) const

◆ empty()

bool UnicodeText::empty ( ) const
inline

Definition at line 146 of file unicodetext.h.

146 {
147 return repr_.size_ == 0;
148 } // Test if text is empty.

◆ end()

UnicodeText::const_iterator UnicodeText::end ( ) const

Definition at line 412 of file unicodetext.cc.

412 {
413 return const_iterator(repr_.data_ + repr_.size_);
414}

◆ find() [1/2]

UnicodeText::const_iterator UnicodeText::find ( const UnicodeText & look) const

Definition at line 308 of file unicodetext.cc.

308 {
309 return UnsafeFind(look, begin());
310}
const_iterator begin() const
Definition: unicodetext.cc:408

◆ find() [2/2]

UnicodeText::const_iterator UnicodeText::find ( const UnicodeText & look,
const_iterator  start_pos 
) const

Definition at line 301 of file unicodetext.cc.

302 {
303 CHECK_GE(start_pos.utf8_data(), utf8_data());
304 CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
305 return UnsafeFind(look, start_pos);
306}
#define CHECK_GE(test, value)
Definition: include_gunit.h:80
#define CHECK_LE(test, value)
Definition: include_gunit.h:83
int utf8_length() const
Definition: unicodetext.h:308
const char * utf8_data() const
Definition: unicodetext.h:305

◆ HasReplacementChar()

bool UnicodeText::HasReplacementChar ( ) const

◆ MakeIterator()

UnicodeText::const_iterator UnicodeText::MakeIterator ( const char *  p) const

Definition at line 484 of file unicodetext.cc.

484 {
485 CHECK(p != nullptr);
486 const char *start = utf8_data();
487 int len = utf8_length();
488 const char *end = start + len;
489 CHECK(p >= start);
490 CHECK(p <= end);
491 CHECK(p == end || !UniLib::IsTrailByte(*p));
492 return const_iterator(p);
493}
const char * p
bool IsTrailByte(char x)
const_iterator end() const
Definition: unicodetext.cc:412

◆ operator=()

UnicodeText & UnicodeText::operator= ( const UnicodeText & src)

Definition at line 209 of file unicodetext.cc.

209 {
210 if (this != &src) {
211 Copy(src);
212 }
213 return *this;
214}

◆ PointTo() [1/2]

UnicodeText & UnicodeText::PointTo ( const const_iterator & first,
const const_iterator & last 
)

Definition at line 275 of file unicodetext.cc.

275 {
276 CHECK(first <= last) << " Incompatible iterators";
277 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
278 return *this;
279}

◆ PointTo() [2/2]

UnicodeText & UnicodeText::PointTo ( const UnicodeText & src)

Definition at line 270 of file unicodetext.cc.

270 {
271 repr_.PointTo(src.repr_.data_, src.repr_.size_);
272 return *this;
273}

◆ PointToUTF8()

UnicodeText & UnicodeText::PointToUTF8 ( const char *  utf8_buffer,
int  byte_length 
)

Definition at line 254 of file unicodetext.cc.

254 {
255 if (UniLib::IsInterchangeValid(buffer, byte_length)) {
256 repr_.PointTo(buffer, byte_length);
257 } else {
258 LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
259 repr_.Copy(buffer, byte_length);
260 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
261 }
262 return *this;
263}

◆ push_back()

void UnicodeText::push_back ( char32  codepoint)

Definition at line 357 of file unicodetext.cc.

357 {
358 if (UniLib::IsValidCodepoint(c)) {
359 char buf[UTFmax];
360 int len = runetochar(buf, &c);
361 if (UniLib::IsInterchangeValid(buf, len)) {
362 repr_.append(buf, len);
363 } else {
364 LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
365 repr_.append(" ", 1);
366 }
367 } else {
368 LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
369 repr_.append(" ", 1);
370 }
371}
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244
@ UTFmax
Definition: utf.h:22
bool IsValidCodepoint(char32 c)

◆ rbegin()

const_reverse_iterator UnicodeText::rbegin ( ) const
inline

Definition at line 283 of file unicodetext.h.

283 {
284 return const_reverse_iterator(end());
285 }

◆ rend()

const_reverse_iterator UnicodeText::rend ( ) const
inline

Definition at line 286 of file unicodetext.h.

286 {
287 return const_reverse_iterator(begin());
288 }

◆ size()

int UnicodeText::size ( ) const

Definition at line 373 of file unicodetext.cc.

373 {
374 return CodepointCount(repr_.data_, repr_.size_);
375}

◆ TakeOwnershipOfUTF8()

UnicodeText & UnicodeText::TakeOwnershipOfUTF8 ( char *  utf8_buffer,
int  byte_length,
int  byte_capacity 
)

Definition at line 237 of file unicodetext.cc.

237 {
238 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
239 if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
240 LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
241 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
242 }
243 return *this;
244}

◆ utf8_capacity()

int UnicodeText::utf8_capacity ( ) const
inline

Definition at line 311 of file unicodetext.h.

311 {
312 return repr_.capacity_;
313 }

◆ utf8_data()

const char * UnicodeText::utf8_data ( ) const
inline

Definition at line 305 of file unicodetext.h.

305 {
306 return repr_.data_;
307 }

◆ utf8_length()

int UnicodeText::utf8_length ( ) const
inline

Definition at line 308 of file unicodetext.h.

308 {
309 return repr_.size_;
310 }

◆ UTF8Substring()

string UnicodeText::UTF8Substring ( const const_iterator & first,
const const_iterator & last 
)
static

Definition at line 202 of file unicodetext.cc.

202 {
203 CHECK(first <= last) << " Incompatible iterators";
204 return string(first.it_, last.it_ - first.it_);
205}

Friends And Related Function Documentation

◆ const_iterator

friend class const_iterator
friend

Definition at line 348 of file unicodetext.h.

◆ operator!=

bool operator!= ( const UnicodeText & lhs,
const UnicodeText & rhs 
)
friend

Definition at line 397 of file unicodetext.h.

397 {
398 return !(lhs == rhs);
399}

◆ operator==

bool operator== ( const UnicodeText & lhs,
const UnicodeText & rhs 
)
friend

Definition at line 377 of file unicodetext.cc.

377 {
378 if (&lhs == &rhs)
379 return true;
380 if (lhs.repr_.size_ != rhs.repr_.size_)
381 return false;
382 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
383}

◆ UnicodeTextUtils

friend class UnicodeTextUtils
friend

Definition at line 349 of file unicodetext.h.


The documentation for this class was generated from the following files: