tesseract-ocr.github.io/5.3.3/a01604_source.html

#include "include_gunit.h"

#include "util/utf8/unicodetext.h"


#include <string.h>  // for memcpy, NULL, memcmp, etc

#include <algorithm> // for max


//#include "base/logging.h"               // for operator<<, CHECK, etc

//#include "base/stringprintf.h"          // for StringPrintf, StringAppendF

//#include "strings/stringpiece.h"        // for StringPiece, etc


#include "third_party/utf/utf.h"         // for isvalidcharntorune, etc

#include "util/utf8/unilib.h"            // for IsInterchangeValid, etc

#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen


// Counts the UTF-8 code points that start inside the byte range [start, end).
// A byte starts a code point exactly when it is not a continuation byte
// (bit pattern 10xxxxxx, i.e. 0x80..0xBF), so the count of such bytes is the
// number of code points. Returns 0 when the range is empty or when `end`
// precedes `start`.
static int CodepointDistance(const char *start, const char *end) {
  int codepoints = 0;
  const char *cursor = start;
  while (cursor < end) {
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    // ASCII and lead bytes are everything outside 0x80..0xBF.
    if ((byte & 0xC0) != 0x80) {
      ++codepoints;
    }
    ++cursor;
  }
  return codepoints;
}


// Counts the code points in the `len` bytes of UTF-8 that begin at `utf8`,
// by delegating to CodepointDistance over [utf8, utf8 + len).
static int CodepointCount(const char *utf8, int len) {
  const char *const limit = utf8 + len;
  return CodepointDistance(utf8, limit);
}


// Returns the number of code points between two iterator positions, measured
// over the UTF-8 bytes from `first` up to (not including) `last`.
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first,
                                                      const UnicodeText::const_iterator &last) {
  const char *const from = first.it_;
  const char *const to = last.it_;
  return CodepointDistance(from, to);
}


// ---------- Utility ----------


// Rewrites the `len` bytes at `start` in place so that every stretch that is
// not interchange-valid becomes a single space, and returns the number of
// bytes that remain.
//
// In this file it is reached from CopyUTF8, TakeOwnershipOfUTF8 and
// PointToUTF8 after they have found the buffer not to be interchange valid
// and logged a warning.
//
// Two kinds of offending input are handled:
//   * a character that is structurally valid UTF-8 but not interchange valid
//     is replaced, as a whole, by one space. E.g. "\xEF\xB7\x90" (U+FDD0)
//     yields one space, not three;
//   * a structurally invalid byte is replaced by one space.
//
// The output never grows past the input, so compacting in place is safe:
// the write position never overtakes the read position.
static int ConvertToInterchangeValid(char *start, int len) {
  char *const begin = start;
  char *const limit = start + len;
  char *write = start;
  char *read = start;
  while (read < limit) {
    // Keep the longest interchange-valid prefix of what is left.
    const int valid_bytes = UniLib::SpanInterchangeValid(read, limit - read);
    if (valid_bytes > 0) {
      if (write != read) {
        // Source and destination may overlap once earlier input has shrunk.
        memmove(write, read, valid_bytes);
      }
      write += valid_bytes;
      read += valid_bytes;
      if (read == limit) {
        break;
      }
    }
    // `read` now sits on something unacceptable. Decide how much to drop:
    // a whole character if it decodes, otherwise a single stray byte.
    char32 rune;
    int rune_bytes;
    const bool decodes = isvalidcharntorune(read, limit - read, &rune, &rune_bytes);
    read += decodes ? rune_bytes : 1;
    *write++ = ' ';
  }
  return write - begin;
}


// *************** Data representation **********


// Note: the copy constructor is undefined.


// After reserve(), resize(), or clear(), we're an owner, not an alias.


// Makes this representation the owner of a buffer holding at least
// `new_capacity` bytes. Nothing happens when we already own a buffer that is
// large enough. Otherwise (too small, or merely aliasing someone else's
// memory) a fresh buffer is allocated, the current `size_` bytes are copied
// into it, and a previously owned buffer is released. `size_` is unchanged.
void UnicodeText::Repr::reserve(int new_capacity) {
  const bool already_sufficient = ours_ && capacity_ >= new_capacity;
  if (already_sufficient) {
    return;
  }

  // Grow by at least 1.5x + 20 so repeated appends do not reallocate each time.
  const int grown = (3 * capacity_) / 2 + 20;
  capacity_ = std::max(new_capacity, grown);
  char *const replacement = new char[capacity_];

  if (data_ != nullptr) {
    memcpy(replacement, data_, size_);
    if (ours_) {
      delete[] data_; // Only free memory we own; aliased memory is left alone.
    }
  }
  data_ = replacement;
  ours_ = true;
}


// Sets the logical size to `new_size` bytes and makes us the owner of the
// buffer. A size of zero is handled by clear(). When the text grows, the
// newly exposed bytes are zero-filled.
void UnicodeText::Repr::resize(int new_size) {
  if (new_size == 0) {
    clear();
    return;
  }

  // An alias must be converted into an owned buffer even if it is big enough.
  const bool must_reallocate = !ours_ || new_size > capacity_;
  if (must_reallocate) {
    reserve(new_size);
  }
  if (size_ < new_size) {
    // Zero the bytes between the old and the new end.
    memset(data_ + size_, 0, new_size - size_);
  }
  size_ = new_size;
  ours_ = true;
}


// Resets to the empty state: no buffer, zero size, zero capacity, and flagged
// as owner. An owned buffer is deallocated here; that is a choice rather than
// a necessity, since setting size_ to 0 alone would also empty the text.
void UnicodeText::Repr::clear() {
  if (ours_) {
    delete[] data_;
  }
  data_ = nullptr;
  capacity_ = 0;
  size_ = 0;
  ours_ = true;
}


// Replaces the contents with a private copy of the `size` bytes at `data`.
// resize() makes us the owner of a buffer of exactly `size` bytes first.
void UnicodeText::Repr::Copy(const char *data, int size) {
  resize(size);
  // resize(0) goes through clear(), which leaves data_ null, and an empty
  // source may itself be null. Passing a null pointer to memcpy is undefined
  // behavior even for a zero count, so skip the call when nothing is copied.
  if (size > 0) {
    memcpy(data_, data, size);
  }
}


// Adopts `data` (holding `size` bytes within `capacity` bytes of storage) as
// our owned buffer, releasing any buffer we owned before. If `data` is
// already our buffer the call is a no-op, and size/capacity are not updated.
void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {
  if (data_ != data) {
    if (ours_ && data_) {
      delete[] data_; // Release the buffer we are replacing.
    }
    data_ = data;
    size_ = size;
    capacity_ = capacity;
    ours_ = true;
  }
}


// Turns this representation into a non-owning alias of the `size` bytes at
// `data`. A buffer we owned before is released first; the aliased memory is
// never freed by us (ours_ becomes false).
// NOTE(review): if `data` points into the buffer we currently own, that
// buffer is freed before being aliased — callers must not pass such a pointer.
void UnicodeText::Repr::PointTo(const char *data, int size) {
  const bool owns_buffer = ours_ && data_;
  if (owns_buffer) {
    delete[] data_;
  }
  // The alias is only ever read through; the cast just fits the member type.
  data_ = const_cast<char *>(data);
  capacity_ = size;
  size_ = size;
  ours_ = false;
}


// Appends `byte_length` bytes starting at `bytes` to the end of the buffer.
// reserve() is always called, so afterwards we own the buffer even if we were
// an alias before. A non-positive `byte_length` appends nothing.
void UnicodeText::Repr::append(const char *bytes, int byte_length) {
  reserve(size_ + byte_length);
  // After clear() we are an owner with a null buffer, so reserve(0) returns
  // without allocating. Passing null to memcpy is undefined behavior even
  // for a zero count, so only copy when there is something to copy.
  if (byte_length > 0) {
    memcpy(data_ + size_, bytes, byte_length);
    size_ += byte_length;
  }
}


#ifdef INCLUDE_TENSORFLOW

// Formats the addresses, size, capacity and ownership flag of this
// representation for debugging output.
string UnicodeText::Repr::DebugString() const {
  const char *const ownership = ours_ ? "Owned" : "Alias";
  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
                                     capacity_, ownership);
}

#endif


// *************** UnicodeText ******************


// ----- Constructors -----


// Default constructor: performs no work beyond default-initializing members.
UnicodeText::UnicodeText() = default;


// Copy constructor: builds a new text holding its own copy of `src`'s bytes
// (see Copy()).
UnicodeText::UnicodeText(const UnicodeText &src) {
  this->Copy(src);
}


// Substring constructor: builds a text from the bytes in [first, last).
// `first` must not be after `last`; this is enforced with CHECK.
UnicodeText::UnicodeText(const UnicodeText::const_iterator &first,
                         const UnicodeText::const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char *const from = first.it_;
  repr_.append(from, last.it_ - from);
}


// Returns the raw UTF-8 bytes in [first, last) as a string.
// `first` must not be after `last`; this is enforced with CHECK.
string UnicodeText::UTF8Substring(const const_iterator &first, const const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char *const from = first.it_;
  return string(from, last.it_ - from);
}


// ----- Copy -----


// Copy assignment: replaces our contents with a copy of `src`.
// Self-assignment is detected and does nothing.
UnicodeText &UnicodeText::operator=(const UnicodeText &src) {
  if (this == &src) {
    return *this;
  }
  Copy(src);
  return *this;
}


// Replaces our contents with a private copy of `src`'s bytes.
// Returns *this to allow chaining.
UnicodeText &UnicodeText::Copy(const UnicodeText &src) {
  const auto &source = src.repr_;
  repr_.Copy(source.data_, source.size_);
  return *this;
}


// Copies `byte_length` bytes of UTF-8 from `buffer` into this text.
// If the input is not interchange valid, a warning is logged and the copy
// (never the caller's buffer) is sanitized in place, which may shrink it.
UnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) {
  repr_.Copy(buffer, byte_length);
  const bool clean = UniLib::IsInterchangeValid(buffer, byte_length);
  if (!clean) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    const int sanitized_size = ConvertToInterchangeValid(repr_.data_, byte_length);
    repr_.size_ = sanitized_size;
  }
  return *this;
}


// Copies `byte_length` bytes from `buffer` into this text without checking
// that they are interchange-valid UTF-8.
UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {
  Repr &target = repr_;
  target.Copy(buffer, byte_length);
  return *this;
}


// ----- TakeOwnershipOf  -----


// Adopts `buffer` (`byte_length` bytes used out of `byte_capacity`) as this
// text's owned storage. If the bytes are not interchange valid, a warning is
// logged and the adopted buffer is sanitized in place, which may shrink it.
UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {
  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  const bool clean = UniLib::IsInterchangeValid(buffer, byte_length);
  if (!clean) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    const int sanitized_size = ConvertToInterchangeValid(repr_.data_, byte_length);
    repr_.size_ = sanitized_size;
  }
  return *this;
}


// Adopts `buffer` as this text's owned storage without checking that its
// contents are interchange-valid UTF-8.
UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,
                                                    int byte_capacity) {
  Repr &target = repr_;
  target.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  return *this;
}


// ----- PointTo -----


// Makes this text an alias of `byte_length` bytes at `buffer` when they are
// interchange valid. Otherwise a warning is logged and the text falls back to
// holding a sanitized private copy, leaving the caller's buffer untouched.
UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    // The caller's memory is const, so repair a copy of it instead.
    repr_.Copy(buffer, byte_length);
    const int sanitized_size = ConvertToInterchangeValid(repr_.data_, byte_length);
    repr_.size_ = sanitized_size;
    return *this;
  }
  repr_.PointTo(buffer, byte_length);
  return *this;
}


// Makes this text an alias of `byte_length` bytes at `buffer` without
// checking that they are interchange-valid UTF-8.
UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {
  Repr &target = repr_;
  target.PointTo(buffer, byte_length);
  return *this;
}


// Makes this text a non-owning alias of `src`'s current bytes.
UnicodeText &UnicodeText::PointTo(const UnicodeText &src) {
  const auto &source = src.repr_;
  repr_.PointTo(source.data_, source.size_);
  return *this;
}


// Makes this text a non-owning alias of the bytes in [first, last).
// `first` must not be after `last`; this is enforced with CHECK.
UnicodeText &UnicodeText::PointTo(const const_iterator &first, const const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char *const from = first.utf8_data();
  repr_.PointTo(from, last.utf8_data() - from);
  return *this;
}


// ----- Append -----


// Appends all of `u`'s bytes to the end of this text.
UnicodeText &UnicodeText::append(const UnicodeText &u) {
  const auto &addition = u.repr_;
  repr_.append(addition.data_, addition.size_);
  return *this;
}


// Appends the bytes in [first, last) to the end of this text.
// `first` must not be after `last`; this is enforced with CHECK.
UnicodeText &UnicodeText::append(const const_iterator &first, const const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char *const from = first.it_;
  repr_.append(from, last.it_ - from);
  return *this;
}


// Appends `len` bytes from `utf8` without checking that they are
// interchange-valid UTF-8.
UnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) {
  Repr &target = repr_;
  target.append(utf8, len);
  return *this;
}


// ----- substring searching -----


// Searches for `look` starting at `start_pos`, after CHECKing that
// `start_pos` lies within this text's byte range (end position included).
// The search itself is delegated to UnsafeFind().
UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look,
                                              const_iterator start_pos) const {
  CHECK_GE(start_pos.utf8_data(), utf8_data());
  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
  const const_iterator match = UnsafeFind(look, start_pos);
  return match;
}


// Searches for `look` from the beginning of this text by delegating to
// UnsafeFind().
UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look) const {
  const const_iterator from = begin();
  return UnsafeFind(look, from);
}


// Returns an iterator to the first occurrence of `look` at or after
// `start_pos`, or end() when there is none. An empty `look` matches at
// `start_pos`.
//
// "Unsafe" because `start_pos` is not validated here: it must point into
// this text (or at its end). find() performs that check before delegating.
UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
                                                    const_iterator start_pos) const {
  // Due to the magic of the UTF8 encoding, searching for a sequence of
  // letters is equivalent to substring search, so a plain byte-wise search
  // is sufficient.
  const char *const text_end = utf8_data() + utf8_length();
  const char *const needle = look.utf8_data();
  const char *const needle_end = needle + look.utf8_length();
  const char *const hit = std::search(start_pos.utf8_data(), text_end, needle, needle_end);
  if (hit == text_end) {
    // Either no match, or an empty needle matched at the very end; both
    // are represented by end().
    return end();
  }
  return const_iterator(hit);
}


#ifdef INCLUDE_TENSORFLOW

bool UnicodeText::HasReplacementChar() const {

  // Equivalent to:

  //   UnicodeText replacement_char;

  //   replacement_char.push_back(0xFFFD);

  //   return find(replacement_char) != end();

  StringPiece searching(utf8_data(), utf8_length());

  StringPiece looking_for("\xEF\xBF\xBD", 3);

  LOG(FATAL) << "Not implemented";

  // return searching.find(looking_for) != StringPiece::npos;

  return false;

}

#endif


// ----- other methods -----


// Clear operator

void UnicodeText::clear() {

  repr_.clear();

}


// Destructor: nothing to do here beyond destroying the members.
UnicodeText::~UnicodeText() = default;


// Appends the code point `c`, encoded as UTF-8. A value that is not a valid
// code point, or whose encoding is not interchange valid, is replaced by a
// single space and a warning is logged.
void UnicodeText::push_back(char32 c) {
  if (!UniLib::IsValidCodepoint(c)) {
    LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
    repr_.append(" ", 1);
    return;
  }

  char encoded[UTFmax];
  const int encoded_len = runetochar(encoded, &c);
  if (!UniLib::IsInterchangeValid(encoded, encoded_len)) {
    LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
    repr_.append(" ", 1);
    return;
  }
  repr_.append(encoded, encoded_len);
}


// Returns the length of the text in code points (not bytes). The bytes are
// scanned on every call.
int UnicodeText::size() const {
  const int byte_count = repr_.size_;
  return CodepointCount(repr_.data_, byte_count);
}


// Two texts are equal when they hold the same sequence of UTF-8 bytes.
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
  if (&lhs == &rhs) {
    return true;
  }
  const auto &left = lhs.repr_;
  const auto &right = rhs.repr_;
  if (left.size_ != right.size_) {
    return false;
  }
  // Two empty texts are equal. Their buffers may be null (clear() nulls
  // data_), and passing null to memcmp is undefined behavior even for a
  // zero count, so decide this case without calling it.
  if (left.size_ == 0) {
    return true;
  }
  return memcmp(left.data_, right.data_, left.size_) == 0;
}


#ifdef INCLUDE_TENSORFLOW

// Formats this text's address, its length in code points and the debug
// string of its representation.
string UnicodeText::DebugString() const {
  const string repr_text = repr_.DebugString();
  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
                                     repr_text.c_str());
}

#endif


// ******************* UnicodeText::const_iterator *********************


// The implementation of const_iterator would be nicer if it

// inherited from boost::iterator_facade

// (http://boost.org/libs/iterator/doc/iterator_facade.html).


UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}


UnicodeText::const_iterator::const_iterator(const const_iterator &other) : it_(other.it_) {}


// Copy assignment: afterwards this iterator refers to the same byte position
// as `other`. Self-assignment simply rewrites the same pointer value.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(const const_iterator &other) {
  it_ = other.it_;
  return *this;
}


// Returns an iterator positioned at the first byte of the text.
UnicodeText::const_iterator UnicodeText::begin() const {
  const char *const first_byte = repr_.data_;
  return const_iterator(first_byte);
}


// Returns an iterator positioned one past the last byte of the text.
UnicodeText::const_iterator UnicodeText::end() const {
  const char *const past_last_byte = repr_.data_ + repr_.size_;
  return const_iterator(past_last_byte);
}


// Orders iterators by their byte position; `lhs` is less than `rhs` when it
// points to an earlier address.
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs) {
  return rhs.it_ > lhs.it_;
}


// Decodes and returns the code point at the current position.
// No validation is performed: the data is taken to be well-formed UTF-8, and
// decoding is done by hand (instead of via chartorune) because this routine
// is expected to be called very often. Bytes beyond the lead byte are read
// only when the lead byte says the sequence is that long.
char32 UnicodeText::const_iterator::operator*() const {
  const unsigned char lead = it_[0];
  if (lead < 0x80) {
    return lead; // One byte: plain ASCII.
  }

  // Each continuation byte contributes its low six bits.
  const char32 second = it_[1] & 0x3F;
  if (lead < 0xE0) {
    return ((lead & 0x1F) << 6) | second; // Two bytes: 110xxxxx 10xxxxxx.
  }

  const char32 third = it_[2] & 0x3F;
  if (lead < 0xF0) {
    return ((lead & 0x0F) << 12) | (second << 6) | third; // Three bytes.
  }

  const char32 fourth = it_[3] & 0x3F;
  return ((lead & 0x07) << 18) | (second << 12) | (third << 6) | fourth; // Four bytes.
}


// Pre-increment: advances to the start of the next character by skipping the
// number of bytes UniLib::OneCharLen reports for the current one.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
  const int step = UniLib::OneCharLen(it_);
  it_ += step;
  return *this;
}


// Pre-decrement: steps back at least one byte and keeps stepping back while
// the byte under the iterator is a trail byte, so it lands on the previous
// non-trail byte. No lower bound is checked here.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator--() {
  do {
    --it_;
  } while (UniLib::IsTrailByte(*it_));
  return *this;
}


// Copies the UTF-8 bytes of the current character into `utf8_output` and
// returns how many bytes (1 to 4) were written. No terminator is written.
// The length is derived from the lead byte alone, so `utf8_output` must have
// room for up to four bytes.
int UnicodeText::const_iterator::get_utf8(char *utf8_output) const {
  int written = 1;
  utf8_output[0] = it_[0];
  if ((it_[0] & 0xff) >= 0x80) {
    utf8_output[1] = it_[1];
    written = 2;
    if ((it_[0] & 0xff) >= 0xE0) {
      utf8_output[2] = it_[2];
      written = 3;
      if ((it_[0] & 0xff) >= 0xF0) {
        utf8_output[3] = it_[3];
        written = 4;
      }
    }
  }
  return written;
}


// Returns the UTF-8 bytes of the current character as a string.
string UnicodeText::const_iterator::get_utf8_string() const {
  const char *const bytes = utf8_data();
  return string(bytes, utf8_length());
}


// Returns the number of bytes (1 to 4) in the current character's UTF-8
// encoding, derived from the lead byte alone.
int UnicodeText::const_iterator::utf8_length() const {
  const int lead = it_[0] & 0xff;
  if (lead >= 0xF0) {
    return 4;
  }
  if (lead >= 0xE0) {
    return 3;
  }
  if (lead >= 0x80) {
    return 2;
  }
  return 1;
}


// Builds an iterator for the raw pointer `p`. CHECKs that `p` is non-null,
// lies within this text's bytes (the end position is allowed), and, unless
// it is the end position, does not point at a trail byte.
UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
  CHECK(p != nullptr);
  const char *start = utf8_data();
  const char *end = start + utf8_length();
  CHECK(p >= start);
  CHECK(p <= end);
  CHECK(p == end || !UniLib::IsTrailByte(*p));
  const const_iterator position(p);
  return position;
}


#ifdef INCLUDE_TENSORFLOW

// Formats the iterator's current byte address for debugging output.
string UnicodeText::const_iterator::DebugString() const {
  const void *const position = it_;
  return tensorflow::strings::Printf("{iter %p}", position);
}


// *************************** Utilities *************************


// Returns the code points of `t` as upper-case hexadecimal numbers, each
// followed by a single space.
string CodepointString(const UnicodeText &t) {
  string s;
  const UnicodeText::const_iterator end = t.end();
  for (UnicodeText::const_iterator it = t.begin(); it != end; ++it) {
    tensorflow::strings::Appendf(&s, "%X ", *it);
  }
  return s;
}

#endif

char32
signed int char32
Definition: pango_font_info.h:36

LOG
@ LOG
Definition: cleanapi_test.cc:19

FATAL
@ FATAL
Definition: log.h:28

WARNING
@ WARNING
Definition: log.h:28

operator<
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs)
Definition: unicodetext.cc:416

operator==
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377

distance
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44

unicodetext.h

CodepointString
string CodepointString(const UnicodeText &t)

unilib_utf8_utils.h

unilib.h

runetochar
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244

isvalidcharntorune
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:239

utf.h

UTFmax
@ UTFmax
Definition: utf.h:22

p
const char * p
Definition: gmock-matchers_test.cc:4030

include_gunit.h

CHECK_GE
#define CHECK_GE(test, value)
Definition: include_gunit.h:80

CHECK
#define CHECK(condition)
Definition: include_gunit.h:76

CHECK_LE
#define CHECK_LE(test, value)
Definition: include_gunit.h:83

tesseract::last
LIST last(LIST var_list)
Definition: oldlist.cpp:153

UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33

UniLib::IsValidCodepoint
bool IsValidCodepoint(char32 c)
Definition: unilib_utf8_utils.h:31

UniLib::SpanInterchangeValid
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39

UniLib::IsTrailByte
bool IsTrailByte(char x)
Definition: unilib_utf8_utils.h:58

UniLib::OneCharLen
int OneCharLen(const char *src)
Definition: unilib_utf8_utils.h:53

UnicodeText
Definition: unicodetext.h:116

UnicodeText::UTF8Substring
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
Definition: unicodetext.cc:202

UnicodeText::~UnicodeText
~UnicodeText()
Definition: unicodetext.cc:355

UnicodeText::push_back
void push_back(char32 codepoint)
Definition: unicodetext.cc:357

UnicodeText::MakeIterator
const_iterator MakeIterator(const char *p) const
Definition: unicodetext.cc:484

UnicodeText::CopyUTF8
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:221

UnicodeText::find
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
Definition: unicodetext.cc:301

UnicodeText::append
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
Definition: unicodetext.h:163

UnicodeText::Copy
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216

UnicodeText::PointTo
UnicodeText & PointTo(const UnicodeText &src)
Definition: unicodetext.cc:270

UnicodeText::DebugString
string DebugString() const

UnicodeText::end
const_iterator end() const
Definition: unicodetext.cc:412

UnicodeText::const_iterator
friend class const_iterator
Definition: unicodetext.h:348

UnicodeText::operator=
UnicodeText & operator=(const UnicodeText &src)
Definition: unicodetext.cc:209

UnicodeText::PointToUTF8
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254

UnicodeText::TakeOwnershipOfUTF8
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.cc:237

UnicodeText::utf8_length
int utf8_length() const
Definition: unicodetext.h:308

UnicodeText::size
int size() const
Definition: unicodetext.cc:373

UnicodeText::HasReplacementChar
bool HasReplacementChar() const

UnicodeText::begin
const_iterator begin() const
Definition: unicodetext.cc:408

UnicodeText::utf8_data
const char * utf8_data() const
Definition: unicodetext.h:305

UnicodeText::UnicodeText
UnicodeText()
Definition: unicodetext.cc:188

UnicodeText::clear
void clear()
Definition: unicodetext.cc:350

UnicodeText::const_iterator
Definition: unicodetext.h:181

UnicodeText::const_iterator::DebugString
string DebugString() const

UnicodeText::const_iterator::const_iterator
const_iterator()
Definition: unicodetext.cc:398

UnicodeText::const_iterator::operator++
const_iterator & operator++()
Definition: unicodetext.cc:443

UnicodeText::const_iterator::operator--
const_iterator & operator--()
Definition: unicodetext.cc:448

UnicodeText::const_iterator::difference_type
ptrdiff_t difference_type
Definition: unicodetext.h:187

UnicodeText::const_iterator::get_utf8
int get_utf8(char *buf) const
Definition: unicodetext.cc:454

UnicodeText::const_iterator::operator=
const_iterator & operator=(const const_iterator &other)
Definition: unicodetext.cc:402

UnicodeText::const_iterator::operator*
char32 operator*() const
Definition: unicodetext.cc:420

UnicodeText::const_iterator::utf8_length
int utf8_length() const
Definition: unicodetext.cc:472

UnicodeText::const_iterator::get_utf8_string
string get_utf8_string() const
Definition: unicodetext.cc:468

UnicodeText::const_iterator::utf8_data
const char * utf8_data() const
Definition: unicodetext.h:244