tesseract-ocr.github.io/5.3.3/a00260_source.html

// File:        unichar.cpp

// Description: Unicode character/ligature class.

// Author:      Ray Smith

//

// (C) Copyright 2006, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//


#include <tesseract/unichar.h>

#include "errcode.h"

#include "tprintf.h"


#define UNI_MAX_LEGAL_UTF32 0x0010FFFF


namespace tesseract {


// Construct from a utf8 string. If len<0 then the string is null terminated.

// If the string is too long to fit in the UNICHAR then it takes only what

// will fit. Checks for illegal input and stops at an illegal sequence.

// The resulting UNICHAR may be empty.

UNICHAR::UNICHAR(const char *utf8_str, int len) {

  int total_len = 0;

  int step = 0;

  if (len < 0) {

    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {

      ;

    }

  }

  for (total_len = 0; total_len < len; total_len += step) {

    step = utf8_step(utf8_str + total_len);

    if (total_len + step > UNICHAR_LEN) {

      break; // Too long.

    }

    if (step == 0) {

      break; // Illegal first byte.

    }

    int i;

    for (i = 1; i < step; ++i) {

      if ((utf8_str[total_len + i] & 0xc0) != 0x80) {

        break;

      }

    }

    if (i < step) {

      break; // Illegal surrogate

    }

  }

  memcpy(chars, utf8_str, total_len);

  if (total_len < UNICHAR_LEN) {

    chars[UNICHAR_LEN - 1] = total_len;

    while (total_len < UNICHAR_LEN - 1) {

      chars[total_len++] = 0;

    }

  }

}


// Construct from a single UCS4 character. Illegal values are ignored,

// resulting in an empty UNICHAR.

UNICHAR::UNICHAR(int unicode) {

  const int bytemask = 0xBF;

  const int bytemark = 0x80;


  if (unicode < 0x80) {

    chars[UNICHAR_LEN - 1] = 1;

    chars[2] = 0;

    chars[1] = 0;

    chars[0] = static_cast<char>(unicode);

  } else if (unicode < 0x800) {

    chars[UNICHAR_LEN - 1] = 2;

    chars[2] = 0;

    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);

    unicode >>= 6;

    chars[0] = static_cast<char>(unicode | 0xc0);

  } else if (unicode < 0x10000) {

    chars[UNICHAR_LEN - 1] = 3;

    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);

    unicode >>= 6;

    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);

    unicode >>= 6;

    chars[0] = static_cast<char>(unicode | 0xe0);

  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {

    chars[UNICHAR_LEN - 1] = 4;

    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);

    unicode >>= 6;

    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);

    unicode >>= 6;

    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);

    unicode >>= 6;

    chars[0] = static_cast<char>(unicode | 0xf0);

  } else {

    memset(chars, 0, UNICHAR_LEN);

  }

}


// Get the first character as UCS-4.

int UNICHAR::first_uni() const {

  static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};

  int uni = 0;

  int len = utf8_step(chars);

  const char *src = chars;


  switch (len) {

    default:

      break;

    case 4:

      uni += static_cast<unsigned char>(*src++);

      uni <<= 6;

      // Fall through.

    case 3:

      uni += static_cast<unsigned char>(*src++);

      uni <<= 6;

      // Fall through.

    case 2:

      uni += static_cast<unsigned char>(*src++);

      uni <<= 6;

      // Fall through.

    case 1:

      uni += static_cast<unsigned char>(*src++);

  }

  uni -= utf8_offsets[len];

  return uni;

}


// Get a terminated UTF8 string: Must delete[] it after use.

char *UNICHAR::utf8_str() const {

  int len = utf8_len();

  char *str = new char[len + 1];

  memcpy(str, chars, len);

  str[len] = 0;

  return str;

}


// Get the number of bytes in the first character of the given utf8 string.

int UNICHAR::utf8_step(const char *utf8_str) {

  static const char utf8_bytes[256] = {

      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,

      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};


  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];

}


UNICHAR::const_iterator &UNICHAR::const_iterator::operator++() {

  ASSERT_HOST(it_ != nullptr);

  int step = utf8_step(it_);

  if (step == 0) {

    tprintf("ERROR: Illegal UTF8 encountered.\n");

    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {

      tprintf("Index %d char = 0x%x\n", i, it_[i]);

    }

    step = 1;

  }

  it_ += step;

  return *this;

}


int UNICHAR::const_iterator::operator*() const {

  ASSERT_HOST(it_ != nullptr);

  const int len = utf8_step(it_);

  if (len == 0) {

    tprintf("WARNING: Illegal UTF8 encountered\n");

    return ' ';

  }

  UNICHAR uch(it_, len);

  return uch.first_uni();

}


int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {

  ASSERT_HOST(it_ != nullptr);

  const int len = utf8_step(it_);

  if (len == 0) {

    tprintf("WARNING: Illegal UTF8 encountered\n");

    utf8_output[0] = ' ';

    return 1;

  }

  strncpy(utf8_output, it_, len);

  return len;

}


int UNICHAR::const_iterator::utf8_len() const {

  ASSERT_HOST(it_ != nullptr);

  const int len = utf8_step(it_);

  if (len == 0) {

    tprintf("WARNING: Illegal UTF8 encountered\n");

    return 1;

  }

  return len;

}


bool UNICHAR::const_iterator::is_legal() const {

  return utf8_step(it_) > 0;

}


UNICHAR::const_iterator UNICHAR::begin(const char *utf8_str, int len) {

  return UNICHAR::const_iterator(utf8_str);

}


UNICHAR::const_iterator UNICHAR::end(const char *utf8_str, int len) {

  return UNICHAR::const_iterator(utf8_str + len);

}


// Converts a utf-8 string to a vector of unicodes.

// Returns an empty vector if the input contains invalid UTF-8.

/* static */

std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {

  const int utf8_length = strlen(utf8_str);

  std::vector<char32> unicodes;

  unicodes.reserve(utf8_length);

  const_iterator end_it(end(utf8_str, utf8_length));

  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {

    if (it.is_legal()) {

      unicodes.push_back(*it);

    } else {

      unicodes.clear();

      return unicodes;

    }

  }

  return unicodes;

}


// Returns an empty string if the input contains an invalid unicode.

std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {

  std::string utf8_str;

  for (char32 ch : str32) {

    UNICHAR uni_ch(ch);

    int step;

    if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {

      utf8_str.append(uni_ch.utf8(), step);

    } else {

      return "";

    }

  }

  return utf8_str;

}


} // namespace tesseract

unichar.h

UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:31

errcode.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

tprintf.h

UNI_MAX_LEGAL_UTF32
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:23

i
int i
Definition: gmock-matchers_test.cc:718

ch
char ch
Definition: gmock-matchers_test.cc:4035

tesseract
Definition: baseapi.h:39

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::char32
signed int char32
Definition: unichar.h:49

tesseract::UNICHAR
Definition: unichar.h:55

tesseract::UNICHAR::utf8_str
char * utf8_str() const
Definition: unichar.cpp:134

tesseract::UNICHAR::UNICHAR
UNICHAR()
Definition: unichar.h:57

tesseract::UNICHAR::first_uni
int first_uni() const
Definition: unichar.cpp:105

tesseract::UNICHAR::begin
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209

tesseract::UNICHAR::UTF8ToUTF32
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220

tesseract::UNICHAR::utf8_len
int utf8_len() const
Definition: unichar.h:75

tesseract::UNICHAR::UTF32ToUTF8
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:237

tesseract::UNICHAR::utf8
const char * utf8() const
Definition: unichar.h:81

tesseract::UNICHAR::end
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213

tesseract::UNICHAR::utf8_step
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143

tesseract::UNICHAR::const_iterator
Definition: unichar.h:105

tesseract::UNICHAR::const_iterator::operator*
int operator*() const
Definition: unichar.cpp:172

tesseract::UNICHAR::const_iterator::operator++
const_iterator & operator++()
Definition: unichar.cpp:158

tesseract::UNICHAR::const_iterator::get_utf8
int get_utf8(char *buf) const
Definition: unichar.cpp:183

tesseract::UNICHAR::const_iterator::is_legal
bool is_legal() const
Definition: unichar.cpp:205

tesseract::UNICHAR::const_iterator::utf8_len
int utf8_len() const
Definition: unichar.cpp:195