34const float CTC::kMinProb_ = 1e-12;
36const double CTC::kMaxExpArg_ = 80.0;
38const double CTC::kMinTotalTimeProb_ = 1e-8;
40const double CTC::kMinTotalFinalProb_ = 1e-6;
55 std::unique_ptr<CTC> ctc(
new CTC(labels, null_char, outputs));
56 if (!ctc->ComputeLabelLimits()) {
62 ctc->ComputeSimpleTargets(&simple_targets);
64 float bias_fraction = ctc->CalculateBiasFraction();
65 simple_targets *= bias_fraction;
66 ctc->outputs_ += simple_targets;
71 ctc->Forward(&log_alphas);
72 ctc->Backward(&log_betas);
74 log_alphas += log_betas;
75 ctc->NormalizeSequence(&log_alphas);
76 ctc->LabelsToClasses(log_alphas, targets);
82 : labels_(labels), outputs_(outputs), null_char_(null_char) {
83 num_timesteps_ = outputs.
dim1();
84 num_classes_ = outputs.
dim2();
85 num_labels_ = labels_.size();
90bool CTC::ComputeLabelLimits() {
92 min_labels_.resize(num_timesteps_, 0);
94 max_labels_.resize(num_timesteps_, 0);
95 int min_u = num_labels_ - 1;
96 if (labels_[min_u] == null_char_) {
99 for (
int t = num_timesteps_ - 1; t >= 0; --t) {
100 min_labels_[t] = min_u;
103 if (labels_[min_u] == null_char_ && min_u > 0 && labels_[min_u + 1] != labels_[min_u - 1]) {
108 int max_u = labels_[0] == null_char_;
109 for (
int t = 0; t < num_timesteps_; ++t) {
110 max_labels_[t] = max_u;
111 if (max_labels_[t] < min_labels_[t]) {
114 if (max_u + 1 < num_labels_) {
116 if (labels_[max_u] == null_char_ && max_u + 1 < num_labels_ &&
117 labels_[max_u + 1] != labels_[max_u - 1]) {
127void CTC::ComputeSimpleTargets(GENERIC_2D_ARRAY<float> *targets)
const {
129 targets->Resize(num_timesteps_, num_classes_, 0.0f);
130 std::vector<float> half_widths;
131 std::vector<int> means;
132 ComputeWidthsAndMeans(&half_widths, &means);
133 for (
int l = 0; l < num_labels_; ++l) {
134 int label = labels_[l];
135 float left_half_width = half_widths[l];
136 float right_half_width = left_half_width;
138 if (label == null_char_) {
139 if (!NeededNull(l)) {
140 if ((l > 0 && mean == means[l - 1]) || (l + 1 < num_labels_ && mean == means[l + 1])) {
147 left_half_width = mean - means[l - 1];
149 if (l + 1 < num_labels_) {
150 right_half_width = means[l + 1] - mean;
153 if (mean >= 0 && mean < num_timesteps_) {
154 targets->put(mean, label, 1.0f);
156 for (
int offset = 1; offset < left_half_width && mean >= offset; ++offset) {
157 float prob = 1.0f - offset / left_half_width;
158 if (mean - offset < num_timesteps_ && prob > targets->get(mean - offset, label)) {
159 targets->put(mean - offset, label, prob);
162 for (
int offset = 1; offset < right_half_width && mean + offset < num_timesteps_; ++offset) {
163 float prob = 1.0f - offset / right_half_width;
164 if (mean + offset >= 0 && prob > targets->get(mean + offset, label)) {
165 targets->put(mean + offset, label, prob);
173void CTC::ComputeWidthsAndMeans(std::vector<float> *half_widths, std::vector<int> *means)
const {
177 int num_plus = 0, num_star = 0;
178 for (
int i = 0;
i < num_labels_; ++
i) {
179 if (labels_[
i] != null_char_ || NeededNull(
i)) {
188 float plus_size = 1.0f, star_size = 0.0f;
189 float total_floating = num_plus + num_star;
190 if (total_floating <= num_timesteps_) {
191 plus_size = star_size = num_timesteps_ / total_floating;
192 }
else if (num_star > 0) {
193 star_size =
static_cast<float>(num_timesteps_ - num_plus) / num_star;
196 float mean_pos = 0.0f;
197 for (
int i = 0;
i < num_labels_; ++
i) {
199 if (labels_[
i] != null_char_ || NeededNull(
i)) {
200 half_width = plus_size / 2.0f;
202 half_width = star_size / 2.0f;
204 mean_pos += half_width;
205 means->push_back(
static_cast<int>(mean_pos));
206 mean_pos += half_width;
207 half_widths->push_back(half_width);
212static int BestLabel(
const GENERIC_2D_ARRAY<float> &outputs,
int t) {
214 int num_classes = outputs.dim2();
215 const float *outputs_t = outputs[t];
216 for (
int c = 1; c < num_classes; ++c) {
217 if (outputs_t[c] > outputs_t[result]) {
226float CTC::CalculateBiasFraction() {
228 std::vector<int> output_labels;
229 for (
int t = 0; t < num_timesteps_; ++t) {
230 int label = BestLabel(outputs_, t);
231 while (t + 1 < num_timesteps_ && BestLabel(outputs_, t + 1) == label) {
234 if (label != null_char_) {
235 output_labels.push_back(label);
239 std::vector<int> truth_counts(num_classes_);
240 std::vector<int> output_counts(num_classes_);
241 for (
int l = 0; l < num_labels_; ++l) {
242 ++truth_counts[labels_[l]];
244 for (
auto l : output_labels) {
248 int true_pos = 0, false_pos = 0, total_labels = 0;
249 for (
int c = 0; c < num_classes_; ++c) {
250 if (c == null_char_) {
253 int truth_count = truth_counts[c];
254 int ocr_count = output_counts[c];
255 if (truth_count > 0) {
256 total_labels += truth_count;
257 if (ocr_count > truth_count) {
258 true_pos += truth_count;
259 false_pos += ocr_count - truth_count;
261 true_pos += ocr_count;
267 if (total_labels == 0) {
270 return exp(std::max(true_pos - false_pos, 1) * std::log(kMinProb_) / total_labels);
276static double LogSumExp(
double ln_x,
double ln_y) {
278 return ln_x + log1p(exp(ln_y - ln_x));
280 return ln_y + log1p(exp(ln_x - ln_y));
285void CTC::Forward(GENERIC_2D_ARRAY<double> *log_probs)
const {
286 log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);
287 log_probs->put(0, 0, log(outputs_(0, labels_[0])));
288 if (labels_[0] == null_char_) {
289 log_probs->put(0, 1, log(outputs_(0, labels_[1])));
291 for (
int t = 1; t < num_timesteps_; ++t) {
292 const float *outputs_t = outputs_[t];
293 for (
int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
295 double log_sum = log_probs->
get(t - 1, u);
298 log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 1));
301 if (u >= 2 && labels_[u - 1] == null_char_ && labels_[u] != labels_[u - 2]) {
302 log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 2));
305 double label_prob = outputs_t[labels_[u]];
306 log_sum += log(label_prob);
307 log_probs->put(t, u, log_sum);
313void CTC::Backward(GENERIC_2D_ARRAY<double> *log_probs)
const {
314 log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);
315 log_probs->put(num_timesteps_ - 1, num_labels_ - 1, 0.0);
316 if (labels_[num_labels_ - 1] == null_char_) {
317 log_probs->put(num_timesteps_ - 1, num_labels_ - 2, 0.0);
319 for (
int t = num_timesteps_ - 2; t >= 0; --t) {
320 const float *outputs_tp1 = outputs_[t + 1];
321 for (
int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
323 double log_sum = log_probs->
get(t + 1, u) + std::log(outputs_tp1[labels_[u]]);
325 if (u + 1 < num_labels_) {
326 double prev_prob = outputs_tp1[labels_[u + 1]];
327 log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 1) + log(prev_prob));
330 if (u + 2 < num_labels_ && labels_[u + 1] == null_char_ && labels_[u] != labels_[u + 2]) {
331 double skip_prob = outputs_tp1[labels_[u + 2]];
332 log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 2) + log(skip_prob));
334 log_probs->put(t, u, log_sum);
340void CTC::NormalizeSequence(GENERIC_2D_ARRAY<double> *probs)
const {
341 double max_logprob = probs->Max();
342 for (
int u = 0; u < num_labels_; ++u) {
344 for (
int t = 0; t < num_timesteps_; ++t) {
346 double prob = probs->get(t, u);
347 if (prob > -FLT_MAX) {
348 prob = ClippedExp(prob - max_logprob);
353 probs->put(t, u, prob);
358 if (total < kMinTotalTimeProb_) {
359 total = kMinTotalTimeProb_;
361 for (
int t = 0; t < num_timesteps_; ++t) {
362 probs->put(t, u, probs->get(t, u) / total);
370void CTC::LabelsToClasses(
const GENERIC_2D_ARRAY<double> &probs, NetworkIO *targets)
const {
373 for (
int t = 0; t < num_timesteps_; ++t) {
374 float *targets_t = targets->f(t);
375 std::vector<double> class_probs(num_classes_);
376 for (
int u = 0; u < num_labels_; ++u) {
377 double prob = probs(t, u);
381 if (prob > class_probs[labels_[u]]) {
382 class_probs[labels_[u]] = prob;
387 for (
int c = 0; c < num_classes_; ++c) {
388 targets_t[c] = class_probs[c];
389 if (class_probs[c] > class_probs[best_class]) {
402 int num_timesteps = probs->dim1();
403 int num_classes = probs->dim2();
404 for (
int t = 0; t < num_timesteps; ++t) {
405 float *probs_t = (*probs)[t];
408 for (
int c = 0; c < num_classes; ++c) {
411 if (total < kMinTotalFinalProb_) {
412 total = kMinTotalFinalProb_;
415 double increment = 0.0;
416 for (
int c = 0; c < num_classes; ++c) {
417 double prob = probs_t[c] / total;
418 if (prob < kMinProb_) {
419 increment += kMinProb_ - prob;
424 for (
int c = 0; c < num_classes; ++c) {
425 float prob = probs_t[c] / total;
426 probs_t[c] = std::max(prob, kMinProb_);
432bool CTC::NeededNull(
int index)
const {
433 return labels_[index] == null_char_ && index > 0 && index + 1 < num_labels_ &&
434 labels_[index + 1] == labels_[index - 1];
static bool ComputeCTCTargets(const std::vector< int > &truth_labels, int null_char, const GENERIC_2D_ARRAY< float > &outputs, NetworkIO *targets)
static void NormalizeProbs(NetworkIO *probs)