All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #include <ctype.h>
23 #include <math.h>
24 #include <stdarg.h>
25 #include <stddef.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <stdio.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <fcntl.h>
32 
33 #include "scanutils.h"
34 #include "tprintf.h"
35 
36 // workaround for "'off_t' was not declared in this scope" with -std=c++11
// workaround for "'off_t' was not declared in this scope" with -std=c++11
// Supplies a fallback off_t (as long) everywhere except Apple and Cygwin.
// NOTE(review): defined(off_t) only tests for a preprocessor macro named
// off_t; if <sys/types.h> provides off_t as a typedef, this typedef is still
// emitted and must name the same type to compile - confirm on targets where
// off_t is not long.
#if !defined(off_t) && !defined(__APPLE__) && !defined(__CYGWIN__)
typedef long off_t;
#endif // off_t
40 
// Bit flags describing the conversion specification currently being parsed.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' seen: drop the value, do not assign
  FL_INV   = 1 << 1,  // Character-set with inverse
  FL_WIDTH = 1 << 2,  // Field width specified
  FL_MINUS = 1 << 3,  // Negative number
};
47 
// Size "rank" of an integer conversion.  The 'h' and 'l' length modifiers
// step the rank down and up from RANK_INT; the scanner then clamps it to
// [kMinRank, kMaxRank] before choosing the destination type.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
  RANK_PTR = INT_MAX  // Special value used for pointers
};

// Clamping bounds applied to the rank after the length modifiers are read.
const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j', 'z' and 't' length modifiers respectively.
const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;
63 
// Reason the scanner stopped processing the format string early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Hit EOF
  BAIL_ERR = 2    // Conversion mismatch
};
69 
70 // Helper functions ------------------------------------------------------------
// Number of bits in a long; the word size of the %[ match bitmap.
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
74 
// Consumes whitespace from |s| and returns the first non-space character
// (or EOF) without consuming it: that character is pushed back with ungetc.
static inline int
SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
82 
83 static inline void
84 SetBit(unsigned long *bitmap, unsigned int bit) {
85  bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
86 }
87 
88 static inline int
89 TestBit(unsigned long *bitmap, unsigned int bit) {
90  return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
91 }
92 
// Returns the numeric value of digit character |ch| in |base|, or -1 if |ch|
// is not a valid digit.  Decimal digits are accepted for any base >= 10; for
// smaller bases only '0'..'7' are accepted.  The letters 'A'..'F' / 'a'..'f'
// are accepted for base 16 only.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
  } else if (base == 16) {
    // Only A-F are hexadecimal digits; any later letter ends the number.
    if (ch >= 'A' && ch <= 'F')
      return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f')
      return ch - 'a' + 10;
  }
  return -1;
}
104 
105 // IO (re-)implementations -----------------------------------------------------
106 uintmax_t streamtoumax(FILE* s, int base) {
107  int minus = 0;
108  uintmax_t v = 0;
109  int d, c = 0;
110 
111  for (c = fgetc(s);
112  isspace(static_cast<unsigned char>(c)) && (c != EOF);
113  c = fgetc(s)) {}
114 
115  // Single optional + or -
116  if (c == '-' || c == '+') {
117  minus = (c == '-');
118  c = fgetc(s);
119  }
120 
121  // Assign correct base
122  if (base == 0) {
123  if (c == '0') {
124  c = fgetc(s);
125  if (c == 'x' || c == 'X') {
126  base = 16;
127  c = fgetc(s);
128  } else {
129  base = 8;
130  }
131  }
132  } else if (base == 16) {
133  if (c == '0') {
134  c = fgetc(s);
135  if (c == 'x' || c == 'X') c = fgetc(s);
136  }
137  }
138 
139  // Actual number parsing
140  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
141  v = v*base + d;
142 
143  ungetc(c, s);
144  return minus ? -v : v;
145 }
146 
147 double streamtofloat(FILE* s) {
148  int minus = 0;
149  int v = 0;
150  int d, c = 0;
151  int k = 1;
152  int w = 0;
153 
154  for (c = fgetc(s);
155  isspace(static_cast<unsigned char>(c)) && (c != EOF);
156  c = fgetc(s));
157 
158  // Single optional + or -
159  if (c == '-' || c == '+') {
160  minus = (c == '-');
161  c = fgetc(s);
162  }
163 
164  // Actual number parsing
165  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s))
166  v = v*10 + d;
167  if (c == '.') {
168  for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
169  w = w*10 + d;
170  k *= 10;
171  }
172  }
173  double f = static_cast<double>(v)
174  + static_cast<double>(w) / static_cast<double>(k);
175  if (c == 'e' || c == 'E') {
176  c = fgetc(s);
177  int expsign = 1;
178  if (c == '-' || c == '+') {
179  expsign = (c == '-') ? -1 : 1;
180  c = fgetc(s);
181  }
182  int exponent = 0;
183  for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
184  exponent = exponent * 10 + d;
185  }
186  exponent *= expsign;
187  f *= pow(10.0, static_cast<double>(exponent));
188  }
189  ungetc(c, s);
190 
191  return minus ? -f : f;
192 }
193 
194 double strtofloat(const char* s) {
195  int minus = 0;
196  int v = 0;
197  int d;
198  int k = 1;
199  int w = 0;
200 
201  while(*s && isspace(static_cast<unsigned char>(*s))) s++;
202 
203  // Single optional + or -
204  if (*s == '-' || *s == '+') {
205  minus = (*s == '-');
206  s++;
207  }
208 
209  // Actual number parsing
210  for (; *s && (d = DigitValue(*s, 10)) >= 0; s++)
211  v = v*10 + d;
212  if (*s == '.') {
213  for (++s; *s && (d = DigitValue(*s, 10)) >= 0; s++) {
214  w = w*10 + d;
215  k *= 10;
216  }
217  }
218  if (*s == 'e' || *s == 'E')
219  tprintf("WARNING: Scientific Notation not supported!");
220 
221  double f = static_cast<double>(v)
222  + static_cast<double>(w) / static_cast<double>(k);
223 
224  return minus ? -f : f;
225 }
226 
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// fscanf work-alike: packages the variable arguments into a va_list and
// hands them to tvfscanf().  Returns whatever tvfscanf() returns.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
239 
240 #ifdef EMBEDDED
241 
242 int fscanf(FILE* stream, const char *format, ...) {
243  va_list ap;
244  int rv;
245 
246  va_start(ap, format);
247  rv = tvfscanf(stream, format, ap);
248  va_end(ap);
249 
250  return rv;
251 }
252 
253 int vfscanf(FILE* stream, const char *format, ...) {
254  va_list ap;
255  int rv;
256 
257  va_start(ap, format);
258  rv = tvfscanf(stream, format, ap);
259  va_end(ap);
260 
261  return rv;
262 }
263 #endif
264 
265 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
266  const char *p = format;
267  char ch;
268  int q = 0;
269  uintmax_t val = 0;
270  int rank = RANK_INT; // Default rank
271  unsigned int width = UINT_MAX;
272  int base;
273  int flags = 0;
274  enum {
275  ST_NORMAL, // Ground state
276  ST_FLAGS, // Special flags
277  ST_WIDTH, // Field width
278  ST_MODIFIERS, // Length or conversion modifiers
279  ST_MATCH_INIT, // Initial state of %[ sequence
280  ST_MATCH, // Main state of %[ sequence
281  ST_MATCH_RANGE, // After - in a %[ sequence
282  } state = ST_NORMAL;
283  char *sarg = NULL; // %s %c or %[ string argument
284  enum Bail bail = BAIL_NONE;
285  int sign;
286  int converted = 0; // Successful conversions
287  unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
288  (CHAR_BIT * sizeof(long))];
289  int matchinv = 0; // Is match map inverted?
290  unsigned char range_start = 0;
291  off_t start_off = ftell(stream);
292 
293  // Skip leading spaces
294  SkipSpace(stream);
295 
296  while ((ch = *p++) && !bail) {
297  switch (state) {
298  case ST_NORMAL:
299  if (ch == '%') {
300  state = ST_FLAGS;
301  flags = 0; rank = RANK_INT; width = UINT_MAX;
302  } else if (isspace(static_cast<unsigned char>(ch))) {
303  SkipSpace(stream);
304  } else {
305  if (fgetc(stream) != ch)
306  bail = BAIL_ERR; // Match failure
307  }
308  break;
309 
310  case ST_FLAGS:
311  if (ch == '*') {
312  flags |= FL_SPLAT;
313  } else if ('0' <= ch && ch <= '9') {
314  width = (ch-'0');
315  state = ST_WIDTH;
316  flags |= FL_WIDTH;
317  } else {
318  state = ST_MODIFIERS;
319  p--; // Process this character again
320  }
321  break;
322 
323  case ST_WIDTH:
324  if (ch >= '0' && ch <= '9') {
325  width = width*10+(ch-'0');
326  } else {
327  state = ST_MODIFIERS;
328  p--; // Process this character again
329  }
330  break;
331 
332  case ST_MODIFIERS:
333  switch (ch) {
334  // Length modifiers - nonterminal sequences
335  case 'h':
336  rank--; // Shorter rank
337  break;
338  case 'l':
339  rank++; // Longer rank
340  break;
341  case 'j':
342  rank = kIntMaxRank;
343  break;
344  case 'z':
345  rank = kSizeTRank;
346  break;
347  case 't':
348  rank = kPtrDiffRank;
349  break;
350  case 'L':
351  case 'q':
352  rank = RANK_LONGLONG; // long double/long long
353  break;
354 
355  default:
356  // Output modifiers - terminal sequences
357  state = ST_NORMAL; // Next state will be normal
358  if (rank < kMinRank) // Canonicalize rank
359  rank = kMinRank;
360  else if (rank > kMaxRank)
361  rank = kMaxRank;
362 
363  switch (ch) {
364  case 'P': // Upper case pointer
365  case 'p': // Pointer
366  rank = RANK_PTR;
367  base = 0; sign = 0;
368  goto scan_int;
369 
370  case 'i': // Base-independent integer
371  base = 0; sign = 1;
372  goto scan_int;
373 
374  case 'd': // Decimal integer
375  base = 10; sign = 1;
376  goto scan_int;
377 
378  case 'o': // Octal integer
379  base = 8; sign = 0;
380  goto scan_int;
381 
382  case 'u': // Unsigned decimal integer
383  base = 10; sign = 0;
384  goto scan_int;
385 
386  case 'x': // Hexadecimal integer
387  case 'X':
388  base = 16; sign = 0;
389  goto scan_int;
390 
391  case 'n': // Number of characters consumed
392  val = ftell(stream) - start_off;
393  goto set_integer;
394 
395  scan_int:
396  q = SkipSpace(stream);
397  if ( q <= 0 ) {
398  bail = BAIL_EOF;
399  break;
400  }
401  val = streamtoumax(stream, base);
402  // fall through
403 
404  set_integer:
405  if (!(flags & FL_SPLAT)) {
406  converted++;
407  switch(rank) {
408  case RANK_CHAR:
409  *va_arg(ap, unsigned char *)
410  = static_cast<unsigned char>(val);
411  break;
412  case RANK_SHORT:
413  *va_arg(ap, unsigned short *)
414  = static_cast<unsigned short>(val);
415  break;
416  case RANK_INT:
417  *va_arg(ap, unsigned int *)
418  = static_cast<unsigned int>(val);
419  break;
420  case RANK_LONG:
421  *va_arg(ap, unsigned long *)
422  = static_cast<unsigned long>(val);
423  break;
424  case RANK_LONGLONG:
425  *va_arg(ap, unsigned long long *)
426  = static_cast<unsigned long long>(val);
427  break;
428  case RANK_PTR:
429  *va_arg(ap, void **)
430  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
431  break;
432  }
433  }
434  break;
435 
436  case 'f': // Preliminary float value parsing
437  case 'g':
438  case 'G':
439  case 'e':
440  case 'E':
441  q = SkipSpace(stream);
442  if (q <= 0) {
443  bail = BAIL_EOF;
444  break;
445  }
446 
447  {
448  double fval = streamtofloat(stream);
449  if (!(flags & FL_SPLAT)) {
450  if (rank == RANK_INT)
451  *va_arg(ap, float *) = static_cast<float>(fval);
452  else if (rank == RANK_LONG)
453  *va_arg(ap, double *) = static_cast<double>(fval);
454  converted++;
455  }
456  }
457  break;
458 
459  case 'c': // Character
460  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
461  sarg = va_arg(ap, char *);
462  while (width--) {
463  if ((q = fgetc(stream)) <= 0) {
464  bail = BAIL_EOF;
465  break;
466  }
467  if (!(flags & FL_SPLAT)) {
468  *sarg++ = q;
469  converted++;
470  }
471  }
472  break;
473 
474  case 's': // String
475  {
476  char *sp;
477  sp = sarg = va_arg(ap, char *);
478  while (width--) {
479  q = fgetc(stream);
480  if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
481  ungetc(q, stream);
482  break;
483  }
484  if (!(flags & FL_SPLAT)) *sp = q;
485  sp++;
486  }
487  if (sarg == sp) {
488  bail = BAIL_EOF;
489  } else if (!(flags & FL_SPLAT)) {
490  *sp = '\0'; // Terminate output
491  converted++;
492  } else {
493  }
494  }
495  break;
496 
497  case '[': // Character range
498  sarg = va_arg(ap, char *);
499  state = ST_MATCH_INIT;
500  matchinv = 0;
501  memset(matchmap, 0, sizeof matchmap);
502  break;
503 
504  case '%': // %% sequence
505  if (fgetc(stream) != '%' )
506  bail = BAIL_ERR;
507  break;
508 
509  default: // Anything else
510  bail = BAIL_ERR; // Unknown sequence
511  break;
512  }
513  }
514  break;
515 
516  case ST_MATCH_INIT: // Initial state for %[ match
517  if (ch == '^' && !(flags & FL_INV)) {
518  matchinv = 1;
519  } else {
520  SetBit(matchmap, static_cast<unsigned char>(ch));
521  state = ST_MATCH;
522  }
523  break;
524 
525  case ST_MATCH: // Main state for %[ match
526  if (ch == ']') {
527  goto match_run;
528  } else if (ch == '-') {
529  range_start = static_cast<unsigned char>(ch);
530  state = ST_MATCH_RANGE;
531  } else {
532  SetBit(matchmap, static_cast<unsigned char>(ch));
533  }
534  break;
535 
536  case ST_MATCH_RANGE: // %[ match after -
537  if (ch == ']') {
538  SetBit(matchmap, static_cast<unsigned char>('-'));
539  goto match_run;
540  } else {
541  int i;
542  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
543  SetBit(matchmap, i);
544  state = ST_MATCH;
545  }
546  break;
547 
548  match_run: // Match expression finished
549  char* oarg = sarg;
550  while (width) {
551  q = fgetc(stream);
552  unsigned char qc = static_cast<unsigned char>(q);
553  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
554  ungetc(q, stream);
555  break;
556  }
557  if (!(flags & FL_SPLAT)) *sarg = q;
558  sarg++;
559  }
560  if (oarg == sarg) {
561  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
562  } else if (!(flags & FL_SPLAT)) {
563  *sarg = '\0';
564  converted++;
565  }
566  break;
567  }
568  }
569 
570  if (bail == BAIL_EOF && !converted)
571  converted = -1; // Return EOF (-1)
572 
573  return converted;
574 }
575 
576 #ifdef EMBEDDED
// Replacement creat for EMBEDDED builds: opens |pathname| write-only,
// creating it if necessary and truncating it, with permission bits |mode|.
int creat(const char *pathname, mode_t mode) {
  const int open_flags = O_CREAT | O_TRUNC | O_WRONLY;
  return open(pathname, open_flags, mode);
}
580 
581 #endif // EMBEDDED
Flags
Definition: scanutils.cpp:41
#define tprintf(...)
Definition: tprintf.h:31
Ranks
Definition: scanutils.cpp:48
size_t LongBit()
Definition: scanutils.cpp:71
enum Ranks kIntMaxRank
Definition: scanutils.cpp:60
double strtofloat(const char *s)
Definition: scanutils.cpp:194
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:229
CMD_EVENTS mode
Definition: pgedit.cpp:116
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:62
enum Ranks kMinRank
Definition: scanutils.cpp:57
double streamtofloat(FILE *s)
Definition: scanutils.cpp:147
enum Ranks kMaxRank
Definition: scanutils.cpp:58
enum Ranks kSizeTRank
Definition: scanutils.cpp:61
Bail
Definition: scanutils.cpp:64
#define NULL
Definition: host.h:144
uintmax_t streamtoumax(FILE *s, int base)
Definition: scanutils.cpp:106
long off_t
Definition: scanutils.cpp:38