tesseract v5.3.3.20231005
cpp.tokenize Namespace Reference

Classes

class  Token
 

Functions

def GetTokens (source)
 
def main (argv)
 

Variables

 VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
 
 HEX_DIGITS = set('0123456789abcdefABCDEF')
 
 INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
 
string UNKNOWN = 'UNKNOWN'
 
string SYNTAX = 'SYNTAX'
 
string CONSTANT = 'CONSTANT'
 
string NAME = 'NAME'
 
string PREPROCESSOR = 'PREPROCESSOR'
 
 WHENCE_STREAM
 
 WHENCE_QUEUE
 

Function Documentation

◆ GetTokens()

def cpp.tokenize.GetTokens(source)
Returns a sequence of Tokens.

Args:
  source: string of C++ source code.

Yields:
  Token that represents the next token in the source.

Definition at line 116 of file tokenize.py.

def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)

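A minimal usage sketch (not part of the generated documentation): it assumes the cpp package from the training tools is importable, and that Token exposes token_type, name, start and end attributes, which is what the constructor call above and the commented-out progress print in main() suggest.

from cpp import tokenize

# Tokenize a small C++ snippet and show each token with its type and
# [start, end) offsets into the source string.
source = 'int main() { return 0; }  // entry point\n'
for token in tokenize.GetTokens(source):
    print('%-12s %-14r [%d:%d]' % (token.token_type, token.name,
                                   token.start, token.end))

Note that // and /* */ comments are skipped rather than yielded, so the trailing comment in the snippet produces no token.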

◆ main()

def cpp.tokenize.main(argv)
Driver mostly for testing purposes.

Definition at line 271 of file tokenize.py.

def main(argv):
    """Driver mostly for testing purposes."""
    for filename in argv[1:]:
        source = utils.ReadFile(filename)
        if source is None:
            continue

        for token in GetTokens(source):
            print('%-12s: %s' % (token.token_type, token.name))
            # print('\r%6.2f%%' % (100.0 * index / token.end),)
        sys.stdout.write('\n')

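A driving sketch under the same import-path assumption as above; utils.ReadFile must be able to open the files passed in, and the first argv entry is ignored just as when the script is run directly.

from cpp import tokenize

# Mirrors running the script on the command line: every file named after
# argv[0] is read and dumped as '<token_type>: <name>' lines, one file per block.
tokenize.main(['tokenize.py', 'foo.cc', 'bar.h'])

For a file containing int x = 42;, the dump would contain lines such as NAME: int, NAME: x, SYNTAX: =, CONSTANT: 42, and SYNTAX: ;.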

Variable Documentation

◆ CONSTANT

string cpp.tokenize.CONSTANT = 'CONSTANT'

Definition at line 52 of file tokenize.py.

◆ HEX_DIGITS

cpp.tokenize.HEX_DIGITS = set('0123456789abcdefABCDEF')

Definition at line 41 of file tokenize.py.

◆ INT_OR_FLOAT_DIGITS

cpp.tokenize.INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

Definition at line 42 of file tokenize.py.

◆ NAME

string cpp.tokenize.NAME = 'NAME'

Definition at line 53 of file tokenize.py.

◆ PREPROCESSOR

string cpp.tokenize.PREPROCESSOR = 'PREPROCESSOR'

Definition at line 54 of file tokenize.py.

◆ SYNTAX

string cpp.tokenize.SYNTAX = 'SYNTAX'

Definition at line 51 of file tokenize.py.

◆ UNKNOWN

string cpp.tokenize.UNKNOWN = 'UNKNOWN'

Definition at line 50 of file tokenize.py.

◆ VALID_IDENTIFIER_CHARS

cpp.tokenize.VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')

Definition at line 40 of file tokenize.py.

◆ WHENCE_QUEUE

cpp.tokenize.WHENCE_QUEUE

Definition at line 58 of file tokenize.py.

◆ WHENCE_STREAM

cpp.tokenize.WHENCE_STREAM

Definition at line 58 of file tokenize.py.