"""Tokenize C++ source code."""
25 import __builtin__
as builtins
33if not hasattr(builtins,
'set'):
35 from sets
import Set
as set
# Character classes consulted by the tokenizer while scanning identifiers
# and numeric literals.  Sets are used so membership tests are O(1).
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
# Note: '.' is deliberately absent here; the tokenizer builds a second set
# that adds it where a decimal point is acceptable.
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

# Identifier-like prefixes that may immediately precede a quoted literal.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))

# Token type tag assigned to pre-processor directives.
PREPROCESSOR = 'PREPROCESSOR'

# Two distinct integer tags (0 and 1).
# NOTE(review): presumably selects where a token is read from -- confirm
# against the code that consumes these values.
WHENCE_STREAM, WHENCE_QUEUE = range(2)
62 """Data container to represent a C++ token.
64 Tokens can be identifiers, syntax char(s), constants, or
65 pre-processor directives.
67 start contains the index of the first char of the token
in the source
68 end contains the index of the last char of the token
in the source
71 def __init__(self, token_type, name, start, end):
80 return 'Token(%r)' % self.
name
81 return 'Token(%r, %s, %s)' % (self.
name, self.
start, self.
end)
86def _GetString(source, start, i):
87 i = source.find(
'"', i+1)
88 while source[i-1] ==
'\\':
92 while source[j] ==
'\\':
96 if (backslash_count % 2) == 0:
98 i = source.find(
'"', i+1)
102def _GetChar(source, start, i):
104 i = source.find(
"'", i+1)
105 while source[i-1] ==
'\\':
107 if (i - 2) > start
and source[i-2] ==
'\\':
109 i = source.find(
"'", i+1)
117 """Returns a sequence of Tokens.
120 source: string of C++ source code.
123 Token that represents the next token in the source.
126 valid_identifier_chars = VALID_IDENTIFIER_CHARS
127 hex_digits = HEX_DIGITS
128 int_or_float_digits = INT_OR_FLOAT_DIGITS
129 int_or_float_digits2 = int_or_float_digits |
set(
'.')
132 ignore_errors =
False
139 while i < end
and source[i].isspace():
147 if c.isalpha()
or c ==
'_':
149 while source[i]
in valid_identifier_chars:
153 if (source[i] ==
"'" and (i - start) == 1
and
154 source[start:i]
in 'uUL'):
156 token_type = CONSTANT
157 i = _GetChar(source, start, i)
158 elif source[i] ==
"'" and source[start:i]
in _STR_PREFIXES:
159 token_type = CONSTANT
160 i = _GetString(source, start, i)
161 elif c ==
'/' and source[i+1] ==
'/':
162 i = source.find(
'\n', i)
166 elif c ==
'/' and source[i+1] ==
'*':
167 i = source.find(
'*/', i) + 2
169 elif c
in ':+-<>&|*=':
173 if new_ch == c
and c !=
'>':
175 elif c ==
'-' and new_ch ==
'>':
179 elif c
in '()[]{}~!?^%;/.,':
182 if c ==
'.' and source[i].isdigit():
183 token_type = CONSTANT
185 while source[i]
in int_or_float_digits:
188 for suffix
in (
'l',
'f'):
189 if suffix == source[i:i+1].lower():
193 token_type = CONSTANT
194 if c ==
'0' and source[i+1]
in 'xX':
197 while source[i]
in hex_digits:
200 while source[i]
in int_or_float_digits2:
203 for suffix
in (
'ull',
'll',
'ul',
'l',
'f',
'u'):
205 if suffix == source[i:i+size].lower():
209 token_type = CONSTANT
210 i = _GetString(source, start, i)
212 token_type = CONSTANT
213 i = _GetChar(source, start, i)
215 token_type = PREPROCESSOR
216 got_if = source[i:i+3] ==
'#if' and source[i+3:i+4].isspace()
219 elif source[i:i+6] ==
'#endif':
222 ignore_errors =
False
226 i1 = source.find(
'\n', i)
227 i2 = source.find(
'//', i)
228 i3 = source.find(
'/*', i)
229 i4 = source.find(
'"', i)
232 i = min([x
for x
in (i1, i2, i3, i4, end)
if x != -1])
236 i = source.find(
'"', i+1) + 1
240 if not (i == i1
and source[i-1] ==
'\\'):
242 condition = source[start+4:i].lstrip()
243 if (condition.startswith(
'0')
or
244 condition.startswith(
'(0)')):
260 sys.stderr.write(
'Got invalid token in %s @ %d token:%s: %r\n' %
261 (
'?', i, c, source[i-10:i+10]))
262 raise RuntimeError(
'unexpected token')
265 print(
'Invalid index, exiting now.')
267 yield Token(token_type, source[start:i], start, i)
270if __name__ ==
'__main__':
272 """Driver mostly for testing purposes."""
273 for filename
in argv[1:]:
274 source = utils.ReadFile(filename)
279 print(
'%-12s: %s' % (token.token_type, token.name))
281 sys.stdout.write(
'\n')
def __init__(self, token_type, name, start, end)