tesseract v5.3.3.20231005
cpp.tokenize Namespace Reference

Classes

class  Token
 

Functions

def GetTokens (source)
 
def main (argv)
 

Variables

 VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
 
 HEX_DIGITS = set('0123456789abcdefABCDEF')
 
 INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
 
string UNKNOWN = 'UNKNOWN'
 
string SYNTAX = 'SYNTAX'
 
string CONSTANT = 'CONSTANT'
 
string NAME = 'NAME'
 
string PREPROCESSOR = 'PREPROCESSOR'
 
 WHENCE_STREAM
 
 WHENCE_QUEUE
 

Function Documentation

◆ GetTokens()

def cpp.tokenize.GetTokens(source)
Returns a sequence of Tokens.

Args:
  source: string of C++ source code.

Yields:
  Token that represents the next token in the source.

Definition at line 116 of file tokenize.py.

def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)

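A minimal usage sketch (not part of the generated documentation): it assumes the cpp package from the training tools is importable, and that Token exposes token_type, name, start and end attributes, which is what the constructor call above and the commented-out progress print in main() suggest.

from cpp import tokenize

# Tokenize a small C++ snippet and show each token with its type and
# [start, end) offsets into the source string.
source = 'int main() { return 0; }  // entry point\n'
for token in tokenize.GetTokens(source):
    print('%-12s %-14r [%d:%d]' % (token.token_type, token.name,
                                   token.start, token.end))

Note that // and /* */ comments are skipped rather than yielded, so the trailing comment in the snippet produces no token.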

◆ main()

def cpp.tokenize.main(argv)
Driver mostly for testing purposes.

Definition at line 271 of file tokenize.py.

def main(argv):
    """Driver mostly for testing purposes."""
    for filename in argv[1:]:
        source = utils.ReadFile(filename)
        if source is None:
            continue

        for token in GetTokens(source):
            print('%-12s: %s' % (token.token_type, token.name))
            # print('\r%6.2f%%' % (100.0 * index / token.end),)
        sys.stdout.write('\n')

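A driving sketch under the same import-path assumption as above; utils.ReadFile must be able to open the files passed in, and the first argv entry is ignored just as when the script is run directly.

from cpp import tokenize

# Mirrors running the script on the command line: every file named after
# argv[0] is read and dumped as '<token_type>: <name>' lines, one file per block.
tokenize.main(['tokenize.py', 'foo.cc', 'bar.h'])

For a file containing int x = 42;, the dump would contain lines such as NAME: int, NAME: x, SYNTAX: =, CONSTANT: 42, and SYNTAX: ;.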

Variable Documentation

◆ CONSTANT

string cpp.tokenize.CONSTANT = 'CONSTANT'

Definition at line 52 of file tokenize.py.

◆ HEX_DIGITS

cpp.tokenize.HEX_DIGITS = set('0123456789abcdefABCDEF')

Definition at line 41 of file tokenize.py.

◆ INT_OR_FLOAT_DIGITS

cpp.tokenize.INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

Definition at line 42 of file tokenize.py.

◆ NAME

string cpp.tokenize.NAME = 'NAME'

Definition at line 53 of file tokenize.py.

◆ PREPROCESSOR

string cpp.tokenize.PREPROCESSOR = 'PREPROCESSOR'

Definition at line 54 of file tokenize.py.

◆ SYNTAX

string cpp.tokenize.SYNTAX = 'SYNTAX'

Definition at line 51 of file tokenize.py.

◆ UNKNOWN

string cpp.tokenize.UNKNOWN = 'UNKNOWN'

Definition at line 50 of file tokenize.py.

◆ VALID_IDENTIFIER_CHARS

cpp.tokenize.VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')

Definition at line 40 of file tokenize.py.

◆ WHENCE_QUEUE

cpp.tokenize.WHENCE_QUEUE

Definition at line 58 of file tokenize.py.

◆ WHENCE_STREAM

cpp.tokenize.WHENCE_STREAM

Definition at line 58 of file tokenize.py.