3 # Copyright 2007 Neal Norwitz
4 # Portions Copyright 2007 Google Inc.
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
10 # http://www.apache.org/licenses/LICENSE-2.0
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
18 """Tokenize C++ source code."""
25 import __builtin__ as builtins
33 if not hasattr(builtins, 'set'):
34 # Nominal support for Python 2.3.
35 from sets import Set as set
# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
# Characters consumed while scanning a numeric constant: digits plus the
# exponent markers and signs.
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
# Token type assigned to pre-processor directives (lines starting with #).
PREPROCESSOR = 'PREPROCESSOR'
# Where the token originated from. This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)
62 """Data container to represent a C++ token.
64 Tokens can be identifiers, syntax char(s), constants, or
65 pre-processor directives.
67 start contains the index of the first char of the token in the source
end contains the index one past the last char of the token in the source
def __init__(self, token_type, name, start, end):
    """Initializes the token.

    Args:
      token_type: kind of token, e.g. CONSTANT or PREPROCESSOR.
      name: text of the token.
      start: index of the first char of the token in the source.
      end: index in the source where the token ends.
    """
    self.token_type = token_type
    # The string form of the token and the test driver read name, start
    # and end back from the instance, so they must be stored here.
    self.name = name
    self.start = start
    self.end = end
    # Tokens created through the constructor come from the source stream.
    self.whence = WHENCE_STREAM
80 return 'Token(%r)' % self.name
81 return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
def _GetString(source, start, i):
    """Returns the index just past the closing quote of a string constant.

    Args:
      source: string of C++ source code.
      start: index of the first char of the token (the opening quote, or
        the first char of a prefix such as L or u8).
      i: index of the opening double quote.

    Returns:
      The index one past the closing double quote, or len(source) when the
      string is never closed.
    """
    i = source.find('"', i+1)
    while i != -1:
        # Count the backslashes right before this quote, never looking
        # back past the start of the token.
        j = i - 1
        while j > start and source[j] == '\\':
            j -= 1
        backslash_count = i - 1 - j
        # When trailing backslashes are even (or absent), they escape each
        # other, so this quote really closes the string.
        if (backslash_count % 2) == 0:
            return i + 1
        i = source.find('"', i+1)
    # Unterminated string: consume the rest of the input so the caller
    # always makes forward progress.
    return len(source)
def _GetChar(source, start, i):
    """Returns the index just past the closing quote of a char constant.

    Args:
      source: string of C++ source code.
      start: index of the first char of the token (the opening quote, or
        a one-char prefix such as L).
      i: index of the opening single quote.

    Returns:
      The index one past the closing single quote.  When the constant is
      never closed, start + 1 so that the caller still advances.
    """
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    # Stop as soon as find() fails; indexing with -1 would wrap around to
    # the end of the source and could restart the search from index 0.
    while i != -1 and source[i-1] == '\\':
        # Need to special case '\\': the backslash before the quote is
        # itself escaped, so the quote closes the constant.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
def GetTokens(source):
    """Returns a sequence of Tokens.

    Whitespace, comments and stray backslashes are skipped and never
    produce a token.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: an unexpected character was found outside of an
        #if 0 block.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # Slicing (rather than indexing) yields '' at the end of the input,
        # so looking ahead never raises IndexError.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            # The first char already qualified; always consume it so that
            # letters outside the cached set cannot yield an empty token.
            i += 1
            while i < end and source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            quote = source[i:i+1]
            if (quote == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and source[start:i] in _STR_PREFIXES:
                # A prefixed string is introduced by a double quote.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':         # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and next_c == '*':         # Find /* comments. */
            # Search after the opener so that "/*/" is not mistaken for a
            # complete comment.
            i = source.find('*/', i+2)
            if i == -1:
                # Unterminated comment: it swallows the rest of the input.
                i = end
            else:
                i += 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i:i+1].isdigit():
                # A leading dot followed by a digit is a float constant.
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and next_c in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                # The first digit already qualified; always consume it.
                i += 1
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while True:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if i < end and source[i] == '"':
                    closing = source.find('"', i+1)
                    if closing == -1:
                        # Unterminated quote: the directive runs to the end
                        # of the input.
                        i = end
                    else:
                        i = closing + 1
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            # Clamp the context window so a negative start does not wrap
            # around to the end of the source.
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(i-10, 0):i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
270 if __name__ == '__main__':
272 """Driver mostly for testing purposes."""
273 for filename in argv[1:]:
274 source = utils.ReadFile(filename)
278 for token in GetTokens(source):
279 print('%-12s: %s' % (token.token_type, token.name))
280 # print('\r%6.2f%%' % (100.0 * index / token.end),)
281 sys.stdout.write('\n')