# Source: github.com/google/grumpy@v0.0.0-20171122020858-3ec87959189c/third_party/pythonparser/lexer.py
"""
The :mod:`lexer` module concerns itself with tokenizing Python source.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from . import source, diagnostic
import re
import unicodedata
import sys

# Compatibility shims: ``unichr`` builds a text character from a code point
# and ``byte`` builds a one-byte string from an integer on both Python 2 and 3.
if sys.version_info[0] == 3:
    unichr = chr
    byte = lambda x: bytes([x])
else:
    byte = chr

class Token:
    """
    The :class:`Token` encapsulates a single lexer token and its location
    in the source code.

    :ivar loc: (:class:`pythonparser.source.Range`) token location
    :ivar kind: (string) token kind
    :ivar value: token value; None or a kind-specific class
    """
    def __init__(self, loc, kind, value=None):
        self.loc, self.kind, self.value = loc, kind, value

    def __repr__(self):
        return "Token(%s, \"%s\", %s)" % (repr(self.loc), self.kind, repr(self.value))

class Lexer:
    """
    The :class:`Lexer` class extracts tokens and comments from
    a :class:`pythonparser.source.Buffer`.

    :class:`Lexer` is an iterable.

    :ivar version: (tuple of (*major*, *minor*))
        the version of Python, determining the grammar used
    :ivar source_buffer: (:class:`pythonparser.source.Buffer`)
        the source buffer
    :ivar diagnostic_engine: (:class:`pythonparser.diagnostic.Engine`)
        the diagnostic engine
    :ivar offset: (integer) character offset into ``source_buffer``
        indicating where the next token will be recognized
    :ivar interactive: (boolean) whether a completely empty line
        should generate a NEWLINE token, for use in REPLs
    """

    _reserved_2_6 = frozenset([
        "!=", "%", "%=", "&", "&=", "(", ")", "*", "**", "**=", "*=", "+", "+=",
        ",", "-", "-=", ".", "/", "//", "//=", "/=", ":", ";", "<", "<<", "<<=",
        "<=", "<>", "=", "==", ">", ">=", ">>", ">>=", "@", "[", "]", "^", "^=", "`",
        "and", "as", "assert", "break", "class", "continue", "def", "del", "elif",
        "else", "except", "exec", "finally", "for", "from", "global", "if", "import",
        "in", "is", "lambda", "not", "or", "pass", "print", "raise", "return", "try",
        "while", "with", "yield", "{", "|", "|=", "}", "~"
    ])

    _reserved_3_0 = _reserved_2_6 \
        - set(["<>", "`", "exec", "print"]) \
        | set(["->", "...", "False", "None", "nonlocal", "True"])

    _reserved_3_1 = _reserved_3_0 \
        | set(["<>"])

    _reserved_3_5 = _reserved_3_1 \
        | set(["@", "@="])

    _reserved = {
        (2, 6): _reserved_2_6,
        (2, 7): _reserved_2_6,
        (3, 0): _reserved_3_0,
        (3, 1): _reserved_3_1,
        (3, 2): _reserved_3_1,
        (3, 3): _reserved_3_1,
        (3, 4): _reserved_3_1,
        (3, 5): _reserved_3_5,
    }
    r"""
    A map from a tuple (*major*, *minor*) corresponding to Python version to
    :class:`frozenset`\s of keywords.
    """

    _string_prefixes_3_1 = frozenset(["", "r", "b", "br"])
    _string_prefixes_3_3 = frozenset(["", "r", "u", "b", "br", "rb"])

    # holy mother of god why
    _string_prefixes = {
        (2, 6): frozenset(["", "r", "u", "ur"]),
        (2, 7): frozenset(["", "r", "u", "ur", "b", "br"]),
        (3, 0): frozenset(["", "r", "b"]),
        (3, 1): _string_prefixes_3_1,
        (3, 2): _string_prefixes_3_1,
        (3, 3): _string_prefixes_3_3,
        (3, 4): _string_prefixes_3_3,
        (3, 5): _string_prefixes_3_3,
    }
    r"""
    A map from a tuple (*major*, *minor*) corresponding to Python version to
    :class:`frozenset`\s of string prefixes.
    """

    def __init__(self, source_buffer, version, diagnostic_engine, interactive=False):
        """
        Set up lexer state and compile the token regular expression for
        the requested grammar version.

        :param source_buffer: the buffer whose ``source`` text is tokenized
        :param version: (tuple of (*major*, *minor*)) a key of ``_reserved``
        :param diagnostic_engine: object whose ``process`` method receives
            every diagnostic the lexer produces
        :param interactive: whether an empty line yields a ``newline`` token
        :raises NotImplementedError: if ``version`` is not in ``_reserved``
        """
        self.source_buffer = source_buffer
        self.version = version
        self.diagnostic_engine = diagnostic_engine
        self.interactive = interactive
        self.print_function = False
        self.unicode_literals = self.version >= (3, 0)

        self.offset = 0
        # True while no non-whitespace token has been lexed on the current line.
        self.new_line = True
        # Stack of (column, range, literal whitespace) for open indentation levels.
        self.indent = [(0, source.Range(source_buffer, 0, 0), "")]
        # Comments encountered so far; they are never queued as tokens.
        self.comments = []
        # Tokens lexed but not yet handed out by next().
        self.queue = []
        # Ranges of the currently unclosed "(", "{" and "[" delimiters.
        self.parentheses = []
        self.curly_braces = []
        self.square_braces = []

        try:
            reserved = self._reserved[version]
        except KeyError:
            raise NotImplementedError("pythonparser.lexer.Lexer cannot lex Python %s" % str(version))

        # Sort for the regexp to obey longest-match rule.
        re_reserved = sorted(reserved, reverse=True, key=len)
        re_keywords = "|".join([kw for kw in re_reserved if kw.isalnum()])
        re_operators = "|".join([re.escape(op) for op in re_reserved if not op.isalnum()])

        # Python 3.0 uses ID_Start, >3.0 uses XID_Start
        if self.version == (3, 0):
            id_xid = ""
        else:
            id_xid = "X"

        # To speed things up on CPython, we use the re module to generate a DFA
        # from our token set and execute it in C. Every result yielded by
        # iterating this regular expression has exactly one non-empty group
        # that would correspond to a e.g. lex scanner branch.
        # The only thing left to Python code is then to select one from this
        # small set of groups, which is much faster than dissecting the strings.
        #
        # A lexer has to obey longest-match rule, but a regular expression does not.
        # Therefore, the cases in it are carefully sorted so that the longest
        # ones come up first. The exception is the identifier case, which would
        # otherwise grab all keywords; it is made to work by making it impossible
        # for the keyword case to match a word prefix, and ordering it before
        # the identifier case.
        self._lex_token_re = re.compile(r"""
        [ \t\f]* # initial whitespace
        ( # 1
            (\\)? # ?2 line continuation
            ([\n]|[\r][\n]|[\r]) # 3 newline
        |   (\#.*) # 4 comment
        |   ( # 5 floating point or complex literal
                (?: [0-9]* \. [0-9]+
                |   [0-9]+ \.?
                ) [eE] [+-]? [0-9]+
            |   [0-9]* \. [0-9]+
            |   [0-9]+ \.
            ) ([jJ])? # ?6 complex suffix
        |   ([0-9]+) [jJ] # 7 complex literal
        |   (?: # integer literal
                ( [1-9] [0-9]* ) # 8 dec
            |   0[oO] ( [0-7]+ ) # 9 oct
            |   0[xX] ( [0-9A-Fa-f]+ ) # 10 hex
            |   0[bB] ( [01]+ ) # 11 bin
            |   ( [0-9] [0-9]* ) # 12 bare oct
            )
            ([Ll])? # 13 long option
        |   ([BbUu]?[Rr]?) # ?14 string literal options
            (?: # string literal start
                # 15, 16, 17 long string
                (""\"|''') ((?: \\?[\n] | \\. | . )*?) (\15)
                # 18, 19, 20 short string
            |   (" |' ) ((?: \\ [\n] | \\. | . )*?) (\18)
                # 21 unterminated
            |   (""\"|'''|"|')
            )
        |   ((?:{keywords})\b|{operators}) # 22 keywords and operators
        |   ([A-Za-z_][A-Za-z0-9_]*\b) # 23 identifier
        |   (\p{{{id_xid}ID_Start}}\p{{{id_xid}ID_Continue}}*) # 24 Unicode identifier
        |   ($) # 25 end-of-file
        )
        """.format(keywords=re_keywords, operators=re_operators,
                   id_xid=id_xid), re.VERBOSE|re.UNICODE)

    # These are identical for all lexer instances.
    _lex_escape_pattern = r"""
    \\(?:
        ([\n\\'"abfnrtv]) # 1 single-char
    |   ([0-7]{1,3}) # 2 oct
    |   x([0-9A-Fa-f]{2}) # 3 hex
    )
    """
    _lex_escape_re = re.compile(_lex_escape_pattern.encode(), re.VERBOSE)

    _lex_escape_unicode_re = re.compile(_lex_escape_pattern + r"""
    | \\(?:
        u([0-9A-Fa-f]{4}) # 4 unicode-16
    |   U([0-9A-Fa-f]{8}) # 5 unicode-32
    |   N\{(.+?)\} # 6 unicode-name
    )
    """, re.VERBOSE)

    def next(self, eof_token=False):
        """
        Returns token at ``offset`` as a :class:`Token` and advances ``offset``
        to point past the end of the token, where the token has:

        - *range* which is a :class:`pythonparser.source.Range` that includes
          the token but not surrounding whitespace,
        - *kind* which is a string containing one of Python keywords or operators,
          ``newline``, ``float``, ``int``, ``complex``, ``strbegin``,
          ``strdata``, ``strend``, ``ident``, ``indent``, ``dedent`` or ``eof``
          (if ``eof_token`` is True).
        - *value* which is the flags as lowercase string if *kind* is ``strbegin``,
          the string contents if *kind* is ``strdata``,
          the numeric value if *kind* is ``float``, ``int`` or ``complex``,
          the identifier if *kind* is ``ident`` and ``None`` in any other case.

        :param eof_token: if true, will return a token with kind ``eof``
            when the input is exhausted; if false, will raise ``StopIteration``.
        """
        if len(self.queue) == 0:
            self._refill(eof_token)

        return self.queue.pop(0)

    def peek(self, eof_token=False):
        """Same as :meth:`next`, except the token is not dequeued."""
        if len(self.queue) == 0:
            self._refill(eof_token)

        # next() dequeues from the front, so the front is what must be shown;
        # a single refill may queue several tokens (e.g. "indent" + a token).
        return self.queue[0]

    # We need separate next and _refill because lexing can sometimes
    # generate several tokens, e.g. INDENT
    def _refill(self, eof_token):
        """
        Lex forward from ``offset`` until something has been appended to
        ``queue`` (or ``StopIteration`` is raised at the end of input).

        :param eof_token: forwarded from :meth:`next` / :meth:`peek`
        """
        # Iterate rather than recurse: every blank line, comment or joined
        # line asks for another round, and long runs of those would
        # otherwise exhaust the interpreter's recursion limit.
        while self._lex_one(eof_token):
            pass

    def _lex_one(self, eof_token):
        """
        Perform one regular-expression match at ``offset`` and act on it.

        :returns: True if the match was skipped input (blank line, comment,
            joined line, end-of-file marker) and lexing has to continue;
            False once this round is complete.
        :raises StopIteration: at end of input when ``eof_token`` is false
            and nothing is left to emit.
        """
        if self.offset == len(self.source_buffer.source):
            range = source.Range(self.source_buffer, self.offset, self.offset)

            if not self.new_line:
                # Input ended in the middle of a line: synthesize its newline.
                self.new_line = True
                self.queue.append(Token(range, "newline"))
                return False

            # Close every indentation level that is still open.
            while len(self.indent) > 1:
                self.indent.pop(-1)
                self.queue.append(Token(range, "dedent"))

            if eof_token:
                self.queue.append(Token(range, "eof"))
            elif len(self.queue) == 0:
                raise StopIteration

            return False

        match = self._lex_token_re.match(self.source_buffer.source, self.offset)
        if match is None:
            diag = diagnostic.Diagnostic(
                "fatal", "unexpected {character}",
                {"character": repr(self.source_buffer.source[self.offset]).lstrip("u")},
                source.Range(self.source_buffer, self.offset, self.offset + 1))
            # NOTE(review): the code below dereferences ``match``, so this
            # presumably relies on the engine raising for "fatal" -- confirm.
            self.diagnostic_engine.process(diag)

        # Should we emit indent/dedent?
        if self.new_line and \
                match.group(3) is None and \
                match.group(4) is None: # not a blank line
            whitespace = match.string[match.start(0):match.start(1)]
            level = len(whitespace.expandtabs())
            range = source.Range(self.source_buffer, match.start(1), match.start(1))
            if level > self.indent[-1][0]:
                self.indent.append((level, range, whitespace))
                self.queue.append(Token(range, "indent"))
            elif level < self.indent[-1][0]:
                # Pop levels until one matches exactly (or the base level is hit).
                exact = False
                while level <= self.indent[-1][0]:
                    if level == self.indent[-1][0] or self.indent[-1][0] == 0:
                        exact = True
                        break
                    self.indent.pop(-1)
                    self.queue.append(Token(range, "dedent"))
                if not exact:
                    note = diagnostic.Diagnostic(
                        "note", "expected to match level here", {},
                        self.indent[-1][1])
                    error = diagnostic.Diagnostic(
                        "fatal", "inconsistent indentation", {},
                        range, notes=[note])
                    self.diagnostic_engine.process(error)
            elif whitespace != self.indent[-1][2] and self.version >= (3, 0):
                # Same column but different literal whitespace than the
                # enclosing level.
                error = diagnostic.Diagnostic(
                    "error", "inconsistent use of tabs and spaces in indentation", {},
                    range)
                self.diagnostic_engine.process(error)

        # Prepare for next token.
        self.offset = match.end(0)

        tok_range = source.Range(self.source_buffer, *match.span(1))
        if match.group(3) is not None: # newline
            if len(self.parentheses) + len(self.square_braces) + len(self.curly_braces) > 0:
                # 2.1.6 Implicit line joining
                return True
            if match.group(2) is not None:
                # 2.1.5. Explicit line joining
                return True
            if self.new_line and not \
                    (self.interactive and match.group(0) == match.group(3)): # REPL terminator
                # 2.1.7. Blank lines
                return True

            self.new_line = True
            self.queue.append(Token(tok_range, "newline"))
            return False

        if match.group(4) is not None: # comment
            self.comments.append(source.Comment(tok_range, match.group(4)))
            return True

        # Lexing non-whitespace now.
        self.new_line = False

        # ``long`` only exists on a Python 2 host; it is used there for
        # literals that carry an explicit l/L suffix.
        if sys.version_info > (3,) or not match.group(13):
            int_type = int
        else:
            int_type = long

        if match.group(5) is not None: # floating point or complex literal
            if match.group(6) is None:
                self.queue.append(Token(tok_range, "float", float(match.group(5))))
            else:
                self.queue.append(Token(tok_range, "complex", float(match.group(5)) * 1j))

        elif match.group(7) is not None: # complex literal
            self.queue.append(Token(tok_range, "complex", int(match.group(7)) * 1j))

        elif match.group(8) is not None: # integer literal, dec
            literal = match.group(8)
            self._check_long_literal(tok_range, match.group(1))
            self.queue.append(Token(tok_range, "int", int_type(literal)))

        elif match.group(9) is not None: # integer literal, oct
            literal = match.group(9)
            self._check_long_literal(tok_range, match.group(1))
            self.queue.append(Token(tok_range, "int", int_type(literal, 8)))

        elif match.group(10) is not None: # integer literal, hex
            literal = match.group(10)
            self._check_long_literal(tok_range, match.group(1))
            self.queue.append(Token(tok_range, "int", int_type(literal, 16)))

        elif match.group(11) is not None: # integer literal, bin
            literal = match.group(11)
            self._check_long_literal(tok_range, match.group(1))
            self.queue.append(Token(tok_range, "int", int_type(literal, 2)))

        elif match.group(12) is not None: # integer literal, bare oct
            literal = match.group(12)
            if len(literal) > 1 and self.version >= (3, 0):
                error = diagnostic.Diagnostic(
                    "error", "in Python 3, decimal literals must not start with a zero", {},
                    source.Range(self.source_buffer, tok_range.begin_pos, tok_range.begin_pos + 1))
                self.diagnostic_engine.process(error)
            # Treat the optional l/L suffix exactly like the other integer forms.
            self._check_long_literal(tok_range, match.group(1))
            self.queue.append(Token(tok_range, "int", int_type(literal, 8)))

        elif match.group(15) is not None: # long string literal
            self._string_literal(
                options=match.group(14), begin_span=(match.start(14), match.end(15)),
                data=match.group(16), data_span=match.span(16),
                end_span=match.span(17))

        elif match.group(18) is not None: # short string literal
            self._string_literal(
                options=match.group(14), begin_span=(match.start(14), match.end(18)),
                data=match.group(19), data_span=match.span(19),
                end_span=match.span(20))

        elif match.group(21) is not None: # unterminated string
            error = diagnostic.Diagnostic(
                "fatal", "unterminated string", {},
                tok_range)
            self.diagnostic_engine.process(error)

        elif match.group(22) is not None: # keywords and operators
            kwop = match.group(22)
            self._match_pair_delim(tok_range, kwop)
            if kwop == "print" and self.print_function:
                self.queue.append(Token(tok_range, "ident", "print"))
            else:
                self.queue.append(Token(tok_range, kwop))

        elif match.group(23) is not None: # identifier
            self.queue.append(Token(tok_range, "ident", match.group(23)))

        elif match.group(24) is not None: # Unicode identifier
            if self.version < (3, 0):
                error = diagnostic.Diagnostic(
                    "error", "in Python 2, Unicode identifiers are not allowed", {},
                    tok_range)
                self.diagnostic_engine.process(error)
            self.queue.append(Token(tok_range, "ident", match.group(24)))

        elif match.group(25) is not None: # end-of-file
            # Reuse the EOF logic
            return True

        else:
            assert False

        return False

    def _string_literal(self, options, begin_span, data, data_span, end_span):
        """
        Queue the ``strbegin``, ``strdata`` and ``strend`` tokens of one
        string literal.

        :param options: the prefix letters as written; lowercased here
        :param begin_span: (begin, end) offsets of prefix plus opening quote
        :param data: raw text between the quotes
        :param data_span: (begin, end) offsets of ``data``
        :param end_span: (begin, end) offsets of the closing quote
        """
        options = options.lower()
        begin_range = source.Range(self.source_buffer, *begin_span)
        data_range = source.Range(self.source_buffer, *data_span)

        if options not in self._string_prefixes[self.version]:
            error = diagnostic.Diagnostic(
                "error", "string prefix '{prefix}' is not available in Python {major}.{minor}",
                {"prefix": options, "major": self.version[0], "minor": self.version[1]},
                begin_range)
            self.diagnostic_engine.process(error)

        self.queue.append(Token(begin_range, "strbegin", options))
        self.queue.append(Token(data_range,
                                "strdata", self._replace_escape(data_range, options, data)))
        self.queue.append(Token(source.Range(self.source_buffer, *end_span),
                                "strend"))

    def _replace_escape(self, range, mode, value):
        """
        Convert the raw text of a string literal into its value.

        :param range: location of the string data, used for diagnostics
        :param mode: lowercased prefix letters of the literal
        :param value: raw text between the quotes
        :returns: text for unicode literals; for byte literals, ``value``
            encoded with ``source_buffer.encoding``. Escapes are expanded
            unless ``mode`` contains ``r``.
        """
        is_raw = ("r" in mode)
        is_unicode = "u" in mode or ("b" not in mode and self.unicode_literals)

        if not is_unicode:
            value = value.encode(self.source_buffer.encoding)
            if is_raw:
                return value
            return self._replace_escape_bytes(value)

        if is_raw:
            return value

        return self._replace_escape_unicode(range, value)

    def _replace_escape_unicode(self, range, value):
        """
        Expand backslash escapes in the text of a unicode string literal.

        Invalid ``\\U`` and ``\\N{...}`` escapes are reported to the
        diagnostic engine and contribute nothing to the result.

        :param range: location of ``value`` in the source buffer
        :param value: raw text of the literal
        :returns: the text with escapes replaced
        """
        chunks = []
        offset = 0
        while offset < len(value):
            match = self._lex_escape_unicode_re.search(value, offset)
            if match is None:
                # Append the remaining of the string
                chunks.append(value[offset:])
                break

            # Append the part of string before match
            chunks.append(value[offset:match.start()])
            offset = match.end()

            # Process the escape
            if match.group(1) is not None: # single-char
                char = match.group(1)
                if char == "\n":
                    # Backslash-newline is a line continuation: emit nothing.
                    pass
                elif char == "\\" or char == "'" or char == "\"":
                    chunks.append(char)
                elif char == "a":
                    chunks.append("\a")
                elif char == "b":
                    chunks.append("\b")
                elif char == "f":
                    chunks.append("\f")
                elif char == "n":
                    chunks.append("\n")
                elif char == "r":
                    chunks.append("\r")
                elif char == "t":
                    chunks.append("\t")
                elif char == "v":
                    chunks.append("\v")
            elif match.group(2) is not None: # oct
                chunks.append(unichr(int(match.group(2), 8)))
            elif match.group(3) is not None: # hex
                chunks.append(unichr(int(match.group(3), 16)))
            elif match.group(4) is not None: # unicode-16
                chunks.append(unichr(int(match.group(4), 16)))
            elif match.group(5) is not None: # unicode-32
                try:
                    chunks.append(unichr(int(match.group(5), 16)))
                except (ValueError, OverflowError):
                    # Out-of-range code points raise ValueError; values that do
                    # not fit the C int taken by chr/unichr raise OverflowError.
                    error = diagnostic.Diagnostic(
                        "error", "unicode character out of range", {},
                        source.Range(self.source_buffer,
                                     range.begin_pos + match.start(0),
                                     range.begin_pos + match.end(0)))
                    self.diagnostic_engine.process(error)
            elif match.group(6) is not None: # unicode-name
                try:
                    chunks.append(unicodedata.lookup(match.group(6)))
                except KeyError:
                    error = diagnostic.Diagnostic(
                        "error", "unknown unicode character name", {},
                        source.Range(self.source_buffer,
                                     range.begin_pos + match.start(0),
                                     range.begin_pos + match.end(0)))
                    self.diagnostic_engine.process(error)

        return "".join(chunks)

    def _replace_escape_bytes(self, value):
        """
        Expand backslash escapes in the encoded text of a byte string literal.

        Only single-character, octal and ``\\x`` escapes are recognized.

        :param value: encoded raw text of the literal
        :returns: the bytes with escapes replaced
        """
        chunks = []
        offset = 0
        while offset < len(value):
            match = self._lex_escape_re.search(value, offset)
            if match is None:
                # Append the remaining of the string
                chunks.append(value[offset:])
                break

            # Append the part of string before match
            chunks.append(value[offset:match.start()])
            offset = match.end()

            # Process the escape
            if match.group(1) is not None: # single-char
                char = match.group(1)
                if char == b"\n":
                    # Backslash-newline is a line continuation: emit nothing.
                    pass
                elif char == b"\\" or char == b"'" or char == b"\"":
                    chunks.append(char)
                elif char == b"a":
                    chunks.append(b"\a")
                elif char == b"b":
                    chunks.append(b"\b")
                elif char == b"f":
                    chunks.append(b"\f")
                elif char == b"n":
                    chunks.append(b"\n")
                elif char == b"r":
                    chunks.append(b"\r")
                elif char == b"t":
                    chunks.append(b"\t")
                elif char == b"v":
                    chunks.append(b"\v")
            elif match.group(2) is not None: # oct
                chunks.append(byte(int(match.group(2), 8)))
            elif match.group(3) is not None: # hex
                chunks.append(byte(int(match.group(3), 16)))

        return b"".join(chunks)

    def _check_long_literal(self, range, literal):
        """
        Report an error if ``literal`` ends in ``l``/``L`` while lexing a
        Python 3 grammar.

        :param range: location of the whole literal
        :param literal: source text of the literal, suffix included
        """
        if literal[-1] in "lL" and self.version >= (3, 0):
            error = diagnostic.Diagnostic(
                "error", "in Python 3, long integer literals were removed", {},
                source.Range(self.source_buffer, range.end_pos - 1, range.end_pos))
            self.diagnostic_engine.process(error)

    def _match_pair_delim(self, range, kwop):
        """
        Keep the stacks of open ``(``, ``[`` and ``{`` delimiters up to date.

        Opening delimiters are pushed; closing ones are checked against the
        innermost open delimiter and then popped. Any other ``kwop`` is
        ignored.

        :param range: location of ``kwop``
        :param kwop: the keyword or operator that was just lexed
        """
        if kwop == "(":
            self.parentheses.append(range)
        elif kwop == "[":
            self.square_braces.append(range)
        elif kwop == "{":
            self.curly_braces.append(range)
        elif kwop == ")":
            self._check_innermost_pair_delim(range, "(")
            self.parentheses.pop()
        elif kwop == "]":
            self._check_innermost_pair_delim(range, "[")
            self.square_braces.pop()
        elif kwop == "}":
            self._check_innermost_pair_delim(range, "{")
            self.curly_braces.pop()

    def _check_innermost_pair_delim(self, range, expected):
        """
        Emit a fatal diagnostic unless the most recently opened delimiter
        is ``expected``.

        :param range: location of the closing delimiter being checked
        :param expected: the opening delimiter that would match it
        """
        ranges = []
        if len(self.parentheses) > 0:
            ranges.append(("(", self.parentheses[-1]))
        if len(self.square_braces) > 0:
            ranges.append(("[", self.square_braces[-1]))
        if len(self.curly_braces) > 0:
            ranges.append(("{", self.curly_braces[-1]))

        # The innermost open delimiter is the one that starts last.
        ranges.sort(key=lambda k: k[1].begin_pos)
        if ranges:
            compl_kind, compl_range = ranges[-1]
            if compl_kind != expected:
                note = diagnostic.Diagnostic(
                    "note", "'{delimiter}' opened here",
                    {"delimiter": compl_kind},
                    compl_range)
                error = diagnostic.Diagnostic(
                    "fatal", "mismatched '{delimiter}'",
                    {"delimiter": range.source()},
                    range, notes=[note])
                self.diagnostic_engine.process(error)
        else:
            # Nothing is open at all.
            error = diagnostic.Diagnostic(
                "fatal", "mismatched '{delimiter}'",
                {"delimiter": range.source()},
                range)
            self.diagnostic_engine.process(error)

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()