github.com/k14s/starlark-go@v0.0.0-20200720175618-3a5c849cc368/syntax/scan.go (about) 1 // Copyright 2017 The Bazel Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 // A lexical scanner for Starlark. 8 9 import ( 10 "fmt" 11 "io" 12 "io/ioutil" 13 "log" 14 "math/big" 15 "os" 16 "strconv" 17 "strings" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 // A Token represents a Starlark lexical token. 23 type Token int8 24 25 const ( 26 ILLEGAL Token = iota 27 EOF 28 29 NEWLINE 30 INDENT 31 OUTDENT 32 33 // Tokens with values 34 IDENT // x 35 INT // 123 36 FLOAT // 1.23e45 37 STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo" 38 39 // Punctuation 40 PLUS // + 41 MINUS // - 42 STAR // * 43 SLASH // / 44 SLASHSLASH // // 45 PERCENT // % 46 AMP // & 47 PIPE // | 48 CIRCUMFLEX // ^ 49 LTLT // << 50 GTGT // >> 51 TILDE // ~ 52 DOT // . 53 COMMA // , 54 EQ // = 55 SEMI // ; 56 COLON // : 57 LPAREN // ( 58 RPAREN // ) 59 LBRACK // [ 60 RBRACK // ] 61 LBRACE // { 62 RBRACE // } 63 LT // < 64 GT // > 65 GE // >= 66 LE // <= 67 EQL // == 68 NEQ // != 69 PLUS_EQ // += (keep order consistent with PLUS..GTGT) 70 MINUS_EQ // -= 71 STAR_EQ // *= 72 SLASH_EQ // /= 73 SLASHSLASH_EQ // //= 74 PERCENT_EQ // %= 75 AMP_EQ // &= 76 PIPE_EQ // |= 77 CIRCUMFLEX_EQ // ^= 78 LTLT_EQ // <<= 79 GTGT_EQ // >>= 80 STARSTAR // ** 81 82 // Keywords 83 AND 84 BREAK 85 CONTINUE 86 DEF 87 ELIF 88 ELSE 89 FOR 90 IF 91 IN 92 LAMBDA 93 LOAD 94 NOT 95 NOT_IN // synthesized by parser from NOT IN 96 OR 97 PASS 98 RETURN 99 WHILE 100 101 maxToken 102 ) 103 104 func (tok Token) String() string { return tokenNames[tok] } 105 106 // GoString is like String but quotes punctuation tokens. 107 // Use Sprintf("%#v", tok) when constructing error messages. 108 func (tok Token) GoString() string { 109 if tok >= PLUS && tok <= STARSTAR { 110 return "'" + tokenNames[tok] + "'" 111 } 112 return tokenNames[tok] 113 } 114 115 var tokenNames = [...]string{ 116 ILLEGAL: "illegal token", 117 EOF: "end of file", 118 NEWLINE: "newline", 119 INDENT: "indent", 120 OUTDENT: "outdent", 121 IDENT: "identifier", 122 INT: "int literal", 123 FLOAT: "float literal", 124 STRING: "string literal", 125 PLUS: "+", 126 MINUS: "-", 127 STAR: "*", 128 SLASH: "/", 129 SLASHSLASH: "//", 130 PERCENT: "%", 131 AMP: "&", 132 PIPE: "|", 133 CIRCUMFLEX: "^", 134 LTLT: "<<", 135 GTGT: ">>", 136 TILDE: "~", 137 DOT: ".", 138 COMMA: ",", 139 EQ: "=", 140 SEMI: ";", 141 COLON: ":", 142 LPAREN: "(", 143 RPAREN: ")", 144 LBRACK: "[", 145 RBRACK: "]", 146 LBRACE: "{", 147 RBRACE: "}", 148 LT: "<", 149 GT: ">", 150 GE: ">=", 151 LE: "<=", 152 EQL: "==", 153 NEQ: "!=", 154 PLUS_EQ: "+=", 155 MINUS_EQ: "-=", 156 STAR_EQ: "*=", 157 SLASH_EQ: "/=", 158 SLASHSLASH_EQ: "//=", 159 PERCENT_EQ: "%=", 160 AMP_EQ: "&=", 161 PIPE_EQ: "|=", 162 CIRCUMFLEX_EQ: "^=", 163 LTLT_EQ: "<<=", 164 GTGT_EQ: ">>=", 165 STARSTAR: "**", 166 AND: "and", 167 BREAK: "break", 168 CONTINUE: "continue", 169 DEF: "def", 170 ELIF: "elif", 171 ELSE: "else", 172 FOR: "for", 173 IF: "if", 174 IN: "in", 175 LAMBDA: "lambda", 176 LOAD: "load", 177 NOT: "not", 178 NOT_IN: "not in", 179 OR: "or", 180 PASS: "pass", 181 RETURN: "return", 182 WHILE: "while", 183 } 184 185 // A Position describes the location of a rune of input. 186 type Position struct { 187 file *string // filename (indirect for compactness) 188 Line int32 // 1-based line number; 0 if line unknown 189 Col int32 // 1-based column (rune) number; 0 if column unknown 190 } 191 192 // IsValid reports whether the position is valid. 193 func (p Position) IsValid() bool { return p.file != nil } 194 195 // Filename returns the name of the file containing this position. 196 func (p Position) Filename() string { 197 if p.file != nil { 198 return *p.file 199 } 200 return "<invalid>" 201 } 202 203 // MakePosition returns position with the specified components. 204 func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} } 205 206 // add returns the position at the end of s, assuming it starts at p. 207 func (p Position) add(s string) Position { 208 if n := strings.Count(s, "\n"); n > 0 { 209 p.Line += int32(n) 210 s = s[strings.LastIndex(s, "\n")+1:] 211 p.Col = 1 212 } 213 p.Col += int32(utf8.RuneCountInString(s)) 214 return p 215 } 216 217 func (p Position) String() string { 218 file := p.Filename() 219 if p.Line > 0 { 220 if p.Col > 0 { 221 return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col) 222 } 223 return fmt.Sprintf("%s:%d", file, p.Line) 224 } 225 return file 226 } 227 228 func (p Position) isBefore(q Position) bool { 229 if p.Line != q.Line { 230 return p.Line < q.Line 231 } 232 return p.Col < q.Col 233 } 234 235 // An scanner represents a single input file being parsed. 236 type scanner struct { 237 rest []byte // rest of input (in REPL, a line of input) 238 token []byte // token being scanned 239 pos Position // current input position 240 depth int // nesting of [ ] { } ( ) 241 indentstk []int // stack of indentation levels 242 dents int // number of saved INDENT (>0) or OUTDENT (<0) tokens to return 243 lineStart bool // after NEWLINE; convert spaces to indentation tokens 244 keepComments bool // accumulate comments in slice 245 lineComments []Comment // list of full line comments (if keepComments) 246 suffixComments []Comment // list of suffix comments (if keepComments) 247 248 readline func() ([]byte, error) // read next line of input (REPL only) 249 } 250 251 func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) { 252 sc := &scanner{ 253 pos: Position{file: &filename, Line: 1, Col: 1}, 254 indentstk: make([]int, 1, 10), // []int{0} + spare capacity 255 lineStart: true, 256 keepComments: keepComments, 257 } 258 sc.readline, _ = src.(func() ([]byte, error)) // REPL only 259 if sc.readline == nil { 260 data, err := readSource(filename, src) 261 if err != nil { 262 return nil, err 263 } 264 sc.rest = data 265 } 266 return sc, nil 267 } 268 269 func readSource(filename string, src interface{}) ([]byte, error) { 270 switch src := src.(type) { 271 case string: 272 return []byte(src), nil 273 case []byte: 274 return src, nil 275 case io.Reader: 276 data, err := ioutil.ReadAll(src) 277 if err != nil { 278 err = &os.PathError{Op: "read", Path: filename, Err: err} 279 } 280 return data, nil 281 case nil: 282 return ioutil.ReadFile(filename) 283 default: 284 return nil, fmt.Errorf("invalid source: %T", src) 285 } 286 } 287 288 // An Error describes the nature and position of a scanner or parser error. 289 type Error struct { 290 Pos Position 291 Msg string 292 } 293 294 func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg } 295 296 // errorf is called to report an error. 297 // errorf does not return: it panics. 298 func (sc *scanner) error(pos Position, s string) { 299 panic(Error{pos, s}) 300 } 301 302 func (sc *scanner) errorf(pos Position, format string, args ...interface{}) { 303 sc.error(pos, fmt.Sprintf(format, args...)) 304 } 305 306 func (sc *scanner) recover(err *error) { 307 // The scanner and parser panic both for routine errors like 308 // syntax errors and for programmer bugs like array index 309 // errors. Turn both into error returns. Catching bug panics 310 // is especially important when processing many files. 311 switch e := recover().(type) { 312 case nil: 313 // no panic 314 case Error: 315 *err = e 316 default: 317 *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)} 318 if debug { 319 log.Fatal(*err) 320 } 321 } 322 } 323 324 // eof reports whether the input has reached end of file. 325 func (sc *scanner) eof() bool { 326 return len(sc.rest) == 0 && !sc.readLine() 327 } 328 329 // readLine attempts to read another line of input. 330 // Precondition: len(sc.rest)==0. 331 func (sc *scanner) readLine() bool { 332 if sc.readline != nil { 333 var err error 334 sc.rest, err = sc.readline() 335 if err != nil { 336 sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt 337 } 338 return len(sc.rest) > 0 339 } 340 return false 341 } 342 343 // peekRune returns the next rune in the input without consuming it. 344 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 345 func (sc *scanner) peekRune() rune { 346 // TODO(adonovan): opt: measure and perhaps inline eof. 347 if sc.eof() { 348 return 0 349 } 350 351 // fast path: ASCII 352 if b := sc.rest[0]; b < utf8.RuneSelf { 353 if b == '\r' { 354 return '\n' 355 } 356 return rune(b) 357 } 358 359 r, _ := utf8.DecodeRune(sc.rest) 360 return r 361 } 362 363 // readRune consumes and returns the next rune in the input. 364 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 365 func (sc *scanner) readRune() rune { 366 // eof() has been inlined here, both to avoid a call 367 // and to establish len(rest)>0 to avoid a bounds check. 368 if len(sc.rest) == 0 { 369 if !sc.readLine() { 370 sc.error(sc.pos, "internal scanner error: readRune at EOF") 371 } 372 // Redundant, but eliminates the bounds-check below. 373 if len(sc.rest) == 0 { 374 return 0 375 } 376 } 377 378 // fast path: ASCII 379 if b := sc.rest[0]; b < utf8.RuneSelf { 380 r := rune(b) 381 sc.rest = sc.rest[1:] 382 if r == '\r' { 383 if len(sc.rest) > 0 && sc.rest[0] == '\n' { 384 sc.rest = sc.rest[1:] 385 } 386 r = '\n' 387 } 388 if r == '\n' { 389 sc.pos.Line++ 390 sc.pos.Col = 1 391 } else { 392 sc.pos.Col++ 393 } 394 return r 395 } 396 397 r, size := utf8.DecodeRune(sc.rest) 398 sc.rest = sc.rest[size:] 399 sc.pos.Col++ 400 return r 401 } 402 403 // tokenValue records the position and value associated with each token. 404 type tokenValue struct { 405 raw string // raw text of token 406 int int64 // decoded int 407 bigInt *big.Int // decoded integers > int64 408 float float64 // decoded float 409 string string // decoded string 410 pos Position // start position of token 411 } 412 413 // startToken marks the beginning of the next input token. 414 // It must be followed by a call to endToken once the token has 415 // been consumed using readRune. 416 func (sc *scanner) startToken(val *tokenValue) { 417 sc.token = sc.rest 418 val.raw = "" 419 val.pos = sc.pos 420 } 421 422 // endToken marks the end of an input token. 423 // It records the actual token string in val.raw if the caller 424 // has not done that already. 425 func (sc *scanner) endToken(val *tokenValue) { 426 if val.raw == "" { 427 val.raw = string(sc.token[:len(sc.token)-len(sc.rest)]) 428 } 429 } 430 431 // nextToken is called by the parser to obtain the next input token. 432 // It returns the token value and sets val to the data associated with 433 // the token. 434 // 435 // For all our input tokens, the associated data is val.pos (the 436 // position where the token begins), val.raw (the input string 437 // corresponding to the token). For string and int tokens, the string 438 // and int fields additionally contain the token's interpreted value. 439 func (sc *scanner) nextToken(val *tokenValue) Token { 440 441 // The following distribution of tokens guides case ordering: 442 // 443 // COMMA 27 % 444 // STRING 23 % 445 // IDENT 15 % 446 // EQL 11 % 447 // LBRACK 5.5 % 448 // RBRACK 5.5 % 449 // NEWLINE 3 % 450 // LPAREN 2.9 % 451 // RPAREN 2.9 % 452 // INT 2 % 453 // others < 1 % 454 // 455 // Although NEWLINE tokens are infrequent, and lineStart is 456 // usually (~97%) false on entry, skipped newlines account for 457 // about 50% of all iterations of the 'start' loop. 458 459 start: 460 var c rune 461 462 // Deal with leading spaces and indentation. 463 blank := false 464 savedLineStart := sc.lineStart 465 if sc.lineStart { 466 sc.lineStart = false 467 col := 0 468 for { 469 c = sc.peekRune() 470 if c == ' ' { 471 col++ 472 sc.readRune() 473 } else if c == '\t' { 474 const tab = 8 475 col += int(tab - (sc.pos.Col-1)%tab) 476 sc.readRune() 477 } else { 478 break 479 } 480 } 481 482 // The third clause matches EOF. 483 if c == '#' || c == '\n' || c == 0 { 484 blank = true 485 } 486 487 // Compute indentation level for non-blank lines not 488 // inside an expression. This is not the common case. 489 if !blank && sc.depth == 0 { 490 cur := sc.indentstk[len(sc.indentstk)-1] 491 if col > cur { 492 // indent 493 sc.dents++ 494 sc.indentstk = append(sc.indentstk, col) 495 } else if col < cur { 496 // outdent(s) 497 for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] { 498 sc.dents-- 499 sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop 500 } 501 if col != sc.indentstk[len(sc.indentstk)-1] { 502 sc.error(sc.pos, "unindent does not match any outer indentation level") 503 } 504 } 505 } 506 } 507 508 // Return saved indentation tokens. 509 if sc.dents != 0 { 510 sc.startToken(val) 511 sc.endToken(val) 512 if sc.dents < 0 { 513 sc.dents++ 514 return OUTDENT 515 } else { 516 sc.dents-- 517 return INDENT 518 } 519 } 520 521 // start of line proper 522 c = sc.peekRune() 523 524 // Skip spaces. 525 for c == ' ' || c == '\t' { 526 sc.readRune() 527 c = sc.peekRune() 528 } 529 530 // comment 531 if c == '#' { 532 if sc.keepComments { 533 sc.startToken(val) 534 } 535 // Consume up to newline (included). 536 for c != 0 && c != '\n' { 537 sc.readRune() 538 c = sc.peekRune() 539 } 540 if sc.keepComments { 541 sc.endToken(val) 542 if blank { 543 sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw}) 544 } else { 545 sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw}) 546 } 547 } 548 } 549 550 // newline 551 if c == '\n' { 552 sc.lineStart = true 553 554 // Ignore newlines within expressions (common case). 555 if sc.depth > 0 { 556 sc.readRune() 557 goto start 558 } 559 560 // Ignore blank lines, except in the REPL, 561 // where they emit OUTDENTs and NEWLINE. 562 if blank { 563 if sc.readline == nil { 564 sc.readRune() 565 goto start 566 } else if len(sc.indentstk) > 1 { 567 sc.dents = 1 - len(sc.indentstk) 568 sc.indentstk = sc.indentstk[:1] 569 goto start 570 } 571 } 572 573 // At top-level (not in an expression). 574 sc.startToken(val) 575 sc.readRune() 576 val.raw = "\n" 577 return NEWLINE 578 } 579 580 // end of file 581 if c == 0 { 582 // Emit OUTDENTs for unfinished indentation, 583 // preceded by a NEWLINE if we haven't just emitted one. 584 if len(sc.indentstk) > 1 { 585 if savedLineStart { 586 sc.dents = 1 - len(sc.indentstk) 587 sc.indentstk = sc.indentstk[:1] 588 goto start 589 } else { 590 sc.lineStart = true 591 sc.startToken(val) 592 val.raw = "\n" 593 return NEWLINE 594 } 595 } 596 597 sc.startToken(val) 598 sc.endToken(val) 599 return EOF 600 } 601 602 // line continuation 603 if c == '\\' { 604 sc.readRune() 605 if sc.peekRune() != '\n' { 606 sc.errorf(sc.pos, "stray backslash in program") 607 } 608 sc.readRune() 609 goto start 610 } 611 612 // start of the next token 613 sc.startToken(val) 614 615 // comma (common case) 616 if c == ',' { 617 sc.readRune() 618 sc.endToken(val) 619 return COMMA 620 } 621 622 // string literal 623 if c == '"' || c == '\'' { 624 return sc.scanString(val, c) 625 } 626 627 // identifier or keyword 628 if isIdentStart(c) { 629 // raw string literal 630 if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { 631 sc.readRune() 632 c = sc.peekRune() 633 return sc.scanString(val, c) 634 } 635 636 for isIdent(c) { 637 sc.readRune() 638 c = sc.peekRune() 639 } 640 sc.endToken(val) 641 if k, ok := keywordToken[val.raw]; ok { 642 return k 643 } 644 645 return IDENT 646 } 647 648 // brackets 649 switch c { 650 case '[', '(', '{': 651 sc.depth++ 652 sc.readRune() 653 sc.endToken(val) 654 switch c { 655 case '[': 656 return LBRACK 657 case '(': 658 return LPAREN 659 case '{': 660 return LBRACE 661 } 662 panic("unreachable") 663 664 case ']', ')', '}': 665 if sc.depth == 0 { 666 sc.errorf(sc.pos, "unexpected %q", c) 667 } else { 668 sc.depth-- 669 } 670 sc.readRune() 671 sc.endToken(val) 672 switch c { 673 case ']': 674 return RBRACK 675 case ')': 676 return RPAREN 677 case '}': 678 return RBRACE 679 } 680 panic("unreachable") 681 } 682 683 // int or float literal, or period 684 if isdigit(c) || c == '.' { 685 return sc.scanNumber(val, c) 686 } 687 688 // other punctuation 689 defer sc.endToken(val) 690 switch c { 691 case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '=' 692 start := sc.pos 693 sc.readRune() 694 if sc.peekRune() == '=' { 695 sc.readRune() 696 switch c { 697 case '<': 698 return LE 699 case '>': 700 return GE 701 case '=': 702 return EQL 703 case '!': 704 return NEQ 705 case '+': 706 return PLUS_EQ 707 case '-': 708 return MINUS_EQ 709 case '/': 710 return SLASH_EQ 711 case '%': 712 return PERCENT_EQ 713 case '&': 714 return AMP_EQ 715 case '|': 716 return PIPE_EQ 717 case '^': 718 return CIRCUMFLEX_EQ 719 } 720 } 721 switch c { 722 case '=': 723 return EQ 724 case '<': 725 if sc.peekRune() == '<' { 726 sc.readRune() 727 if sc.peekRune() == '=' { 728 sc.readRune() 729 return LTLT_EQ 730 } else { 731 return LTLT 732 } 733 } 734 return LT 735 case '>': 736 if sc.peekRune() == '>' { 737 sc.readRune() 738 if sc.peekRune() == '=' { 739 sc.readRune() 740 return GTGT_EQ 741 } else { 742 return GTGT 743 } 744 } 745 return GT 746 case '!': 747 sc.error(start, "unexpected input character '!'") 748 case '+': 749 return PLUS 750 case '-': 751 return MINUS 752 case '/': 753 if sc.peekRune() == '/' { 754 sc.readRune() 755 if sc.peekRune() == '=' { 756 sc.readRune() 757 return SLASHSLASH_EQ 758 } else { 759 return SLASHSLASH 760 } 761 } 762 return SLASH 763 case '%': 764 return PERCENT 765 case '&': 766 return AMP 767 case '|': 768 return PIPE 769 case '^': 770 return CIRCUMFLEX 771 } 772 panic("unreachable") 773 774 case ':', ';', '~': // single-char tokens (except comma) 775 sc.readRune() 776 switch c { 777 case ':': 778 return COLON 779 case ';': 780 return SEMI 781 case '~': 782 return TILDE 783 } 784 panic("unreachable") 785 786 case '*': // possibly followed by '*' or '=' 787 sc.readRune() 788 switch sc.peekRune() { 789 case '*': 790 sc.readRune() 791 return STARSTAR 792 case '=': 793 sc.readRune() 794 return STAR_EQ 795 } 796 return STAR 797 } 798 799 sc.errorf(sc.pos, "unexpected input character %#q", c) 800 panic("unreachable") 801 } 802 803 func (sc *scanner) scanString(val *tokenValue, quote rune) Token { 804 start := sc.pos 805 triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote) 806 sc.readRune() 807 if !triple { 808 // Precondition: startToken was already called. 809 for { 810 if sc.eof() { 811 sc.error(val.pos, "unexpected EOF in string") 812 } 813 c := sc.readRune() 814 if c == quote { 815 break 816 } 817 if c == '\n' { 818 sc.error(val.pos, "unexpected newline in string") 819 } 820 if c == '\\' { 821 if sc.eof() { 822 sc.error(val.pos, "unexpected EOF in string") 823 } 824 sc.readRune() 825 } 826 } 827 sc.endToken(val) 828 } else { 829 // triple-quoted string literal 830 sc.readRune() 831 sc.readRune() 832 833 // A triple-quoted string literal may span multiple 834 // gulps of REPL input; it is the only such token. 835 // Thus we must avoid {start,end}Token. 836 raw := new(strings.Builder) 837 838 // Copy the prefix, e.g. r''' or """ (see startToken). 839 raw.Write(sc.token[:len(sc.token)-len(sc.rest)]) 840 841 quoteCount := 0 842 for { 843 if sc.eof() { 844 sc.error(val.pos, "unexpected EOF in string") 845 } 846 c := sc.readRune() 847 raw.WriteRune(c) 848 if c == quote { 849 quoteCount++ 850 if quoteCount == 3 { 851 break 852 } 853 } else { 854 quoteCount = 0 855 } 856 if c == '\\' { 857 if sc.eof() { 858 sc.error(val.pos, "unexpected EOF in string") 859 } 860 c = sc.readRune() 861 raw.WriteRune(c) 862 } 863 } 864 val.raw = raw.String() 865 } 866 867 s, _, err := unquote(val.raw) 868 if err != nil { 869 sc.error(start, err.Error()) 870 } 871 val.string = s 872 return STRING 873 } 874 875 func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { 876 // https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements 877 // 878 // Python features not supported: 879 // - integer literals of >64 bits of precision 880 // - 123L or 123l long suffix 881 // - traditional octal: 0755 882 // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals 883 884 start := sc.pos 885 fraction, exponent := false, false 886 887 if c == '.' { 888 // dot or start of fraction 889 sc.readRune() 890 c = sc.peekRune() 891 if !isdigit(c) { 892 sc.endToken(val) 893 return DOT 894 } 895 fraction = true 896 } else if c == '0' { 897 // hex, octal, binary or float 898 sc.readRune() 899 c = sc.peekRune() 900 901 if c == '.' { 902 fraction = true 903 } else if c == 'x' || c == 'X' { 904 // hex 905 sc.readRune() 906 c = sc.peekRune() 907 if !isxdigit(c) { 908 sc.error(start, "invalid hex literal") 909 } 910 for isxdigit(c) { 911 sc.readRune() 912 c = sc.peekRune() 913 } 914 } else if c == 'o' || c == 'O' { 915 // octal 916 sc.readRune() 917 c = sc.peekRune() 918 if !isodigit(c) { 919 sc.error(sc.pos, "invalid octal literal") 920 } 921 for isodigit(c) { 922 sc.readRune() 923 c = sc.peekRune() 924 } 925 } else if c == 'b' || c == 'B' { 926 // binary 927 sc.readRune() 928 c = sc.peekRune() 929 if !isbdigit(c) { 930 sc.error(sc.pos, "invalid binary literal") 931 } 932 for isbdigit(c) { 933 sc.readRune() 934 c = sc.peekRune() 935 } 936 } else { 937 // float (or obsolete octal "0755") 938 allzeros, octal := true, true 939 for isdigit(c) { 940 if c != '0' { 941 allzeros = false 942 } 943 if c > '7' { 944 octal = false 945 } 946 sc.readRune() 947 c = sc.peekRune() 948 } 949 if c == '.' { 950 fraction = true 951 } else if c == 'e' || c == 'E' { 952 exponent = true 953 } else if octal && !allzeros { 954 sc.endToken(val) 955 sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:]) 956 } 957 } 958 } else { 959 // decimal 960 for isdigit(c) { 961 sc.readRune() 962 c = sc.peekRune() 963 } 964 965 if c == '.' { 966 fraction = true 967 } else if c == 'e' || c == 'E' { 968 exponent = true 969 } 970 } 971 972 if fraction { 973 sc.readRune() // consume '.' 974 c = sc.peekRune() 975 for isdigit(c) { 976 sc.readRune() 977 c = sc.peekRune() 978 } 979 980 if c == 'e' || c == 'E' { 981 exponent = true 982 } 983 } 984 985 if exponent { 986 sc.readRune() // consume [eE] 987 c = sc.peekRune() 988 if c == '+' || c == '-' { 989 sc.readRune() 990 c = sc.peekRune() 991 if !isdigit(c) { 992 sc.error(sc.pos, "invalid float literal") 993 } 994 } 995 for isdigit(c) { 996 sc.readRune() 997 c = sc.peekRune() 998 } 999 } 1000 1001 sc.endToken(val) 1002 if fraction || exponent { 1003 var err error 1004 val.float, err = strconv.ParseFloat(val.raw, 64) 1005 if err != nil { 1006 sc.error(sc.pos, "invalid float literal") 1007 } 1008 return FLOAT 1009 } else { 1010 var err error 1011 s := val.raw 1012 val.bigInt = nil 1013 if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { 1014 val.int, err = strconv.ParseInt(s[2:], 8, 64) 1015 } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { 1016 val.int, err = strconv.ParseInt(s[2:], 2, 64) 1017 } else { 1018 val.int, err = strconv.ParseInt(s, 0, 64) 1019 if err != nil { 1020 num := new(big.Int) 1021 var ok bool = true 1022 val.bigInt, ok = num.SetString(s, 0) 1023 if ok { 1024 err = nil 1025 } 1026 } 1027 } 1028 if err != nil { 1029 sc.error(start, "invalid int literal") 1030 } 1031 return INT 1032 } 1033 } 1034 1035 // isIdent reports whether c is an identifier rune. 1036 func isIdent(c rune) bool { 1037 return isdigit(c) || isIdentStart(c) 1038 } 1039 1040 func isIdentStart(c rune) bool { 1041 return 'a' <= c && c <= 'z' || 1042 'A' <= c && c <= 'Z' || 1043 c == '_' || 1044 unicode.IsLetter(c) 1045 } 1046 1047 func isdigit(c rune) bool { return '0' <= c && c <= '9' } 1048 func isodigit(c rune) bool { return '0' <= c && c <= '7' } 1049 func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' } 1050 func isbdigit(c rune) bool { return '0' == c || c == '1' } 1051 1052 // keywordToken records the special tokens for 1053 // strings that should not be treated as ordinary identifiers. 1054 var keywordToken = map[string]Token{ 1055 "and": AND, 1056 "break": BREAK, 1057 "continue": CONTINUE, 1058 "def": DEF, 1059 "elif": ELIF, 1060 "else": ELSE, 1061 "for": FOR, 1062 "if": IF, 1063 "in": IN, 1064 "lambda": LAMBDA, 1065 "load": LOAD, 1066 "not": NOT, 1067 "or": OR, 1068 "pass": PASS, 1069 "return": RETURN, 1070 "while": WHILE, 1071 1072 // reserved words: 1073 "as": ILLEGAL, 1074 // "assert": ILLEGAL, // heavily used by our tests 1075 "class": ILLEGAL, 1076 "del": ILLEGAL, 1077 "except": ILLEGAL, 1078 "finally": ILLEGAL, 1079 "from": ILLEGAL, 1080 "global": ILLEGAL, 1081 "import": ILLEGAL, 1082 "is": ILLEGAL, 1083 "nonlocal": ILLEGAL, 1084 "raise": ILLEGAL, 1085 "try": ILLEGAL, 1086 "with": ILLEGAL, 1087 "yield": ILLEGAL, 1088 }