// go.starlark.net@v0.0.0-20231101134539-556fd59b42f6/syntax/scan.go

// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

// A lexical scanner for Starlark.

import (
	"fmt"
	"io"
	"log"
	"math/big"
	"os"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A Token represents a Starlark lexical token.
type Token int8

// The token kinds. The numeric values are assigned by iota, so the
// order of this list is significant: GoString relies on the
// punctuation tokens forming the contiguous range PLUS..STARSTAR,
// and tokenNames is indexed by these values.
const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	// maxToken is not a token: it is one greater than the last
	// token value declared above.
	maxToken
)

// String returns the name of the token as recorded in tokenNames.
// tokenNames is an array indexed by token value, so a value outside
// its bounds causes an index-out-of-range panic.
func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
108 func (tok Token) GoString() string { 109 if tok >= PLUS && tok <= STARSTAR { 110 return "'" + tokenNames[tok] + "'" 111 } 112 return tokenNames[tok] 113 } 114 115 var tokenNames = [...]string{ 116 ILLEGAL: "illegal token", 117 EOF: "end of file", 118 NEWLINE: "newline", 119 INDENT: "indent", 120 OUTDENT: "outdent", 121 IDENT: "identifier", 122 INT: "int literal", 123 FLOAT: "float literal", 124 STRING: "string literal", 125 PLUS: "+", 126 MINUS: "-", 127 STAR: "*", 128 SLASH: "/", 129 SLASHSLASH: "//", 130 PERCENT: "%", 131 AMP: "&", 132 PIPE: "|", 133 CIRCUMFLEX: "^", 134 LTLT: "<<", 135 GTGT: ">>", 136 TILDE: "~", 137 DOT: ".", 138 COMMA: ",", 139 EQ: "=", 140 SEMI: ";", 141 COLON: ":", 142 LPAREN: "(", 143 RPAREN: ")", 144 LBRACK: "[", 145 RBRACK: "]", 146 LBRACE: "{", 147 RBRACE: "}", 148 LT: "<", 149 GT: ">", 150 GE: ">=", 151 LE: "<=", 152 EQL: "==", 153 NEQ: "!=", 154 PLUS_EQ: "+=", 155 MINUS_EQ: "-=", 156 STAR_EQ: "*=", 157 SLASH_EQ: "/=", 158 SLASHSLASH_EQ: "//=", 159 PERCENT_EQ: "%=", 160 AMP_EQ: "&=", 161 PIPE_EQ: "|=", 162 CIRCUMFLEX_EQ: "^=", 163 LTLT_EQ: "<<=", 164 GTGT_EQ: ">>=", 165 STARSTAR: "**", 166 AND: "and", 167 BREAK: "break", 168 CONTINUE: "continue", 169 DEF: "def", 170 ELIF: "elif", 171 ELSE: "else", 172 FOR: "for", 173 IF: "if", 174 IN: "in", 175 LAMBDA: "lambda", 176 LOAD: "load", 177 NOT: "not", 178 NOT_IN: "not in", 179 OR: "or", 180 PASS: "pass", 181 RETURN: "return", 182 WHILE: "while", 183 } 184 185 // A FilePortion describes the content of a portion of a file. 186 // Callers may provide a FilePortion for the src argument of Parse 187 // when the desired initial line and column numbers are not (1, 1), 188 // such as when an expression is parsed from within larger file. 189 type FilePortion struct { 190 Content []byte 191 FirstLine, FirstCol int32 192 } 193 194 // A Position describes the location of a rune of input. 
type Position struct {
	file *string // name of the file, held by pointer to keep Position small
	Line int32   // 1-based line number; 0 if line unknown
	Col  int32   // 1-based column (rune) number; 0 if column unknown
}

// IsValid reports whether the position is valid,
// that is, whether it refers to a file.
func (p Position) IsValid() bool {
	return p.file != nil
}

// Filename returns the name of the file containing this position,
// or "<invalid>" if the position is not valid.
func (p Position) Filename() string {
	if !p.IsValid() {
		return "<invalid>"
	}
	return *p.file
}

// MakePosition returns position with the specified components.
func MakePosition(file *string, line, col int32) Position {
	return Position{file: file, Line: line, Col: col}
}

// add returns the position at the end of s, assuming it starts at p.
// Each newline in s advances the line; the column is then counted
// in runes from the start of the final line of s.
func (p Position) add(s string) Position {
	if lastNL := strings.LastIndex(s, "\n"); lastNL >= 0 {
		p.Line += int32(strings.Count(s, "\n"))
		p.Col = 1
		s = s[lastNL+1:]
	}
	p.Col += int32(utf8.RuneCountInString(s))
	return p
}

// String formats the position as "file:line:col", omitting the
// column, or both line and column, when they are unknown.
func (p Position) String() string {
	name := p.Filename()
	switch {
	case p.Line <= 0:
		return name
	case p.Col <= 0:
		return fmt.Sprintf("%s:%d", name, p.Line)
	default:
		return fmt.Sprintf("%s:%d:%d", name, p.Line, p.Col)
	}
}

// isBefore reports whether p precedes q, comparing
// line numbers first and column numbers second.
func (p Position) isBefore(q Position) bool {
	if p.Line == q.Line {
		return p.Col < q.Col
	}
	return p.Line < q.Line
}

// A scanner represents a single input file being parsed.
245 type scanner struct { 246 rest []byte // rest of input (in REPL, a line of input) 247 token []byte // token being scanned 248 pos Position // current input position 249 depth int // nesting of [ ] { } ( ) 250 indentstk []int // stack of indentation levels 251 dents int // number of saved INDENT (>0) or OUTDENT (<0) tokens to return 252 lineStart bool // after NEWLINE; convert spaces to indentation tokens 253 keepComments bool // accumulate comments in slice 254 lineComments []Comment // list of full line comments (if keepComments) 255 suffixComments []Comment // list of suffix comments (if keepComments) 256 257 readline func() ([]byte, error) // read next line of input (REPL only) 258 } 259 260 func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) { 261 var firstLine, firstCol int32 = 1, 1 262 if portion, ok := src.(FilePortion); ok { 263 firstLine, firstCol = portion.FirstLine, portion.FirstCol 264 } 265 sc := &scanner{ 266 pos: MakePosition(&filename, firstLine, firstCol), 267 indentstk: make([]int, 1, 10), // []int{0} + spare capacity 268 lineStart: true, 269 keepComments: keepComments, 270 } 271 sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only 272 if sc.readline == nil { 273 data, err := readSource(filename, src) 274 if err != nil { 275 return nil, err 276 } 277 sc.rest = data 278 } 279 return sc, nil 280 } 281 282 func readSource(filename string, src interface{}) ([]byte, error) { 283 switch src := src.(type) { 284 case string: 285 return []byte(src), nil 286 case []byte: 287 return src, nil 288 case io.Reader: 289 data, err := io.ReadAll(src) 290 if err != nil { 291 err = &os.PathError{Op: "read", Path: filename, Err: err} 292 return nil, err 293 } 294 return data, nil 295 case FilePortion: 296 return src.Content, nil 297 case nil: 298 return os.ReadFile(filename) 299 default: 300 return nil, fmt.Errorf("invalid source: %T", src) 301 } 302 } 303 304 // An Error describes the nature and 
position of a scanner or parser error. 305 type Error struct { 306 Pos Position 307 Msg string 308 } 309 310 func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg } 311 312 // errorf is called to report an error. 313 // errorf does not return: it panics. 314 func (sc *scanner) error(pos Position, s string) { 315 panic(Error{pos, s}) 316 } 317 318 func (sc *scanner) errorf(pos Position, format string, args ...interface{}) { 319 sc.error(pos, fmt.Sprintf(format, args...)) 320 } 321 322 func (sc *scanner) recover(err *error) { 323 // The scanner and parser panic both for routine errors like 324 // syntax errors and for programmer bugs like array index 325 // errors. Turn both into error returns. Catching bug panics 326 // is especially important when processing many files. 327 switch e := recover().(type) { 328 case nil: 329 // no panic 330 case Error: 331 *err = e 332 default: 333 *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)} 334 if debug { 335 log.Fatal(*err) 336 } 337 } 338 } 339 340 // eof reports whether the input has reached end of file. 341 func (sc *scanner) eof() bool { 342 return len(sc.rest) == 0 && !sc.readLine() 343 } 344 345 // readLine attempts to read another line of input. 346 // Precondition: len(sc.rest)==0. 347 func (sc *scanner) readLine() bool { 348 if sc.readline != nil { 349 var err error 350 sc.rest, err = sc.readline() 351 if err != nil { 352 sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt 353 } 354 return len(sc.rest) > 0 355 } 356 return false 357 } 358 359 // peekRune returns the next rune in the input without consuming it. 360 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 361 func (sc *scanner) peekRune() rune { 362 // TODO(adonovan): opt: measure and perhaps inline eof. 
363 if sc.eof() { 364 return 0 365 } 366 367 // fast path: ASCII 368 if b := sc.rest[0]; b < utf8.RuneSelf { 369 if b == '\r' { 370 return '\n' 371 } 372 return rune(b) 373 } 374 375 r, _ := utf8.DecodeRune(sc.rest) 376 return r 377 } 378 379 // readRune consumes and returns the next rune in the input. 380 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 381 func (sc *scanner) readRune() rune { 382 // eof() has been inlined here, both to avoid a call 383 // and to establish len(rest)>0 to avoid a bounds check. 384 if len(sc.rest) == 0 { 385 if !sc.readLine() { 386 sc.error(sc.pos, "internal scanner error: readRune at EOF") 387 } 388 // Redundant, but eliminates the bounds-check below. 389 if len(sc.rest) == 0 { 390 return 0 391 } 392 } 393 394 // fast path: ASCII 395 if b := sc.rest[0]; b < utf8.RuneSelf { 396 r := rune(b) 397 sc.rest = sc.rest[1:] 398 if r == '\r' { 399 if len(sc.rest) > 0 && sc.rest[0] == '\n' { 400 sc.rest = sc.rest[1:] 401 } 402 r = '\n' 403 } 404 if r == '\n' { 405 sc.pos.Line++ 406 sc.pos.Col = 1 407 } else { 408 sc.pos.Col++ 409 } 410 return r 411 } 412 413 r, size := utf8.DecodeRune(sc.rest) 414 sc.rest = sc.rest[size:] 415 sc.pos.Col++ 416 return r 417 } 418 419 // tokenValue records the position and value associated with each token. 420 type tokenValue struct { 421 raw string // raw text of token 422 int int64 // decoded int 423 bigInt *big.Int // decoded integers > int64 424 float float64 // decoded float 425 string string // decoded string or bytes 426 pos Position // start position of token 427 } 428 429 // startToken marks the beginning of the next input token. 430 // It must be followed by a call to endToken once the token has 431 // been consumed using readRune. 432 func (sc *scanner) startToken(val *tokenValue) { 433 sc.token = sc.rest 434 val.raw = "" 435 val.pos = sc.pos 436 } 437 438 // endToken marks the end of an input token. 
439 // It records the actual token string in val.raw if the caller 440 // has not done that already. 441 func (sc *scanner) endToken(val *tokenValue) { 442 if val.raw == "" { 443 val.raw = string(sc.token[:len(sc.token)-len(sc.rest)]) 444 } 445 } 446 447 // nextToken is called by the parser to obtain the next input token. 448 // It returns the token value and sets val to the data associated with 449 // the token. 450 // 451 // For all our input tokens, the associated data is val.pos (the 452 // position where the token begins), val.raw (the input string 453 // corresponding to the token). For string and int tokens, the string 454 // and int fields additionally contain the token's interpreted value. 455 func (sc *scanner) nextToken(val *tokenValue) Token { 456 457 // The following distribution of tokens guides case ordering: 458 // 459 // COMMA 27 % 460 // STRING 23 % 461 // IDENT 15 % 462 // EQL 11 % 463 // LBRACK 5.5 % 464 // RBRACK 5.5 % 465 // NEWLINE 3 % 466 // LPAREN 2.9 % 467 // RPAREN 2.9 % 468 // INT 2 % 469 // others < 1 % 470 // 471 // Although NEWLINE tokens are infrequent, and lineStart is 472 // usually (~97%) false on entry, skipped newlines account for 473 // about 50% of all iterations of the 'start' loop. 474 475 start: 476 var c rune 477 478 // Deal with leading spaces and indentation. 479 blank := false 480 savedLineStart := sc.lineStart 481 if sc.lineStart { 482 sc.lineStart = false 483 col := 0 484 for { 485 c = sc.peekRune() 486 if c == ' ' { 487 col++ 488 sc.readRune() 489 } else if c == '\t' { 490 const tab = 8 491 col += int(tab - (sc.pos.Col-1)%tab) 492 sc.readRune() 493 } else { 494 break 495 } 496 } 497 498 // The third clause matches EOF. 499 if c == '#' || c == '\n' || c == 0 { 500 blank = true 501 } 502 503 // Compute indentation level for non-blank lines not 504 // inside an expression. This is not the common case. 
505 if !blank && sc.depth == 0 { 506 cur := sc.indentstk[len(sc.indentstk)-1] 507 if col > cur { 508 // indent 509 sc.dents++ 510 sc.indentstk = append(sc.indentstk, col) 511 } else if col < cur { 512 // outdent(s) 513 for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] { 514 sc.dents-- 515 sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop 516 } 517 if col != sc.indentstk[len(sc.indentstk)-1] { 518 sc.error(sc.pos, "unindent does not match any outer indentation level") 519 } 520 } 521 } 522 } 523 524 // Return saved indentation tokens. 525 if sc.dents != 0 { 526 sc.startToken(val) 527 sc.endToken(val) 528 if sc.dents < 0 { 529 sc.dents++ 530 return OUTDENT 531 } else { 532 sc.dents-- 533 return INDENT 534 } 535 } 536 537 // start of line proper 538 c = sc.peekRune() 539 540 // Skip spaces. 541 for c == ' ' || c == '\t' { 542 sc.readRune() 543 c = sc.peekRune() 544 } 545 546 // comment 547 if c == '#' { 548 if sc.keepComments { 549 sc.startToken(val) 550 } 551 // Consume up to newline (included). 552 for c != 0 && c != '\n' { 553 sc.readRune() 554 c = sc.peekRune() 555 } 556 if sc.keepComments { 557 sc.endToken(val) 558 if blank { 559 sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw}) 560 } else { 561 sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw}) 562 } 563 } 564 } 565 566 // newline 567 if c == '\n' { 568 sc.lineStart = true 569 570 // Ignore newlines within expressions (common case). 571 if sc.depth > 0 { 572 sc.readRune() 573 goto start 574 } 575 576 // Ignore blank lines, except in the REPL, 577 // where they emit OUTDENTs and NEWLINE. 578 if blank { 579 if sc.readline == nil { 580 sc.readRune() 581 goto start 582 } else if len(sc.indentstk) > 1 { 583 sc.dents = 1 - len(sc.indentstk) 584 sc.indentstk = sc.indentstk[:1] 585 goto start 586 } 587 } 588 589 // At top-level (not in an expression). 
590 sc.startToken(val) 591 sc.readRune() 592 val.raw = "\n" 593 return NEWLINE 594 } 595 596 // end of file 597 if c == 0 { 598 // Emit OUTDENTs for unfinished indentation, 599 // preceded by a NEWLINE if we haven't just emitted one. 600 if len(sc.indentstk) > 1 { 601 if savedLineStart { 602 sc.dents = 1 - len(sc.indentstk) 603 sc.indentstk = sc.indentstk[:1] 604 goto start 605 } else { 606 sc.lineStart = true 607 sc.startToken(val) 608 val.raw = "\n" 609 return NEWLINE 610 } 611 } 612 613 sc.startToken(val) 614 sc.endToken(val) 615 return EOF 616 } 617 618 // line continuation 619 if c == '\\' { 620 sc.readRune() 621 if sc.peekRune() != '\n' { 622 sc.errorf(sc.pos, "stray backslash in program") 623 } 624 sc.readRune() 625 goto start 626 } 627 628 // start of the next token 629 sc.startToken(val) 630 631 // comma (common case) 632 if c == ',' { 633 sc.readRune() 634 sc.endToken(val) 635 return COMMA 636 } 637 638 // string literal 639 if c == '"' || c == '\'' { 640 return sc.scanString(val, c) 641 } 642 643 // identifier or keyword 644 if isIdentStart(c) { 645 if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { 646 // r"..." 647 // b"..." 648 sc.readRune() 649 c = sc.peekRune() 650 return sc.scanString(val, c) 651 } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') { 652 // rb"..." 
653 sc.readRune() 654 sc.readRune() 655 c = sc.peekRune() 656 return sc.scanString(val, c) 657 } 658 659 for isIdent(c) { 660 sc.readRune() 661 c = sc.peekRune() 662 } 663 sc.endToken(val) 664 if k, ok := keywordToken[val.raw]; ok { 665 return k 666 } 667 668 return IDENT 669 } 670 671 // brackets 672 switch c { 673 case '[', '(', '{': 674 sc.depth++ 675 sc.readRune() 676 sc.endToken(val) 677 switch c { 678 case '[': 679 return LBRACK 680 case '(': 681 return LPAREN 682 case '{': 683 return LBRACE 684 } 685 panic("unreachable") 686 687 case ']', ')', '}': 688 if sc.depth == 0 { 689 sc.errorf(sc.pos, "unexpected %q", c) 690 } else { 691 sc.depth-- 692 } 693 sc.readRune() 694 sc.endToken(val) 695 switch c { 696 case ']': 697 return RBRACK 698 case ')': 699 return RPAREN 700 case '}': 701 return RBRACE 702 } 703 panic("unreachable") 704 } 705 706 // int or float literal, or period 707 if isdigit(c) || c == '.' { 708 return sc.scanNumber(val, c) 709 } 710 711 // other punctuation 712 defer sc.endToken(val) 713 switch c { 714 case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '=' 715 start := sc.pos 716 sc.readRune() 717 if sc.peekRune() == '=' { 718 sc.readRune() 719 switch c { 720 case '<': 721 return LE 722 case '>': 723 return GE 724 case '=': 725 return EQL 726 case '!': 727 return NEQ 728 case '+': 729 return PLUS_EQ 730 case '-': 731 return MINUS_EQ 732 case '/': 733 return SLASH_EQ 734 case '%': 735 return PERCENT_EQ 736 case '&': 737 return AMP_EQ 738 case '|': 739 return PIPE_EQ 740 case '^': 741 return CIRCUMFLEX_EQ 742 } 743 } 744 switch c { 745 case '=': 746 return EQ 747 case '<': 748 if sc.peekRune() == '<' { 749 sc.readRune() 750 if sc.peekRune() == '=' { 751 sc.readRune() 752 return LTLT_EQ 753 } else { 754 return LTLT 755 } 756 } 757 return LT 758 case '>': 759 if sc.peekRune() == '>' { 760 sc.readRune() 761 if sc.peekRune() == '=' { 762 sc.readRune() 763 return GTGT_EQ 764 } else { 765 return GTGT 766 } 767 } 768 
return GT 769 case '!': 770 sc.error(start, "unexpected input character '!'") 771 case '+': 772 return PLUS 773 case '-': 774 return MINUS 775 case '/': 776 if sc.peekRune() == '/' { 777 sc.readRune() 778 if sc.peekRune() == '=' { 779 sc.readRune() 780 return SLASHSLASH_EQ 781 } else { 782 return SLASHSLASH 783 } 784 } 785 return SLASH 786 case '%': 787 return PERCENT 788 case '&': 789 return AMP 790 case '|': 791 return PIPE 792 case '^': 793 return CIRCUMFLEX 794 } 795 panic("unreachable") 796 797 case ':', ';', '~': // single-char tokens (except comma) 798 sc.readRune() 799 switch c { 800 case ':': 801 return COLON 802 case ';': 803 return SEMI 804 case '~': 805 return TILDE 806 } 807 panic("unreachable") 808 809 case '*': // possibly followed by '*' or '=' 810 sc.readRune() 811 switch sc.peekRune() { 812 case '*': 813 sc.readRune() 814 return STARSTAR 815 case '=': 816 sc.readRune() 817 return STAR_EQ 818 } 819 return STAR 820 } 821 822 sc.errorf(sc.pos, "unexpected input character %#q", c) 823 panic("unreachable") 824 } 825 826 func (sc *scanner) scanString(val *tokenValue, quote rune) Token { 827 start := sc.pos 828 triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote) 829 sc.readRune() 830 831 // String literals may contain escaped or unescaped newlines, 832 // causing them to span multiple lines (gulps) of REPL input; 833 // they are the only such token. Thus we cannot call endToken, 834 // as it assumes sc.rest is unchanged since startToken. 835 // Instead, buffer the token here. 836 // TODO(adonovan): opt: buffer only if we encounter a newline. 837 raw := new(strings.Builder) 838 839 // Copy the prefix, e.g. r' or " (see startToken). 
840 raw.Write(sc.token[:len(sc.token)-len(sc.rest)]) 841 842 if !triple { 843 // single-quoted string literal 844 for { 845 if sc.eof() { 846 sc.error(val.pos, "unexpected EOF in string") 847 } 848 c := sc.readRune() 849 raw.WriteRune(c) 850 if c == quote { 851 break 852 } 853 if c == '\n' { 854 sc.error(val.pos, "unexpected newline in string") 855 } 856 if c == '\\' { 857 if sc.eof() { 858 sc.error(val.pos, "unexpected EOF in string") 859 } 860 c = sc.readRune() 861 raw.WriteRune(c) 862 } 863 } 864 } else { 865 // triple-quoted string literal 866 sc.readRune() 867 raw.WriteRune(quote) 868 sc.readRune() 869 raw.WriteRune(quote) 870 871 quoteCount := 0 872 for { 873 if sc.eof() { 874 sc.error(val.pos, "unexpected EOF in string") 875 } 876 c := sc.readRune() 877 raw.WriteRune(c) 878 if c == quote { 879 quoteCount++ 880 if quoteCount == 3 { 881 break 882 } 883 } else { 884 quoteCount = 0 885 } 886 if c == '\\' { 887 if sc.eof() { 888 sc.error(val.pos, "unexpected EOF in string") 889 } 890 c = sc.readRune() 891 raw.WriteRune(c) 892 } 893 } 894 } 895 val.raw = raw.String() 896 897 s, _, isByte, err := unquote(val.raw) 898 if err != nil { 899 sc.error(start, err.Error()) 900 } 901 val.string = s 902 if isByte { 903 return BYTES 904 } else { 905 return STRING 906 } 907 } 908 909 func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { 910 // https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements 911 // 912 // Python features not supported: 913 // - integer literals of >64 bits of precision 914 // - 123L or 123l long suffix 915 // - traditional octal: 0755 916 // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals 917 918 start := sc.pos 919 fraction, exponent := false, false 920 921 if c == '.' 
{ 922 // dot or start of fraction 923 sc.readRune() 924 c = sc.peekRune() 925 if !isdigit(c) { 926 sc.endToken(val) 927 return DOT 928 } 929 fraction = true 930 } else if c == '0' { 931 // hex, octal, binary or float 932 sc.readRune() 933 c = sc.peekRune() 934 935 if c == '.' { 936 fraction = true 937 } else if c == 'x' || c == 'X' { 938 // hex 939 sc.readRune() 940 c = sc.peekRune() 941 if !isxdigit(c) { 942 sc.error(start, "invalid hex literal") 943 } 944 for isxdigit(c) { 945 sc.readRune() 946 c = sc.peekRune() 947 } 948 } else if c == 'o' || c == 'O' { 949 // octal 950 sc.readRune() 951 c = sc.peekRune() 952 if !isodigit(c) { 953 sc.error(sc.pos, "invalid octal literal") 954 } 955 for isodigit(c) { 956 sc.readRune() 957 c = sc.peekRune() 958 } 959 } else if c == 'b' || c == 'B' { 960 // binary 961 sc.readRune() 962 c = sc.peekRune() 963 if !isbdigit(c) { 964 sc.error(sc.pos, "invalid binary literal") 965 } 966 for isbdigit(c) { 967 sc.readRune() 968 c = sc.peekRune() 969 } 970 } else { 971 // float (or obsolete octal "0755") 972 allzeros, octal := true, true 973 for isdigit(c) { 974 if c != '0' { 975 allzeros = false 976 } 977 if c > '7' { 978 octal = false 979 } 980 sc.readRune() 981 c = sc.peekRune() 982 } 983 if c == '.' { 984 fraction = true 985 } else if c == 'e' || c == 'E' { 986 exponent = true 987 } else if octal && !allzeros { 988 sc.endToken(val) 989 sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:]) 990 } 991 } 992 } else { 993 // decimal 994 for isdigit(c) { 995 sc.readRune() 996 c = sc.peekRune() 997 } 998 999 if c == '.' { 1000 fraction = true 1001 } else if c == 'e' || c == 'E' { 1002 exponent = true 1003 } 1004 } 1005 1006 if fraction { 1007 sc.readRune() // consume '.' 
1008 c = sc.peekRune() 1009 for isdigit(c) { 1010 sc.readRune() 1011 c = sc.peekRune() 1012 } 1013 1014 if c == 'e' || c == 'E' { 1015 exponent = true 1016 } 1017 } 1018 1019 if exponent { 1020 sc.readRune() // consume [eE] 1021 c = sc.peekRune() 1022 if c == '+' || c == '-' { 1023 sc.readRune() 1024 c = sc.peekRune() 1025 if !isdigit(c) { 1026 sc.error(sc.pos, "invalid float literal") 1027 } 1028 } 1029 for isdigit(c) { 1030 sc.readRune() 1031 c = sc.peekRune() 1032 } 1033 } 1034 1035 sc.endToken(val) 1036 if fraction || exponent { 1037 var err error 1038 val.float, err = strconv.ParseFloat(val.raw, 64) 1039 if err != nil { 1040 sc.error(sc.pos, "invalid float literal") 1041 } 1042 return FLOAT 1043 } else { 1044 var err error 1045 s := val.raw 1046 val.bigInt = nil 1047 if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { 1048 val.int, err = strconv.ParseInt(s[2:], 8, 64) 1049 } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { 1050 val.int, err = strconv.ParseInt(s[2:], 2, 64) 1051 } else { 1052 val.int, err = strconv.ParseInt(s, 0, 64) 1053 if err != nil { 1054 num := new(big.Int) 1055 var ok bool 1056 val.bigInt, ok = num.SetString(s, 0) 1057 if ok { 1058 err = nil 1059 } 1060 } 1061 } 1062 if err != nil { 1063 sc.error(start, "invalid int literal") 1064 } 1065 return INT 1066 } 1067 } 1068 1069 // isIdent reports whether c is an identifier rune. 
1070 func isIdent(c rune) bool { 1071 return isdigit(c) || isIdentStart(c) 1072 } 1073 1074 func isIdentStart(c rune) bool { 1075 return 'a' <= c && c <= 'z' || 1076 'A' <= c && c <= 'Z' || 1077 c == '_' || 1078 unicode.IsLetter(c) 1079 } 1080 1081 func isdigit(c rune) bool { return '0' <= c && c <= '9' } 1082 func isodigit(c rune) bool { return '0' <= c && c <= '7' } 1083 func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' } 1084 func isbdigit(c rune) bool { return '0' == c || c == '1' } 1085 1086 // keywordToken records the special tokens for 1087 // strings that should not be treated as ordinary identifiers. 1088 var keywordToken = map[string]Token{ 1089 "and": AND, 1090 "break": BREAK, 1091 "continue": CONTINUE, 1092 "def": DEF, 1093 "elif": ELIF, 1094 "else": ELSE, 1095 "for": FOR, 1096 "if": IF, 1097 "in": IN, 1098 "lambda": LAMBDA, 1099 "load": LOAD, 1100 "not": NOT, 1101 "or": OR, 1102 "pass": PASS, 1103 "return": RETURN, 1104 "while": WHILE, 1105 1106 // reserved words: 1107 "as": ILLEGAL, 1108 // "assert": ILLEGAL, // heavily used by our tests 1109 "async": ILLEGAL, 1110 "await": ILLEGAL, 1111 "class": ILLEGAL, 1112 "del": ILLEGAL, 1113 "except": ILLEGAL, 1114 "finally": ILLEGAL, 1115 "from": ILLEGAL, 1116 "global": ILLEGAL, 1117 "import": ILLEGAL, 1118 "is": ILLEGAL, 1119 "nonlocal": ILLEGAL, 1120 "raise": ILLEGAL, 1121 "try": ILLEGAL, 1122 "with": ILLEGAL, 1123 "yield": ILLEGAL, 1124 }