github.com/lab47/exprcore@v0.0.0-20210525052339-fb7d6bd9331e/syntax/scan.go (about) 1 // Copyright 2017 The Bazel Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 // A lexical scanner for exprcore. 8 9 import ( 10 "fmt" 11 "io" 12 "io/ioutil" 13 "log" 14 "math/big" 15 "os" 16 "strconv" 17 "strings" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 // A Token represents a exprcore lexical token. 23 type Token int8 24 25 const ( 26 ILLEGAL Token = iota 27 EOF 28 29 NEWLINE 30 INDENT 31 OUTDENT 32 33 // Tokens with values 34 IDENT // x 35 INT // 123 36 FLOAT // 1.23e45 37 STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo" 38 39 SHELL // $ foo bar 40 DSHELL_START // the start of a shell expression with expandation 41 DSHELL_PART // an expandation within a shell expression 42 DSHELL_END // the end of a DSHELL 43 44 // Punctuation 45 PLUS // + 46 MINUS // - 47 STAR // * 48 SLASH // / 49 SLASHSLASH // // 50 PERCENT // % 51 AMP // & 52 PIPE // | 53 CIRCUMFLEX // ^ 54 LTLT // << 55 GTGT // >> 56 TILDE // ~ 57 DOT // . 58 COMMA // , 59 EQ // = 60 SEMI // ; 61 COLON // : 62 LPAREN // ( 63 RPAREN // ) 64 LBRACK // [ 65 RBRACK // ] 66 LBRACE // { 67 RBRACE // } 68 LT // < 69 GT // > 70 GE // >= 71 LE // <= 72 EQL // == 73 NEQ // != 74 PLUS_EQ // += (keep order consistent with PLUS..GTGT) 75 MINUS_EQ // -= 76 STAR_EQ // *= 77 SLASH_EQ // /= 78 SLASHSLASH_EQ // //= 79 PERCENT_EQ // %= 80 AMP_EQ // &= 81 PIPE_EQ // |= 82 CIRCUMFLEX_EQ // ^= 83 LTLT_EQ // <<= 84 GTGT_EQ // >>= 85 ARROW // => 86 AT // @ 87 PERCENT_BRACE // %{ 88 STARSTAR // ** 89 90 // Keywords 91 AND 92 BREAK 93 CONTINUE 94 DEF 95 ELIF 96 ELSE 97 FOR 98 IF 99 IN 100 LAMBDA 101 LOAD 102 IMPORT 103 AS 104 USING 105 NOT 106 NOT_IN // synthesized by parser from NOT IN 107 OR 108 PASS 109 RETURN 110 WHILE 111 112 maxToken 113 ) 114 115 func (tok Token) String() string { return tokenNames[tok] } 116 117 // GoString is like String but quotes punctuation tokens. 118 // Use Sprintf("%#v", tok) when constructing error messages. 119 func (tok Token) GoString() string { 120 if tok >= PLUS && tok <= STARSTAR { 121 return "'" + tokenNames[tok] + "'" 122 } 123 return tokenNames[tok] 124 } 125 126 var tokenNames = [...]string{ 127 ILLEGAL: "illegal token", 128 EOF: "end of file", 129 NEWLINE: "newline", 130 INDENT: "indent", 131 OUTDENT: "outdent", 132 IDENT: "identifier", 133 INT: "int literal", 134 FLOAT: "float literal", 135 STRING: "string literal", 136 SHELL: "a shell expression", 137 DSHELL_START: "the start of a dynamic shell expression", 138 DSHELL_PART: "part of a dynamic shell expression", 139 DSHELL_END: "the end of a dynamic shell expression", 140 PLUS: "+", 141 MINUS: "-", 142 STAR: "*", 143 SLASH: "/", 144 SLASHSLASH: "//", 145 PERCENT: "%", 146 PERCENT_BRACE: "%{", 147 AMP: "&", 148 PIPE: "|", 149 CIRCUMFLEX: "^", 150 LTLT: "<<", 151 GTGT: ">>", 152 TILDE: "~", 153 DOT: ".", 154 COMMA: ",", 155 EQ: "=", 156 SEMI: ";", 157 COLON: ":", 158 LPAREN: "(", 159 RPAREN: ")", 160 LBRACK: "[", 161 RBRACK: "]", 162 LBRACE: "{", 163 RBRACE: "}", 164 LT: "<", 165 GT: ">", 166 GE: ">=", 167 LE: "<=", 168 EQL: "==", 169 NEQ: "!=", 170 PLUS_EQ: "+=", 171 MINUS_EQ: "-=", 172 STAR_EQ: "*=", 173 SLASH_EQ: "/=", 174 SLASHSLASH_EQ: "//=", 175 PERCENT_EQ: "%=", 176 AMP_EQ: "&=", 177 PIPE_EQ: "|=", 178 CIRCUMFLEX_EQ: "^=", 179 LTLT_EQ: "<<=", 180 GTGT_EQ: ">>=", 181 STARSTAR: "**", 182 ARROW: "=>", 183 AT: "@", 184 AND: "and", 185 BREAK: "break", 186 CONTINUE: "continue", 187 DEF: "def", 188 ELIF: "elif", 189 ELSE: "else", 190 FOR: "for", 191 IF: "if", 192 IN: "in", 193 LAMBDA: "lambda", 194 LOAD: "load", 195 IMPORT: "import", 196 AS: "as", 197 USING: "using", 198 NOT: "not", 199 NOT_IN: "not in", 200 OR: "or", 201 PASS: "pass", 202 RETURN: "return", 203 WHILE: "while", 204 } 205 206 // A Position describes the location of a rune of input. 207 type Position struct { 208 file *string // filename (indirect for compactness) 209 Line int32 // 1-based line number; 0 if line unknown 210 Col int32 // 1-based column (rune) number; 0 if column unknown 211 } 212 213 // IsValid reports whether the position is valid. 214 func (p Position) IsValid() bool { return p.file != nil } 215 216 // Filename returns the name of the file containing this position. 217 func (p Position) Filename() string { 218 if p.file != nil { 219 return *p.file 220 } 221 return "<invalid>" 222 } 223 224 // MakePosition returns position with the specified components. 225 func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} } 226 227 // add returns the position at the end of s, assuming it starts at p. 228 func (p Position) add(s string) Position { 229 if n := strings.Count(s, "\n"); n > 0 { 230 p.Line += int32(n) 231 s = s[strings.LastIndex(s, "\n")+1:] 232 p.Col = 1 233 } 234 p.Col += int32(utf8.RuneCountInString(s)) 235 return p 236 } 237 238 func (p Position) String() string { 239 file := p.Filename() 240 if p.Line > 0 { 241 if p.Col > 0 { 242 return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col) 243 } 244 return fmt.Sprintf("%s:%d", file, p.Line) 245 } 246 return file 247 } 248 249 func (p Position) isBefore(q Position) bool { 250 if p.Line != q.Line { 251 return p.Line < q.Line 252 } 253 return p.Col < q.Col 254 } 255 256 // An scanner represents a single input file being parsed. 257 type scanner struct { 258 rest []byte // rest of input (in REPL, a line of input) 259 token []byte // token being scanned 260 pos Position // current input position 261 depth int // nesting of [ ] { } ( ) 262 indentstk []int // stack of indentation levels 263 dents int // number of saved INDENT (>0) or OUTDENT (<0) tokens to return 264 lineStart bool // after NEWLINE; convert spaces to indentation tokens 265 keepComments bool // accumulate comments in slice 266 lineComments []Comment // list of full line comments (if keepComments) 267 suffixComments []Comment // list of suffix comments (if keepComments) 268 269 insertSemi bool // insert a semicolon before next newline 270 271 interpDepth int // how far inside ${'s we are 272 interpExprDepth int // how far inside `'s we are 273 274 readline func() ([]byte, error) // read next line of input (REPL only) 275 } 276 277 func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) { 278 sc := &scanner{ 279 pos: Position{file: &filename, Line: 1, Col: 1}, 280 indentstk: make([]int, 1, 10), // []int{0} + spare capacity 281 lineStart: true, 282 keepComments: keepComments, 283 } 284 sc.readline, _ = src.(func() ([]byte, error)) // REPL only 285 if sc.readline == nil { 286 data, err := readSource(filename, src) 287 if err != nil { 288 return nil, err 289 } 290 sc.rest = data 291 } 292 return sc, nil 293 } 294 295 func readSource(filename string, src interface{}) ([]byte, error) { 296 switch src := src.(type) { 297 case string: 298 return []byte(src), nil 299 case []byte: 300 return src, nil 301 case io.Reader: 302 data, err := ioutil.ReadAll(src) 303 if err != nil { 304 err = &os.PathError{Op: "read", Path: filename, Err: err} 305 return nil, err 306 } 307 return data, nil 308 case nil: 309 return ioutil.ReadFile(filename) 310 default: 311 return nil, fmt.Errorf("invalid source: %T", src) 312 } 313 } 314 315 // An Error describes the nature and position of a scanner or parser error. 316 type Error struct { 317 Pos Position 318 Msg string 319 } 320 321 func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg } 322 323 // errorf is called to report an error. 324 // errorf does not return: it panics. 325 func (sc *scanner) error(pos Position, s string) { 326 panic(Error{pos, s}) 327 } 328 329 func (sc *scanner) errorf(pos Position, format string, args ...interface{}) { 330 sc.error(pos, fmt.Sprintf(format, args...)) 331 } 332 333 func (sc *scanner) recover(err *error) { 334 // The scanner and parser panic both for routine errors like 335 // syntax errors and for programmer bugs like array index 336 // errors. Turn both into error returns. Catching bug panics 337 // is especially important when processing many files. 338 switch e := recover().(type) { 339 case nil: 340 // no panic 341 case Error: 342 *err = e 343 default: 344 *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)} 345 if debug { 346 log.Fatal(*err) 347 } 348 } 349 } 350 351 // eof reports whether the input has reached end of file. 352 func (sc *scanner) eof() bool { 353 return len(sc.rest) == 0 && !sc.readLine() 354 } 355 356 // readLine attempts to read another line of input. 357 // Precondition: len(sc.rest)==0. 358 func (sc *scanner) readLine() bool { 359 if sc.readline != nil { 360 var err error 361 sc.rest, err = sc.readline() 362 if err != nil { 363 sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt 364 } 365 return len(sc.rest) > 0 366 } 367 return false 368 } 369 370 // peekRune returns the next rune in the input without consuming it. 371 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 372 func (sc *scanner) peekRune() rune { 373 // TODO(adonovan): opt: measure and perhaps inline eof. 374 if sc.eof() { 375 return 0 376 } 377 378 // fast path: ASCII 379 if b := sc.rest[0]; b < utf8.RuneSelf { 380 if b == '\r' { 381 return '\n' 382 } 383 return rune(b) 384 } 385 386 r, _ := utf8.DecodeRune(sc.rest) 387 return r 388 } 389 390 // readRune consumes and returns the next rune in the input. 391 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 392 func (sc *scanner) readRune() rune { 393 // eof() has been inlined here, both to avoid a call 394 // and to establish len(rest)>0 to avoid a bounds check. 395 if len(sc.rest) == 0 { 396 if !sc.readLine() { 397 sc.error(sc.pos, "internal scanner error: readRune at EOF") 398 } 399 // Redundant, but eliminates the bounds-check below. 400 if len(sc.rest) == 0 { 401 return 0 402 } 403 } 404 405 // fast path: ASCII 406 if b := sc.rest[0]; b < utf8.RuneSelf { 407 r := rune(b) 408 sc.rest = sc.rest[1:] 409 if r == '\r' { 410 if len(sc.rest) > 0 && sc.rest[0] == '\n' { 411 sc.rest = sc.rest[1:] 412 } 413 r = '\n' 414 } 415 if r == '\n' { 416 sc.pos.Line++ 417 sc.pos.Col = 1 418 } else { 419 sc.pos.Col++ 420 } 421 return r 422 } 423 424 r, size := utf8.DecodeRune(sc.rest) 425 sc.rest = sc.rest[size:] 426 sc.pos.Col++ 427 return r 428 } 429 430 // tokenValue records the position and value associated with each token. 431 type tokenValue struct { 432 raw string // raw text of token 433 int int64 // decoded int 434 bigInt *big.Int // decoded integers > int64 435 float float64 // decoded float 436 string string // decoded string 437 pos Position // start position of token 438 } 439 440 // startToken marks the beginning of the next input token. 441 // It must be followed by a call to endToken once the token has 442 // been consumed using readRune. 443 func (sc *scanner) startToken(val *tokenValue) { 444 sc.token = sc.rest 445 val.raw = "" 446 val.pos = sc.pos 447 } 448 449 // endToken marks the end of an input token. 450 // It records the actual token string in val.raw if the caller 451 // has not done that already. 452 func (sc *scanner) endToken(val *tokenValue) { 453 if val.raw == "" { 454 val.raw = string(sc.token[:len(sc.token)-len(sc.rest)]) 455 } 456 } 457 458 // nextToken is called by the parser to obtain the next input token. 459 // It returns the token value and sets val to the data associated with 460 // the token. 461 // 462 // For all our input tokens, the associated data is val.pos (the 463 // position where the token begins), val.raw (the input string 464 // corresponding to the token). For string and int tokens, the string 465 // and int fields additionally contain the token's interpreted value. 466 func (sc *scanner) nextToken(val *tokenValue) Token { 467 468 // The following distribution of tokens guides case ordering: 469 // 470 // COMMA 27 % 471 // STRING 23 % 472 // IDENT 15 % 473 // EQL 11 % 474 // LBRACK 5.5 % 475 // RBRACK 5.5 % 476 // NEWLINE 3 % 477 // LPAREN 2.9 % 478 // RPAREN 2.9 % 479 // INT 2 % 480 // others < 1 % 481 // 482 // Although NEWLINE tokens are infrequent, and lineStart is 483 // usually (~97%) false on entry, skipped newlines account for 484 // about 50% of all iterations of the 'start' loop. 485 486 insertSemi := false 487 488 // Replace the value with the updated on on every time through 489 defer func() { 490 sc.insertSemi = insertSemi 491 }() 492 493 start: 494 var c rune 495 496 // Deal with leading spaces and indentation. 497 blank := false 498 /* 499 savedLineStart := sc.lineStart 500 if sc.lineStart { 501 sc.lineStart = false 502 col := 0 503 for { 504 c = sc.peekRune() 505 if c == ' ' { 506 col++ 507 sc.readRune() 508 } else if c == '\t' { 509 const tab = 8 510 col += int(tab - (sc.pos.Col-1)%tab) 511 sc.readRune() 512 } else { 513 break 514 } 515 } 516 517 // The third clause matches EOF. 518 if c == '#' || c == '\n' || c == 0 { 519 blank = true 520 } 521 522 // Compute indentation level for non-blank lines not 523 // inside an expression. This is not the common case. 524 if false { // !blank { // && sc.depth == 0 { 525 cur := sc.indentstk[len(sc.indentstk)-1] 526 if col > cur { 527 // indent 528 sc.dents++ 529 sc.indentstk = append(sc.indentstk, col) 530 } else if col < cur { 531 // outdent(s) 532 for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] { 533 sc.dents-- 534 sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop 535 } 536 if col != sc.indentstk[len(sc.indentstk)-1] { 537 sc.error(sc.pos, "unindent does not match any outer indentation level") 538 } 539 } 540 } 541 */ 542 543 // Return saved indentation tokens. 544 /* 545 if sc.dents != 0 { 546 sc.startToken(val) 547 sc.endToken(val) 548 if sc.dents < 0 { 549 sc.dents++ 550 return OUTDENT 551 } else { 552 sc.dents-- 553 return INDENT 554 } 555 } 556 */ 557 558 // start of line proper 559 c = sc.peekRune() 560 561 // Skip spaces. 562 for c == ' ' || c == '\t' || (c == '\n' && !sc.insertSemi) || c == '\r' { 563 sc.readRune() 564 c = sc.peekRune() 565 } 566 567 // comment 568 if c == '#' { 569 if sc.keepComments { 570 sc.startToken(val) 571 } 572 // Consume up to newline (included). 573 for c != 0 && c != '\n' { 574 sc.readRune() 575 c = sc.peekRune() 576 } 577 578 if sc.keepComments { 579 sc.endToken(val) 580 if blank { 581 sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw}) 582 } else { 583 sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw}) 584 } 585 } 586 587 goto start 588 } 589 590 // newline 591 if c == '\n' { 592 // Only seen if insertSemi was true because otherwise the loop above will eat newlines 593 sc.lineStart = true 594 595 // Ignore newlines within expressions (common case). 596 /* 597 if sc.depth > 0 { 598 sc.readRune() 599 goto start 600 } 601 */ 602 603 // Ignore blank lines, except in the REPL, 604 // where they emit OUTDENTs and NEWLINE. 605 /* 606 if blank { 607 if sc.readline == nil { 608 sc.readRune() 609 goto start 610 } else if len(sc.indentstk) > 1 { 611 sc.dents = 1 - len(sc.indentstk) 612 sc.indentstk = sc.indentstk[:1] 613 goto start 614 } 615 } 616 */ 617 618 // At top-level (not in an expression). 619 sc.startToken(val) 620 sc.readRune() 621 val.raw = "\n" 622 return SEMI 623 } 624 625 // end of file 626 if c == 0 { 627 // Emit OUTDENTs for unfinished indentation, 628 // preceded by a NEWLINE if we haven't just emitted one. 629 /* 630 if len(sc.indentstk) > 1 { 631 if savedLineStart { 632 sc.dents = 1 - len(sc.indentstk) 633 sc.indentstk = sc.indentstk[:1] 634 goto start 635 } else { 636 sc.lineStart = true 637 sc.startToken(val) 638 val.raw = "\n" 639 return NEWLINE 640 } 641 } 642 */ 643 644 if sc.insertSemi { 645 sc.startToken(val) 646 sc.endToken(val) 647 return SEMI 648 } 649 650 sc.startToken(val) 651 sc.endToken(val) 652 return EOF 653 } 654 655 // line continuation 656 if c == '\\' { 657 sc.readRune() 658 if sc.peekRune() != '\n' { 659 sc.errorf(sc.pos, "stray backslash in program") 660 } 661 sc.readRune() 662 goto start 663 } 664 665 // start of the next token 666 sc.startToken(val) 667 668 // comma (common case) 669 if c == ',' { 670 sc.readRune() 671 sc.endToken(val) 672 return COMMA 673 } 674 675 // string literal 676 if c == '"' || c == '\'' { 677 insertSemi = true 678 return sc.scanString(val, c) 679 } 680 681 if c == '`' { 682 tok := sc.scanShellExpr(val) 683 if tok == SHELL { 684 insertSemi = true 685 } 686 687 return tok 688 } 689 690 if c == '$' { 691 tok := sc.scanShell(val) 692 if tok == SHELL { 693 insertSemi = true 694 } 695 696 return tok 697 } 698 699 // identifier or keyword 700 if isIdentStart(c) { 701 // raw string literal 702 if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { 703 insertSemi = true 704 sc.readRune() 705 c = sc.peekRune() 706 return sc.scanString(val, c) 707 } 708 709 for isIdent(c) { 710 sc.readRune() 711 c = sc.peekRune() 712 } 713 sc.endToken(val) 714 if k, ok := keywordToken[val.raw]; ok { 715 switch k { 716 case BREAK, CONTINUE, PASS, RETURN: 717 insertSemi = true 718 } 719 720 return k 721 } 722 723 insertSemi = true 724 return IDENT 725 } 726 727 // brackets 728 switch c { 729 case '[', '(', '{': 730 sc.depth++ 731 sc.readRune() 732 sc.endToken(val) 733 switch c { 734 case '[': 735 return LBRACK 736 case '(': 737 return LPAREN 738 case '{': 739 return LBRACE 740 } 741 panic("unreachable") 742 743 case ']', ')', '}': 744 if c == '}' { 745 if sc.interpDepth > 0 { 746 tok := sc.scanMoreShell(val) 747 if tok == DSHELL_END { 748 insertSemi = true 749 } 750 751 return tok 752 } 753 754 if sc.interpExprDepth > 0 { 755 tok := sc.scanMoreShellExpr(val) 756 if tok == DSHELL_END { 757 insertSemi = true 758 } 759 760 return tok 761 } 762 } 763 764 if sc.depth == 0 { 765 sc.errorf(sc.pos, "unexpected %q", c) 766 } else { 767 sc.depth-- 768 } 769 sc.readRune() 770 sc.endToken(val) 771 772 insertSemi = true 773 switch c { 774 case ']': 775 return RBRACK 776 case ')': 777 return RPAREN 778 case '}': 779 return RBRACE 780 } 781 panic("unreachable") 782 } 783 784 // int or float literal, or period 785 if isdigit(c) || c == '.' { 786 insertSemi = true 787 return sc.scanNumber(val, c) 788 } 789 790 // other punctuation 791 defer sc.endToken(val) 792 switch c { 793 case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '=' 794 start := sc.pos 795 sc.readRune() 796 if sc.peekRune() == '=' { 797 sc.readRune() 798 switch c { 799 case '<': 800 return LE 801 case '>': 802 return GE 803 case '=': 804 return EQL 805 case '!': 806 return NEQ 807 case '+': 808 return PLUS_EQ 809 case '-': 810 return MINUS_EQ 811 case '/': 812 return SLASH_EQ 813 case '%': 814 return PERCENT_EQ 815 case '&': 816 return AMP_EQ 817 case '|': 818 return PIPE_EQ 819 case '^': 820 return CIRCUMFLEX_EQ 821 } 822 } 823 switch c { 824 case '=': 825 if sc.peekRune() == '>' { 826 sc.readRune() 827 return ARROW 828 } 829 830 return EQ 831 case '<': 832 if sc.peekRune() == '<' { 833 sc.readRune() 834 if sc.peekRune() == '=' { 835 sc.readRune() 836 return LTLT_EQ 837 } else { 838 return LTLT 839 } 840 } 841 return LT 842 case '>': 843 if sc.peekRune() == '>' { 844 sc.readRune() 845 if sc.peekRune() == '=' { 846 sc.readRune() 847 return GTGT_EQ 848 } else { 849 return GTGT 850 } 851 } 852 return GT 853 case '!': 854 sc.error(start, "unexpected input character '!'") 855 case '+': 856 return PLUS 857 case '-': 858 return MINUS 859 case '/': 860 if sc.peekRune() == '/' { 861 sc.readRune() 862 if sc.peekRune() == '=' { 863 sc.readRune() 864 return SLASHSLASH_EQ 865 } else { 866 return SLASHSLASH 867 } 868 } 869 return SLASH 870 case '%': 871 if sc.peekRune() == '{' { 872 sc.readRune() 873 sc.depth++ 874 return PERCENT_BRACE 875 } 876 877 return PERCENT 878 case '&': 879 return AMP 880 case '|': 881 return PIPE 882 case '^': 883 return CIRCUMFLEX 884 } 885 panic("unreachable") 886 887 case ':', ';', '~', '@': // single-char tokens (except comma) 888 sc.readRune() 889 switch c { 890 case ':': 891 return COLON 892 case ';': 893 return SEMI 894 case '~': 895 return TILDE 896 case '@': 897 return AT 898 } 899 panic("unreachable") 900 901 case '*': // possibly followed by '*' or '=' 902 sc.readRune() 903 switch sc.peekRune() { 904 case '*': 905 sc.readRune() 906 return STARSTAR 907 case '=': 908 sc.readRune() 909 return STAR_EQ 910 } 911 return STAR 912 } 913 914 sc.errorf(sc.pos, "unexpected input character %#q", c) 915 panic("unreachable") 916 } 917 918 func (sc *scanner) scanShellExpr(val *tokenValue) Token { 919 sc.readRune() 920 921 var ( 922 raw strings.Builder 923 hasExpand bool 924 ) 925 926 for sc.peekRune() == ' ' { 927 sc.readRune() 928 } 929 930 for { 931 if sc.eof() { 932 break 933 } 934 935 c := sc.readRune() 936 if c == '`' { 937 break 938 } 939 940 if c == '$' { 941 nc := sc.peekRune() 942 if nc == '{' { 943 sc.readRune() 944 sc.interpExprDepth++ 945 hasExpand = true 946 break 947 } 948 } else if c == '\\' { 949 if sc.eof() { 950 sc.error(val.pos, "unexpected EOF in string") 951 } 952 c = sc.readRune() 953 } 954 955 raw.WriteRune(c) 956 } 957 958 val.string = raw.String() 959 960 if hasExpand { 961 return DSHELL_START 962 } else { 963 return SHELL 964 } 965 } 966 967 func (sc *scanner) scanShell(val *tokenValue) Token { 968 sc.readRune() 969 970 var ( 971 raw strings.Builder 972 hasExpand bool 973 ) 974 975 for sc.peekRune() == ' ' { 976 sc.readRune() 977 } 978 979 for { 980 if sc.eof() { 981 break 982 } 983 984 c := sc.peekRune() 985 if c == '\n' { 986 break 987 } 988 989 sc.readRune() 990 if c == '$' { 991 nc := sc.peekRune() 992 if nc == '{' { 993 sc.readRune() 994 sc.interpDepth++ 995 hasExpand = true 996 break 997 } 998 } else if c == '\\' { 999 if sc.eof() { 1000 sc.error(val.pos, "unexpected EOF in string") 1001 } 1002 c = sc.readRune() 1003 } 1004 1005 raw.WriteRune(c) 1006 } 1007 1008 val.string = raw.String() 1009 1010 if hasExpand { 1011 return DSHELL_START 1012 } else { 1013 return SHELL 1014 } 1015 } 1016 1017 func (sc *scanner) scanMoreShell(val *tokenValue) Token { 1018 sc.interpDepth-- 1019 1020 sc.readRune() 1021 1022 var ( 1023 raw strings.Builder 1024 hasExpand bool 1025 ) 1026 1027 for { 1028 if sc.eof() { 1029 break 1030 } 1031 1032 c := sc.peekRune() 1033 if c == '\n' { 1034 break 1035 } 1036 1037 sc.readRune() 1038 1039 if c == '$' { 1040 nc := sc.peekRune() 1041 if nc == '{' { 1042 sc.readRune() 1043 sc.interpDepth++ 1044 hasExpand = true 1045 break 1046 } 1047 } else if c == '\\' { 1048 if sc.eof() { 1049 sc.error(val.pos, "unexpected EOF in string") 1050 } 1051 c = sc.readRune() 1052 } 1053 1054 raw.WriteRune(c) 1055 } 1056 1057 val.string = raw.String() 1058 1059 if !hasExpand { 1060 return DSHELL_END 1061 } else { 1062 return DSHELL_PART 1063 } 1064 } 1065 1066 func (sc *scanner) scanMoreShellExpr(val *tokenValue) Token { 1067 sc.interpDepth-- 1068 1069 sc.readRune() 1070 1071 var ( 1072 raw strings.Builder 1073 hasExpand bool 1074 ) 1075 1076 for { 1077 if sc.eof() { 1078 sc.error(val.pos, "unexpected EOF in string") 1079 } 1080 1081 c := sc.readRune() 1082 if c == '`' { 1083 break 1084 } 1085 1086 if c == '$' { 1087 nc := sc.peekRune() 1088 if nc == '{' { 1089 sc.readRune() 1090 sc.interpExprDepth++ 1091 hasExpand = true 1092 break 1093 } 1094 } else if c == '\\' { 1095 if sc.eof() { 1096 sc.error(val.pos, "unexpected EOF in string") 1097 } 1098 c = sc.readRune() 1099 } 1100 1101 raw.WriteRune(c) 1102 } 1103 1104 val.string = raw.String() 1105 1106 if !hasExpand { 1107 return DSHELL_END 1108 } else { 1109 return DSHELL_PART 1110 } 1111 } 1112 1113 func (sc *scanner) scanString(val *tokenValue, quote rune) Token { 1114 start := sc.pos 1115 triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote) 1116 sc.readRune() 1117 1118 // String literals may contain escaped or unescaped newlines, 1119 // causing them to span multiple lines (gulps) of REPL input; 1120 // they are the only such token. Thus we cannot call endToken, 1121 // as it assumes sc.rest is unchanged since startToken. 1122 // Instead, buffer the token here. 1123 // TODO(adonovan): opt: buffer only if we encounter a newline. 1124 raw := new(strings.Builder) 1125 1126 // Copy the prefix, e.g. r' or " (see startToken). 1127 raw.Write(sc.token[:len(sc.token)-len(sc.rest)]) 1128 1129 if !triple { 1130 // single-quoted string literal 1131 for { 1132 if sc.eof() { 1133 sc.error(val.pos, "unexpected EOF in string") 1134 } 1135 c := sc.readRune() 1136 raw.WriteRune(c) 1137 if c == quote { 1138 break 1139 } 1140 if c == '\n' { 1141 sc.error(val.pos, "unexpected newline in string") 1142 } 1143 if c == '\\' { 1144 if sc.eof() { 1145 sc.error(val.pos, "unexpected EOF in string") 1146 } 1147 c = sc.readRune() 1148 raw.WriteRune(c) 1149 } 1150 } 1151 } else { 1152 // triple-quoted string literal 1153 sc.readRune() 1154 raw.WriteRune(quote) 1155 sc.readRune() 1156 raw.WriteRune(quote) 1157 1158 quoteCount := 0 1159 for { 1160 if sc.eof() { 1161 sc.error(val.pos, "unexpected EOF in string") 1162 } 1163 c := sc.readRune() 1164 raw.WriteRune(c) 1165 if c == quote { 1166 quoteCount++ 1167 if quoteCount == 3 { 1168 break 1169 } 1170 } else { 1171 quoteCount = 0 1172 } 1173 if c == '\\' { 1174 if sc.eof() { 1175 sc.error(val.pos, "unexpected EOF in string") 1176 } 1177 c = sc.readRune() 1178 raw.WriteRune(c) 1179 } 1180 } 1181 } 1182 val.raw = raw.String() 1183 1184 s, _, err := unquote(val.raw) 1185 if err != nil { 1186 sc.error(start, err.Error()) 1187 } 1188 val.string = s 1189 return STRING 1190 } 1191 1192 func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { 1193 // https://github.com/google/exprcore-go/blob/master/doc/spec.md#lexical-elements 1194 // 1195 // Python features not supported: 1196 // - integer literals of >64 bits of precision 1197 // - 123L or 123l long suffix 1198 // - traditional octal: 0755 1199 // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals 1200 1201 start := sc.pos 1202 fraction, exponent := false, false 1203 1204 if c == '.' { 1205 // dot or start of fraction 1206 sc.readRune() 1207 c = sc.peekRune() 1208 if !isdigit(c) { 1209 sc.endToken(val) 1210 return DOT 1211 } 1212 fraction = true 1213 } else if c == '0' { 1214 // hex, octal, binary or float 1215 sc.readRune() 1216 c = sc.peekRune() 1217 1218 if c == '.' { 1219 fraction = true 1220 } else if c == 'x' || c == 'X' { 1221 // hex 1222 sc.readRune() 1223 c = sc.peekRune() 1224 if !isxdigit(c) { 1225 sc.error(start, "invalid hex literal") 1226 } 1227 for isxdigit(c) { 1228 sc.readRune() 1229 c = sc.peekRune() 1230 } 1231 } else if c == 'o' || c == 'O' { 1232 // octal 1233 sc.readRune() 1234 c = sc.peekRune() 1235 if !isodigit(c) { 1236 sc.error(sc.pos, "invalid octal literal") 1237 } 1238 for isodigit(c) { 1239 sc.readRune() 1240 c = sc.peekRune() 1241 } 1242 } else if c == 'b' || c == 'B' { 1243 // binary 1244 sc.readRune() 1245 c = sc.peekRune() 1246 if !isbdigit(c) { 1247 sc.error(sc.pos, "invalid binary literal") 1248 } 1249 for isbdigit(c) { 1250 sc.readRune() 1251 c = sc.peekRune() 1252 } 1253 } else { 1254 // float (or obsolete octal "0755") 1255 allzeros, octal := true, true 1256 for isdigit(c) { 1257 if c != '0' { 1258 allzeros = false 1259 } 1260 if c > '7' { 1261 octal = false 1262 } 1263 sc.readRune() 1264 c = sc.peekRune() 1265 } 1266 if c == '.' { 1267 fraction = true 1268 } else if c == 'e' || c == 'E' { 1269 exponent = true 1270 } else if octal && !allzeros { 1271 sc.endToken(val) 1272 sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:]) 1273 } 1274 } 1275 } else { 1276 // decimal 1277 for isdigit(c) { 1278 sc.readRune() 1279 c = sc.peekRune() 1280 } 1281 1282 if c == '.' { 1283 fraction = true 1284 } else if c == 'e' || c == 'E' { 1285 exponent = true 1286 } 1287 } 1288 1289 if fraction { 1290 sc.readRune() // consume '.' 1291 c = sc.peekRune() 1292 for isdigit(c) { 1293 sc.readRune() 1294 c = sc.peekRune() 1295 } 1296 1297 if c == 'e' || c == 'E' { 1298 exponent = true 1299 } 1300 } 1301 1302 if exponent { 1303 sc.readRune() // consume [eE] 1304 c = sc.peekRune() 1305 if c == '+' || c == '-' { 1306 sc.readRune() 1307 c = sc.peekRune() 1308 if !isdigit(c) { 1309 sc.error(sc.pos, "invalid float literal") 1310 } 1311 } 1312 for isdigit(c) { 1313 sc.readRune() 1314 c = sc.peekRune() 1315 } 1316 } 1317 1318 sc.endToken(val) 1319 if fraction || exponent { 1320 var err error 1321 val.float, err = strconv.ParseFloat(val.raw, 64) 1322 if err != nil { 1323 sc.error(sc.pos, "invalid float literal") 1324 } 1325 return FLOAT 1326 } else { 1327 var err error 1328 s := val.raw 1329 val.bigInt = nil 1330 if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { 1331 val.int, err = strconv.ParseInt(s[2:], 8, 64) 1332 } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { 1333 val.int, err = strconv.ParseInt(s[2:], 2, 64) 1334 } else { 1335 val.int, err = strconv.ParseInt(s, 0, 64) 1336 if err != nil { 1337 num := new(big.Int) 1338 var ok bool 1339 val.bigInt, ok = num.SetString(s, 0) 1340 if ok { 1341 err = nil 1342 } 1343 } 1344 } 1345 if err != nil { 1346 sc.error(start, "invalid int literal") 1347 } 1348 return INT 1349 } 1350 } 1351 1352 // isIdent reports whether c is an identifier rune. 1353 func isIdent(c rune) bool { 1354 return isdigit(c) || isIdentStart(c) 1355 } 1356 1357 func isIdentStart(c rune) bool { 1358 return 'a' <= c && c <= 'z' || 1359 'A' <= c && c <= 'Z' || 1360 c == '_' || 1361 unicode.IsLetter(c) 1362 } 1363 1364 func isdigit(c rune) bool { return '0' <= c && c <= '9' } 1365 func isodigit(c rune) bool { return '0' <= c && c <= '7' } 1366 func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' } 1367 func isbdigit(c rune) bool { return '0' == c || c == '1' } 1368 1369 // keywordToken records the special tokens for 1370 // strings that should not be treated as ordinary identifiers. 1371 var keywordToken = map[string]Token{ 1372 "and": AND, 1373 "break": BREAK, 1374 "continue": CONTINUE, 1375 "def": DEF, 1376 "elif": ELIF, 1377 "else": ELSE, 1378 "for": FOR, 1379 "if": IF, 1380 "in": IN, 1381 "lambda": LAMBDA, 1382 "load": LOAD, 1383 "import": IMPORT, 1384 "as": AS, 1385 "using": USING, 1386 "not": NOT, 1387 "or": OR, 1388 "pass": PASS, 1389 "return": RETURN, 1390 "while": WHILE, 1391 1392 // reserved words: 1393 // "assert": ILLEGAL, // heavily used by our tests 1394 "class": ILLEGAL, 1395 "del": ILLEGAL, 1396 "except": ILLEGAL, 1397 "finally": ILLEGAL, 1398 "from": ILLEGAL, 1399 "global": ILLEGAL, 1400 "is": ILLEGAL, 1401 "nonlocal": ILLEGAL, 1402 "raise": ILLEGAL, 1403 "try": ILLEGAL, 1404 "with": ILLEGAL, 1405 "yield": ILLEGAL, 1406 }