github.com/bir3/gocompiler@v0.9.2202/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements scanner, a lexical tokenizer for 6 // Go source. After initialization, consecutive calls of 7 // next advance the scanner one token at a time. 8 // 9 // This file, source.go, tokens.go, and token_string.go are self-contained 10 // (`go tool compile scanner.go source.go tokens.go token_string.go` compiles) 11 // and thus could be made into their own package. 12 13 package syntax 14 15 import ( 16 "fmt" 17 "io" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 // The mode flags below control which comments are reported 23 // by calling the error handler. If no flag is set, comments 24 // are ignored. 25 const ( 26 comments uint = 1 << iota // call handler for all comments 27 directives // call handler for directives only 28 ) 29 30 type scanner struct { 31 source 32 mode uint 33 nlsemi bool // if set '\n' and EOF translate to ';' 34 35 // current token, valid after calling next() 36 line, col uint 37 blank bool // line is blank up to col 38 tok token 39 lit string // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF"); may be malformed if bad is true 40 bad bool // valid if tok is _Literal, true if a syntax error occurred, lit may be malformed 41 kind LitKind // valid if tok is _Literal 42 op Operator // valid if tok is _Operator, _Star, _AssignOp, or _IncOp 43 prec int // valid if tok is _Operator, _Star, _AssignOp, or _IncOp 44 } 45 46 func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) { 47 s.source.init(src, errh) 48 s.mode = mode 49 s.nlsemi = false 50 } 51 52 // errorf reports an error at the most recently read character position. 53 func (s *scanner) errorf(format string, args ...interface{}) { 54 s.error(fmt.Sprintf(format, args...)) 55 } 56 57 // errorAtf reports an error at a byte column offset relative to the current token start. 58 func (s *scanner) errorAtf(offset int, format string, args ...interface{}) { 59 s.errh(s.line, s.col+uint(offset), fmt.Sprintf(format, args...)) 60 } 61 62 // setLit sets the scanner state for a recognized _Literal token. 63 func (s *scanner) setLit(kind LitKind, ok bool) { 64 s.nlsemi = true 65 s.tok = _Literal 66 s.lit = string(s.segment()) 67 s.bad = !ok 68 s.kind = kind 69 } 70 71 // next advances the scanner by reading the next token. 72 // 73 // If a read, source encoding, or lexical error occurs, next calls 74 // the installed error handler with the respective error position 75 // and message. The error message is guaranteed to be non-empty and 76 // never starts with a '/'. The error handler must exist. 77 // 78 // If the scanner mode includes the comments flag and a comment 79 // (including comments containing directives) is encountered, the 80 // error handler is also called with each comment position and text 81 // (including opening /* or // and closing */, but without a newline 82 // at the end of line comments). Comment text always starts with a / 83 // which can be used to distinguish these handler calls from errors. 84 // 85 // If the scanner mode includes the directives (but not the comments) 86 // flag, only comments containing a //line, /*line, or //go: directive 87 // are reported, in the same way as regular comments. 88 func (s *scanner) next() { 89 nlsemi := s.nlsemi 90 s.nlsemi = false 91 92 redo: 93 // skip white space 94 s.stop() 95 startLine, startCol := s.pos() 96 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' { 97 s.nextch() 98 } 99 100 // token start 101 s.line, s.col = s.pos() 102 s.blank = s.line > startLine || startCol == colbase 103 s.start() 104 if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) { 105 s.nextch() 106 s.ident() 107 return 108 } 109 110 switch s.ch { 111 case -1: 112 if nlsemi { 113 s.lit = "EOF" 114 s.tok = _Semi 115 break 116 } 117 s.tok = _EOF 118 119 case '\n': 120 s.nextch() 121 s.lit = "newline" 122 s.tok = _Semi 123 124 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 125 s.number(false) 126 127 case '"': 128 s.stdString() 129 130 case '`': 131 s.rawString() 132 133 case '\'': 134 s.rune() 135 136 case '(': 137 s.nextch() 138 s.tok = _Lparen 139 140 case '[': 141 s.nextch() 142 s.tok = _Lbrack 143 144 case '{': 145 s.nextch() 146 s.tok = _Lbrace 147 148 case ',': 149 s.nextch() 150 s.tok = _Comma 151 152 case ';': 153 s.nextch() 154 s.lit = "semicolon" 155 s.tok = _Semi 156 157 case ')': 158 s.nextch() 159 s.nlsemi = true 160 s.tok = _Rparen 161 162 case ']': 163 s.nextch() 164 s.nlsemi = true 165 s.tok = _Rbrack 166 167 case '}': 168 s.nextch() 169 s.nlsemi = true 170 s.tok = _Rbrace 171 172 case ':': 173 s.nextch() 174 if s.ch == '=' { 175 s.nextch() 176 s.tok = _Define 177 break 178 } 179 s.tok = _Colon 180 181 case '.': 182 s.nextch() 183 if isDecimal(s.ch) { 184 s.number(true) 185 break 186 } 187 if s.ch == '.' { 188 s.nextch() 189 if s.ch == '.' { 190 s.nextch() 191 s.tok = _DotDotDot 192 break 193 } 194 s.rewind() // now s.ch holds 1st '.' 195 s.nextch() // consume 1st '.' again 196 } 197 s.tok = _Dot 198 199 case '+': 200 s.nextch() 201 s.op, s.prec = Add, precAdd 202 if s.ch != '+' { 203 goto assignop 204 } 205 s.nextch() 206 s.nlsemi = true 207 s.tok = _IncOp 208 209 case '-': 210 s.nextch() 211 s.op, s.prec = Sub, precAdd 212 if s.ch != '-' { 213 goto assignop 214 } 215 s.nextch() 216 s.nlsemi = true 217 s.tok = _IncOp 218 219 case '*': 220 s.nextch() 221 s.op, s.prec = Mul, precMul 222 // don't goto assignop - want _Star token 223 if s.ch == '=' { 224 s.nextch() 225 s.tok = _AssignOp 226 break 227 } 228 s.tok = _Star 229 230 case '/': 231 s.nextch() 232 if s.ch == '/' { 233 s.nextch() 234 s.lineComment() 235 goto redo 236 } 237 if s.ch == '*' { 238 s.nextch() 239 s.fullComment() 240 if line, _ := s.pos(); line > s.line && nlsemi { 241 // A multi-line comment acts like a newline; 242 // it translates to a ';' if nlsemi is set. 243 s.lit = "newline" 244 s.tok = _Semi 245 break 246 } 247 goto redo 248 } 249 s.op, s.prec = Div, precMul 250 goto assignop 251 252 case '%': 253 s.nextch() 254 s.op, s.prec = Rem, precMul 255 goto assignop 256 257 case '&': 258 s.nextch() 259 if s.ch == '&' { 260 s.nextch() 261 s.op, s.prec = AndAnd, precAndAnd 262 s.tok = _Operator 263 break 264 } 265 s.op, s.prec = And, precMul 266 if s.ch == '^' { 267 s.nextch() 268 s.op = AndNot 269 } 270 goto assignop 271 272 case '|': 273 s.nextch() 274 if s.ch == '|' { 275 s.nextch() 276 s.op, s.prec = OrOr, precOrOr 277 s.tok = _Operator 278 break 279 } 280 s.op, s.prec = Or, precAdd 281 goto assignop 282 283 case '^': 284 s.nextch() 285 s.op, s.prec = Xor, precAdd 286 goto assignop 287 288 case '<': 289 s.nextch() 290 if s.ch == '=' { 291 s.nextch() 292 s.op, s.prec = Leq, precCmp 293 s.tok = _Operator 294 break 295 } 296 if s.ch == '<' { 297 s.nextch() 298 s.op, s.prec = Shl, precMul 299 goto assignop 300 } 301 if s.ch == '-' { 302 s.nextch() 303 s.tok = _Arrow 304 break 305 } 306 s.op, s.prec = Lss, precCmp 307 s.tok = _Operator 308 309 case '>': 310 s.nextch() 311 if s.ch == '=' { 312 s.nextch() 313 s.op, s.prec = Geq, precCmp 314 s.tok = _Operator 315 break 316 } 317 if s.ch == '>' { 318 s.nextch() 319 s.op, s.prec = Shr, precMul 320 goto assignop 321 } 322 s.op, s.prec = Gtr, precCmp 323 s.tok = _Operator 324 325 case '=': 326 s.nextch() 327 if s.ch == '=' { 328 s.nextch() 329 s.op, s.prec = Eql, precCmp 330 s.tok = _Operator 331 break 332 } 333 s.tok = _Assign 334 335 case '!': 336 s.nextch() 337 if s.ch == '=' { 338 s.nextch() 339 s.op, s.prec = Neq, precCmp 340 s.tok = _Operator 341 break 342 } 343 s.op, s.prec = Not, 0 344 s.tok = _Operator 345 346 case '~': 347 s.nextch() 348 s.op, s.prec = Tilde, 0 349 s.tok = _Operator 350 351 default: 352 s.errorf("invalid character %#U", s.ch) 353 s.nextch() 354 goto redo 355 } 356 357 return 358 359 assignop: 360 if s.ch == '=' { 361 s.nextch() 362 s.tok = _AssignOp 363 return 364 } 365 s.tok = _Operator 366 } 367 368 func (s *scanner) ident() { 369 // accelerate common case (7bit ASCII) 370 for isLetter(s.ch) || isDecimal(s.ch) { 371 s.nextch() 372 } 373 374 // general case 375 if s.ch >= utf8.RuneSelf { 376 for s.atIdentChar(false) { 377 s.nextch() 378 } 379 } 380 381 // possibly a keyword 382 lit := s.segment() 383 if len(lit) >= 2 { 384 if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) { 385 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) 386 s.tok = tok 387 return 388 } 389 } 390 391 s.nlsemi = true 392 s.lit = string(lit) 393 s.tok = _Name 394 } 395 396 // tokStrFast is a faster version of token.String, which assumes that tok 397 // is one of the valid tokens - and can thus skip bounds checks. 398 func tokStrFast(tok token) string { 399 return _token_name[_token_index[tok-1]:_token_index[tok]] 400 } 401 402 func (s *scanner) atIdentChar(first bool) bool { 403 switch { 404 case unicode.IsLetter(s.ch) || s.ch == '_': 405 // ok 406 case unicode.IsDigit(s.ch): 407 if first { 408 s.errorf("identifier cannot begin with digit %#U", s.ch) 409 } 410 case s.ch >= utf8.RuneSelf: 411 s.errorf("invalid character %#U in identifier", s.ch) 412 default: 413 return false 414 } 415 return true 416 } 417 418 // hash is a perfect hash function for keywords. 419 // It assumes that s has at least length 2. 420 func hash(s []byte) uint { 421 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) 422 } 423 424 var keywordMap [1 << 6]token // size must be power of two 425 426 func init() { 427 // populate keywordMap 428 for tok := _Break; tok <= _Var; tok++ { 429 h := hash([]byte(tok.String())) 430 if keywordMap[h] != 0 { 431 panic("imperfect hash") 432 } 433 keywordMap[h] = tok 434 } 435 } 436 437 func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter 438 func isLetter(ch rune) bool { return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' } 439 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 440 func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } 441 442 // digits accepts the sequence { digit | '_' }. 443 // If base <= 10, digits accepts any decimal digit but records 444 // the index (relative to the literal start) of a digit >= base 445 // in *invalid, if *invalid < 0. 446 // digits returns a bitset describing whether the sequence contained 447 // digits (bit 0 is set), or separators '_' (bit 1 is set). 448 func (s *scanner) digits(base int, invalid *int) (digsep int) { 449 if base <= 10 { 450 max := rune('0' + base) 451 for isDecimal(s.ch) || s.ch == '_' { 452 ds := 1 453 if s.ch == '_' { 454 ds = 2 455 } else if s.ch >= max && *invalid < 0 { 456 _, col := s.pos() 457 *invalid = int(col - s.col) // record invalid rune index 458 } 459 digsep |= ds 460 s.nextch() 461 } 462 } else { 463 for isHex(s.ch) || s.ch == '_' { 464 ds := 1 465 if s.ch == '_' { 466 ds = 2 467 } 468 digsep |= ds 469 s.nextch() 470 } 471 } 472 return 473 } 474 475 func (s *scanner) number(seenPoint bool) { 476 ok := true 477 kind := IntLit 478 base := 10 // number base 479 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' 480 digsep := 0 // bit 0: digit present, bit 1: '_' present 481 invalid := -1 // index of invalid digit in literal, or < 0 482 483 // integer part 484 if !seenPoint { 485 if s.ch == '0' { 486 s.nextch() 487 switch lower(s.ch) { 488 case 'x': 489 s.nextch() 490 base, prefix = 16, 'x' 491 case 'o': 492 s.nextch() 493 base, prefix = 8, 'o' 494 case 'b': 495 s.nextch() 496 base, prefix = 2, 'b' 497 default: 498 base, prefix = 8, '0' 499 digsep = 1 // leading 0 500 } 501 } 502 digsep |= s.digits(base, &invalid) 503 if s.ch == '.' { 504 if prefix == 'o' || prefix == 'b' { 505 s.errorf("invalid radix point in %s literal", baseName(base)) 506 ok = false 507 } 508 s.nextch() 509 seenPoint = true 510 } 511 } 512 513 // fractional part 514 if seenPoint { 515 kind = FloatLit 516 digsep |= s.digits(base, &invalid) 517 } 518 519 if digsep&1 == 0 && ok { 520 s.errorf("%s literal has no digits", baseName(base)) 521 ok = false 522 } 523 524 // exponent 525 if e := lower(s.ch); e == 'e' || e == 'p' { 526 if ok { 527 switch { 528 case e == 'e' && prefix != 0 && prefix != '0': 529 s.errorf("%q exponent requires decimal mantissa", s.ch) 530 ok = false 531 case e == 'p' && prefix != 'x': 532 s.errorf("%q exponent requires hexadecimal mantissa", s.ch) 533 ok = false 534 } 535 } 536 s.nextch() 537 kind = FloatLit 538 if s.ch == '+' || s.ch == '-' { 539 s.nextch() 540 } 541 digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit 542 if digsep&1 == 0 && ok { 543 s.errorf("exponent has no digits") 544 ok = false 545 } 546 } else if prefix == 'x' && kind == FloatLit && ok { 547 s.errorf("hexadecimal mantissa requires a 'p' exponent") 548 ok = false 549 } 550 551 // suffix 'i' 552 if s.ch == 'i' { 553 kind = ImagLit 554 s.nextch() 555 } 556 557 s.setLit(kind, ok) // do this now so we can use s.lit below 558 559 if kind == IntLit && invalid >= 0 && ok { 560 s.errorAtf(invalid, "invalid digit %q in %s literal", s.lit[invalid], baseName(base)) 561 ok = false 562 } 563 564 if digsep&2 != 0 && ok { 565 if i := invalidSep(s.lit); i >= 0 { 566 s.errorAtf(i, "'_' must separate successive digits") 567 ok = false 568 } 569 } 570 571 s.bad = !ok // correct s.bad 572 } 573 574 func baseName(base int) string { 575 switch base { 576 case 2: 577 return "binary" 578 case 8: 579 return "octal" 580 case 10: 581 return "decimal" 582 case 16: 583 return "hexadecimal" 584 } 585 panic("invalid base") 586 } 587 588 // invalidSep returns the index of the first invalid separator in x, or -1. 589 func invalidSep(x string) int { 590 x1 := ' ' // prefix char, we only care if it's 'x' 591 d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else) 592 i := 0 593 594 // a prefix counts as a digit 595 if len(x) >= 2 && x[0] == '0' { 596 x1 = lower(rune(x[1])) 597 if x1 == 'x' || x1 == 'o' || x1 == 'b' { 598 d = '0' 599 i = 2 600 } 601 } 602 603 // mantissa and exponent 604 for ; i < len(x); i++ { 605 p := d // previous digit 606 d = rune(x[i]) 607 switch { 608 case d == '_': 609 if p != '0' { 610 return i 611 } 612 case isDecimal(d) || x1 == 'x' && isHex(d): 613 d = '0' 614 default: 615 if p == '_' { 616 return i - 1 617 } 618 d = '.' 619 } 620 } 621 if d == '_' { 622 return len(x) - 1 623 } 624 625 return -1 626 } 627 628 func (s *scanner) rune() { 629 ok := true 630 s.nextch() 631 632 n := 0 633 for ; ; n++ { 634 if s.ch == '\'' { 635 if ok { 636 if n == 0 { 637 s.errorf("empty rune literal or unescaped '") 638 ok = false 639 } else if n != 1 { 640 s.errorAtf(0, "more than one character in rune literal") 641 ok = false 642 } 643 } 644 s.nextch() 645 break 646 } 647 if s.ch == '\\' { 648 s.nextch() 649 if !s.escape('\'') { 650 ok = false 651 } 652 continue 653 } 654 if s.ch == '\n' { 655 if ok { 656 s.errorf("newline in rune literal") 657 ok = false 658 } 659 break 660 } 661 if s.ch < 0 { 662 if ok { 663 s.errorAtf(0, "rune literal not terminated") 664 ok = false 665 } 666 break 667 } 668 s.nextch() 669 } 670 671 s.setLit(RuneLit, ok) 672 } 673 674 func (s *scanner) stdString() { 675 ok := true 676 s.nextch() 677 678 for { 679 if s.ch == '"' { 680 s.nextch() 681 break 682 } 683 if s.ch == '\\' { 684 s.nextch() 685 if !s.escape('"') { 686 ok = false 687 } 688 continue 689 } 690 if s.ch == '\n' { 691 s.errorf("newline in string") 692 ok = false 693 break 694 } 695 if s.ch < 0 { 696 s.errorAtf(0, "string not terminated") 697 ok = false 698 break 699 } 700 s.nextch() 701 } 702 703 s.setLit(StringLit, ok) 704 } 705 706 func (s *scanner) rawString() { 707 ok := true 708 s.nextch() 709 710 for { 711 if s.ch == '`' { 712 s.nextch() 713 break 714 } 715 if s.ch < 0 { 716 s.errorAtf(0, "string not terminated") 717 ok = false 718 break 719 } 720 s.nextch() 721 } 722 // We leave CRs in the string since they are part of the 723 // literal (even though they are not part of the literal 724 // value). 725 726 s.setLit(StringLit, ok) 727 } 728 729 func (s *scanner) comment(text string) { 730 s.errorAtf(0, "%s", text) 731 } 732 733 func (s *scanner) skipLine() { 734 // don't consume '\n' - needed for nlsemi logic 735 for s.ch >= 0 && s.ch != '\n' { 736 s.nextch() 737 } 738 } 739 740 func (s *scanner) lineComment() { 741 // opening has already been consumed 742 743 if s.mode&comments != 0 { 744 s.skipLine() 745 s.comment(string(s.segment())) 746 return 747 } 748 749 // are we saving directives? or is this definitely not a directive? 750 if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') { 751 s.stop() 752 s.skipLine() 753 return 754 } 755 756 // recognize go: or line directives 757 prefix := "go:" 758 if s.ch == 'l' { 759 prefix = "line " 760 } 761 for _, m := range prefix { 762 if s.ch != m { 763 s.stop() 764 s.skipLine() 765 return 766 } 767 s.nextch() 768 } 769 770 // directive text 771 s.skipLine() 772 s.comment(string(s.segment())) 773 } 774 775 func (s *scanner) skipComment() bool { 776 for s.ch >= 0 { 777 for s.ch == '*' { 778 s.nextch() 779 if s.ch == '/' { 780 s.nextch() 781 return true 782 } 783 } 784 s.nextch() 785 } 786 s.errorAtf(0, "comment not terminated") 787 return false 788 } 789 790 func (s *scanner) fullComment() { 791 /* opening has already been consumed */ 792 793 if s.mode&comments != 0 { 794 if s.skipComment() { 795 s.comment(string(s.segment())) 796 } 797 return 798 } 799 800 if s.mode&directives == 0 || s.ch != 'l' { 801 s.stop() 802 s.skipComment() 803 return 804 } 805 806 // recognize line directive 807 const prefix = "line " 808 for _, m := range prefix { 809 if s.ch != m { 810 s.stop() 811 s.skipComment() 812 return 813 } 814 s.nextch() 815 } 816 817 // directive text 818 if s.skipComment() { 819 s.comment(string(s.segment())) 820 } 821 } 822 823 func (s *scanner) escape(quote rune) bool { 824 var n int 825 var base, max uint32 826 827 switch s.ch { 828 case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\': 829 s.nextch() 830 return true 831 case '0', '1', '2', '3', '4', '5', '6', '7': 832 n, base, max = 3, 8, 255 833 case 'x': 834 s.nextch() 835 n, base, max = 2, 16, 255 836 case 'u': 837 s.nextch() 838 n, base, max = 4, 16, unicode.MaxRune 839 case 'U': 840 s.nextch() 841 n, base, max = 8, 16, unicode.MaxRune 842 default: 843 if s.ch < 0 { 844 return true // complain in caller about EOF 845 } 846 s.errorf("unknown escape") 847 return false 848 } 849 850 var x uint32 851 for i := n; i > 0; i-- { 852 if s.ch < 0 { 853 return true // complain in caller about EOF 854 } 855 d := base 856 if isDecimal(s.ch) { 857 d = uint32(s.ch) - '0' 858 } else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' { 859 d = uint32(lower(s.ch)) - 'a' + 10 860 } 861 if d >= base { 862 s.errorf("invalid character %q in %s escape", s.ch, baseName(int(base))) 863 return false 864 } 865 // d < base 866 x = x*base + d 867 s.nextch() 868 } 869 870 if x > max && base == 8 { 871 s.errorf("octal escape value %d > 255", x) 872 return false 873 } 874 875 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { 876 s.errorf("escape is invalid Unicode code point %#U", x) 877 return false 878 } 879 880 return true 881 }