// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements scanner, a lexical tokenizer for
// Go source. After initialization, consecutive calls of
// next advance the scanner one token at a time.
//
// This file, source.go, and tokens.go are self-contained
// (go tool compile scanner.go source.go tokens.go compiles)
// and thus could be made into its own package.

package syntax

import (
	"fmt"
	"io"
	"unicode"
	"unicode/utf8"
)

// The mode flags below control which comments are reported
// by calling the error handler. If no flag is set, comments
// are ignored.
const (
	comments   uint = 1 << iota // call handler for all comments
	directives                  // call handler for directives only
)

// scanner holds the tokenizer state: the embedded source it reads
// characters from, the comment-reporting mode, and the description
// of the token most recently produced by next.
type scanner struct {
	source
	mode   uint
	nlsemi bool // if set '\n' and EOF translate to ';'

	// current token, valid after calling next()
	line, col uint
	tok       token
	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF")
	kind      LitKind  // valid if tok is _Literal
	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp
}

// init prepares the scanner for use: it initializes the embedded
// source with src and errh, records the comment-reporting mode
// (a combination of the comments and directives flags), and clears
// nlsemi.
func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
	s.source.init(src, errh)
	s.mode = mode
	s.nlsemi = false
}

// next advances the scanner by reading the next token.
//
// If a read, source encoding, or lexical error occurs, next calls
// the installed error handler with the respective error position
// and message. The error message is guaranteed to be non-empty and
// never starts with a '/'. The error handler must exist.
//
// If the scanner mode includes the comments flag and a comment
// (including comments containing directives) is encountered, the
// error handler is also called with each comment position and text
// (including opening /* or // and closing */, but without a newline
// at the end of line comments). Comment text always starts with a /
// which can be used to distinguish these handler calls from errors.
//
// If the scanner mode includes the directives (but not the comments)
// flag, only comments containing a //line, /*line, or //go: directive
// are reported, in the same way as regular comments. Directives in
// //-style comments are only recognized if they are at the beginning
// of a line.
//
func (s *scanner) next() {
	// Capture the flag left behind by the previous token and clear it;
	// several cases below (and the helpers they call) set s.nlsemi
	// again for the token scanned now.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	// (&& binds tighter than ||: a '\n' is skipped only while nlsemi is
	// unset; otherwise it falls through to the switch and becomes _Semi)
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	// NOTE(review): line0/col0 are maintained by the embedded source
	// (source.go, not shown); presumably the position of the character
	// just read - confirm.
	s.line, s.col = s.source.line0, s.source.col0

	// ASCII letters are recognized directly; only non-ASCII runes are
	// passed to isIdentRune (which may report an error for them).
	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// end of input: reported as a ';' with literal "EOF" first if
		// the previous token asked for it, as _EOF otherwise
		if nlsemi {
			s.lit = "EOF"
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached when nlsemi is set (see white space loop above)
		s.lit = "newline"
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.lit = "semicolon"
		s.tok = _Semi

	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// ".5"-style number, "...", or a plain "."
		// NOTE(review): ungetr2 is defined in source.go (not shown);
		// its exact push-back semantics should be confirmed there.
		c = s.getr()
		if isDigit(c) {
			s.ungetr2()
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			s.ungetr2()
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		// "++" is _IncOp; otherwise "+=" or "+" is decided at assignop
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		// "--" is _IncOp; otherwise "-=" or "-" is decided at assignop
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		c = s.getr()
		if c == '/' {
			// line comment: not a token, scan again
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.lit = "newline"
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		// "&&", "&^", "&^=", "&=", or "&"
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		// "||", "|=", or "|"
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		// "<=", "<<", "<<=", "<-", or "<"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		// ">=", ">>", ">>=", or ">"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// report the offending character and scan again
		s.tok = 0
		s.error(fmt.Sprintf("invalid character %#U", c))
		goto redo
	}

	return

assignop:
	// Reached with s.op and s.prec set and c holding the character that
	// follows the operator: '=' turns it into an assignment operation,
	// anything else is pushed back and the plain operator is reported.
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}

// isLetter reports whether c is an ASCII letter or an underscore.
func isLetter(c rune) bool {
	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
}

// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

// ident scans the remainder of an identifier or keyword; next calls
// it after having read the first character. On return s.tok is either
// a keyword token or _Name; for _Name, s.lit holds the identifier text.
func (s *scanner) ident() {
	s.startLit()

	// accelerate common case (7bit ASCII)
	c := s.getr()
	for isLetter(c) || isDigit(c) {
		c = s.getr()
	}

	// general case
	if c >= utf8.RuneSelf {
		for s.isIdentRune(c, false) {
			c = s.getr()
		}
	}
	s.ungetr() // c is not part of the identifier

	lit := s.stopLit()

	// possibly a keyword
	// (hash reads the first two bytes, hence the length check; the
	// string comparison rules out identifiers that merely share a
	// keyword's hash slot)
	if len(lit) >= 2 {
		if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
			// NOTE(review): contains is defined elsewhere; presumably it
			// reports whether tok's bit is in the given set, i.e. only
			// break, continue, fallthrough, and return set nlsemi.
			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
			s.tok = tok
			return
		}
	}

	s.nlsemi = true
	s.lit = string(lit)
	s.tok = _Name
}

// tokStrFast is a faster version of token.String, which assumes that tok
// is one of the valid tokens - and can thus skip bounds checks.
func tokStrFast(tok token) string {
	// the name of tok is the part of _token_name delimited by two
	// consecutive offsets in _token_index
	return _token_name[_token_index[tok-1]:_token_index[tok]]
}

// isIdentRune reports whether c is treated as part of an identifier;
// first indicates that c is the identifier's first character.
// Unicode letters and '_' are accepted silently. Unicode digits and
// any other rune >= utf8.RuneSelf are also accepted (the result is
// true), but an error is reported for a digit in first position and
// for every other non-ASCII rune. All remaining characters yield false.
func (s *scanner) isIdentRune(c rune, first bool) bool {
	switch {
	case unicode.IsLetter(c) || c == '_':
		// ok
	case unicode.IsDigit(c):
		if first {
			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
		}
	case c >= utf8.RuneSelf:
		s.error(fmt.Sprintf("invalid identifier character %#U", c))
	default:
		return false
	}
	return true
}

// hash is a perfect hash function for keywords.
// It assumes that s has at least length 2.
// The result is masked to a valid index into keywordMap.
func hash(s []byte) uint {
	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
}

// keywordMap maps the hash of a keyword to its token;
// slots not used by any keyword hold 0.
var keywordMap [1 << 6]token // size must be power of two

// init fills keywordMap for the tokens _Break through _Var and panics
// if two of them hash to the same slot.
func init() {
	// populate keywordMap
	for tok := _Break; tok <= _Var; tok++ {
		h := hash([]byte(tok.String()))
		if keywordMap[h] != 0 {
			panic("imperfect hash")
		}
		keywordMap[h] = tok
	}
}

// number scans a numeric literal. c is the literal's first character:
// a decimal digit, or '.' for a literal that starts with the fraction.
// On return s.tok is _Literal, s.lit holds the literal text, and s.kind
// is IntLit, FloatLit, or ImagLit. Malformed hex constants, octal
// constants, and exponents are reported via s.error.
func (s *scanner) number(c rune) {
	s.startLit()

	if c != '.' {
		s.kind = IntLit // until proven otherwise
		if c == '0' {
			c = s.getr()
			if c == 'x' || c == 'X' {
				// hex
				c = s.getr()
				hasDigit := false
				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
					c = s.getr()
					hasDigit = true
				}
				if !hasDigit {
					s.error("malformed hex constant")
				}
				goto done
			}

			// decimal 0, octal, or float
			has8or9 := false
			for isDigit(c) {
				if c > '7' {
					has8or9 = true
				}
				c = s.getr()
			}
			// digits 8 and 9 are only an error if the literal does not
			// continue as a float or imaginary literal
			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
				// octal
				if has8or9 {
					s.error("malformed octal constant")
				}
				goto done
			}

		} else {
			// decimal or float
			for isDigit(c) {
				c = s.getr()
			}
		}
	}

	// float
	if c == '.' {
		s.kind = FloatLit
		c = s.getr()
		for isDigit(c) {
			c = s.getr()
		}
	}

	// exponent
	if c == 'e' || c == 'E' {
		s.kind = FloatLit
		c = s.getr()
		if c == '-' || c == '+' {
			c = s.getr()
		}
		if !isDigit(c) {
			s.error("malformed floating-point constant exponent")
		}
		for isDigit(c) {
			c = s.getr()
		}
	}

	// complex
	if c == 'i' {
		s.kind = ImagLit
		s.getr() // read one character past the 'i'; done un-reads it
	}

done:
	// every path arrives here having read one character beyond the literal
	s.ungetr()
	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.tok = _Literal
}

// rune scans a character literal; the opening ' has been read by the
// caller. On return s.tok is _Literal, s.kind is RuneLit, and s.lit
// holds the literal text. Scanning stops at the closing ', at a
// newline (which is not consumed), or at the end of input.
func (s *scanner) rune() {
	s.startLit()

	ok := true // only report errors if we're ok so far
	n := 0     // number of characters (an escape sequence counts as one) seen
	for ; ; n++ {
		r := s.getr()
		if r == '\'' {
			break
		}
		if r == '\\' {
			if !s.escape('\'') {
				ok = false
			}
			continue
		}
		if r == '\n' {
			s.ungetr() // assume newline is not part of literal
			if ok {
				s.error("newline in character literal")
				ok = false
			}
			break
		}
		if r < 0 {
			// end of input; the error is reported at the literal's start
			if ok {
				s.errh(s.line, s.col, "invalid character literal (missing closing ')")
				ok = false
			}
			break
		}
	}

	if ok {
		if n == 0 {
			s.error("empty character literal or unescaped ' in character literal")
		} else if n != 1 {
			s.errh(s.line, s.col, "invalid character literal (more than one character)")
		}
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = RuneLit
	s.tok = _Literal
}

// stdString scans an interpreted string literal; the opening " has
// been read by the caller. On return s.tok is _Literal, s.kind is
// StringLit, and s.lit holds the literal text. Scanning stops at the
// closing ", at a newline (which is not consumed), or at the end of
// input.
func (s *scanner) stdString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '"' {
			break
		}
		if r == '\\' {
			// the result is not needed: escape reports errors itself
			s.escape('"')
			continue
		}
		if r == '\n' {
			s.ungetr() // assume newline is not part of literal
			s.error("newline in string")
			break
		}
		if r < 0 {
			// end of input; the error is reported at the literal's start
			s.errh(s.line, s.col, "string not terminated")
			break
		}
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// rawString scans a raw string literal; the opening ` has been read
// by the caller. On return s.tok is _Literal, s.kind is StringLit, and
// s.lit holds the literal text. Scanning stops at the closing ` or at
// the end of input; no escape sequences are interpreted.
func (s *scanner) rawString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '`' {
			break
		}
		if r < 0 {
			// end of input; the error is reported at the literal's start
			s.errh(s.line, s.col, "string not terminated")
			break
		}
	}
	// We leave CRs in the string since they are part of the
	// literal (even though they are not part of the literal
	// value).

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// comment passes the comment text to the error handler, using the
// start position of the current token (s.line, s.col).
func (s *scanner) comment(text string) {
	s.errh(s.line, s.col, text)
}

// skipLine reads characters, starting with the already read r, until
// it finds a newline or reaches the end of input (r < 0).
func (s *scanner) skipLine(r rune) {
	for r >= 0 {
		if r == '\n' {
			s.ungetr() // don't consume '\n' - needed for nlsemi logic
			break
		}
		r = s.getr()
	}
}

// lineComment scans a //-style comment; the opening // has been read
// by the caller. Depending on s.mode the comment is reported via
// s.comment (all comments, or only go: and line directives) or skipped.
// The terminating newline is not consumed.
func (s *scanner) lineComment() {
	r := s.getr()

	if s.mode&comments != 0 {
		s.startLit()
		s.skipLine(r)
		s.comment("//" + string(s.stopLit()))
		return
	}

	// directives must start at the beginning of the line (s.col == colbase)
	if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') {
		s.skipLine(r)
		return
	}

	// recognize go: or line directives
	prefix := "go:"
	if r == 'l' {
		prefix = "line "
	}
	for _, m := range prefix {
		if r != m {
			// not a directive after all - treat as ordinary comment
			s.skipLine(r)
			return
		}
		r = s.getr()
	}

	// directive text
	// (the prefix was read before startLit, so it is added back explicitly)
	s.startLit()
	s.skipLine(r)
	s.comment("//" + prefix + string(s.stopLit()))
}

// skipComment reads characters, starting with the already read r, up
// to and including the closing */ of a /*-style comment. It reports
// whether the comment was terminated; if the end of input is reached
// first, it reports an error at the start of the current token and
// returns false.
func (s *scanner) skipComment(r rune) bool {
	for r >= 0 {
		// the inner loop handles runs of '*' such as in "**/"
		for r == '*' {
			r = s.getr()
			if r == '/' {
				return true
			}
		}
		r = s.getr()
	}
	s.errh(s.line, s.col, "comment not terminated")
	return false
}

// fullComment scans a /*-style comment; the opening /* has been read
// by the caller. Depending on s.mode the comment is reported via
// s.comment (all comments, or only line directives) or skipped.
// Unterminated comments are never reported as comments.
func (s *scanner) fullComment() {
	r := s.getr()

	if s.mode&comments != 0 {
		s.startLit()
		if s.skipComment(r) {
			s.comment("/*" + string(s.stopLit()))
		} else {
			s.killLit() // not a complete comment - ignore
		}
		return
	}

	if s.mode&directives == 0 || r != 'l' {
		s.skipComment(r)
		return
	}

	// recognize line directive
	const prefix = "line "
	for _, m := range prefix {
		if r != m {
			// not a directive after all - treat as ordinary comment
			s.skipComment(r)
			return
		}
		r = s.getr()
	}

	// directive text
	// (the prefix was read before startLit, so it is added back explicitly)
	s.startLit()
	if s.skipComment(r) {
		s.comment("/*" + prefix + string(s.stopLit()))
	} else {
		s.killLit() // not a complete comment - ignore
	}
}

// escape scans the remainder of an escape sequence; the backslash has
// been read by the caller. quote is the delimiter of the enclosing
// literal and is accepted as an escaped character. escape returns
// false if it reported an error and true otherwise - including when
// the end of input is reached, which is left to the caller to report.
func (s *scanner) escape(quote rune) bool {
	var n int             // number of digits expected
	var base, max uint32 // digit base and largest permitted value

	c := s.getr()
	switch c {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// c already is the first of the three octal digits
		n, base, max = 3, 8, 255
	case 'x':
		c = s.getr()
		n, base, max = 2, 16, 255
	case 'u':
		c = s.getr()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		c = s.getr()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		if c < 0 {
			return true // complain in caller about EOF
		}
		s.error("unknown escape sequence")
		return false
	}

	var x uint32 // value accumulated from the digits
	for i := n; i > 0; i-- {
		d := base // stays >= base unless c is a digit valid for base
		switch {
		case isDigit(c):
			d = uint32(c) - '0'
		case 'a' <= c && c <= 'f':
			d = uint32(c) - ('a' - 10)
		case 'A' <= c && c <= 'F':
			d = uint32(c) - ('A' - 10)
		}
		if d >= base {
			if c < 0 {
				return true // complain in caller about EOF
			}
			kind := "hex"
			if base == 8 {
				kind = "octal"
			}
			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
			s.ungetr() // leave the offending character for the caller
			return false
		}
		// d < base
		x = x*base + d
		c = s.getr()
	}
	s.ungetr() // the loop read one character beyond the last digit

	if x > max && base == 8 {
		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
		return false
	}

	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
		s.error("escape sequence is invalid Unicode code point")
		return false
	}

	return true
}