github.com/likebike/go--@v0.0.0-20190911215757-0bd925d16e96/go/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements scanner, a lexical tokenizer for 6 // Go source. After initialization, consecutive calls of 7 // next advance the scanner one token at a time. 8 // 9 // This file, source.go, and tokens.go are self-contained 10 // (go tool compile scanner.go source.go tokens.go compiles) 11 // and thus could be made into its own package. 12 13 package syntax 14 15 import ( 16 "fmt" 17 "io" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 type scanner struct { 23 source 24 pragh func(line, col uint, msg string) 25 nlsemi bool // if set '\n' and EOF translate to ';' 26 27 // current token, valid after calling next() 28 line, col uint 29 tok token 30 lit string // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF") 31 kind LitKind // valid if tok is _Literal 32 op Operator // valid if tok is _Operator, _AssignOp, or _IncOp 33 prec int // valid if tok is _Operator, _AssignOp, or _IncOp 34 } 35 36 func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) { 37 s.source.init(src, errh) 38 s.pragh = pragh 39 s.nlsemi = false 40 } 41 42 // next advances the scanner by reading the next token. 43 // 44 // If a read, source encoding, or lexical error occurs, next 45 // calls the error handler installed with init. The handler 46 // must exist. 47 // 48 // If a //line or //go: directive is encountered at the start 49 // of a line, next calls the directive handler pragh installed 50 // with init, if not nil. 51 // 52 // The (line, col) position passed to the error and directive 53 // handler is always at or after the current source reading 54 // position. 55 func (s *scanner) next() { 56 nlsemi := s.nlsemi 57 s.nlsemi = false 58 59 // Go-- support for 'shebang' lines: 60 if s.source.line0<=linebase && s.source.col0<=colbase { // BOM does not affect 'col'. 61 if s.getr()=='#' && s.source.r<len(s.source.buf) && s.source.buf[s.source.r]=='!' { s.skipLine('#') 62 } else { s.ungetr() } 63 } 64 65 redo: 66 // skip white space 67 c := s.getr() 68 for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { 69 c = s.getr() 70 } 71 72 // token start 73 s.line, s.col = s.source.line0, s.source.col0 74 75 if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) { 76 s.ident() 77 return 78 } 79 80 switch c { 81 case -1: 82 if nlsemi { 83 s.lit = "EOF" 84 s.tok = _Semi 85 break 86 } 87 s.tok = _EOF 88 89 case '\n': 90 s.lit = "newline" 91 s.tok = _Semi 92 93 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 94 s.number(c) 95 96 case '"': 97 s.stdString() 98 99 case '`': 100 s.rawString() 101 102 case '\'': 103 s.rune() 104 105 case '(': 106 s.tok = _Lparen 107 108 case '[': 109 s.tok = _Lbrack 110 111 case '{': 112 s.tok = _Lbrace 113 114 case ',': 115 s.tok = _Comma 116 117 case ';': 118 s.lit = "semicolon" 119 s.tok = _Semi 120 121 case ')': 122 s.nlsemi = true 123 s.tok = _Rparen 124 125 case ']': 126 s.nlsemi = true 127 s.tok = _Rbrack 128 129 case '}': 130 s.nlsemi = true 131 s.tok = _Rbrace 132 133 case ':': 134 if s.getr() == '=' { 135 s.tok = _Define 136 break 137 } 138 s.ungetr() 139 s.tok = _Colon 140 141 case '.': 142 c = s.getr() 143 if isDigit(c) { 144 s.ungetr2() 145 s.number('.') 146 break 147 } 148 if c == '.' { 149 c = s.getr() 150 if c == '.' { 151 s.tok = _DotDotDot 152 break 153 } 154 s.ungetr2() 155 } 156 s.ungetr() 157 s.tok = _Dot 158 159 case '+': 160 s.op, s.prec = Add, precAdd 161 c = s.getr() 162 if c != '+' { 163 goto assignop 164 } 165 s.nlsemi = true 166 s.tok = _IncOp 167 168 case '-': 169 s.op, s.prec = Sub, precAdd 170 c = s.getr() 171 if c != '-' { 172 goto assignop 173 } 174 s.nlsemi = true 175 s.tok = _IncOp 176 177 case '*': 178 s.op, s.prec = Mul, precMul 179 // don't goto assignop - want _Star token 180 if s.getr() == '=' { 181 s.tok = _AssignOp 182 break 183 } 184 s.ungetr() 185 s.tok = _Star 186 187 case '/': 188 c = s.getr() 189 if c == '/' { 190 s.lineComment() 191 goto redo 192 } 193 if c == '*' { 194 s.fullComment() 195 if s.source.line > s.line && nlsemi { 196 // A multi-line comment acts like a newline; 197 // it translates to a ';' if nlsemi is set. 198 s.lit = "newline" 199 s.tok = _Semi 200 break 201 } 202 goto redo 203 } 204 s.op, s.prec = Div, precMul 205 goto assignop 206 207 case '%': 208 s.op, s.prec = Rem, precMul 209 c = s.getr() 210 goto assignop 211 212 case '&': 213 c = s.getr() 214 if c == '&' { 215 s.op, s.prec = AndAnd, precAndAnd 216 s.tok = _Operator 217 break 218 } 219 s.op, s.prec = And, precMul 220 if c == '^' { 221 s.op = AndNot 222 c = s.getr() 223 } 224 goto assignop 225 226 case '|': 227 c = s.getr() 228 if c == '|' { 229 s.op, s.prec = OrOr, precOrOr 230 s.tok = _Operator 231 break 232 } 233 s.op, s.prec = Or, precAdd 234 goto assignop 235 236 case '~': 237 s.error("bitwise complement operator is ^") 238 fallthrough 239 240 case '^': 241 s.op, s.prec = Xor, precAdd 242 c = s.getr() 243 goto assignop 244 245 case '<': 246 c = s.getr() 247 if c == '=' { 248 s.op, s.prec = Leq, precCmp 249 s.tok = _Operator 250 break 251 } 252 if c == '<' { 253 s.op, s.prec = Shl, precMul 254 c = s.getr() 255 goto assignop 256 } 257 if c == '-' { 258 s.tok = _Arrow 259 break 260 } 261 s.ungetr() 262 s.op, s.prec = Lss, precCmp 263 s.tok = _Operator 264 265 case '>': 266 c = s.getr() 267 if c == '=' { 268 s.op, s.prec = Geq, precCmp 269 s.tok = _Operator 270 break 271 } 272 if c == '>' { 273 s.op, s.prec = Shr, precMul 274 c = s.getr() 275 goto assignop 276 } 277 s.ungetr() 278 s.op, s.prec = Gtr, precCmp 279 s.tok = _Operator 280 281 case '=': 282 if s.getr() == '=' { 283 s.op, s.prec = Eql, precCmp 284 s.tok = _Operator 285 break 286 } 287 s.ungetr() 288 s.tok = _Assign 289 290 case '!': 291 if s.getr() == '=' { 292 s.op, s.prec = Neq, precCmp 293 s.tok = _Operator 294 break 295 } 296 s.ungetr() 297 s.op, s.prec = Not, 0 298 s.tok = _Operator 299 300 default: 301 s.tok = 0 302 s.error(fmt.Sprintf("invalid character %#U", c)) 303 goto redo 304 } 305 306 return 307 308 assignop: 309 if c == '=' { 310 s.tok = _AssignOp 311 return 312 } 313 s.ungetr() 314 s.tok = _Operator 315 } 316 317 func isLetter(c rune) bool { 318 return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' 319 } 320 321 func isDigit(c rune) bool { 322 return '0' <= c && c <= '9' 323 } 324 325 func (s *scanner) ident() { 326 s.startLit() 327 328 // accelerate common case (7bit ASCII) 329 c := s.getr() 330 for isLetter(c) || isDigit(c) { 331 c = s.getr() 332 } 333 334 // general case 335 if c >= utf8.RuneSelf { 336 for s.isIdentRune(c, false) { 337 c = s.getr() 338 } 339 } 340 s.ungetr() 341 342 lit := s.stopLit() 343 344 // possibly a keyword 345 if len(lit) >= 2 { 346 if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) { 347 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) 348 s.tok = tok 349 return 350 } 351 } 352 353 s.nlsemi = true 354 s.lit = string(lit) 355 s.tok = _Name 356 } 357 358 func (s *scanner) isIdentRune(c rune, first bool) bool { 359 switch { 360 case unicode.IsLetter(c) || c == '_': 361 // ok 362 case unicode.IsDigit(c): 363 if first { 364 s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c)) 365 } 366 case c >= utf8.RuneSelf: 367 s.error(fmt.Sprintf("invalid identifier character %#U", c)) 368 default: 369 return false 370 } 371 return true 372 } 373 374 // hash is a perfect hash function for keywords. 375 // It assumes that s has at least length 2. 376 func hash(s []byte) uint { 377 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) 378 } 379 380 var keywordMap [1 << 6]token // size must be power of two 381 382 func init() { 383 // populate keywordMap 384 for tok := _Break; tok <= _Var; tok++ { 385 h := hash([]byte(tokstrings[tok])) 386 if keywordMap[h] != 0 { 387 panic("imperfect hash") 388 } 389 keywordMap[h] = tok 390 } 391 } 392 393 func (s *scanner) number(c rune) { 394 s.startLit() 395 396 if c != '.' { 397 s.kind = IntLit // until proven otherwise 398 if c == '0' { 399 c = s.getr() 400 if c == 'x' || c == 'X' { 401 // hex 402 c = s.getr() 403 hasDigit := false 404 for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { 405 c = s.getr() 406 hasDigit = true 407 } 408 if !hasDigit { 409 s.error("malformed hex constant") 410 } 411 goto done 412 } 413 414 // decimal 0, octal, or float 415 has8or9 := false 416 for isDigit(c) { 417 if c > '7' { 418 has8or9 = true 419 } 420 c = s.getr() 421 } 422 if c != '.' && c != 'e' && c != 'E' && c != 'i' { 423 // octal 424 if has8or9 { 425 s.error("malformed octal constant") 426 } 427 goto done 428 } 429 430 } else { 431 // decimal or float 432 for isDigit(c) { 433 c = s.getr() 434 } 435 } 436 } 437 438 // float 439 if c == '.' { 440 s.kind = FloatLit 441 c = s.getr() 442 for isDigit(c) { 443 c = s.getr() 444 } 445 } 446 447 // exponent 448 if c == 'e' || c == 'E' { 449 s.kind = FloatLit 450 c = s.getr() 451 if c == '-' || c == '+' { 452 c = s.getr() 453 } 454 if !isDigit(c) { 455 s.error("malformed floating-point constant exponent") 456 } 457 for isDigit(c) { 458 c = s.getr() 459 } 460 } 461 462 // complex 463 if c == 'i' { 464 s.kind = ImagLit 465 s.getr() 466 } 467 468 done: 469 s.ungetr() 470 s.nlsemi = true 471 s.lit = string(s.stopLit()) 472 s.tok = _Literal 473 } 474 475 func (s *scanner) rune() { 476 s.startLit() 477 478 ok := true // only report errors if we're ok so far 479 n := 0 480 for ; ; n++ { 481 r := s.getr() 482 if r == '\'' { 483 break 484 } 485 if r == '\\' { 486 if !s.escape('\'') { 487 ok = false 488 } 489 continue 490 } 491 if r == '\n' { 492 s.ungetr() // assume newline is not part of literal 493 if ok { 494 s.error("newline in character literal") 495 ok = false 496 } 497 break 498 } 499 if r < 0 { 500 if ok { 501 s.errh(s.line, s.col, "invalid character literal (missing closing ')") 502 ok = false 503 } 504 break 505 } 506 } 507 508 if ok { 509 if n == 0 { 510 s.error("empty character literal or unescaped ' in character literal") 511 } else if n != 1 { 512 s.errh(s.line, s.col, "invalid character literal (more than one character)") 513 } 514 } 515 516 s.nlsemi = true 517 s.lit = string(s.stopLit()) 518 s.kind = RuneLit 519 s.tok = _Literal 520 } 521 522 func (s *scanner) stdString() { 523 s.startLit() 524 525 for { 526 r := s.getr() 527 if r == '"' { 528 break 529 } 530 if r == '\\' { 531 s.escape('"') 532 continue 533 } 534 if r == '\n' { 535 s.ungetr() // assume newline is not part of literal 536 s.error("newline in string") 537 break 538 } 539 if r < 0 { 540 s.errh(s.line, s.col, "string not terminated") 541 break 542 } 543 } 544 545 s.nlsemi = true 546 s.lit = string(s.stopLit()) 547 s.kind = StringLit 548 s.tok = _Literal 549 } 550 551 func (s *scanner) rawString() { 552 s.startLit() 553 554 for { 555 r := s.getr() 556 if r == '`' { 557 break 558 } 559 if r < 0 { 560 s.errh(s.line, s.col, "string not terminated") 561 break 562 } 563 } 564 // We leave CRs in the string since they are part of the 565 // literal (even though they are not part of the literal 566 // value). 567 568 s.nlsemi = true 569 s.lit = string(s.stopLit()) 570 s.kind = StringLit 571 s.tok = _Literal 572 } 573 574 func (s *scanner) skipLine(r rune) { 575 for r >= 0 { 576 if r == '\n' { 577 s.ungetr() // don't consume '\n' - needed for nlsemi logic 578 break 579 } 580 r = s.getr() 581 } 582 } 583 584 func (s *scanner) lineComment() { 585 r := s.getr() 586 // directives must start at the beginning of the line (s.col == colbase) 587 if s.col != colbase || s.pragh == nil || (r != 'g' && r != 'l') { 588 s.skipLine(r) 589 return 590 } 591 // s.col == colbase && s.pragh != nil && (r == 'g' || r == 'l') 592 593 // recognize directives 594 prefix := "go:" 595 if r == 'l' { 596 prefix = "line " 597 } 598 for _, m := range prefix { 599 if r != m { 600 s.skipLine(r) 601 return 602 } 603 r = s.getr() 604 } 605 606 // directive text without line ending (which may be "\r\n" if Windows), 607 s.startLit() 608 s.skipLine(r) 609 text := s.stopLit() 610 if i := len(text) - 1; i >= 0 && text[i] == '\r' { 611 text = text[:i] 612 } 613 614 s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since directive text starts after // 615 } 616 617 func (s *scanner) fullComment() { 618 for { 619 r := s.getr() 620 for r == '*' { 621 r = s.getr() 622 if r == '/' { 623 return 624 } 625 } 626 if r < 0 { 627 s.errh(s.line, s.col, "comment not terminated") 628 return 629 } 630 } 631 } 632 633 func (s *scanner) escape(quote rune) bool { 634 var n int 635 var base, max uint32 636 637 c := s.getr() 638 switch c { 639 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 640 return true 641 case '0', '1', '2', '3', '4', '5', '6', '7': 642 n, base, max = 3, 8, 255 643 case 'x': 644 c = s.getr() 645 n, base, max = 2, 16, 255 646 case 'u': 647 c = s.getr() 648 n, base, max = 4, 16, unicode.MaxRune 649 case 'U': 650 c = s.getr() 651 n, base, max = 8, 16, unicode.MaxRune 652 default: 653 if c < 0 { 654 return true // complain in caller about EOF 655 } 656 s.error("unknown escape sequence") 657 return false 658 } 659 660 var x uint32 661 for i := n; i > 0; i-- { 662 d := base 663 switch { 664 case isDigit(c): 665 d = uint32(c) - '0' 666 case 'a' <= c && c <= 'f': 667 d = uint32(c) - ('a' - 10) 668 case 'A' <= c && c <= 'F': 669 d = uint32(c) - ('A' - 10) 670 } 671 if d >= base { 672 if c < 0 { 673 return true // complain in caller about EOF 674 } 675 kind := "hex" 676 if base == 8 { 677 kind = "octal" 678 } 679 s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c)) 680 s.ungetr() 681 return false 682 } 683 // d < base 684 x = x*base + d 685 c = s.getr() 686 } 687 s.ungetr() 688 689 if x > max && base == 8 { 690 s.error(fmt.Sprintf("octal escape value > 255: %d", x)) 691 return false 692 } 693 694 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { 695 s.error("escape sequence is invalid Unicode code point") 696 return false 697 } 698 699 return true 700 }