github.com/tidwall/go@v0.0.0-20170415222209-6694a6888b7d/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements scanner, a lexical tokenizer for 6 // Go source. After initialization, consecutive calls of 7 // next advance the scanner one token at a time. 8 // 9 // This file, source.go, and tokens.go are self-contained 10 // (go tool compile scanner.go source.go tokens.go compiles) 11 // and thus could be made into its own package. 12 13 package syntax 14 15 import ( 16 "fmt" 17 "io" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 type scanner struct { 23 source 24 pragh func(line, col uint, msg string) 25 nlsemi bool // if set '\n' and EOF translate to ';' 26 27 // current token, valid after calling next() 28 line, col uint 29 tok token 30 lit string // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF") 31 kind LitKind // valid if tok is _Literal 32 op Operator // valid if tok is _Operator, _AssignOp, or _IncOp 33 prec int // valid if tok is _Operator, _AssignOp, or _IncOp 34 } 35 36 func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) { 37 s.source.init(src, errh) 38 s.pragh = pragh 39 s.nlsemi = false 40 } 41 42 // next advances the scanner by reading the next token. 43 // 44 // If a read, source encoding, or lexical error occurs, next 45 // calls the error handler installed with init. The handler 46 // must exist. 47 // 48 // If a //line or //go: directive is encountered at the start 49 // of a line, next calls the directive handler pragh installed 50 // with init, if not nil. 51 // 52 // The (line, col) position passed to the error and directive 53 // handler is always at or after the current source reading 54 // position. 55 func (s *scanner) next() { 56 nlsemi := s.nlsemi 57 s.nlsemi = false 58 59 redo: 60 // skip white space 61 c := s.getr() 62 for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { 63 c = s.getr() 64 } 65 66 // token start 67 s.line, s.col = s.source.line0, s.source.col0 68 69 if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) { 70 s.ident() 71 return 72 } 73 74 switch c { 75 case -1: 76 if nlsemi { 77 s.lit = "EOF" 78 s.tok = _Semi 79 break 80 } 81 s.tok = _EOF 82 83 case '\n': 84 s.lit = "newline" 85 s.tok = _Semi 86 87 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 88 s.number(c) 89 90 case '"': 91 s.stdString() 92 93 case '`': 94 s.rawString() 95 96 case '\'': 97 s.rune() 98 99 case '(': 100 s.tok = _Lparen 101 102 case '[': 103 s.tok = _Lbrack 104 105 case '{': 106 s.tok = _Lbrace 107 108 case ',': 109 s.tok = _Comma 110 111 case ';': 112 s.lit = "semicolon" 113 s.tok = _Semi 114 115 case ')': 116 s.nlsemi = true 117 s.tok = _Rparen 118 119 case ']': 120 s.nlsemi = true 121 s.tok = _Rbrack 122 123 case '}': 124 s.nlsemi = true 125 s.tok = _Rbrace 126 127 case ':': 128 if s.getr() == '=' { 129 s.tok = _Define 130 break 131 } 132 s.ungetr() 133 s.tok = _Colon 134 135 case '.': 136 c = s.getr() 137 if isDigit(c) { 138 s.ungetr2() 139 s.number('.') 140 break 141 } 142 if c == '.' { 143 c = s.getr() 144 if c == '.' { 145 s.tok = _DotDotDot 146 break 147 } 148 s.ungetr2() 149 } 150 s.ungetr() 151 s.tok = _Dot 152 153 case '+': 154 s.op, s.prec = Add, precAdd 155 c = s.getr() 156 if c != '+' { 157 goto assignop 158 } 159 s.nlsemi = true 160 s.tok = _IncOp 161 162 case '-': 163 s.op, s.prec = Sub, precAdd 164 c = s.getr() 165 if c != '-' { 166 goto assignop 167 } 168 s.nlsemi = true 169 s.tok = _IncOp 170 171 case '*': 172 s.op, s.prec = Mul, precMul 173 // don't goto assignop - want _Star token 174 if s.getr() == '=' { 175 s.tok = _AssignOp 176 break 177 } 178 s.ungetr() 179 s.tok = _Star 180 181 case '/': 182 c = s.getr() 183 if c == '/' { 184 s.lineComment() 185 goto redo 186 } 187 if c == '*' { 188 s.fullComment() 189 if s.source.line > s.line && nlsemi { 190 // A multi-line comment acts like a newline; 191 // it translates to a ';' if nlsemi is set. 192 s.lit = "newline" 193 s.tok = _Semi 194 break 195 } 196 goto redo 197 } 198 s.op, s.prec = Div, precMul 199 goto assignop 200 201 case '%': 202 s.op, s.prec = Rem, precMul 203 c = s.getr() 204 goto assignop 205 206 case '&': 207 c = s.getr() 208 if c == '&' { 209 s.op, s.prec = AndAnd, precAndAnd 210 s.tok = _Operator 211 break 212 } 213 s.op, s.prec = And, precMul 214 if c == '^' { 215 s.op = AndNot 216 c = s.getr() 217 } 218 goto assignop 219 220 case '|': 221 c = s.getr() 222 if c == '|' { 223 s.op, s.prec = OrOr, precOrOr 224 s.tok = _Operator 225 break 226 } 227 s.op, s.prec = Or, precAdd 228 goto assignop 229 230 case '~': 231 s.error("bitwise complement operator is ^") 232 fallthrough 233 234 case '^': 235 s.op, s.prec = Xor, precAdd 236 c = s.getr() 237 goto assignop 238 239 case '<': 240 c = s.getr() 241 if c == '=' { 242 s.op, s.prec = Leq, precCmp 243 s.tok = _Operator 244 break 245 } 246 if c == '<' { 247 s.op, s.prec = Shl, precMul 248 c = s.getr() 249 goto assignop 250 } 251 if c == '-' { 252 s.tok = _Arrow 253 break 254 } 255 s.ungetr() 256 s.op, s.prec = Lss, precCmp 257 s.tok = _Operator 258 259 case '>': 260 c = s.getr() 261 if c == '=' { 262 s.op, s.prec = Geq, precCmp 263 s.tok = _Operator 264 break 265 } 266 if c == '>' { 267 s.op, s.prec = Shr, precMul 268 c = s.getr() 269 goto assignop 270 } 271 s.ungetr() 272 s.op, s.prec = Gtr, precCmp 273 s.tok = _Operator 274 275 case '=': 276 if s.getr() == '=' { 277 s.op, s.prec = Eql, precCmp 278 s.tok = _Operator 279 break 280 } 281 s.ungetr() 282 s.tok = _Assign 283 284 case '!': 285 if s.getr() == '=' { 286 s.op, s.prec = Neq, precCmp 287 s.tok = _Operator 288 break 289 } 290 s.ungetr() 291 s.op, s.prec = Not, 0 292 s.tok = _Operator 293 294 default: 295 s.tok = 0 296 s.error(fmt.Sprintf("invalid character %#U", c)) 297 goto redo 298 } 299 300 return 301 302 assignop: 303 if c == '=' { 304 s.tok = _AssignOp 305 return 306 } 307 s.ungetr() 308 s.tok = _Operator 309 } 310 311 func isLetter(c rune) bool { 312 return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' 313 } 314 315 func isDigit(c rune) bool { 316 return '0' <= c && c <= '9' 317 } 318 319 func (s *scanner) ident() { 320 s.startLit() 321 322 // accelerate common case (7bit ASCII) 323 c := s.getr() 324 for isLetter(c) || isDigit(c) { 325 c = s.getr() 326 } 327 328 // general case 329 if c >= utf8.RuneSelf { 330 for s.isIdentRune(c, false) { 331 c = s.getr() 332 } 333 } 334 s.ungetr() 335 336 lit := s.stopLit() 337 338 // possibly a keyword 339 if len(lit) >= 2 { 340 if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) { 341 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) 342 s.tok = tok 343 return 344 } 345 } 346 347 s.nlsemi = true 348 s.lit = string(lit) 349 s.tok = _Name 350 } 351 352 func (s *scanner) isIdentRune(c rune, first bool) bool { 353 switch { 354 case unicode.IsLetter(c) || c == '_': 355 // ok 356 case unicode.IsDigit(c): 357 if first { 358 s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c)) 359 } 360 case c >= utf8.RuneSelf: 361 s.error(fmt.Sprintf("invalid identifier character %#U", c)) 362 default: 363 return false 364 } 365 return true 366 } 367 368 // hash is a perfect hash function for keywords. 369 // It assumes that s has at least length 2. 370 func hash(s []byte) uint { 371 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) 372 } 373 374 var keywordMap [1 << 6]token // size must be power of two 375 376 func init() { 377 // populate keywordMap 378 for tok := _Break; tok <= _Var; tok++ { 379 h := hash([]byte(tokstrings[tok])) 380 if keywordMap[h] != 0 { 381 panic("imperfect hash") 382 } 383 keywordMap[h] = tok 384 } 385 } 386 387 func (s *scanner) number(c rune) { 388 s.startLit() 389 390 if c != '.' { 391 s.kind = IntLit // until proven otherwise 392 if c == '0' { 393 c = s.getr() 394 if c == 'x' || c == 'X' { 395 // hex 396 c = s.getr() 397 hasDigit := false 398 for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { 399 c = s.getr() 400 hasDigit = true 401 } 402 if !hasDigit { 403 s.error("malformed hex constant") 404 } 405 goto done 406 } 407 408 // decimal 0, octal, or float 409 has8or9 := false 410 for isDigit(c) { 411 if c > '7' { 412 has8or9 = true 413 } 414 c = s.getr() 415 } 416 if c != '.' && c != 'e' && c != 'E' && c != 'i' { 417 // octal 418 if has8or9 { 419 s.error("malformed octal constant") 420 } 421 goto done 422 } 423 424 } else { 425 // decimal or float 426 for isDigit(c) { 427 c = s.getr() 428 } 429 } 430 } 431 432 // float 433 if c == '.' { 434 s.kind = FloatLit 435 c = s.getr() 436 for isDigit(c) { 437 c = s.getr() 438 } 439 } 440 441 // exponent 442 if c == 'e' || c == 'E' { 443 s.kind = FloatLit 444 c = s.getr() 445 if c == '-' || c == '+' { 446 c = s.getr() 447 } 448 if !isDigit(c) { 449 s.error("malformed floating-point constant exponent") 450 } 451 for isDigit(c) { 452 c = s.getr() 453 } 454 } 455 456 // complex 457 if c == 'i' { 458 s.kind = ImagLit 459 s.getr() 460 } 461 462 done: 463 s.ungetr() 464 s.nlsemi = true 465 s.lit = string(s.stopLit()) 466 s.tok = _Literal 467 } 468 469 func (s *scanner) rune() { 470 s.startLit() 471 472 ok := true // only report errors if we're ok so far 473 n := 0 474 for ; ; n++ { 475 r := s.getr() 476 if r == '\'' { 477 break 478 } 479 if r == '\\' { 480 if !s.escape('\'') { 481 ok = false 482 } 483 continue 484 } 485 if r == '\n' { 486 s.ungetr() // assume newline is not part of literal 487 if ok { 488 s.error("newline in character literal") 489 ok = false 490 } 491 break 492 } 493 if r < 0 { 494 if ok { 495 s.errh(s.line, s.col, "invalid character literal (missing closing ')") 496 ok = false 497 } 498 break 499 } 500 } 501 502 if ok { 503 if n == 0 { 504 s.error("empty character literal or unescaped ' in character literal") 505 } else if n != 1 { 506 s.errh(s.line, s.col, "invalid character literal (more than one character)") 507 } 508 } 509 510 s.nlsemi = true 511 s.lit = string(s.stopLit()) 512 s.kind = RuneLit 513 s.tok = _Literal 514 } 515 516 func (s *scanner) stdString() { 517 s.startLit() 518 519 for { 520 r := s.getr() 521 if r == '"' { 522 break 523 } 524 if r == '\\' { 525 s.escape('"') 526 continue 527 } 528 if r == '\n' { 529 s.ungetr() // assume newline is not part of literal 530 s.error("newline in string") 531 break 532 } 533 if r < 0 { 534 s.errh(s.line, s.col, "string not terminated") 535 break 536 } 537 } 538 539 s.nlsemi = true 540 s.lit = string(s.stopLit()) 541 s.kind = StringLit 542 s.tok = _Literal 543 } 544 545 func (s *scanner) rawString() { 546 s.startLit() 547 548 for { 549 r := s.getr() 550 if r == '`' { 551 break 552 } 553 if r < 0 { 554 s.errh(s.line, s.col, "string not terminated") 555 break 556 } 557 } 558 // We leave CRs in the string since they are part of the 559 // literal (even though they are not part of the literal 560 // value). 561 562 s.nlsemi = true 563 s.lit = string(s.stopLit()) 564 s.kind = StringLit 565 s.tok = _Literal 566 } 567 568 func (s *scanner) skipLine(r rune) { 569 for r >= 0 { 570 if r == '\n' { 571 s.ungetr() // don't consume '\n' - needed for nlsemi logic 572 break 573 } 574 r = s.getr() 575 } 576 } 577 578 func (s *scanner) lineComment() { 579 r := s.getr() 580 // directives must start at the beginning of the line (s.col == colbase) 581 if s.col != colbase || s.pragh == nil || (r != 'g' && r != 'l') { 582 s.skipLine(r) 583 return 584 } 585 // s.col == colbase && s.pragh != nil && (r == 'g' || r == 'l') 586 587 // recognize directives 588 prefix := "go:" 589 if r == 'l' { 590 prefix = "line " 591 } 592 for _, m := range prefix { 593 if r != m { 594 s.skipLine(r) 595 return 596 } 597 r = s.getr() 598 } 599 600 // directive text without line ending (which may be "\r\n" if Windows), 601 s.startLit() 602 s.skipLine(r) 603 text := s.stopLit() 604 if i := len(text) - 1; i >= 0 && text[i] == '\r' { 605 text = text[:i] 606 } 607 608 s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since directive text starts after // 609 } 610 611 func (s *scanner) fullComment() { 612 for { 613 r := s.getr() 614 for r == '*' { 615 r = s.getr() 616 if r == '/' { 617 return 618 } 619 } 620 if r < 0 { 621 s.errh(s.line, s.col, "comment not terminated") 622 return 623 } 624 } 625 } 626 627 func (s *scanner) escape(quote rune) bool { 628 var n int 629 var base, max uint32 630 631 c := s.getr() 632 switch c { 633 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 634 return true 635 case '0', '1', '2', '3', '4', '5', '6', '7': 636 n, base, max = 3, 8, 255 637 case 'x': 638 c = s.getr() 639 n, base, max = 2, 16, 255 640 case 'u': 641 c = s.getr() 642 n, base, max = 4, 16, unicode.MaxRune 643 case 'U': 644 c = s.getr() 645 n, base, max = 8, 16, unicode.MaxRune 646 default: 647 if c < 0 { 648 return true // complain in caller about EOF 649 } 650 s.error("unknown escape sequence") 651 return false 652 } 653 654 var x uint32 655 for i := n; i > 0; i-- { 656 d := base 657 switch { 658 case isDigit(c): 659 d = uint32(c) - '0' 660 case 'a' <= c && c <= 'f': 661 d = uint32(c) - ('a' - 10) 662 case 'A' <= c && c <= 'F': 663 d = uint32(c) - ('A' - 10) 664 } 665 if d >= base { 666 if c < 0 { 667 return true // complain in caller about EOF 668 } 669 kind := "hex" 670 if base == 8 { 671 kind = "octal" 672 } 673 s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c)) 674 s.ungetr() 675 return false 676 } 677 // d < base 678 x = x*base + d 679 c = s.getr() 680 } 681 s.ungetr() 682 683 if x > max && base == 8 { 684 s.error(fmt.Sprintf("octal escape value > 255: %d", x)) 685 return false 686 } 687 688 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { 689 s.error("escape sequence is invalid Unicode code point") 690 return false 691 } 692 693 return true 694 }