github.com/rakyll/go@v0.0.0-20170216000551-64c02460d703/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements scanner, a lexical tokenizer for 6 // Go source. After initialization, consecutive calls of 7 // next advance the scanner one token at a time. 8 // 9 // This file, source.go, and tokens.go are self-contained 10 // (go tool compile scanner.go source.go tokens.go compiles) 11 // and thus could be made into its own package. 12 13 package syntax 14 15 import ( 16 "fmt" 17 "io" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 type scanner struct { 23 source 24 pragh func(line, col uint, msg string) 25 nlsemi bool // if set '\n' and EOF translate to ';' 26 27 // current token, valid after calling next() 28 line, col uint 29 tok token 30 lit string // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF") 31 kind LitKind // valid if tok is _Literal 32 op Operator // valid if tok is _Operator, _AssignOp, or _IncOp 33 prec int // valid if tok is _Operator, _AssignOp, or _IncOp 34 } 35 36 func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) { 37 s.source.init(src, errh) 38 s.pragh = pragh 39 s.nlsemi = false 40 } 41 42 // next advances the scanner by reading the next token. 43 // 44 // If a read, source encoding, or lexical error occurs, next 45 // calls the error handler installed with init. The handler 46 // must exist. 47 // 48 // If a //line or //go: directive is encountered at the start 49 // of a line, next calls the directive handler pragh installed 50 // with init, if not nil. 51 // 52 // The (line, col) position passed to the error and directive 53 // handler is always at or after the current source reading 54 // position. 55 func (s *scanner) next() { 56 nlsemi := s.nlsemi 57 s.nlsemi = false 58 59 redo: 60 // skip white space 61 c := s.getr() 62 for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { 63 c = s.getr() 64 } 65 66 // token start 67 s.line, s.col = s.source.line0, s.source.col0 68 69 if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) { 70 s.ident() 71 return 72 } 73 74 switch c { 75 case -1: 76 if nlsemi { 77 s.lit = "EOF" 78 s.tok = _Semi 79 break 80 } 81 s.tok = _EOF 82 83 case '\n': 84 s.lit = "newline" 85 s.tok = _Semi 86 87 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 88 s.number(c) 89 90 case '"': 91 s.stdString() 92 93 case '`': 94 s.rawString() 95 96 case '\'': 97 s.rune() 98 99 case '(': 100 s.tok = _Lparen 101 102 case '[': 103 s.tok = _Lbrack 104 105 case '{': 106 s.tok = _Lbrace 107 108 case ',': 109 s.tok = _Comma 110 111 case ';': 112 s.lit = "semicolon" 113 s.tok = _Semi 114 115 case ')': 116 s.nlsemi = true 117 s.tok = _Rparen 118 119 case ']': 120 s.nlsemi = true 121 s.tok = _Rbrack 122 123 case '}': 124 s.nlsemi = true 125 s.tok = _Rbrace 126 127 case ':': 128 if s.getr() == '=' { 129 s.tok = _Define 130 break 131 } 132 s.ungetr() 133 s.tok = _Colon 134 135 case '.': 136 c = s.getr() 137 if isDigit(c) { 138 s.ungetr2() 139 s.number('.') 140 break 141 } 142 if c == '.' { 143 c = s.getr() 144 if c == '.' { 145 s.tok = _DotDotDot 146 break 147 } 148 s.ungetr2() 149 } 150 s.ungetr() 151 s.tok = _Dot 152 153 case '+': 154 s.op, s.prec = Add, precAdd 155 c = s.getr() 156 if c != '+' { 157 goto assignop 158 } 159 s.nlsemi = true 160 s.tok = _IncOp 161 162 case '-': 163 s.op, s.prec = Sub, precAdd 164 c = s.getr() 165 if c != '-' { 166 goto assignop 167 } 168 s.nlsemi = true 169 s.tok = _IncOp 170 171 case '*': 172 s.op, s.prec = Mul, precMul 173 // don't goto assignop - want _Star token 174 if s.getr() == '=' { 175 s.tok = _AssignOp 176 break 177 } 178 s.ungetr() 179 s.tok = _Star 180 181 case '/': 182 c = s.getr() 183 if c == '/' { 184 s.lineComment() 185 goto redo 186 } 187 if c == '*' { 188 s.fullComment() 189 if s.source.line > s.line && nlsemi { 190 // A multi-line comment acts like a newline; 191 // it translates to a ';' if nlsemi is set. 192 s.lit = "newline" 193 s.tok = _Semi 194 break 195 } 196 goto redo 197 } 198 s.op, s.prec = Div, precMul 199 goto assignop 200 201 case '%': 202 s.op, s.prec = Rem, precMul 203 c = s.getr() 204 goto assignop 205 206 case '&': 207 c = s.getr() 208 if c == '&' { 209 s.op, s.prec = AndAnd, precAndAnd 210 s.tok = _Operator 211 break 212 } 213 s.op, s.prec = And, precMul 214 if c == '^' { 215 s.op = AndNot 216 c = s.getr() 217 } 218 goto assignop 219 220 case '|': 221 c = s.getr() 222 if c == '|' { 223 s.op, s.prec = OrOr, precOrOr 224 s.tok = _Operator 225 break 226 } 227 s.op, s.prec = Or, precAdd 228 goto assignop 229 230 case '~': 231 s.error("bitwise complement operator is ^") 232 fallthrough 233 234 case '^': 235 s.op, s.prec = Xor, precAdd 236 c = s.getr() 237 goto assignop 238 239 case '<': 240 c = s.getr() 241 if c == '=' { 242 s.op, s.prec = Leq, precCmp 243 s.tok = _Operator 244 break 245 } 246 if c == '<' { 247 s.op, s.prec = Shl, precMul 248 c = s.getr() 249 goto assignop 250 } 251 if c == '-' { 252 s.tok = _Arrow 253 break 254 } 255 s.ungetr() 256 s.op, s.prec = Lss, precCmp 257 s.tok = _Operator 258 259 case '>': 260 c = s.getr() 261 if c == '=' { 262 s.op, s.prec = Geq, precCmp 263 s.tok = _Operator 264 break 265 } 266 if c == '>' { 267 s.op, s.prec = Shr, precMul 268 c = s.getr() 269 goto assignop 270 } 271 s.ungetr() 272 s.op, s.prec = Gtr, precCmp 273 s.tok = _Operator 274 275 case '=': 276 if s.getr() == '=' { 277 s.op, s.prec = Eql, precCmp 278 s.tok = _Operator 279 break 280 } 281 s.ungetr() 282 s.tok = _Assign 283 284 case '!': 285 if s.getr() == '=' { 286 s.op, s.prec = Neq, precCmp 287 s.tok = _Operator 288 break 289 } 290 s.ungetr() 291 s.op, s.prec = Not, 0 292 s.tok = _Operator 293 294 default: 295 s.tok = 0 296 s.error(fmt.Sprintf("invalid character %#U", c)) 297 goto redo 298 } 299 300 return 301 302 assignop: 303 if c == '=' { 304 s.tok = _AssignOp 305 return 306 } 307 s.ungetr() 308 s.tok = _Operator 309 } 310 311 func isLetter(c rune) bool { 312 return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' 313 } 314 315 func isDigit(c rune) bool { 316 return '0' <= c && c <= '9' 317 } 318 319 func (s *scanner) ident() { 320 s.startLit() 321 322 // accelerate common case (7bit ASCII) 323 c := s.getr() 324 for isLetter(c) || isDigit(c) { 325 c = s.getr() 326 } 327 328 // general case 329 if c >= utf8.RuneSelf { 330 for s.isIdentRune(c, false) { 331 c = s.getr() 332 } 333 } 334 s.ungetr() 335 336 lit := s.stopLit() 337 338 // possibly a keyword 339 if len(lit) >= 2 { 340 if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) { 341 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) 342 s.tok = tok 343 return 344 } 345 } 346 347 s.nlsemi = true 348 s.lit = string(lit) 349 s.tok = _Name 350 } 351 352 func (s *scanner) isIdentRune(c rune, first bool) bool { 353 switch { 354 case unicode.IsLetter(c) || c == '_': 355 // ok 356 case unicode.IsDigit(c): 357 if first { 358 s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c)) 359 } 360 case c >= utf8.RuneSelf: 361 s.error(fmt.Sprintf("invalid identifier character %#U", c)) 362 default: 363 return false 364 } 365 return true 366 } 367 368 // hash is a perfect hash function for keywords. 369 // It assumes that s has at least length 2. 370 func hash(s []byte) uint { 371 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) 372 } 373 374 var keywordMap [1 << 6]token // size must be power of two 375 376 func init() { 377 // populate keywordMap 378 for tok := _Break; tok <= _Var; tok++ { 379 h := hash([]byte(tokstrings[tok])) 380 if keywordMap[h] != 0 { 381 panic("imperfect hash") 382 } 383 keywordMap[h] = tok 384 } 385 } 386 387 func (s *scanner) number(c rune) { 388 s.startLit() 389 390 if c != '.' { 391 s.kind = IntLit // until proven otherwise 392 if c == '0' { 393 c = s.getr() 394 if c == 'x' || c == 'X' { 395 // hex 396 c = s.getr() 397 hasDigit := false 398 for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { 399 c = s.getr() 400 hasDigit = true 401 } 402 if !hasDigit { 403 s.error("malformed hex constant") 404 } 405 goto done 406 } 407 408 // decimal 0, octal, or float 409 has8or9 := false 410 for isDigit(c) { 411 if c > '7' { 412 has8or9 = true 413 } 414 c = s.getr() 415 } 416 if c != '.' && c != 'e' && c != 'E' && c != 'i' { 417 // octal 418 if has8or9 { 419 s.error("malformed octal constant") 420 } 421 goto done 422 } 423 424 } else { 425 // decimal or float 426 for isDigit(c) { 427 c = s.getr() 428 } 429 } 430 } 431 432 // float 433 if c == '.' { 434 s.kind = FloatLit 435 c = s.getr() 436 for isDigit(c) { 437 c = s.getr() 438 } 439 } 440 441 // exponent 442 if c == 'e' || c == 'E' { 443 s.kind = FloatLit 444 c = s.getr() 445 if c == '-' || c == '+' { 446 c = s.getr() 447 } 448 if !isDigit(c) { 449 s.error("malformed floating-point constant exponent") 450 } 451 for isDigit(c) { 452 c = s.getr() 453 } 454 } 455 456 // complex 457 if c == 'i' { 458 s.kind = ImagLit 459 s.getr() 460 } 461 462 done: 463 s.ungetr() 464 s.nlsemi = true 465 s.lit = string(s.stopLit()) 466 s.tok = _Literal 467 } 468 469 func (s *scanner) stdString() { 470 s.startLit() 471 472 for { 473 r := s.getr() 474 if r == '"' { 475 break 476 } 477 if r == '\\' { 478 s.escape('"') 479 continue 480 } 481 if r == '\n' { 482 s.ungetr() // assume newline is not part of literal 483 s.error("newline in string") 484 break 485 } 486 if r < 0 { 487 s.errh(s.line, s.col, "string not terminated") 488 break 489 } 490 } 491 492 s.nlsemi = true 493 s.lit = string(s.stopLit()) 494 s.kind = StringLit 495 s.tok = _Literal 496 } 497 498 func (s *scanner) rawString() { 499 s.startLit() 500 501 for { 502 r := s.getr() 503 if r == '`' { 504 break 505 } 506 if r < 0 { 507 s.errh(s.line, s.col, "string not terminated") 508 break 509 } 510 } 511 // We leave CRs in the string since they are part of the 512 // literal (even though they are not part of the literal 513 // value). 514 515 s.nlsemi = true 516 s.lit = string(s.stopLit()) 517 s.kind = StringLit 518 s.tok = _Literal 519 } 520 521 func (s *scanner) rune() { 522 s.startLit() 523 524 r := s.getr() 525 ok := false 526 if r == '\'' { 527 s.error("empty character literal or unescaped ' in character literal") 528 } else if r == '\n' { 529 s.ungetr() // assume newline is not part of literal 530 s.error("newline in character literal") 531 } else { 532 ok = true 533 if r == '\\' { 534 ok = s.escape('\'') 535 } 536 } 537 538 r = s.getr() 539 if r != '\'' { 540 // only report error if we're ok so far 541 if ok { 542 s.error("missing '") 543 } 544 s.ungetr() 545 } 546 547 s.nlsemi = true 548 s.lit = string(s.stopLit()) 549 s.kind = RuneLit 550 s.tok = _Literal 551 } 552 553 func (s *scanner) skipLine(r rune) { 554 for r >= 0 { 555 if r == '\n' { 556 s.ungetr() // don't consume '\n' - needed for nlsemi logic 557 break 558 } 559 r = s.getr() 560 } 561 } 562 563 func (s *scanner) lineComment() { 564 r := s.getr() 565 // directives must start at the beginning of the line (s.col == 0) 566 if s.col != 0 || s.pragh == nil || (r != 'g' && r != 'l') { 567 s.skipLine(r) 568 return 569 } 570 // s.col == 0 && s.pragh != nil && (r == 'g' || r == 'l') 571 572 // recognize directives 573 prefix := "go:" 574 if r == 'l' { 575 prefix = "line " 576 } 577 for _, m := range prefix { 578 if r != m { 579 s.skipLine(r) 580 return 581 } 582 r = s.getr() 583 } 584 585 // directive text without line ending (which may be "\r\n" if Windows), 586 s.startLit() 587 s.skipLine(r) 588 text := s.stopLit() 589 if i := len(text) - 1; i >= 0 && text[i] == '\r' { 590 text = text[:i] 591 } 592 593 s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since directive text starts after // 594 } 595 596 func (s *scanner) fullComment() { 597 for { 598 r := s.getr() 599 for r == '*' { 600 r = s.getr() 601 if r == '/' { 602 return 603 } 604 } 605 if r < 0 { 606 s.errh(s.line, s.col, "comment not terminated") 607 return 608 } 609 } 610 } 611 612 func (s *scanner) escape(quote rune) bool { 613 var n int 614 var base, max uint32 615 616 c := s.getr() 617 switch c { 618 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 619 return true 620 case '0', '1', '2', '3', '4', '5', '6', '7': 621 n, base, max = 3, 8, 255 622 case 'x': 623 c = s.getr() 624 n, base, max = 2, 16, 255 625 case 'u': 626 c = s.getr() 627 n, base, max = 4, 16, unicode.MaxRune 628 case 'U': 629 c = s.getr() 630 n, base, max = 8, 16, unicode.MaxRune 631 default: 632 if c < 0 { 633 return true // complain in caller about EOF 634 } 635 s.error("unknown escape sequence") 636 return false 637 } 638 639 var x uint32 640 for i := n; i > 0; i-- { 641 d := base 642 switch { 643 case isDigit(c): 644 d = uint32(c) - '0' 645 case 'a' <= c && c <= 'f': 646 d = uint32(c) - ('a' - 10) 647 case 'A' <= c && c <= 'F': 648 d = uint32(c) - ('A' - 10) 649 } 650 if d >= base { 651 if c < 0 { 652 return true // complain in caller about EOF 653 } 654 kind := "hex" 655 if base == 8 { 656 kind = "octal" 657 } 658 s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c)) 659 s.ungetr() 660 return false 661 } 662 // d < base 663 x = x*base + d 664 c = s.getr() 665 } 666 s.ungetr() 667 668 if x > max && base == 8 { 669 s.error(fmt.Sprintf("octal escape value > 255: %d", x)) 670 return false 671 } 672 673 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { 674 s.error("escape sequence is invalid Unicode code point") 675 return false 676 } 677 678 return true 679 }