github.com/epfl-dcsl/gotee@v0.0.0-20200909122901-014b35f5e5e9/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements scanner, a lexical tokenizer for 6 // Go source. After initialization, consecutive calls of 7 // next advance the scanner one token at a time. 8 // 9 // This file, source.go, and tokens.go are self-contained 10 // (go tool compile scanner.go source.go tokens.go compiles) 11 // and thus could be made into its own package. 12 13 package syntax 14 15 import ( 16 "fmt" 17 "io" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 type scanner struct { 23 source 24 pragh func(line, col uint, msg string) 25 nlsemi bool // if set '\n' and EOF translate to ';' 26 27 // current token, valid after calling next() 28 line, col uint 29 tok token 30 lit string // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF") 31 kind LitKind // valid if tok is _Literal 32 op Operator // valid if tok is _Operator, _AssignOp, or _IncOp 33 prec int // valid if tok is _Operator, _AssignOp, or _IncOp 34 } 35 36 func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) { 37 s.source.init(src, errh) 38 s.pragh = pragh 39 s.nlsemi = false 40 } 41 42 // next advances the scanner by reading the next token. 43 // 44 // If a read, source encoding, or lexical error occurs, next 45 // calls the error handler installed with init. The handler 46 // must exist. 47 // 48 // If a //line or //go: directive is encountered at the start 49 // of a line, next calls the directive handler pragh installed 50 // with init, if not nil. 51 // 52 // The (line, col) position passed to the error and directive 53 // handler is always at or after the current source reading 54 // position. 
func (s *scanner) next() {
	// Capture the newline-to-semicolon request left by the previous token
	// and clear it; the token cases below set it again where needed.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	// (a '\n' is only skipped when no semicolon translation is pending)
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	s.line, s.col = s.source.line0, s.source.col0

	if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// end of input: reported as a ';' first if one is pending
		if nlsemi {
			s.lit = "EOF"
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached when nlsemi is set (otherwise skipped above)
		s.lit = "newline"
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.lit = "semicolon"
		s.tok = _Semi

	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// one of: a number starting with '.', "...", or a plain '.'
		c = s.getr()
		if isDigit(c) {
			s.ungetr2()
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			s.ungetr2()
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		c = s.getr()
		if c == '/' {
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.lit = "newline"
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '~':
		// report the error, then continue scanning as if '^' had been written
		s.error("bitwise complement operator is ^")
		fallthrough

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// report the offending character and scan again from the next one
		s.tok = 0
		s.error(fmt.Sprintf("invalid character %#U", c))
		goto redo
	}

	return

assignop:
	// Shared tail for binary operators: s.op and s.prec are already set and
	// c holds the character read after the operator. A following '=' makes
	// the token an assignment operator; otherwise c is pushed back.
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}

// isLetter reports whether c is an ASCII letter or an underscore.
func isLetter(c rune) bool {
	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
}

// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

// ident scans an identifier or keyword. On return s.tok is either a keyword
// token or _Name; for _Name, s.lit holds the identifier text. s.nlsemi is
// set for names and for the keywords break, continue, fallthrough and return.
func (s *scanner) ident() {
	s.startLit()

	// accelerate common case (7bit ASCII)
	c := s.getr()
	for isLetter(c) || isDigit(c) {
		c = s.getr()
	}

	// general case
	if c >= utf8.RuneSelf {
		for s.isIdentRune(c, false) {
			c = s.getr()
		}
	}
	s.ungetr()

	lit := s.stopLit()

	// possibly a keyword
	if len(lit) >= 2 {
		// the string comparison confirms that the table entry really is lit
		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
			s.tok = tok
			return
		}
	}

	s.nlsemi = true
	s.lit = string(lit)
	s.tok = _Name
}

// isIdentRune reports whether c is to be consumed as part of an identifier;
// first says whether c would be the identifier's first character.
//
// Besides letters, '_' and digits, it also returns true (after reporting an
// error) for any other rune >= utf8.RuneSelf. A digit in first position is
// likewise reported as an error but still accepted.
func (s *scanner) isIdentRune(c rune, first bool) bool {
	switch {
	case unicode.IsLetter(c) || c == '_':
		// ok
	case unicode.IsDigit(c):
		if first {
			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
		}
	case c >= utf8.RuneSelf:
		s.error(fmt.Sprintf("invalid identifier character %#U", c))
	default:
		return false
	}
	return true
}

// hash returns the keywordMap index for the spelling s.
// It looks s up in the hashing table filled by init and returns 0 when s is
// not present there.
// NOTE(review): the arithmetic hash left commented out at the end of the body
// appears to be the former implementation; unlike it, the map lookup places
// no requirement on len(s).
func hash(s []byte) uint {
	if v, ok := hashing[string(s)]; ok {
		return v
	}
	return 0
	//return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
}

// keywordMap maps a hash value (see hash) to the corresponding keyword token;
// unused entries are 0.
var keywordMap [1 << 6]token // size must be power of two

// hashing maps a keyword spelling to the index used for it in keywordMap.
// It is populated by init with uint(tok) for every keyword token.
var hashing map[string]uint

// init fills hashing and keywordMap for every token in the range
// _Break.._Var and panics if two keywords land on the same keywordMap slot.
func init() {
	hashing = make(map[string]uint)

	// populate keywordMap
	for tok := _Break; tok <= _Var; tok++ {
		//TODO aghosn hack to fix the hash function:
		hashing[tokstrings[tok]] = uint(tok)
		h := hash([]byte(tokstrings[tok]))
		if keywordMap[h] != 0 {
			panic("imperfect hash")
		}
		keywordMap[h] = tok
	}
}

// number scans a numeric literal. c is the literal's first character, which
// the caller has already read: a decimal digit, or '.' for a literal that
// starts with a decimal point.
//
// On return s.tok is _Literal, s.kind is IntLit, FloatLit or ImagLit, s.lit
// holds the literal text and s.nlsemi is set. Malformed hex, octal and
// exponent forms are reported through s.error; a token is produced anyway.
func (s *scanner) number(c rune) {
	s.startLit()

	if c != '.' {
		s.kind = IntLit // until proven otherwise
		if c == '0' {
			c = s.getr()
			if c == 'x' || c == 'X' {
				// hex
				c = s.getr()
				hasDigit := false
				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
					c = s.getr()
					hasDigit = true
				}
				if !hasDigit {
					s.error("malformed hex constant")
				}
				goto done
			}

			// decimal 0, octal, or float
			has8or9 := false
			for isDigit(c) {
				if c > '7' {
					has8or9 = true
				}
				c = s.getr()
			}
			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
				// octal
				if has8or9 {
					s.error("malformed octal constant")
				}
				goto done
			}

		} else {
			// decimal or float
			for isDigit(c) {
				c = s.getr()
			}
		}
	}

	// float
	if c == '.' {
		s.kind = FloatLit
		c = s.getr()
		for isDigit(c) {
			c = s.getr()
		}
	}

	// exponent
	if c == 'e' || c == 'E' {
		s.kind = FloatLit
		c = s.getr()
		if c == '-' || c == '+' {
			c = s.getr()
		}
		if !isDigit(c) {
			s.error("malformed floating-point constant exponent")
		}
		for isDigit(c) {
			c = s.getr()
		}
	}

	// complex
	if c == 'i' {
		s.kind = ImagLit
		s.getr()
	}

done:
	// every path arrives here having read one character past the literal
	s.ungetr()
	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.tok = _Literal
}

// rune scans a character literal; the opening quote has already been read by
// the caller. On return s.tok is _Literal, s.kind is RuneLit, s.lit holds the
// literal text and s.nlsemi is set. At most one error is reported per literal.
func (s *scanner) rune() {
	s.startLit()

	ok := true // only report errors if we're ok so far
	n := 0     // number of characters or escape sequences seen before the closing quote
	for ; ; n++ {
		r := s.getr()
		if r == '\'' {
			break
		}
		if r == '\\' {
			if !s.escape('\'') {
				ok = false
			}
			continue
		}
		if r == '\n' {
			s.ungetr() // assume newline is not part of literal
			if ok {
				s.error("newline in character literal")
				ok = false
			}
			break
		}
		if r < 0 {
			// end of input: the error is reported at the token start (s.line, s.col)
			if ok {
				s.errh(s.line, s.col, "invalid character literal (missing closing ')")
				ok = false
			}
			break
		}
	}

	if ok {
		if n == 0 {
			s.error("empty character literal or unescaped ' in character literal")
		} else if n != 1 {
			s.errh(s.line, s.col, "invalid character literal (more than one character)")
		}
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = RuneLit
	s.tok = _Literal
}

// stdString scans an interpreted (double-quoted) string literal; the opening
// quote has already been read by the caller. On return s.tok is _Literal,
// s.kind is StringLit, s.lit holds the literal text and s.nlsemi is set.
// A newline or end of input before the closing quote is reported as an error.
func (s *scanner) stdString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '"' {
			break
		}
		if r == '\\' {
			s.escape('"')
			continue
		}
		if r == '\n' {
			s.ungetr() // assume newline is not part of literal
			s.error("newline in string")
			break
		}
		if r < 0 {
			s.errh(s.line, s.col, "string not terminated")
			break
		}
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// rawString scans a raw (back-quoted) string literal; the opening back quote
// has already been read by the caller. On return s.tok is _Literal, s.kind is
// StringLit, s.lit holds the literal text and s.nlsemi is set. End of input
// before the closing back quote is reported as an error.
func (s *scanner) rawString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '`' {
			break
		}
		if r < 0 {
			s.errh(s.line, s.col, "string not terminated")
			break
		}
	}
	// We leave CRs in the string since they are part of the
	// literal (even though they are not part of the literal
	// value).

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// skipLine reads characters, starting with the already-read r, until it
// reaches a newline or end of input. The newline itself is pushed back.
func (s *scanner) skipLine(r rune) {
	for r >= 0 {
		if r == '\n' {
			s.ungetr() // don't consume '\n' - needed for nlsemi logic
			break
		}
		r = s.getr()
	}
}

// lineComment scans the rest of a // comment; both slashes have already been
// read by the caller. If the comment starts at the beginning of the line, a
// directive handler is installed, and the text begins with "go:" or "line ",
// the handler is called with the directive text; otherwise the comment is
// skipped.
func (s *scanner) lineComment() {
	r := s.getr()
	// directives must start at the beginning of the line (s.col == colbase)
	if s.col != colbase || s.pragh == nil || (r != 'g' && r != 'l') {
		s.skipLine(r)
		return
	}
	// s.col == colbase && s.pragh != nil && (r == 'g' || r == 'l')

	// recognize directives
	prefix := "go:"
	if r == 'l' {
		prefix = "line "
	}
	// match the comment text against prefix, one character at a time
	for _, m := range prefix {
		if r != m {
			s.skipLine(r)
			return
		}
		r = s.getr()
	}

	// directive text without line ending (which may be "\r\n" if Windows),
	s.startLit()
	s.skipLine(r)
	text := s.stopLit()
	if i := len(text) - 1; i >= 0 && text[i] == '\r' {
		text = text[:i]
	}

	s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since directive text starts after //
}

// fullComment scans the rest of a /* comment; the opening "/*" has already
// been read by the caller. It returns after the closing "*/", or reports
// "comment not terminated" at the token start if end of input comes first.
func (s *scanner) fullComment() {
	for {
		r := s.getr()
		// a run of '*' may be followed by the closing '/'
		for r == '*' {
			r = s.getr()
			if r == '/' {
				return
			}
		}
		if r < 0 {
			s.errh(s.line, s.col, "comment not terminated")
			return
		}
	}
}

// escape scans an escape sequence; the backslash has already been read by the
// caller. quote is the quote character of the enclosing literal, which is
// accepted as an escaped character.
//
// It returns false after reporting an error for an invalid sequence. It
// returns true for a valid sequence and also when end of input is reached,
// leaving that error for the caller to report.
func (s *scanner) escape(quote rune) bool {
	var n int             // number of digits expected
	var base, max uint32  // digit base and largest permitted value

	c := s.getr()
	switch c {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// c is already the first of the three octal digits
		n, base, max = 3, 8, 255
	case 'x':
		c = s.getr()
		n, base, max = 2, 16, 255
	case 'u':
		c = s.getr()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		c = s.getr()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		if c < 0 {
			return true // complain in caller about EOF
		}
		s.error("unknown escape sequence")
		return false
	}

	var x uint32 // accumulated value of the digits
	for i := n; i > 0; i-- {
		d := base // stays == base (i.e. invalid) unless c is a digit
		switch {
		case isDigit(c):
			d = uint32(c) - '0'
		case 'a' <= c && c <= 'f':
			d = uint32(c) - ('a' - 10)
		case 'A' <= c && c <= 'F':
			d = uint32(c) - ('A' - 10)
		}
		if d >= base {
			if c < 0 {
				return true // complain in caller about EOF
			}
			kind := "hex"
			if base == 8 {
				kind = "octal"
			}
			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
			s.ungetr()
			return false
		}
		// d < base
		x = x*base + d
		c = s.getr()
	}
	// the loop read one character past the last digit
	s.ungetr()

	if x > max && base == 8 {
		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
		return false
	}

	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
		s.error("escape sequence is invalid Unicode code point")
		return false
	}

	return true
}