github.com/riscv/riscv-go@v0.0.0-20200123204226-124ebd6fcc8e/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements scanner, a lexical tokenizer for 6 // Go source. After initialization, consecutive calls of 7 // next advance the scanner one token at a time. 8 // 9 // This file, source.go, and tokens.go are self-contained 10 // (go tool compile scanner.go source.go tokens.go compiles) 11 // and thus could be made into its own package. 12 13 package syntax 14 15 import ( 16 "fmt" 17 "io" 18 "unicode" 19 "unicode/utf8" 20 ) 21 22 type scanner struct { 23 source 24 pragh func(line, col uint, msg string) 25 nlsemi bool // if set '\n' and EOF translate to ';' 26 27 // current token, valid after calling next() 28 line, col uint 29 tok token 30 lit string // valid if tok is _Name or _Literal 31 kind LitKind // valid if tok is _Literal 32 op Operator // valid if tok is _Operator, _AssignOp, or _IncOp 33 prec int // valid if tok is _Operator, _AssignOp, or _IncOp 34 } 35 36 func (s *scanner) init(src io.Reader, errh, pragh func(line, col uint, msg string)) { 37 s.source.init(src, errh) 38 s.pragh = pragh 39 s.nlsemi = false 40 } 41 42 // next advances the scanner by reading the next token. 43 // 44 // If a read, source encoding, or lexical error occurs, next 45 // calls the error handler installed with init. The handler 46 // must exist. 47 // 48 // If a //line or //go: directive is encountered, next 49 // calls the pragma handler installed with init, if not nil. 50 // 51 // The (line, col) position passed to the error and pragma 52 // handler is always at or after the current source reading 53 // position. 54 func (s *scanner) next() { 55 nlsemi := s.nlsemi 56 s.nlsemi = false 57 58 redo: 59 // skip white space 60 c := s.getr() 61 for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { 62 c = s.getr() 63 } 64 65 // token start 66 s.line, s.col = s.source.line0, s.source.col0 67 68 if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) { 69 s.ident() 70 return 71 } 72 73 switch c { 74 case -1: 75 if nlsemi { 76 s.tok = _Semi 77 break 78 } 79 s.tok = _EOF 80 81 case '\n': 82 s.tok = _Semi 83 84 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 85 s.number(c) 86 87 case '"': 88 s.stdString() 89 90 case '`': 91 s.rawString() 92 93 case '\'': 94 s.rune() 95 96 case '(': 97 s.tok = _Lparen 98 99 case '[': 100 s.tok = _Lbrack 101 102 case '{': 103 s.tok = _Lbrace 104 105 case ',': 106 s.tok = _Comma 107 108 case ';': 109 s.tok = _Semi 110 111 case ')': 112 s.nlsemi = true 113 s.tok = _Rparen 114 115 case ']': 116 s.nlsemi = true 117 s.tok = _Rbrack 118 119 case '}': 120 s.nlsemi = true 121 s.tok = _Rbrace 122 123 case ':': 124 if s.getr() == '=' { 125 s.tok = _Define 126 break 127 } 128 s.ungetr() 129 s.tok = _Colon 130 131 case '.': 132 c = s.getr() 133 if isDigit(c) { 134 s.ungetr2() 135 s.number('.') 136 break 137 } 138 if c == '.' { 139 c = s.getr() 140 if c == '.' { 141 s.tok = _DotDotDot 142 break 143 } 144 s.ungetr2() 145 } 146 s.ungetr() 147 s.tok = _Dot 148 149 case '+': 150 s.op, s.prec = Add, precAdd 151 c = s.getr() 152 if c != '+' { 153 goto assignop 154 } 155 s.nlsemi = true 156 s.tok = _IncOp 157 158 case '-': 159 s.op, s.prec = Sub, precAdd 160 c = s.getr() 161 if c != '-' { 162 goto assignop 163 } 164 s.nlsemi = true 165 s.tok = _IncOp 166 167 case '*': 168 s.op, s.prec = Mul, precMul 169 // don't goto assignop - want _Star token 170 if s.getr() == '=' { 171 s.tok = _AssignOp 172 break 173 } 174 s.ungetr() 175 s.tok = _Star 176 177 case '/': 178 c = s.getr() 179 if c == '/' { 180 s.lineComment() 181 goto redo 182 } 183 if c == '*' { 184 s.fullComment() 185 if s.source.line > s.line && nlsemi { 186 // A multi-line comment acts like a newline; 187 // it translates to a ';' if nlsemi is set. 188 s.tok = _Semi 189 break 190 } 191 goto redo 192 } 193 s.op, s.prec = Div, precMul 194 goto assignop 195 196 case '%': 197 s.op, s.prec = Rem, precMul 198 c = s.getr() 199 goto assignop 200 201 case '&': 202 c = s.getr() 203 if c == '&' { 204 s.op, s.prec = AndAnd, precAndAnd 205 s.tok = _Operator 206 break 207 } 208 s.op, s.prec = And, precMul 209 if c == '^' { 210 s.op = AndNot 211 c = s.getr() 212 } 213 goto assignop 214 215 case '|': 216 c = s.getr() 217 if c == '|' { 218 s.op, s.prec = OrOr, precOrOr 219 s.tok = _Operator 220 break 221 } 222 s.op, s.prec = Or, precAdd 223 goto assignop 224 225 case '~': 226 s.error("bitwise complement operator is ^") 227 fallthrough 228 229 case '^': 230 s.op, s.prec = Xor, precAdd 231 c = s.getr() 232 goto assignop 233 234 case '<': 235 c = s.getr() 236 if c == '=' { 237 s.op, s.prec = Leq, precCmp 238 s.tok = _Operator 239 break 240 } 241 if c == '<' { 242 s.op, s.prec = Shl, precMul 243 c = s.getr() 244 goto assignop 245 } 246 if c == '-' { 247 s.tok = _Arrow 248 break 249 } 250 s.ungetr() 251 s.op, s.prec = Lss, precCmp 252 s.tok = _Operator 253 254 case '>': 255 c = s.getr() 256 if c == '=' { 257 s.op, s.prec = Geq, precCmp 258 s.tok = _Operator 259 break 260 } 261 if c == '>' { 262 s.op, s.prec = Shr, precMul 263 c = s.getr() 264 goto assignop 265 } 266 s.ungetr() 267 s.op, s.prec = Gtr, precCmp 268 s.tok = _Operator 269 270 case '=': 271 if s.getr() == '=' { 272 s.op, s.prec = Eql, precCmp 273 s.tok = _Operator 274 break 275 } 276 s.ungetr() 277 s.tok = _Assign 278 279 case '!': 280 if s.getr() == '=' { 281 s.op, s.prec = Neq, precCmp 282 s.tok = _Operator 283 break 284 } 285 s.ungetr() 286 s.op, s.prec = Not, 0 287 s.tok = _Operator 288 289 default: 290 s.tok = 0 291 s.error(fmt.Sprintf("invalid character %#U", c)) 292 goto redo 293 } 294 295 return 296 297 assignop: 298 if c == '=' { 299 s.tok = _AssignOp 300 return 301 } 302 s.ungetr() 303 s.tok = _Operator 304 } 305 306 func isLetter(c rune) bool { 307 return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' 308 } 309 310 func isDigit(c rune) bool { 311 return '0' <= c && c <= '9' 312 } 313 314 func (s *scanner) ident() { 315 s.startLit() 316 317 // accelerate common case (7bit ASCII) 318 c := s.getr() 319 for isLetter(c) || isDigit(c) { 320 c = s.getr() 321 } 322 323 // general case 324 if c >= utf8.RuneSelf { 325 for s.isIdentRune(c, false) { 326 c = s.getr() 327 } 328 } 329 s.ungetr() 330 331 lit := s.stopLit() 332 333 // possibly a keyword 334 if len(lit) >= 2 { 335 if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) { 336 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) 337 s.tok = tok 338 return 339 } 340 } 341 342 s.nlsemi = true 343 s.lit = string(lit) 344 s.tok = _Name 345 } 346 347 func (s *scanner) isIdentRune(c rune, first bool) bool { 348 switch { 349 case unicode.IsLetter(c) || c == '_': 350 // ok 351 case unicode.IsDigit(c): 352 if first { 353 s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c)) 354 } 355 case c >= utf8.RuneSelf: 356 s.error(fmt.Sprintf("invalid identifier character %#U", c)) 357 default: 358 return false 359 } 360 return true 361 } 362 363 // hash is a perfect hash function for keywords. 364 // It assumes that s has at least length 2. 365 func hash(s []byte) uint { 366 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) 367 } 368 369 var keywordMap [1 << 6]token // size must be power of two 370 371 func init() { 372 // populate keywordMap 373 for tok := _Break; tok <= _Var; tok++ { 374 h := hash([]byte(tokstrings[tok])) 375 if keywordMap[h] != 0 { 376 panic("imperfect hash") 377 } 378 keywordMap[h] = tok 379 } 380 } 381 382 func (s *scanner) number(c rune) { 383 s.startLit() 384 385 if c != '.' { 386 s.kind = IntLit // until proven otherwise 387 if c == '0' { 388 c = s.getr() 389 if c == 'x' || c == 'X' { 390 // hex 391 c = s.getr() 392 hasDigit := false 393 for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { 394 c = s.getr() 395 hasDigit = true 396 } 397 if !hasDigit { 398 s.error("malformed hex constant") 399 } 400 goto done 401 } 402 403 // decimal 0, octal, or float 404 has8or9 := false 405 for isDigit(c) { 406 if c > '7' { 407 has8or9 = true 408 } 409 c = s.getr() 410 } 411 if c != '.' && c != 'e' && c != 'E' && c != 'i' { 412 // octal 413 if has8or9 { 414 s.error("malformed octal constant") 415 } 416 goto done 417 } 418 419 } else { 420 // decimal or float 421 for isDigit(c) { 422 c = s.getr() 423 } 424 } 425 } 426 427 // float 428 if c == '.' { 429 s.kind = FloatLit 430 c = s.getr() 431 for isDigit(c) { 432 c = s.getr() 433 } 434 } 435 436 // exponent 437 if c == 'e' || c == 'E' { 438 s.kind = FloatLit 439 c = s.getr() 440 if c == '-' || c == '+' { 441 c = s.getr() 442 } 443 if !isDigit(c) { 444 s.error("malformed floating-point constant exponent") 445 } 446 for isDigit(c) { 447 c = s.getr() 448 } 449 } 450 451 // complex 452 if c == 'i' { 453 s.kind = ImagLit 454 s.getr() 455 } 456 457 done: 458 s.ungetr() 459 s.nlsemi = true 460 s.lit = string(s.stopLit()) 461 s.tok = _Literal 462 } 463 464 func (s *scanner) stdString() { 465 s.startLit() 466 467 for { 468 r := s.getr() 469 if r == '"' { 470 break 471 } 472 if r == '\\' { 473 s.escape('"') 474 continue 475 } 476 if r == '\n' { 477 s.ungetr() // assume newline is not part of literal 478 s.error("newline in string") 479 break 480 } 481 if r < 0 { 482 s.errh(s.line, s.col, "string not terminated") 483 break 484 } 485 } 486 487 s.nlsemi = true 488 s.lit = string(s.stopLit()) 489 s.kind = StringLit 490 s.tok = _Literal 491 } 492 493 func (s *scanner) rawString() { 494 s.startLit() 495 496 for { 497 r := s.getr() 498 if r == '`' { 499 break 500 } 501 if r < 0 { 502 s.errh(s.line, s.col, "string not terminated") 503 break 504 } 505 } 506 // We leave CRs in the string since they are part of the 507 // literal (even though they are not part of the literal 508 // value). 509 510 s.nlsemi = true 511 s.lit = string(s.stopLit()) 512 s.kind = StringLit 513 s.tok = _Literal 514 } 515 516 func (s *scanner) rune() { 517 s.startLit() 518 519 r := s.getr() 520 ok := false 521 if r == '\'' { 522 s.error("empty character literal or unescaped ' in character literal") 523 } else if r == '\n' { 524 s.ungetr() // assume newline is not part of literal 525 s.error("newline in character literal") 526 } else { 527 ok = true 528 if r == '\\' { 529 ok = s.escape('\'') 530 } 531 } 532 533 r = s.getr() 534 if r != '\'' { 535 // only report error if we're ok so far 536 if ok { 537 s.error("missing '") 538 } 539 s.ungetr() 540 } 541 542 s.nlsemi = true 543 s.lit = string(s.stopLit()) 544 s.kind = RuneLit 545 s.tok = _Literal 546 } 547 548 func (s *scanner) skipLine(r rune) { 549 for r >= 0 { 550 if r == '\n' { 551 s.ungetr() // don't consume '\n' - needed for nlsemi logic 552 break 553 } 554 r = s.getr() 555 } 556 } 557 558 func (s *scanner) lineComment() { 559 r := s.getr() 560 if s.pragh == nil || (r != 'g' && r != 'l') { 561 s.skipLine(r) 562 return 563 } 564 // s.pragh != nil && (r == 'g' || r == 'l') 565 566 // recognize pragmas 567 prefix := "go:" 568 if r == 'l' { 569 prefix = "line " 570 } 571 for _, m := range prefix { 572 if r != m { 573 s.skipLine(r) 574 return 575 } 576 r = s.getr() 577 } 578 579 // pragma text without line ending (which may be "\r\n" if Windows), 580 s.startLit() 581 s.skipLine(r) 582 text := s.stopLit() 583 if i := len(text) - 1; i >= 0 && text[i] == '\r' { 584 text = text[:i] 585 } 586 587 s.pragh(s.line, s.col+2, prefix+string(text)) // +2 since pragma text starts after // 588 } 589 590 func (s *scanner) fullComment() { 591 for { 592 r := s.getr() 593 for r == '*' { 594 r = s.getr() 595 if r == '/' { 596 return 597 } 598 } 599 if r < 0 { 600 s.errh(s.line, s.col, "comment not terminated") 601 return 602 } 603 } 604 } 605 606 func (s *scanner) escape(quote rune) bool { 607 var n int 608 var base, max uint32 609 610 c := s.getr() 611 switch c { 612 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 613 return true 614 case '0', '1', '2', '3', '4', '5', '6', '7': 615 n, base, max = 3, 8, 255 616 case 'x': 617 c = s.getr() 618 n, base, max = 2, 16, 255 619 case 'u': 620 c = s.getr() 621 n, base, max = 4, 16, unicode.MaxRune 622 case 'U': 623 c = s.getr() 624 n, base, max = 8, 16, unicode.MaxRune 625 default: 626 if c < 0 { 627 return true // complain in caller about EOF 628 } 629 s.error("unknown escape sequence") 630 return false 631 } 632 633 var x uint32 634 for i := n; i > 0; i-- { 635 d := base 636 switch { 637 case isDigit(c): 638 d = uint32(c) - '0' 639 case 'a' <= c && c <= 'f': 640 d = uint32(c) - ('a' - 10) 641 case 'A' <= c && c <= 'F': 642 d = uint32(c) - ('A' - 10) 643 } 644 if d >= base { 645 if c < 0 { 646 return true // complain in caller about EOF 647 } 648 kind := "hex" 649 if base == 8 { 650 kind = "octal" 651 } 652 s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c)) 653 s.ungetr() 654 return false 655 } 656 // d < base 657 x = x*base + d 658 c = s.getr() 659 } 660 s.ungetr() 661 662 if x > max && base == 8 { 663 s.error(fmt.Sprintf("octal escape value > 255: %d", x)) 664 return false 665 } 666 667 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { 668 s.error("escape sequence is invalid Unicode code point") 669 return false 670 } 671 672 return true 673 }