github.com/euank/go@v0.0.0-20160829210321-495514729181/src/cmd/compile/internal/syntax/scanner.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "fmt" 9 "io" 10 "strings" 11 "unicode" 12 "unicode/utf8" 13 ) 14 15 type scanner struct { 16 source 17 nlsemi bool // if set '\n' and EOF translate to ';' 18 19 // current token, valid after calling next() 20 pos, line int 21 tok token 22 lit string // valid if tok is _Name or _Literal 23 kind LitKind // valid if tok is _Literal 24 op Operator // valid if tok is _Operator, _AssignOp, or _IncOp 25 prec int // valid if tok is _Operator, _AssignOp, or _IncOp 26 27 pragmas []Pragma 28 } 29 30 func (s *scanner) init(src io.Reader, errh ErrorHandler) { 31 s.source.init(src, errh) 32 s.nlsemi = false 33 } 34 35 func (s *scanner) next() { 36 nlsemi := s.nlsemi 37 s.nlsemi = false 38 39 redo: 40 // skip white space 41 c := s.getr() 42 for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { 43 c = s.getr() 44 } 45 46 // token start 47 s.pos, s.line = s.source.pos0(), s.source.line0 48 49 if isLetter(c) || c >= utf8.RuneSelf && (unicode.IsLetter(c) || s.isCompatRune(c, true)) { 50 s.ident() 51 return 52 } 53 54 switch c { 55 case -1: 56 if nlsemi { 57 s.tok = _Semi 58 break 59 } 60 s.tok = _EOF 61 62 case '\n': 63 s.tok = _Semi 64 65 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 66 s.number(c) 67 68 case '"': 69 s.stdString() 70 71 case '`': 72 s.rawString() 73 74 case '\'': 75 s.rune() 76 77 case '(': 78 s.tok = _Lparen 79 80 case '[': 81 s.tok = _Lbrack 82 83 case '{': 84 s.tok = _Lbrace 85 86 case ',': 87 s.tok = _Comma 88 89 case ';': 90 s.tok = _Semi 91 92 case ')': 93 s.nlsemi = true 94 s.tok = _Rparen 95 96 case ']': 97 s.nlsemi = true 98 s.tok = _Rbrack 99 100 case '}': 101 s.nlsemi = true 102 s.tok = _Rbrace 103 104 case ':': 105 if s.getr() == '=' { 106 s.tok = _Define 107 break 108 } 109 s.ungetr() 110 s.tok = _Colon 111 112 case '.': 113 c = s.getr() 114 if isDigit(c) { 115 s.ungetr() 116 s.source.r0-- // make sure '.' is part of literal (line cannot have changed) 117 s.number('.') 118 break 119 } 120 if c == '.' { 121 c = s.getr() 122 if c == '.' { 123 s.tok = _DotDotDot 124 break 125 } 126 s.ungetr() 127 s.source.r0-- // make next ungetr work (line cannot have changed) 128 } 129 s.ungetr() 130 s.tok = _Dot 131 132 case '+': 133 s.op, s.prec = Add, precAdd 134 c = s.getr() 135 if c != '+' { 136 goto assignop 137 } 138 s.nlsemi = true 139 s.tok = _IncOp 140 141 case '-': 142 s.op, s.prec = Sub, precAdd 143 c = s.getr() 144 if c != '-' { 145 goto assignop 146 } 147 s.nlsemi = true 148 s.tok = _IncOp 149 150 case '*': 151 s.op, s.prec = Mul, precMul 152 // don't goto assignop - want _Star token 153 if s.getr() == '=' { 154 s.tok = _AssignOp 155 break 156 } 157 s.ungetr() 158 s.tok = _Star 159 160 case '/': 161 c = s.getr() 162 if c == '/' { 163 s.lineComment() 164 goto redo 165 } 166 if c == '*' { 167 s.fullComment() 168 if s.source.line > s.line && nlsemi { 169 // A multi-line comment acts like a newline; 170 // it translates to a ';' if nlsemi is set. 171 s.tok = _Semi 172 break 173 } 174 goto redo 175 } 176 s.op, s.prec = Div, precMul 177 goto assignop 178 179 case '%': 180 s.op, s.prec = Rem, precMul 181 c = s.getr() 182 goto assignop 183 184 case '&': 185 c = s.getr() 186 if c == '&' { 187 s.op, s.prec = AndAnd, precAndAnd 188 s.tok = _Operator 189 break 190 } 191 s.op, s.prec = And, precMul 192 if c == '^' { 193 s.op = AndNot 194 c = s.getr() 195 } 196 goto assignop 197 198 case '|': 199 c = s.getr() 200 if c == '|' { 201 s.op, s.prec = OrOr, precOrOr 202 s.tok = _Operator 203 break 204 } 205 s.op, s.prec = Or, precAdd 206 goto assignop 207 208 case '~': 209 s.error("bitwise complement operator is ^") 210 fallthrough 211 212 case '^': 213 s.op, s.prec = Xor, precAdd 214 c = s.getr() 215 goto assignop 216 217 case '<': 218 c = s.getr() 219 if c == '=' { 220 s.op, s.prec = Leq, precCmp 221 s.tok = _Operator 222 break 223 } 224 if c == '<' { 225 s.op, s.prec = Shl, precMul 226 c = s.getr() 227 goto assignop 228 } 229 if c == '-' { 230 s.tok = _Arrow 231 break 232 } 233 s.ungetr() 234 s.op, s.prec = Lss, precCmp 235 s.tok = _Operator 236 237 case '>': 238 c = s.getr() 239 if c == '=' { 240 s.op, s.prec = Geq, precCmp 241 s.tok = _Operator 242 break 243 } 244 if c == '>' { 245 s.op, s.prec = Shr, precMul 246 c = s.getr() 247 goto assignop 248 } 249 s.ungetr() 250 s.op, s.prec = Gtr, precCmp 251 s.tok = _Operator 252 253 case '=': 254 if s.getr() == '=' { 255 s.op, s.prec = Eql, precCmp 256 s.tok = _Operator 257 break 258 } 259 s.ungetr() 260 s.tok = _Assign 261 262 case '!': 263 if s.getr() == '=' { 264 s.op, s.prec = Neq, precCmp 265 s.tok = _Operator 266 break 267 } 268 s.ungetr() 269 s.op, s.prec = Not, 0 270 s.tok = _Operator 271 272 default: 273 s.tok = 0 274 s.error(fmt.Sprintf("illegal character %#U", c)) 275 goto redo 276 } 277 278 return 279 280 assignop: 281 if c == '=' { 282 s.tok = _AssignOp 283 return 284 } 285 s.ungetr() 286 s.tok = _Operator 287 } 288 289 func isLetter(c rune) bool { 290 return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' 291 } 292 293 func isDigit(c rune) bool { 294 return '0' <= c && c <= '9' 295 } 296 297 func (s *scanner) ident() { 298 s.startLit() 299 300 // accelerate common case (7bit ASCII) 301 c := s.getr() 302 for isLetter(c) || isDigit(c) { 303 c = s.getr() 304 } 305 306 // general case 307 if c >= utf8.RuneSelf { 308 for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || s.isCompatRune(c, false) { 309 c = s.getr() 310 } 311 } 312 s.ungetr() 313 314 lit := s.stopLit() 315 316 // possibly a keyword 317 if len(lit) >= 2 { 318 if tok := keywordMap[hash(lit)]; tok != 0 && strbyteseql(tokstrings[tok], lit) { 319 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) 320 s.tok = tok 321 return 322 } 323 } 324 325 s.nlsemi = true 326 s.lit = string(lit) 327 s.tok = _Name 328 } 329 330 func (s *scanner) isCompatRune(c rune, start bool) bool { 331 if !gcCompat || c < utf8.RuneSelf { 332 return false 333 } 334 if start && unicode.IsNumber(c) { 335 s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c)) 336 } else { 337 s.error(fmt.Sprintf("invalid identifier character %#U", c)) 338 } 339 return true 340 } 341 342 // hash is a perfect hash function for keywords. 343 // It assumes that s has at least length 2. 344 func hash(s []byte) uint { 345 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1) 346 } 347 348 func strbyteseql(s string, b []byte) bool { 349 if len(s) == len(b) { 350 for i, b := range b { 351 if s[i] != b { 352 return false 353 } 354 } 355 return true 356 } 357 return false 358 } 359 360 var keywordMap [1 << 6]token // size must be power of two 361 362 func init() { 363 // populate keywordMap 364 for tok := _Break; tok <= _Var; tok++ { 365 h := hash([]byte(tokstrings[tok])) 366 if keywordMap[h] != 0 { 367 panic("imperfect hash") 368 } 369 keywordMap[h] = tok 370 } 371 } 372 373 func (s *scanner) number(c rune) { 374 s.startLit() 375 376 if c != '.' { 377 s.kind = IntLit // until proven otherwise 378 if c == '0' { 379 c = s.getr() 380 if c == 'x' || c == 'X' { 381 // hex 382 c = s.getr() 383 hasDigit := false 384 for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { 385 c = s.getr() 386 hasDigit = true 387 } 388 if !hasDigit { 389 s.error("malformed hex constant") 390 } 391 goto done 392 } 393 394 // decimal 0, octal, or float 395 has8or9 := false 396 for isDigit(c) { 397 if c > '7' { 398 has8or9 = true 399 } 400 c = s.getr() 401 } 402 if c != '.' && c != 'e' && c != 'E' && c != 'i' { 403 // octal 404 if has8or9 { 405 s.error("malformed octal constant") 406 } 407 goto done 408 } 409 410 } else { 411 // decimal or float 412 for isDigit(c) { 413 c = s.getr() 414 } 415 } 416 } 417 418 // float 419 if c == '.' { 420 s.kind = FloatLit 421 c = s.getr() 422 for isDigit(c) { 423 c = s.getr() 424 } 425 } 426 427 // exponent 428 if c == 'e' || c == 'E' { 429 s.kind = FloatLit 430 c = s.getr() 431 if c == '-' || c == '+' { 432 c = s.getr() 433 } 434 if !isDigit(c) { 435 s.error("malformed floating-point constant exponent") 436 } 437 for isDigit(c) { 438 c = s.getr() 439 } 440 } 441 442 // complex 443 if c == 'i' { 444 s.kind = ImagLit 445 s.getr() 446 } 447 448 done: 449 s.ungetr() 450 s.nlsemi = true 451 s.lit = string(s.stopLit()) 452 s.tok = _Literal 453 } 454 455 func (s *scanner) stdString() { 456 s.startLit() 457 458 for { 459 r := s.getr() 460 if r == '"' { 461 break 462 } 463 if r == '\\' { 464 s.escape('"') 465 continue 466 } 467 if r == '\n' { 468 s.ungetr() // assume newline is not part of literal 469 s.error("newline in string") 470 break 471 } 472 if r < 0 { 473 s.error_at(s.pos, s.line, "string not terminated") 474 break 475 } 476 } 477 478 s.nlsemi = true 479 s.lit = string(s.stopLit()) 480 s.kind = StringLit 481 s.tok = _Literal 482 } 483 484 func (s *scanner) rawString() { 485 s.startLit() 486 487 for { 488 r := s.getr() 489 if r == '`' { 490 break 491 } 492 if r < 0 { 493 s.error_at(s.pos, s.line, "string not terminated") 494 break 495 } 496 } 497 // We leave CRs in the string since they are part of the 498 // literal (even though they are not part of the literal 499 // value). 500 501 s.nlsemi = true 502 s.lit = string(s.stopLit()) 503 s.kind = StringLit 504 s.tok = _Literal 505 } 506 507 func (s *scanner) rune() { 508 s.startLit() 509 510 r := s.getr() 511 ok := false 512 if r == '\'' { 513 s.error("empty character literal or unescaped ' in character literal") 514 } else if r == '\n' { 515 s.ungetr() // assume newline is not part of literal 516 s.error("newline in character literal") 517 } else { 518 ok = true 519 if r == '\\' { 520 ok = s.escape('\'') 521 } 522 } 523 524 r = s.getr() 525 if r != '\'' { 526 // only report error if we're ok so far 527 if ok { 528 s.error("missing '") 529 } 530 s.ungetr() 531 } 532 533 s.nlsemi = true 534 s.lit = string(s.stopLit()) 535 s.kind = RuneLit 536 s.tok = _Literal 537 } 538 539 func (s *scanner) lineComment() { 540 // recognize pragmas 541 var prefix string 542 r := s.getr() 543 switch r { 544 case 'g': 545 prefix = "go:" 546 case 'l': 547 prefix = "line " 548 default: 549 goto skip 550 } 551 552 s.startLit() 553 for _, m := range prefix { 554 if r != m { 555 s.stopLit() 556 goto skip 557 } 558 r = s.getr() 559 } 560 561 for r >= 0 { 562 if r == '\n' { 563 s.ungetr() 564 break 565 } 566 r = s.getr() 567 } 568 s.pragmas = append(s.pragmas, Pragma{ 569 Line: s.line, 570 Text: strings.TrimSuffix(string(s.stopLit()), "\r"), 571 }) 572 return 573 574 skip: 575 // consume line 576 for r != '\n' && r >= 0 { 577 r = s.getr() 578 } 579 s.ungetr() // don't consume '\n' - needed for nlsemi logic 580 } 581 582 func (s *scanner) fullComment() { 583 for { 584 r := s.getr() 585 for r == '*' { 586 r = s.getr() 587 if r == '/' { 588 return 589 } 590 } 591 if r < 0 { 592 s.error_at(s.pos, s.line, "comment not terminated") 593 return 594 } 595 } 596 } 597 598 func (s *scanner) escape(quote rune) bool { 599 var n int 600 var base, max uint32 601 602 c := s.getr() 603 switch c { 604 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 605 return true 606 case '0', '1', '2', '3', '4', '5', '6', '7': 607 n, base, max = 3, 8, 255 608 case 'x': 609 c = s.getr() 610 n, base, max = 2, 16, 255 611 case 'u': 612 c = s.getr() 613 n, base, max = 4, 16, unicode.MaxRune 614 case 'U': 615 c = s.getr() 616 n, base, max = 8, 16, unicode.MaxRune 617 default: 618 if c < 0 { 619 return true // complain in caller about EOF 620 } 621 s.error("unknown escape sequence") 622 return false 623 } 624 625 var x uint32 626 for i := n; i > 0; i-- { 627 d := base 628 switch { 629 case isDigit(c): 630 d = uint32(c) - '0' 631 case 'a' <= c && c <= 'f': 632 d = uint32(c) - ('a' - 10) 633 case 'A' <= c && c <= 'F': 634 d = uint32(c) - ('A' - 10) 635 } 636 if d >= base { 637 if c < 0 { 638 return true // complain in caller about EOF 639 } 640 if gcCompat { 641 name := "hex" 642 if base == 8 { 643 name = "octal" 644 } 645 s.error(fmt.Sprintf("non-%s character in escape sequence: %c", name, c)) 646 } else { 647 if c != quote { 648 s.error(fmt.Sprintf("illegal character %#U in escape sequence", c)) 649 } else { 650 s.error("escape sequence incomplete") 651 } 652 } 653 s.ungetr() 654 return false 655 } 656 // d < base 657 x = x*base + d 658 c = s.getr() 659 } 660 s.ungetr() 661 662 if x > max && base == 8 { 663 s.error(fmt.Sprintf("octal escape value > 255: %d", x)) 664 return false 665 } 666 667 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { 668 s.error("escape sequence is invalid Unicode code point") 669 return false 670 } 671 672 return true 673 }