// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

import (
	"fmt"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"
)

// scanner splits its input into tokens. Each call to next scans one
// token and records it in the "current token" fields below.
//
// Character-level reading (getr, ungetr), literal capture (startLit,
// stopLit), error reporting (error, error_at) and position tracking
// (pos0, line0, line, r0) come from the embedded source, which is
// defined elsewhere.
type scanner struct {
	source
	nlsemi bool   // if set '\n' and EOF translate to ';'
	pragma Pragma // OR of all values returned by pragh so far (see lineComment)

	// current token, valid after calling next()
	pos, line int      // token start, taken from source.pos0() and source.line0
	tok       token    // token class
	lit       string   // valid if tok is _Name or _Literal
	kind      LitKind  // valid if tok is _Literal
	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp

	pragh PragmaHandler // called for pragma comments; if nil, pragmas are not recognized
}

// init prepares s for scanning src. It initializes the embedded source
// with src and errh, clears nlsemi, and installs pragh as the pragma
// handler. pragh may be nil, in which case lineComment skips all line
// comments without looking for pragmas.
func (s *scanner) init(src io.Reader, errh ErrorHandler, pragh PragmaHandler) {
	s.source.init(src, errh)
	s.nlsemi = false
	s.pragh = pragh
}

// next scans the next token and stores it in s.tok, together with
// s.pos and s.line (token start) and, depending on the token, s.lit,
// s.kind, s.op and s.prec.
//
// The nlsemi flag left behind by the previous token decides whether a
// newline or EOF is reported as _Semi. The flag is cleared on entry
// and set again by exactly those tokens after which a newline acts as
// a semicolon.
func (s *scanner) next() {
	// Remember the flag of the previous token; s.nlsemi itself is
	// reset and only set again by the token scanned below.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	// (a newline counts as white space only if it doesn't translate to ';')
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	s.pos, s.line = s.source.pos0(), s.source.line0

	// Identifiers and keywords. The ASCII test comes first; the unicode
	// test (and the gc-compatibility check, which may report an error)
	// is only evaluated for non-ASCII runes.
	if isLetter(c) || c >= utf8.RuneSelf && (unicode.IsLetter(c) || s.isCompatRune(c, true)) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// end of input
		if nlsemi {
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached if nlsemi is set; otherwise '\n' was skipped above
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.tok = _Semi

	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		// ":=" or ":"
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		// ".5" style literal, "...", or "."
		c = s.getr()
		if isDigit(c) {
			s.ungetr()
			s.source.r0-- // make sure '.' is part of literal (line cannot have changed)
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			// Only two dots: give back both runes read after the first
			// '.', so that a single _Dot is reported now.
			s.ungetr()
			s.source.r0-- // make next ungetr work (line cannot have changed)
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		// "++", "+=", or "+"
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		// "--", "-=", or "-"
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		// line comment, general comment, "/=", or "/"
		c = s.getr()
		if c == '/' {
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		// "&&", "&^", "&^=", "&=", or "&"
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		// "||", "|=", or "|"
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '~':
		// Report the error, then scan '~' exactly as if it were '^'.
		s.error("bitwise complement operator is ^")
		fallthrough

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		// "<=", "<<", "<<=", "<-", or "<"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Larrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		// ">=", ">>", ">>=", or ">"
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		// "==", "=>", or "="
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.tok = _Rarrow
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		// "!=" or "!"
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// Report the offending character and scan again; the saved
		// nlsemi value still applies to the rescan.
		s.tok = 0
		s.error(fmt.Sprintf("illegal character %#U", c))
		goto redo
	}

	return

assignop:
	// s.op and s.prec are set and c is the rune following the operator:
	// a trailing '=' makes it an assignment operator, anything else is
	// given back and the token is the plain operator.
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}

// isLetter reports whether c is an ASCII letter or an underscore.
func isLetter(c rune) bool {
	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
}

// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

// ident scans an identifier or keyword. It is called by next after the
// first rune has been read and accepted as an identifier start.
//
// For a keyword, s.tok is the keyword token and nlsemi is set only for
// break, continue, fallthrough and return. Otherwise s.tok is _Name,
// s.lit holds the identifier text and nlsemi is set.
func (s *scanner) ident() {
	s.startLit()

	// accelerate common case (7bit ASCII)
	c := s.getr()
	for isLetter(c) || isDigit(c) {
		c = s.getr()
	}

	// general case
	if c >= utf8.RuneSelf {
		for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || s.isCompatRune(c, false) {
			c = s.getr()
		}
	}
	s.ungetr() // c is not part of the identifier

	lit := s.stopLit()

	// possibly a keyword
	// (hash requires at least two bytes; comparing against tokstrings
	// rules out identifiers that merely share a keyword's hash value)
	if len(lit) >= 2 {
		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
			s.tok = tok
			return
		}
	}

	s.nlsemi = true
	s.lit = string(lit)
	s.tok = _Name
}

// isCompatRune reports whether the non-ASCII rune c is accepted as part
// of an identifier for compatibility reasons even though it is not a
// valid identifier character. It returns false, without reporting
// anything, if gcCompat (defined elsewhere) is unset or c is ASCII.
// Otherwise it always reports an error and returns true, so the caller
// keeps scanning the identifier.
//
// start indicates whether c would be the first rune of the identifier;
// it only selects the error message, and only if c is a unicode number.
func (s *scanner) isCompatRune(c rune, start bool) bool {
	if !gcCompat || c < utf8.RuneSelf {
		return false
	}
	if start && unicode.IsNumber(c) {
		s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
	} else {
		s.error(fmt.Sprintf("invalid identifier character %#U", c))
	}
	return true
}

// hash is a perfect hash function for keywords.
// It assumes that s has at least length 2.
// The result is masked to be a valid index into keywordMap.
func hash(s []byte) uint {
	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
}

// keywordMap maps hash(keyword) to the keyword's token.
// Entries with value 0 are unused.
var keywordMap [1 << 6]token // size must be power of two

// init fills keywordMap with the tokens _Break through _Var and panics
// if two of them hash to the same entry.
func init() {
	// populate keywordMap
	for tok := _Break; tok <= _Var; tok++ {
		h := hash([]byte(tokstrings[tok]))
		if keywordMap[h] != 0 {
			panic("imperfect hash")
		}
		keywordMap[h] = tok
	}
}

// number scans a numeric literal. c is the literal's first character,
// already read by the caller: either a decimal digit, or '.' for a
// literal that starts with its fractional part.
//
// On return s.tok is _Literal, s.lit is the literal text, s.kind is
// IntLit, FloatLit or ImagLit, and nlsemi is set. Malformed literals
// are reported via s.error but still produce a _Literal token.
func (s *scanner) number(c rune) {
	s.startLit()

	if c != '.' {
		s.kind = IntLit // until proven otherwise
		if c == '0' {
			c = s.getr()
			if c == 'x' || c == 'X' {
				// hex
				c = s.getr()
				hasDigit := false
				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
					c = s.getr()
					hasDigit = true
				}
				if !hasDigit {
					s.error("malformed hex constant")
				}
				goto done
			}

			// decimal 0, octal, or float
			// (8 and 9 are accepted here since the digits may still turn
			// out to belong to a float or imaginary literal)
			has8or9 := false
			for isDigit(c) {
				if c > '7' {
					has8or9 = true
				}
				c = s.getr()
			}
			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
				// octal
				if has8or9 {
					s.error("malformed octal constant")
				}
				goto done
			}

		} else {
			// decimal or float
			for isDigit(c) {
				c = s.getr()
			}
		}
	}

	// float
	if c == '.' {
		s.kind = FloatLit
		c = s.getr()
		for isDigit(c) {
			c = s.getr()
		}
	}

	// exponent
	if c == 'e' || c == 'E' {
		s.kind = FloatLit
		c = s.getr()
		if c == '-' || c == '+' {
			c = s.getr()
		}
		if !isDigit(c) {
			s.error("malformed floating-point constant exponent")
		}
		for isDigit(c) {
			c = s.getr()
		}
	}

	// complex
	if c == 'i' {
		s.kind = ImagLit
		s.getr() // read one rune past the 'i' so that the ungetr below applies uniformly
	}

done:
	s.ungetr() // the last rune read is not part of the literal
	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.tok = _Literal
}

// stdString scans a double-quoted string literal; the opening '"' has
// already been read by the caller. A newline or EOF before the closing
// quote is reported as an error and ends the literal.
//
// On return s.tok is _Literal, s.kind is StringLit, s.lit is the
// literal text and nlsemi is set.
func (s *scanner) stdString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '"' {
			break
		}
		if r == '\\' {
			s.escape('"')
			continue
		}
		if r == '\n' {
			s.ungetr() // assume newline is not part of literal
			s.error("newline in string")
			break
		}
		if r < 0 {
			// EOF: report the error at the start of the literal
			s.error_at(s.pos, s.line, "string not terminated")
			break
		}
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// rawString scans a back-quoted string literal; the opening '`' has
// already been read by the caller. Every rune up to the closing '`' is
// accepted, including newlines. EOF before the closing quote is
// reported as an error at the start of the literal.
//
// On return s.tok is _Literal, s.kind is StringLit, s.lit is the
// literal text and nlsemi is set.
func (s *scanner) rawString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '`' {
			break
		}
		if r < 0 {
			s.error_at(s.pos, s.line, "string not terminated")
			break
		}
	}
	// We leave CRs in the string since they are part of the
	// literal (even though they are not part of the literal
	// value).

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// rune scans a character literal; the opening '\'' has already been
// read by the caller. It accepts a single rune or escape sequence
// followed by the closing quote and reports an error for an empty
// literal, a newline inside the literal, or a missing closing quote.
//
// On return s.tok is _Literal, s.kind is RuneLit, s.lit is the literal
// text and nlsemi is set.
func (s *scanner) rune() {
	s.startLit()

	r := s.getr()
	ok := false // whether the literal is error-free so far
	if r == '\'' {
		s.error("empty character literal or unescaped ' in character literal")
	} else if r == '\n' {
		s.ungetr() // assume newline is not part of literal
		s.error("newline in character literal")
	} else {
		ok = true
		if r == '\\' {
			ok = s.escape('\'')
		}
	}

	r = s.getr()
	if r != '\'' {
		// only report error if we're ok so far
		if ok {
			s.error("missing '")
		}
		s.ungetr()
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = RuneLit
	s.tok = _Literal
}

// lineComment skips a line comment; the leading "//" has already been
// read by the caller. The terminating '\n' is not consumed.
//
// If a pragma handler is installed and the comment text starts with
// "go:" or "line ", the comment is passed to the handler and its result
// is OR-ed into s.pragma.
func (s *scanner) lineComment() {
	// recognize pragmas
	var prefix string
	r := s.getr()
	if s.pragh == nil {
		goto skip
	}

	switch r {
	case 'g':
		prefix = "go:"
	case 'l':
		prefix = "line "
	default:
		goto skip
	}

	// Capture the comment text while checking it against the prefix;
	// on a mismatch the capture is abandoned and the comment is skipped
	// like any other.
	s.startLit()
	for _, m := range prefix {
		if r != m {
			s.stopLit()
			goto skip
		}
		r = s.getr()
	}

	// consume the rest of the line, up to but not including '\n'
	for r >= 0 {
		if r == '\n' {
			s.ungetr()
			break
		}
		r = s.getr()
	}
	// A trailing "\r" is stripped from the text handed to the handler.
	// NOTE(review): the handler's first argument is always 0 here; its
	// meaning is defined by PragmaHandler, which is not shown - confirm.
	s.pragma |= s.pragh(0, s.line, strings.TrimSuffix(string(s.stopLit()), "\r"))
	return

skip:
	// consume line
	for r != '\n' && r >= 0 {
		r = s.getr()
	}
	s.ungetr() // don't consume '\n' - needed for nlsemi logic
}

// fullComment skips a general comment; the leading "/*" has already
// been read by the caller. It returns after the closing "*/" has been
// consumed. If EOF is reached first, an error is reported at the start
// of the comment (s.pos, s.line).
func (s *scanner) fullComment() {
	for {
		r := s.getr()
		// A run of '*' may be followed by the closing '/'.
		for r == '*' {
			r = s.getr()
			if r == '/' {
				return
			}
		}
		if r < 0 {
			s.error_at(s.pos, s.line, "comment not terminated")
			return
		}
	}
}

// escape scans an escape sequence; the leading '\\' has already been
// read by the caller. quote is the quote character of the enclosing
// literal, which is itself a valid escaped character.
//
// escape returns false if it reported an error. It returns true for a
// valid escape sequence, and also if EOF was reached inside the escape
// sequence; in that case no error is reported and it is left to the
// caller to complain about the unterminated literal.
func (s *scanner) escape(quote rune) bool {
	var n int            // number of digits expected
	var base, max uint32 // radix of the digits and largest permitted value

	c := s.getr()
	switch c {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// c already is the first of the three octal digits
		n, base, max = 3, 8, 255
	case 'x':
		c = s.getr()
		n, base, max = 2, 16, 255
	case 'u':
		c = s.getr()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		c = s.getr()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		if c < 0 {
			return true // complain in caller about EOF
		}
		s.error("unknown escape sequence")
		return false
	}

	// Accumulate exactly n digits of the given base in x.
	var x uint32
	for i := n; i > 0; i-- {
		d := base // digit value of c; stays at base (invalid) if c is not a digit
		switch {
		case isDigit(c):
			d = uint32(c) - '0'
		case 'a' <= c && c <= 'f':
			d = uint32(c) - ('a' - 10)
		case 'A' <= c && c <= 'F':
			d = uint32(c) - ('A' - 10)
		}
		if d >= base {
			if c < 0 {
				return true // complain in caller about EOF
			}
			if gcCompat {
				name := "hex"
				if base == 8 {
					name = "octal"
				}
				s.error(fmt.Sprintf("non-%s character in escape sequence: %c", name, c))
			} else {
				if c != quote {
					s.error(fmt.Sprintf("illegal character %#U in escape sequence", c))
				} else {
					s.error("escape sequence incomplete")
				}
			}
			s.ungetr() // leave the offending rune for the caller
			return false
		}
		// d < base
		x = x*base + d
		c = s.getr()
	}
	s.ungetr() // the rune following the last digit is not part of the escape

	if x > max && base == 8 {
		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
		return false
	}

	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
		s.error("escape sequence is invalid Unicode code point")
		return false
	}

	return true
}