// Origin: github.com/FenixAra/go@v0.0.0-20170127160404-96ea0918e670/src/cmd/compile/internal/syntax/scanner.go

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

import (
	"fmt"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"
)

// scanner turns the characters delivered by the embedded source into
// tokens. Each call to next scans one token and records it in the
// "current token" fields below.
type scanner struct {
	source
	nlsemi bool   // if set '\n' and EOF translate to ';'
	pragma Pragma // results of all pragma handler calls, or-ed together by lineComment

	// current token, valid after calling next()
	pos, line int
	tok       token
	lit       string   // valid if tok is _Name or _Literal
	kind      LitKind  // valid if tok is _Literal
	op        Operator // valid if tok is _Operator, _AssignOp, or _IncOp
	prec      int      // valid if tok is _Operator, _AssignOp, or _IncOp

	pragh PragmaHandler // called by lineComment for "go:" and "line " comments; may be nil
}

// init prepares s for scanning src. src and errh are handed to the
// embedded source's init; pragh is stored for use by lineComment and may
// be nil, in which case line comments are skipped without inspection.
func (s *scanner) init(src io.Reader, errh ErrorHandler, pragh PragmaHandler) {
	s.source.init(src, errh)
	s.nlsemi = false
	s.pragh = pragh
}

// next scans the next token and records it in s.tok, together with its
// start position (s.pos, s.line) and, depending on the token, s.lit,
// s.kind, s.op, and s.prec.
//
// If the previously scanned token set s.nlsemi, a newline, EOF, or a
// comment spanning multiple lines is reported as _Semi. The flag is
// cleared on entry and set again only by tokens after which a newline
// must act as a semicolon.
//
// Comments are skipped. Illegal characters are reported via s.error and
// skipped as well; '~' is reported as an error and then treated like '^'.
func (s *scanner) next() {
	// Remember the flag left behind by the previous token; s.nlsemi itself
	// is reset so that only the token scanned below can set it again.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space
	c := s.getr()
	for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
		c = s.getr()
	}

	// token start
	s.pos, s.line = s.source.pos0(), s.source.line0

	if isLetter(c) || c >= utf8.RuneSelf && (unicode.IsLetter(c) || s.isCompatRune(c, true)) {
		s.ident()
		return
	}

	switch c {
	case -1:
		// end of input
		if nlsemi {
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached if nlsemi is set; otherwise '\n' was skipped as white space
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(c)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.tok = _Lparen

	case '[':
		s.tok = _Lbrack

	case '{':
		s.tok = _Lbrace

	case ',':
		s.tok = _Comma

	case ';':
		s.tok = _Semi

	case ')':
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		if s.getr() == '=' {
			s.tok = _Define
			break
		}
		s.ungetr()
		s.tok = _Colon

	case '.':
		c = s.getr()
		if isDigit(c) {
			s.ungetr()
			s.source.r0-- // make sure '.' is part of literal (line cannot have changed)
			s.number('.')
			break
		}
		if c == '.' {
			c = s.getr()
			if c == '.' {
				s.tok = _DotDotDot
				break
			}
			// ".." followed by something else: give back both the
			// third character and the second '.'.
			s.ungetr()
			s.source.r0-- // make next ungetr work (line cannot have changed)
		}
		s.ungetr()
		s.tok = _Dot

	case '+':
		s.op, s.prec = Add, precAdd
		c = s.getr()
		if c != '+' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		s.op, s.prec = Sub, precAdd
		c = s.getr()
		if c != '-' {
			goto assignop
		}
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.op, s.prec = Mul, precMul
		// don't goto assignop - want _Star token
		if s.getr() == '=' {
			s.tok = _AssignOp
			break
		}
		s.ungetr()
		s.tok = _Star

	case '/':
		c = s.getr()
		if c == '/' {
			s.lineComment()
			goto redo
		}
		if c == '*' {
			s.fullComment()
			if s.source.line > s.line && nlsemi {
				// A multi-line comment acts like a newline;
				// it translates to a ';' if nlsemi is set.
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.op, s.prec = Rem, precMul
		c = s.getr()
		goto assignop

	case '&':
		c = s.getr()
		if c == '&' {
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if c == '^' {
			s.op = AndNot
			c = s.getr()
		}
		goto assignop

	case '|':
		c = s.getr()
		if c == '|' {
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '~':
		s.error("bitwise complement operator is ^")
		fallthrough

	case '^':
		s.op, s.prec = Xor, precAdd
		c = s.getr()
		goto assignop

	case '<':
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if c == '<' {
			s.op, s.prec = Shl, precMul
			c = s.getr()
			goto assignop
		}
		if c == '-' {
			s.tok = _Arrow
			break
		}
		s.ungetr()
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		c = s.getr()
		if c == '=' {
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if c == '>' {
			s.op, s.prec = Shr, precMul
			c = s.getr()
			goto assignop
		}
		s.ungetr()
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		if s.getr() == '=' {
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.tok = _Assign

	case '!':
		if s.getr() == '=' {
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.ungetr()
		s.op, s.prec = Not, 0
		s.tok = _Operator

	default:
		// report the character and continue with whatever follows it
		s.tok = 0
		s.error(fmt.Sprintf("illegal character %#U", c))
		goto redo
	}

	return

assignop:
	// Shared tail for binary operators: s.op and s.prec are already set and
	// c holds the character following the operator. A '=' makes it an
	// assignment operator; anything else is given back to the source.
	if c == '=' {
		s.tok = _AssignOp
		return
	}
	s.ungetr()
	s.tok = _Operator
}

// isLetter reports whether c is an ASCII letter or an underscore.
func isLetter(c rune) bool {
	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
}

// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

// ident scans an identifier or keyword; next calls it after reading the
// first character. For a keyword, s.tok is set to the keyword token and
// s.nlsemi is set only for break, continue, fallthrough, and return.
// Otherwise s.tok is _Name, s.lit holds the identifier text, and s.nlsemi
// is set.
func (s *scanner) ident() {
	s.startLit()

	// accelerate common case (7bit ASCII)
	c := s.getr()
	for isLetter(c) || isDigit(c) {
		c = s.getr()
	}

	// general case
	if c >= utf8.RuneSelf {
		for unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || s.isCompatRune(c, false) {
			c = s.getr()
		}
	}
	s.ungetr() // c is not part of the identifier

	lit := s.stopLit()

	// possibly a keyword
	// (hash requires at least two bytes, hence the length check)
	if len(lit) >= 2 {
		if tok := keywordMap[hash(lit)]; tok != 0 && tokstrings[tok] == string(lit) {
			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
			s.tok = tok
			return
		}
	}

	s.nlsemi = true
	s.lit = string(lit)
	s.tok = _Name
}

// isCompatRune reports whether the non-ASCII rune c should be consumed as
// part of an identifier even though the caller's letter/digit checks did
// not accept it. It returns false if gcCompat is not set or c is ASCII.
// Otherwise it always reports an error - a dedicated message if start is
// set and c is a Unicode number, a generic one in all other cases - and
// returns true.
func (s *scanner) isCompatRune(c rune, start bool) bool {
	if !gcCompat || c < utf8.RuneSelf {
		return false
	}
	if start && unicode.IsNumber(c) {
		s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
	} else {
		s.error(fmt.Sprintf("invalid identifier character %#U", c))
	}
	return true
}

// hash is a perfect hash function for keywords.
// It assumes that s has at least length 2.
// The result is an index into keywordMap.
func hash(s []byte) uint {
	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
}

// keywordMap maps hash(keyword) to the keyword's token; unused entries are 0.
var keywordMap [1 << 6]token // size must be power of two

// init fills keywordMap with the tokens _Break through _Var and panics if
// two of them hash to the same slot.
func init() {
	// populate keywordMap
	for tok := _Break; tok <= _Var; tok++ {
		h := hash([]byte(tokstrings[tok]))
		if keywordMap[h] != 0 {
			panic("imperfect hash")
		}
		keywordMap[h] = tok
	}
}

// number scans an integer, floating-point, or imaginary literal. c is the
// first character of the literal: a decimal digit or '.', in which case
// the literal starts with its fractional part. Malformed hex and octal
// constants and exponents without digits are reported via s.error. On
// return s.tok is _Literal, s.kind is IntLit, FloatLit, or ImagLit, s.lit
// holds the literal text, and s.nlsemi is set.
func (s *scanner) number(c rune) {
	s.startLit()

	if c != '.' {
		s.kind = IntLit // until proven otherwise
		if c == '0' {
			c = s.getr()
			if c == 'x' || c == 'X' {
				// hex
				c = s.getr()
				hasDigit := false
				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
					c = s.getr()
					hasDigit = true
				}
				if !hasDigit {
					s.error("malformed hex constant")
				}
				goto done
			}

			// decimal 0, octal, or float
			has8or9 := false
			for isDigit(c) {
				if c > '7' {
					has8or9 = true
				}
				c = s.getr()
			}
			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
				// octal
				if has8or9 {
					s.error("malformed octal constant")
				}
				goto done
			}

		} else {
			// decimal or float
			for isDigit(c) {
				c = s.getr()
			}
		}
	}

	// float
	if c == '.' {
		s.kind = FloatLit
		c = s.getr()
		for isDigit(c) {
			c = s.getr()
		}
	}

	// exponent
	if c == 'e' || c == 'E' {
		s.kind = FloatLit
		c = s.getr()
		if c == '-' || c == '+' {
			c = s.getr()
		}
		if !isDigit(c) {
			s.error("malformed floating-point constant exponent")
		}
		for isDigit(c) {
			c = s.getr()
		}
	}

	// complex
	if c == 'i' {
		s.kind = ImagLit
		s.getr() // read one past the 'i' so that the ungetr below is balanced
	}

done:
	s.ungetr() // the last character read is not part of the literal
	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.tok = _Literal
}

// stdString scans an interpreted (double-quoted) string literal; next
// calls it after reading the opening quote. A newline or EOF before the
// closing quote is reported as an error and ends the literal. On return
// s.tok is _Literal, s.kind is StringLit, s.lit holds the literal text,
// and s.nlsemi is set.
func (s *scanner) stdString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '"' {
			break
		}
		if r == '\\' {
			s.escape('"')
			continue
		}
		if r == '\n' {
			s.ungetr() // assume newline is not part of literal
			s.error("newline in string")
			break
		}
		if r < 0 {
			// report EOF at the start of the token
			s.error_at(s.pos, s.line, "string not terminated")
			break
		}
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// rawString scans a raw (back-quoted) string literal; next calls it after
// reading the opening back quote. Every character up to the closing back
// quote is accepted, including newlines; EOF before the closing back quote
// is reported as an error. On return s.tok is _Literal, s.kind is
// StringLit, s.lit holds the literal text, and s.nlsemi is set.
func (s *scanner) rawString() {
	s.startLit()

	for {
		r := s.getr()
		if r == '`' {
			break
		}
		if r < 0 {
			// report EOF at the start of the token
			s.error_at(s.pos, s.line, "string not terminated")
			break
		}
	}
	// We leave CRs in the string since they are part of the
	// literal (even though they are not part of the literal
	// value).

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = StringLit
	s.tok = _Literal
}

// rune scans a character literal; next calls it after reading the opening
// quote. An empty literal, a newline inside the literal, an invalid escape
// sequence, and a missing closing quote are reported as errors, at most
// one per literal. On return s.tok is _Literal, s.kind is RuneLit, s.lit
// holds the literal text, and s.nlsemi is set.
func (s *scanner) rune() {
	s.startLit()

	r := s.getr()
	ok := false // set if the literal's content was scanned without error
	if r == '\'' {
		s.error("empty character literal or unescaped ' in character literal")
	} else if r == '\n' {
		s.ungetr() // assume newline is not part of literal
		s.error("newline in character literal")
	} else {
		ok = true
		if r == '\\' {
			ok = s.escape('\'')
		}
	}

	r = s.getr()
	if r != '\'' {
		// only report error if we're ok so far
		if ok {
			s.error("missing '")
		}
		s.ungetr()
	}

	s.nlsemi = true
	s.lit = string(s.stopLit())
	s.kind = RuneLit
	s.tok = _Literal
}

// lineComment consumes the remainder of a //-style comment; next calls it
// after reading the "//". The terminating '\n' is not consumed.
//
// If a pragma handler is installed and the comment text starts with "go:"
// or "line ", the handler is called with position 0, the line of the
// current token start, and the text collected from the prefix onward with
// one trailing "\r" removed; its result is or-ed into s.pragma. All other
// comments are skipped without further inspection.
func (s *scanner) lineComment() {
	// recognize pragmas
	var prefix string
	r := s.getr()
	if s.pragh == nil {
		goto skip
	}

	// the first character selects the only prefix that can still match
	switch r {
	case 'g':
		prefix = "go:"
	case 'l':
		prefix = "line "
	default:
		goto skip
	}

	s.startLit()
	for _, m := range prefix {
		if r != m {
			s.stopLit() // not a pragma; discard what was collected
			goto skip
		}
		r = s.getr()
	}

	// prefix matched: collect the rest of the line
	for r >= 0 {
		if r == '\n' {
			s.ungetr()
			break
		}
		r = s.getr()
	}
	s.pragma |= s.pragh(0, s.line, strings.TrimSuffix(string(s.stopLit()), "\r"))
	return

skip:
	// consume line
	for r != '\n' && r >= 0 {
		r = s.getr()
	}
	s.ungetr() // don't consume '\n' - needed for nlsemi logic
}

// fullComment consumes the remainder of a /*-style comment up to and
// including the closing "*/"; next calls it after reading the "/*". If
// EOF is reached first, an error is reported at the current token start.
func (s *scanner) fullComment() {
	for {
		r := s.getr()
		// A run of '*' may end the comment; keep reading while it lasts.
		for r == '*' {
			r = s.getr()
			if r == '/' {
				return
			}
		}
		if r < 0 {
			s.error_at(s.pos, s.line, "comment not terminated")
			return
		}
	}
}

// escape scans the part of an escape sequence that follows the backslash.
// quote is the quote character of the enclosing literal; it is accepted
// as a single-character escape alongside a, b, f, n, r, t, v, and
// backslash. Octal (3 digits), \x (2 hex digits), \u (4 hex digits), and
// \U (8 hex digits) escapes are checked for digit count and value range.
//
// escape reports whether the sequence is valid. Invalid sequences are
// reported via s.error, except at EOF, where escape returns true so that
// the caller reports the unterminated literal instead.
func (s *scanner) escape(quote rune) bool {
	var n int             // number of digits expected
	var base, max uint32 // digit base and largest permitted value

	c := s.getr()
	switch c {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// c already is the first of the three octal digits
		n, base, max = 3, 8, 255
	case 'x':
		c = s.getr()
		n, base, max = 2, 16, 255
	case 'u':
		c = s.getr()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		c = s.getr()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		if c < 0 {
			return true // complain in caller about EOF
		}
		s.error("unknown escape sequence")
		return false
	}

	var x uint32 // value accumulated from the digits
	for i := n; i > 0; i-- {
		d := base // stays at base (an invalid digit value) unless c is a digit
		switch {
		case isDigit(c):
			d = uint32(c) - '0'
		case 'a' <= c && c <= 'f':
			d = uint32(c) - ('a' - 10)
		case 'A' <= c && c <= 'F':
			d = uint32(c) - ('A' - 10)
		}
		if d >= base {
			if c < 0 {
				return true // complain in caller about EOF
			}
			if gcCompat {
				name := "hex"
				if base == 8 {
					name = "octal"
				}
				s.error(fmt.Sprintf("non-%s character in escape sequence: %c", name, c))
			} else {
				if c != quote {
					s.error(fmt.Sprintf("illegal character %#U in escape sequence", c))
				} else {
					s.error("escape sequence incomplete")
				}
			}
			s.ungetr() // leave the offending character for the caller
			return false
		}
		// d < base
		x = x*base + d
		c = s.getr()
	}
	s.ungetr() // c is the character following the last digit

	if x > max && base == 8 {
		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
		return false
	}

	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
		s.error("escape sequence is invalid Unicode code point")
		return false
	}

	return true
}