// github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/go/scanner/scanner.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
//
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position is derived from the source
// offset the scanner associates with the error, usually the beginning of
// the offending token.
//
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; a negative value means end-of-file
	offset     int  // character offset (byte offset of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (byte offset at which the current line starts)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const bom = 0xFEFF // byte order mark, only permitted as very first character

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
55 // 56 func (s *Scanner) next() { 57 if s.rdOffset < len(s.src) { 58 s.offset = s.rdOffset 59 if s.ch == '\n' { 60 s.lineOffset = s.offset 61 s.file.AddLine(s.offset) 62 } 63 r, w := rune(s.src[s.rdOffset]), 1 64 switch { 65 case r == 0: 66 s.error(s.offset, "illegal character NUL") 67 case r >= utf8.RuneSelf: 68 // not ASCII 69 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 70 if r == utf8.RuneError && w == 1 { 71 s.error(s.offset, "illegal UTF-8 encoding") 72 } else if r == bom && s.offset > 0 { 73 s.error(s.offset, "illegal byte order mark") 74 } 75 } 76 s.rdOffset += w 77 s.ch = r 78 } else { 79 s.offset = len(s.src) 80 if s.ch == '\n' { 81 s.lineOffset = s.offset 82 s.file.AddLine(s.offset) 83 } 84 s.ch = -1 // eof 85 } 86 } 87 88 // A mode value is a set of flags (or 0). 89 // They control scanner behavior. 90 // 91 type Mode uint 92 93 const ( 94 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 95 dontInsertSemis // do not automatically insert semicolons - for testing only 96 ) 97 98 // Init prepares the scanner s to tokenize the text src by setting the 99 // scanner at the beginning of src. The scanner uses the file set file 100 // for position information and it adds line information for each line. 101 // It is ok to re-use the same file when re-scanning the same file as 102 // line information which is already present is ignored. Init causes a 103 // panic if the file size does not match the src size. 104 // 105 // Calls to Scan will invoke the error handler err if they encounter a 106 // syntax error and err is not nil. Also, for each error encountered, 107 // the Scanner field ErrorCount is incremented by one. The mode parameter 108 // determines how comments are handled. 109 // 110 // Note that Init may call err if there is an error in the first character 111 // of the file. 
112 // 113 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 114 // Explicitly initialize all fields since a scanner may be reused. 115 if file.Size() != len(src) { 116 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 117 } 118 s.file = file 119 s.dir, _ = filepath.Split(file.Name()) 120 s.src = src 121 s.err = err 122 s.mode = mode 123 124 s.ch = ' ' 125 s.offset = 0 126 s.rdOffset = 0 127 s.lineOffset = 0 128 s.insertSemi = false 129 s.ErrorCount = 0 130 131 s.next() 132 if s.ch == bom { 133 s.next() // ignore BOM at file beginning 134 } 135 } 136 137 func (s *Scanner) error(offs int, msg string) { 138 if s.err != nil { 139 s.err(s.file.Position(s.file.Pos(offs)), msg) 140 } 141 s.ErrorCount++ 142 } 143 144 var prefix = []byte("//line ") 145 146 func (s *Scanner) interpretLineComment(text []byte) { 147 if bytes.HasPrefix(text, prefix) { 148 // get filename and line number, if any 149 if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 150 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 151 // valid //line filename:line comment 152 filename := string(bytes.TrimSpace(text[len(prefix):i])) 153 if filename != "" { 154 filename = filepath.Clean(filename) 155 if !filepath.IsAbs(filename) { 156 // make filename relative to current directory 157 filename = filepath.Join(s.dir, filename) 158 } 159 } 160 // update scanner position 161 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 162 } 163 } 164 } 165 } 166 167 func (s *Scanner) scanComment() string { 168 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 169 offs := s.offset - 1 // position of initial '/' 170 hasCR := false 171 172 if s.ch == '/' { 173 //-style comment 174 s.next() 175 for s.ch != '\n' && s.ch >= 0 { 176 if s.ch == '\r' { 177 hasCR = true 178 } 179 s.next() 180 } 181 if offs == s.lineOffset { 182 // comment starts at the beginning of 
the current line 183 s.interpretLineComment(s.src[offs:s.offset]) 184 } 185 goto exit 186 } 187 188 /*-style comment */ 189 s.next() 190 for s.ch >= 0 { 191 ch := s.ch 192 if ch == '\r' { 193 hasCR = true 194 } 195 s.next() 196 if ch == '*' && s.ch == '/' { 197 s.next() 198 goto exit 199 } 200 } 201 202 s.error(offs, "comment not terminated") 203 204 exit: 205 lit := s.src[offs:s.offset] 206 if hasCR { 207 lit = stripCR(lit, lit[1] == '*') 208 } 209 210 return string(lit) 211 } 212 213 func (s *Scanner) findLineEnd() bool { 214 // initial '/' already consumed 215 216 defer func(offs int) { 217 // reset scanner state to where it was upon calling findLineEnd 218 s.ch = '/' 219 s.offset = offs 220 s.rdOffset = offs + 1 221 s.next() // consume initial '/' again 222 }(s.offset - 1) 223 224 // read ahead until a newline, EOF, or non-comment token is found 225 for s.ch == '/' || s.ch == '*' { 226 if s.ch == '/' { 227 //-style comment always contains a newline 228 return true 229 } 230 /*-style comment: look for newline */ 231 s.next() 232 for s.ch >= 0 { 233 ch := s.ch 234 if ch == '\n' { 235 return true 236 } 237 s.next() 238 if ch == '*' && s.ch == '/' { 239 s.next() 240 break 241 } 242 } 243 s.skipWhitespace() // s.insertSemi is set 244 if s.ch < 0 || s.ch == '\n' { 245 return true 246 } 247 if s.ch != '/' { 248 // non-comment token 249 return false 250 } 251 s.next() // consume '/' 252 } 253 254 return false 255 } 256 257 func isLetter(ch rune) bool { 258 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 259 } 260 261 func isDigit(ch rune) bool { 262 return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 263 } 264 265 func (s *Scanner) scanIdentifier() string { 266 offs := s.offset 267 for isLetter(s.ch) || isDigit(s.ch) { 268 s.next() 269 } 270 return string(s.src[offs:s.offset]) 271 } 272 273 func digitVal(ch rune) int { 274 switch { 275 case '0' <= ch && ch <= '9': 276 return 
int(ch - '0') 277 case 'a' <= ch && ch <= 'f': 278 return int(ch - 'a' + 10) 279 case 'A' <= ch && ch <= 'F': 280 return int(ch - 'A' + 10) 281 } 282 return 16 // larger than any legal digit val 283 } 284 285 func (s *Scanner) scanMantissa(base int) { 286 for digitVal(s.ch) < base { 287 s.next() 288 } 289 } 290 291 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 292 // digitVal(s.ch) < 10 293 offs := s.offset 294 tok := token.INT 295 296 if seenDecimalPoint { 297 offs-- 298 tok = token.FLOAT 299 s.scanMantissa(10) 300 goto exponent 301 } 302 303 if s.ch == '0' { 304 // int or float 305 offs := s.offset 306 s.next() 307 if s.ch == 'x' || s.ch == 'X' { 308 // hexadecimal int 309 s.next() 310 s.scanMantissa(16) 311 if s.offset-offs <= 2 { 312 // only scanned "0x" or "0X" 313 s.error(offs, "illegal hexadecimal number") 314 } 315 } else { 316 // octal int or float 317 seenDecimalDigit := false 318 s.scanMantissa(8) 319 if s.ch == '8' || s.ch == '9' { 320 // illegal octal int or float 321 seenDecimalDigit = true 322 s.scanMantissa(10) 323 } 324 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' { 325 goto fraction 326 } 327 // octal int 328 if seenDecimalDigit { 329 s.error(offs, "illegal octal number") 330 } 331 } 332 goto exit 333 } 334 335 // decimal int or float 336 s.scanMantissa(10) 337 338 fraction: 339 if s.ch == '.' { 340 tok = token.FLOAT 341 s.next() 342 s.scanMantissa(10) 343 } 344 345 exponent: 346 if s.ch == 'e' || s.ch == 'E' { 347 tok = token.FLOAT 348 s.next() 349 if s.ch == '-' || s.ch == '+' { 350 s.next() 351 } 352 if digitVal(s.ch) < 10 { 353 s.scanMantissa(10) 354 } else { 355 s.error(offs, "illegal floating-point exponent") 356 } 357 } 358 359 if s.ch == 'i' { 360 tok = token.IMAG 361 s.next() 362 } 363 364 exit: 365 return tok, string(s.src[offs:s.offset]) 366 } 367 368 // scanEscape parses an escape sequence where rune is the accepted 369 // escaped quote. 
In case of a syntax error, it stops at the offending 370 // character (without consuming it) and returns false. Otherwise 371 // it returns true. 372 func (s *Scanner) scanEscape(quote rune) bool { 373 offs := s.offset 374 375 var n int 376 var base, max uint32 377 switch s.ch { 378 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 379 s.next() 380 return true 381 case '0', '1', '2', '3', '4', '5', '6', '7': 382 n, base, max = 3, 8, 255 383 case 'x': 384 s.next() 385 n, base, max = 2, 16, 255 386 case 'u': 387 s.next() 388 n, base, max = 4, 16, unicode.MaxRune 389 case 'U': 390 s.next() 391 n, base, max = 8, 16, unicode.MaxRune 392 default: 393 msg := "unknown escape sequence" 394 if s.ch < 0 { 395 msg = "escape sequence not terminated" 396 } 397 s.error(offs, msg) 398 return false 399 } 400 401 var x uint32 402 for n > 0 { 403 d := uint32(digitVal(s.ch)) 404 if d >= base { 405 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 406 if s.ch < 0 { 407 msg = "escape sequence not terminated" 408 } 409 s.error(s.offset, msg) 410 return false 411 } 412 x = x*base + d 413 s.next() 414 n-- 415 } 416 417 if x > max || 0xD800 <= x && x < 0xE000 { 418 s.error(offs, "escape sequence is invalid Unicode code point") 419 return false 420 } 421 422 return true 423 } 424 425 func (s *Scanner) scanRune() string { 426 // '\'' opening already consumed 427 offs := s.offset - 1 428 429 valid := true 430 n := 0 431 for { 432 ch := s.ch 433 if ch == '\n' || ch < 0 { 434 // only report error if we don't have one already 435 if valid { 436 s.error(offs, "rune literal not terminated") 437 valid = false 438 } 439 break 440 } 441 s.next() 442 if ch == '\'' { 443 break 444 } 445 n++ 446 if ch == '\\' { 447 if !s.scanEscape('\'') { 448 valid = false 449 } 450 // continue to read to closing quote 451 } 452 } 453 454 if valid && n != 1 { 455 s.error(offs, "illegal rune literal") 456 } 457 458 return string(s.src[offs:s.offset]) 459 } 460 461 func (s *Scanner) scanString() 
string { 462 // '"' opening already consumed 463 offs := s.offset - 1 464 465 for { 466 ch := s.ch 467 if ch == '\n' || ch < 0 { 468 s.error(offs, "string literal not terminated") 469 break 470 } 471 s.next() 472 if ch == '"' { 473 break 474 } 475 if ch == '\\' { 476 s.scanEscape('"') 477 } 478 } 479 480 return string(s.src[offs:s.offset]) 481 } 482 483 func stripCR(b []byte, comment bool) []byte { 484 c := make([]byte, len(b)) 485 i := 0 486 for j, ch := range b { 487 // In a /*-style comment, don't strip \r from *\r/ (incl. 488 // sequences of \r from *\r\r...\r/) since the resulting 489 // */ would terminate the comment too early unless the \r 490 // is immediately following the opening /* in which case 491 // it's ok because /*/ is not closed yet (issue #11151). 492 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' { 493 c[i] = ch 494 i++ 495 } 496 } 497 return c[:i] 498 } 499 500 func (s *Scanner) scanRawString() string { 501 // '`' opening already consumed 502 offs := s.offset - 1 503 504 hasCR := false 505 for { 506 ch := s.ch 507 if ch < 0 { 508 s.error(offs, "raw string literal not terminated") 509 break 510 } 511 s.next() 512 if ch == '`' { 513 break 514 } 515 if ch == '\r' { 516 hasCR = true 517 } 518 } 519 520 lit := s.src[offs:s.offset] 521 if hasCR { 522 lit = stripCR(lit, false) 523 } 524 525 return string(lit) 526 } 527 528 func (s *Scanner) skipWhitespace() { 529 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 530 s.next() 531 } 532 } 533 534 // Helper functions for scanning multi-byte tokens such as >> += >>= . 535 // Different routines recognize different length tok_i based on matches 536 // of ch_i. If a token ends in '=', the result is tok1 or tok3 537 // respectively. Otherwise, the result is tok0 if there was no other 538 // matching character, or tok2 if the matching character was ch2. 
539 540 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 541 if s.ch == '=' { 542 s.next() 543 return tok1 544 } 545 return tok0 546 } 547 548 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 549 if s.ch == '=' { 550 s.next() 551 return tok1 552 } 553 if s.ch == ch2 { 554 s.next() 555 return tok2 556 } 557 return tok0 558 } 559 560 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 561 if s.ch == '=' { 562 s.next() 563 return tok1 564 } 565 if s.ch == ch2 { 566 s.next() 567 if s.ch == '=' { 568 s.next() 569 return tok3 570 } 571 return tok2 572 } 573 return tok0 574 } 575 576 // Scan scans the next token and returns the token position, the token, 577 // and its literal string if applicable. The source end is indicated by 578 // token.EOF. 579 // 580 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 581 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 582 // has the corresponding value. 583 // 584 // If the returned token is a keyword, the literal string is the keyword. 585 // 586 // If the returned token is token.SEMICOLON, the corresponding 587 // literal string is ";" if the semicolon was present in the source, 588 // and "\n" if the semicolon was inserted because of a newline or 589 // at EOF. 590 // 591 // If the returned token is token.ILLEGAL, the literal string is the 592 // offending character. 593 // 594 // In all other cases, Scan returns an empty literal string. 595 // 596 // For more tolerant parsing, Scan will return a valid token if 597 // possible even if a syntax error was encountered. Thus, even 598 // if the resulting token sequence contains no illegal tokens, 599 // a client may not assume that no error occurred. Instead it 600 // must check the scanner's ErrorCount or the number of calls 601 // of the error handler, if there was one installed. 
602 // 603 // Scan adds line information to the file added to the file 604 // set with Init. Token positions are relative to that file 605 // and thus relative to the file set. 606 // 607 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 608 scanAgain: 609 s.skipWhitespace() 610 611 // current token start 612 pos = s.file.Pos(s.offset) 613 614 // determine token value 615 insertSemi := false 616 switch ch := s.ch; { 617 case isLetter(ch): 618 lit = s.scanIdentifier() 619 if len(lit) > 1 { 620 // keywords are longer than one letter - avoid lookup otherwise 621 tok = token.Lookup(lit) 622 switch tok { 623 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: 624 insertSemi = true 625 } 626 } else { 627 insertSemi = true 628 tok = token.IDENT 629 } 630 case '0' <= ch && ch <= '9': 631 insertSemi = true 632 tok, lit = s.scanNumber(false) 633 default: 634 s.next() // always make progress 635 switch ch { 636 case -1: 637 if s.insertSemi { 638 s.insertSemi = false // EOF consumed 639 return pos, token.SEMICOLON, "\n" 640 } 641 tok = token.EOF 642 case '\n': 643 // we only reach here if s.insertSemi was 644 // set in the first place and exited early 645 // from s.skipWhitespace() 646 s.insertSemi = false // newline consumed 647 return pos, token.SEMICOLON, "\n" 648 case '"': 649 insertSemi = true 650 tok = token.STRING 651 lit = s.scanString() 652 case '\'': 653 insertSemi = true 654 tok = token.CHAR 655 lit = s.scanRune() 656 case '`': 657 insertSemi = true 658 tok = token.STRING 659 lit = s.scanRawString() 660 case ':': 661 tok = s.switch2(token.COLON, token.DEFINE) 662 case '.': 663 if '0' <= s.ch && s.ch <= '9' { 664 insertSemi = true 665 tok, lit = s.scanNumber(true) 666 } else if s.ch == '.' { 667 s.next() 668 if s.ch == '.' 
{ 669 s.next() 670 tok = token.ELLIPSIS 671 } 672 } else { 673 tok = token.PERIOD 674 } 675 case ',': 676 tok = token.COMMA 677 case ';': 678 tok = token.SEMICOLON 679 lit = ";" 680 case '(': 681 tok = token.LPAREN 682 case ')': 683 insertSemi = true 684 tok = token.RPAREN 685 case '[': 686 tok = token.LBRACK 687 case ']': 688 insertSemi = true 689 tok = token.RBRACK 690 case '{': 691 tok = token.LBRACE 692 case '}': 693 insertSemi = true 694 tok = token.RBRACE 695 case '+': 696 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC) 697 if tok == token.INC { 698 insertSemi = true 699 } 700 case '-': 701 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC) 702 if tok == token.DEC { 703 insertSemi = true 704 } 705 case '*': 706 tok = s.switch2(token.MUL, token.MUL_ASSIGN) 707 case '/': 708 if s.ch == '/' || s.ch == '*' { 709 // comment 710 if s.insertSemi && s.findLineEnd() { 711 // reset position to the beginning of the comment 712 s.ch = '/' 713 s.offset = s.file.Offset(pos) 714 s.rdOffset = s.offset + 1 715 s.insertSemi = false // newline consumed 716 return pos, token.SEMICOLON, "\n" 717 } 718 comment := s.scanComment() 719 if s.mode&ScanComments == 0 { 720 // skip comment 721 s.insertSemi = false // newline consumed 722 goto scanAgain 723 } 724 tok = token.COMMENT 725 lit = comment 726 } else { 727 tok = s.switch2(token.QUO, token.QUO_ASSIGN) 728 } 729 case '%': 730 tok = s.switch2(token.REM, token.REM_ASSIGN) 731 case '^': 732 tok = s.switch2(token.XOR, token.XOR_ASSIGN) 733 case '<': 734 if s.ch == '-' { 735 s.next() 736 tok = token.ARROW 737 } else { 738 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) 739 } 740 case '>': 741 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN) 742 case '=': 743 tok = s.switch2(token.ASSIGN, token.EQL) 744 case '!': 745 tok = s.switch2(token.NOT, token.NEQ) 746 case '&': 747 if s.ch == '^' { 748 s.next() 749 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) 750 } else 
{ 751 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND) 752 } 753 case '|': 754 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) 755 default: 756 // next reports unexpected BOMs - don't repeat 757 if ch != bom { 758 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) 759 } 760 insertSemi = s.insertSemi // preserve insertSemi info 761 tok = token.ILLEGAL 762 lit = string(ch) 763 } 764 } 765 if s.mode&dontInsertSemis == 0 { 766 s.insertSemi = insertSemi 767 } 768 769 return 770 }