// Source: github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/go/scanner/scanner.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
//
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// In short: the callback used to report scanning errors.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
// In short: the scanner (tokenizer) itself.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; < 0 means end-of-file
	offset     int  // character offset
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const bom = 0xFEFF // byte order mark, only permitted as very first character

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
56 // 57 func (s *Scanner) next() { 58 if s.rdOffset < len(s.src) { 59 s.offset = s.rdOffset 60 if s.ch == '\n' { 61 s.lineOffset = s.offset 62 s.file.AddLine(s.offset) 63 } 64 r, w := rune(s.src[s.rdOffset]), 1 65 switch { 66 case r == 0: 67 s.error(s.offset, "illegal character NUL") 68 case r >= utf8.RuneSelf: 69 // not ASCII 70 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 71 if r == utf8.RuneError && w == 1 { 72 s.error(s.offset, "illegal UTF-8 encoding") 73 } else if r == bom && s.offset > 0 { 74 s.error(s.offset, "illegal byte order mark") 75 } 76 } 77 s.rdOffset += w 78 s.ch = r 79 } else { 80 s.offset = len(s.src) 81 if s.ch == '\n' { 82 s.lineOffset = s.offset 83 s.file.AddLine(s.offset) 84 } 85 s.ch = -1 // eof 86 } 87 } 88 89 // A mode value is a set of flags (or 0). 90 // They control scanner behavior. 91 // 92 type Mode uint 93 94 const ( 95 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 96 dontInsertSemis // do not automatically insert semicolons - for testing only 97 ) 98 99 // Init prepares the scanner s to tokenize the text src by setting the 100 // scanner at the beginning of src. The scanner uses the file set file 101 // for position information and it adds line information for each line. 102 // It is ok to re-use the same file when re-scanning the same file as 103 // line information which is already present is ignored. Init causes a 104 // panic if the file size does not match the src size. 105 // 106 // Calls to Scan will invoke the error handler err if they encounter a 107 // syntax error and err is not nil. Also, for each error encountered, 108 // the Scanner field ErrorCount is incremented by one. The mode parameter 109 // determines how comments are handled. 110 // 111 // Note that Init may call err if there is an error in the first character 112 // of the file. 
113 // 114 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 115 // Explicitly initialize all fields since a scanner may be reused. 116 if file.Size() != len(src) { 117 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 118 } 119 s.file = file 120 s.dir, _ = filepath.Split(file.Name()) 121 s.src = src 122 s.err = err 123 s.mode = mode 124 125 s.ch = ' ' 126 s.offset = 0 127 s.rdOffset = 0 128 s.lineOffset = 0 129 s.insertSemi = false 130 s.ErrorCount = 0 131 132 s.next() 133 if s.ch == bom { 134 s.next() // ignore BOM at file beginning 135 } 136 } 137 138 func (s *Scanner) error(offs int, msg string) { 139 if s.err != nil { 140 s.err(s.file.Position(s.file.Pos(offs)), msg) 141 } 142 s.ErrorCount++ 143 } 144 145 var prefix = []byte("//line ") 146 147 func (s *Scanner) interpretLineComment(text []byte) { 148 if bytes.HasPrefix(text, prefix) { 149 // get filename and line number, if any 150 if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 151 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 152 // valid //line filename:line comment 153 filename := string(bytes.TrimSpace(text[len(prefix):i])) 154 if filename != "" { 155 filename = filepath.Clean(filename) 156 if !filepath.IsAbs(filename) { 157 // make filename relative to current directory 158 filename = filepath.Join(s.dir, filename) 159 } 160 } 161 // update scanner position 162 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 163 } 164 } 165 } 166 } 167 168 func (s *Scanner) scanComment() string { 169 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 170 offs := s.offset - 1 // position of initial '/' 171 hasCR := false 172 173 if s.ch == '/' { 174 //-style comment 175 s.next() 176 for s.ch != '\n' && s.ch >= 0 { 177 if s.ch == '\r' { 178 hasCR = true 179 } 180 s.next() 181 } 182 if offs == s.lineOffset { 183 // comment starts at the beginning of 
the current line 184 s.interpretLineComment(s.src[offs:s.offset]) 185 } 186 goto exit 187 } 188 189 /*-style comment */ 190 s.next() 191 for s.ch >= 0 { 192 ch := s.ch 193 if ch == '\r' { 194 hasCR = true 195 } 196 s.next() 197 if ch == '*' && s.ch == '/' { 198 s.next() 199 goto exit 200 } 201 } 202 203 s.error(offs, "comment not terminated") 204 205 exit: 206 lit := s.src[offs:s.offset] 207 if hasCR { 208 lit = stripCR(lit) 209 } 210 211 return string(lit) 212 } 213 214 func (s *Scanner) findLineEnd() bool { 215 // initial '/' already consumed 216 217 defer func(offs int) { 218 // reset scanner state to where it was upon calling findLineEnd 219 s.ch = '/' 220 s.offset = offs 221 s.rdOffset = offs + 1 222 s.next() // consume initial '/' again 223 }(s.offset - 1) 224 225 // read ahead until a newline, EOF, or non-comment token is found 226 for s.ch == '/' || s.ch == '*' { 227 if s.ch == '/' { 228 //-style comment always contains a newline 229 return true 230 } 231 /*-style comment: look for newline */ 232 s.next() 233 for s.ch >= 0 { 234 ch := s.ch 235 if ch == '\n' { 236 return true 237 } 238 s.next() 239 if ch == '*' && s.ch == '/' { 240 s.next() 241 break 242 } 243 } 244 s.skipWhitespace() // s.insertSemi is set 245 if s.ch < 0 || s.ch == '\n' { 246 return true 247 } 248 if s.ch != '/' { 249 // non-comment token 250 return false 251 } 252 s.next() // consume '/' 253 } 254 255 return false 256 } 257 258 func isLetter(ch rune) bool { 259 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 260 } 261 262 func isDigit(ch rune) bool { 263 return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 264 } 265 266 func (s *Scanner) scanIdentifier() string { 267 offs := s.offset 268 for isLetter(s.ch) || isDigit(s.ch) { 269 s.next() 270 } 271 return string(s.src[offs:s.offset]) 272 } 273 274 func digitVal(ch rune) int { 275 switch { 276 case '0' <= ch && ch <= '9': 277 return int(ch - '0') 278 
case 'a' <= ch && ch <= 'f': 279 return int(ch - 'a' + 10) 280 case 'A' <= ch && ch <= 'F': 281 return int(ch - 'A' + 10) 282 } 283 return 16 // larger than any legal digit val 284 } 285 286 func (s *Scanner) scanMantissa(base int) { 287 for digitVal(s.ch) < base { 288 s.next() 289 } 290 } 291 292 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 293 // digitVal(s.ch) < 10 294 offs := s.offset 295 tok := token.INT 296 297 if seenDecimalPoint { 298 offs-- 299 tok = token.FLOAT 300 s.scanMantissa(10) 301 goto exponent 302 } 303 304 if s.ch == '0' { 305 // int or float 306 offs := s.offset 307 s.next() 308 if s.ch == 'x' || s.ch == 'X' { 309 // hexadecimal int 310 s.next() 311 s.scanMantissa(16) 312 if s.offset-offs <= 2 { 313 // only scanned "0x" or "0X" 314 s.error(offs, "illegal hexadecimal number") 315 } 316 } else { 317 // octal int or float 318 seenDecimalDigit := false 319 s.scanMantissa(8) 320 if s.ch == '8' || s.ch == '9' { 321 // illegal octal int or float 322 seenDecimalDigit = true 323 s.scanMantissa(10) 324 } 325 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' { 326 goto fraction 327 } 328 // octal int 329 if seenDecimalDigit { 330 s.error(offs, "illegal octal number") 331 } 332 } 333 goto exit 334 } 335 336 // decimal int or float 337 s.scanMantissa(10) 338 339 fraction: 340 if s.ch == '.' { 341 tok = token.FLOAT 342 s.next() 343 s.scanMantissa(10) 344 } 345 346 exponent: 347 if s.ch == 'e' || s.ch == 'E' { 348 tok = token.FLOAT 349 s.next() 350 if s.ch == '-' || s.ch == '+' { 351 s.next() 352 } 353 if digitVal(s.ch) < 10 { 354 s.scanMantissa(10) 355 } else { 356 s.error(offs, "illegal floating-point exponent") 357 } 358 } 359 360 if s.ch == 'i' { 361 tok = token.IMAG 362 s.next() 363 } 364 365 exit: 366 return tok, string(s.src[offs:s.offset]) 367 } 368 369 // scanEscape parses an escape sequence where rune is the accepted 370 // escaped quote. 
In case of a syntax error, it stops at the offending 371 // character (without consuming it) and returns false. Otherwise 372 // it returns true. 373 func (s *Scanner) scanEscape(quote rune) bool { 374 offs := s.offset 375 376 var n int 377 var base, max uint32 378 switch s.ch { 379 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 380 s.next() 381 return true 382 case '0', '1', '2', '3', '4', '5', '6', '7': 383 n, base, max = 3, 8, 255 384 case 'x': 385 s.next() 386 n, base, max = 2, 16, 255 387 case 'u': 388 s.next() 389 n, base, max = 4, 16, unicode.MaxRune 390 case 'U': 391 s.next() 392 n, base, max = 8, 16, unicode.MaxRune 393 default: 394 msg := "unknown escape sequence" 395 if s.ch < 0 { 396 msg = "escape sequence not terminated" 397 } 398 s.error(offs, msg) 399 return false 400 } 401 402 var x uint32 403 for n > 0 { 404 d := uint32(digitVal(s.ch)) 405 if d >= base { 406 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 407 if s.ch < 0 { 408 msg = "escape sequence not terminated" 409 } 410 s.error(s.offset, msg) 411 return false 412 } 413 x = x*base + d 414 s.next() 415 n-- 416 } 417 418 if x > max || 0xD800 <= x && x < 0xE000 { 419 s.error(offs, "escape sequence is invalid Unicode code point") 420 return false 421 } 422 423 return true 424 } 425 426 func (s *Scanner) scanRune() string { 427 // '\'' opening already consumed 428 offs := s.offset - 1 429 430 valid := true 431 n := 0 432 for { 433 ch := s.ch 434 if ch == '\n' || ch < 0 { 435 // only report error if we don't have one already 436 if valid { 437 s.error(offs, "rune literal not terminated") 438 valid = false 439 } 440 break 441 } 442 s.next() 443 if ch == '\'' { 444 break 445 } 446 n++ 447 if ch == '\\' { 448 if !s.scanEscape('\'') { 449 valid = false 450 } 451 // continue to read to closing quote 452 } 453 } 454 455 if valid && n != 1 { 456 s.error(offs, "illegal rune literal") 457 } 458 459 return string(s.src[offs:s.offset]) 460 } 461 462 func (s *Scanner) scanString() 
string { 463 // '"' opening already consumed 464 offs := s.offset - 1 465 466 for { 467 ch := s.ch 468 if ch == '\n' || ch < 0 { 469 s.error(offs, "string literal not terminated") 470 break 471 } 472 s.next() 473 if ch == '"' { 474 break 475 } 476 if ch == '\\' { 477 s.scanEscape('"') 478 } 479 } 480 481 return string(s.src[offs:s.offset]) 482 } 483 484 func stripCR(b []byte) []byte { 485 c := make([]byte, len(b)) 486 i := 0 487 for _, ch := range b { 488 if ch != '\r' { 489 c[i] = ch 490 i++ 491 } 492 } 493 return c[:i] 494 } 495 496 func (s *Scanner) scanRawString() string { 497 // '`' opening already consumed 498 offs := s.offset - 1 499 500 hasCR := false 501 for { 502 ch := s.ch 503 if ch < 0 { 504 s.error(offs, "raw string literal not terminated") 505 break 506 } 507 s.next() 508 if ch == '`' { 509 break 510 } 511 if ch == '\r' { 512 hasCR = true 513 } 514 } 515 516 lit := s.src[offs:s.offset] 517 if hasCR { 518 lit = stripCR(lit) 519 } 520 521 return string(lit) 522 } 523 524 func (s *Scanner) skipWhitespace() { 525 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 526 s.next() 527 } 528 } 529 530 // Helper functions for scanning multi-byte tokens such as >> += >>= . 531 // Different routines recognize different length tok_i based on matches 532 // of ch_i. If a token ends in '=', the result is tok1 or tok3 533 // respectively. Otherwise, the result is tok0 if there was no other 534 // matching character, or tok2 if the matching character was ch2. 
535 536 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 537 if s.ch == '=' { 538 s.next() 539 return tok1 540 } 541 return tok0 542 } 543 544 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 545 if s.ch == '=' { 546 s.next() 547 return tok1 548 } 549 if s.ch == ch2 { 550 s.next() 551 return tok2 552 } 553 return tok0 554 } 555 556 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 557 if s.ch == '=' { 558 s.next() 559 return tok1 560 } 561 if s.ch == ch2 { 562 s.next() 563 if s.ch == '=' { 564 s.next() 565 return tok3 566 } 567 return tok2 568 } 569 return tok0 570 } 571 572 // Scan scans the next token and returns the token position, the token, 573 // and its literal string if applicable. The source end is indicated by 574 // token.EOF. 575 // 576 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 577 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 578 // has the corresponding value. 579 // 580 // If the returned token is a keyword, the literal string is the keyword. 581 // 582 // If the returned token is token.SEMICOLON, the corresponding 583 // literal string is ";" if the semicolon was present in the source, 584 // and "\n" if the semicolon was inserted because of a newline or 585 // at EOF. 586 // 587 // If the returned token is token.ILLEGAL, the literal string is the 588 // offending character. 589 // 590 // In all other cases, Scan returns an empty literal string. 591 // 592 // For more tolerant parsing, Scan will return a valid token if 593 // possible even if a syntax error was encountered. Thus, even 594 // if the resulting token sequence contains no illegal tokens, 595 // a client may not assume that no error occurred. Instead it 596 // must check the scanner's ErrorCount or the number of calls 597 // of the error handler, if there was one installed. 
598 // 599 // Scan adds line information to the file added to the file 600 // set with Init. Token positions are relative to that file 601 // and thus relative to the file set. 602 // 603 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 604 scanAgain: 605 s.skipWhitespace() 606 607 // current token start 608 pos = s.file.Pos(s.offset) 609 610 // determine token value 611 insertSemi := false 612 switch ch := s.ch; { 613 case isLetter(ch): 614 lit = s.scanIdentifier() 615 if len(lit) > 1 { 616 // keywords are longer than one letter - avoid lookup otherwise 617 tok = token.Lookup(lit) 618 switch tok { 619 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: 620 insertSemi = true 621 } 622 } else { 623 insertSemi = true 624 tok = token.IDENT 625 } 626 case '0' <= ch && ch <= '9': 627 insertSemi = true 628 tok, lit = s.scanNumber(false) 629 default: 630 s.next() // always make progress 631 switch ch { 632 case -1: 633 if s.insertSemi { 634 s.insertSemi = false // EOF consumed 635 return pos, token.SEMICOLON, "\n" 636 } 637 tok = token.EOF 638 case '\n': 639 // we only reach here if s.insertSemi was 640 // set in the first place and exited early 641 // from s.skipWhitespace() 642 s.insertSemi = false // newline consumed 643 return pos, token.SEMICOLON, "\n" 644 case '"': 645 insertSemi = true 646 tok = token.STRING 647 lit = s.scanString() 648 case '\'': 649 insertSemi = true 650 tok = token.CHAR 651 lit = s.scanRune() 652 case '`': 653 insertSemi = true 654 tok = token.STRING 655 lit = s.scanRawString() 656 case ':': 657 tok = s.switch2(token.COLON, token.DEFINE) 658 case '.': 659 if '0' <= s.ch && s.ch <= '9' { 660 insertSemi = true 661 tok, lit = s.scanNumber(true) 662 } else if s.ch == '.' { 663 s.next() 664 if s.ch == '.' 
{ 665 s.next() 666 tok = token.ELLIPSIS 667 } 668 } else { 669 tok = token.PERIOD 670 } 671 case ',': 672 tok = token.COMMA 673 case ';': 674 tok = token.SEMICOLON 675 lit = ";" 676 case '(': 677 tok = token.LPAREN 678 case ')': 679 insertSemi = true 680 tok = token.RPAREN 681 case '[': 682 tok = token.LBRACK 683 case ']': 684 insertSemi = true 685 tok = token.RBRACK 686 case '{': 687 tok = token.LBRACE 688 case '}': 689 insertSemi = true 690 tok = token.RBRACE 691 case '+': 692 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC) 693 if tok == token.INC { 694 insertSemi = true 695 } 696 case '-': 697 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC) 698 if tok == token.DEC { 699 insertSemi = true 700 } 701 case '*': 702 tok = s.switch2(token.MUL, token.MUL_ASSIGN) 703 case '/': 704 if s.ch == '/' || s.ch == '*' { 705 // comment 706 if s.insertSemi && s.findLineEnd() { 707 // reset position to the beginning of the comment 708 s.ch = '/' 709 s.offset = s.file.Offset(pos) 710 s.rdOffset = s.offset + 1 711 s.insertSemi = false // newline consumed 712 return pos, token.SEMICOLON, "\n" 713 } 714 comment := s.scanComment() 715 if s.mode&ScanComments == 0 { 716 // skip comment 717 s.insertSemi = false // newline consumed 718 goto scanAgain 719 } 720 tok = token.COMMENT 721 lit = comment 722 } else { 723 tok = s.switch2(token.QUO, token.QUO_ASSIGN) 724 } 725 case '%': 726 tok = s.switch2(token.REM, token.REM_ASSIGN) 727 case '^': 728 tok = s.switch2(token.XOR, token.XOR_ASSIGN) 729 case '<': 730 if s.ch == '-' { 731 s.next() 732 tok = token.ARROW 733 } else { 734 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) 735 } 736 case '>': 737 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN) 738 case '=': 739 tok = s.switch2(token.ASSIGN, token.EQL) 740 case '!': 741 tok = s.switch2(token.NOT, token.NEQ) 742 case '&': 743 if s.ch == '^' { 744 s.next() 745 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) 746 } else 
{ 747 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND) 748 } 749 case '|': 750 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) 751 default: 752 // next reports unexpected BOMs - don't repeat 753 if ch != bom { 754 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) 755 } 756 insertSemi = s.insertSemi // preserve insertSemi info 757 tok = token.ILLEGAL 758 lit = string(ch) 759 } 760 } 761 if s.mode&dontInsertSemis == 0 { 762 s.insertSemi = insertSemi 763 } 764 765 return 766 }