// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
//
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state (set once by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; < 0 means end-of-file
	offset     int  // character offset (byte index of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (byte index where the current line starts)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const bom = 0xFEFF // byte order mark, only permitted as very first character

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
55 // 56 func (s *Scanner) next() { 57 if s.rdOffset < len(s.src) { 58 s.offset = s.rdOffset 59 if s.ch == '\n' { 60 s.lineOffset = s.offset 61 s.file.AddLine(s.offset) 62 } 63 r, w := rune(s.src[s.rdOffset]), 1 64 switch { 65 case r == 0: 66 s.error(s.offset, "illegal character NUL") 67 case r >= utf8.RuneSelf: 68 // not ASCII 69 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 70 if r == utf8.RuneError && w == 1 { 71 s.error(s.offset, "illegal UTF-8 encoding") 72 } else if r == bom && s.offset > 0 { 73 s.error(s.offset, "illegal byte order mark") 74 } 75 } 76 s.rdOffset += w 77 s.ch = r 78 } else { 79 s.offset = len(s.src) 80 if s.ch == '\n' { 81 s.lineOffset = s.offset 82 s.file.AddLine(s.offset) 83 } 84 s.ch = -1 // eof 85 } 86 } 87 88 // A mode value is a set of flags (or 0). 89 // They control scanner behavior. 90 // 91 type Mode uint 92 93 const ( 94 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 95 dontInsertSemis // do not automatically insert semicolons - for testing only 96 ) 97 98 // Init prepares the scanner s to tokenize the text src by setting the 99 // scanner at the beginning of src. The scanner uses the file set file 100 // for position information and it adds line information for each line. 101 // It is ok to re-use the same file when re-scanning the same file as 102 // line information which is already present is ignored. Init causes a 103 // panic if the file size does not match the src size. 104 // 105 // Calls to Scan will invoke the error handler err if they encounter a 106 // syntax error and err is not nil. Also, for each error encountered, 107 // the Scanner field ErrorCount is incremented by one. The mode parameter 108 // determines how comments are handled. 109 // 110 // Note that Init may call err if there is an error in the first character 111 // of the file. 
112 // 113 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 114 // Explicitly initialize all fields since a scanner may be reused. 115 if file.Size() != len(src) { 116 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 117 } 118 s.file = file 119 s.dir, _ = filepath.Split(file.Name()) 120 s.src = src 121 s.err = err 122 s.mode = mode 123 124 s.ch = ' ' 125 s.offset = 0 126 s.rdOffset = 0 127 s.lineOffset = 0 128 s.insertSemi = false 129 s.ErrorCount = 0 130 131 s.next() 132 if s.ch == bom { 133 s.next() // ignore BOM at file beginning 134 } 135 } 136 137 func (s *Scanner) error(offs int, msg string) { 138 if s.err != nil { 139 s.err(s.file.Position(s.file.Pos(offs)), msg) 140 } 141 s.ErrorCount++ 142 } 143 144 var prefix = []byte("//line ") 145 146 func (s *Scanner) interpretLineComment(text []byte) { 147 if bytes.HasPrefix(text, prefix) { 148 // get filename and line number, if any 149 if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 150 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 151 // valid //line filename:line comment 152 filename := string(bytes.TrimSpace(text[len(prefix):i])) 153 if filename != "" { 154 filename = filepath.Clean(filename) 155 if !filepath.IsAbs(filename) { 156 // make filename relative to current directory 157 filename = filepath.Join(s.dir, filename) 158 } 159 } 160 // update scanner position 161 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 162 } 163 } 164 } 165 } 166 167 func (s *Scanner) scanComment() string { 168 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 169 offs := s.offset - 1 // position of initial '/' 170 hasCR := false 171 172 if s.ch == '/' { 173 //-style comment 174 s.next() 175 for s.ch != '\n' && s.ch >= 0 { 176 if s.ch == '\r' { 177 hasCR = true 178 } 179 s.next() 180 } 181 if offs == s.lineOffset { 182 // comment starts at the beginning of 
the current line 183 s.interpretLineComment(s.src[offs:s.offset]) 184 } 185 goto exit 186 } 187 188 /*-style comment */ 189 s.next() 190 for s.ch >= 0 { 191 ch := s.ch 192 if ch == '\r' { 193 hasCR = true 194 } 195 s.next() 196 if ch == '*' && s.ch == '/' { 197 s.next() 198 goto exit 199 } 200 } 201 202 s.error(offs, "comment not terminated") 203 204 exit: 205 lit := s.src[offs:s.offset] 206 if hasCR { 207 lit = stripCR(lit) 208 } 209 210 return string(lit) 211 } 212 213 func (s *Scanner) findLineEnd() bool { 214 // initial '/' already consumed 215 216 defer func(offs int) { 217 // reset scanner state to where it was upon calling findLineEnd 218 s.ch = '/' 219 s.offset = offs 220 s.rdOffset = offs + 1 221 s.next() // consume initial '/' again 222 }(s.offset - 1) 223 224 // read ahead until a newline, EOF, or non-comment token is found 225 for s.ch == '/' || s.ch == '*' { 226 if s.ch == '/' { 227 //-style comment always contains a newline 228 return true 229 } 230 /*-style comment: look for newline */ 231 s.next() 232 for s.ch >= 0 { 233 ch := s.ch 234 if ch == '\n' { 235 return true 236 } 237 s.next() 238 if ch == '*' && s.ch == '/' { 239 s.next() 240 break 241 } 242 } 243 s.skipWhitespace() // s.insertSemi is set 244 if s.ch < 0 || s.ch == '\n' { 245 return true 246 } 247 if s.ch != '/' { 248 // non-comment token 249 return false 250 } 251 s.next() // consume '/' 252 } 253 254 return false 255 } 256 257 func isLetter(ch rune) bool { 258 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 259 } 260 261 func isDigit(ch rune) bool { 262 return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 263 } 264 265 func (s *Scanner) scanIdentifier() string { 266 offs := s.offset 267 for isLetter(s.ch) || isDigit(s.ch) { 268 s.next() 269 } 270 return string(s.src[offs:s.offset]) 271 } 272 273 func digitVal(ch rune) int { 274 switch { 275 case '0' <= ch && ch <= '9': 276 return int(ch - '0') 277 
case 'a' <= ch && ch <= 'f': 278 return int(ch - 'a' + 10) 279 case 'A' <= ch && ch <= 'F': 280 return int(ch - 'A' + 10) 281 } 282 return 16 // larger than any legal digit val 283 } 284 285 func (s *Scanner) scanMantissa(base int) { 286 for digitVal(s.ch) < base { 287 s.next() 288 } 289 } 290 291 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 292 // digitVal(s.ch) < 10 293 offs := s.offset 294 tok := token.INT 295 296 if seenDecimalPoint { 297 offs-- 298 tok = token.FLOAT 299 s.scanMantissa(10) 300 goto exponent 301 } 302 303 if s.ch == '0' { 304 // int or float 305 offs := s.offset 306 s.next() 307 if s.ch == 'x' || s.ch == 'X' { 308 // hexadecimal int 309 s.next() 310 s.scanMantissa(16) 311 if s.offset-offs <= 2 { 312 // only scanned "0x" or "0X" 313 s.error(offs, "illegal hexadecimal number") 314 } 315 } else { 316 // octal int or float 317 seenDecimalDigit := false 318 s.scanMantissa(8) 319 if s.ch == '8' || s.ch == '9' { 320 // illegal octal int or float 321 seenDecimalDigit = true 322 s.scanMantissa(10) 323 } 324 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' { 325 goto fraction 326 } 327 // octal int 328 if seenDecimalDigit { 329 s.error(offs, "illegal octal number") 330 } 331 } 332 goto exit 333 } 334 335 // decimal int or float 336 s.scanMantissa(10) 337 338 fraction: 339 if s.ch == '.' { 340 tok = token.FLOAT 341 s.next() 342 s.scanMantissa(10) 343 } 344 345 exponent: 346 if s.ch == 'e' || s.ch == 'E' { 347 tok = token.FLOAT 348 s.next() 349 if s.ch == '-' || s.ch == '+' { 350 s.next() 351 } 352 s.scanMantissa(10) 353 } 354 355 if s.ch == 'i' { 356 tok = token.IMAG 357 s.next() 358 } 359 360 exit: 361 return tok, string(s.src[offs:s.offset]) 362 } 363 364 // scanEscape parses an escape sequence where rune is the accepted 365 // escaped quote. In case of a syntax error, it stops at the offending 366 // character (without consuming it) and returns false. Otherwise 367 // it returns true. 
368 func (s *Scanner) scanEscape(quote rune) bool { 369 offs := s.offset 370 371 var n int 372 var base, max uint32 373 switch s.ch { 374 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 375 s.next() 376 return true 377 case '0', '1', '2', '3', '4', '5', '6', '7': 378 n, base, max = 3, 8, 255 379 case 'x': 380 s.next() 381 n, base, max = 2, 16, 255 382 case 'u': 383 s.next() 384 n, base, max = 4, 16, unicode.MaxRune 385 case 'U': 386 s.next() 387 n, base, max = 8, 16, unicode.MaxRune 388 default: 389 msg := "unknown escape sequence" 390 if s.ch < 0 { 391 msg = "escape sequence not terminated" 392 } 393 s.error(offs, msg) 394 return false 395 } 396 397 var x uint32 398 for n > 0 { 399 d := uint32(digitVal(s.ch)) 400 if d >= base { 401 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 402 if s.ch < 0 { 403 msg = "escape sequence not terminated" 404 } 405 s.error(s.offset, msg) 406 return false 407 } 408 x = x*base + d 409 s.next() 410 n-- 411 } 412 413 if x > max || 0xD800 <= x && x < 0xE000 { 414 s.error(offs, "escape sequence is invalid Unicode code point") 415 return false 416 } 417 418 return true 419 } 420 421 func (s *Scanner) scanRune() string { 422 // '\'' opening already consumed 423 offs := s.offset - 1 424 425 valid := true 426 n := 0 427 for { 428 ch := s.ch 429 if ch == '\n' || ch < 0 { 430 // only report error if we don't have one already 431 if valid { 432 s.error(offs, "rune literal not terminated") 433 valid = false 434 } 435 break 436 } 437 s.next() 438 if ch == '\'' { 439 break 440 } 441 n++ 442 if ch == '\\' { 443 if !s.scanEscape('\'') { 444 valid = false 445 } 446 // continue to read to closing quote 447 } 448 } 449 450 if valid && n != 1 { 451 s.error(offs, "illegal rune literal") 452 } 453 454 return string(s.src[offs:s.offset]) 455 } 456 457 func (s *Scanner) scanString() string { 458 // '"' opening already consumed 459 offs := s.offset - 1 460 461 for { 462 ch := s.ch 463 if ch == '\n' || ch < 0 { 464 s.error(offs, 
"string literal not terminated") 465 break 466 } 467 s.next() 468 if ch == '"' { 469 break 470 } 471 if ch == '\\' { 472 s.scanEscape('"') 473 } 474 } 475 476 return string(s.src[offs:s.offset]) 477 } 478 479 func stripCR(b []byte) []byte { 480 c := make([]byte, len(b)) 481 i := 0 482 for _, ch := range b { 483 if ch != '\r' { 484 c[i] = ch 485 i++ 486 } 487 } 488 return c[:i] 489 } 490 491 func (s *Scanner) scanRawString() string { 492 // '`' opening already consumed 493 offs := s.offset - 1 494 495 hasCR := false 496 for { 497 ch := s.ch 498 if ch < 0 { 499 s.error(offs, "raw string literal not terminated") 500 break 501 } 502 s.next() 503 if ch == '`' { 504 break 505 } 506 if ch == '\r' { 507 hasCR = true 508 } 509 } 510 511 lit := s.src[offs:s.offset] 512 if hasCR { 513 lit = stripCR(lit) 514 } 515 516 return string(lit) 517 } 518 519 func (s *Scanner) skipWhitespace() { 520 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 521 s.next() 522 } 523 } 524 525 // Helper functions for scanning multi-byte tokens such as >> += >>= . 526 // Different routines recognize different length tok_i based on matches 527 // of ch_i. If a token ends in '=', the result is tok1 or tok3 528 // respectively. Otherwise, the result is tok0 if there was no other 529 // matching character, or tok2 if the matching character was ch2. 
530 531 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 532 if s.ch == '=' { 533 s.next() 534 return tok1 535 } 536 return tok0 537 } 538 539 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 540 if s.ch == '=' { 541 s.next() 542 return tok1 543 } 544 if s.ch == ch2 { 545 s.next() 546 return tok2 547 } 548 return tok0 549 } 550 551 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 552 if s.ch == '=' { 553 s.next() 554 return tok1 555 } 556 if s.ch == ch2 { 557 s.next() 558 if s.ch == '=' { 559 s.next() 560 return tok3 561 } 562 return tok2 563 } 564 return tok0 565 } 566 567 // Scan scans the next token and returns the token position, the token, 568 // and its literal string if applicable. The source end is indicated by 569 // token.EOF. 570 // 571 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 572 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 573 // has the corresponding value. 574 // 575 // If the returned token is a keyword, the literal string is the keyword. 576 // 577 // If the returned token is token.SEMICOLON, the corresponding 578 // literal string is ";" if the semicolon was present in the source, 579 // and "\n" if the semicolon was inserted because of a newline or 580 // at EOF. 581 // 582 // If the returned token is token.ILLEGAL, the literal string is the 583 // offending character. 584 // 585 // In all other cases, Scan returns an empty literal string. 586 // 587 // For more tolerant parsing, Scan will return a valid token if 588 // possible even if a syntax error was encountered. Thus, even 589 // if the resulting token sequence contains no illegal tokens, 590 // a client may not assume that no error occurred. Instead it 591 // must check the scanner's ErrorCount or the number of calls 592 // of the error handler, if there was one installed. 
593 // 594 // Scan adds line information to the file added to the file 595 // set with Init. Token positions are relative to that file 596 // and thus relative to the file set. 597 // 598 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 599 scanAgain: 600 s.skipWhitespace() 601 602 // current token start 603 pos = s.file.Pos(s.offset) 604 605 // determine token value 606 insertSemi := false 607 switch ch := s.ch; { 608 case isLetter(ch): 609 lit = s.scanIdentifier() 610 if len(lit) > 1 { 611 // keywords are longer than one letter - avoid lookup otherwise 612 tok = token.Lookup(lit) 613 switch tok { 614 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: 615 insertSemi = true 616 } 617 } else { 618 insertSemi = true 619 tok = token.IDENT 620 } 621 case '0' <= ch && ch <= '9': 622 insertSemi = true 623 tok, lit = s.scanNumber(false) 624 default: 625 s.next() // always make progress 626 switch ch { 627 case -1: 628 if s.insertSemi { 629 s.insertSemi = false // EOF consumed 630 return pos, token.SEMICOLON, "\n" 631 } 632 tok = token.EOF 633 case '\n': 634 // we only reach here if s.insertSemi was 635 // set in the first place and exited early 636 // from s.skipWhitespace() 637 s.insertSemi = false // newline consumed 638 return pos, token.SEMICOLON, "\n" 639 case '"': 640 insertSemi = true 641 tok = token.STRING 642 lit = s.scanString() 643 case '\'': 644 insertSemi = true 645 tok = token.CHAR 646 lit = s.scanRune() 647 case '`': 648 insertSemi = true 649 tok = token.STRING 650 lit = s.scanRawString() 651 case ':': 652 tok = s.switch2(token.COLON, token.DEFINE) 653 case '.': 654 if '0' <= s.ch && s.ch <= '9' { 655 insertSemi = true 656 tok, lit = s.scanNumber(true) 657 } else if s.ch == '.' { 658 s.next() 659 if s.ch == '.' 
{ 660 s.next() 661 tok = token.ELLIPSIS 662 } 663 } else { 664 tok = token.PERIOD 665 } 666 case ',': 667 tok = token.COMMA 668 case ';': 669 tok = token.SEMICOLON 670 lit = ";" 671 case '(': 672 tok = token.LPAREN 673 case ')': 674 insertSemi = true 675 tok = token.RPAREN 676 case '[': 677 tok = token.LBRACK 678 case ']': 679 insertSemi = true 680 tok = token.RBRACK 681 case '{': 682 tok = token.LBRACE 683 case '}': 684 insertSemi = true 685 tok = token.RBRACE 686 case '+': 687 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC) 688 if tok == token.INC { 689 insertSemi = true 690 } 691 case '-': 692 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC) 693 if tok == token.DEC { 694 insertSemi = true 695 } 696 case '*': 697 tok = s.switch2(token.MUL, token.MUL_ASSIGN) 698 case '/': 699 if s.ch == '/' || s.ch == '*' { 700 // comment 701 if s.insertSemi && s.findLineEnd() { 702 // reset position to the beginning of the comment 703 s.ch = '/' 704 s.offset = s.file.Offset(pos) 705 s.rdOffset = s.offset + 1 706 s.insertSemi = false // newline consumed 707 return pos, token.SEMICOLON, "\n" 708 } 709 comment := s.scanComment() 710 if s.mode&ScanComments == 0 { 711 // skip comment 712 s.insertSemi = false // newline consumed 713 goto scanAgain 714 } 715 tok = token.COMMENT 716 lit = comment 717 } else { 718 tok = s.switch2(token.QUO, token.QUO_ASSIGN) 719 } 720 case '%': 721 tok = s.switch2(token.REM, token.REM_ASSIGN) 722 case '^': 723 tok = s.switch2(token.XOR, token.XOR_ASSIGN) 724 case '<': 725 if s.ch == '-' { 726 s.next() 727 tok = token.ARROW 728 } else { 729 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) 730 } 731 case '>': 732 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN) 733 case '=': 734 tok = s.switch2(token.ASSIGN, token.EQL) 735 case '!': 736 tok = s.switch2(token.NOT, token.NEQ) 737 case '&': 738 if s.ch == '^' { 739 s.next() 740 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) 741 } else 
{ 742 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND) 743 } 744 case '|': 745 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) 746 default: 747 // next reports unexpected BOMs - don't repeat 748 if ch != bom { 749 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) 750 } 751 insertSemi = s.insertSemi // preserve insertSemi info 752 tok = token.ILLEGAL 753 lit = string(ch) 754 } 755 } 756 if s.mode&dontInsertSemis == 0 { 757 s.insertSemi = insertSemi 758 } 759 760 return 761 }