github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/go/scanner/scanner.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner implements a scanner for Go source text. 6 // It takes a []byte as source which can then be tokenized 7 // through repeated calls to the Scan method. 8 // 9 package scanner 10 11 import ( 12 "bytes" 13 "fmt" 14 "go/token" 15 "path/filepath" 16 "strconv" 17 "unicode" 18 "unicode/utf8" 19 ) 20 21 // An ErrorHandler may be provided to Scanner.Init. If a syntax error is 22 // encountered and a handler was installed, the handler is called with a 23 // position and an error message. The position points to the beginning of 24 // the offending token. 25 // 26 type ErrorHandler func(pos token.Position, msg string) 27 28 // A Scanner holds the scanner's internal state while processing 29 // a given text. It can be allocated as part of another data 30 // structure but must be initialized via Init before use. 31 // 32 type Scanner struct { 33 // immutable state 34 file *token.File // source file handle 35 dir string // directory portion of file.Name() 36 src []byte // source 37 err ErrorHandler // error reporting; or nil 38 mode Mode // scanning mode 39 40 // scanning state 41 ch rune // current character 42 offset int // character offset 43 rdOffset int // reading offset (position after current character) 44 lineOffset int // current line offset 45 insertSemi bool // insert a semicolon before next newline 46 47 // public state - ok to modify 48 ErrorCount int // number of errors encountered 49 } 50 51 const bom = 0xFEFF // byte order mark, only permitted as very first character 52 53 // Read the next Unicode char into s.ch. 54 // s.ch < 0 means end-of-file. 55 // 56 func (s *Scanner) next() { 57 if s.rdOffset < len(s.src) { 58 s.offset = s.rdOffset 59 if s.ch == '\n' { 60 s.lineOffset = s.offset 61 s.file.AddLine(s.offset) 62 } 63 r, w := rune(s.src[s.rdOffset]), 1 64 switch { 65 case r == 0: 66 s.error(s.offset, "illegal character NUL") 67 case r >= utf8.RuneSelf: 68 // not ASCII 69 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 70 if r == utf8.RuneError && w == 1 { 71 s.error(s.offset, "illegal UTF-8 encoding") 72 } else if r == bom && s.offset > 0 { 73 s.error(s.offset, "illegal byte order mark") 74 } 75 } 76 s.rdOffset += w 77 s.ch = r 78 } else { 79 s.offset = len(s.src) 80 if s.ch == '\n' { 81 s.lineOffset = s.offset 82 s.file.AddLine(s.offset) 83 } 84 s.ch = -1 // eof 85 } 86 } 87 88 // A mode value is a set of flags (or 0). 89 // They control scanner behavior. 90 // 91 type Mode uint 92 93 const ( 94 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 95 dontInsertSemis // do not automatically insert semicolons - for testing only 96 ) 97 98 // Init prepares the scanner s to tokenize the text src by setting the 99 // scanner at the beginning of src. The scanner uses the file set file 100 // for position information and it adds line information for each line. 101 // It is ok to re-use the same file when re-scanning the same file as 102 // line information which is already present is ignored. Init causes a 103 // panic if the file size does not match the src size. 104 // 105 // Calls to Scan will invoke the error handler err if they encounter a 106 // syntax error and err is not nil. Also, for each error encountered, 107 // the Scanner field ErrorCount is incremented by one. The mode parameter 108 // determines how comments are handled. 109 // 110 // Note that Init may call err if there is an error in the first character 111 // of the file. 112 // 113 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 114 // Explicitly initialize all fields since a scanner may be reused. 115 if file.Size() != len(src) { 116 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 117 } 118 s.file = file 119 s.dir, _ = filepath.Split(file.Name()) 120 s.src = src 121 s.err = err 122 s.mode = mode 123 124 s.ch = ' ' 125 s.offset = 0 126 s.rdOffset = 0 127 s.lineOffset = 0 128 s.insertSemi = false 129 s.ErrorCount = 0 130 131 s.next() 132 if s.ch == bom { 133 s.next() // ignore BOM at file beginning 134 } 135 } 136 137 func (s *Scanner) error(offs int, msg string) { 138 if s.err != nil { 139 s.err(s.file.Position(s.file.Pos(offs)), msg) 140 } 141 s.ErrorCount++ 142 } 143 144 var prefix = []byte("//line ") 145 146 func (s *Scanner) interpretLineComment(text []byte) { 147 if bytes.HasPrefix(text, prefix) { 148 // get filename and line number, if any 149 if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 150 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 151 // valid //line filename:line comment 152 filename := string(bytes.TrimSpace(text[len(prefix):i])) 153 if filename != "" { 154 filename = filepath.Clean(filename) 155 if !filepath.IsAbs(filename) { 156 // make filename relative to current directory 157 filename = filepath.Join(s.dir, filename) 158 } 159 } 160 // update scanner position 161 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 162 } 163 } 164 } 165 } 166 167 func (s *Scanner) scanComment() string { 168 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 169 offs := s.offset - 1 // position of initial '/' 170 hasCR := false 171 172 if s.ch == '/' { 173 //-style comment 174 s.next() 175 for s.ch != '\n' && s.ch >= 0 { 176 if s.ch == '\r' { 177 hasCR = true 178 } 179 s.next() 180 } 181 if offs == s.lineOffset { 182 // comment starts at the beginning of the current line 183 s.interpretLineComment(s.src[offs:s.offset]) 184 } 185 goto exit 186 } 187 188 /*-style comment */ 189 s.next() 190 for s.ch >= 0 { 191 ch := s.ch 192 if ch == '\r' { 193 hasCR = true 194 } 195 s.next() 196 if ch == '*' && s.ch == '/' { 197 s.next() 198 goto exit 199 } 200 } 201 202 s.error(offs, "comment not terminated") 203 204 exit: 205 lit := s.src[offs:s.offset] 206 if hasCR { 207 lit = stripCR(lit) 208 } 209 210 return string(lit) 211 } 212 213 func (s *Scanner) findLineEnd() bool { 214 // initial '/' already consumed 215 216 defer func(offs int) { 217 // reset scanner state to where it was upon calling findLineEnd 218 s.ch = '/' 219 s.offset = offs 220 s.rdOffset = offs + 1 221 s.next() // consume initial '/' again 222 }(s.offset - 1) 223 224 // read ahead until a newline, EOF, or non-comment token is found 225 for s.ch == '/' || s.ch == '*' { 226 if s.ch == '/' { 227 //-style comment always contains a newline 228 return true 229 } 230 /*-style comment: look for newline */ 231 s.next() 232 for s.ch >= 0 { 233 ch := s.ch 234 if ch == '\n' { 235 return true 236 } 237 s.next() 238 if ch == '*' && s.ch == '/' { 239 s.next() 240 break 241 } 242 } 243 s.skipWhitespace() // s.insertSemi is set 244 if s.ch < 0 || s.ch == '\n' { 245 return true 246 } 247 if s.ch != '/' { 248 // non-comment token 249 return false 250 } 251 s.next() // consume '/' 252 } 253 254 return false 255 } 256 257 func isLetter(ch rune) bool { 258 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 259 } 260 261 func isDigit(ch rune) bool { 262 return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 263 } 264 265 func (s *Scanner) scanIdentifier() string { 266 offs := s.offset 267 for isLetter(s.ch) || isDigit(s.ch) { 268 s.next() 269 } 270 return string(s.src[offs:s.offset]) 271 } 272 273 func digitVal(ch rune) int { 274 switch { 275 case '0' <= ch && ch <= '9': 276 return int(ch - '0') 277 case 'a' <= ch && ch <= 'f': 278 return int(ch - 'a' + 10) 279 case 'A' <= ch && ch <= 'F': 280 return int(ch - 'A' + 10) 281 } 282 return 16 // larger than any legal digit val 283 } 284 285 func (s *Scanner) scanMantissa(base int) { 286 for digitVal(s.ch) < base { 287 s.next() 288 } 289 } 290 291 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 292 // digitVal(s.ch) < 10 293 offs := s.offset 294 tok := token.INT 295 296 if seenDecimalPoint { 297 offs-- 298 tok = token.FLOAT 299 s.scanMantissa(10) 300 goto exponent 301 } 302 303 if s.ch == '0' { 304 // int or float 305 offs := s.offset 306 s.next() 307 if s.ch == 'x' || s.ch == 'X' { 308 // hexadecimal int 309 s.next() 310 s.scanMantissa(16) 311 if s.offset-offs <= 2 { 312 // only scanned "0x" or "0X" 313 s.error(offs, "illegal hexadecimal number") 314 } 315 } else { 316 // octal int or float 317 seenDecimalDigit := false 318 s.scanMantissa(8) 319 if s.ch == '8' || s.ch == '9' { 320 // illegal octal int or float 321 seenDecimalDigit = true 322 s.scanMantissa(10) 323 } 324 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' { 325 goto fraction 326 } 327 // octal int 328 if seenDecimalDigit { 329 s.error(offs, "illegal octal number") 330 } 331 } 332 goto exit 333 } 334 335 // decimal int or float 336 s.scanMantissa(10) 337 338 fraction: 339 if s.ch == '.' { 340 tok = token.FLOAT 341 s.next() 342 s.scanMantissa(10) 343 } 344 345 exponent: 346 if s.ch == 'e' || s.ch == 'E' { 347 tok = token.FLOAT 348 s.next() 349 if s.ch == '-' || s.ch == '+' { 350 s.next() 351 } 352 if digitVal(s.ch) < 10 { 353 s.scanMantissa(10) 354 } else { 355 s.error(offs, "illegal floating-point exponent") 356 } 357 } 358 359 if s.ch == 'i' { 360 tok = token.IMAG 361 s.next() 362 } 363 364 exit: 365 return tok, string(s.src[offs:s.offset]) 366 } 367 368 // scanEscape parses an escape sequence where rune is the accepted 369 // escaped quote. In case of a syntax error, it stops at the offending 370 // character (without consuming it) and returns false. Otherwise 371 // it returns true. 372 func (s *Scanner) scanEscape(quote rune) bool { 373 offs := s.offset 374 375 var n int 376 var base, max uint32 377 switch s.ch { 378 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 379 s.next() 380 return true 381 case '0', '1', '2', '3', '4', '5', '6', '7': 382 n, base, max = 3, 8, 255 383 case 'x': 384 s.next() 385 n, base, max = 2, 16, 255 386 case 'u': 387 s.next() 388 n, base, max = 4, 16, unicode.MaxRune 389 case 'U': 390 s.next() 391 n, base, max = 8, 16, unicode.MaxRune 392 default: 393 msg := "unknown escape sequence" 394 if s.ch < 0 { 395 msg = "escape sequence not terminated" 396 } 397 s.error(offs, msg) 398 return false 399 } 400 401 var x uint32 402 for n > 0 { 403 d := uint32(digitVal(s.ch)) 404 if d >= base { 405 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 406 if s.ch < 0 { 407 msg = "escape sequence not terminated" 408 } 409 s.error(s.offset, msg) 410 return false 411 } 412 x = x*base + d 413 s.next() 414 n-- 415 } 416 417 if x > max || 0xD800 <= x && x < 0xE000 { 418 s.error(offs, "escape sequence is invalid Unicode code point") 419 return false 420 } 421 422 return true 423 } 424 425 func (s *Scanner) scanRune() string { 426 // '\'' opening already consumed 427 offs := s.offset - 1 428 429 valid := true 430 n := 0 431 for { 432 ch := s.ch 433 if ch == '\n' || ch < 0 { 434 // only report error if we don't have one already 435 if valid { 436 s.error(offs, "rune literal not terminated") 437 valid = false 438 } 439 break 440 } 441 s.next() 442 if ch == '\'' { 443 break 444 } 445 n++ 446 if ch == '\\' { 447 if !s.scanEscape('\'') { 448 valid = false 449 } 450 // continue to read to closing quote 451 } 452 } 453 454 if valid && n != 1 { 455 s.error(offs, "illegal rune literal") 456 } 457 458 return string(s.src[offs:s.offset]) 459 } 460 461 func (s *Scanner) scanString() string { 462 // '"' opening already consumed 463 offs := s.offset - 1 464 465 for { 466 ch := s.ch 467 if ch == '\n' || ch < 0 { 468 s.error(offs, "string literal not terminated") 469 break 470 } 471 s.next() 472 if ch == '"' { 473 break 474 } 475 if ch == '\\' { 476 s.scanEscape('"') 477 } 478 } 479 480 return string(s.src[offs:s.offset]) 481 } 482 483 func stripCR(b []byte) []byte { 484 c := make([]byte, len(b)) 485 i := 0 486 for _, ch := range b { 487 if ch != '\r' { 488 c[i] = ch 489 i++ 490 } 491 } 492 return c[:i] 493 } 494 495 func (s *Scanner) scanRawString() string { 496 // '`' opening already consumed 497 offs := s.offset - 1 498 499 hasCR := false 500 for { 501 ch := s.ch 502 if ch < 0 { 503 s.error(offs, "raw string literal not terminated") 504 break 505 } 506 s.next() 507 if ch == '`' { 508 break 509 } 510 if ch == '\r' { 511 hasCR = true 512 } 513 } 514 515 lit := s.src[offs:s.offset] 516 if hasCR { 517 lit = stripCR(lit) 518 } 519 520 return string(lit) 521 } 522 523 func (s *Scanner) skipWhitespace() { 524 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 525 s.next() 526 } 527 } 528 529 // Helper functions for scanning multi-byte tokens such as >> += >>= . 530 // Different routines recognize different length tok_i based on matches 531 // of ch_i. If a token ends in '=', the result is tok1 or tok3 532 // respectively. Otherwise, the result is tok0 if there was no other 533 // matching character, or tok2 if the matching character was ch2. 534 535 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 536 if s.ch == '=' { 537 s.next() 538 return tok1 539 } 540 return tok0 541 } 542 543 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 544 if s.ch == '=' { 545 s.next() 546 return tok1 547 } 548 if s.ch == ch2 { 549 s.next() 550 return tok2 551 } 552 return tok0 553 } 554 555 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 556 if s.ch == '=' { 557 s.next() 558 return tok1 559 } 560 if s.ch == ch2 { 561 s.next() 562 if s.ch == '=' { 563 s.next() 564 return tok3 565 } 566 return tok2 567 } 568 return tok0 569 } 570 571 // Scan scans the next token and returns the token position, the token, 572 // and its literal string if applicable. The source end is indicated by 573 // token.EOF. 574 // 575 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 576 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 577 // has the corresponding value. 578 // 579 // If the returned token is a keyword, the literal string is the keyword. 580 // 581 // If the returned token is token.SEMICOLON, the corresponding 582 // literal string is ";" if the semicolon was present in the source, 583 // and "\n" if the semicolon was inserted because of a newline or 584 // at EOF. 585 // 586 // If the returned token is token.ILLEGAL, the literal string is the 587 // offending character. 588 // 589 // In all other cases, Scan returns an empty literal string. 590 // 591 // For more tolerant parsing, Scan will return a valid token if 592 // possible even if a syntax error was encountered. Thus, even 593 // if the resulting token sequence contains no illegal tokens, 594 // a client may not assume that no error occurred. Instead it 595 // must check the scanner's ErrorCount or the number of calls 596 // of the error handler, if there was one installed. 597 // 598 // Scan adds line information to the file added to the file 599 // set with Init. Token positions are relative to that file 600 // and thus relative to the file set. 601 // 602 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 603 scanAgain: 604 s.skipWhitespace() 605 606 // current token start 607 pos = s.file.Pos(s.offset) 608 609 // determine token value 610 insertSemi := false 611 switch ch := s.ch; { 612 case isLetter(ch): 613 lit = s.scanIdentifier() 614 if len(lit) > 1 { 615 // keywords are longer than one letter - avoid lookup otherwise 616 tok = token.Lookup(lit) 617 switch tok { 618 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: 619 insertSemi = true 620 } 621 } else { 622 insertSemi = true 623 tok = token.IDENT 624 } 625 case '0' <= ch && ch <= '9': 626 insertSemi = true 627 tok, lit = s.scanNumber(false) 628 default: 629 s.next() // always make progress 630 switch ch { 631 case -1: 632 if s.insertSemi { 633 s.insertSemi = false // EOF consumed 634 return pos, token.SEMICOLON, "\n" 635 } 636 tok = token.EOF 637 case '\n': 638 // we only reach here if s.insertSemi was 639 // set in the first place and exited early 640 // from s.skipWhitespace() 641 s.insertSemi = false // newline consumed 642 return pos, token.SEMICOLON, "\n" 643 case '"': 644 insertSemi = true 645 tok = token.STRING 646 lit = s.scanString() 647 case '\'': 648 insertSemi = true 649 tok = token.CHAR 650 lit = s.scanRune() 651 case '`': 652 insertSemi = true 653 tok = token.STRING 654 lit = s.scanRawString() 655 case ':': 656 tok = s.switch2(token.COLON, token.DEFINE) 657 case '.': 658 if '0' <= s.ch && s.ch <= '9' { 659 insertSemi = true 660 tok, lit = s.scanNumber(true) 661 } else if s.ch == '.' { 662 s.next() 663 if s.ch == '.' { 664 s.next() 665 tok = token.ELLIPSIS 666 } 667 } else { 668 tok = token.PERIOD 669 } 670 case ',': 671 tok = token.COMMA 672 case ';': 673 tok = token.SEMICOLON 674 lit = ";" 675 case '(': 676 tok = token.LPAREN 677 case ')': 678 insertSemi = true 679 tok = token.RPAREN 680 case '[': 681 tok = token.LBRACK 682 case ']': 683 insertSemi = true 684 tok = token.RBRACK 685 case '{': 686 tok = token.LBRACE 687 case '}': 688 insertSemi = true 689 tok = token.RBRACE 690 case '+': 691 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC) 692 if tok == token.INC { 693 insertSemi = true 694 } 695 case '-': 696 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC) 697 if tok == token.DEC { 698 insertSemi = true 699 } 700 case '*': 701 tok = s.switch2(token.MUL, token.MUL_ASSIGN) 702 case '/': 703 if s.ch == '/' || s.ch == '*' { 704 // comment 705 if s.insertSemi && s.findLineEnd() { 706 // reset position to the beginning of the comment 707 s.ch = '/' 708 s.offset = s.file.Offset(pos) 709 s.rdOffset = s.offset + 1 710 s.insertSemi = false // newline consumed 711 return pos, token.SEMICOLON, "\n" 712 } 713 comment := s.scanComment() 714 if s.mode&ScanComments == 0 { 715 // skip comment 716 s.insertSemi = false // newline consumed 717 goto scanAgain 718 } 719 tok = token.COMMENT 720 lit = comment 721 } else { 722 tok = s.switch2(token.QUO, token.QUO_ASSIGN) 723 } 724 case '%': 725 tok = s.switch2(token.REM, token.REM_ASSIGN) 726 case '^': 727 tok = s.switch2(token.XOR, token.XOR_ASSIGN) 728 case '<': 729 if s.ch == '-' { 730 s.next() 731 tok = token.ARROW 732 } else { 733 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) 734 } 735 case '>': 736 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN) 737 case '=': 738 tok = s.switch2(token.ASSIGN, token.EQL) 739 case '!': 740 tok = s.switch2(token.NOT, token.NEQ) 741 case '&': 742 if s.ch == '^' { 743 s.next() 744 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) 745 } else { 746 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND) 747 } 748 case '|': 749 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) 750 default: 751 // next reports unexpected BOMs - don't repeat 752 if ch != bom { 753 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) 754 } 755 insertSemi = s.insertSemi // preserve insertSemi info 756 tok = token.ILLEGAL 757 lit = string(ch) 758 } 759 } 760 if s.mode&dontInsertSemis == 0 { 761 s.insertSemi = insertSemi 762 } 763 764 return 765 }