github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/go/scanner/scanner.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
//
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character
	offset     int  // character offset
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const bom = 0xFEFF // byte order mark, only permitted as very first character

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
func (s *Scanner) next() {
	if s.rdOffset < len(s.src) {
		s.offset = s.rdOffset
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.rdOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= 0x80:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
			if r == utf8.RuneError && w == 1 {
				s.error(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.rdOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = -1 // eof
	}
}

// A mode value is a set of flags (or 0).
// They control scanner behavior.
//
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)

// Init prepares the scanner s to tokenize the text src by setting the
// scanner at the beginning of src. The scanner uses the file set file
// for position information and it adds line information for each line.
// It is ok to re-use the same file when re-scanning the same file as
// line information which is already present is ignored. Init causes a
// panic if the file size does not match the src size.
//
// Calls to Scan will invoke the error handler err if they encounter a
// syntax error and err is not nil. Also, for each error encountered,
// the Scanner field ErrorCount is incremented by one. The mode parameter
// determines how comments are handled.
//
// Note that Init may call err if there is an error in the first character
// of the file.
//
func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
	// Explicitly initialize all fields since a scanner may be reused.
	if file.Size() != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
	}
	s.file = file
	s.dir, _ = filepath.Split(file.Name())
	s.src = src
	s.err = err
	s.mode = mode

	s.ch = ' '
	s.offset = 0
	s.rdOffset = 0
	s.lineOffset = 0
	s.insertSemi = false
	s.ErrorCount = 0

	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}
}
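
// The sketch below is illustrative only and not part of the original file.
// It shows how a client might set up a Scanner as described by the Init
// documentation above: register the source with a token.FileSet, install an
// error handler, and pass a mode. The helper name exampleInit, the file name
// "src.go", and the choice of ScanComments are assumptions for this example.
func exampleInit(src []byte) *Scanner {
	fset := token.NewFileSet()                            // positions are resolved through this file set
	file := fset.AddFile("src.go", fset.Base(), len(src)) // file size must match len(src) or Init panics

	// The handler is called once per syntax error with the position of the
	// offending token; independently, s.ErrorCount is incremented.
	eh := func(pos token.Position, msg string) {
		fmt.Printf("%s\t%s\n", pos, msg)
	}

	var s Scanner
	s.Init(file, src, eh, ScanComments)
	return &s
}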

func (s *Scanner) error(offs int, msg string) {
	if s.err != nil {
		s.err(s.file.Position(s.file.Pos(offs)), msg)
	}
	s.ErrorCount++
}

var prefix = []byte("//line ")

func (s *Scanner) interpretLineComment(text []byte) {
	if bytes.HasPrefix(text, prefix) {
		// get filename and line number, if any
		if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
			if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
				// valid //line filename:line comment;
				filename := filepath.Clean(string(text[len(prefix):i]))
				if !filepath.IsAbs(filename) {
					// make filename relative to current directory
					filename = filepath.Join(s.dir, filename)
				}
				// update scanner position
				s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
			}
		}
	}
}

func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	hasCR := false

	if s.ch == '/' {
		//-style comment
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				hasCR = true
			}
			s.next()
		}
		if offs == s.lineOffset {
			// comment starts at the beginning of the current line
			s.interpretLineComment(s.src[offs:s.offset])
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			hasCR = true
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			goto exit
		}
	}

	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]
	if hasCR {
		lit = stripCR(lit)
	}

	return string(lit)
}

func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	return false
}

func isLetter(ch rune) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
}

func isDigit(ch rune) bool {
	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
}

func (s *Scanner) scanIdentifier() string {
	offs := s.offset
	for isLetter(s.ch) || isDigit(s.ch) {
		s.next()
	}
	return string(s.src[offs:s.offset])
}

func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch - 'a' + 10)
	case 'A' <= ch && ch <= 'F':
		return int(ch - 'A' + 10)
	}
	return 16 // larger than any legal digit val
}

func (s *Scanner) scanMantissa(base int) {
	for digitVal(s.ch) < base {
		s.next()
	}
}

func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
	// digitVal(s.ch) < 10
	offs := s.offset
	tok := token.INT

	if seenDecimalPoint {
		offs--
		tok = token.FLOAT
		s.scanMantissa(10)
		goto exponent
	}

	if s.ch == '0' {
		// int or float
		offs := s.offset
		s.next()
		if s.ch == 'x' || s.ch == 'X' {
			// hexadecimal int
			s.next()
			s.scanMantissa(16)
			if s.offset-offs <= 2 {
				// only scanned "0x" or "0X"
				s.error(offs, "illegal hexadecimal number")
			}
		} else {
			// octal int or float
			seenDecimalDigit := false
			s.scanMantissa(8)
			if s.ch == '8' || s.ch == '9' {
				// illegal octal int or float
				seenDecimalDigit = true
				s.scanMantissa(10)
			}
			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
				goto fraction
			}
			// octal int
			if seenDecimalDigit {
				s.error(offs, "illegal octal number")
			}
		}
		goto exit
	}

	// decimal int or float
	s.scanMantissa(10)

fraction:
	if s.ch == '.' {
		tok = token.FLOAT
		s.next()
		s.scanMantissa(10)
	}

exponent:
	if s.ch == 'e' || s.ch == 'E' {
		tok = token.FLOAT
		s.next()
		if s.ch == '-' || s.ch == '+' {
			s.next()
		}
		s.scanMantissa(10)
	}

	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

exit:
	return tok, string(s.src[offs:s.offset])
}

func (s *Scanner) scanEscape(quote rune) {
	offs := s.offset

	var i, base, max uint32
	switch s.ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		s.next()
		return
	case '0', '1', '2', '3', '4', '5', '6', '7':
		i, base, max = 3, 8, 255
	case 'x':
		s.next()
		i, base, max = 2, 16, 255
	case 'u':
		s.next()
		i, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		i, base, max = 8, 16, unicode.MaxRune
	default:
		s.next() // always make progress
		s.error(offs, "unknown escape sequence")
		return
	}

	var x uint32
	for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
		d := uint32(digitVal(s.ch))
		if d >= base {
			s.error(s.offset, "illegal character in escape sequence")
			break
		}
		x = x*base + d
		s.next()
	}
	// in case of an error, consume remaining chars
	for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
		s.next()
	}
	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
	}
}

func (s *Scanner) scanChar() string {
	// '\'' opening already consumed
	offs := s.offset - 1

	n := 0
	for s.ch != '\'' {
		ch := s.ch
		n++
		s.next()
		if ch == '\n' || ch < 0 {
			s.error(offs, "character literal not terminated")
			n = 1
			break
		}
		if ch == '\\' {
			s.scanEscape('\'')
		}
	}

	s.next()

	if n != 1 {
		s.error(offs, "illegal character literal")
	}

	return string(s.src[offs:s.offset])
}

func (s *Scanner) scanString() string {
	// '"' opening already consumed
	offs := s.offset - 1

	for s.ch != '"' {
		ch := s.ch
		s.next()
		if ch == '\n' || ch < 0 {
			s.error(offs, "string not terminated")
			break
		}
		if ch == '\\' {
			s.scanEscape('"')
		}
	}

	s.next()

	return string(s.src[offs:s.offset])
}

func stripCR(b []byte) []byte {
	c := make([]byte, len(b))
	i := 0
	for _, ch := range b {
		if ch != '\r' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}

func (s *Scanner) scanRawString() string {
	// '`' opening already consumed
	offs := s.offset - 1

	hasCR := false
	for s.ch != '`' {
		ch := s.ch
		s.next()
		if ch == '\r' {
			hasCR = true
		}
		if ch < 0 {
			s.error(offs, "string not terminated")
			break
		}
	}

	s.next()

	lit := s.src[offs:s.offset]
	if hasCR {
		lit = stripCR(lit)
	}

	return string(lit)
}

func (s *Scanner) skipWhitespace() {
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
		s.next()
	}
}

// Helper functions for scanning multi-byte tokens such as >> += >>= .
// Different routines recognize different length tok_i based on matches
// of ch_i. If a token ends in '=', the result is tok1 or tok3
// respectively. Otherwise, the result is tok0 if there was no other
// matching character, or tok2 if the matching character was ch2.

func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}

func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		return tok2
	}
	return tok0
}

func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		if s.ch == '=' {
			s.next()
			return tok3
		}
		return tok2
	}
	return tok0
}

// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
//
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case '0' <= ch && ch <= '9':
		insertSemi = true
		tok, lit = s.scanNumber(false)
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanChar()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			if '0' <= s.ch && s.ch <= '9' {
				insertSemi = true
				tok, lit = s.scanNumber(true)
			} else if s.ch == '.' {
				s.next()
				if s.ch == '.' {
					s.next()
					tok = token.ELLIPSIS
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				lit = s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
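
// The sketch below is illustrative only and not part of the original file.
// It drives Scan in the usual loop until token.EOF, printing each token with
// its resolved position, as described by the Scan documentation above. The
// helper name tokenize and the file name "example.go" are assumptions for
// this example. Note that automatically inserted semicolons appear as
// token.SEMICOLON with literal "\n".
func tokenize(src []byte) {
	fset := token.NewFileSet()
	file := fset.AddFile("example.go", fset.Base(), len(src))

	var s Scanner
	s.Init(file, src, nil, ScanComments) // nil error handler; errors are still counted in s.ErrorCount

	for {
		pos, tok, lit := s.Scan()
		if tok == token.EOF {
			break
		}
		fmt.Printf("%s\t%s\t%q\n", fset.Position(pos), tok, lit)
	}
}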