// Source: github.com/bir3/gocompiler@v0.3.205/src/go/scanner/scanner.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
package scanner

import (
	"bytes"
	"fmt"
	"github.com/bir3/gocompiler/src/go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (set once by Init)
	file *token.File  // source file handle; receives line information while scanning
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune      // current character; eof (< 0) at end of source
	offset     int       // character offset: byte index of ch in src
	rdOffset   int       // reading offset (position after current character)
	lineOffset int       // current line offset: offset of the first character of the current line
	insertSemi bool      // insert a semicolon before next newline
	nlPos      token.Pos // position of newline in preceding comment; when valid, Scan returns a ';' there

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; stored in Scanner.ch once the source is exhausted
)

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
56 // 57 // For optimization, there is some overlap between this method and 58 // s.scanIdentifier. 59 func (s *Scanner) next() { 60 if s.rdOffset < len(s.src) { 61 s.offset = s.rdOffset 62 if s.ch == '\n' { 63 s.lineOffset = s.offset 64 s.file.AddLine(s.offset) 65 } 66 r, w := rune(s.src[s.rdOffset]), 1 67 switch { 68 case r == 0: 69 s.error(s.offset, "illegal character NUL") 70 case r >= utf8.RuneSelf: 71 // not ASCII 72 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 73 if r == utf8.RuneError && w == 1 { 74 s.error(s.offset, "illegal UTF-8 encoding") 75 } else if r == bom && s.offset > 0 { 76 s.error(s.offset, "illegal byte order mark") 77 } 78 } 79 s.rdOffset += w 80 s.ch = r 81 } else { 82 s.offset = len(s.src) 83 if s.ch == '\n' { 84 s.lineOffset = s.offset 85 s.file.AddLine(s.offset) 86 } 87 s.ch = eof 88 } 89 } 90 91 // peek returns the byte following the most recently read character without 92 // advancing the scanner. If the scanner is at EOF, peek returns 0. 93 func (s *Scanner) peek() byte { 94 if s.rdOffset < len(s.src) { 95 return s.src[s.rdOffset] 96 } 97 return 0 98 } 99 100 // A mode value is a set of flags (or 0). 101 // They control scanner behavior. 102 type Mode uint 103 104 const ( 105 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 106 dontInsertSemis // do not automatically insert semicolons - for testing only 107 ) 108 109 // Init prepares the scanner s to tokenize the text src by setting the 110 // scanner at the beginning of src. The scanner uses the file set file 111 // for position information and it adds line information for each line. 112 // It is ok to re-use the same file when re-scanning the same file as 113 // line information which is already present is ignored. Init causes a 114 // panic if the file size does not match the src size. 115 // 116 // Calls to Scan will invoke the error handler err if they encounter a 117 // syntax error and err is not nil. 
Also, for each error encountered, 118 // the Scanner field ErrorCount is incremented by one. The mode parameter 119 // determines how comments are handled. 120 // 121 // Note that Init may call err if there is an error in the first character 122 // of the file. 123 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 124 // Explicitly initialize all fields since a scanner may be reused. 125 if file.Size() != len(src) { 126 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 127 } 128 s.file = file 129 s.dir, _ = filepath.Split(file.Name()) 130 s.src = src 131 s.err = err 132 s.mode = mode 133 134 s.ch = ' ' 135 s.offset = 0 136 s.rdOffset = 0 137 s.lineOffset = 0 138 s.insertSemi = false 139 s.ErrorCount = 0 140 141 s.next() 142 if s.ch == bom { 143 s.next() // ignore BOM at file beginning 144 } 145 } 146 147 func (s *Scanner) error(offs int, msg string) { 148 if s.err != nil { 149 s.err(s.file.Position(s.file.Pos(offs)), msg) 150 } 151 s.ErrorCount++ 152 } 153 154 func (s *Scanner) errorf(offs int, format string, args ...any) { 155 s.error(offs, fmt.Sprintf(format, args...)) 156 } 157 158 // scanComment returns the text of the comment and (if nonzero) 159 // the offset of the first newline within it, which implies a 160 // /*...*/ comment. 
161 func (s *Scanner) scanComment() (string, int) { 162 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 163 offs := s.offset - 1 // position of initial '/' 164 next := -1 // position immediately following the comment; < 0 means invalid comment 165 numCR := 0 166 nlOffset := 0 // offset of first newline within /*...*/ comment 167 168 if s.ch == '/' { 169 //-style comment 170 // (the final '\n' is not considered part of the comment) 171 s.next() 172 for s.ch != '\n' && s.ch >= 0 { 173 if s.ch == '\r' { 174 numCR++ 175 } 176 s.next() 177 } 178 // if we are at '\n', the position following the comment is afterwards 179 next = s.offset 180 if s.ch == '\n' { 181 next++ 182 } 183 goto exit 184 } 185 186 /*-style comment */ 187 s.next() 188 for s.ch >= 0 { 189 ch := s.ch 190 if ch == '\r' { 191 numCR++ 192 } else if ch == '\n' && nlOffset == 0 { 193 nlOffset = s.offset 194 } 195 s.next() 196 if ch == '*' && s.ch == '/' { 197 s.next() 198 next = s.offset 199 goto exit 200 } 201 } 202 203 s.error(offs, "comment not terminated") 204 205 exit: 206 lit := s.src[offs:s.offset] 207 208 // On Windows, a (//-comment) line may end in "\r\n". 209 // Remove the final '\r' before analyzing the text for 210 // line directives (matching the compiler). Remove any 211 // other '\r' afterwards (matching the pre-existing be- 212 // havior of the scanner). 
213 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' { 214 lit = lit[:len(lit)-1] 215 numCR-- 216 } 217 218 // interpret line directives 219 // (//line directives must start at the beginning of the current line) 220 if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) { 221 s.updateLineInfo(next, offs, lit) 222 } 223 224 if numCR > 0 { 225 lit = stripCR(lit, lit[1] == '*') 226 } 227 228 return string(lit), nlOffset 229 } 230 231 var prefix = []byte("line ") 232 233 // updateLineInfo parses the incoming comment text at offset offs 234 // as a line directive. If successful, it updates the line info table 235 // for the position next per the line directive. 236 func (s *Scanner) updateLineInfo(next, offs int, text []byte) { 237 // extract comment text 238 if text[1] == '*' { 239 text = text[:len(text)-2] // lop off trailing "*/" 240 } 241 text = text[7:] // lop off leading "//line " or "/*line " 242 offs += 7 243 244 i, n, ok := trailingDigits(text) 245 if i == 0 { 246 return // ignore (not a line directive) 247 } 248 // i > 0 249 250 if !ok { 251 // text has a suffix :xxx but xxx is not a number 252 s.error(offs+i, "invalid line number: "+string(text[i:])) 253 return 254 } 255 256 // Put a cap on the maximum size of line and column numbers. 257 // 30 bits allows for some additional space before wrapping an int32. 
258 const maxLineCol = 1<<30 - 1 259 var line, col int 260 i2, n2, ok2 := trailingDigits(text[:i-1]) 261 if ok2 { 262 //line filename:line:col 263 i, i2 = i2, i 264 line, col = n2, n 265 if col == 0 || col > maxLineCol { 266 s.error(offs+i2, "invalid column number: "+string(text[i2:])) 267 return 268 } 269 text = text[:i2-1] // lop off ":col" 270 } else { 271 //line filename:line 272 line = n 273 } 274 275 if line == 0 || line > maxLineCol { 276 s.error(offs+i, "invalid line number: "+string(text[i:])) 277 return 278 } 279 280 // If we have a column (//line filename:line:col form), 281 // an empty filename means to use the previous filename. 282 filename := string(text[:i-1]) // lop off ":line", and trim white space 283 if filename == "" && ok2 { 284 filename = s.file.Position(s.file.Pos(offs)).Filename 285 } else if filename != "" { 286 // Put a relative filename in the current directory. 287 // This is for compatibility with earlier releases. 288 // See issue 26671. 289 filename = filepath.Clean(filename) 290 if !filepath.IsAbs(filename) { 291 filename = filepath.Join(s.dir, filename) 292 } 293 } 294 295 s.file.AddLineColumnInfo(next, filename, line, col) 296 } 297 298 func trailingDigits(text []byte) (int, int, bool) { 299 i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':') 300 if i < 0 { 301 return 0, 0, false // no ":" 302 } 303 // i >= 0 304 n, err := strconv.ParseUint(string(text[i+1:]), 10, 0) 305 return i + 1, int(n), err == nil 306 } 307 308 func isLetter(ch rune) bool { 309 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 310 } 311 312 func isDigit(ch rune) bool { 313 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 314 } 315 316 // scanIdentifier reads the string of valid identifier characters at s.offset. 317 // It must only be called when s.ch is known to be a valid letter. 
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		// b is the first byte that is not an ASCII identifier character;
		// move the read offset up to it.
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// b is either NUL or the start of a non-ASCII sequence.
		//
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier runs to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}

// digitVal returns the numeric value of ch interpreted as a hexadecimal
// digit (0-15), or 16 if ch is not such a digit.
func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= lower(ch) && lower(ch) <= 'f':
		return int(lower(ch) - 'a' + 10)
	}
	return 16 // larger than any legal digit val
}

func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }

// digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records
// the offset (relative to the source start) of a digit >= base
// in *invalid, if *invalid < 0.
// digits returns a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set).
386 func (s *Scanner) digits(base int, invalid *int) (digsep int) { 387 if base <= 10 { 388 max := rune('0' + base) 389 for isDecimal(s.ch) || s.ch == '_' { 390 ds := 1 391 if s.ch == '_' { 392 ds = 2 393 } else if s.ch >= max && *invalid < 0 { 394 *invalid = s.offset // record invalid rune offset 395 } 396 digsep |= ds 397 s.next() 398 } 399 } else { 400 for isHex(s.ch) || s.ch == '_' { 401 ds := 1 402 if s.ch == '_' { 403 ds = 2 404 } 405 digsep |= ds 406 s.next() 407 } 408 } 409 return 410 } 411 412 func (s *Scanner) scanNumber() (token.Token, string) { 413 offs := s.offset 414 tok := token.ILLEGAL 415 416 base := 10 // number base 417 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' 418 digsep := 0 // bit 0: digit present, bit 1: '_' present 419 invalid := -1 // index of invalid digit in literal, or < 0 420 421 // integer part 422 if s.ch != '.' { 423 tok = token.INT 424 if s.ch == '0' { 425 s.next() 426 switch lower(s.ch) { 427 case 'x': 428 s.next() 429 base, prefix = 16, 'x' 430 case 'o': 431 s.next() 432 base, prefix = 8, 'o' 433 case 'b': 434 s.next() 435 base, prefix = 2, 'b' 436 default: 437 base, prefix = 8, '0' 438 digsep = 1 // leading 0 439 } 440 } 441 digsep |= s.digits(base, &invalid) 442 } 443 444 // fractional part 445 if s.ch == '.' 
{ 446 tok = token.FLOAT 447 if prefix == 'o' || prefix == 'b' { 448 s.error(s.offset, "invalid radix point in "+litname(prefix)) 449 } 450 s.next() 451 digsep |= s.digits(base, &invalid) 452 } 453 454 if digsep&1 == 0 { 455 s.error(s.offset, litname(prefix)+" has no digits") 456 } 457 458 // exponent 459 if e := lower(s.ch); e == 'e' || e == 'p' { 460 switch { 461 case e == 'e' && prefix != 0 && prefix != '0': 462 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch) 463 case e == 'p' && prefix != 'x': 464 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch) 465 } 466 s.next() 467 tok = token.FLOAT 468 if s.ch == '+' || s.ch == '-' { 469 s.next() 470 } 471 ds := s.digits(10, nil) 472 digsep |= ds 473 if ds&1 == 0 { 474 s.error(s.offset, "exponent has no digits") 475 } 476 } else if prefix == 'x' && tok == token.FLOAT { 477 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent") 478 } 479 480 // suffix 'i' 481 if s.ch == 'i' { 482 tok = token.IMAG 483 s.next() 484 } 485 486 lit := string(s.src[offs:s.offset]) 487 if tok == token.INT && invalid >= 0 { 488 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix)) 489 } 490 if digsep&2 != 0 { 491 if i := invalidSep(lit); i >= 0 { 492 s.error(offs+i, "'_' must separate successive digits") 493 } 494 } 495 496 return tok, lit 497 } 498 499 func litname(prefix rune) string { 500 switch prefix { 501 case 'x': 502 return "hexadecimal literal" 503 case 'o', '0': 504 return "octal literal" 505 case 'b': 506 return "binary literal" 507 } 508 return "decimal literal" 509 } 510 511 // invalidSep returns the index of the first invalid separator in x, or -1. 512 func invalidSep(x string) int { 513 x1 := ' ' // prefix char, we only care if it's 'x' 514 d := '.' // digit, one of '_', '0' (a digit), or '.' 
(anything else) 515 i := 0 516 517 // a prefix counts as a digit 518 if len(x) >= 2 && x[0] == '0' { 519 x1 = lower(rune(x[1])) 520 if x1 == 'x' || x1 == 'o' || x1 == 'b' { 521 d = '0' 522 i = 2 523 } 524 } 525 526 // mantissa and exponent 527 for ; i < len(x); i++ { 528 p := d // previous digit 529 d = rune(x[i]) 530 switch { 531 case d == '_': 532 if p != '0' { 533 return i 534 } 535 case isDecimal(d) || x1 == 'x' && isHex(d): 536 d = '0' 537 default: 538 if p == '_' { 539 return i - 1 540 } 541 d = '.' 542 } 543 } 544 if d == '_' { 545 return len(x) - 1 546 } 547 548 return -1 549 } 550 551 // scanEscape parses an escape sequence where rune is the accepted 552 // escaped quote. In case of a syntax error, it stops at the offending 553 // character (without consuming it) and returns false. Otherwise 554 // it returns true. 555 func (s *Scanner) scanEscape(quote rune) bool { 556 offs := s.offset 557 558 var n int 559 var base, max uint32 560 switch s.ch { 561 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 562 s.next() 563 return true 564 case '0', '1', '2', '3', '4', '5', '6', '7': 565 n, base, max = 3, 8, 255 566 case 'x': 567 s.next() 568 n, base, max = 2, 16, 255 569 case 'u': 570 s.next() 571 n, base, max = 4, 16, unicode.MaxRune 572 case 'U': 573 s.next() 574 n, base, max = 8, 16, unicode.MaxRune 575 default: 576 msg := "unknown escape sequence" 577 if s.ch < 0 { 578 msg = "escape sequence not terminated" 579 } 580 s.error(offs, msg) 581 return false 582 } 583 584 var x uint32 585 for n > 0 { 586 d := uint32(digitVal(s.ch)) 587 if d >= base { 588 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 589 if s.ch < 0 { 590 msg = "escape sequence not terminated" 591 } 592 s.error(s.offset, msg) 593 return false 594 } 595 x = x*base + d 596 s.next() 597 n-- 598 } 599 600 if x > max || 0xD800 <= x && x < 0xE000 { 601 s.error(offs, "escape sequence is invalid Unicode code point") 602 return false 603 } 604 605 return true 606 } 607 608 func (s 
*Scanner) scanRune() string { 609 // '\'' opening already consumed 610 offs := s.offset - 1 611 612 valid := true 613 n := 0 614 for { 615 ch := s.ch 616 if ch == '\n' || ch < 0 { 617 // only report error if we don't have one already 618 if valid { 619 s.error(offs, "rune literal not terminated") 620 valid = false 621 } 622 break 623 } 624 s.next() 625 if ch == '\'' { 626 break 627 } 628 n++ 629 if ch == '\\' { 630 if !s.scanEscape('\'') { 631 valid = false 632 } 633 // continue to read to closing quote 634 } 635 } 636 637 if valid && n != 1 { 638 s.error(offs, "illegal rune literal") 639 } 640 641 return string(s.src[offs:s.offset]) 642 } 643 644 func (s *Scanner) scanString() string { 645 // '"' opening already consumed 646 offs := s.offset - 1 647 648 for { 649 ch := s.ch 650 if ch == '\n' || ch < 0 { 651 s.error(offs, "string literal not terminated") 652 break 653 } 654 s.next() 655 if ch == '"' { 656 break 657 } 658 if ch == '\\' { 659 s.scanEscape('"') 660 } 661 } 662 663 return string(s.src[offs:s.offset]) 664 } 665 666 func stripCR(b []byte, comment bool) []byte { 667 c := make([]byte, len(b)) 668 i := 0 669 for j, ch := range b { 670 // In a /*-style comment, don't strip \r from *\r/ (incl. 671 // sequences of \r from *\r\r...\r/) since the resulting 672 // */ would terminate the comment too early unless the \r 673 // is immediately following the opening /* in which case 674 // it's ok because /*/ is not closed yet (issue #11151). 
675 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' { 676 c[i] = ch 677 i++ 678 } 679 } 680 return c[:i] 681 } 682 683 func (s *Scanner) scanRawString() string { 684 // '`' opening already consumed 685 offs := s.offset - 1 686 687 hasCR := false 688 for { 689 ch := s.ch 690 if ch < 0 { 691 s.error(offs, "raw string literal not terminated") 692 break 693 } 694 s.next() 695 if ch == '`' { 696 break 697 } 698 if ch == '\r' { 699 hasCR = true 700 } 701 } 702 703 lit := s.src[offs:s.offset] 704 if hasCR { 705 lit = stripCR(lit, false) 706 } 707 708 return string(lit) 709 } 710 711 func (s *Scanner) skipWhitespace() { 712 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 713 s.next() 714 } 715 } 716 717 // Helper functions for scanning multi-byte tokens such as >> += >>= . 718 // Different routines recognize different length tok_i based on matches 719 // of ch_i. If a token ends in '=', the result is tok1 or tok3 720 // respectively. Otherwise, the result is tok0 if there was no other 721 // matching character, or tok2 if the matching character was ch2. 722 723 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 724 if s.ch == '=' { 725 s.next() 726 return tok1 727 } 728 return tok0 729 } 730 731 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 732 if s.ch == '=' { 733 s.next() 734 return tok1 735 } 736 if s.ch == ch2 { 737 s.next() 738 return tok2 739 } 740 return tok0 741 } 742 743 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 744 if s.ch == '=' { 745 s.next() 746 return tok1 747 } 748 if s.ch == ch2 { 749 s.next() 750 if s.ch == '=' { 751 s.next() 752 return tok3 753 } 754 return tok2 755 } 756 return tok0 757 } 758 759 // Scan scans the next token and returns the token position, the token, 760 // and its literal string if applicable. The source end is indicated by 761 // token.EOF. 
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	if s.nlPos.IsValid() {
		// Return artificial ';' token after /*...*/ comment
		// containing newline, at position of first newline.
		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
		s.nlPos = token.NoPos
		return
	}

	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	//
	// insertSemi records whether the token scanned in this call may be
	// followed by an automatically inserted semicolon; it is copied into
	// s.insertSemi at the end unless dontInsertSemis is set.
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		// a number, including a fraction that starts with '.'
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		// From here on ch is the character being classified and s.ch is
		// already the character after it.
		s.next() // always make progress
		switch ch {
		case eof:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				comment, nlOffset := s.scanComment()
				if s.insertSemi && nlOffset != 0 {
					// For /*...*/ containing \n, return
					// COMMENT then artificial SEMICOLON.
					s.nlPos = s.file.Pos(nlOffset)
					s.insertSemi = false
				} else {
					insertSemi = s.insertSemi // preserve insertSemi info
				}
				if s.mode&ScanComments == 0 {
					// skip comment
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// dontInsertSemis leaves s.insertSemi untouched, so no semicolon is
	// ever inserted in that mode.
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}