github.com/AndrienkoAleksandr/go@v0.0.19/src/go/scanner/scanner.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner implements a scanner for Go source text. 6 // It takes a []byte as source which can then be tokenized 7 // through repeated calls to the Scan method. 8 package scanner 9 10 import ( 11 "bytes" 12 "fmt" 13 "go/token" 14 "path/filepath" 15 "strconv" 16 "unicode" 17 "unicode/utf8" 18 ) 19 20 // An ErrorHandler may be provided to Scanner.Init. If a syntax error is 21 // encountered and a handler was installed, the handler is called with a 22 // position and an error message. The position points to the beginning of 23 // the offending token. 24 type ErrorHandler func(pos token.Position, msg string) 25 26 // A Scanner holds the scanner's internal state while processing 27 // a given text. It can be allocated as part of another data 28 // structure but must be initialized via Init before use. 29 type Scanner struct { 30 // immutable state 31 file *token.File // source file handle 32 dir string // directory portion of file.Name() 33 src []byte // source 34 err ErrorHandler // error reporting; or nil 35 mode Mode // scanning mode 36 37 // scanning state 38 ch rune // current character 39 offset int // character offset 40 rdOffset int // reading offset (position after current character) 41 lineOffset int // current line offset 42 insertSemi bool // insert a semicolon before next newline 43 nlPos token.Pos // position of newline in preceding comment 44 45 // public state - ok to modify 46 ErrorCount int // number of errors encountered 47 } 48 49 const ( 50 bom = 0xFEFF // byte order mark, only permitted as very first character 51 eof = -1 // end of file 52 ) 53 54 // Read the next Unicode char into s.ch. 55 // s.ch < 0 means end-of-file. 56 // 57 // For optimization, there is some overlap between this method and 58 // s.scanIdentifier. 59 func (s *Scanner) next() { 60 if s.rdOffset < len(s.src) { 61 s.offset = s.rdOffset 62 if s.ch == '\n' { 63 s.lineOffset = s.offset 64 s.file.AddLine(s.offset) 65 } 66 r, w := rune(s.src[s.rdOffset]), 1 67 switch { 68 case r == 0: 69 s.error(s.offset, "illegal character NUL") 70 case r >= utf8.RuneSelf: 71 // not ASCII 72 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 73 if r == utf8.RuneError && w == 1 { 74 s.error(s.offset, "illegal UTF-8 encoding") 75 } else if r == bom && s.offset > 0 { 76 s.error(s.offset, "illegal byte order mark") 77 } 78 } 79 s.rdOffset += w 80 s.ch = r 81 } else { 82 s.offset = len(s.src) 83 if s.ch == '\n' { 84 s.lineOffset = s.offset 85 s.file.AddLine(s.offset) 86 } 87 s.ch = eof 88 } 89 } 90 91 // peek returns the byte following the most recently read character without 92 // advancing the scanner. If the scanner is at EOF, peek returns 0. 93 func (s *Scanner) peek() byte { 94 if s.rdOffset < len(s.src) { 95 return s.src[s.rdOffset] 96 } 97 return 0 98 } 99 100 // A mode value is a set of flags (or 0). 101 // They control scanner behavior. 102 type Mode uint 103 104 const ( 105 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 106 dontInsertSemis // do not automatically insert semicolons - for testing only 107 ) 108 109 // Init prepares the scanner s to tokenize the text src by setting the 110 // scanner at the beginning of src. The scanner uses the file set file 111 // for position information and it adds line information for each line. 112 // It is ok to re-use the same file when re-scanning the same file as 113 // line information which is already present is ignored. Init causes a 114 // panic if the file size does not match the src size. 115 // 116 // Calls to Scan will invoke the error handler err if they encounter a 117 // syntax error and err is not nil. Also, for each error encountered, 118 // the Scanner field ErrorCount is incremented by one. The mode parameter 119 // determines how comments are handled. 120 // 121 // Note that Init may call err if there is an error in the first character 122 // of the file. 123 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 124 // Explicitly initialize all fields since a scanner may be reused. 125 if file.Size() != len(src) { 126 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 127 } 128 s.file = file 129 s.dir, _ = filepath.Split(file.Name()) 130 s.src = src 131 s.err = err 132 s.mode = mode 133 134 s.ch = ' ' 135 s.offset = 0 136 s.rdOffset = 0 137 s.lineOffset = 0 138 s.insertSemi = false 139 s.ErrorCount = 0 140 141 s.next() 142 if s.ch == bom { 143 s.next() // ignore BOM at file beginning 144 } 145 } 146 147 func (s *Scanner) error(offs int, msg string) { 148 if s.err != nil { 149 s.err(s.file.Position(s.file.Pos(offs)), msg) 150 } 151 s.ErrorCount++ 152 } 153 154 func (s *Scanner) errorf(offs int, format string, args ...any) { 155 s.error(offs, fmt.Sprintf(format, args...)) 156 } 157 158 // scanComment returns the text of the comment and (if nonzero) 159 // the offset of the first newline within it, which implies a 160 // /*...*/ comment. 161 func (s *Scanner) scanComment() (string, int) { 162 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 163 offs := s.offset - 1 // position of initial '/' 164 next := -1 // position immediately following the comment; < 0 means invalid comment 165 numCR := 0 166 nlOffset := 0 // offset of first newline within /*...*/ comment 167 168 if s.ch == '/' { 169 //-style comment 170 // (the final '\n' is not considered part of the comment) 171 s.next() 172 for s.ch != '\n' && s.ch >= 0 { 173 if s.ch == '\r' { 174 numCR++ 175 } 176 s.next() 177 } 178 // if we are at '\n', the position following the comment is afterwards 179 next = s.offset 180 if s.ch == '\n' { 181 next++ 182 } 183 goto exit 184 } 185 186 /*-style comment */ 187 s.next() 188 for s.ch >= 0 { 189 ch := s.ch 190 if ch == '\r' { 191 numCR++ 192 } else if ch == '\n' && nlOffset == 0 { 193 nlOffset = s.offset 194 } 195 s.next() 196 if ch == '*' && s.ch == '/' { 197 s.next() 198 next = s.offset 199 goto exit 200 } 201 } 202 203 s.error(offs, "comment not terminated") 204 205 exit: 206 lit := s.src[offs:s.offset] 207 208 // On Windows, a (//-comment) line may end in "\r\n". 209 // Remove the final '\r' before analyzing the text for 210 // line directives (matching the compiler). Remove any 211 // other '\r' afterwards (matching the pre-existing be- 212 // havior of the scanner). 213 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' { 214 lit = lit[:len(lit)-1] 215 numCR-- 216 } 217 218 // interpret line directives 219 // (//line directives must start at the beginning of the current line) 220 if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) { 221 s.updateLineInfo(next, offs, lit) 222 } 223 224 if numCR > 0 { 225 lit = stripCR(lit, lit[1] == '*') 226 } 227 228 return string(lit), nlOffset 229 } 230 231 var prefix = []byte("line ") 232 233 // updateLineInfo parses the incoming comment text at offset offs 234 // as a line directive. If successful, it updates the line info table 235 // for the position next per the line directive. 236 func (s *Scanner) updateLineInfo(next, offs int, text []byte) { 237 // extract comment text 238 if text[1] == '*' { 239 text = text[:len(text)-2] // lop off trailing "*/" 240 } 241 text = text[7:] // lop off leading "//line " or "/*line " 242 offs += 7 243 244 i, n, ok := trailingDigits(text) 245 if i == 0 { 246 return // ignore (not a line directive) 247 } 248 // i > 0 249 250 if !ok { 251 // text has a suffix :xxx but xxx is not a number 252 s.error(offs+i, "invalid line number: "+string(text[i:])) 253 return 254 } 255 256 // Put a cap on the maximum size of line and column numbers. 257 // 30 bits allows for some additional space before wrapping an int32. 258 // Keep this consistent with cmd/compile/internal/syntax.PosMax. 259 const maxLineCol = 1 << 30 260 var line, col int 261 i2, n2, ok2 := trailingDigits(text[:i-1]) 262 if ok2 { 263 //line filename:line:col 264 i, i2 = i2, i 265 line, col = n2, n 266 if col == 0 || col > maxLineCol { 267 s.error(offs+i2, "invalid column number: "+string(text[i2:])) 268 return 269 } 270 text = text[:i2-1] // lop off ":col" 271 } else { 272 //line filename:line 273 line = n 274 } 275 276 if line == 0 || line > maxLineCol { 277 s.error(offs+i, "invalid line number: "+string(text[i:])) 278 return 279 } 280 281 // If we have a column (//line filename:line:col form), 282 // an empty filename means to use the previous filename. 283 filename := string(text[:i-1]) // lop off ":line", and trim white space 284 if filename == "" && ok2 { 285 filename = s.file.Position(s.file.Pos(offs)).Filename 286 } else if filename != "" { 287 // Put a relative filename in the current directory. 288 // This is for compatibility with earlier releases. 289 // See issue 26671. 290 filename = filepath.Clean(filename) 291 if !filepath.IsAbs(filename) { 292 filename = filepath.Join(s.dir, filename) 293 } 294 } 295 296 s.file.AddLineColumnInfo(next, filename, line, col) 297 } 298 299 func trailingDigits(text []byte) (int, int, bool) { 300 i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':') 301 if i < 0 { 302 return 0, 0, false // no ":" 303 } 304 // i >= 0 305 n, err := strconv.ParseUint(string(text[i+1:]), 10, 0) 306 return i + 1, int(n), err == nil 307 } 308 309 func isLetter(ch rune) bool { 310 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) 311 } 312 313 func isDigit(ch rune) bool { 314 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 315 } 316 317 // scanIdentifier reads the string of valid identifier characters at s.offset. 318 // It must only be called when s.ch is known to be a valid letter. 319 // 320 // Be careful when making changes to this function: it is optimized and affects 321 // scanning performance significantly. 322 func (s *Scanner) scanIdentifier() string { 323 offs := s.offset 324 325 // Optimize for the common case of an ASCII identifier. 326 // 327 // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and 328 // avoids conversions to runes. 329 // 330 // In case we encounter a non-ASCII character, fall back on the slower path 331 // of calling into s.next(). 332 for rdOffset, b := range s.src[s.rdOffset:] { 333 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' { 334 // Avoid assigning a rune for the common case of an ascii character. 335 continue 336 } 337 s.rdOffset += rdOffset 338 if 0 < b && b < utf8.RuneSelf { 339 // Optimization: we've encountered an ASCII character that's not a letter 340 // or number. Avoid the call into s.next() and corresponding set up. 341 // 342 // Note that s.next() does some line accounting if s.ch is '\n', so this 343 // shortcut is only possible because we know that the preceding character 344 // is not '\n'. 345 s.ch = rune(b) 346 s.offset = s.rdOffset 347 s.rdOffset++ 348 goto exit 349 } 350 // We know that the preceding character is valid for an identifier because 351 // scanIdentifier is only called when s.ch is a letter, so calling s.next() 352 // at s.rdOffset resets the scanner state. 353 s.next() 354 for isLetter(s.ch) || isDigit(s.ch) { 355 s.next() 356 } 357 goto exit 358 } 359 s.offset = len(s.src) 360 s.rdOffset = len(s.src) 361 s.ch = eof 362 363 exit: 364 return string(s.src[offs:s.offset]) 365 } 366 367 func digitVal(ch rune) int { 368 switch { 369 case '0' <= ch && ch <= '9': 370 return int(ch - '0') 371 case 'a' <= lower(ch) && lower(ch) <= 'f': 372 return int(lower(ch) - 'a' + 10) 373 } 374 return 16 // larger than any legal digit val 375 } 376 377 func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter 378 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 379 func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } 380 381 // digits accepts the sequence { digit | '_' }. 382 // If base <= 10, digits accepts any decimal digit but records 383 // the offset (relative to the source start) of a digit >= base 384 // in *invalid, if *invalid < 0. 385 // digits returns a bitset describing whether the sequence contained 386 // digits (bit 0 is set), or separators '_' (bit 1 is set). 387 func (s *Scanner) digits(base int, invalid *int) (digsep int) { 388 if base <= 10 { 389 max := rune('0' + base) 390 for isDecimal(s.ch) || s.ch == '_' { 391 ds := 1 392 if s.ch == '_' { 393 ds = 2 394 } else if s.ch >= max && *invalid < 0 { 395 *invalid = s.offset // record invalid rune offset 396 } 397 digsep |= ds 398 s.next() 399 } 400 } else { 401 for isHex(s.ch) || s.ch == '_' { 402 ds := 1 403 if s.ch == '_' { 404 ds = 2 405 } 406 digsep |= ds 407 s.next() 408 } 409 } 410 return 411 } 412 413 func (s *Scanner) scanNumber() (token.Token, string) { 414 offs := s.offset 415 tok := token.ILLEGAL 416 417 base := 10 // number base 418 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' 419 digsep := 0 // bit 0: digit present, bit 1: '_' present 420 invalid := -1 // index of invalid digit in literal, or < 0 421 422 // integer part 423 if s.ch != '.' { 424 tok = token.INT 425 if s.ch == '0' { 426 s.next() 427 switch lower(s.ch) { 428 case 'x': 429 s.next() 430 base, prefix = 16, 'x' 431 case 'o': 432 s.next() 433 base, prefix = 8, 'o' 434 case 'b': 435 s.next() 436 base, prefix = 2, 'b' 437 default: 438 base, prefix = 8, '0' 439 digsep = 1 // leading 0 440 } 441 } 442 digsep |= s.digits(base, &invalid) 443 } 444 445 // fractional part 446 if s.ch == '.' { 447 tok = token.FLOAT 448 if prefix == 'o' || prefix == 'b' { 449 s.error(s.offset, "invalid radix point in "+litname(prefix)) 450 } 451 s.next() 452 digsep |= s.digits(base, &invalid) 453 } 454 455 if digsep&1 == 0 { 456 s.error(s.offset, litname(prefix)+" has no digits") 457 } 458 459 // exponent 460 if e := lower(s.ch); e == 'e' || e == 'p' { 461 switch { 462 case e == 'e' && prefix != 0 && prefix != '0': 463 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch) 464 case e == 'p' && prefix != 'x': 465 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch) 466 } 467 s.next() 468 tok = token.FLOAT 469 if s.ch == '+' || s.ch == '-' { 470 s.next() 471 } 472 ds := s.digits(10, nil) 473 digsep |= ds 474 if ds&1 == 0 { 475 s.error(s.offset, "exponent has no digits") 476 } 477 } else if prefix == 'x' && tok == token.FLOAT { 478 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent") 479 } 480 481 // suffix 'i' 482 if s.ch == 'i' { 483 tok = token.IMAG 484 s.next() 485 } 486 487 lit := string(s.src[offs:s.offset]) 488 if tok == token.INT && invalid >= 0 { 489 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix)) 490 } 491 if digsep&2 != 0 { 492 if i := invalidSep(lit); i >= 0 { 493 s.error(offs+i, "'_' must separate successive digits") 494 } 495 } 496 497 return tok, lit 498 } 499 500 func litname(prefix rune) string { 501 switch prefix { 502 case 'x': 503 return "hexadecimal literal" 504 case 'o', '0': 505 return "octal literal" 506 case 'b': 507 return "binary literal" 508 } 509 return "decimal literal" 510 } 511 512 // invalidSep returns the index of the first invalid separator in x, or -1. 513 func invalidSep(x string) int { 514 x1 := ' ' // prefix char, we only care if it's 'x' 515 d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else) 516 i := 0 517 518 // a prefix counts as a digit 519 if len(x) >= 2 && x[0] == '0' { 520 x1 = lower(rune(x[1])) 521 if x1 == 'x' || x1 == 'o' || x1 == 'b' { 522 d = '0' 523 i = 2 524 } 525 } 526 527 // mantissa and exponent 528 for ; i < len(x); i++ { 529 p := d // previous digit 530 d = rune(x[i]) 531 switch { 532 case d == '_': 533 if p != '0' { 534 return i 535 } 536 case isDecimal(d) || x1 == 'x' && isHex(d): 537 d = '0' 538 default: 539 if p == '_' { 540 return i - 1 541 } 542 d = '.' 543 } 544 } 545 if d == '_' { 546 return len(x) - 1 547 } 548 549 return -1 550 } 551 552 // scanEscape parses an escape sequence where rune is the accepted 553 // escaped quote. In case of a syntax error, it stops at the offending 554 // character (without consuming it) and returns false. Otherwise 555 // it returns true. 556 func (s *Scanner) scanEscape(quote rune) bool { 557 offs := s.offset 558 559 var n int 560 var base, max uint32 561 switch s.ch { 562 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 563 s.next() 564 return true 565 case '0', '1', '2', '3', '4', '5', '6', '7': 566 n, base, max = 3, 8, 255 567 case 'x': 568 s.next() 569 n, base, max = 2, 16, 255 570 case 'u': 571 s.next() 572 n, base, max = 4, 16, unicode.MaxRune 573 case 'U': 574 s.next() 575 n, base, max = 8, 16, unicode.MaxRune 576 default: 577 msg := "unknown escape sequence" 578 if s.ch < 0 { 579 msg = "escape sequence not terminated" 580 } 581 s.error(offs, msg) 582 return false 583 } 584 585 var x uint32 586 for n > 0 { 587 d := uint32(digitVal(s.ch)) 588 if d >= base { 589 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 590 if s.ch < 0 { 591 msg = "escape sequence not terminated" 592 } 593 s.error(s.offset, msg) 594 return false 595 } 596 x = x*base + d 597 s.next() 598 n-- 599 } 600 601 if x > max || 0xD800 <= x && x < 0xE000 { 602 s.error(offs, "escape sequence is invalid Unicode code point") 603 return false 604 } 605 606 return true 607 } 608 609 func (s *Scanner) scanRune() string { 610 // '\'' opening already consumed 611 offs := s.offset - 1 612 613 valid := true 614 n := 0 615 for { 616 ch := s.ch 617 if ch == '\n' || ch < 0 { 618 // only report error if we don't have one already 619 if valid { 620 s.error(offs, "rune literal not terminated") 621 valid = false 622 } 623 break 624 } 625 s.next() 626 if ch == '\'' { 627 break 628 } 629 n++ 630 if ch == '\\' { 631 if !s.scanEscape('\'') { 632 valid = false 633 } 634 // continue to read to closing quote 635 } 636 } 637 638 if valid && n != 1 { 639 s.error(offs, "illegal rune literal") 640 } 641 642 return string(s.src[offs:s.offset]) 643 } 644 645 func (s *Scanner) scanString() string { 646 // '"' opening already consumed 647 offs := s.offset - 1 648 649 for { 650 ch := s.ch 651 if ch == '\n' || ch < 0 { 652 s.error(offs, "string literal not terminated") 653 break 654 } 655 s.next() 656 if ch == '"' { 657 break 658 } 659 if ch == '\\' { 660 s.scanEscape('"') 661 } 662 } 663 664 return string(s.src[offs:s.offset]) 665 } 666 667 func stripCR(b []byte, comment bool) []byte { 668 c := make([]byte, len(b)) 669 i := 0 670 for j, ch := range b { 671 // In a /*-style comment, don't strip \r from *\r/ (incl. 672 // sequences of \r from *\r\r...\r/) since the resulting 673 // */ would terminate the comment too early unless the \r 674 // is immediately following the opening /* in which case 675 // it's ok because /*/ is not closed yet (issue #11151). 676 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' { 677 c[i] = ch 678 i++ 679 } 680 } 681 return c[:i] 682 } 683 684 func (s *Scanner) scanRawString() string { 685 // '`' opening already consumed 686 offs := s.offset - 1 687 688 hasCR := false 689 for { 690 ch := s.ch 691 if ch < 0 { 692 s.error(offs, "raw string literal not terminated") 693 break 694 } 695 s.next() 696 if ch == '`' { 697 break 698 } 699 if ch == '\r' { 700 hasCR = true 701 } 702 } 703 704 lit := s.src[offs:s.offset] 705 if hasCR { 706 lit = stripCR(lit, false) 707 } 708 709 return string(lit) 710 } 711 712 func (s *Scanner) skipWhitespace() { 713 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 714 s.next() 715 } 716 } 717 718 // Helper functions for scanning multi-byte tokens such as >> += >>= . 719 // Different routines recognize different length tok_i based on matches 720 // of ch_i. If a token ends in '=', the result is tok1 or tok3 721 // respectively. Otherwise, the result is tok0 if there was no other 722 // matching character, or tok2 if the matching character was ch2. 723 724 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 725 if s.ch == '=' { 726 s.next() 727 return tok1 728 } 729 return tok0 730 } 731 732 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 733 if s.ch == '=' { 734 s.next() 735 return tok1 736 } 737 if s.ch == ch2 { 738 s.next() 739 return tok2 740 } 741 return tok0 742 } 743 744 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 745 if s.ch == '=' { 746 s.next() 747 return tok1 748 } 749 if s.ch == ch2 { 750 s.next() 751 if s.ch == '=' { 752 s.next() 753 return tok3 754 } 755 return tok2 756 } 757 return tok0 758 } 759 760 // Scan scans the next token and returns the token position, the token, 761 // and its literal string if applicable. The source end is indicated by 762 // token.EOF. 763 // 764 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 765 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 766 // has the corresponding value. 767 // 768 // If the returned token is a keyword, the literal string is the keyword. 769 // 770 // If the returned token is token.SEMICOLON, the corresponding 771 // literal string is ";" if the semicolon was present in the source, 772 // and "\n" if the semicolon was inserted because of a newline or 773 // at EOF. 774 // 775 // If the returned token is token.ILLEGAL, the literal string is the 776 // offending character. 777 // 778 // In all other cases, Scan returns an empty literal string. 779 // 780 // For more tolerant parsing, Scan will return a valid token if 781 // possible even if a syntax error was encountered. Thus, even 782 // if the resulting token sequence contains no illegal tokens, 783 // a client may not assume that no error occurred. Instead it 784 // must check the scanner's ErrorCount or the number of calls 785 // of the error handler, if there was one installed. 786 // 787 // Scan adds line information to the file added to the file 788 // set with Init. Token positions are relative to that file 789 // and thus relative to the file set. 790 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 791 scanAgain: 792 if s.nlPos.IsValid() { 793 // Return artificial ';' token after /*...*/ comment 794 // containing newline, at position of first newline. 795 pos, tok, lit = s.nlPos, token.SEMICOLON, "\n" 796 s.nlPos = token.NoPos 797 return 798 } 799 800 s.skipWhitespace() 801 802 // current token start 803 pos = s.file.Pos(s.offset) 804 805 // determine token value 806 insertSemi := false 807 switch ch := s.ch; { 808 case isLetter(ch): 809 lit = s.scanIdentifier() 810 if len(lit) > 1 { 811 // keywords are longer than one letter - avoid lookup otherwise 812 tok = token.Lookup(lit) 813 switch tok { 814 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: 815 insertSemi = true 816 } 817 } else { 818 insertSemi = true 819 tok = token.IDENT 820 } 821 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())): 822 insertSemi = true 823 tok, lit = s.scanNumber() 824 default: 825 s.next() // always make progress 826 switch ch { 827 case eof: 828 if s.insertSemi { 829 s.insertSemi = false // EOF consumed 830 return pos, token.SEMICOLON, "\n" 831 } 832 tok = token.EOF 833 case '\n': 834 // we only reach here if s.insertSemi was 835 // set in the first place and exited early 836 // from s.skipWhitespace() 837 s.insertSemi = false // newline consumed 838 return pos, token.SEMICOLON, "\n" 839 case '"': 840 insertSemi = true 841 tok = token.STRING 842 lit = s.scanString() 843 case '\'': 844 insertSemi = true 845 tok = token.CHAR 846 lit = s.scanRune() 847 case '`': 848 insertSemi = true 849 tok = token.STRING 850 lit = s.scanRawString() 851 case ':': 852 tok = s.switch2(token.COLON, token.DEFINE) 853 case '.': 854 // fractions starting with a '.' are handled by outer switch 855 tok = token.PERIOD 856 if s.ch == '.' && s.peek() == '.' { 857 s.next() 858 s.next() // consume last '.' 859 tok = token.ELLIPSIS 860 } 861 case ',': 862 tok = token.COMMA 863 case ';': 864 tok = token.SEMICOLON 865 lit = ";" 866 case '(': 867 tok = token.LPAREN 868 case ')': 869 insertSemi = true 870 tok = token.RPAREN 871 case '[': 872 tok = token.LBRACK 873 case ']': 874 insertSemi = true 875 tok = token.RBRACK 876 case '{': 877 tok = token.LBRACE 878 case '}': 879 insertSemi = true 880 tok = token.RBRACE 881 case '+': 882 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC) 883 if tok == token.INC { 884 insertSemi = true 885 } 886 case '-': 887 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC) 888 if tok == token.DEC { 889 insertSemi = true 890 } 891 case '*': 892 tok = s.switch2(token.MUL, token.MUL_ASSIGN) 893 case '/': 894 if s.ch == '/' || s.ch == '*' { 895 // comment 896 comment, nlOffset := s.scanComment() 897 if s.insertSemi && nlOffset != 0 { 898 // For /*...*/ containing \n, return 899 // COMMENT then artificial SEMICOLON. 900 s.nlPos = s.file.Pos(nlOffset) 901 s.insertSemi = false 902 } else { 903 insertSemi = s.insertSemi // preserve insertSemi info 904 } 905 if s.mode&ScanComments == 0 { 906 // skip comment 907 goto scanAgain 908 } 909 tok = token.COMMENT 910 lit = comment 911 } else { 912 // division 913 tok = s.switch2(token.QUO, token.QUO_ASSIGN) 914 } 915 case '%': 916 tok = s.switch2(token.REM, token.REM_ASSIGN) 917 case '^': 918 tok = s.switch2(token.XOR, token.XOR_ASSIGN) 919 case '<': 920 if s.ch == '-' { 921 s.next() 922 tok = token.ARROW 923 } else { 924 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) 925 } 926 case '>': 927 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN) 928 case '=': 929 tok = s.switch2(token.ASSIGN, token.EQL) 930 case '!': 931 tok = s.switch2(token.NOT, token.NEQ) 932 case '&': 933 if s.ch == '^' { 934 s.next() 935 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) 936 } else { 937 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND) 938 } 939 case '|': 940 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) 941 case '~': 942 tok = token.TILDE 943 default: 944 // next reports unexpected BOMs - don't repeat 945 if ch != bom { 946 s.errorf(s.file.Offset(pos), "illegal character %#U", ch) 947 } 948 insertSemi = s.insertSemi // preserve insertSemi info 949 tok = token.ILLEGAL 950 lit = string(ch) 951 } 952 } 953 if s.mode&dontInsertSemis == 0 { 954 s.insertSemi = insertSemi 955 } 956 957 return 958 }