github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/text/scanner/scanner.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text. 6 // It takes an io.Reader providing the source, which then can be tokenized 7 // through repeated calls to the Scan function. For compatibility with 8 // existing tools, the NUL character is not allowed. If the first character 9 // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded. 10 // 11 // By default, a Scanner skips white space and Go comments and recognizes all 12 // literals as defined by the Go language specification. It may be 13 // customized to recognize only a subset of those literals and to recognize 14 // different identifier and white space characters. 15 package scanner 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "os" 22 "unicode" 23 "unicode/utf8" 24 ) 25 26 // A source position is represented by a Position value. 27 // A position is valid if Line > 0. 28 type Position struct { 29 Filename string // filename, if any 30 Offset int // byte offset, starting at 0 31 Line int // line number, starting at 1 32 Column int // column number, starting at 1 (character count per line) 33 } 34 35 // IsValid reports whether the position is valid. 36 func (pos *Position) IsValid() bool { return pos.Line > 0 } 37 38 func (pos Position) String() string { 39 s := pos.Filename 40 if s == "" { 41 s = "<input>" 42 } 43 if pos.IsValid() { 44 s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column) 45 } 46 return s 47 } 48 49 // Predefined mode bits to control recognition of tokens. 
For instance, 50 // to configure a Scanner such that it only recognizes (Go) identifiers, 51 // integers, and skips comments, set the Scanner's Mode field to: 52 // 53 // ScanIdents | ScanInts | SkipComments 54 // 55 // With the exceptions of comments, which are skipped if SkipComments is 56 // set, unrecognized tokens are not ignored. Instead, the scanner simply 57 // returns the respective individual characters (or possibly sub-tokens). 58 // For instance, if the mode is ScanIdents (not ScanStrings), the string 59 // "foo" is scanned as the token sequence '"' Ident '"'. 60 // 61 const ( 62 ScanIdents = 1 << -Ident 63 ScanInts = 1 << -Int 64 ScanFloats = 1 << -Float // includes Ints 65 ScanChars = 1 << -Char 66 ScanStrings = 1 << -String 67 ScanRawStrings = 1 << -RawString 68 ScanComments = 1 << -Comment 69 SkipComments = 1 << -skipComment // if set with ScanComments, comments become white space 70 GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments 71 ) 72 73 // The result of Scan is one of these tokens or a Unicode character. 74 const ( 75 EOF = -(iota + 1) 76 Ident 77 Int 78 Float 79 Char 80 String 81 RawString 82 Comment 83 skipComment 84 ) 85 86 var tokenString = map[rune]string{ 87 EOF: "EOF", 88 Ident: "Ident", 89 Int: "Int", 90 Float: "Float", 91 Char: "Char", 92 String: "String", 93 RawString: "RawString", 94 Comment: "Comment", 95 } 96 97 // TokenString returns a printable string for a token or Unicode character. 98 func TokenString(tok rune) string { 99 if s, found := tokenString[tok]; found { 100 return s 101 } 102 return fmt.Sprintf("%q", string(tok)) 103 } 104 105 // GoWhitespace is the default value for the Scanner's Whitespace field. 106 // Its value selects Go's white space characters. 107 const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ' 108 109 const bufLen = 1024 // at least utf8.UTFMax 110 111 // A Scanner implements reading of Unicode characters and tokens from an io.Reader. 
112 type Scanner struct { 113 // Input 114 src io.Reader 115 116 // Source buffer 117 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 118 srcPos int // reading position (srcBuf index) 119 srcEnd int // source end (srcBuf index) 120 121 // Source position 122 srcBufOffset int // byte offset of srcBuf[0] in source 123 line int // line count 124 column int // character count 125 lastLineLen int // length of last line in characters (for correct column reporting) 126 lastCharLen int // length of last character in bytes 127 128 // Token text buffer 129 // Typically, token text is stored completely in srcBuf, but in general 130 // the token text's head may be buffered in tokBuf while the token text's 131 // tail is stored in srcBuf. 132 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 133 tokPos int // token text tail position (srcBuf index); valid if >= 0 134 tokEnd int // token text tail end (srcBuf index) 135 136 // One character look-ahead 137 ch rune // character before current srcPos 138 139 // Error is called for each error encountered. If no Error 140 // function is set, the error is reported to os.Stderr. 141 Error func(s *Scanner, msg string) 142 143 // ErrorCount is incremented by one for each error encountered. 144 ErrorCount int 145 146 // The Mode field controls which tokens are recognized. For instance, 147 // to recognize Ints, set the ScanInts bit in Mode. The field may be 148 // changed at any time. 149 Mode uint 150 151 // The Whitespace field controls which characters are recognized 152 // as white space. To recognize a character ch <= ' ' as white space, 153 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 154 // for values ch > ' '). The field may be changed at any time. 155 Whitespace uint64 156 157 // IsIdentRune is a predicate controlling the characters accepted 158 // as the ith rune in an identifier. 
The set of valid characters 159 // must not intersect with the set of white space characters. 160 // If no IsIdentRune function is set, regular Go identifiers are 161 // accepted instead. The field may be changed at any time. 162 IsIdentRune func(ch rune, i int) bool 163 164 // Start position of most recently scanned token; set by Scan. 165 // Calling Init or Next invalidates the position (Line == 0). 166 // The Filename field is always left untouched by the Scanner. 167 // If an error is reported (via Error) and Position is invalid, 168 // the scanner is not inside a token. Call Pos to obtain an error 169 // position in that case, or to obtain the position immediately 170 // after the most recently scanned token. 171 Position 172 } 173 174 // Init initializes a Scanner with a new source and returns s. 175 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 176 // and Whitespace is set to GoWhitespace. 177 func (s *Scanner) Init(src io.Reader) *Scanner { 178 s.src = src 179 180 // initialize source buffer 181 // (the first call to next() will fill it by calling src.Read) 182 s.srcBuf[0] = utf8.RuneSelf // sentinel 183 s.srcPos = 0 184 s.srcEnd = 0 185 186 // initialize source position 187 s.srcBufOffset = 0 188 s.line = 1 189 s.column = 0 190 s.lastLineLen = 0 191 s.lastCharLen = 0 192 193 // initialize token text buffer 194 // (required for first call to next()). 195 s.tokPos = -1 196 197 // initialize one character look-ahead 198 s.ch = -2 // no char read yet, not EOF 199 200 // initialize public fields 201 s.Error = nil 202 s.ErrorCount = 0 203 s.Mode = GoTokens 204 s.Whitespace = GoWhitespace 205 s.Line = 0 // invalidate token position 206 207 return s 208 } 209 210 // next reads and returns the next Unicode character. It is designed such 211 // that only a minimal amount of work needs to be done in the common ASCII 212 // case (one test to check for both ASCII and end-of-buffer, and one test 213 // to check for newlines). 
func (s *Scanner) next() rune {
	// Optimistically treat the byte at srcPos as a complete ASCII character.
	// The sentinel stored at srcBuf[srcEnd] is utf8.RuneSelf, so reaching the
	// end of the buffered data also fails the ASCII test below.
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes carried over
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				// io.EOF is the normal end of input and is not reported
				if err != io.EOF {
					s.error(err.Error())
				}
				if s.srcEnd == 0 {
					// nothing buffered at all: report EOF
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		// remember the finished line's length so a position right
		// after the newline can still be reported on that line
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}

// Next reads and returns the next Unicode character.
// It returns EOF at the end of the source. It reports
// a read error by calling s.Error, if not nil; otherwise
// it prints an error message to os.Stderr. Next does not
// update the Scanner's Position field; use Pos() to
// get the current position.
func (s *Scanner) Next() rune {
	s.tokPos = -1 // don't collect token text
	s.Line = 0    // invalidate token position
	ch := s.Peek()
	// Once EOF is the look-ahead it stays there; only advance otherwise.
	if ch != EOF {
		s.ch = s.next()
	}
	return ch
}

// Peek returns the next Unicode character in the source without advancing
// the scanner. It returns EOF if the scanner's position is at the last
// character of the source.
313 func (s *Scanner) Peek() rune { 314 if s.ch == -2 { 315 // this code is only run for the very first character 316 s.ch = s.next() 317 if s.ch == '\uFEFF' { 318 s.ch = s.next() // ignore BOM 319 } 320 } 321 return s.ch 322 } 323 324 func (s *Scanner) error(msg string) { 325 s.ErrorCount++ 326 if s.Error != nil { 327 s.Error(s, msg) 328 return 329 } 330 pos := s.Position 331 if !pos.IsValid() { 332 pos = s.Pos() 333 } 334 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 335 } 336 337 func (s *Scanner) isIdentRune(ch rune, i int) bool { 338 if s.IsIdentRune != nil { 339 return s.IsIdentRune(ch, i) 340 } 341 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 342 } 343 344 func (s *Scanner) scanIdentifier() rune { 345 // we know the zero'th rune is OK; start scanning at the next one 346 ch := s.next() 347 for i := 1; s.isIdentRune(ch, i); i++ { 348 ch = s.next() 349 } 350 return ch 351 } 352 353 func digitVal(ch rune) int { 354 switch { 355 case '0' <= ch && ch <= '9': 356 return int(ch - '0') 357 case 'a' <= ch && ch <= 'f': 358 return int(ch - 'a' + 10) 359 case 'A' <= ch && ch <= 'F': 360 return int(ch - 'A' + 10) 361 } 362 return 16 // larger than any legal digit val 363 } 364 365 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 366 367 func (s *Scanner) scanMantissa(ch rune) rune { 368 for isDecimal(ch) { 369 ch = s.next() 370 } 371 return ch 372 } 373 374 func (s *Scanner) scanFraction(ch rune) rune { 375 if ch == '.' 
{ 376 ch = s.scanMantissa(s.next()) 377 } 378 return ch 379 } 380 381 func (s *Scanner) scanExponent(ch rune) rune { 382 if ch == 'e' || ch == 'E' { 383 ch = s.next() 384 if ch == '-' || ch == '+' { 385 ch = s.next() 386 } 387 ch = s.scanMantissa(ch) 388 } 389 return ch 390 } 391 392 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 393 // isDecimal(ch) 394 if ch == '0' { 395 // int or float 396 ch = s.next() 397 if ch == 'x' || ch == 'X' { 398 // hexadecimal int 399 ch = s.next() 400 hasMantissa := false 401 for digitVal(ch) < 16 { 402 ch = s.next() 403 hasMantissa = true 404 } 405 if !hasMantissa { 406 s.error("illegal hexadecimal number") 407 } 408 } else { 409 // octal int or float 410 has8or9 := false 411 for isDecimal(ch) { 412 if ch > '7' { 413 has8or9 = true 414 } 415 ch = s.next() 416 } 417 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 418 // float 419 ch = s.scanFraction(ch) 420 ch = s.scanExponent(ch) 421 return Float, ch 422 } 423 // octal int 424 if has8or9 { 425 s.error("illegal octal number") 426 } 427 } 428 return Int, ch 429 } 430 // decimal int or float 431 ch = s.scanMantissa(ch) 432 if s.Mode&ScanFloats != 0 && (ch == '.' 
|| ch == 'e' || ch == 'E') { 433 // float 434 ch = s.scanFraction(ch) 435 ch = s.scanExponent(ch) 436 return Float, ch 437 } 438 return Int, ch 439 } 440 441 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 442 for n > 0 && digitVal(ch) < base { 443 ch = s.next() 444 n-- 445 } 446 if n > 0 { 447 s.error("illegal char escape") 448 } 449 return ch 450 } 451 452 func (s *Scanner) scanEscape(quote rune) rune { 453 ch := s.next() // read character after '/' 454 switch ch { 455 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 456 // nothing to do 457 ch = s.next() 458 case '0', '1', '2', '3', '4', '5', '6', '7': 459 ch = s.scanDigits(ch, 8, 3) 460 case 'x': 461 ch = s.scanDigits(s.next(), 16, 2) 462 case 'u': 463 ch = s.scanDigits(s.next(), 16, 4) 464 case 'U': 465 ch = s.scanDigits(s.next(), 16, 8) 466 default: 467 s.error("illegal char escape") 468 } 469 return ch 470 } 471 472 func (s *Scanner) scanString(quote rune) (n int) { 473 ch := s.next() // read character after quote 474 for ch != quote { 475 if ch == '\n' || ch < 0 { 476 s.error("literal not terminated") 477 return 478 } 479 if ch == '\\' { 480 ch = s.scanEscape(quote) 481 } else { 482 ch = s.next() 483 } 484 n++ 485 } 486 return 487 } 488 489 func (s *Scanner) scanRawString() { 490 ch := s.next() // read character after '`' 491 for ch != '`' { 492 if ch < 0 { 493 s.error("literal not terminated") 494 return 495 } 496 ch = s.next() 497 } 498 } 499 500 func (s *Scanner) scanChar() { 501 if s.scanString('\'') != 1 { 502 s.error("illegal char literal") 503 } 504 } 505 506 func (s *Scanner) scanComment(ch rune) rune { 507 // ch == '/' || ch == '*' 508 if ch == '/' { 509 // line comment 510 ch = s.next() // read character after "//" 511 for ch != '\n' && ch >= 0 { 512 ch = s.next() 513 } 514 return ch 515 } 516 517 // general comment 518 ch = s.next() // read character after "/*" 519 for { 520 if ch < 0 { 521 s.error("comment not terminated") 522 break 523 } 524 ch0 := ch 525 ch = s.next() 526 if ch0 
== '*' && ch == '/' { 527 ch = s.next() 528 break 529 } 530 } 531 return ch 532 } 533 534 // Scan reads the next token or Unicode character from source and returns it. 535 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 536 // It returns EOF at the end of the source. It reports scanner errors (read and 537 // token errors) by calling s.Error, if not nil; otherwise it prints an error 538 // message to os.Stderr. 539 func (s *Scanner) Scan() rune { 540 ch := s.Peek() 541 542 // reset token text position 543 s.tokPos = -1 544 s.Line = 0 545 546 redo: 547 // skip white space 548 for s.Whitespace&(1<<uint(ch)) != 0 { 549 ch = s.next() 550 } 551 552 // start collecting token text 553 s.tokBuf.Reset() 554 s.tokPos = s.srcPos - s.lastCharLen 555 556 // set token position 557 // (this is a slightly optimized version of the code in Pos()) 558 s.Offset = s.srcBufOffset + s.tokPos 559 if s.column > 0 { 560 // common case: last character was not a '\n' 561 s.Line = s.line 562 s.Column = s.column 563 } else { 564 // last character was a '\n' 565 // (we cannot be at the beginning of the source 566 // since we have called next() at least once) 567 s.Line = s.line - 1 568 s.Column = s.lastLineLen 569 } 570 571 // determine token value 572 tok := ch 573 switch { 574 case s.isIdentRune(ch, 0): 575 if s.Mode&ScanIdents != 0 { 576 tok = Ident 577 ch = s.scanIdentifier() 578 } else { 579 ch = s.next() 580 } 581 case isDecimal(ch): 582 if s.Mode&(ScanInts|ScanFloats) != 0 { 583 tok, ch = s.scanNumber(ch) 584 } else { 585 ch = s.next() 586 } 587 default: 588 switch ch { 589 case EOF: 590 break 591 case '"': 592 if s.Mode&ScanStrings != 0 { 593 s.scanString('"') 594 tok = String 595 } 596 ch = s.next() 597 case '\'': 598 if s.Mode&ScanChars != 0 { 599 s.scanChar() 600 tok = Char 601 } 602 ch = s.next() 603 case '.': 604 ch = s.next() 605 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 606 tok = Float 607 ch = s.scanMantissa(ch) 608 ch = s.scanExponent(ch) 609 
} 610 case '/': 611 ch = s.next() 612 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 613 if s.Mode&SkipComments != 0 { 614 s.tokPos = -1 // don't collect token text 615 ch = s.scanComment(ch) 616 goto redo 617 } 618 ch = s.scanComment(ch) 619 tok = Comment 620 } 621 case '`': 622 if s.Mode&ScanRawStrings != 0 { 623 s.scanRawString() 624 tok = String 625 } 626 ch = s.next() 627 default: 628 ch = s.next() 629 } 630 } 631 632 // end of token text 633 s.tokEnd = s.srcPos - s.lastCharLen 634 635 s.ch = ch 636 return tok 637 } 638 639 // Pos returns the position of the character immediately after 640 // the character or token returned by the last call to Next or Scan. 641 // Use the Scanner's Position field for the start position of the most 642 // recently scanned token. 643 func (s *Scanner) Pos() (pos Position) { 644 pos.Filename = s.Filename 645 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 646 switch { 647 case s.column > 0: 648 // common case: last character was not a '\n' 649 pos.Line = s.line 650 pos.Column = s.column 651 case s.lastLineLen > 0: 652 // last character was a '\n' 653 pos.Line = s.line - 1 654 pos.Column = s.lastLineLen 655 default: 656 // at the beginning of the source 657 pos.Line = 1 658 pos.Column = 1 659 } 660 return 661 } 662 663 // TokenText returns the string corresponding to the most recently scanned token. 664 // Valid after calling Scan(). 
665 func (s *Scanner) TokenText() string { 666 if s.tokPos < 0 { 667 // no token text 668 return "" 669 } 670 671 if s.tokEnd < 0 { 672 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 673 s.tokEnd = s.tokPos 674 } 675 676 if s.tokBuf.Len() == 0 { 677 // common case: the entire token text is still in srcBuf 678 return string(s.srcBuf[s.tokPos:s.tokEnd]) 679 } 680 681 // part of the token text was saved in tokBuf: save the rest in 682 // tokBuf as well and return its content 683 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 684 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 685 return s.tokBuf.String() 686 }