github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/text/scanner/scanner.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text. 6 // It takes an io.Reader providing the source, which then can be tokenized 7 // through repeated calls to the Scan function. For compatibility with 8 // existing tools, the NUL character is not allowed. If the first character 9 // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded. 10 // 11 // By default, a Scanner skips white space and Go comments and recognizes all 12 // literals as defined by the Go language specification. It may be 13 // customized to recognize only a subset of those literals and to recognize 14 // different identifier and white space characters. 15 package scanner 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "os" 22 "unicode" 23 "unicode/utf8" 24 ) 25 26 // A source position is represented by a Position value. 27 // A position is valid if Line > 0. 28 type Position struct { 29 Filename string // filename, if any 30 Offset int // byte offset, starting at 0 31 Line int // line number, starting at 1 32 Column int // column number, starting at 1 (character count per line) 33 } 34 35 // IsValid reports whether the position is valid. 36 func (pos *Position) IsValid() bool { return pos.Line > 0 } 37 38 func (pos Position) String() string { 39 s := pos.Filename 40 if s == "" { 41 s = "<input>" 42 } 43 if pos.IsValid() { 44 s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column) 45 } 46 return s 47 } 48 49 // Predefined mode bits to control recognition of tokens. 
// For instance, to configure a Scanner such that it only recognizes (Go)
// identifiers, integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | SkipComments
//
// With the exception of comments, which are skipped if SkipComments is
// set, unrecognized tokens are not ignored. Instead, the scanner simply
// returns the respective individual characters (or possibly sub-tokens).
// For instance, if the mode is ScanIdents (not ScanStrings), the string
// "foo" is scanned as the token sequence '"' Ident '"'.
//
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)

// Token values returned by Scan. Every token is negative, so that it
// cannot collide with a Unicode character, which Scan may return as well.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment
)

// tokenString maps each exported token value to its printable name.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}

// TokenString returns a printable string for a token or Unicode character.
// Named tokens yield their name; any other value is rendered as a quoted
// one-character string.
func TokenString(tok rune) string {
	name, known := tokenString[tok]
	if !known {
		name = fmt.Sprintf("%q", string(tok))
	}
	return name
}

// GoWhitespace is the default value for the Scanner's Whitespace field.
// It has one bit set for each of Go's white space characters.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

// bufLen is the size of the Scanner's source buffer in bytes.
const bufLen = 1024 // at least utf8.UTFMax

// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
112 type Scanner struct { 113 // Input 114 src io.Reader 115 116 // Source buffer 117 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 118 srcPos int // reading position (srcBuf index) 119 srcEnd int // source end (srcBuf index) 120 121 // Source position 122 srcBufOffset int // byte offset of srcBuf[0] in source 123 line int // line count 124 column int // character count 125 lastLineLen int // length of last line in characters (for correct column reporting) 126 lastCharLen int // length of last character in bytes 127 128 // Token text buffer 129 // Typically, token text is stored completely in srcBuf, but in general 130 // the token text's head may be buffered in tokBuf while the token text's 131 // tail is stored in srcBuf. 132 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 133 tokPos int // token text tail position (srcBuf index); valid if >= 0 134 tokEnd int // token text tail end (srcBuf index) 135 136 // One character look-ahead 137 ch rune // character before current srcPos 138 139 // Error is called for each error encountered. If no Error 140 // function is set, the error is reported to os.Stderr. 141 Error func(s *Scanner, msg string) 142 143 // ErrorCount is incremented by one for each error encountered. 144 ErrorCount int 145 146 // The Mode field controls which tokens are recognized. For instance, 147 // to recognize Ints, set the ScanInts bit in Mode. The field may be 148 // changed at any time. 149 Mode uint 150 151 // The Whitespace field controls which characters are recognized 152 // as white space. To recognize a character ch <= ' ' as white space, 153 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 154 // for values ch > ' '). The field may be changed at any time. 155 Whitespace uint64 156 157 // IsIdentRune is a predicate controlling the characters accepted 158 // as the ith rune in an identifier. 
The set of valid characters 159 // must not intersect with the set of white space characters. 160 // If no IsIdentRune function is set, regular Go identifiers are 161 // accepted instead. The field may be changed at any time. 162 IsIdentRune func(ch rune, i int) bool 163 164 // Start position of most recently scanned token; set by Scan. 165 // Calling Init or Next invalidates the position (Line == 0). 166 // The Filename field is always left untouched by the Scanner. 167 // If an error is reported (via Error) and Position is invalid, 168 // the scanner is not inside a token. Call Pos to obtain an error 169 // position in that case, or to obtain the position immediately 170 // after the most recently scanned token. 171 Position 172 } 173 174 // Init initializes a Scanner with a new source and returns s. 175 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 176 // and Whitespace is set to GoWhitespace. 177 func (s *Scanner) Init(src io.Reader) *Scanner { 178 s.src = src 179 180 // initialize source buffer 181 // (the first call to next() will fill it by calling src.Read) 182 s.srcBuf[0] = utf8.RuneSelf // sentinel 183 s.srcPos = 0 184 s.srcEnd = 0 185 186 // initialize source position 187 s.srcBufOffset = 0 188 s.line = 1 189 s.column = 0 190 s.lastLineLen = 0 191 s.lastCharLen = 0 192 193 // initialize token text buffer 194 // (required for first call to next()). 195 s.tokPos = -1 196 197 // initialize one character look-ahead 198 s.ch = -2 // no char read yet, not EOF 199 200 // initialize public fields 201 s.Error = nil 202 s.ErrorCount = 0 203 s.Mode = GoTokens 204 s.Whitespace = GoWhitespace 205 s.Line = 0 // invalidate token position 206 207 return s 208 } 209 210 // next reads and returns the next Unicode character. It is designed such 211 // that only a minimal amount of work needs to be done in the common ASCII 212 // case (one test to check for both ASCII and end-of-buffer, and one test 213 // to check for newlines). 
// It returns EOF once the buffer is empty and the reader has reported an
// error (io.EOF included). Read errors other than io.EOF, invalid UTF-8
// encodings and NUL characters are reported through s.error. For an invalid
// encoding, utf8.RuneError is returned after advancing by a single byte.
func (s *Scanner) next() rune {
	// Optimistically treat the next byte as an ASCII character. The sentinel
	// stored at srcBuf[srcEnd] is utf8.RuneSelf, so the single test below
	// catches both non-ASCII input and the end of the buffered bytes.
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes kept at the buffer start
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if err != io.EOF {
					s.error(err.Error())
				}
				if s.srcEnd == 0 {
					// nothing left in the buffer: report EOF
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		// remember the length of the finished line so that the position
		// of the '\n' itself can still be reported afterwards
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}

// Next reads and returns the next Unicode character.
// It returns EOF at the end of the source. It reports
// a read error by calling s.Error, if not nil; otherwise
// it prints an error message to os.Stderr. Next does not
// update the Scanner's Position field; use Pos() to
// get the current position.
func (s *Scanner) Next() rune {
	s.tokPos = -1 // don't collect token text
	s.Line = 0    // invalidate token position
	ch := s.Peek()
	if ch != EOF {
		// consume the returned character by refilling the look-ahead;
		// at EOF the look-ahead is left alone so EOF keeps being returned
		s.ch = s.next()
	}
	return ch
}

// Peek returns the next Unicode character in the source without advancing
// the scanner. It returns EOF if the scanner's position is at the last
// character of the source.
313 func (s *Scanner) Peek() rune { 314 if s.ch == -2 { 315 // this code is only run for the very first character 316 s.ch = s.next() 317 if s.ch == '\uFEFF' { 318 s.ch = s.next() // ignore BOM 319 } 320 } 321 return s.ch 322 } 323 324 func (s *Scanner) error(msg string) { 325 s.ErrorCount++ 326 if s.Error != nil { 327 s.Error(s, msg) 328 return 329 } 330 pos := s.Position 331 if !pos.IsValid() { 332 pos = s.Pos() 333 } 334 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 335 } 336 337 func (s *Scanner) isIdentRune(ch rune, i int) bool { 338 if s.IsIdentRune != nil { 339 return s.IsIdentRune(ch, i) 340 } 341 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 342 } 343 344 func (s *Scanner) scanIdentifier() rune { 345 // we know the zero'th rune is OK; start scanning at the next one 346 ch := s.next() 347 for i := 1; s.isIdentRune(ch, i); i++ { 348 ch = s.next() 349 } 350 return ch 351 } 352 353 func digitVal(ch rune) int { 354 switch { 355 case '0' <= ch && ch <= '9': 356 return int(ch - '0') 357 case 'a' <= ch && ch <= 'f': 358 return int(ch - 'a' + 10) 359 case 'A' <= ch && ch <= 'F': 360 return int(ch - 'A' + 10) 361 } 362 return 16 // larger than any legal digit val 363 } 364 365 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 366 367 func (s *Scanner) scanMantissa(ch rune) rune { 368 for isDecimal(ch) { 369 ch = s.next() 370 } 371 return ch 372 } 373 374 func (s *Scanner) scanFraction(ch rune) rune { 375 if ch == '.' 
{ 376 ch = s.scanMantissa(s.next()) 377 } 378 return ch 379 } 380 381 func (s *Scanner) scanExponent(ch rune) rune { 382 if ch == 'e' || ch == 'E' { 383 ch = s.next() 384 if ch == '-' || ch == '+' { 385 ch = s.next() 386 } 387 if !isDecimal(ch) { 388 s.error("illegal exponent") 389 } 390 ch = s.scanMantissa(ch) 391 } 392 return ch 393 } 394 395 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 396 // isDecimal(ch) 397 if ch == '0' { 398 // int or float 399 ch = s.next() 400 if ch == 'x' || ch == 'X' { 401 // hexadecimal int 402 ch = s.next() 403 hasMantissa := false 404 for digitVal(ch) < 16 { 405 ch = s.next() 406 hasMantissa = true 407 } 408 if !hasMantissa { 409 s.error("illegal hexadecimal number") 410 } 411 } else { 412 // octal int or float 413 has8or9 := false 414 for isDecimal(ch) { 415 if ch > '7' { 416 has8or9 = true 417 } 418 ch = s.next() 419 } 420 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 421 // float 422 ch = s.scanFraction(ch) 423 ch = s.scanExponent(ch) 424 return Float, ch 425 } 426 // octal int 427 if has8or9 { 428 s.error("illegal octal number") 429 } 430 } 431 return Int, ch 432 } 433 // decimal int or float 434 ch = s.scanMantissa(ch) 435 if s.Mode&ScanFloats != 0 && (ch == '.' 
|| ch == 'e' || ch == 'E') { 436 // float 437 ch = s.scanFraction(ch) 438 ch = s.scanExponent(ch) 439 return Float, ch 440 } 441 return Int, ch 442 } 443 444 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 445 for n > 0 && digitVal(ch) < base { 446 ch = s.next() 447 n-- 448 } 449 if n > 0 { 450 s.error("illegal char escape") 451 } 452 return ch 453 } 454 455 func (s *Scanner) scanEscape(quote rune) rune { 456 ch := s.next() // read character after '/' 457 switch ch { 458 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 459 // nothing to do 460 ch = s.next() 461 case '0', '1', '2', '3', '4', '5', '6', '7': 462 ch = s.scanDigits(ch, 8, 3) 463 case 'x': 464 ch = s.scanDigits(s.next(), 16, 2) 465 case 'u': 466 ch = s.scanDigits(s.next(), 16, 4) 467 case 'U': 468 ch = s.scanDigits(s.next(), 16, 8) 469 default: 470 s.error("illegal char escape") 471 } 472 return ch 473 } 474 475 func (s *Scanner) scanString(quote rune) (n int) { 476 ch := s.next() // read character after quote 477 for ch != quote { 478 if ch == '\n' || ch < 0 { 479 s.error("literal not terminated") 480 return 481 } 482 if ch == '\\' { 483 ch = s.scanEscape(quote) 484 } else { 485 ch = s.next() 486 } 487 n++ 488 } 489 return 490 } 491 492 func (s *Scanner) scanRawString() { 493 ch := s.next() // read character after '`' 494 for ch != '`' { 495 if ch < 0 { 496 s.error("literal not terminated") 497 return 498 } 499 ch = s.next() 500 } 501 } 502 503 func (s *Scanner) scanChar() { 504 if s.scanString('\'') != 1 { 505 s.error("illegal char literal") 506 } 507 } 508 509 func (s *Scanner) scanComment(ch rune) rune { 510 // ch == '/' || ch == '*' 511 if ch == '/' { 512 // line comment 513 ch = s.next() // read character after "//" 514 for ch != '\n' && ch >= 0 { 515 ch = s.next() 516 } 517 return ch 518 } 519 520 // general comment 521 ch = s.next() // read character after "/*" 522 for { 523 if ch < 0 { 524 s.error("comment not terminated") 525 break 526 } 527 ch0 := ch 528 ch = s.next() 529 if ch0 
== '*' && ch == '/' { 530 ch = s.next() 531 break 532 } 533 } 534 return ch 535 } 536 537 // Scan reads the next token or Unicode character from source and returns it. 538 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 539 // It returns EOF at the end of the source. It reports scanner errors (read and 540 // token errors) by calling s.Error, if not nil; otherwise it prints an error 541 // message to os.Stderr. 542 func (s *Scanner) Scan() rune { 543 ch := s.Peek() 544 545 // reset token text position 546 s.tokPos = -1 547 s.Line = 0 548 549 redo: 550 // skip white space 551 for s.Whitespace&(1<<uint(ch)) != 0 { 552 ch = s.next() 553 } 554 555 // start collecting token text 556 s.tokBuf.Reset() 557 s.tokPos = s.srcPos - s.lastCharLen 558 559 // set token position 560 // (this is a slightly optimized version of the code in Pos()) 561 s.Offset = s.srcBufOffset + s.tokPos 562 if s.column > 0 { 563 // common case: last character was not a '\n' 564 s.Line = s.line 565 s.Column = s.column 566 } else { 567 // last character was a '\n' 568 // (we cannot be at the beginning of the source 569 // since we have called next() at least once) 570 s.Line = s.line - 1 571 s.Column = s.lastLineLen 572 } 573 574 // determine token value 575 tok := ch 576 switch { 577 case s.isIdentRune(ch, 0): 578 if s.Mode&ScanIdents != 0 { 579 tok = Ident 580 ch = s.scanIdentifier() 581 } else { 582 ch = s.next() 583 } 584 case isDecimal(ch): 585 if s.Mode&(ScanInts|ScanFloats) != 0 { 586 tok, ch = s.scanNumber(ch) 587 } else { 588 ch = s.next() 589 } 590 default: 591 switch ch { 592 case EOF: 593 break 594 case '"': 595 if s.Mode&ScanStrings != 0 { 596 s.scanString('"') 597 tok = String 598 } 599 ch = s.next() 600 case '\'': 601 if s.Mode&ScanChars != 0 { 602 s.scanChar() 603 tok = Char 604 } 605 ch = s.next() 606 case '.': 607 ch = s.next() 608 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 609 tok = Float 610 ch = s.scanMantissa(ch) 611 ch = s.scanExponent(ch) 612 
} 613 case '/': 614 ch = s.next() 615 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 616 if s.Mode&SkipComments != 0 { 617 s.tokPos = -1 // don't collect token text 618 ch = s.scanComment(ch) 619 goto redo 620 } 621 ch = s.scanComment(ch) 622 tok = Comment 623 } 624 case '`': 625 if s.Mode&ScanRawStrings != 0 { 626 s.scanRawString() 627 tok = RawString 628 } 629 ch = s.next() 630 default: 631 ch = s.next() 632 } 633 } 634 635 // end of token text 636 s.tokEnd = s.srcPos - s.lastCharLen 637 638 s.ch = ch 639 return tok 640 } 641 642 // Pos returns the position of the character immediately after 643 // the character or token returned by the last call to Next or Scan. 644 // Use the Scanner's Position field for the start position of the most 645 // recently scanned token. 646 func (s *Scanner) Pos() (pos Position) { 647 pos.Filename = s.Filename 648 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 649 switch { 650 case s.column > 0: 651 // common case: last character was not a '\n' 652 pos.Line = s.line 653 pos.Column = s.column 654 case s.lastLineLen > 0: 655 // last character was a '\n' 656 pos.Line = s.line - 1 657 pos.Column = s.lastLineLen 658 default: 659 // at the beginning of the source 660 pos.Line = 1 661 pos.Column = 1 662 } 663 return 664 } 665 666 // TokenText returns the string corresponding to the most recently scanned token. 667 // Valid after calling Scan(). 
668 func (s *Scanner) TokenText() string { 669 if s.tokPos < 0 { 670 // no token text 671 return "" 672 } 673 674 if s.tokEnd < 0 { 675 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 676 s.tokEnd = s.tokPos 677 } 678 679 if s.tokBuf.Len() == 0 { 680 // common case: the entire token text is still in srcBuf 681 return string(s.srcBuf[s.tokPos:s.tokEnd]) 682 } 683 684 // part of the token text was saved in tokBuf: save the rest in 685 // tokBuf as well and return its content 686 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 687 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 688 return s.tokBuf.String() 689 }