// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes an io.Reader providing the source, which then can be tokenized
// through repeated calls to the Scan function. For compatibility with
// existing tools, the NUL character is not allowed. If the first character
// in the source is a UTF-8 encoded byte order mark (BOM), it is discarded.
//
// By default, a Scanner skips white space and Go comments and recognizes all
// literals as defined by the Go language specification. It may be
// customized to recognize only a subset of those literals and to recognize
// different identifier and white space characters.
//
// Basic usage pattern:
//
//	var s scanner.Scanner
//	s.Init(src)
//	tok := s.Scan()
//	for tok != scanner.EOF {
//		// do something with tok
//		tok = s.Scan()
//	}
//
package scanner

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"unicode"
	"unicode/utf8"
)

// A source position is represented by a Position value.
// A position is valid if Line > 0.
type Position struct {
	Filename string // filename, if any
	Offset   int    // byte offset, starting at 0
	Line     int    // line number, starting at 1
	Column   int    // column number, starting at 1 (character count per line)
}

// IsValid reports whether the position is valid, that is, whether Line > 0.
func (pos *Position) IsValid() bool { return pos.Line > 0 }

// String returns a printable form of the position. The result starts with
// the Filename, if any; for a valid position "Line:Column" is appended,
// separated from a non-empty filename by a colon. If the result would be
// empty (no filename and an invalid position), "???" is returned instead.
func (pos Position) String() string {
	s := pos.Filename
	if pos.IsValid() {
		if s != "" {
			s += ":"
		}
		s += fmt.Sprintf("%d:%d", pos.Line, pos.Column)
	}
	if s == "" {
		s = "???"
	}
	return s
}

// Predefined mode bits to control recognition of tokens.
// For instance, to configure a Scanner such that it only recognizes (Go)
// identifiers, integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | SkipComments
//
// With the exceptions of comments, which are skipped if SkipComments is
// set, unrecognized tokens are not ignored. Instead, the scanner simply
// returns the respective individual characters (or possibly sub-tokens).
// For instance, if the mode is ScanIdents (not ScanStrings), the string
// "foo" is scanned as the token sequence '"' Ident '"'.
//
const (
	// Each mode bit is derived from its token: the bit for token t is 1 << -t.
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)

// The result of Scan is one of the following tokens or a Unicode character.
// Token values are negative (EOF is -1, Ident is -2, and so on), which keeps
// them apart from the non-negative Unicode code points.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment
)

// tokenString maps each exported token to its printable name.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}

// TokenString returns a printable string for a token or Unicode character.
// Tokens listed in tokenString yield their name; any other value is
// rendered as a double-quoted string holding that single character.
func TokenString(tok rune) string {
	name, known := tokenString[tok]
	if !known {
		name = fmt.Sprintf("%q", string(tok))
	}
	return name
}

// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters.
121 const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ' 122 123 const bufLen = 1024 // at least utf8.UTFMax 124 125 // A Scanner implements reading of Unicode characters and tokens from an io.Reader. 126 type Scanner struct { 127 // Input 128 src io.Reader 129 130 // Source buffer 131 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 132 srcPos int // reading position (srcBuf index) 133 srcEnd int // source end (srcBuf index) 134 135 // Source position 136 srcBufOffset int // byte offset of srcBuf[0] in source 137 line int // line count 138 column int // character count 139 lastLineLen int // length of last line in characters (for correct column reporting) 140 lastCharLen int // length of last character in bytes 141 142 // Token text buffer 143 // Typically, token text is stored completely in srcBuf, but in general 144 // the token text's head may be buffered in tokBuf while the token text's 145 // tail is stored in srcBuf. 146 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 147 tokPos int // token text tail position (srcBuf index); valid if >= 0 148 tokEnd int // token text tail end (srcBuf index) 149 150 // One character look-ahead 151 ch rune // character before current srcPos 152 153 // Error is called for each error encountered. If no Error 154 // function is set, the error is reported to os.Stderr. 155 Error func(s *Scanner, msg string) 156 157 // ErrorCount is incremented by one for each error encountered. 158 ErrorCount int 159 160 // The Mode field controls which tokens are recognized. For instance, 161 // to recognize Ints, set the ScanInts bit in Mode. The field may be 162 // changed at any time. 163 Mode uint 164 165 // The Whitespace field controls which characters are recognized 166 // as white space. To recognize a character ch <= ' ' as white space, 167 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 168 // for values ch > ' '). The field may be changed at any time. 
169 Whitespace uint64 170 171 // IsIdentRune is a predicate controlling the characters accepted 172 // as the ith rune in an identifier. The set of valid characters 173 // must not intersect with the set of white space characters. 174 // If no IsIdentRune function is set, regular Go identifiers are 175 // accepted instead. The field may be changed at any time. 176 IsIdentRune func(ch rune, i int) bool 177 178 // Start position of most recently scanned token; set by Scan. 179 // Calling Init or Next invalidates the position (Line == 0). 180 // The Filename field is always left untouched by the Scanner. 181 // If an error is reported (via Error) and Position is invalid, 182 // the scanner is not inside a token. Call Pos to obtain an error 183 // position in that case. 184 Position 185 } 186 187 // Init initializes a Scanner with a new source and returns s. 188 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 189 // and Whitespace is set to GoWhitespace. 190 func (s *Scanner) Init(src io.Reader) *Scanner { 191 s.src = src 192 193 // initialize source buffer 194 // (the first call to next() will fill it by calling src.Read) 195 s.srcBuf[0] = utf8.RuneSelf // sentinel 196 s.srcPos = 0 197 s.srcEnd = 0 198 199 // initialize source position 200 s.srcBufOffset = 0 201 s.line = 1 202 s.column = 0 203 s.lastLineLen = 0 204 s.lastCharLen = 0 205 206 // initialize token text buffer 207 // (required for first call to next()). 208 s.tokPos = -1 209 210 // initialize one character look-ahead 211 s.ch = -1 // no char read yet 212 213 // initialize public fields 214 s.Error = nil 215 s.ErrorCount = 0 216 s.Mode = GoTokens 217 s.Whitespace = GoWhitespace 218 s.Line = 0 // invalidate token position 219 220 return s 221 } 222 223 // next reads and returns the next Unicode character. 
It is designed such 224 // that only a minimal amount of work needs to be done in the common ASCII 225 // case (one test to check for both ASCII and end-of-buffer, and one test 226 // to check for newlines). 227 func (s *Scanner) next() rune { 228 ch, width := rune(s.srcBuf[s.srcPos]), 1 229 230 if ch >= utf8.RuneSelf { 231 // uncommon case: not ASCII or not enough bytes 232 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) { 233 // not enough bytes: read some more, but first 234 // save away token text if any 235 if s.tokPos >= 0 { 236 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos]) 237 s.tokPos = 0 238 // s.tokEnd is set by Scan() 239 } 240 // move unread bytes to beginning of buffer 241 copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd]) 242 s.srcBufOffset += s.srcPos 243 // read more bytes 244 // (an io.Reader must return io.EOF when it reaches 245 // the end of what it is reading - simply returning 246 // n == 0 will make this loop retry forever; but the 247 // error is in the reader implementation in that case) 248 i := s.srcEnd - s.srcPos 249 n, err := s.src.Read(s.srcBuf[i:bufLen]) 250 s.srcPos = 0 251 s.srcEnd = i + n 252 s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel 253 if err != nil { 254 if err != io.EOF { 255 s.error(err.Error()) 256 } 257 if s.srcEnd == 0 { 258 if s.lastCharLen > 0 { 259 // previous character was not EOF 260 s.column++ 261 } 262 s.lastCharLen = 0 263 return EOF 264 } 265 // If err == EOF, we won't be getting more 266 // bytes; break to avoid infinite loop. If 267 // err is something else, we don't know if 268 // we can get more bytes; thus also break. 
269 break 270 } 271 } 272 // at least one byte 273 ch = rune(s.srcBuf[s.srcPos]) 274 if ch >= utf8.RuneSelf { 275 // uncommon case: not ASCII 276 ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd]) 277 if ch == utf8.RuneError && width == 1 { 278 // advance for correct error position 279 s.srcPos += width 280 s.lastCharLen = width 281 s.column++ 282 s.error("illegal UTF-8 encoding") 283 return ch 284 } 285 } 286 } 287 288 // advance 289 s.srcPos += width 290 s.lastCharLen = width 291 s.column++ 292 293 // special situations 294 switch ch { 295 case 0: 296 // for compatibility with other tools 297 s.error("illegal character NUL") 298 case '\n': 299 s.line++ 300 s.lastLineLen = s.column 301 s.column = 0 302 } 303 304 return ch 305 } 306 307 // Next reads and returns the next Unicode character. 308 // It returns EOF at the end of the source. It reports 309 // a read error by calling s.Error, if not nil; otherwise 310 // it prints an error message to os.Stderr. Next does not 311 // update the Scanner's Position field; use Pos() to 312 // get the current position. 313 func (s *Scanner) Next() rune { 314 s.tokPos = -1 // don't collect token text 315 s.Line = 0 // invalidate token position 316 ch := s.Peek() 317 s.ch = s.next() 318 return ch 319 } 320 321 // Peek returns the next Unicode character in the source without advancing 322 // the scanner. It returns EOF if the scanner's position is at the last 323 // character of the source. 
324 func (s *Scanner) Peek() rune { 325 if s.ch < 0 { 326 // this code is only run for the very first character 327 s.ch = s.next() 328 if s.ch == '\uFEFF' { 329 s.ch = s.next() // ignore BOM 330 } 331 } 332 return s.ch 333 } 334 335 func (s *Scanner) error(msg string) { 336 s.ErrorCount++ 337 if s.Error != nil { 338 s.Error(s, msg) 339 return 340 } 341 pos := s.Position 342 if !pos.IsValid() { 343 pos = s.Pos() 344 } 345 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 346 } 347 348 func (s *Scanner) isIdentRune(ch rune, i int) bool { 349 if s.IsIdentRune != nil { 350 return s.IsIdentRune(ch, i) 351 } 352 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 353 } 354 355 func (s *Scanner) scanIdentifier() rune { 356 // we know the zero'th rune is OK; start scanning at the next one 357 ch := s.next() 358 for i := 1; s.isIdentRune(ch, i); i++ { 359 ch = s.next() 360 } 361 return ch 362 } 363 364 func digitVal(ch rune) int { 365 switch { 366 case '0' <= ch && ch <= '9': 367 return int(ch - '0') 368 case 'a' <= ch && ch <= 'f': 369 return int(ch - 'a' + 10) 370 case 'A' <= ch && ch <= 'F': 371 return int(ch - 'A' + 10) 372 } 373 return 16 // larger than any legal digit val 374 } 375 376 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 377 378 func (s *Scanner) scanMantissa(ch rune) rune { 379 for isDecimal(ch) { 380 ch = s.next() 381 } 382 return ch 383 } 384 385 func (s *Scanner) scanFraction(ch rune) rune { 386 if ch == '.' 
{ 387 ch = s.scanMantissa(s.next()) 388 } 389 return ch 390 } 391 392 func (s *Scanner) scanExponent(ch rune) rune { 393 if ch == 'e' || ch == 'E' { 394 ch = s.next() 395 if ch == '-' || ch == '+' { 396 ch = s.next() 397 } 398 ch = s.scanMantissa(ch) 399 } 400 return ch 401 } 402 403 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 404 // isDecimal(ch) 405 if ch == '0' { 406 // int or float 407 ch = s.next() 408 if ch == 'x' || ch == 'X' { 409 // hexadecimal int 410 ch = s.next() 411 hasMantissa := false 412 for digitVal(ch) < 16 { 413 ch = s.next() 414 hasMantissa = true 415 } 416 if !hasMantissa { 417 s.error("illegal hexadecimal number") 418 } 419 } else { 420 // octal int or float 421 has8or9 := false 422 for isDecimal(ch) { 423 if ch > '7' { 424 has8or9 = true 425 } 426 ch = s.next() 427 } 428 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 429 // float 430 ch = s.scanFraction(ch) 431 ch = s.scanExponent(ch) 432 return Float, ch 433 } 434 // octal int 435 if has8or9 { 436 s.error("illegal octal number") 437 } 438 } 439 return Int, ch 440 } 441 // decimal int or float 442 ch = s.scanMantissa(ch) 443 if s.Mode&ScanFloats != 0 && (ch == '.' 
|| ch == 'e' || ch == 'E') { 444 // float 445 ch = s.scanFraction(ch) 446 ch = s.scanExponent(ch) 447 return Float, ch 448 } 449 return Int, ch 450 } 451 452 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 453 for n > 0 && digitVal(ch) < base { 454 ch = s.next() 455 n-- 456 } 457 if n > 0 { 458 s.error("illegal char escape") 459 } 460 return ch 461 } 462 463 func (s *Scanner) scanEscape(quote rune) rune { 464 ch := s.next() // read character after '/' 465 switch ch { 466 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 467 // nothing to do 468 ch = s.next() 469 case '0', '1', '2', '3', '4', '5', '6', '7': 470 ch = s.scanDigits(ch, 8, 3) 471 case 'x': 472 ch = s.scanDigits(s.next(), 16, 2) 473 case 'u': 474 ch = s.scanDigits(s.next(), 16, 4) 475 case 'U': 476 ch = s.scanDigits(s.next(), 16, 8) 477 default: 478 s.error("illegal char escape") 479 } 480 return ch 481 } 482 483 func (s *Scanner) scanString(quote rune) (n int) { 484 ch := s.next() // read character after quote 485 for ch != quote { 486 if ch == '\n' || ch < 0 { 487 s.error("literal not terminated") 488 return 489 } 490 if ch == '\\' { 491 ch = s.scanEscape(quote) 492 } else { 493 ch = s.next() 494 } 495 n++ 496 } 497 return 498 } 499 500 func (s *Scanner) scanRawString() { 501 ch := s.next() // read character after '`' 502 for ch != '`' { 503 if ch < 0 { 504 s.error("literal not terminated") 505 return 506 } 507 ch = s.next() 508 } 509 } 510 511 func (s *Scanner) scanChar() { 512 if s.scanString('\'') != 1 { 513 s.error("illegal char literal") 514 } 515 } 516 517 func (s *Scanner) scanComment(ch rune) rune { 518 // ch == '/' || ch == '*' 519 if ch == '/' { 520 // line comment 521 ch = s.next() // read character after "//" 522 for ch != '\n' && ch >= 0 { 523 ch = s.next() 524 } 525 return ch 526 } 527 528 // general comment 529 ch = s.next() // read character after "/*" 530 for { 531 if ch < 0 { 532 s.error("comment not terminated") 533 break 534 } 535 ch0 := ch 536 ch = s.next() 537 if ch0 
== '*' && ch == '/' { 538 ch = s.next() 539 break 540 } 541 } 542 return ch 543 } 544 545 // Scan reads the next token or Unicode character from source and returns it. 546 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 547 // It returns EOF at the end of the source. It reports scanner errors (read and 548 // token errors) by calling s.Error, if not nil; otherwise it prints an error 549 // message to os.Stderr. 550 func (s *Scanner) Scan() rune { 551 ch := s.Peek() 552 553 // reset token text position 554 s.tokPos = -1 555 s.Line = 0 556 557 redo: 558 // skip white space 559 for s.Whitespace&(1<<uint(ch)) != 0 { 560 ch = s.next() 561 } 562 563 // start collecting token text 564 s.tokBuf.Reset() 565 s.tokPos = s.srcPos - s.lastCharLen 566 567 // set token position 568 // (this is a slightly optimized version of the code in Pos()) 569 s.Offset = s.srcBufOffset + s.tokPos 570 if s.column > 0 { 571 // common case: last character was not a '\n' 572 s.Line = s.line 573 s.Column = s.column 574 } else { 575 // last character was a '\n' 576 // (we cannot be at the beginning of the source 577 // since we have called next() at least once) 578 s.Line = s.line - 1 579 s.Column = s.lastLineLen 580 } 581 582 // determine token value 583 tok := ch 584 switch { 585 case s.isIdentRune(ch, 0): 586 if s.Mode&ScanIdents != 0 { 587 tok = Ident 588 ch = s.scanIdentifier() 589 } else { 590 ch = s.next() 591 } 592 case isDecimal(ch): 593 if s.Mode&(ScanInts|ScanFloats) != 0 { 594 tok, ch = s.scanNumber(ch) 595 } else { 596 ch = s.next() 597 } 598 default: 599 switch ch { 600 case '"': 601 if s.Mode&ScanStrings != 0 { 602 s.scanString('"') 603 tok = String 604 } 605 ch = s.next() 606 case '\'': 607 if s.Mode&ScanChars != 0 { 608 s.scanChar() 609 tok = Char 610 } 611 ch = s.next() 612 case '.': 613 ch = s.next() 614 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 615 tok = Float 616 ch = s.scanMantissa(ch) 617 ch = s.scanExponent(ch) 618 } 619 case '/': 620 ch = 
s.next() 621 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 622 if s.Mode&SkipComments != 0 { 623 s.tokPos = -1 // don't collect token text 624 ch = s.scanComment(ch) 625 goto redo 626 } 627 ch = s.scanComment(ch) 628 tok = Comment 629 } 630 case '`': 631 if s.Mode&ScanRawStrings != 0 { 632 s.scanRawString() 633 tok = String 634 } 635 ch = s.next() 636 default: 637 ch = s.next() 638 } 639 } 640 641 // end of token text 642 s.tokEnd = s.srcPos - s.lastCharLen 643 644 s.ch = ch 645 return tok 646 } 647 648 // Pos returns the position of the character immediately after 649 // the character or token returned by the last call to Next or Scan. 650 func (s *Scanner) Pos() (pos Position) { 651 pos.Filename = s.Filename 652 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 653 switch { 654 case s.column > 0: 655 // common case: last character was not a '\n' 656 pos.Line = s.line 657 pos.Column = s.column 658 case s.lastLineLen > 0: 659 // last character was a '\n' 660 pos.Line = s.line - 1 661 pos.Column = s.lastLineLen 662 default: 663 // at the beginning of the source 664 pos.Line = 1 665 pos.Column = 1 666 } 667 return 668 } 669 670 // TokenText returns the string corresponding to the most recently scanned token. 671 // Valid after calling Scan(). 672 func (s *Scanner) TokenText() string { 673 if s.tokPos < 0 { 674 // no token text 675 return "" 676 } 677 678 if s.tokEnd < 0 { 679 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 680 s.tokEnd = s.tokPos 681 } 682 683 if s.tokBuf.Len() == 0 { 684 // common case: the entire token text is still in srcBuf 685 return string(s.srcBuf[s.tokPos:s.tokEnd]) 686 } 687 688 // part of the token text was saved in tokBuf: save the rest in 689 // tokBuf as well and return its content 690 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 691 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 692 return s.tokBuf.String() 693 }