github.com/d4l3k/go@v0.0.0-20151015000803-65fc379daeda/src/text/scanner/scanner.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text. 6 // It takes an io.Reader providing the source, which then can be tokenized 7 // through repeated calls to the Scan function. For compatibility with 8 // existing tools, the NUL character is not allowed. If the first character 9 // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded. 10 // 11 // By default, a Scanner skips white space and Go comments and recognizes all 12 // literals as defined by the Go language specification. It may be 13 // customized to recognize only a subset of those literals and to recognize 14 // different identifier and white space characters. 15 package scanner 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "os" 22 "unicode" 23 "unicode/utf8" 24 ) 25 26 // A source position is represented by a Position value. 27 // A position is valid if Line > 0. 28 type Position struct { 29 Filename string // filename, if any 30 Offset int // byte offset, starting at 0 31 Line int // line number, starting at 1 32 Column int // column number, starting at 1 (character count per line) 33 } 34 35 // IsValid reports whether the position is valid. 36 func (pos *Position) IsValid() bool { return pos.Line > 0 } 37 38 func (pos Position) String() string { 39 s := pos.Filename 40 if pos.IsValid() { 41 if s != "" { 42 s += ":" 43 } 44 s += fmt.Sprintf("%d:%d", pos.Line, pos.Column) 45 } 46 if s == "" { 47 s = "???" 48 } 49 return s 50 } 51 52 // Predefined mode bits to control recognition of tokens. 
// For instance, to configure a Scanner such that it only recognizes (Go)
// identifiers, integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | SkipComments
//
// With the exceptions of comments, which are skipped if SkipComments is
// set, unrecognized tokens are not ignored. Instead, the scanner simply
// returns the respective individual characters (or possibly sub-tokens).
// For instance, if the mode is ScanIdents (not ScanStrings), the string
// "foo" is scanned as the token sequence '"' Ident '"'.
//
// Each mode bit is 1 shifted left by the negated value of the corresponding
// (negative) token constant declared below.
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)

// The result of Scan is one of these tokens or a Unicode character.
// All token values are negative, starting with EOF == -1.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment
)

// tokenString maps the exported token values to their printable names.
// It has no entry for skipComment.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}

// TokenString returns a printable string for a token or Unicode character.
// Tokens listed in tokenString are returned by name; any other value is
// converted to a string and returned in quoted (%q) form.
func TokenString(tok rune) string {
	if s, found := tokenString[tok]; found {
		return s
	}
	return fmt.Sprintf("%q", string(tok))
}

// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

const bufLen = 1024 // at least utf8.UTFMax

// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
115 type Scanner struct { 116 // Input 117 src io.Reader 118 119 // Source buffer 120 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 121 srcPos int // reading position (srcBuf index) 122 srcEnd int // source end (srcBuf index) 123 124 // Source position 125 srcBufOffset int // byte offset of srcBuf[0] in source 126 line int // line count 127 column int // character count 128 lastLineLen int // length of last line in characters (for correct column reporting) 129 lastCharLen int // length of last character in bytes 130 131 // Token text buffer 132 // Typically, token text is stored completely in srcBuf, but in general 133 // the token text's head may be buffered in tokBuf while the token text's 134 // tail is stored in srcBuf. 135 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 136 tokPos int // token text tail position (srcBuf index); valid if >= 0 137 tokEnd int // token text tail end (srcBuf index) 138 139 // One character look-ahead 140 ch rune // character before current srcPos 141 142 // Error is called for each error encountered. If no Error 143 // function is set, the error is reported to os.Stderr. 144 Error func(s *Scanner, msg string) 145 146 // ErrorCount is incremented by one for each error encountered. 147 ErrorCount int 148 149 // The Mode field controls which tokens are recognized. For instance, 150 // to recognize Ints, set the ScanInts bit in Mode. The field may be 151 // changed at any time. 152 Mode uint 153 154 // The Whitespace field controls which characters are recognized 155 // as white space. To recognize a character ch <= ' ' as white space, 156 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 157 // for values ch > ' '). The field may be changed at any time. 158 Whitespace uint64 159 160 // IsIdentRune is a predicate controlling the characters accepted 161 // as the ith rune in an identifier. 
The set of valid characters 162 // must not intersect with the set of white space characters. 163 // If no IsIdentRune function is set, regular Go identifiers are 164 // accepted instead. The field may be changed at any time. 165 IsIdentRune func(ch rune, i int) bool 166 167 // Start position of most recently scanned token; set by Scan. 168 // Calling Init or Next invalidates the position (Line == 0). 169 // The Filename field is always left untouched by the Scanner. 170 // If an error is reported (via Error) and Position is invalid, 171 // the scanner is not inside a token. Call Pos to obtain an error 172 // position in that case. 173 Position 174 } 175 176 // Init initializes a Scanner with a new source and returns s. 177 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 178 // and Whitespace is set to GoWhitespace. 179 func (s *Scanner) Init(src io.Reader) *Scanner { 180 s.src = src 181 182 // initialize source buffer 183 // (the first call to next() will fill it by calling src.Read) 184 s.srcBuf[0] = utf8.RuneSelf // sentinel 185 s.srcPos = 0 186 s.srcEnd = 0 187 188 // initialize source position 189 s.srcBufOffset = 0 190 s.line = 1 191 s.column = 0 192 s.lastLineLen = 0 193 s.lastCharLen = 0 194 195 // initialize token text buffer 196 // (required for first call to next()). 197 s.tokPos = -1 198 199 // initialize one character look-ahead 200 s.ch = -2 // no char read yet, not EOF 201 202 // initialize public fields 203 s.Error = nil 204 s.ErrorCount = 0 205 s.Mode = GoTokens 206 s.Whitespace = GoWhitespace 207 s.Line = 0 // invalidate token position 208 209 return s 210 } 211 212 // next reads and returns the next Unicode character. It is designed such 213 // that only a minimal amount of work needs to be done in the common ASCII 214 // case (one test to check for both ASCII and end-of-buffer, and one test 215 // to check for newlines). 
216 func (s *Scanner) next() rune { 217 ch, width := rune(s.srcBuf[s.srcPos]), 1 218 219 if ch >= utf8.RuneSelf { 220 // uncommon case: not ASCII or not enough bytes 221 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) { 222 // not enough bytes: read some more, but first 223 // save away token text if any 224 if s.tokPos >= 0 { 225 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos]) 226 s.tokPos = 0 227 // s.tokEnd is set by Scan() 228 } 229 // move unread bytes to beginning of buffer 230 copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd]) 231 s.srcBufOffset += s.srcPos 232 // read more bytes 233 // (an io.Reader must return io.EOF when it reaches 234 // the end of what it is reading - simply returning 235 // n == 0 will make this loop retry forever; but the 236 // error is in the reader implementation in that case) 237 i := s.srcEnd - s.srcPos 238 n, err := s.src.Read(s.srcBuf[i:bufLen]) 239 s.srcPos = 0 240 s.srcEnd = i + n 241 s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel 242 if err != nil { 243 if err != io.EOF { 244 s.error(err.Error()) 245 } 246 if s.srcEnd == 0 { 247 if s.lastCharLen > 0 { 248 // previous character was not EOF 249 s.column++ 250 } 251 s.lastCharLen = 0 252 return EOF 253 } 254 // If err == EOF, we won't be getting more 255 // bytes; break to avoid infinite loop. If 256 // err is something else, we don't know if 257 // we can get more bytes; thus also break. 
258 break 259 } 260 } 261 // at least one byte 262 ch = rune(s.srcBuf[s.srcPos]) 263 if ch >= utf8.RuneSelf { 264 // uncommon case: not ASCII 265 ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd]) 266 if ch == utf8.RuneError && width == 1 { 267 // advance for correct error position 268 s.srcPos += width 269 s.lastCharLen = width 270 s.column++ 271 s.error("illegal UTF-8 encoding") 272 return ch 273 } 274 } 275 } 276 277 // advance 278 s.srcPos += width 279 s.lastCharLen = width 280 s.column++ 281 282 // special situations 283 switch ch { 284 case 0: 285 // for compatibility with other tools 286 s.error("illegal character NUL") 287 case '\n': 288 s.line++ 289 s.lastLineLen = s.column 290 s.column = 0 291 } 292 293 return ch 294 } 295 296 // Next reads and returns the next Unicode character. 297 // It returns EOF at the end of the source. It reports 298 // a read error by calling s.Error, if not nil; otherwise 299 // it prints an error message to os.Stderr. Next does not 300 // update the Scanner's Position field; use Pos() to 301 // get the current position. 302 func (s *Scanner) Next() rune { 303 s.tokPos = -1 // don't collect token text 304 s.Line = 0 // invalidate token position 305 ch := s.Peek() 306 if ch != EOF { 307 s.ch = s.next() 308 } 309 return ch 310 } 311 312 // Peek returns the next Unicode character in the source without advancing 313 // the scanner. It returns EOF if the scanner's position is at the last 314 // character of the source. 
315 func (s *Scanner) Peek() rune { 316 if s.ch == -2 { 317 // this code is only run for the very first character 318 s.ch = s.next() 319 if s.ch == '\uFEFF' { 320 s.ch = s.next() // ignore BOM 321 } 322 } 323 return s.ch 324 } 325 326 func (s *Scanner) error(msg string) { 327 s.ErrorCount++ 328 if s.Error != nil { 329 s.Error(s, msg) 330 return 331 } 332 pos := s.Position 333 if !pos.IsValid() { 334 pos = s.Pos() 335 } 336 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 337 } 338 339 func (s *Scanner) isIdentRune(ch rune, i int) bool { 340 if s.IsIdentRune != nil { 341 return s.IsIdentRune(ch, i) 342 } 343 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 344 } 345 346 func (s *Scanner) scanIdentifier() rune { 347 // we know the zero'th rune is OK; start scanning at the next one 348 ch := s.next() 349 for i := 1; s.isIdentRune(ch, i); i++ { 350 ch = s.next() 351 } 352 return ch 353 } 354 355 func digitVal(ch rune) int { 356 switch { 357 case '0' <= ch && ch <= '9': 358 return int(ch - '0') 359 case 'a' <= ch && ch <= 'f': 360 return int(ch - 'a' + 10) 361 case 'A' <= ch && ch <= 'F': 362 return int(ch - 'A' + 10) 363 } 364 return 16 // larger than any legal digit val 365 } 366 367 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 368 369 func (s *Scanner) scanMantissa(ch rune) rune { 370 for isDecimal(ch) { 371 ch = s.next() 372 } 373 return ch 374 } 375 376 func (s *Scanner) scanFraction(ch rune) rune { 377 if ch == '.' 
{ 378 ch = s.scanMantissa(s.next()) 379 } 380 return ch 381 } 382 383 func (s *Scanner) scanExponent(ch rune) rune { 384 if ch == 'e' || ch == 'E' { 385 ch = s.next() 386 if ch == '-' || ch == '+' { 387 ch = s.next() 388 } 389 ch = s.scanMantissa(ch) 390 } 391 return ch 392 } 393 394 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 395 // isDecimal(ch) 396 if ch == '0' { 397 // int or float 398 ch = s.next() 399 if ch == 'x' || ch == 'X' { 400 // hexadecimal int 401 ch = s.next() 402 hasMantissa := false 403 for digitVal(ch) < 16 { 404 ch = s.next() 405 hasMantissa = true 406 } 407 if !hasMantissa { 408 s.error("illegal hexadecimal number") 409 } 410 } else { 411 // octal int or float 412 has8or9 := false 413 for isDecimal(ch) { 414 if ch > '7' { 415 has8or9 = true 416 } 417 ch = s.next() 418 } 419 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 420 // float 421 ch = s.scanFraction(ch) 422 ch = s.scanExponent(ch) 423 return Float, ch 424 } 425 // octal int 426 if has8or9 { 427 s.error("illegal octal number") 428 } 429 } 430 return Int, ch 431 } 432 // decimal int or float 433 ch = s.scanMantissa(ch) 434 if s.Mode&ScanFloats != 0 && (ch == '.' 
|| ch == 'e' || ch == 'E') { 435 // float 436 ch = s.scanFraction(ch) 437 ch = s.scanExponent(ch) 438 return Float, ch 439 } 440 return Int, ch 441 } 442 443 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 444 for n > 0 && digitVal(ch) < base { 445 ch = s.next() 446 n-- 447 } 448 if n > 0 { 449 s.error("illegal char escape") 450 } 451 return ch 452 } 453 454 func (s *Scanner) scanEscape(quote rune) rune { 455 ch := s.next() // read character after '/' 456 switch ch { 457 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 458 // nothing to do 459 ch = s.next() 460 case '0', '1', '2', '3', '4', '5', '6', '7': 461 ch = s.scanDigits(ch, 8, 3) 462 case 'x': 463 ch = s.scanDigits(s.next(), 16, 2) 464 case 'u': 465 ch = s.scanDigits(s.next(), 16, 4) 466 case 'U': 467 ch = s.scanDigits(s.next(), 16, 8) 468 default: 469 s.error("illegal char escape") 470 } 471 return ch 472 } 473 474 func (s *Scanner) scanString(quote rune) (n int) { 475 ch := s.next() // read character after quote 476 for ch != quote { 477 if ch == '\n' || ch < 0 { 478 s.error("literal not terminated") 479 return 480 } 481 if ch == '\\' { 482 ch = s.scanEscape(quote) 483 } else { 484 ch = s.next() 485 } 486 n++ 487 } 488 return 489 } 490 491 func (s *Scanner) scanRawString() { 492 ch := s.next() // read character after '`' 493 for ch != '`' { 494 if ch < 0 { 495 s.error("literal not terminated") 496 return 497 } 498 ch = s.next() 499 } 500 } 501 502 func (s *Scanner) scanChar() { 503 if s.scanString('\'') != 1 { 504 s.error("illegal char literal") 505 } 506 } 507 508 func (s *Scanner) scanComment(ch rune) rune { 509 // ch == '/' || ch == '*' 510 if ch == '/' { 511 // line comment 512 ch = s.next() // read character after "//" 513 for ch != '\n' && ch >= 0 { 514 ch = s.next() 515 } 516 return ch 517 } 518 519 // general comment 520 ch = s.next() // read character after "/*" 521 for { 522 if ch < 0 { 523 s.error("comment not terminated") 524 break 525 } 526 ch0 := ch 527 ch = s.next() 528 if ch0 
== '*' && ch == '/' { 529 ch = s.next() 530 break 531 } 532 } 533 return ch 534 } 535 536 // Scan reads the next token or Unicode character from source and returns it. 537 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 538 // It returns EOF at the end of the source. It reports scanner errors (read and 539 // token errors) by calling s.Error, if not nil; otherwise it prints an error 540 // message to os.Stderr. 541 func (s *Scanner) Scan() rune { 542 ch := s.Peek() 543 544 // reset token text position 545 s.tokPos = -1 546 s.Line = 0 547 548 redo: 549 // skip white space 550 for s.Whitespace&(1<<uint(ch)) != 0 { 551 ch = s.next() 552 } 553 554 // start collecting token text 555 s.tokBuf.Reset() 556 s.tokPos = s.srcPos - s.lastCharLen 557 558 // set token position 559 // (this is a slightly optimized version of the code in Pos()) 560 s.Offset = s.srcBufOffset + s.tokPos 561 if s.column > 0 { 562 // common case: last character was not a '\n' 563 s.Line = s.line 564 s.Column = s.column 565 } else { 566 // last character was a '\n' 567 // (we cannot be at the beginning of the source 568 // since we have called next() at least once) 569 s.Line = s.line - 1 570 s.Column = s.lastLineLen 571 } 572 573 // determine token value 574 tok := ch 575 switch { 576 case s.isIdentRune(ch, 0): 577 if s.Mode&ScanIdents != 0 { 578 tok = Ident 579 ch = s.scanIdentifier() 580 } else { 581 ch = s.next() 582 } 583 case isDecimal(ch): 584 if s.Mode&(ScanInts|ScanFloats) != 0 { 585 tok, ch = s.scanNumber(ch) 586 } else { 587 ch = s.next() 588 } 589 default: 590 switch ch { 591 case EOF: 592 break 593 case '"': 594 if s.Mode&ScanStrings != 0 { 595 s.scanString('"') 596 tok = String 597 } 598 ch = s.next() 599 case '\'': 600 if s.Mode&ScanChars != 0 { 601 s.scanChar() 602 tok = Char 603 } 604 ch = s.next() 605 case '.': 606 ch = s.next() 607 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 608 tok = Float 609 ch = s.scanMantissa(ch) 610 ch = s.scanExponent(ch) 611 
} 612 case '/': 613 ch = s.next() 614 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 615 if s.Mode&SkipComments != 0 { 616 s.tokPos = -1 // don't collect token text 617 ch = s.scanComment(ch) 618 goto redo 619 } 620 ch = s.scanComment(ch) 621 tok = Comment 622 } 623 case '`': 624 if s.Mode&ScanRawStrings != 0 { 625 s.scanRawString() 626 tok = String 627 } 628 ch = s.next() 629 default: 630 ch = s.next() 631 } 632 } 633 634 // end of token text 635 s.tokEnd = s.srcPos - s.lastCharLen 636 637 s.ch = ch 638 return tok 639 } 640 641 // Pos returns the position of the character immediately after 642 // the character or token returned by the last call to Next or Scan. 643 func (s *Scanner) Pos() (pos Position) { 644 pos.Filename = s.Filename 645 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 646 switch { 647 case s.column > 0: 648 // common case: last character was not a '\n' 649 pos.Line = s.line 650 pos.Column = s.column 651 case s.lastLineLen > 0: 652 // last character was a '\n' 653 pos.Line = s.line - 1 654 pos.Column = s.lastLineLen 655 default: 656 // at the beginning of the source 657 pos.Line = 1 658 pos.Column = 1 659 } 660 return 661 } 662 663 // TokenText returns the string corresponding to the most recently scanned token. 664 // Valid after calling Scan(). 665 func (s *Scanner) TokenText() string { 666 if s.tokPos < 0 { 667 // no token text 668 return "" 669 } 670 671 if s.tokEnd < 0 { 672 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 673 s.tokEnd = s.tokPos 674 } 675 676 if s.tokBuf.Len() == 0 { 677 // common case: the entire token text is still in srcBuf 678 return string(s.srcBuf[s.tokPos:s.tokEnd]) 679 } 680 681 // part of the token text was saved in tokBuf: save the rest in 682 // tokBuf as well and return its content 683 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 684 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 685 return s.tokBuf.String() 686 }