github.com/sbinet/go@v0.0.0-20160827155028-54d7de7dd62b/src/text/scanner/scanner.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text. 6 // It takes an io.Reader providing the source, which then can be tokenized 7 // through repeated calls to the Scan function. For compatibility with 8 // existing tools, the NUL character is not allowed. If the first character 9 // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded. 10 // 11 // By default, a Scanner skips white space and Go comments and recognizes all 12 // literals as defined by the Go language specification. It may be 13 // customized to recognize only a subset of those literals and to recognize 14 // different identifier and white space characters. 15 package scanner 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "os" 22 "unicode" 23 "unicode/utf8" 24 ) 25 26 // A source position is represented by a Position value. 27 // A position is valid if Line > 0. 28 type Position struct { 29 Filename string // filename, if any 30 Offset int // byte offset, starting at 0 31 Line int // line number, starting at 1 32 Column int // column number, starting at 1 (character count per line) 33 } 34 35 // IsValid reports whether the position is valid. 36 func (pos *Position) IsValid() bool { return pos.Line > 0 } 37 38 func (pos Position) String() string { 39 s := pos.Filename 40 if s == "" { 41 s = "<input>" 42 } 43 if pos.IsValid() { 44 s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column) 45 } 46 return s 47 } 48 49 // Predefined mode bits to control recognition of tokens. For instance, 50 // to configure a Scanner such that it only recognizes (Go) identifiers, 51 // integers, and skips comments, set the Scanner's Mode field to: 52 // 53 // ScanIdents | ScanInts | SkipComments 54 // 55 // With the exceptions of comments, which are skipped if SkipComments is 56 // set, unrecognized tokens are not ignored. Instead, the scanner simply 57 // returns the respective individual characters (or possibly sub-tokens). 58 // For instance, if the mode is ScanIdents (not ScanStrings), the string 59 // "foo" is scanned as the token sequence '"' Ident '"'. 60 // 61 const ( 62 ScanIdents = 1 << -Ident 63 ScanInts = 1 << -Int 64 ScanFloats = 1 << -Float // includes Ints 65 ScanChars = 1 << -Char 66 ScanStrings = 1 << -String 67 ScanRawStrings = 1 << -RawString 68 ScanComments = 1 << -Comment 69 SkipComments = 1 << -skipComment // if set with ScanComments, comments become white space 70 GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments 71 ) 72 73 // The result of Scan is one of these tokens or a Unicode character. 74 const ( 75 EOF = -(iota + 1) 76 Ident 77 Int 78 Float 79 Char 80 String 81 RawString 82 Comment 83 skipComment 84 ) 85 86 var tokenString = map[rune]string{ 87 EOF: "EOF", 88 Ident: "Ident", 89 Int: "Int", 90 Float: "Float", 91 Char: "Char", 92 String: "String", 93 RawString: "RawString", 94 Comment: "Comment", 95 } 96 97 // TokenString returns a printable string for a token or Unicode character. 98 func TokenString(tok rune) string { 99 if s, found := tokenString[tok]; found { 100 return s 101 } 102 return fmt.Sprintf("%q", string(tok)) 103 } 104 105 // GoWhitespace is the default value for the Scanner's Whitespace field. 106 // Its value selects Go's white space characters. 107 const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ' 108 109 const bufLen = 1024 // at least utf8.UTFMax 110 111 // A Scanner implements reading of Unicode characters and tokens from an io.Reader. 112 type Scanner struct { 113 // Input 114 src io.Reader 115 116 // Source buffer 117 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 118 srcPos int // reading position (srcBuf index) 119 srcEnd int // source end (srcBuf index) 120 121 // Source position 122 srcBufOffset int // byte offset of srcBuf[0] in source 123 line int // line count 124 column int // character count 125 lastLineLen int // length of last line in characters (for correct column reporting) 126 lastCharLen int // length of last character in bytes 127 128 // Token text buffer 129 // Typically, token text is stored completely in srcBuf, but in general 130 // the token text's head may be buffered in tokBuf while the token text's 131 // tail is stored in srcBuf. 132 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 133 tokPos int // token text tail position (srcBuf index); valid if >= 0 134 tokEnd int // token text tail end (srcBuf index) 135 136 // One character look-ahead 137 ch rune // character before current srcPos 138 139 // Error is called for each error encountered. If no Error 140 // function is set, the error is reported to os.Stderr. 141 Error func(s *Scanner, msg string) 142 143 // ErrorCount is incremented by one for each error encountered. 144 ErrorCount int 145 146 // The Mode field controls which tokens are recognized. For instance, 147 // to recognize Ints, set the ScanInts bit in Mode. The field may be 148 // changed at any time. 149 Mode uint 150 151 // The Whitespace field controls which characters are recognized 152 // as white space. To recognize a character ch <= ' ' as white space, 153 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 154 // for values ch > ' '). The field may be changed at any time. 155 Whitespace uint64 156 157 // IsIdentRune is a predicate controlling the characters accepted 158 // as the ith rune in an identifier. The set of valid characters 159 // must not intersect with the set of white space characters. 160 // If no IsIdentRune function is set, regular Go identifiers are 161 // accepted instead. The field may be changed at any time. 162 IsIdentRune func(ch rune, i int) bool 163 164 // Start position of most recently scanned token; set by Scan. 165 // Calling Init or Next invalidates the position (Line == 0). 166 // The Filename field is always left untouched by the Scanner. 167 // If an error is reported (via Error) and Position is invalid, 168 // the scanner is not inside a token. Call Pos to obtain an error 169 // position in that case. 170 Position 171 } 172 173 // Init initializes a Scanner with a new source and returns s. 174 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 175 // and Whitespace is set to GoWhitespace. 176 func (s *Scanner) Init(src io.Reader) *Scanner { 177 s.src = src 178 179 // initialize source buffer 180 // (the first call to next() will fill it by calling src.Read) 181 s.srcBuf[0] = utf8.RuneSelf // sentinel 182 s.srcPos = 0 183 s.srcEnd = 0 184 185 // initialize source position 186 s.srcBufOffset = 0 187 s.line = 1 188 s.column = 0 189 s.lastLineLen = 0 190 s.lastCharLen = 0 191 192 // initialize token text buffer 193 // (required for first call to next()). 194 s.tokPos = -1 195 196 // initialize one character look-ahead 197 s.ch = -2 // no char read yet, not EOF 198 199 // initialize public fields 200 s.Error = nil 201 s.ErrorCount = 0 202 s.Mode = GoTokens 203 s.Whitespace = GoWhitespace 204 s.Line = 0 // invalidate token position 205 206 return s 207 } 208 209 // next reads and returns the next Unicode character. It is designed such 210 // that only a minimal amount of work needs to be done in the common ASCII 211 // case (one test to check for both ASCII and end-of-buffer, and one test 212 // to check for newlines). 213 func (s *Scanner) next() rune { 214 ch, width := rune(s.srcBuf[s.srcPos]), 1 215 216 if ch >= utf8.RuneSelf { 217 // uncommon case: not ASCII or not enough bytes 218 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) { 219 // not enough bytes: read some more, but first 220 // save away token text if any 221 if s.tokPos >= 0 { 222 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos]) 223 s.tokPos = 0 224 // s.tokEnd is set by Scan() 225 } 226 // move unread bytes to beginning of buffer 227 copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd]) 228 s.srcBufOffset += s.srcPos 229 // read more bytes 230 // (an io.Reader must return io.EOF when it reaches 231 // the end of what it is reading - simply returning 232 // n == 0 will make this loop retry forever; but the 233 // error is in the reader implementation in that case) 234 i := s.srcEnd - s.srcPos 235 n, err := s.src.Read(s.srcBuf[i:bufLen]) 236 s.srcPos = 0 237 s.srcEnd = i + n 238 s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel 239 if err != nil { 240 if err != io.EOF { 241 s.error(err.Error()) 242 } 243 if s.srcEnd == 0 { 244 if s.lastCharLen > 0 { 245 // previous character was not EOF 246 s.column++ 247 } 248 s.lastCharLen = 0 249 return EOF 250 } 251 // If err == EOF, we won't be getting more 252 // bytes; break to avoid infinite loop. If 253 // err is something else, we don't know if 254 // we can get more bytes; thus also break. 255 break 256 } 257 } 258 // at least one byte 259 ch = rune(s.srcBuf[s.srcPos]) 260 if ch >= utf8.RuneSelf { 261 // uncommon case: not ASCII 262 ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd]) 263 if ch == utf8.RuneError && width == 1 { 264 // advance for correct error position 265 s.srcPos += width 266 s.lastCharLen = width 267 s.column++ 268 s.error("illegal UTF-8 encoding") 269 return ch 270 } 271 } 272 } 273 274 // advance 275 s.srcPos += width 276 s.lastCharLen = width 277 s.column++ 278 279 // special situations 280 switch ch { 281 case 0: 282 // for compatibility with other tools 283 s.error("illegal character NUL") 284 case '\n': 285 s.line++ 286 s.lastLineLen = s.column 287 s.column = 0 288 } 289 290 return ch 291 } 292 293 // Next reads and returns the next Unicode character. 294 // It returns EOF at the end of the source. It reports 295 // a read error by calling s.Error, if not nil; otherwise 296 // it prints an error message to os.Stderr. Next does not 297 // update the Scanner's Position field; use Pos() to 298 // get the current position. 299 func (s *Scanner) Next() rune { 300 s.tokPos = -1 // don't collect token text 301 s.Line = 0 // invalidate token position 302 ch := s.Peek() 303 if ch != EOF { 304 s.ch = s.next() 305 } 306 return ch 307 } 308 309 // Peek returns the next Unicode character in the source without advancing 310 // the scanner. It returns EOF if the scanner's position is at the last 311 // character of the source. 312 func (s *Scanner) Peek() rune { 313 if s.ch == -2 { 314 // this code is only run for the very first character 315 s.ch = s.next() 316 if s.ch == '\uFEFF' { 317 s.ch = s.next() // ignore BOM 318 } 319 } 320 return s.ch 321 } 322 323 func (s *Scanner) error(msg string) { 324 s.ErrorCount++ 325 if s.Error != nil { 326 s.Error(s, msg) 327 return 328 } 329 pos := s.Position 330 if !pos.IsValid() { 331 pos = s.Pos() 332 } 333 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 334 } 335 336 func (s *Scanner) isIdentRune(ch rune, i int) bool { 337 if s.IsIdentRune != nil { 338 return s.IsIdentRune(ch, i) 339 } 340 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 341 } 342 343 func (s *Scanner) scanIdentifier() rune { 344 // we know the zero'th rune is OK; start scanning at the next one 345 ch := s.next() 346 for i := 1; s.isIdentRune(ch, i); i++ { 347 ch = s.next() 348 } 349 return ch 350 } 351 352 func digitVal(ch rune) int { 353 switch { 354 case '0' <= ch && ch <= '9': 355 return int(ch - '0') 356 case 'a' <= ch && ch <= 'f': 357 return int(ch - 'a' + 10) 358 case 'A' <= ch && ch <= 'F': 359 return int(ch - 'A' + 10) 360 } 361 return 16 // larger than any legal digit val 362 } 363 364 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 365 366 func (s *Scanner) scanMantissa(ch rune) rune { 367 for isDecimal(ch) { 368 ch = s.next() 369 } 370 return ch 371 } 372 373 func (s *Scanner) scanFraction(ch rune) rune { 374 if ch == '.' { 375 ch = s.scanMantissa(s.next()) 376 } 377 return ch 378 } 379 380 func (s *Scanner) scanExponent(ch rune) rune { 381 if ch == 'e' || ch == 'E' { 382 ch = s.next() 383 if ch == '-' || ch == '+' { 384 ch = s.next() 385 } 386 ch = s.scanMantissa(ch) 387 } 388 return ch 389 } 390 391 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 392 // isDecimal(ch) 393 if ch == '0' { 394 // int or float 395 ch = s.next() 396 if ch == 'x' || ch == 'X' { 397 // hexadecimal int 398 ch = s.next() 399 hasMantissa := false 400 for digitVal(ch) < 16 { 401 ch = s.next() 402 hasMantissa = true 403 } 404 if !hasMantissa { 405 s.error("illegal hexadecimal number") 406 } 407 } else { 408 // octal int or float 409 has8or9 := false 410 for isDecimal(ch) { 411 if ch > '7' { 412 has8or9 = true 413 } 414 ch = s.next() 415 } 416 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 417 // float 418 ch = s.scanFraction(ch) 419 ch = s.scanExponent(ch) 420 return Float, ch 421 } 422 // octal int 423 if has8or9 { 424 s.error("illegal octal number") 425 } 426 } 427 return Int, ch 428 } 429 // decimal int or float 430 ch = s.scanMantissa(ch) 431 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 432 // float 433 ch = s.scanFraction(ch) 434 ch = s.scanExponent(ch) 435 return Float, ch 436 } 437 return Int, ch 438 } 439 440 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 441 for n > 0 && digitVal(ch) < base { 442 ch = s.next() 443 n-- 444 } 445 if n > 0 { 446 s.error("illegal char escape") 447 } 448 return ch 449 } 450 451 func (s *Scanner) scanEscape(quote rune) rune { 452 ch := s.next() // read character after '/' 453 switch ch { 454 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 455 // nothing to do 456 ch = s.next() 457 case '0', '1', '2', '3', '4', '5', '6', '7': 458 ch = s.scanDigits(ch, 8, 3) 459 case 'x': 460 ch = s.scanDigits(s.next(), 16, 2) 461 case 'u': 462 ch = s.scanDigits(s.next(), 16, 4) 463 case 'U': 464 ch = s.scanDigits(s.next(), 16, 8) 465 default: 466 s.error("illegal char escape") 467 } 468 return ch 469 } 470 471 func (s *Scanner) scanString(quote rune) (n int) { 472 ch := s.next() // read character after quote 473 for ch != quote { 474 if ch == '\n' || ch < 0 { 475 s.error("literal not terminated") 476 return 477 } 478 if ch == '\\' { 479 ch = s.scanEscape(quote) 480 } else { 481 ch = s.next() 482 } 483 n++ 484 } 485 return 486 } 487 488 func (s *Scanner) scanRawString() { 489 ch := s.next() // read character after '`' 490 for ch != '`' { 491 if ch < 0 { 492 s.error("literal not terminated") 493 return 494 } 495 ch = s.next() 496 } 497 } 498 499 func (s *Scanner) scanChar() { 500 if s.scanString('\'') != 1 { 501 s.error("illegal char literal") 502 } 503 } 504 505 func (s *Scanner) scanComment(ch rune) rune { 506 // ch == '/' || ch == '*' 507 if ch == '/' { 508 // line comment 509 ch = s.next() // read character after "//" 510 for ch != '\n' && ch >= 0 { 511 ch = s.next() 512 } 513 return ch 514 } 515 516 // general comment 517 ch = s.next() // read character after "/*" 518 for { 519 if ch < 0 { 520 s.error("comment not terminated") 521 break 522 } 523 ch0 := ch 524 ch = s.next() 525 if ch0 == '*' && ch == '/' { 526 ch = s.next() 527 break 528 } 529 } 530 return ch 531 } 532 533 // Scan reads the next token or Unicode character from source and returns it. 534 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 535 // It returns EOF at the end of the source. It reports scanner errors (read and 536 // token errors) by calling s.Error, if not nil; otherwise it prints an error 537 // message to os.Stderr. 538 func (s *Scanner) Scan() rune { 539 ch := s.Peek() 540 541 // reset token text position 542 s.tokPos = -1 543 s.Line = 0 544 545 redo: 546 // skip white space 547 for s.Whitespace&(1<<uint(ch)) != 0 { 548 ch = s.next() 549 } 550 551 // start collecting token text 552 s.tokBuf.Reset() 553 s.tokPos = s.srcPos - s.lastCharLen 554 555 // set token position 556 // (this is a slightly optimized version of the code in Pos()) 557 s.Offset = s.srcBufOffset + s.tokPos 558 if s.column > 0 { 559 // common case: last character was not a '\n' 560 s.Line = s.line 561 s.Column = s.column 562 } else { 563 // last character was a '\n' 564 // (we cannot be at the beginning of the source 565 // since we have called next() at least once) 566 s.Line = s.line - 1 567 s.Column = s.lastLineLen 568 } 569 570 // determine token value 571 tok := ch 572 switch { 573 case s.isIdentRune(ch, 0): 574 if s.Mode&ScanIdents != 0 { 575 tok = Ident 576 ch = s.scanIdentifier() 577 } else { 578 ch = s.next() 579 } 580 case isDecimal(ch): 581 if s.Mode&(ScanInts|ScanFloats) != 0 { 582 tok, ch = s.scanNumber(ch) 583 } else { 584 ch = s.next() 585 } 586 default: 587 switch ch { 588 case EOF: 589 break 590 case '"': 591 if s.Mode&ScanStrings != 0 { 592 s.scanString('"') 593 tok = String 594 } 595 ch = s.next() 596 case '\'': 597 if s.Mode&ScanChars != 0 { 598 s.scanChar() 599 tok = Char 600 } 601 ch = s.next() 602 case '.': 603 ch = s.next() 604 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 605 tok = Float 606 ch = s.scanMantissa(ch) 607 ch = s.scanExponent(ch) 608 } 609 case '/': 610 ch = s.next() 611 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 612 if s.Mode&SkipComments != 0 { 613 s.tokPos = -1 // don't collect token text 614 ch = s.scanComment(ch) 615 goto redo 616 } 617 ch = s.scanComment(ch) 618 tok = Comment 619 } 620 case '`': 621 if s.Mode&ScanRawStrings != 0 { 622 s.scanRawString() 623 tok = String 624 } 625 ch = s.next() 626 default: 627 ch = s.next() 628 } 629 } 630 631 // end of token text 632 s.tokEnd = s.srcPos - s.lastCharLen 633 634 s.ch = ch 635 return tok 636 } 637 638 // Pos returns the position of the character immediately after 639 // the character or token returned by the last call to Next or Scan. 640 func (s *Scanner) Pos() (pos Position) { 641 pos.Filename = s.Filename 642 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 643 switch { 644 case s.column > 0: 645 // common case: last character was not a '\n' 646 pos.Line = s.line 647 pos.Column = s.column 648 case s.lastLineLen > 0: 649 // last character was a '\n' 650 pos.Line = s.line - 1 651 pos.Column = s.lastLineLen 652 default: 653 // at the beginning of the source 654 pos.Line = 1 655 pos.Column = 1 656 } 657 return 658 } 659 660 // TokenText returns the string corresponding to the most recently scanned token. 661 // Valid after calling Scan(). 662 func (s *Scanner) TokenText() string { 663 if s.tokPos < 0 { 664 // no token text 665 return "" 666 } 667 668 if s.tokEnd < 0 { 669 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 670 s.tokEnd = s.tokPos 671 } 672 673 if s.tokBuf.Len() == 0 { 674 // common case: the entire token text is still in srcBuf 675 return string(s.srcBuf[s.tokPos:s.tokEnd]) 676 } 677 678 // part of the token text was saved in tokBuf: save the rest in 679 // tokBuf as well and return its content 680 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 681 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 682 return s.tokBuf.String() 683 }