// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes an io.Reader providing the source, which then can be tokenized
// through repeated calls to the Scan function. For compatibility with
// existing tools, the NUL character is not allowed. If the first character
// in the source is a UTF-8 encoded byte order mark (BOM), it is discarded.
//
// By default, a Scanner skips white space and Go comments and recognizes all
// literals as defined by the Go language specification. It may be
// customized to recognize only a subset of those literals and to recognize
// different white space characters.
//
// Basic usage pattern:
//
//	var s scanner.Scanner
//	s.Init(src)
//	tok := s.Scan()
//	for tok != scanner.EOF {
//		// do something with tok
//		tok = s.Scan()
//	}
//
package scanner

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"unicode"
	"unicode/utf8"
)

// TODO(gri): Consider changing this to use the new (token) Position package.

// A Position describes a location in the source.
// A position is valid if Line > 0.
type Position struct {
	Filename string // filename, if any
	Offset   int    // byte offset, starting at 0
	Line     int    // line number, starting at 1
	Column   int    // column number, starting at 1 (character count per line)
}

// IsValid reports whether the position is valid (that is, Line > 0).
49 func (pos *Position) IsValid() bool { return pos.Line > 0 } 50 51 func (pos Position) String() string { 52 s := pos.Filename 53 if pos.IsValid() { 54 if s != "" { 55 s += ":" 56 } 57 s += fmt.Sprintf("%d:%d", pos.Line, pos.Column) 58 } 59 if s == "" { 60 s = "???" 61 } 62 return s 63 } 64 65 // Predefined mode bits to control recognition of tokens. For instance, 66 // to configure a Scanner such that it only recognizes (Go) identifiers, 67 // integers, and skips comments, set the Scanner's Mode field to: 68 // 69 // ScanIdents | ScanInts | SkipComments 70 // 71 const ( 72 ScanIdents = 1 << -Ident 73 ScanInts = 1 << -Int 74 ScanFloats = 1 << -Float // includes Ints 75 ScanChars = 1 << -Char 76 ScanStrings = 1 << -String 77 ScanRawStrings = 1 << -RawString 78 ScanComments = 1 << -Comment 79 SkipComments = 1 << -skipComment // if set with ScanComments, comments become white space 80 GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments 81 ) 82 83 // The result of Scan is one of the following tokens or a Unicode character. 84 const ( 85 EOF = -(iota + 1) 86 Ident 87 Int 88 Float 89 Char 90 String 91 RawString 92 Comment 93 skipComment 94 ) 95 96 var tokenString = map[rune]string{ 97 EOF: "EOF", 98 Ident: "Ident", 99 Int: "Int", 100 Float: "Float", 101 Char: "Char", 102 String: "String", 103 RawString: "RawString", 104 Comment: "Comment", 105 } 106 107 // TokenString returns a printable string for a token or Unicode character. 108 func TokenString(tok rune) string { 109 if s, found := tokenString[tok]; found { 110 return s 111 } 112 return fmt.Sprintf("%q", string(tok)) 113 } 114 115 // GoWhitespace is the default value for the Scanner's Whitespace field. 116 // Its value selects Go's white space characters. 117 const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ' 118 119 const bufLen = 1024 // at least utf8.UTFMax 120 121 // A Scanner implements reading of Unicode characters and tokens from an io.Reader. 
122 type Scanner struct { 123 // Input 124 src io.Reader 125 126 // Source buffer 127 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 128 srcPos int // reading position (srcBuf index) 129 srcEnd int // source end (srcBuf index) 130 131 // Source position 132 srcBufOffset int // byte offset of srcBuf[0] in source 133 line int // line count 134 column int // character count 135 lastLineLen int // length of last line in characters (for correct column reporting) 136 lastCharLen int // length of last character in bytes 137 138 // Token text buffer 139 // Typically, token text is stored completely in srcBuf, but in general 140 // the token text's head may be buffered in tokBuf while the token text's 141 // tail is stored in srcBuf. 142 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 143 tokPos int // token text tail position (srcBuf index); valid if >= 0 144 tokEnd int // token text tail end (srcBuf index) 145 146 // One character look-ahead 147 ch rune // character before current srcPos 148 149 // Error is called for each error encountered. If no Error 150 // function is set, the error is reported to os.Stderr. 151 Error func(s *Scanner, msg string) 152 153 // ErrorCount is incremented by one for each error encountered. 154 ErrorCount int 155 156 // The Mode field controls which tokens are recognized. For instance, 157 // to recognize Ints, set the ScanInts bit in Mode. The field may be 158 // changed at any time. 159 Mode uint 160 161 // The Whitespace field controls which characters are recognized 162 // as white space. To recognize a character ch <= ' ' as white space, 163 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 164 // for values ch > ' '). The field may be changed at any time. 165 Whitespace uint64 166 167 // Start position of most recently scanned token; set by Scan. 168 // Calling Init or Next invalidates the position (Line == 0). 
169 // The Filename field is always left untouched by the Scanner. 170 // If an error is reported (via Error) and Position is invalid, 171 // the scanner is not inside a token. Call Pos to obtain an error 172 // position in that case. 173 Position 174 } 175 176 // Init initializes a Scanner with a new source and returns s. 177 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 178 // and Whitespace is set to GoWhitespace. 179 func (s *Scanner) Init(src io.Reader) *Scanner { 180 s.src = src 181 182 // initialize source buffer 183 // (the first call to next() will fill it by calling src.Read) 184 s.srcBuf[0] = utf8.RuneSelf // sentinel 185 s.srcPos = 0 186 s.srcEnd = 0 187 188 // initialize source position 189 s.srcBufOffset = 0 190 s.line = 1 191 s.column = 0 192 s.lastLineLen = 0 193 s.lastCharLen = 0 194 195 // initialize token text buffer 196 // (required for first call to next()). 197 s.tokPos = -1 198 199 // initialize one character look-ahead 200 s.ch = -1 // no char read yet 201 202 // initialize public fields 203 s.Error = nil 204 s.ErrorCount = 0 205 s.Mode = GoTokens 206 s.Whitespace = GoWhitespace 207 s.Line = 0 // invalidate token position 208 209 return s 210 } 211 212 // next reads and returns the next Unicode character. It is designed such 213 // that only a minimal amount of work needs to be done in the common ASCII 214 // case (one test to check for both ASCII and end-of-buffer, and one test 215 // to check for newlines). 
func (s *Scanner) next() rune {
	// Fast path: assume a one-byte (ASCII) character. srcBuf[srcEnd]
	// always holds the utf8.RuneSelf sentinel, so the single test below
	// catches both non-ASCII bytes and an exhausted buffer.
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		//
		// Refill while fewer than utf8.UTFMax bytes remain and those
		// bytes do not already form a complete encoded rune.
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes now at the front of srcBuf
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					// Nothing left over and nothing read: report EOF.
					// Note that a non-EOF read error is not reported
					// via s.error on this path.
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				if err != io.EOF {
					s.error(err.Error())
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// Invalid encoding: consume the single offending byte
				// and return utf8.RuneError after reporting the error.
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		// Start a new line, remembering the length of the finished one
		// so that a position right after the newline can still be
		// reported on the old line.
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}

// Next reads and returns the next Unicode character.
// It returns EOF at the end of the source. It reports
// a read error by calling s.Error, if not nil; otherwise
// it prints an error message to os.Stderr. Next does not
// update the Scanner's Position field; use Pos() to
// get the current position.
func (s *Scanner) Next() rune {
	s.tokPos = -1 // don't collect token text
	s.Line = 0    // invalidate token position
	// The result is the current look-ahead character (Peek reads it
	// first if necessary); the look-ahead is then replaced by the
	// character that follows it.
	ch := s.Peek()
	s.ch = s.next()
	return ch
}

// Peek returns the next Unicode character in the source without advancing
// the scanner. It returns EOF if the scanner's position is at the last
// character of the source.
313 func (s *Scanner) Peek() rune { 314 if s.ch < 0 { 315 // this code is only run for the very first character 316 s.ch = s.next() 317 if s.ch == '\uFEFF' { 318 s.ch = s.next() // ignore BOM 319 } 320 } 321 return s.ch 322 } 323 324 func (s *Scanner) error(msg string) { 325 s.ErrorCount++ 326 if s.Error != nil { 327 s.Error(s, msg) 328 return 329 } 330 pos := s.Position 331 if !pos.IsValid() { 332 pos = s.Pos() 333 } 334 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 335 } 336 337 func (s *Scanner) scanIdentifier() rune { 338 ch := s.next() // read character after first '_' or letter 339 for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) { 340 ch = s.next() 341 } 342 return ch 343 } 344 345 func digitVal(ch rune) int { 346 switch { 347 case '0' <= ch && ch <= '9': 348 return int(ch - '0') 349 case 'a' <= ch && ch <= 'f': 350 return int(ch - 'a' + 10) 351 case 'A' <= ch && ch <= 'F': 352 return int(ch - 'A' + 10) 353 } 354 return 16 // larger than any legal digit val 355 } 356 357 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 358 359 func (s *Scanner) scanMantissa(ch rune) rune { 360 for isDecimal(ch) { 361 ch = s.next() 362 } 363 return ch 364 } 365 366 func (s *Scanner) scanFraction(ch rune) rune { 367 if ch == '.' 
{ 368 ch = s.scanMantissa(s.next()) 369 } 370 return ch 371 } 372 373 func (s *Scanner) scanExponent(ch rune) rune { 374 if ch == 'e' || ch == 'E' { 375 ch = s.next() 376 if ch == '-' || ch == '+' { 377 ch = s.next() 378 } 379 ch = s.scanMantissa(ch) 380 } 381 return ch 382 } 383 384 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 385 // isDecimal(ch) 386 if ch == '0' { 387 // int or float 388 ch = s.next() 389 if ch == 'x' || ch == 'X' { 390 // hexadecimal int 391 ch = s.next() 392 hasMantissa := false 393 for digitVal(ch) < 16 { 394 ch = s.next() 395 hasMantissa = true 396 } 397 if !hasMantissa { 398 s.error("illegal hexadecimal number") 399 } 400 } else { 401 // octal int or float 402 has8or9 := false 403 for isDecimal(ch) { 404 if ch > '7' { 405 has8or9 = true 406 } 407 ch = s.next() 408 } 409 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 410 // float 411 ch = s.scanFraction(ch) 412 ch = s.scanExponent(ch) 413 return Float, ch 414 } 415 // octal int 416 if has8or9 { 417 s.error("illegal octal number") 418 } 419 } 420 return Int, ch 421 } 422 // decimal int or float 423 ch = s.scanMantissa(ch) 424 if s.Mode&ScanFloats != 0 && (ch == '.' 
|| ch == 'e' || ch == 'E') { 425 // float 426 ch = s.scanFraction(ch) 427 ch = s.scanExponent(ch) 428 return Float, ch 429 } 430 return Int, ch 431 } 432 433 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 434 for n > 0 && digitVal(ch) < base { 435 ch = s.next() 436 n-- 437 } 438 if n > 0 { 439 s.error("illegal char escape") 440 } 441 return ch 442 } 443 444 func (s *Scanner) scanEscape(quote rune) rune { 445 ch := s.next() // read character after '/' 446 switch ch { 447 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 448 // nothing to do 449 ch = s.next() 450 case '0', '1', '2', '3', '4', '5', '6', '7': 451 ch = s.scanDigits(ch, 8, 3) 452 case 'x': 453 ch = s.scanDigits(s.next(), 16, 2) 454 case 'u': 455 ch = s.scanDigits(s.next(), 16, 4) 456 case 'U': 457 ch = s.scanDigits(s.next(), 16, 8) 458 default: 459 s.error("illegal char escape") 460 } 461 return ch 462 } 463 464 func (s *Scanner) scanString(quote rune) (n int) { 465 ch := s.next() // read character after quote 466 for ch != quote { 467 if ch == '\n' || ch < 0 { 468 s.error("literal not terminated") 469 return 470 } 471 if ch == '\\' { 472 ch = s.scanEscape(quote) 473 } else { 474 ch = s.next() 475 } 476 n++ 477 } 478 return 479 } 480 481 func (s *Scanner) scanRawString() { 482 ch := s.next() // read character after '`' 483 for ch != '`' { 484 if ch < 0 { 485 s.error("literal not terminated") 486 return 487 } 488 ch = s.next() 489 } 490 } 491 492 func (s *Scanner) scanChar() { 493 if s.scanString('\'') != 1 { 494 s.error("illegal char literal") 495 } 496 } 497 498 func (s *Scanner) scanComment(ch rune) rune { 499 // ch == '/' || ch == '*' 500 if ch == '/' { 501 // line comment 502 ch = s.next() // read character after "//" 503 for ch != '\n' && ch >= 0 { 504 ch = s.next() 505 } 506 return ch 507 } 508 509 // general comment 510 ch = s.next() // read character after "/*" 511 for { 512 if ch < 0 { 513 s.error("comment not terminated") 514 break 515 } 516 ch0 := ch 517 ch = s.next() 518 if ch0 
== '*' && ch == '/' { 519 ch = s.next() 520 break 521 } 522 } 523 return ch 524 } 525 526 // Scan reads the next token or Unicode character from source and returns it. 527 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 528 // It returns EOF at the end of the source. It reports scanner errors (read and 529 // token errors) by calling s.Error, if not nil; otherwise it prints an error 530 // message to os.Stderr. 531 func (s *Scanner) Scan() rune { 532 ch := s.Peek() 533 534 // reset token text position 535 s.tokPos = -1 536 s.Line = 0 537 538 redo: 539 // skip white space 540 for s.Whitespace&(1<<uint(ch)) != 0 { 541 ch = s.next() 542 } 543 544 // start collecting token text 545 s.tokBuf.Reset() 546 s.tokPos = s.srcPos - s.lastCharLen 547 548 // set token position 549 // (this is a slightly optimized version of the code in Pos()) 550 s.Offset = s.srcBufOffset + s.tokPos 551 if s.column > 0 { 552 // common case: last character was not a '\n' 553 s.Line = s.line 554 s.Column = s.column 555 } else { 556 // last character was a '\n' 557 // (we cannot be at the beginning of the source 558 // since we have called next() at least once) 559 s.Line = s.line - 1 560 s.Column = s.lastLineLen 561 } 562 563 // determine token value 564 tok := ch 565 switch { 566 case unicode.IsLetter(ch) || ch == '_': 567 if s.Mode&ScanIdents != 0 { 568 tok = Ident 569 ch = s.scanIdentifier() 570 } else { 571 ch = s.next() 572 } 573 case isDecimal(ch): 574 if s.Mode&(ScanInts|ScanFloats) != 0 { 575 tok, ch = s.scanNumber(ch) 576 } else { 577 ch = s.next() 578 } 579 default: 580 switch ch { 581 case '"': 582 if s.Mode&ScanStrings != 0 { 583 s.scanString('"') 584 tok = String 585 } 586 ch = s.next() 587 case '\'': 588 if s.Mode&ScanChars != 0 { 589 s.scanChar() 590 tok = Char 591 } 592 ch = s.next() 593 case '.': 594 ch = s.next() 595 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 596 tok = Float 597 ch = s.scanMantissa(ch) 598 ch = s.scanExponent(ch) 599 } 600 case 
'/': 601 ch = s.next() 602 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 603 if s.Mode&SkipComments != 0 { 604 s.tokPos = -1 // don't collect token text 605 ch = s.scanComment(ch) 606 goto redo 607 } 608 ch = s.scanComment(ch) 609 tok = Comment 610 } 611 case '`': 612 if s.Mode&ScanRawStrings != 0 { 613 s.scanRawString() 614 tok = String 615 } 616 ch = s.next() 617 default: 618 ch = s.next() 619 } 620 } 621 622 // end of token text 623 s.tokEnd = s.srcPos - s.lastCharLen 624 625 s.ch = ch 626 return tok 627 } 628 629 // Pos returns the position of the character immediately after 630 // the character or token returned by the last call to Next or Scan. 631 func (s *Scanner) Pos() (pos Position) { 632 pos.Filename = s.Filename 633 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 634 switch { 635 case s.column > 0: 636 // common case: last character was not a '\n' 637 pos.Line = s.line 638 pos.Column = s.column 639 case s.lastLineLen > 0: 640 // last character was a '\n' 641 pos.Line = s.line - 1 642 pos.Column = s.lastLineLen 643 default: 644 // at the beginning of the source 645 pos.Line = 1 646 pos.Column = 1 647 } 648 return 649 } 650 651 // TokenText returns the string corresponding to the most recently scanned token. 652 // Valid after calling Scan(). 653 func (s *Scanner) TokenText() string { 654 if s.tokPos < 0 { 655 // no token text 656 return "" 657 } 658 659 if s.tokEnd < 0 { 660 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 661 s.tokEnd = s.tokPos 662 } 663 664 if s.tokBuf.Len() == 0 { 665 // common case: the entire token text is still in srcBuf 666 return string(s.srcBuf[s.tokPos:s.tokEnd]) 667 } 668 669 // part of the token text was saved in tokBuf: save the rest in 670 // tokBuf as well and return its content 671 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 672 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 673 return s.tokBuf.String() 674 }