github.com/influxdata/influxql@v1.1.0/scanner.go (about) 1 package influxql 2 3 import ( 4 "bufio" 5 "bytes" 6 "errors" 7 "fmt" 8 "io" 9 ) 10 11 // Scanner represents a lexical scanner for InfluxQL. 12 type Scanner struct { 13 r *reader 14 } 15 16 // NewScanner returns a new instance of Scanner. 17 func NewScanner(r io.Reader) *Scanner { 18 return &Scanner{r: &reader{r: bufio.NewReader(r)}} 19 } 20 21 // Scan returns the next token and position from the underlying reader. 22 // Also returns the literal text read for strings, numbers, and duration tokens 23 // since these token types can have different literal representations. 24 func (s *Scanner) Scan() (tok Token, pos Pos, lit string) { 25 // Read next code point. 26 ch0, pos := s.r.read() 27 28 // If we see whitespace then consume all contiguous whitespace. 29 // If we see a letter, or certain acceptable special characters, then consume 30 // as an ident or reserved word. 31 if isWhitespace(ch0) { 32 return s.scanWhitespace() 33 } else if isLetter(ch0) || ch0 == '_' { 34 s.r.unread() 35 return s.scanIdent(true) 36 } else if isDigit(ch0) { 37 return s.scanNumber() 38 } 39 40 // Otherwise parse individual characters. 41 switch ch0 { 42 case eof: 43 return EOF, pos, "" 44 case '"': 45 s.r.unread() 46 return s.scanIdent(true) 47 case '\'': 48 return s.scanString() 49 case '.': 50 ch1, _ := s.r.read() 51 s.r.unread() 52 if isDigit(ch1) { 53 return s.scanNumber() 54 } 55 return DOT, pos, "" 56 case '$': 57 tok, _, lit = s.scanIdent(false) 58 if tok != IDENT { 59 return tok, pos, "$" + lit 60 } 61 return BOUNDPARAM, pos, "$" + lit 62 case '+': 63 return ADD, pos, "" 64 case '-': 65 ch1, _ := s.r.read() 66 if ch1 == '-' { 67 s.skipUntilNewline() 68 return COMMENT, pos, "" 69 } 70 s.r.unread() 71 return SUB, pos, "" 72 case '*': 73 return MUL, pos, "" 74 case '/': 75 ch1, _ := s.r.read() 76 if ch1 == '*' { 77 if err := s.skipUntilEndComment(); err != nil { 78 return ILLEGAL, pos, "" 79 } 80 return COMMENT, pos, "" 81 } else { 82 s.r.unread() 83 } 84 return DIV, pos, "" 85 case '%': 86 return MOD, pos, "" 87 case '&': 88 return BITWISE_AND, pos, "" 89 case '|': 90 return BITWISE_OR, pos, "" 91 case '^': 92 return BITWISE_XOR, pos, "" 93 case '=': 94 if ch1, _ := s.r.read(); ch1 == '~' { 95 return EQREGEX, pos, "" 96 } 97 s.r.unread() 98 return EQ, pos, "" 99 case '!': 100 if ch1, _ := s.r.read(); ch1 == '=' { 101 return NEQ, pos, "" 102 } else if ch1 == '~' { 103 return NEQREGEX, pos, "" 104 } 105 s.r.unread() 106 case '>': 107 if ch1, _ := s.r.read(); ch1 == '=' { 108 return GTE, pos, "" 109 } 110 s.r.unread() 111 return GT, pos, "" 112 case '<': 113 if ch1, _ := s.r.read(); ch1 == '=' { 114 return LTE, pos, "" 115 } else if ch1 == '>' { 116 return NEQ, pos, "" 117 } 118 s.r.unread() 119 return LT, pos, "" 120 case '(': 121 return LPAREN, pos, "" 122 case ')': 123 return RPAREN, pos, "" 124 case ',': 125 return COMMA, pos, "" 126 case ';': 127 return SEMICOLON, pos, "" 128 case ':': 129 if ch1, _ := s.r.read(); ch1 == ':' { 130 return DOUBLECOLON, pos, "" 131 } 132 s.r.unread() 133 return COLON, pos, "" 134 } 135 136 return ILLEGAL, pos, string(ch0) 137 } 138 139 // scanWhitespace consumes the current rune and all contiguous whitespace. 140 func (s *Scanner) scanWhitespace() (tok Token, pos Pos, lit string) { 141 // Create a buffer and read the current character into it. 142 var buf bytes.Buffer 143 ch, pos := s.r.curr() 144 _, _ = buf.WriteRune(ch) 145 146 // Read every subsequent whitespace character into the buffer. 147 // Non-whitespace characters and EOF will cause the loop to exit. 148 for { 149 ch, _ = s.r.read() 150 if ch == eof { 151 break 152 } else if !isWhitespace(ch) { 153 s.r.unread() 154 break 155 } else { 156 _, _ = buf.WriteRune(ch) 157 } 158 } 159 160 return WS, pos, buf.String() 161 } 162 163 // skipUntilNewline skips characters until it reaches a newline. 164 func (s *Scanner) skipUntilNewline() { 165 for { 166 if ch, _ := s.r.read(); ch == '\n' || ch == eof { 167 return 168 } 169 } 170 } 171 172 // skipUntilEndComment skips characters until it reaches a '*/' symbol. 173 func (s *Scanner) skipUntilEndComment() error { 174 for { 175 if ch1, _ := s.r.read(); ch1 == '*' { 176 // We might be at the end. 177 star: 178 ch2, _ := s.r.read() 179 if ch2 == '/' { 180 return nil 181 } else if ch2 == '*' { 182 // We are back in the state machine since we see a star. 183 goto star 184 } else if ch2 == eof { 185 return io.EOF 186 } 187 } else if ch1 == eof { 188 return io.EOF 189 } 190 } 191 } 192 193 func (s *Scanner) scanIdent(lookup bool) (tok Token, pos Pos, lit string) { 194 // Save the starting position of the identifier. 195 _, pos = s.r.read() 196 s.r.unread() 197 198 var buf bytes.Buffer 199 for { 200 if ch, _ := s.r.read(); ch == eof { 201 break 202 } else if ch == '"' { 203 tok0, pos0, lit0 := s.scanString() 204 if tok0 == BADSTRING || tok0 == BADESCAPE { 205 return tok0, pos0, lit0 206 } 207 return IDENT, pos, lit0 208 } else if isIdentChar(ch) { 209 s.r.unread() 210 buf.WriteString(ScanBareIdent(s.r)) 211 } else { 212 s.r.unread() 213 break 214 } 215 } 216 lit = buf.String() 217 218 // If the literal matches a keyword then return that keyword. 219 if lookup { 220 if tok = Lookup(lit); tok != IDENT { 221 return tok, pos, "" 222 } 223 } 224 return IDENT, pos, lit 225 } 226 227 // scanString consumes a contiguous string of non-quote characters. 228 // Quote characters can be consumed if they're first escaped with a backslash. 229 func (s *Scanner) scanString() (tok Token, pos Pos, lit string) { 230 s.r.unread() 231 _, pos = s.r.curr() 232 233 var err error 234 lit, err = ScanString(s.r) 235 if err == errBadString { 236 return BADSTRING, pos, lit 237 } else if err == errBadEscape { 238 _, pos = s.r.curr() 239 return BADESCAPE, pos, lit 240 } 241 return STRING, pos, lit 242 } 243 244 // ScanRegex consumes a token to find escapes 245 func (s *Scanner) ScanRegex() (tok Token, pos Pos, lit string) { 246 _, pos = s.r.curr() 247 248 // Start & end sentinels. 249 start, end := '/', '/' 250 // Valid escape chars. 251 escapes := map[rune]rune{'/': '/'} 252 253 b, err := ScanDelimited(s.r, start, end, escapes, true) 254 255 if err == errBadEscape { 256 _, pos = s.r.curr() 257 return BADESCAPE, pos, lit 258 } else if err != nil { 259 return BADREGEX, pos, lit 260 } 261 return REGEX, pos, string(b) 262 } 263 264 // scanNumber consumes anything that looks like the start of a number. 265 func (s *Scanner) scanNumber() (tok Token, pos Pos, lit string) { 266 var buf bytes.Buffer 267 268 // Check if the initial rune is a ".". 269 ch, pos := s.r.curr() 270 if ch == '.' { 271 // Peek and see if the next rune is a digit. 272 ch1, _ := s.r.read() 273 s.r.unread() 274 if !isDigit(ch1) { 275 return ILLEGAL, pos, "." 276 } 277 278 // Unread the full stop so we can read it later. 279 s.r.unread() 280 } else { 281 s.r.unread() 282 } 283 284 // Read as many digits as possible. 285 _, _ = buf.WriteString(s.scanDigits()) 286 287 // If next code points are a full stop and digit then consume them. 288 isDecimal := false 289 if ch0, _ := s.r.read(); ch0 == '.' { 290 isDecimal = true 291 if ch1, _ := s.r.read(); isDigit(ch1) { 292 _, _ = buf.WriteRune(ch0) 293 _, _ = buf.WriteRune(ch1) 294 _, _ = buf.WriteString(s.scanDigits()) 295 } else { 296 s.r.unread() 297 } 298 } else { 299 s.r.unread() 300 } 301 302 // Read as a duration or integer if it doesn't have a fractional part. 303 if !isDecimal { 304 // If the next rune is a letter then this is a duration token. 305 if ch0, _ := s.r.read(); isLetter(ch0) || ch0 == 'µ' { 306 _, _ = buf.WriteRune(ch0) 307 for { 308 ch1, _ := s.r.read() 309 if !isLetter(ch1) && ch1 != 'µ' { 310 s.r.unread() 311 break 312 } 313 _, _ = buf.WriteRune(ch1) 314 } 315 316 // Continue reading digits and letters as part of this token. 317 for { 318 if ch0, _ := s.r.read(); isLetter(ch0) || ch0 == 'µ' || isDigit(ch0) { 319 _, _ = buf.WriteRune(ch0) 320 } else { 321 s.r.unread() 322 break 323 } 324 } 325 return DURATIONVAL, pos, buf.String() 326 } else { 327 s.r.unread() 328 return INTEGER, pos, buf.String() 329 } 330 } 331 return NUMBER, pos, buf.String() 332 } 333 334 // scanDigits consumes a contiguous series of digits. 335 func (s *Scanner) scanDigits() string { 336 var buf bytes.Buffer 337 for { 338 ch, _ := s.r.read() 339 if !isDigit(ch) { 340 s.r.unread() 341 break 342 } 343 _, _ = buf.WriteRune(ch) 344 } 345 return buf.String() 346 } 347 348 // isWhitespace returns true if the rune is a space, tab, or newline. 349 func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' } 350 351 // isLetter returns true if the rune is a letter. 352 func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') } 353 354 // isDigit returns true if the rune is a digit. 355 func isDigit(ch rune) bool { return (ch >= '0' && ch <= '9') } 356 357 // isIdentChar returns true if the rune can be used in an unquoted identifier. 358 func isIdentChar(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' } 359 360 // isIdentFirstChar returns true if the rune can be used as the first char in an unquoted identifer. 361 func isIdentFirstChar(ch rune) bool { return isLetter(ch) || ch == '_' } 362 363 // bufScanner represents a wrapper for scanner to add a buffer. 364 // It provides a fixed-length circular buffer that can be unread. 365 type bufScanner struct { 366 s *Scanner 367 i int // buffer index 368 n int // buffer size 369 buf [3]struct { 370 tok Token 371 pos Pos 372 lit string 373 } 374 } 375 376 // newBufScanner returns a new buffered scanner for a reader. 377 func newBufScanner(r io.Reader) *bufScanner { 378 return &bufScanner{s: NewScanner(r)} 379 } 380 381 // Scan reads the next token from the scanner. 382 func (s *bufScanner) Scan() (tok Token, pos Pos, lit string) { 383 return s.scanFunc(s.s.Scan) 384 } 385 386 // ScanRegex reads a regex token from the scanner. 387 func (s *bufScanner) ScanRegex() (tok Token, pos Pos, lit string) { 388 return s.scanFunc(s.s.ScanRegex) 389 } 390 391 // scanFunc uses the provided function to scan the next token. 392 func (s *bufScanner) scanFunc(scan func() (Token, Pos, string)) (tok Token, pos Pos, lit string) { 393 // If we have unread tokens then read them off the buffer first. 394 if s.n > 0 { 395 s.n-- 396 return s.curr() 397 } 398 399 // Move buffer position forward and save the token. 400 s.i = (s.i + 1) % len(s.buf) 401 buf := &s.buf[s.i] 402 buf.tok, buf.pos, buf.lit = scan() 403 404 return s.curr() 405 } 406 407 // Unscan pushes the previously token back onto the buffer. 408 func (s *bufScanner) Unscan() { s.n++ } 409 410 // curr returns the last read token. 411 func (s *bufScanner) curr() (tok Token, pos Pos, lit string) { 412 buf := &s.buf[(s.i-s.n+len(s.buf))%len(s.buf)] 413 return buf.tok, buf.pos, buf.lit 414 } 415 416 // reader represents a buffered rune reader used by the scanner. 417 // It provides a fixed-length circular buffer that can be unread. 418 type reader struct { 419 r io.RuneScanner 420 i int // buffer index 421 n int // buffer char count 422 pos Pos // last read rune position 423 buf [3]struct { 424 ch rune 425 pos Pos 426 } 427 eof bool // true if reader has ever seen eof. 428 } 429 430 // ReadRune reads the next rune from the reader. 431 // This is a wrapper function to implement the io.RuneReader interface. 432 // Note that this function does not return size. 433 func (r *reader) ReadRune() (ch rune, size int, err error) { 434 ch, _ = r.read() 435 if ch == eof { 436 err = io.EOF 437 } 438 return 439 } 440 441 // UnreadRune pushes the previously read rune back onto the buffer. 442 // This is a wrapper function to implement the io.RuneScanner interface. 443 func (r *reader) UnreadRune() error { 444 r.unread() 445 return nil 446 } 447 448 // read reads the next rune from the reader. 449 func (r *reader) read() (ch rune, pos Pos) { 450 // If we have unread characters then read them off the buffer first. 451 if r.n > 0 { 452 r.n-- 453 return r.curr() 454 } 455 456 // Read next rune from underlying reader. 457 // Any error (including io.EOF) should return as EOF. 458 ch, _, err := r.r.ReadRune() 459 if err != nil { 460 ch = eof 461 } else if ch == '\r' { 462 if ch, _, err := r.r.ReadRune(); err != nil { 463 // nop 464 } else if ch != '\n' { 465 _ = r.r.UnreadRune() 466 } 467 ch = '\n' 468 } 469 470 // Save character and position to the buffer. 471 r.i = (r.i + 1) % len(r.buf) 472 buf := &r.buf[r.i] 473 buf.ch, buf.pos = ch, r.pos 474 475 // Update position. 476 // Only count EOF once. 477 if ch == '\n' { 478 r.pos.Line++ 479 r.pos.Char = 0 480 } else if !r.eof { 481 r.pos.Char++ 482 } 483 484 // Mark the reader as EOF. 485 // This is used so we don't double count EOF characters. 486 if ch == eof { 487 r.eof = true 488 } 489 490 return r.curr() 491 } 492 493 // unread pushes the previously read rune back onto the buffer. 494 func (r *reader) unread() { 495 r.n++ 496 } 497 498 // curr returns the last read character and position. 499 func (r *reader) curr() (ch rune, pos Pos) { 500 i := (r.i - r.n + len(r.buf)) % len(r.buf) 501 buf := &r.buf[i] 502 return buf.ch, buf.pos 503 } 504 505 // eof is a marker code point to signify that the reader can't read any more. 506 const eof = rune(0) 507 508 // ScanDelimited reads a delimited set of runes 509 func ScanDelimited(r io.RuneScanner, start, end rune, escapes map[rune]rune, escapesPassThru bool) ([]byte, error) { 510 // Scan start delimiter. 511 if ch, _, err := r.ReadRune(); err != nil { 512 return nil, err 513 } else if ch != start { 514 return nil, fmt.Errorf("expected %s; found %s", string(start), string(ch)) 515 } 516 517 var buf bytes.Buffer 518 for { 519 ch0, _, err := r.ReadRune() 520 if ch0 == end { 521 return buf.Bytes(), nil 522 } else if err != nil { 523 return buf.Bytes(), err 524 } else if ch0 == '\n' { 525 return nil, errors.New("delimited text contains new line") 526 } else if ch0 == '\\' { 527 // If the next character is an escape then write the escaped char. 528 // If it's not a valid escape then return an error. 529 ch1, _, err := r.ReadRune() 530 if err != nil { 531 return nil, err 532 } 533 534 c, ok := escapes[ch1] 535 if !ok { 536 if escapesPassThru { 537 // Unread ch1 (char after the \) 538 _ = r.UnreadRune() 539 // Write ch0 (\) to the output buffer. 540 _, _ = buf.WriteRune(ch0) 541 continue 542 } else { 543 buf.Reset() 544 _, _ = buf.WriteRune(ch0) 545 _, _ = buf.WriteRune(ch1) 546 return buf.Bytes(), errBadEscape 547 } 548 } 549 550 _, _ = buf.WriteRune(c) 551 } else { 552 _, _ = buf.WriteRune(ch0) 553 } 554 } 555 } 556 557 // ScanString reads a quoted string from a rune reader. 558 func ScanString(r io.RuneScanner) (string, error) { 559 ending, _, err := r.ReadRune() 560 if err != nil { 561 return "", errBadString 562 } 563 564 var buf bytes.Buffer 565 for { 566 ch0, _, err := r.ReadRune() 567 if ch0 == ending { 568 return buf.String(), nil 569 } else if err != nil || ch0 == '\n' { 570 return buf.String(), errBadString 571 } else if ch0 == '\\' { 572 // If the next character is an escape then write the escaped char. 573 // If it's not a valid escape then return an error. 574 ch1, _, _ := r.ReadRune() 575 if ch1 == 'n' { 576 _, _ = buf.WriteRune('\n') 577 } else if ch1 == '\\' { 578 _, _ = buf.WriteRune('\\') 579 } else if ch1 == '"' { 580 _, _ = buf.WriteRune('"') 581 } else if ch1 == '\'' { 582 _, _ = buf.WriteRune('\'') 583 } else { 584 return string(ch0) + string(ch1), errBadEscape 585 } 586 } else { 587 _, _ = buf.WriteRune(ch0) 588 } 589 } 590 } 591 592 var errBadString = errors.New("bad string") 593 var errBadEscape = errors.New("bad escape") 594 595 // ScanBareIdent reads bare identifier from a rune reader. 596 func ScanBareIdent(r io.RuneScanner) string { 597 // Read every ident character into the buffer. 598 // Non-ident characters and EOF will cause the loop to exit. 599 var buf bytes.Buffer 600 for { 601 ch, _, err := r.ReadRune() 602 if err != nil { 603 break 604 } else if !isIdentChar(ch) { 605 r.UnreadRune() 606 break 607 } else { 608 _, _ = buf.WriteRune(ch) 609 } 610 } 611 return buf.String() 612 } 613 614 // IsRegexOp returns true if the operator accepts a regex operand. 615 func IsRegexOp(t Token) bool { 616 return (t == EQREGEX || t == NEQREGEX) 617 }