github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/expr/scanner.go (about) 1 // Modifications Copyright (c) 2017-2018 Uber Technologies, Inc. 2 // Copyright (c) 2013-2016 Errplane Inc. 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy of 5 // this software and associated documentation files (the "Software"), to deal in 6 // the Software without restriction, including without limitation the rights to 7 // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 // the Software, and to permit persons to whom the Software is furnished to do so, 9 // subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in all 12 // copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 21 package expr 22 23 import ( 24 "bufio" 25 "bytes" 26 "errors" 27 "fmt" 28 "io" 29 ) 30 31 // Scanner represents a lexical scanner for InfluxQL. 32 type Scanner struct { 33 r *reader 34 lastNonWSToken Token 35 } 36 37 // NewScanner returns a new instance of Scanner. 38 func NewScanner(r io.Reader) *Scanner { 39 return &Scanner{r: &reader{r: bufio.NewReader(r)}} 40 } 41 42 // Scan returns the next token and position from the underlying reader. 43 // Also returns the literal text read for strings, numbers, and duration tokens 44 // since these token types can have different literal representations. 45 func (s *Scanner) Scan() (tok Token, pos Pos, lit string) { 46 tok, pos, lit = s.scan() 47 if tok != WS { 48 s.lastNonWSToken = tok 49 } 50 return 51 } 52 53 func (s *Scanner) scan() (tok Token, pos Pos, lit string) { 54 // Read next code point. 55 ch0, pos := s.r.read() 56 57 // If we see whitespace then consume all contiguous whitespace. 58 // If we see a letter, or certain acceptable special characters, then consume 59 // as an ident or reserved word. 60 if isWhitespace(ch0) { 61 return s.scanWhitespace() 62 } else if isLetter(ch0) || ch0 == '_' { 63 s.r.unread() 64 return s.scanIdent() 65 } else if isDigit(ch0) { 66 return s.scanNumber() 67 } 68 69 // Otherwise parse individual characters. 70 switch ch0 { 71 case eof: 72 return EOF, pos, "" 73 case '`': 74 s.r.unread() 75 return s.scanIdent() 76 case '"': 77 return s.scanString() 78 case '\'': 79 return s.scanString() 80 case '.': 81 ch1, _ := s.r.read() 82 s.r.unread() 83 if isDigit(ch1) { 84 return s.scanNumber() 85 } 86 return DOT, pos, "" 87 case '+', '-': 88 if (s.lastNonWSToken > literal_beg && s.lastNonWSToken < literal_end) || s.lastNonWSToken == RPAREN { 89 if ch0 == '+' { 90 return ADD, pos, "" 91 } 92 return SUB, pos, "" 93 } 94 return s.scanNumber() 95 case '*': 96 return MUL, pos, "" 97 case '/': 98 return DIV, pos, "" 99 case '%': 100 return MOD, pos, "" 101 case '=': 102 return EQ, pos, "" 103 case '~': 104 return BITWISE_NOT, pos, "" 105 case '|': 106 return BITWISE_OR, pos, "" 107 case '&': 108 return BITWISE_AND, pos, "" 109 case '^': 110 return BITWISE_XOR, pos, "" 111 case '!': 112 if ch1, _ := s.r.read(); ch1 == '=' { 113 return NEQ, pos, "" 114 } 115 s.r.unread() 116 return EXCLAMATION, pos, "" 117 case '>': 118 ch1, _ := s.r.read() 119 if ch1 == '=' { 120 return GTE, pos, "" 121 } else if ch1 == '>' { 122 return BITWISE_RIGHT_SHIFT, pos, "" 123 } 124 s.r.unread() 125 return GT, pos, "" 126 case '<': 127 ch1, _ := s.r.read() 128 if ch1 == '=' { 129 return LTE, pos, "" 130 } else if ch1 == '>' { 131 return NEQ, pos, "" 132 } else if ch1 == '<' { 133 return BITWISE_LEFT_SHIFT, pos, "" 134 } 135 s.r.unread() 136 return LT, pos, "" 137 case '(': 138 return LPAREN, pos, "" 139 case ')': 140 return RPAREN, pos, "" 141 case ',': 142 return COMMA, pos, "" 143 } 144 145 return ILLEGAL, pos, string(ch0) 146 } 147 148 // scanWhitespace consumes the current rune and all contiguous whitespace. 149 func (s *Scanner) scanWhitespace() (tok Token, pos Pos, lit string) { 150 // Create a buffer and read the current character into it. 151 var buf bytes.Buffer 152 ch, pos := s.r.curr() 153 _, _ = buf.WriteRune(ch) 154 155 // Read every subsequent whitespace character into the buffer. 156 // Non-whitespace characters and EOF will cause the loop to exit. 157 for { 158 ch, _ = s.r.read() 159 if ch == eof { 160 break 161 } else if !isWhitespace(ch) { 162 s.r.unread() 163 break 164 } else { 165 _, _ = buf.WriteRune(ch) 166 } 167 } 168 169 return WS, pos, buf.String() 170 } 171 172 func (s *Scanner) scanIdent() (tok Token, pos Pos, lit string) { 173 // Save the starting position of the identifier. 174 _, pos = s.r.read() 175 s.r.unread() 176 177 var buf bytes.Buffer 178 for { 179 if ch, _ := s.r.read(); ch == eof { 180 break 181 } else if ch == '`' { 182 tok0, pos0, lit0 := s.scanString() 183 if tok0 == BADSTRING || tok0 == BADESCAPE { 184 return tok0, pos0, lit0 185 } 186 return IDENT, pos, lit0 187 } else if isIdentChar(ch) { 188 s.r.unread() 189 buf.WriteString(ScanBareIdent(s.r)) 190 } else { 191 s.r.unread() 192 break 193 } 194 } 195 lit = buf.String() 196 197 // If the literal matches a keyword then return that keyword. 198 if tok = Lookup(lit); tok != IDENT { 199 return tok, pos, "" 200 } 201 202 return IDENT, pos, lit 203 } 204 205 // scanString consumes a contiguous string of non-quote characters. 206 // Quote characters can be consumed if they're first escaped with a backslash. 207 func (s *Scanner) scanString() (tok Token, pos Pos, lit string) { 208 s.r.unread() 209 _, pos = s.r.curr() 210 211 var err error 212 lit, err = ScanString(s.r) 213 if err == errBadString { 214 return BADSTRING, pos, lit 215 } else if err == errBadEscape { 216 _, pos = s.r.curr() 217 return BADESCAPE, pos, lit 218 } 219 return STRING, pos, lit 220 } 221 222 // scanNumber consumes anything that looks like the start of a number. 223 // Numbers start with a digit, full stop, plus sign or minus sign. 224 // This function can return non-number tokens if a scan is a false positive. 225 // For example, a minus sign followed by a letter will just return a minus sign. 226 func (s *Scanner) scanNumber() (tok Token, pos Pos, lit string) { 227 var buf bytes.Buffer 228 229 // Check if the initial rune is a "+" or "-". 230 ch, pos := s.r.curr() 231 if ch == '+' || ch == '-' { 232 buf.WriteRune(ch) 233 } else { 234 s.r.unread() 235 } 236 237 // Read as many digits as possible. 238 digits := s.scanDigits() 239 buf.WriteString(digits) 240 integralDigitLength := len(digits) 241 242 ch, _ = s.r.read() 243 if ch == 'x' { 244 buf.WriteRune(ch) 245 if digits != "0" { 246 return ILLEGAL, pos, buf.String() 247 } 248 hexChars := s.scanHexChars() 249 buf.WriteString(hexChars) 250 if len(hexChars) == 0 || len(hexChars)%2 != 0 { 251 return ILLEGAL, pos, buf.String() 252 } 253 return NUMBER, pos, buf.String() 254 } 255 256 if ch == '.' { 257 buf.WriteRune(ch) 258 buf.WriteString(s.scanDigits()) 259 ch, _ = s.r.read() 260 } else if integralDigitLength == 0 { 261 s.r.unread() 262 263 if buf.String() == "-" { 264 return UNARY_MINUS, pos, "" 265 } 266 267 return ILLEGAL, pos, buf.String() 268 } 269 270 if ch == 'e' || ch == 'E' { 271 buf.WriteRune(ch) 272 digits = s.scanDigits() 273 buf.WriteString(digits) 274 if len(digits) == 0 { 275 return ILLEGAL, pos, buf.String() 276 } 277 } else { 278 s.r.unread() 279 } 280 281 return NUMBER, pos, buf.String() 282 } 283 284 // scanDigits consume a contiguous series of digits. 285 func (s *Scanner) scanDigits() string { 286 var buf bytes.Buffer 287 for { 288 ch, _ := s.r.read() 289 if !isDigit(ch) { 290 s.r.unread() 291 break 292 } 293 _, _ = buf.WriteRune(ch) 294 } 295 return buf.String() 296 } 297 298 func (s *Scanner) scanHexChars() string { 299 var buf bytes.Buffer 300 for { 301 ch, _ := s.r.read() 302 if !isHexChar(ch) { 303 s.r.unread() 304 break 305 } 306 _, _ = buf.WriteRune(ch) 307 } 308 return buf.String() 309 } 310 311 // isWhitespace returns true if the rune is a space, tab, or newline. 312 func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' } 313 314 // isLetter returns true if the rune is a letter. 315 func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') } 316 317 // isDigit returns true if the rune is a digit. 318 func isDigit(ch rune) bool { return (ch >= '0' && ch <= '9') } 319 320 // isIdentChar returns true if the rune can be used in an unquoted identifier. 321 func isIdentChar(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' } 322 323 func isHexChar(ch rune) bool { 324 return isDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') 325 } 326 327 // isIdentFirstChar returns true if the rune can be used as the first char in an unquoted identifer. 328 func isIdentFirstChar(ch rune) bool { return isLetter(ch) || ch == '_' } 329 330 // bufScanner represents a wrapper for scanner to add a buffer. 331 // It provides a fixed-length circular buffer that can be unread. 332 type bufScanner struct { 333 s *Scanner 334 i int // buffer index 335 n int // buffer size 336 buf [3]struct { 337 tok Token 338 pos Pos 339 lit string 340 } 341 } 342 343 // newBufScanner returns a new buffered scanner for a reader. 344 func newBufScanner(r io.Reader) *bufScanner { 345 return &bufScanner{s: NewScanner(r)} 346 } 347 348 // Scan reads the next token from the scanner. 349 func (s *bufScanner) Scan() (tok Token, pos Pos, lit string) { 350 return s.scanFunc(s.s.Scan) 351 } 352 353 // scanFunc uses the provided function to scan the next token. 354 func (s *bufScanner) scanFunc(scan func() (Token, Pos, string)) (tok Token, pos Pos, lit string) { 355 // If we have unread tokens then read them off the buffer first. 356 if s.n > 0 { 357 s.n-- 358 return s.curr() 359 } 360 361 // Move buffer position forward and save the token. 362 s.i = (s.i + 1) % len(s.buf) 363 buf := &s.buf[s.i] 364 buf.tok, buf.pos, buf.lit = scan() 365 366 return s.curr() 367 } 368 369 // Unscan pushes the previously token back onto the buffer. 370 func (s *bufScanner) Unscan() { s.n++ } 371 372 // curr returns the last read token. 373 func (s *bufScanner) curr() (tok Token, pos Pos, lit string) { 374 buf := &s.buf[(s.i-s.n+len(s.buf))%len(s.buf)] 375 return buf.tok, buf.pos, buf.lit 376 } 377 378 // reader represents a buffered rune reader used by the scanner. 379 // It provides a fixed-length circular buffer that can be unread. 380 type reader struct { 381 r io.RuneScanner 382 i int // buffer index 383 n int // buffer char count 384 pos Pos // last read rune position 385 buf [3]struct { 386 ch rune 387 pos Pos 388 } 389 eof bool // true if reader has ever seen eof. 390 } 391 392 // ReadRune reads the next rune from the reader. 393 // This is a wrapper function to implement the io.RuneReader interface. 394 // Note that this function does not return size. 395 func (r *reader) ReadRune() (ch rune, size int, err error) { 396 ch, _ = r.read() 397 if ch == eof { 398 err = io.EOF 399 } 400 return 401 } 402 403 // UnreadRune pushes the previously read rune back onto the buffer. 404 // This is a wrapper function to implement the io.RuneScanner interface. 405 func (r *reader) UnreadRune() error { 406 r.unread() 407 return nil 408 } 409 410 // read reads the next rune from the reader. 411 func (r *reader) read() (ch rune, pos Pos) { 412 // If we have unread characters then read them off the buffer first. 413 if r.n > 0 { 414 r.n-- 415 return r.curr() 416 } 417 418 // Read next rune from underlying reader. 419 // Any error (including io.EOF) should return as EOF. 420 ch, _, err := r.r.ReadRune() 421 if err != nil { 422 ch = eof 423 } else if ch == '\r' { 424 if ch, _, err := r.r.ReadRune(); err != nil { 425 // nop 426 } else if ch != '\n' { 427 _ = r.r.UnreadRune() 428 } 429 ch = '\n' 430 } 431 432 // Save character and position to the buffer. 433 r.i = (r.i + 1) % len(r.buf) 434 buf := &r.buf[r.i] 435 buf.ch, buf.pos = ch, r.pos 436 437 // Update position. 438 // Only count EOF once. 439 if ch == '\n' { 440 r.pos.Line++ 441 r.pos.Char = 0 442 } else if !r.eof { 443 r.pos.Char++ 444 } 445 446 // Mark the reader as EOF. 447 // This is used so we don't double count EOF characters. 448 if ch == eof { 449 r.eof = true 450 } 451 452 return r.curr() 453 } 454 455 // unread pushes the previously read rune back onto the buffer. 456 func (r *reader) unread() { 457 r.n++ 458 } 459 460 // curr returns the last read character and position. 461 func (r *reader) curr() (ch rune, pos Pos) { 462 i := (r.i - r.n + len(r.buf)) % len(r.buf) 463 buf := &r.buf[i] 464 return buf.ch, buf.pos 465 } 466 467 // eof is a marker code point to signify that the reader can't read any more. 468 const eof = rune(0) 469 470 func ScanDelimited(r io.RuneScanner, start, end rune, escapes map[rune]rune, escapesPassThru bool) ([]byte, error) { 471 // Scan start delimiter. 472 if ch, _, err := r.ReadRune(); err != nil { 473 return nil, err 474 } else if ch != start { 475 return nil, fmt.Errorf("expected %s; found %s", string(start), string(ch)) 476 } 477 478 var buf bytes.Buffer 479 for { 480 ch0, _, err := r.ReadRune() 481 if ch0 == end { 482 return buf.Bytes(), nil 483 } else if err != nil { 484 return buf.Bytes(), err 485 } else if ch0 == '\n' { 486 return nil, errors.New("delimited text contains new line") 487 } else if ch0 == '\\' { 488 // If the next character is an escape then write the escaped char. 489 // If it's not a valid escape then return an error. 490 ch1, _, err := r.ReadRune() 491 if err != nil { 492 return nil, err 493 } 494 495 c, ok := escapes[ch1] 496 if !ok { 497 if escapesPassThru { 498 // Unread ch1 (char after the \) 499 _ = r.UnreadRune() 500 // Write ch0 (\) to the output buffer. 501 _, _ = buf.WriteRune(ch0) 502 continue 503 } else { 504 buf.Reset() 505 _, _ = buf.WriteRune(ch0) 506 _, _ = buf.WriteRune(ch1) 507 return buf.Bytes(), errBadEscape 508 } 509 } 510 511 _, _ = buf.WriteRune(c) 512 } else { 513 _, _ = buf.WriteRune(ch0) 514 } 515 } 516 } 517 518 // ScanString reads a quoted string from a rune reader. 519 func ScanString(r io.RuneScanner) (string, error) { 520 ending, _, err := r.ReadRune() 521 if err != nil { 522 return "", errBadString 523 } 524 525 var buf bytes.Buffer 526 for { 527 ch0, _, err := r.ReadRune() 528 if ch0 == ending { 529 return buf.String(), nil 530 } else if err != nil || ch0 == '\n' { 531 return buf.String(), errBadString 532 } else if ch0 == '\\' { 533 // If the next character is an escape then write the escaped char. 534 // If it's not a valid escape then return an error. 535 ch1, _, _ := r.ReadRune() 536 if ch1 == 'n' { 537 _, _ = buf.WriteRune('\n') 538 } else if ch1 == '\\' { 539 _, _ = buf.WriteRune('\\') 540 } else if ch1 == '"' { 541 _, _ = buf.WriteRune('"') 542 } else if ch1 == '\'' { 543 _, _ = buf.WriteRune('\'') 544 } else { 545 return string(ch0) + string(ch1), errBadEscape 546 } 547 } else { 548 _, _ = buf.WriteRune(ch0) 549 } 550 } 551 } 552 553 var errBadString = errors.New("bad string") 554 var errBadEscape = errors.New("bad escape") 555 556 // ScanBareIdent reads bare identifier from a rune reader. 557 func ScanBareIdent(r io.RuneScanner) string { 558 // Read every ident character into the buffer. 559 // Non-ident characters and EOF will cause the loop to exit. 560 var buf bytes.Buffer 561 for { 562 ch, _, err := r.ReadRune() 563 if err != nil { 564 break 565 } else if !isIdentChar(ch) { 566 r.UnreadRune() 567 break 568 } else { 569 _, _ = buf.WriteRune(ch) 570 } 571 } 572 return buf.String() 573 }