github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/expr/scanner.go (about)

     1  // Modifications Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  // Copyright (c) 2013-2016 Errplane Inc.
     3  //
     4  // Permission is hereby granted, free of charge, to any person obtaining a copy of
     5  // this software and associated documentation files (the "Software"), to deal in
     6  // the Software without restriction, including without limitation the rights to
     7  // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
     8  // the Software, and to permit persons to whom the Software is furnished to do so,
     9  // subject to the following conditions:
    10  //
    11  // The above copyright notice and this permission notice shall be included in all
    12  // copies or substantial portions of the Software.
    13  //
    14  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    15  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
    16  // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
    17  // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
    18  // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
    19  // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    20  
    21  package expr
    22  
    23  import (
    24  	"bufio"
    25  	"bytes"
    26  	"errors"
    27  	"fmt"
    28  	"io"
    29  )
    30  
// Scanner represents a lexical scanner for InfluxQL.
type Scanner struct {
	// r is the rune source; it supports pushing runes back via unread.
	r *reader
	// lastNonWSToken is the most recent token returned by Scan that was not
	// WS. scan consults it to decide whether '+' / '-' is a binary operator.
	lastNonWSToken Token
}
    36  
    37  // NewScanner returns a new instance of Scanner.
    38  func NewScanner(r io.Reader) *Scanner {
    39  	return &Scanner{r: &reader{r: bufio.NewReader(r)}}
    40  }
    41  
    42  // Scan returns the next token and position from the underlying reader.
    43  // Also returns the literal text read for strings, numbers, and duration tokens
    44  // since these token types can have different literal representations.
    45  func (s *Scanner) Scan() (tok Token, pos Pos, lit string) {
    46  	tok, pos, lit = s.scan()
    47  	if tok != WS {
    48  		s.lastNonWSToken = tok
    49  	}
    50  	return
    51  }
    52  
    53  func (s *Scanner) scan() (tok Token, pos Pos, lit string) {
    54  	// Read next code point.
    55  	ch0, pos := s.r.read()
    56  
    57  	// If we see whitespace then consume all contiguous whitespace.
    58  	// If we see a letter, or certain acceptable special characters, then consume
    59  	// as an ident or reserved word.
    60  	if isWhitespace(ch0) {
    61  		return s.scanWhitespace()
    62  	} else if isLetter(ch0) || ch0 == '_' {
    63  		s.r.unread()
    64  		return s.scanIdent()
    65  	} else if isDigit(ch0) {
    66  		return s.scanNumber()
    67  	}
    68  
    69  	// Otherwise parse individual characters.
    70  	switch ch0 {
    71  	case eof:
    72  		return EOF, pos, ""
    73  	case '`':
    74  		s.r.unread()
    75  		return s.scanIdent()
    76  	case '"':
    77  		return s.scanString()
    78  	case '\'':
    79  		return s.scanString()
    80  	case '.':
    81  		ch1, _ := s.r.read()
    82  		s.r.unread()
    83  		if isDigit(ch1) {
    84  			return s.scanNumber()
    85  		}
    86  		return DOT, pos, ""
    87  	case '+', '-':
    88  		if (s.lastNonWSToken > literal_beg && s.lastNonWSToken < literal_end) || s.lastNonWSToken == RPAREN {
    89  			if ch0 == '+' {
    90  				return ADD, pos, ""
    91  			}
    92  			return SUB, pos, ""
    93  		}
    94  		return s.scanNumber()
    95  	case '*':
    96  		return MUL, pos, ""
    97  	case '/':
    98  		return DIV, pos, ""
    99  	case '%':
   100  		return MOD, pos, ""
   101  	case '=':
   102  		return EQ, pos, ""
   103  	case '~':
   104  		return BITWISE_NOT, pos, ""
   105  	case '|':
   106  		return BITWISE_OR, pos, ""
   107  	case '&':
   108  		return BITWISE_AND, pos, ""
   109  	case '^':
   110  		return BITWISE_XOR, pos, ""
   111  	case '!':
   112  		if ch1, _ := s.r.read(); ch1 == '=' {
   113  			return NEQ, pos, ""
   114  		}
   115  		s.r.unread()
   116  		return EXCLAMATION, pos, ""
   117  	case '>':
   118  		ch1, _ := s.r.read()
   119  		if ch1 == '=' {
   120  			return GTE, pos, ""
   121  		} else if ch1 == '>' {
   122  			return BITWISE_RIGHT_SHIFT, pos, ""
   123  		}
   124  		s.r.unread()
   125  		return GT, pos, ""
   126  	case '<':
   127  		ch1, _ := s.r.read()
   128  		if ch1 == '=' {
   129  			return LTE, pos, ""
   130  		} else if ch1 == '>' {
   131  			return NEQ, pos, ""
   132  		} else if ch1 == '<' {
   133  			return BITWISE_LEFT_SHIFT, pos, ""
   134  		}
   135  		s.r.unread()
   136  		return LT, pos, ""
   137  	case '(':
   138  		return LPAREN, pos, ""
   139  	case ')':
   140  		return RPAREN, pos, ""
   141  	case ',':
   142  		return COMMA, pos, ""
   143  	}
   144  
   145  	return ILLEGAL, pos, string(ch0)
   146  }
   147  
   148  // scanWhitespace consumes the current rune and all contiguous whitespace.
   149  func (s *Scanner) scanWhitespace() (tok Token, pos Pos, lit string) {
   150  	// Create a buffer and read the current character into it.
   151  	var buf bytes.Buffer
   152  	ch, pos := s.r.curr()
   153  	_, _ = buf.WriteRune(ch)
   154  
   155  	// Read every subsequent whitespace character into the buffer.
   156  	// Non-whitespace characters and EOF will cause the loop to exit.
   157  	for {
   158  		ch, _ = s.r.read()
   159  		if ch == eof {
   160  			break
   161  		} else if !isWhitespace(ch) {
   162  			s.r.unread()
   163  			break
   164  		} else {
   165  			_, _ = buf.WriteRune(ch)
   166  		}
   167  	}
   168  
   169  	return WS, pos, buf.String()
   170  }
   171  
   172  func (s *Scanner) scanIdent() (tok Token, pos Pos, lit string) {
   173  	// Save the starting position of the identifier.
   174  	_, pos = s.r.read()
   175  	s.r.unread()
   176  
   177  	var buf bytes.Buffer
   178  	for {
   179  		if ch, _ := s.r.read(); ch == eof {
   180  			break
   181  		} else if ch == '`' {
   182  			tok0, pos0, lit0 := s.scanString()
   183  			if tok0 == BADSTRING || tok0 == BADESCAPE {
   184  				return tok0, pos0, lit0
   185  			}
   186  			return IDENT, pos, lit0
   187  		} else if isIdentChar(ch) {
   188  			s.r.unread()
   189  			buf.WriteString(ScanBareIdent(s.r))
   190  		} else {
   191  			s.r.unread()
   192  			break
   193  		}
   194  	}
   195  	lit = buf.String()
   196  
   197  	// If the literal matches a keyword then return that keyword.
   198  	if tok = Lookup(lit); tok != IDENT {
   199  		return tok, pos, ""
   200  	}
   201  
   202  	return IDENT, pos, lit
   203  }
   204  
   205  // scanString consumes a contiguous string of non-quote characters.
   206  // Quote characters can be consumed if they're first escaped with a backslash.
   207  func (s *Scanner) scanString() (tok Token, pos Pos, lit string) {
   208  	s.r.unread()
   209  	_, pos = s.r.curr()
   210  
   211  	var err error
   212  	lit, err = ScanString(s.r)
   213  	if err == errBadString {
   214  		return BADSTRING, pos, lit
   215  	} else if err == errBadEscape {
   216  		_, pos = s.r.curr()
   217  		return BADESCAPE, pos, lit
   218  	}
   219  	return STRING, pos, lit
   220  }
   221  
   222  // scanNumber consumes anything that looks like the start of a number.
   223  // Numbers start with a digit, full stop, plus sign or minus sign.
   224  // This function can return non-number tokens if a scan is a false positive.
   225  // For example, a minus sign followed by a letter will just return a minus sign.
   226  func (s *Scanner) scanNumber() (tok Token, pos Pos, lit string) {
   227  	var buf bytes.Buffer
   228  
   229  	// Check if the initial rune is a "+" or "-".
   230  	ch, pos := s.r.curr()
   231  	if ch == '+' || ch == '-' {
   232  		buf.WriteRune(ch)
   233  	} else {
   234  		s.r.unread()
   235  	}
   236  
   237  	// Read as many digits as possible.
   238  	digits := s.scanDigits()
   239  	buf.WriteString(digits)
   240  	integralDigitLength := len(digits)
   241  
   242  	ch, _ = s.r.read()
   243  	if ch == 'x' {
   244  		buf.WriteRune(ch)
   245  		if digits != "0" {
   246  			return ILLEGAL, pos, buf.String()
   247  		}
   248  		hexChars := s.scanHexChars()
   249  		buf.WriteString(hexChars)
   250  		if len(hexChars) == 0 || len(hexChars)%2 != 0 {
   251  			return ILLEGAL, pos, buf.String()
   252  		}
   253  		return NUMBER, pos, buf.String()
   254  	}
   255  
   256  	if ch == '.' {
   257  		buf.WriteRune(ch)
   258  		buf.WriteString(s.scanDigits())
   259  		ch, _ = s.r.read()
   260  	} else if integralDigitLength == 0 {
   261  		s.r.unread()
   262  
   263  		if buf.String() == "-" {
   264  			return UNARY_MINUS, pos, ""
   265  		}
   266  
   267  		return ILLEGAL, pos, buf.String()
   268  	}
   269  
   270  	if ch == 'e' || ch == 'E' {
   271  		buf.WriteRune(ch)
   272  		digits = s.scanDigits()
   273  		buf.WriteString(digits)
   274  		if len(digits) == 0 {
   275  			return ILLEGAL, pos, buf.String()
   276  		}
   277  	} else {
   278  		s.r.unread()
   279  	}
   280  
   281  	return NUMBER, pos, buf.String()
   282  }
   283  
   284  // scanDigits consume a contiguous series of digits.
   285  func (s *Scanner) scanDigits() string {
   286  	var buf bytes.Buffer
   287  	for {
   288  		ch, _ := s.r.read()
   289  		if !isDigit(ch) {
   290  			s.r.unread()
   291  			break
   292  		}
   293  		_, _ = buf.WriteRune(ch)
   294  	}
   295  	return buf.String()
   296  }
   297  
   298  func (s *Scanner) scanHexChars() string {
   299  	var buf bytes.Buffer
   300  	for {
   301  		ch, _ := s.r.read()
   302  		if !isHexChar(ch) {
   303  			s.r.unread()
   304  			break
   305  		}
   306  		_, _ = buf.WriteRune(ch)
   307  	}
   308  	return buf.String()
   309  }
   310  
   311  // isWhitespace returns true if the rune is a space, tab, or newline.
   312  func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' }
   313  
   314  // isLetter returns true if the rune is a letter.
   315  func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') }
   316  
   317  // isDigit returns true if the rune is a digit.
   318  func isDigit(ch rune) bool { return (ch >= '0' && ch <= '9') }
   319  
   320  // isIdentChar returns true if the rune can be used in an unquoted identifier.
   321  func isIdentChar(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' }
   322  
   323  func isHexChar(ch rune) bool {
   324  	return isDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')
   325  }
   326  
   327  // isIdentFirstChar returns true if the rune can be used as the first char in an unquoted identifer.
   328  func isIdentFirstChar(ch rune) bool { return isLetter(ch) || ch == '_' }
   329  
// bufScanner represents a wrapper for scanner to add a buffer.
// It provides a fixed-length circular buffer that can be unread.
type bufScanner struct {
	s   *Scanner
	i   int // index of the most recently filled buffer slot
	n   int // number of unscanned tokens waiting to be replayed
	// buf is the ring of recently scanned tokens; curr indexes it with i-n.
	buf [3]struct {
		tok Token
		pos Pos
		lit string
	}
}
   342  
   343  // newBufScanner returns a new buffered scanner for a reader.
   344  func newBufScanner(r io.Reader) *bufScanner {
   345  	return &bufScanner{s: NewScanner(r)}
   346  }
   347  
   348  // Scan reads the next token from the scanner.
   349  func (s *bufScanner) Scan() (tok Token, pos Pos, lit string) {
   350  	return s.scanFunc(s.s.Scan)
   351  }
   352  
   353  // scanFunc uses the provided function to scan the next token.
   354  func (s *bufScanner) scanFunc(scan func() (Token, Pos, string)) (tok Token, pos Pos, lit string) {
   355  	// If we have unread tokens then read them off the buffer first.
   356  	if s.n > 0 {
   357  		s.n--
   358  		return s.curr()
   359  	}
   360  
   361  	// Move buffer position forward and save the token.
   362  	s.i = (s.i + 1) % len(s.buf)
   363  	buf := &s.buf[s.i]
   364  	buf.tok, buf.pos, buf.lit = scan()
   365  
   366  	return s.curr()
   367  }
   368  
// Unscan pushes the previously read token back onto the buffer, so that the
// next Scan returns it again. The pending count is not bounded here.
func (s *bufScanner) Unscan() { s.n++ }
   371  
   372  // curr returns the last read token.
   373  func (s *bufScanner) curr() (tok Token, pos Pos, lit string) {
   374  	buf := &s.buf[(s.i-s.n+len(s.buf))%len(s.buf)]
   375  	return buf.tok, buf.pos, buf.lit
   376  }
   377  
// reader represents a buffered rune reader used by the scanner.
// It provides a fixed-length circular buffer that can be unread.
type reader struct {
	r   io.RuneScanner
	i   int // index of the most recently filled buffer slot
	n   int // number of unread runes waiting to be replayed
	pos Pos // position that read assigns to the next rune taken from r
	// buf is the ring of recently read runes with their positions.
	buf [3]struct {
		ch  rune
		pos Pos
	}
	eof bool // true if reader has ever seen eof.
}
   391  
   392  // ReadRune reads the next rune from the reader.
   393  // This is a wrapper function to implement the io.RuneReader interface.
   394  // Note that this function does not return size.
   395  func (r *reader) ReadRune() (ch rune, size int, err error) {
   396  	ch, _ = r.read()
   397  	if ch == eof {
   398  		err = io.EOF
   399  	}
   400  	return
   401  }
   402  
// UnreadRune pushes the previously read rune back onto the buffer.
// This is a wrapper function to implement the io.RuneScanner interface.
// It always returns nil.
func (r *reader) UnreadRune() error {
	r.unread()
	return nil
}
   409  
   410  // read reads the next rune from the reader.
   411  func (r *reader) read() (ch rune, pos Pos) {
   412  	// If we have unread characters then read them off the buffer first.
   413  	if r.n > 0 {
   414  		r.n--
   415  		return r.curr()
   416  	}
   417  
   418  	// Read next rune from underlying reader.
   419  	// Any error (including io.EOF) should return as EOF.
   420  	ch, _, err := r.r.ReadRune()
   421  	if err != nil {
   422  		ch = eof
   423  	} else if ch == '\r' {
   424  		if ch, _, err := r.r.ReadRune(); err != nil {
   425  			// nop
   426  		} else if ch != '\n' {
   427  			_ = r.r.UnreadRune()
   428  		}
   429  		ch = '\n'
   430  	}
   431  
   432  	// Save character and position to the buffer.
   433  	r.i = (r.i + 1) % len(r.buf)
   434  	buf := &r.buf[r.i]
   435  	buf.ch, buf.pos = ch, r.pos
   436  
   437  	// Update position.
   438  	// Only count EOF once.
   439  	if ch == '\n' {
   440  		r.pos.Line++
   441  		r.pos.Char = 0
   442  	} else if !r.eof {
   443  		r.pos.Char++
   444  	}
   445  
   446  	// Mark the reader as EOF.
   447  	// This is used so we don't double count EOF characters.
   448  	if ch == eof {
   449  		r.eof = true
   450  	}
   451  
   452  	return r.curr()
   453  }
   454  
// unread pushes the previously read rune back onto the buffer by growing the
// pending count; the next read replays it instead of touching the underlying
// reader. The count is not checked against the buffer length here.
func (r *reader) unread() {
	r.n++
}
   459  
   460  // curr returns the last read character and position.
   461  func (r *reader) curr() (ch rune, pos Pos) {
   462  	i := (r.i - r.n + len(r.buf)) % len(r.buf)
   463  	buf := &r.buf[i]
   464  	return buf.ch, buf.pos
   465  }
   466  
// eof is a marker code point to signify that the reader can't read any more.
// read substitutes it for any error from the underlying reader; because it is
// the NUL rune, a literal NUL in the input is treated the same way.
const eof = rune(0)
   469  
   470  func ScanDelimited(r io.RuneScanner, start, end rune, escapes map[rune]rune, escapesPassThru bool) ([]byte, error) {
   471  	// Scan start delimiter.
   472  	if ch, _, err := r.ReadRune(); err != nil {
   473  		return nil, err
   474  	} else if ch != start {
   475  		return nil, fmt.Errorf("expected %s; found %s", string(start), string(ch))
   476  	}
   477  
   478  	var buf bytes.Buffer
   479  	for {
   480  		ch0, _, err := r.ReadRune()
   481  		if ch0 == end {
   482  			return buf.Bytes(), nil
   483  		} else if err != nil {
   484  			return buf.Bytes(), err
   485  		} else if ch0 == '\n' {
   486  			return nil, errors.New("delimited text contains new line")
   487  		} else if ch0 == '\\' {
   488  			// If the next character is an escape then write the escaped char.
   489  			// If it's not a valid escape then return an error.
   490  			ch1, _, err := r.ReadRune()
   491  			if err != nil {
   492  				return nil, err
   493  			}
   494  
   495  			c, ok := escapes[ch1]
   496  			if !ok {
   497  				if escapesPassThru {
   498  					// Unread ch1 (char after the \)
   499  					_ = r.UnreadRune()
   500  					// Write ch0 (\) to the output buffer.
   501  					_, _ = buf.WriteRune(ch0)
   502  					continue
   503  				} else {
   504  					buf.Reset()
   505  					_, _ = buf.WriteRune(ch0)
   506  					_, _ = buf.WriteRune(ch1)
   507  					return buf.Bytes(), errBadEscape
   508  				}
   509  			}
   510  
   511  			_, _ = buf.WriteRune(c)
   512  		} else {
   513  			_, _ = buf.WriteRune(ch0)
   514  		}
   515  	}
   516  }
   517  
   518  // ScanString reads a quoted string from a rune reader.
   519  func ScanString(r io.RuneScanner) (string, error) {
   520  	ending, _, err := r.ReadRune()
   521  	if err != nil {
   522  		return "", errBadString
   523  	}
   524  
   525  	var buf bytes.Buffer
   526  	for {
   527  		ch0, _, err := r.ReadRune()
   528  		if ch0 == ending {
   529  			return buf.String(), nil
   530  		} else if err != nil || ch0 == '\n' {
   531  			return buf.String(), errBadString
   532  		} else if ch0 == '\\' {
   533  			// If the next character is an escape then write the escaped char.
   534  			// If it's not a valid escape then return an error.
   535  			ch1, _, _ := r.ReadRune()
   536  			if ch1 == 'n' {
   537  				_, _ = buf.WriteRune('\n')
   538  			} else if ch1 == '\\' {
   539  				_, _ = buf.WriteRune('\\')
   540  			} else if ch1 == '"' {
   541  				_, _ = buf.WriteRune('"')
   542  			} else if ch1 == '\'' {
   543  				_, _ = buf.WriteRune('\'')
   544  			} else {
   545  				return string(ch0) + string(ch1), errBadEscape
   546  			}
   547  		} else {
   548  			_, _ = buf.WriteRune(ch0)
   549  		}
   550  	}
   551  }
   552  
// errBadString is returned by ScanString when the opening quote cannot be
// read, or when the input ends or a newline occurs before the closing quote.
var errBadString = errors.New("bad string")

// errBadEscape is returned by ScanString and ScanDelimited when a backslash is
// followed by a rune that is not an accepted escape.
var errBadEscape = errors.New("bad escape")
   555  
   556  // ScanBareIdent reads bare identifier from a rune reader.
   557  func ScanBareIdent(r io.RuneScanner) string {
   558  	// Read every ident character into the buffer.
   559  	// Non-ident characters and EOF will cause the loop to exit.
   560  	var buf bytes.Buffer
   561  	for {
   562  		ch, _, err := r.ReadRune()
   563  		if err != nil {
   564  			break
   565  		} else if !isIdentChar(ch) {
   566  			r.UnreadRune()
   567  			break
   568  		} else {
   569  			_, _ = buf.WriteRune(ch)
   570  		}
   571  	}
   572  	return buf.String()
   573  }