github.com/influxdata/influxql@v1.1.0/scanner.go (about)

     1  package influxql
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  )
    10  
    11  // Scanner represents a lexical scanner for InfluxQL.
    12  type Scanner struct {
    13  	r *reader
    14  }
    15  
    16  // NewScanner returns a new instance of Scanner.
    17  func NewScanner(r io.Reader) *Scanner {
    18  	return &Scanner{r: &reader{r: bufio.NewReader(r)}}
    19  }
    20  
    21  // Scan returns the next token and position from the underlying reader.
    22  // Also returns the literal text read for strings, numbers, and duration tokens
    23  // since these token types can have different literal representations.
    24  func (s *Scanner) Scan() (tok Token, pos Pos, lit string) {
    25  	// Read next code point.
    26  	ch0, pos := s.r.read()
    27  
    28  	// If we see whitespace then consume all contiguous whitespace.
    29  	// If we see a letter, or certain acceptable special characters, then consume
    30  	// as an ident or reserved word.
    31  	if isWhitespace(ch0) {
    32  		return s.scanWhitespace()
    33  	} else if isLetter(ch0) || ch0 == '_' {
    34  		s.r.unread()
    35  		return s.scanIdent(true)
    36  	} else if isDigit(ch0) {
    37  		return s.scanNumber()
    38  	}
    39  
    40  	// Otherwise parse individual characters.
    41  	switch ch0 {
    42  	case eof:
    43  		return EOF, pos, ""
    44  	case '"':
    45  		s.r.unread()
    46  		return s.scanIdent(true)
    47  	case '\'':
    48  		return s.scanString()
    49  	case '.':
    50  		ch1, _ := s.r.read()
    51  		s.r.unread()
    52  		if isDigit(ch1) {
    53  			return s.scanNumber()
    54  		}
    55  		return DOT, pos, ""
    56  	case '$':
    57  		tok, _, lit = s.scanIdent(false)
    58  		if tok != IDENT {
    59  			return tok, pos, "$" + lit
    60  		}
    61  		return BOUNDPARAM, pos, "$" + lit
    62  	case '+':
    63  		return ADD, pos, ""
    64  	case '-':
    65  		ch1, _ := s.r.read()
    66  		if ch1 == '-' {
    67  			s.skipUntilNewline()
    68  			return COMMENT, pos, ""
    69  		}
    70  		s.r.unread()
    71  		return SUB, pos, ""
    72  	case '*':
    73  		return MUL, pos, ""
    74  	case '/':
    75  		ch1, _ := s.r.read()
    76  		if ch1 == '*' {
    77  			if err := s.skipUntilEndComment(); err != nil {
    78  				return ILLEGAL, pos, ""
    79  			}
    80  			return COMMENT, pos, ""
    81  		} else {
    82  			s.r.unread()
    83  		}
    84  		return DIV, pos, ""
    85  	case '%':
    86  		return MOD, pos, ""
    87  	case '&':
    88  		return BITWISE_AND, pos, ""
    89  	case '|':
    90  		return BITWISE_OR, pos, ""
    91  	case '^':
    92  		return BITWISE_XOR, pos, ""
    93  	case '=':
    94  		if ch1, _ := s.r.read(); ch1 == '~' {
    95  			return EQREGEX, pos, ""
    96  		}
    97  		s.r.unread()
    98  		return EQ, pos, ""
    99  	case '!':
   100  		if ch1, _ := s.r.read(); ch1 == '=' {
   101  			return NEQ, pos, ""
   102  		} else if ch1 == '~' {
   103  			return NEQREGEX, pos, ""
   104  		}
   105  		s.r.unread()
   106  	case '>':
   107  		if ch1, _ := s.r.read(); ch1 == '=' {
   108  			return GTE, pos, ""
   109  		}
   110  		s.r.unread()
   111  		return GT, pos, ""
   112  	case '<':
   113  		if ch1, _ := s.r.read(); ch1 == '=' {
   114  			return LTE, pos, ""
   115  		} else if ch1 == '>' {
   116  			return NEQ, pos, ""
   117  		}
   118  		s.r.unread()
   119  		return LT, pos, ""
   120  	case '(':
   121  		return LPAREN, pos, ""
   122  	case ')':
   123  		return RPAREN, pos, ""
   124  	case ',':
   125  		return COMMA, pos, ""
   126  	case ';':
   127  		return SEMICOLON, pos, ""
   128  	case ':':
   129  		if ch1, _ := s.r.read(); ch1 == ':' {
   130  			return DOUBLECOLON, pos, ""
   131  		}
   132  		s.r.unread()
   133  		return COLON, pos, ""
   134  	}
   135  
   136  	return ILLEGAL, pos, string(ch0)
   137  }
   138  
   139  // scanWhitespace consumes the current rune and all contiguous whitespace.
   140  func (s *Scanner) scanWhitespace() (tok Token, pos Pos, lit string) {
   141  	// Create a buffer and read the current character into it.
   142  	var buf bytes.Buffer
   143  	ch, pos := s.r.curr()
   144  	_, _ = buf.WriteRune(ch)
   145  
   146  	// Read every subsequent whitespace character into the buffer.
   147  	// Non-whitespace characters and EOF will cause the loop to exit.
   148  	for {
   149  		ch, _ = s.r.read()
   150  		if ch == eof {
   151  			break
   152  		} else if !isWhitespace(ch) {
   153  			s.r.unread()
   154  			break
   155  		} else {
   156  			_, _ = buf.WriteRune(ch)
   157  		}
   158  	}
   159  
   160  	return WS, pos, buf.String()
   161  }
   162  
   163  // skipUntilNewline skips characters until it reaches a newline.
   164  func (s *Scanner) skipUntilNewline() {
   165  	for {
   166  		if ch, _ := s.r.read(); ch == '\n' || ch == eof {
   167  			return
   168  		}
   169  	}
   170  }
   171  
   172  // skipUntilEndComment skips characters until it reaches a '*/' symbol.
   173  func (s *Scanner) skipUntilEndComment() error {
   174  	for {
   175  		if ch1, _ := s.r.read(); ch1 == '*' {
   176  			// We might be at the end.
   177  		star:
   178  			ch2, _ := s.r.read()
   179  			if ch2 == '/' {
   180  				return nil
   181  			} else if ch2 == '*' {
   182  				// We are back in the state machine since we see a star.
   183  				goto star
   184  			} else if ch2 == eof {
   185  				return io.EOF
   186  			}
   187  		} else if ch1 == eof {
   188  			return io.EOF
   189  		}
   190  	}
   191  }
   192  
   193  func (s *Scanner) scanIdent(lookup bool) (tok Token, pos Pos, lit string) {
   194  	// Save the starting position of the identifier.
   195  	_, pos = s.r.read()
   196  	s.r.unread()
   197  
   198  	var buf bytes.Buffer
   199  	for {
   200  		if ch, _ := s.r.read(); ch == eof {
   201  			break
   202  		} else if ch == '"' {
   203  			tok0, pos0, lit0 := s.scanString()
   204  			if tok0 == BADSTRING || tok0 == BADESCAPE {
   205  				return tok0, pos0, lit0
   206  			}
   207  			return IDENT, pos, lit0
   208  		} else if isIdentChar(ch) {
   209  			s.r.unread()
   210  			buf.WriteString(ScanBareIdent(s.r))
   211  		} else {
   212  			s.r.unread()
   213  			break
   214  		}
   215  	}
   216  	lit = buf.String()
   217  
   218  	// If the literal matches a keyword then return that keyword.
   219  	if lookup {
   220  		if tok = Lookup(lit); tok != IDENT {
   221  			return tok, pos, ""
   222  		}
   223  	}
   224  	return IDENT, pos, lit
   225  }
   226  
   227  // scanString consumes a contiguous string of non-quote characters.
   228  // Quote characters can be consumed if they're first escaped with a backslash.
   229  func (s *Scanner) scanString() (tok Token, pos Pos, lit string) {
   230  	s.r.unread()
   231  	_, pos = s.r.curr()
   232  
   233  	var err error
   234  	lit, err = ScanString(s.r)
   235  	if err == errBadString {
   236  		return BADSTRING, pos, lit
   237  	} else if err == errBadEscape {
   238  		_, pos = s.r.curr()
   239  		return BADESCAPE, pos, lit
   240  	}
   241  	return STRING, pos, lit
   242  }
   243  
   244  // ScanRegex consumes a token to find escapes
   245  func (s *Scanner) ScanRegex() (tok Token, pos Pos, lit string) {
   246  	_, pos = s.r.curr()
   247  
   248  	// Start & end sentinels.
   249  	start, end := '/', '/'
   250  	// Valid escape chars.
   251  	escapes := map[rune]rune{'/': '/'}
   252  
   253  	b, err := ScanDelimited(s.r, start, end, escapes, true)
   254  
   255  	if err == errBadEscape {
   256  		_, pos = s.r.curr()
   257  		return BADESCAPE, pos, lit
   258  	} else if err != nil {
   259  		return BADREGEX, pos, lit
   260  	}
   261  	return REGEX, pos, string(b)
   262  }
   263  
   264  // scanNumber consumes anything that looks like the start of a number.
   265  func (s *Scanner) scanNumber() (tok Token, pos Pos, lit string) {
   266  	var buf bytes.Buffer
   267  
   268  	// Check if the initial rune is a ".".
   269  	ch, pos := s.r.curr()
   270  	if ch == '.' {
   271  		// Peek and see if the next rune is a digit.
   272  		ch1, _ := s.r.read()
   273  		s.r.unread()
   274  		if !isDigit(ch1) {
   275  			return ILLEGAL, pos, "."
   276  		}
   277  
   278  		// Unread the full stop so we can read it later.
   279  		s.r.unread()
   280  	} else {
   281  		s.r.unread()
   282  	}
   283  
   284  	// Read as many digits as possible.
   285  	_, _ = buf.WriteString(s.scanDigits())
   286  
   287  	// If next code points are a full stop and digit then consume them.
   288  	isDecimal := false
   289  	if ch0, _ := s.r.read(); ch0 == '.' {
   290  		isDecimal = true
   291  		if ch1, _ := s.r.read(); isDigit(ch1) {
   292  			_, _ = buf.WriteRune(ch0)
   293  			_, _ = buf.WriteRune(ch1)
   294  			_, _ = buf.WriteString(s.scanDigits())
   295  		} else {
   296  			s.r.unread()
   297  		}
   298  	} else {
   299  		s.r.unread()
   300  	}
   301  
   302  	// Read as a duration or integer if it doesn't have a fractional part.
   303  	if !isDecimal {
   304  		// If the next rune is a letter then this is a duration token.
   305  		if ch0, _ := s.r.read(); isLetter(ch0) || ch0 == 'µ' {
   306  			_, _ = buf.WriteRune(ch0)
   307  			for {
   308  				ch1, _ := s.r.read()
   309  				if !isLetter(ch1) && ch1 != 'µ' {
   310  					s.r.unread()
   311  					break
   312  				}
   313  				_, _ = buf.WriteRune(ch1)
   314  			}
   315  
   316  			// Continue reading digits and letters as part of this token.
   317  			for {
   318  				if ch0, _ := s.r.read(); isLetter(ch0) || ch0 == 'µ' || isDigit(ch0) {
   319  					_, _ = buf.WriteRune(ch0)
   320  				} else {
   321  					s.r.unread()
   322  					break
   323  				}
   324  			}
   325  			return DURATIONVAL, pos, buf.String()
   326  		} else {
   327  			s.r.unread()
   328  			return INTEGER, pos, buf.String()
   329  		}
   330  	}
   331  	return NUMBER, pos, buf.String()
   332  }
   333  
   334  // scanDigits consumes a contiguous series of digits.
   335  func (s *Scanner) scanDigits() string {
   336  	var buf bytes.Buffer
   337  	for {
   338  		ch, _ := s.r.read()
   339  		if !isDigit(ch) {
   340  			s.r.unread()
   341  			break
   342  		}
   343  		_, _ = buf.WriteRune(ch)
   344  	}
   345  	return buf.String()
   346  }
   347  
   348  // isWhitespace returns true if the rune is a space, tab, or newline.
   349  func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' }
   350  
   351  // isLetter returns true if the rune is a letter.
   352  func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') }
   353  
   354  // isDigit returns true if the rune is a digit.
   355  func isDigit(ch rune) bool { return (ch >= '0' && ch <= '9') }
   356  
   357  // isIdentChar returns true if the rune can be used in an unquoted identifier.
   358  func isIdentChar(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' }
   359  
   360  // isIdentFirstChar returns true if the rune can be used as the first char in an unquoted identifer.
   361  func isIdentFirstChar(ch rune) bool { return isLetter(ch) || ch == '_' }
   362  
   363  // bufScanner represents a wrapper for scanner to add a buffer.
   364  // It provides a fixed-length circular buffer that can be unread.
   365  type bufScanner struct {
   366  	s   *Scanner
   367  	i   int // buffer index
   368  	n   int // buffer size
   369  	buf [3]struct {
   370  		tok Token
   371  		pos Pos
   372  		lit string
   373  	}
   374  }
   375  
   376  // newBufScanner returns a new buffered scanner for a reader.
   377  func newBufScanner(r io.Reader) *bufScanner {
   378  	return &bufScanner{s: NewScanner(r)}
   379  }
   380  
   381  // Scan reads the next token from the scanner.
   382  func (s *bufScanner) Scan() (tok Token, pos Pos, lit string) {
   383  	return s.scanFunc(s.s.Scan)
   384  }
   385  
   386  // ScanRegex reads a regex token from the scanner.
   387  func (s *bufScanner) ScanRegex() (tok Token, pos Pos, lit string) {
   388  	return s.scanFunc(s.s.ScanRegex)
   389  }
   390  
   391  // scanFunc uses the provided function to scan the next token.
   392  func (s *bufScanner) scanFunc(scan func() (Token, Pos, string)) (tok Token, pos Pos, lit string) {
   393  	// If we have unread tokens then read them off the buffer first.
   394  	if s.n > 0 {
   395  		s.n--
   396  		return s.curr()
   397  	}
   398  
   399  	// Move buffer position forward and save the token.
   400  	s.i = (s.i + 1) % len(s.buf)
   401  	buf := &s.buf[s.i]
   402  	buf.tok, buf.pos, buf.lit = scan()
   403  
   404  	return s.curr()
   405  }
   406  
   407  // Unscan pushes the previously token back onto the buffer.
   408  func (s *bufScanner) Unscan() { s.n++ }
   409  
   410  // curr returns the last read token.
   411  func (s *bufScanner) curr() (tok Token, pos Pos, lit string) {
   412  	buf := &s.buf[(s.i-s.n+len(s.buf))%len(s.buf)]
   413  	return buf.tok, buf.pos, buf.lit
   414  }
   415  
   416  // reader represents a buffered rune reader used by the scanner.
   417  // It provides a fixed-length circular buffer that can be unread.
   418  type reader struct {
   419  	r   io.RuneScanner
   420  	i   int // buffer index
   421  	n   int // buffer char count
   422  	pos Pos // last read rune position
   423  	buf [3]struct {
   424  		ch  rune
   425  		pos Pos
   426  	}
   427  	eof bool // true if reader has ever seen eof.
   428  }
   429  
   430  // ReadRune reads the next rune from the reader.
   431  // This is a wrapper function to implement the io.RuneReader interface.
   432  // Note that this function does not return size.
   433  func (r *reader) ReadRune() (ch rune, size int, err error) {
   434  	ch, _ = r.read()
   435  	if ch == eof {
   436  		err = io.EOF
   437  	}
   438  	return
   439  }
   440  
   441  // UnreadRune pushes the previously read rune back onto the buffer.
   442  // This is a wrapper function to implement the io.RuneScanner interface.
   443  func (r *reader) UnreadRune() error {
   444  	r.unread()
   445  	return nil
   446  }
   447  
   448  // read reads the next rune from the reader.
   449  func (r *reader) read() (ch rune, pos Pos) {
   450  	// If we have unread characters then read them off the buffer first.
   451  	if r.n > 0 {
   452  		r.n--
   453  		return r.curr()
   454  	}
   455  
   456  	// Read next rune from underlying reader.
   457  	// Any error (including io.EOF) should return as EOF.
   458  	ch, _, err := r.r.ReadRune()
   459  	if err != nil {
   460  		ch = eof
   461  	} else if ch == '\r' {
   462  		if ch, _, err := r.r.ReadRune(); err != nil {
   463  			// nop
   464  		} else if ch != '\n' {
   465  			_ = r.r.UnreadRune()
   466  		}
   467  		ch = '\n'
   468  	}
   469  
   470  	// Save character and position to the buffer.
   471  	r.i = (r.i + 1) % len(r.buf)
   472  	buf := &r.buf[r.i]
   473  	buf.ch, buf.pos = ch, r.pos
   474  
   475  	// Update position.
   476  	// Only count EOF once.
   477  	if ch == '\n' {
   478  		r.pos.Line++
   479  		r.pos.Char = 0
   480  	} else if !r.eof {
   481  		r.pos.Char++
   482  	}
   483  
   484  	// Mark the reader as EOF.
   485  	// This is used so we don't double count EOF characters.
   486  	if ch == eof {
   487  		r.eof = true
   488  	}
   489  
   490  	return r.curr()
   491  }
   492  
   493  // unread pushes the previously read rune back onto the buffer.
   494  func (r *reader) unread() {
   495  	r.n++
   496  }
   497  
   498  // curr returns the last read character and position.
   499  func (r *reader) curr() (ch rune, pos Pos) {
   500  	i := (r.i - r.n + len(r.buf)) % len(r.buf)
   501  	buf := &r.buf[i]
   502  	return buf.ch, buf.pos
   503  }
   504  
   505  // eof is a marker code point to signify that the reader can't read any more.
   506  const eof = rune(0)
   507  
   508  // ScanDelimited reads a delimited set of runes
   509  func ScanDelimited(r io.RuneScanner, start, end rune, escapes map[rune]rune, escapesPassThru bool) ([]byte, error) {
   510  	// Scan start delimiter.
   511  	if ch, _, err := r.ReadRune(); err != nil {
   512  		return nil, err
   513  	} else if ch != start {
   514  		return nil, fmt.Errorf("expected %s; found %s", string(start), string(ch))
   515  	}
   516  
   517  	var buf bytes.Buffer
   518  	for {
   519  		ch0, _, err := r.ReadRune()
   520  		if ch0 == end {
   521  			return buf.Bytes(), nil
   522  		} else if err != nil {
   523  			return buf.Bytes(), err
   524  		} else if ch0 == '\n' {
   525  			return nil, errors.New("delimited text contains new line")
   526  		} else if ch0 == '\\' {
   527  			// If the next character is an escape then write the escaped char.
   528  			// If it's not a valid escape then return an error.
   529  			ch1, _, err := r.ReadRune()
   530  			if err != nil {
   531  				return nil, err
   532  			}
   533  
   534  			c, ok := escapes[ch1]
   535  			if !ok {
   536  				if escapesPassThru {
   537  					// Unread ch1 (char after the \)
   538  					_ = r.UnreadRune()
   539  					// Write ch0 (\) to the output buffer.
   540  					_, _ = buf.WriteRune(ch0)
   541  					continue
   542  				} else {
   543  					buf.Reset()
   544  					_, _ = buf.WriteRune(ch0)
   545  					_, _ = buf.WriteRune(ch1)
   546  					return buf.Bytes(), errBadEscape
   547  				}
   548  			}
   549  
   550  			_, _ = buf.WriteRune(c)
   551  		} else {
   552  			_, _ = buf.WriteRune(ch0)
   553  		}
   554  	}
   555  }
   556  
   557  // ScanString reads a quoted string from a rune reader.
   558  func ScanString(r io.RuneScanner) (string, error) {
   559  	ending, _, err := r.ReadRune()
   560  	if err != nil {
   561  		return "", errBadString
   562  	}
   563  
   564  	var buf bytes.Buffer
   565  	for {
   566  		ch0, _, err := r.ReadRune()
   567  		if ch0 == ending {
   568  			return buf.String(), nil
   569  		} else if err != nil || ch0 == '\n' {
   570  			return buf.String(), errBadString
   571  		} else if ch0 == '\\' {
   572  			// If the next character is an escape then write the escaped char.
   573  			// If it's not a valid escape then return an error.
   574  			ch1, _, _ := r.ReadRune()
   575  			if ch1 == 'n' {
   576  				_, _ = buf.WriteRune('\n')
   577  			} else if ch1 == '\\' {
   578  				_, _ = buf.WriteRune('\\')
   579  			} else if ch1 == '"' {
   580  				_, _ = buf.WriteRune('"')
   581  			} else if ch1 == '\'' {
   582  				_, _ = buf.WriteRune('\'')
   583  			} else {
   584  				return string(ch0) + string(ch1), errBadEscape
   585  			}
   586  		} else {
   587  			_, _ = buf.WriteRune(ch0)
   588  		}
   589  	}
   590  }
   591  
   592  var errBadString = errors.New("bad string")
   593  var errBadEscape = errors.New("bad escape")
   594  
   595  // ScanBareIdent reads bare identifier from a rune reader.
   596  func ScanBareIdent(r io.RuneScanner) string {
   597  	// Read every ident character into the buffer.
   598  	// Non-ident characters and EOF will cause the loop to exit.
   599  	var buf bytes.Buffer
   600  	for {
   601  		ch, _, err := r.ReadRune()
   602  		if err != nil {
   603  			break
   604  		} else if !isIdentChar(ch) {
   605  			r.UnreadRune()
   606  			break
   607  		} else {
   608  			_, _ = buf.WriteRune(ch)
   609  		}
   610  	}
   611  	return buf.String()
   612  }
   613  
   614  // IsRegexOp returns true if the operator accepts a regex operand.
   615  func IsRegexOp(t Token) bool {
   616  	return (t == EQREGEX || t == NEQREGEX)
   617  }