github.com/m3db/m3@v1.5.0/src/query/graphite/lexer/lexer.go (about)

     1  // Copyright (c) 2019 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package lexer
    22  
    23  import (
    24  	"fmt"
    25  	"strings"
    26  	"unicode/utf8"
    27  
    28  	"github.com/m3db/m3/src/query/graphite/graphite"
    29  )
    30  
    31  // TokenType defines the type of identifier recognized by the Lexer.
    32  type TokenType int
    33  
    34  const (
    35  	// Error is what you get when the lexer fails to grok the input.
    36  	Error TokenType = iota
    37  	// Identifier is a symbol confining to C-style variable naming rules.
    38  	Identifier
    39  	// Pattern is a regex-ish pattern, accepts the following special chars: [{.*}].
    40  	Pattern
    41  	// Number is a numeral, including floats.
    42  	Number
    43  	// String is set of characters wrapped by double quotes.
    44  	String
    45  	// LParenthesis is the left parenthesis "(".
    46  	LParenthesis
    47  	// RParenthesis is the right parenthesis ")".
    48  	RParenthesis
    49  	// NotOperator is the exclamation sign - "!" symbol.
    50  	NotOperator
    51  	// Comma is a punctuation mark.
    52  	Comma
    53  	// Equal is the "=" symbol.
    54  	Equal
    55  
    56  	// True is Boolean true.
    57  	True
    58  	// False is Boolean false.
    59  	False
    60  )
    61  
    62  func (tt TokenType) String() string {
    63  	switch tt {
    64  	case Error:
    65  		return "Error"
    66  	case Identifier:
    67  		return "Identifier"
    68  	case Pattern:
    69  		return "Pattern"
    70  	case Number:
    71  		return "Number"
    72  	case String:
    73  		return "String"
    74  	case LParenthesis:
    75  		return "LParenthesis"
    76  	case RParenthesis:
    77  		return "RParenthesis"
    78  	case NotOperator:
    79  		return "NotOperator"
    80  	case Comma:
    81  		return "Comma"
    82  	case Equal:
    83  		return "Equal"
    84  	case True:
    85  		return "True"
    86  	case False:
    87  		return "False"
    88  	}
    89  	return fmt.Sprintf("UnknownToken(%d)", int(tt))
    90  }
    91  
// symbols maps each single-rune punctuation mark to the token type
// emitted for it by lex.
var symbols = map[rune]TokenType{
	'(': LParenthesis,
	')': RParenthesis,
	'!': NotOperator,
	',': Comma,
	'=': Equal,
}
    99  
   100  // Token is a token, doh!
   101  type Token struct {
   102  	tokenType TokenType
   103  	value     string
   104  }
   105  
   106  // MustMakeToken is a test function for creating a Token.MustMakeToken.
   107  func MustMakeToken(value string) *Token { return &Token{value: value} }
   108  
   109  // TokenType returns the type of token consumed.
   110  func (t Token) TokenType() TokenType {
   111  	return t.tokenType
   112  }
   113  
   114  // Value returns the string representation of the token as needed.
   115  func (t Token) Value() string {
   116  	return t.value
   117  }
   118  
// Rune classes used by the scanning helpers below.
const (
	uppercaseLetters     = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	lowercaseLetters     = "abcdefghijklmnopqrstuvwxyz"
	digits               = "0123456789"
	exponentRunes        = "eE"                                                              // runes that introduce an exponent in a number
	identifierStartRunes = uppercaseLetters + lowercaseLetters + "_" + "-" + "$" + ":" + "~" // runes that may begin an identifier
	identifierRunes      = identifierStartRunes + digits                                     // runes allowed after the first rune of an identifier
	signs                = "+-"                                                              // optional sign runes in an exponent
)
   128  
// Lexer breaks an input stream into a group of lexical elements.
type Lexer struct {
	tokens              chan *Token // destination for emitted tokens
	s                   string      // the input being scanned
	start               int         // byte offset where the current token begins
	pos                 int         // byte offset of the current read position
	width               int         // byte width of the rune last returned by next (0 at eof)
	reservedIdentifiers map[string]TokenType // lowercased identifiers mapped to special token types
	opts                Options
}
   139  
const (
	// eof is the sentinel rune returned by next once input is exhausted.
	eof rune = 0
)
   143  
// Options allows for specifying lexer options.
type Options struct {
	// EscapeAllNotOnlyQuotes makes a backslash inside a quoted string
	// escape every character rather than only quote marks; backslashes
	// before digits are still preserved for regex group replacements
	// such as "\1" (see quotedString).
	EscapeAllNotOnlyQuotes bool
}
   148  
   149  // NewLexer returns a lexer and an output channel for tokens.
   150  func NewLexer(s string, reservedIdentifiers map[string]TokenType, opts Options) (*Lexer, chan *Token) {
   151  	tokens := make(chan *Token)
   152  	return &Lexer{
   153  		s:                   s,
   154  		tokens:              tokens,
   155  		reservedIdentifiers: reservedIdentifiers,
   156  		opts:                opts,
   157  	}, tokens
   158  }
   159  
   160  // Run consumes the input to produce a token stream.
   161  func (l *Lexer) Run() {
   162  	for l.lex() {
   163  	}
   164  	close(l.tokens)
   165  }
   166  
   167  func (l *Lexer) lex() bool {
   168  	l.skipWhitespace()
   169  
   170  	r := l.next()
   171  	if r == eof {
   172  		return false
   173  	}
   174  
   175  	if r == '"' || r == '\'' {
   176  		return l.quotedString(r)
   177  	}
   178  
   179  	if r == '+' || r == '-' {
   180  		return l.positiveOrNegativeNumber()
   181  	}
   182  
   183  	if r == '.' {
   184  		return l.fractionalOnlyNumber()
   185  	}
   186  
   187  	if strings.ContainsRune(digits, r) {
   188  		return l.numberOrPattern()
   189  	}
   190  
   191  	if strings.ContainsRune(identifierStartRunes, r) {
   192  		return l.identifierOrPattern()
   193  	}
   194  
   195  	if strings.ContainsRune("{[*.", r) {
   196  		l.backup()
   197  		return l.pattern()
   198  	}
   199  
   200  	sym, ok := symbols[r]
   201  	if !ok {
   202  		return l.errorf("unexpected character %c", r)
   203  	}
   204  
   205  	l.emit(sym)
   206  	return true
   207  }
   208  
// eof reports whether the lexer has consumed all input, skipping any
// trailing whitespace first.
func (l *Lexer) eof() bool {
	l.skipWhitespace()
	return l.pos >= len(l.s)
}
   213  
   214  func (l *Lexer) positiveOrNegativeNumber() bool {
   215  	if !l.acceptRun(digits) {
   216  		return l.unexpected(digits)
   217  	}
   218  
   219  	if l.accept(".") {
   220  		return l.fractionalPart()
   221  	}
   222  
   223  	l.emit(Number)
   224  	return true
   225  }
   226  
   227  func (l *Lexer) fractionalOnlyNumber() bool {
   228  	if !l.acceptRun(digits) {
   229  		return l.unexpected(digits)
   230  	}
   231  	if l.accept(exponentRunes) {
   232  		return l.exponentPart()
   233  	}
   234  	l.emit(Number)
   235  	return true
   236  }
   237  
   238  func (l *Lexer) fractionalPart() bool {
   239  	l.acceptRun(digits)
   240  	l.emit(Number)
   241  	return true
   242  }
   243  
   244  func (l *Lexer) exponentPart() bool {
   245  	l.accept(signs)
   246  	if !l.acceptRun(digits) {
   247  		return l.unexpected(digits)
   248  	}
   249  	l.emit(Number)
   250  	return true
   251  }
   252  
// numberOrPattern scans a token that began with a digit: it emits a
// Number unless the digits turn out to be the prefix of a glob-style
// pattern (e.g. "10.foo.*"), in which case scanning continues in pattern.
func (l *Lexer) numberOrPattern() bool {
	l.acceptRun(digits)
	if l.accept(".") {
		return l.fractionalPartOrPattern()
	}

	// Peek at the rune after the digits without consuming it.
	r := l.next()
	if r != eof {
		l.backup()
	}
	if l.accept(exponentRunes) {
		return l.exponentPart()
	}
	if strings.ContainsRune("{[*-"+identifierStartRunes, r) {
		return l.pattern()
	}

	l.emit(Number)
	return true
}
   273  
// fractionalPartOrPattern scans what follows "digits." — either the
// fractional part of a Number (with optional exponent) or, if a pattern
// rune comes next (e.g. "1.foo.*"), the continuation of a Pattern.
func (l *Lexer) fractionalPartOrPattern() bool {
	l.acceptRun(digits)

	// Peek at the rune after the digits without consuming it.
	r := l.next()
	if r != eof {
		l.backup()
	}
	if l.accept(exponentRunes) {
		return l.exponentPart()
	}
	if strings.ContainsRune("{[*-."+identifierStartRunes, r) {
		return l.pattern()
	}

	l.emit(Number)
	return true
}
   291  
   292  func (l *Lexer) identifierOrPattern() bool {
   293  	l.acceptRun(identifierRunes)
   294  
   295  	r := l.next()
   296  	if r != eof {
   297  		l.backup()
   298  	}
   299  	if strings.ContainsRune("{[*.-", r) {
   300  		return l.pattern()
   301  	}
   302  
   303  	// Check if identifier is one of the reserved identifiers.
   304  	for text, identifier := range l.reservedIdentifiers {
   305  		if strings.ToLower(l.currentVal()) == text {
   306  			l.emit(identifier)
   307  			return true
   308  		}
   309  	}
   310  
   311  	l.emit(Identifier)
   312  	return true
   313  }
   314  
// groupingEndsToStarts maps a group-closing rune to its opening rune.
// NB(jayp): initialized by init() as the inverse of groupingStartsToEnds.
var groupingEndsToStarts = map[rune]rune{}

// groupingStartsToEnds maps a group-opening rune to its closing rune.
var groupingStartsToEnds = map[rune]rune{
	'{': '}',
	'[': ']',
}
   322  
// pattern consumes a glob-style pattern (e.g. "foo.*.{a,b}[0-9]"),
// tracking {...} and [...] nesting so commas are only accepted inside a
// group. It emits a Pattern token, or an Error token on unbalanced
// grouping.
func (l *Lexer) pattern() bool {
	// rune(0) indicates pattern is not in a group.
	groupStartStack := []rune{rune(0)}
	for {
		r := l.next()

		// Start of a group.
		if _, ok := groupingStartsToEnds[r]; ok {
			// Start another group.
			groupStartStack = append(groupStartStack, r)
			continue
		}

		// End of a group.
		if groupStart, ok := groupingEndsToStarts[r]; ok {
			// Unwind group.
			if groupStart != groupStartStack[len(groupStartStack)-1] {
				return l.errorf("encountered unbalanced end of group %c in pattern %s",
					r, l.currentVal())
			}
			groupStartStack = groupStartStack[:len(groupStartStack)-1]
			continue
		}

		// Ordinary pattern runes are consumed unconditionally.
		if strings.ContainsRune(graphite.ValidIdentifierRunes+".?*", r) {
			continue
		}

		// Commas are part of the pattern if they appear in a group
		if r == ',' && groupStartStack[len(groupStartStack)-1] != 0 {
			continue
		}

		// Everything else is the end of the pattern.
		if groupStartStack[len(groupStartStack)-1] != 0 {
			return l.errorf("end of pattern %s reached while still in group %c",
				l.currentVal(), groupStartStack[len(groupStartStack)-1])
		}

		// Leave the terminating rune (if any) for the next token.
		if r != eof {
			l.backup()
		}
		l.emit(Pattern)
		return true
	}
}
   369  
// quotedString consumes a string delimited by quoteMark (the opening
// quote has already been consumed), handling backslash escapes, and emits
// a String token holding the unquoted contents. Reaching end of input
// before the closing quote emits an Error token.
func (l *Lexer) quotedString(quoteMark rune) bool {
	var s []rune     // accumulated unquoted contents
	escaped := false // true when the previous rune was a pending backslash
	for {
		r := l.next()
		if r == eof {
			return l.errorf("reached end of input while processing string %s", l.currentVal())
		}

		// An unescaped closing quote terminates the string.
		if !escaped && r == quoteMark {
			l.emitToken(String, string(s))
			l.consumeVal()
			return true
		}

		if !escaped && r == '\\' {
			// TODO: Want to omit this from the output.
			escaped = true
			continue
		}

		// By default we only need escaping for quotes and treat
		// backslashes as regular backslashes (i.e. for use in regexp
		// with aliasSub, etc) and as such restore backslash as long not
		// escaping a quote.
		restoreBackslash := escaped && r != quoteMark
		if l.opts.EscapeAllNotOnlyQuotes {
			// If escaping all characters not just quotes then only restore
			// backslash if using it for regex group replacement (i.e. "\1").
			restoreBackslash = escaped && strings.ContainsRune(digits, r)
		}
		if restoreBackslash {
			// If backslash not being used to escape quote then keep it.
			s = append(s, '\\')
		}

		s = append(s, r)
		escaped = false
	}
}
   410  
   411  func (l *Lexer) unexpected(expected string) bool {
   412  	r := l.next()
   413  	l.backup()
   414  	return l.errorf("expected one of %s, found %c", expected, r)
   415  }
   416  
// skipWhitespace consumes and discards any run of spaces, tabs, carriage
// returns and newlines before the next token.
func (l *Lexer) skipWhitespace() {
	l.acceptRun(" \t\r\n")
	l.ignore()
}
   421  
// next decodes and consumes the next rune of input, returning eof once
// the input is exhausted. l.width records the byte width of the rune just
// decoded (0 at eof).
func (l *Lexer) next() (r rune) {
	if l.pos >= len(l.s) {
		l.width = 0
		return eof
	}

	r, l.width = utf8.DecodeRuneInString(l.s[l.pos:])
	l.pos += l.width
	return r
}
   432  
// ignore discards the input consumed so far by moving the token start up
// to the current position.
func (l *Lexer) ignore() {
	l.start = l.pos
}
   436  
   437  func (l *Lexer) backup() {
   438  	l.pos--
   439  }
   440  
   441  func (l *Lexer) accept(valid string) bool {
   442  	r := l.next()
   443  	if r != eof && strings.ContainsRune(valid, r) {
   444  		return true
   445  	}
   446  
   447  	if r != eof {
   448  		l.backup()
   449  	}
   450  	return false
   451  }
   452  
   453  func (l *Lexer) acceptRun(valid string) bool {
   454  	matched := false
   455  
   456  	r := l.next()
   457  	for strings.ContainsRune(valid, r) && r != eof {
   458  		matched = true
   459  		r = l.next()
   460  	}
   461  
   462  	if r != eof {
   463  		l.backup()
   464  	}
   465  
   466  	return matched
   467  }
   468  
// currentVal returns the text of the token being scanned: everything
// consumed since the last emit/ignore.
func (l *Lexer) currentVal() string {
	return l.s[l.start:l.pos]
}
   472  
// consumeVal returns the current token text and marks it consumed, so the
// next token begins at the current position.
func (l *Lexer) consumeVal() string {
	s := l.currentVal()
	l.start = l.pos
	return s
}
   478  
// emit sends a token of type tt, valued at the current token text, to the
// output channel and consumes that text.
func (l *Lexer) emit(tt TokenType) {
	l.emitToken(tt, l.consumeVal())
}
   482  
// emitToken sends a token with the given type and value to the output
// channel; it blocks until the consumer receives it.
func (l *Lexer) emitToken(tt TokenType, val string) {
	l.tokens <- &Token{
		tokenType: tt,
		value:     val,
	}
}
   489  
   490  func (l *Lexer) errorf(msg string, args ...interface{}) bool {
   491  	l.tokens <- &Token{
   492  		tokenType: Error,
   493  		value:     fmt.Sprintf(msg, args...),
   494  	}
   495  	return false
   496  }
   497  
// init derives groupingEndsToStarts as the inverse of
// groupingStartsToEnds so both directions share a single source of truth.
func init() {
	for start, end := range groupingStartsToEnds {
		groupingEndsToStarts[end] = start
	}
}