github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/query/graphite/lexer/lexer.go (about) 1 // Copyright (c) 2019 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package lexer 22 23 import ( 24 "fmt" 25 "strings" 26 "unicode/utf8" 27 28 "github.com/m3db/m3/src/query/graphite/graphite" 29 ) 30 31 // TokenType defines the type of identifier recognized by the Lexer. 32 type TokenType int 33 34 const ( 35 // Error is what you get when the lexer fails to grok the input. 36 Error TokenType = iota 37 // Identifier is a symbol confining to C-style variable naming rules. 38 Identifier 39 // Pattern is a regex-ish pattern, accepts the following special chars: [{.*}]. 40 Pattern 41 // Number is a numeral, including floats. 42 Number 43 // String is set of characters wrapped by double quotes. 44 String 45 // LParenthesis is the left parenthesis "(". 46 LParenthesis 47 // RParenthesis is the right parenthesis ")". 48 RParenthesis 49 // NotOperator is the exclamation sign - "!" symbol. 50 NotOperator 51 // Comma is a punctuation mark. 52 Comma 53 // Equal is the "=" symbol. 54 Equal 55 56 // True is Boolean true. 57 True 58 // False is Boolean false. 59 False 60 ) 61 62 func (tt TokenType) String() string { 63 switch tt { 64 case Error: 65 return "Error" 66 case Identifier: 67 return "Identifier" 68 case Pattern: 69 return "Pattern" 70 case Number: 71 return "Number" 72 case String: 73 return "String" 74 case LParenthesis: 75 return "LParenthesis" 76 case RParenthesis: 77 return "RParenthesis" 78 case NotOperator: 79 return "NotOperator" 80 case Comma: 81 return "Comma" 82 case Equal: 83 return "Equal" 84 case True: 85 return "True" 86 case False: 87 return "False" 88 } 89 return fmt.Sprintf("UnknownToken(%d)", int(tt)) 90 } 91 92 var symbols = map[rune]TokenType{ 93 '(': LParenthesis, 94 ')': RParenthesis, 95 '!': NotOperator, 96 ',': Comma, 97 '=': Equal, 98 } 99 100 // Token is a token, doh! 101 type Token struct { 102 tokenType TokenType 103 value string 104 } 105 106 // MustMakeToken is a test function for creating a Token.MustMakeToken. 107 func MustMakeToken(value string) *Token { return &Token{value: value} } 108 109 // TokenType returns the type of token consumed. 110 func (t Token) TokenType() TokenType { 111 return t.tokenType 112 } 113 114 // Value returns the string representation of the token as needed. 115 func (t Token) Value() string { 116 return t.value 117 } 118 119 const ( 120 uppercaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 121 lowercaseLetters = "abcdefghijklmnopqrstuvwxyz" 122 digits = "0123456789" 123 exponentRunes = "eE" 124 identifierStartRunes = uppercaseLetters + lowercaseLetters + "_" + "-" + "$" + ":" + "~" 125 identifierRunes = identifierStartRunes + digits 126 signs = "+-" 127 ) 128 129 // Lexer breaks an input stream into a group of lexical elements. 130 type Lexer struct { 131 tokens chan *Token 132 s string 133 start int 134 pos int 135 width int 136 reservedIdentifiers map[string]TokenType 137 opts Options 138 } 139 140 const ( 141 eof rune = 0 142 ) 143 144 // Options allows for specifying lexer options. 145 type Options struct { 146 EscapeAllNotOnlyQuotes bool 147 } 148 149 // NewLexer returns a lexer and an output channel for tokens. 150 func NewLexer(s string, reservedIdentifiers map[string]TokenType, opts Options) (*Lexer, chan *Token) { 151 tokens := make(chan *Token) 152 return &Lexer{ 153 s: s, 154 tokens: tokens, 155 reservedIdentifiers: reservedIdentifiers, 156 opts: opts, 157 }, tokens 158 } 159 160 // Run consumes the input to produce a token stream. 161 func (l *Lexer) Run() { 162 for l.lex() { 163 } 164 close(l.tokens) 165 } 166 167 func (l *Lexer) lex() bool { 168 l.skipWhitespace() 169 170 r := l.next() 171 if r == eof { 172 return false 173 } 174 175 if r == '"' || r == '\'' { 176 return l.quotedString(r) 177 } 178 179 if r == '+' || r == '-' { 180 return l.positiveOrNegativeNumber() 181 } 182 183 if r == '.' { 184 return l.fractionalOnlyNumber() 185 } 186 187 if strings.ContainsRune(digits, r) { 188 return l.numberOrPattern() 189 } 190 191 if strings.ContainsRune(identifierStartRunes, r) { 192 return l.identifierOrPattern() 193 } 194 195 if strings.ContainsRune("{[*.", r) { 196 l.backup() 197 return l.pattern() 198 } 199 200 sym, ok := symbols[r] 201 if !ok { 202 return l.errorf("unexpected character %c", r) 203 } 204 205 l.emit(sym) 206 return true 207 } 208 209 func (l *Lexer) eof() bool { 210 l.skipWhitespace() 211 return l.pos >= len(l.s) 212 } 213 214 func (l *Lexer) positiveOrNegativeNumber() bool { 215 if !l.acceptRun(digits) { 216 return l.unexpected(digits) 217 } 218 219 if l.accept(".") { 220 return l.fractionalPart() 221 } 222 223 l.emit(Number) 224 return true 225 } 226 227 func (l *Lexer) fractionalOnlyNumber() bool { 228 if !l.acceptRun(digits) { 229 return l.unexpected(digits) 230 } 231 if l.accept(exponentRunes) { 232 return l.exponentPart() 233 } 234 l.emit(Number) 235 return true 236 } 237 238 func (l *Lexer) fractionalPart() bool { 239 l.acceptRun(digits) 240 l.emit(Number) 241 return true 242 } 243 244 func (l *Lexer) exponentPart() bool { 245 l.accept(signs) 246 if !l.acceptRun(digits) { 247 return l.unexpected(digits) 248 } 249 l.emit(Number) 250 return true 251 } 252 253 func (l *Lexer) numberOrPattern() bool { 254 l.acceptRun(digits) 255 if l.accept(".") { 256 return l.fractionalPartOrPattern() 257 } 258 259 r := l.next() 260 if r != eof { 261 l.backup() 262 } 263 if l.accept(exponentRunes) { 264 return l.exponentPart() 265 } 266 if strings.ContainsRune("{[*-"+identifierStartRunes, r) { 267 return l.pattern() 268 } 269 270 l.emit(Number) 271 return true 272 } 273 274 func (l *Lexer) fractionalPartOrPattern() bool { 275 l.acceptRun(digits) 276 277 r := l.next() 278 if r != eof { 279 l.backup() 280 } 281 if l.accept(exponentRunes) { 282 return l.exponentPart() 283 } 284 if strings.ContainsRune("{[*-."+identifierStartRunes, r) { 285 return l.pattern() 286 } 287 288 l.emit(Number) 289 return true 290 } 291 292 func (l *Lexer) identifierOrPattern() bool { 293 l.acceptRun(identifierRunes) 294 295 r := l.next() 296 if r != eof { 297 l.backup() 298 } 299 if strings.ContainsRune("{[*.-", r) { 300 return l.pattern() 301 } 302 303 // Check if identifier is one of the reserved identifiers. 304 for text, identifier := range l.reservedIdentifiers { 305 if strings.ToLower(l.currentVal()) == text { 306 l.emit(identifier) 307 return true 308 } 309 } 310 311 l.emit(Identifier) 312 return true 313 } 314 315 // NB(jayp): initialized by init(). 316 var groupingEndsToStarts = map[rune]rune{} 317 318 var groupingStartsToEnds = map[rune]rune{ 319 '{': '}', 320 '[': ']', 321 } 322 323 func (l *Lexer) pattern() bool { 324 // rune(0) indicates pattern is not in a group. 325 groupStartStack := []rune{rune(0)} 326 for { 327 r := l.next() 328 329 // Start of a group. 330 if _, ok := groupingStartsToEnds[r]; ok { 331 // Start another group. 332 groupStartStack = append(groupStartStack, r) 333 continue 334 } 335 336 // End of a group. 337 if groupStart, ok := groupingEndsToStarts[r]; ok { 338 // Unwind group. 339 if groupStart != groupStartStack[len(groupStartStack)-1] { 340 return l.errorf("encountered unbalanced end of group %c in pattern %s", 341 r, l.currentVal()) 342 } 343 groupStartStack = groupStartStack[:len(groupStartStack)-1] 344 continue 345 } 346 347 if strings.ContainsRune(graphite.ValidIdentifierRunes+".?*", r) { 348 continue 349 } 350 351 // Commas are part of the pattern if they appear in a group 352 if r == ',' && groupStartStack[len(groupStartStack)-1] != 0 { 353 continue 354 } 355 356 // Everything else is the end of the pattern. 357 if groupStartStack[len(groupStartStack)-1] != 0 { 358 return l.errorf("end of pattern %s reached while still in group %c", 359 l.currentVal(), groupStartStack[len(groupStartStack)-1]) 360 } 361 362 if r != eof { 363 l.backup() 364 } 365 l.emit(Pattern) 366 return true 367 } 368 } 369 370 func (l *Lexer) quotedString(quoteMark rune) bool { 371 var s []rune 372 escaped := false 373 for { 374 r := l.next() 375 if r == eof { 376 return l.errorf("reached end of input while processing string %s", l.currentVal()) 377 } 378 379 if !escaped && r == quoteMark { 380 l.emitToken(String, string(s)) 381 l.consumeVal() 382 return true 383 } 384 385 if !escaped && r == '\\' { 386 // TODO: Want to omit this from the output. 387 escaped = true 388 continue 389 } 390 391 // By default we only need escaping for quotes and treat 392 // backslashes as regular backslashes (i.e. for use in regexp 393 // with aliasSub, etc) and as such restore backslash as long not 394 // escaping a quote. 395 restoreBackslash := escaped && r != quoteMark 396 if l.opts.EscapeAllNotOnlyQuotes { 397 // If escaping all characters not just quotes then only restore 398 // backslash if using it for regex group replacement (i.e. "\1"). 399 restoreBackslash = escaped && strings.ContainsRune(digits, r) 400 } 401 if restoreBackslash { 402 // If backslash not being used to escape quote then keep it. 403 s = append(s, '\\') 404 } 405 406 s = append(s, r) 407 escaped = false 408 } 409 } 410 411 func (l *Lexer) unexpected(expected string) bool { 412 r := l.next() 413 l.backup() 414 return l.errorf("expected one of %s, found %c", expected, r) 415 } 416 417 func (l *Lexer) skipWhitespace() { 418 l.acceptRun(" \t\r\n") 419 l.ignore() 420 } 421 422 func (l *Lexer) next() (r rune) { 423 if l.pos >= len(l.s) { 424 l.width = 0 425 return eof 426 } 427 428 r, l.width = utf8.DecodeRuneInString(l.s[l.pos:]) 429 l.pos += l.width 430 return r 431 } 432 433 func (l *Lexer) ignore() { 434 l.start = l.pos 435 } 436 437 func (l *Lexer) backup() { 438 l.pos-- 439 } 440 441 func (l *Lexer) accept(valid string) bool { 442 r := l.next() 443 if r != eof && strings.ContainsRune(valid, r) { 444 return true 445 } 446 447 if r != eof { 448 l.backup() 449 } 450 return false 451 } 452 453 func (l *Lexer) acceptRun(valid string) bool { 454 matched := false 455 456 r := l.next() 457 for strings.ContainsRune(valid, r) && r != eof { 458 matched = true 459 r = l.next() 460 } 461 462 if r != eof { 463 l.backup() 464 } 465 466 return matched 467 } 468 469 func (l *Lexer) currentVal() string { 470 return l.s[l.start:l.pos] 471 } 472 473 func (l *Lexer) consumeVal() string { 474 s := l.currentVal() 475 l.start = l.pos 476 return s 477 } 478 479 func (l *Lexer) emit(tt TokenType) { 480 l.emitToken(tt, l.consumeVal()) 481 } 482 483 func (l *Lexer) emitToken(tt TokenType, val string) { 484 l.tokens <- &Token{ 485 tokenType: tt, 486 value: val, 487 } 488 } 489 490 func (l *Lexer) errorf(msg string, args ...interface{}) bool { 491 l.tokens <- &Token{ 492 tokenType: Error, 493 value: fmt.Sprintf(msg, args...), 494 } 495 return false 496 } 497 498 func init() { 499 for start, end := range groupingStartsToEnds { 500 groupingEndsToStarts[end] = start 501 } 502 }