github.com/GuanceCloud/cliutils@v1.1.21/filter/lex.go (about) 1 // Unless explicitly stated otherwise all files in this repository are licensed 2 // under the MIT License. 3 // This product includes software developed at Guance Cloud (https://www.guance.com/). 4 // Copyright 2021-present Guance, Inc. 5 6 package filter 7 8 import ( 9 "fmt" 10 "reflect" 11 "strings" 12 "unicode" 13 "unicode/utf8" 14 ) 15 16 type Item struct { 17 Typ ItemType 18 Pos Pos 19 Val string 20 } 21 22 func (i *Item) PositionRange() *PositionRange { 23 return &PositionRange{ 24 Start: i.Pos, 25 End: i.Pos + Pos(len(i.Val)), 26 } 27 } 28 29 func (i Item) String() string { 30 switch { 31 case i.Typ == EOF: 32 return "EOF" 33 case i.Typ == ERROR: 34 return i.Val 35 case i.Typ == ID: 36 return fmt.Sprintf("%q", i.Val) 37 case i.Typ.IsKeyword(): 38 return fmt.Sprintf("<%s>", i.Val) 39 case i.Typ.IsOperator(): 40 return fmt.Sprintf("<op:'%s'>", i.Val) 41 case len(i.Val) > 10: 42 return fmt.Sprintf("%.10q...", i.Val) 43 } 44 return fmt.Sprintf("%q", i.Val) 45 } 46 47 func (i ItemType) IsOperator() bool { return i > operatorsStart && i < operatorsEnd } 48 func (i ItemType) IsKeyword() bool { return i > keywordsStart && i < keywordsEnd } 49 50 type ItemType int 51 52 func (i *ItemType) MarshalJSON() ([]byte, error) { 53 return []byte(fmt.Sprintf(`"%s"`, reflect.ValueOf(i))), nil 54 } 55 56 const ( 57 eof = -1 58 lineComment = "#" 59 Digits = "0123456789" 60 HexDigits = "0123456789abcdefABCDEF" 61 ) 62 63 var ( 64 keywords = map[string]ItemType{ 65 // Keywords. 
66 "and": AND, 67 "as": AS, 68 "asc": ASC, 69 "auto": AUTO, 70 "by": BY, 71 "desc": DESC, 72 73 "match": MATCH, 74 "notmatch": NOT_MATCH, 75 76 "false": FALSE, 77 "filter": FILTER, 78 "identifier": IDENTIFIER, 79 80 "in": IN, 81 82 "notin": NOT_IN, // deprecated 83 "not_in": NOT_IN, // same as notin 84 85 "limit": LIMIT, 86 "link": LINK, 87 "nil": NIL, 88 "null": NULL, 89 "offset": OFFSET, 90 "with": WITH, 91 "or": OR, 92 "order": ORDER, 93 "re": RE, 94 "int": INT, 95 "float": FLOAT, 96 "slimit": SLIMIT, 97 "soffset": SOFFSET, 98 "true": TRUE, 99 "tz": TIMEZONE, 100 } 101 102 ItemTypeStr = map[ItemType]string{ 103 LEFT_PAREN: "(", 104 RIGHT_PAREN: ")", 105 LEFT_BRACE: "{", 106 RIGHT_BRACE: "}", 107 LEFT_BRACKET: "[", 108 RIGHT_BRACKET: "]", 109 COMMA: ",", 110 EQ: "=", 111 COLON: ":", 112 SEMICOLON: ";", 113 SPACE: "<space>", 114 DOT: ".", 115 NAMESPACE: "::", 116 117 SUB: "-", 118 ADD: "+", 119 MUL: "*", 120 MOD: "%", 121 DIV: "/", 122 NEQ: "!=", 123 LTE: "<=", 124 LT: "<", 125 GTE: ">=", 126 GT: ">", 127 POW: "^", 128 AND: "&&", 129 OR: "||", 130 } 131 ) 132 133 func init() { //nolint:gochecknoinits 134 // Add keywords to Item type strings. 135 for s, ty := range keywords { 136 ItemTypeStr[ty] = s 137 } 138 // Special numbers. 
139 keywords["inf"] = NUMBER 140 keywords["nan"] = NUMBER 141 } 142 143 func (i ItemType) String() string { 144 if s, ok := ItemTypeStr[i]; ok { 145 return s 146 } 147 return fmt.Sprintf("<Item %d>", i) 148 } 149 150 func (i Item) desc() string { 151 if _, ok := ItemTypeStr[i.Typ]; ok { 152 return i.String() 153 } 154 if i.Typ == EOF { 155 return i.Typ.desc() 156 } 157 return fmt.Sprintf("%s %s", i.Typ.desc(), i) 158 } 159 160 func (i ItemType) desc() string { 161 switch i { 162 case ERROR: 163 return "error" 164 case EOF: 165 return "end of input" 166 case COMMENT: 167 return "comment" 168 case ID: 169 return "id" 170 case STRING: 171 return "string" 172 case NUMBER: 173 return "number" 174 case DURATION: 175 return "duration" 176 } 177 return fmt.Sprintf("%q", i) 178 } 179 180 // stateFn represents the state of the scanner as a function that returns the next state. 181 type stateFn func(*Lexer) stateFn 182 183 // Pos is the position in a string. 184 // Negative numbers indicate undefined positions. 185 type Pos int 186 187 // Lexer holds the state of the scanner. 188 type Lexer struct { 189 input string // The string being scanned. 190 state stateFn // The next lexing function to enter. 191 pos Pos // Current position in the input. 192 start Pos // Start position of this Item. 193 width Pos // Width of last rune read from input. 194 lastPos Pos // Position of most recent Item returned by NextItem. 195 itemp *Item // Pointer to where the next scanned item should be placed. 196 scannedItem bool // Set to true every time an item is scanned. 197 198 parenDepth int // nested depth of () exprs. 199 braceDepth int // nested depth of {} exprs. 200 bracketDepth int // nested depth of [] exprs. 201 202 stringOpen rune // Quote rune of the string currently being read. 203 backquoteOpen rune // backquote keyworkds and utf8 characters 204 205 // seriesDesc is set when a series description for the testing 206 // language is lexed. 
207 // seriesDesc bool 208 } 209 210 func Lex(input string) *Lexer { 211 l := &Lexer{ 212 input: input, 213 state: lexStatements, 214 } 215 return l 216 } 217 218 // Lexer entry. 219 func lexStatements(l *Lexer) stateFn { 220 if strings.HasPrefix(l.input[l.pos:], lineComment) { 221 return lexLineComment 222 } 223 224 switch r := l.next(); { 225 case r == '.': 226 l.emit(DOT) 227 228 case r == ',': 229 l.emit(COMMA) 230 231 case isSpace(r): 232 return lexSpace 233 234 case r == '*': 235 l.emit(MUL) 236 237 case r == '/': 238 l.emit(DIV) 239 240 case r == '%': 241 l.emit(MOD) 242 243 case r == '+': 244 l.emit(ADD) 245 246 case r == '-': 247 l.emit(SUB) 248 249 case r == '^': 250 l.emit(POW) 251 252 case r == '=': 253 l.emit(EQ) 254 255 case r == ';': 256 l.emit(SEMICOLON) 257 258 case r == '|': 259 if t := l.peek(); t == '|' { 260 l.next() 261 l.emit(OR) 262 } else { 263 // TODO: add bit-or operator 264 return l.errorf("unexpected character `%q' after `!'", r) 265 } 266 267 case r == '&': 268 if t := l.peek(); t == '&' { 269 l.next() 270 l.emit(AND) 271 } else { 272 // TODO: add bit-and operator 273 return l.errorf("unexpected character `%q' after `!'", r) 274 } 275 276 case r == ':': 277 if t := l.peek(); t == ':' && l.bracketDepth == 0 { 278 l.next() 279 l.emit(NAMESPACE) 280 } else { 281 l.emit(COLON) 282 } 283 284 case r == '!': 285 switch nr := l.next(); { 286 case nr == '=': 287 l.emit(NEQ) 288 default: 289 return l.errorf("unexpected character `%q' after `!'", nr) 290 } 291 292 case r == '<': 293 if t := l.peek(); t == '=' { 294 l.next() 295 l.emit(LTE) 296 } else { 297 l.emit(LT) 298 } 299 300 case r == '>': 301 if t := l.peek(); t == '=' { 302 l.next() 303 l.emit(GTE) 304 } else { 305 l.emit(GT) 306 } 307 308 case isDigit(r) || (r == '.' 
&& isDigit(l.peek())): 309 l.backup() 310 return lexNumberOrDuration 311 312 case r == '"' || r == '\'': 313 l.stringOpen = r 314 return lexString 315 316 case r == '`': 317 l.backquoteOpen = r 318 return lexRawString 319 320 case isAlpha(r): 321 l.backup() 322 return lexKeywordOrIdentifier 323 324 case r == '(': 325 l.emit(LEFT_PAREN) 326 l.parenDepth++ 327 return lexStatements 328 329 case r == ')': 330 l.emit(RIGHT_PAREN) 331 l.parenDepth-- 332 if l.parenDepth < 0 { 333 return l.errorf("unexpected right parenthesis %q", r) 334 } 335 return lexStatements 336 337 case r == '{': 338 l.emit(LEFT_BRACE) 339 l.braceDepth++ 340 341 return lexStatements 342 343 case r == '}': 344 l.braceDepth-- 345 346 l.emit(RIGHT_BRACE) 347 return lexStatements 348 349 case r == '[': 350 351 l.bracketDepth++ 352 l.emit(LEFT_BRACKET) 353 354 case r == ']': 355 l.bracketDepth-- 356 l.emit(RIGHT_BRACKET) 357 358 case r == eof: 359 //nolint:gocritic 360 if l.parenDepth != 0 { 361 return l.errorf("unclosed left parenthesis") 362 } else if l.bracketDepth != 0 { 363 return l.errorf("unclosed left bracket") 364 } else if l.braceDepth != 0 { 365 return l.errorf("unclosed left brace") 366 } 367 368 l.emit(EOF) 369 return nil 370 371 default: 372 return l.errorf("unexpected character: %q", r) 373 } 374 return lexStatements 375 } 376 377 // 378 // Other state functions 379 // 380 381 // scan alphanumberic identifier, maybe keyword. 
382 func lexKeywordOrIdentifier(l *Lexer) stateFn { 383 __goon: 384 for { 385 switch r := l.next(); { 386 case isAlphaNumeric(r): 387 // absorb 388 default: 389 l.backup() 390 word := l.input[l.start:l.pos] 391 392 if kw, ok := keywords[strings.ToLower(word)]; ok { 393 l.emit(kw) 394 } else { 395 l.emit(ID) 396 } 397 398 break __goon 399 } 400 } 401 402 return lexStatements 403 } 404 405 func lexSpace(l *Lexer) stateFn { 406 for isSpace(l.peek()) { 407 l.next() 408 } 409 410 l.ignore() 411 return lexStatements 412 } 413 414 func lexNumberOrDuration(l *Lexer) stateFn { 415 if l.scanNumber() { 416 l.emit(NUMBER) 417 return lexStatements 418 } 419 420 if acceptRemainDuration(l) { 421 l.backup() 422 l.emit(DURATION) 423 return lexStatements 424 } 425 426 return l.errorf("bad duration: %q", l.cur()) 427 } 428 429 func lexRawString(l *Lexer) stateFn { 430 __goon: 431 for { 432 switch l.next() { 433 case utf8.RuneError: 434 l.errorf("invalid UTF-8 rune") 435 return lexRawString 436 case eof: 437 l.errorf("unterminated raw string") 438 return lexRawString 439 case l.backquoteOpen: 440 break __goon 441 } 442 } 443 444 l.emit(QUOTED_STRING) 445 return lexStatements 446 } 447 448 func lexLineComment(l *Lexer) stateFn { 449 l.pos += Pos(len(lineComment)) 450 for r := l.next(); !isEOL(r) && r != eof; { 451 r = l.next() 452 } 453 l.backup() 454 l.emit(COMMENT) 455 return lexStatements 456 } 457 458 func lexEscape(l *Lexer) stateFn { 459 ch := l.next() 460 var n int 461 var base, max uint32 462 463 switch ch { 464 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', l.stringOpen, l.backquoteOpen: 465 return lexString 466 case '0', '1', '2', '3', '4', '5', '6', '7': 467 n, base, max = 3, 8, 255 468 case 'x', 'X': 469 ch = l.next() 470 n, base, max = 2, 16, 255 471 case 'u': 472 ch = l.next() 473 n, base, max = 4, 16, unicode.MaxRune 474 case 'U': 475 ch = l.next() 476 n, base, max = 8, 16, unicode.MaxRune 477 case eof: 478 l.errorf("escape squence not terminated") 479 return lexString 
480 default: 481 l.errorf("unknown escape sequence %#U", ch) 482 return lexString 483 } 484 485 var x uint32 486 for n > 0 { 487 d := uint32(digitVal(ch)) 488 if d >= base { 489 if ch == eof { 490 l.errorf("escape sequence not terminated") 491 } 492 l.errorf("illegal character %#U in escape sequence", ch) 493 return lexString 494 } 495 496 x = x*base + d 497 ch = l.next() 498 n-- 499 } 500 501 if x > max || 0xD800 <= x && x < 0xE000 { 502 l.errorf("escape sequence is an invalid Unicode code point") 503 } 504 505 return lexString 506 } 507 508 func lexString(l *Lexer) stateFn { 509 __goon: 510 for { 511 switch l.next() { 512 case '\\': 513 return lexEscape 514 case utf8.RuneError: 515 l.errorf("invalid UTF-8 rune") 516 case eof, '\n': 517 return l.errorf("unterminated quoted string") 518 case l.stringOpen: 519 break __goon 520 } 521 } 522 523 l.emit(STRING) 524 return lexStatements 525 } 526 527 // lexer tool functions. 528 func (l *Lexer) next() rune { 529 if int(l.pos) >= len(l.input) { 530 l.width = 0 531 return eof 532 } 533 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) 534 l.width = Pos(w) 535 l.pos += l.width 536 return r 537 } 538 539 func (l *Lexer) peek() rune { 540 r := l.next() 541 l.backup() 542 return r 543 } 544 545 func (l *Lexer) emit(t ItemType) { 546 *l.itemp = Item{t, l.start, l.input[l.start:l.pos]} 547 548 l.start = l.pos 549 l.scannedItem = true 550 } 551 552 func (l *Lexer) errorf(format string, args ...interface{}) stateFn { 553 *l.itemp = Item{ERROR, l.start, fmt.Sprintf(format, args...)} 554 l.scannedItem = true 555 556 return nil 557 } 558 559 func (l *Lexer) ignore() { 560 l.start = l.pos 561 } 562 563 func (l *Lexer) backup() { l.pos -= l.width } 564 565 func (l *Lexer) accept(valid string) bool { 566 if strings.ContainsRune(valid, l.next()) { 567 return true 568 } 569 l.backup() 570 return false 571 } 572 573 func (l *Lexer) acceptRun(valid string) { 574 for strings.ContainsRune(valid, l.next()) { 575 /* consume */ 576 } 577 
l.backup() 578 } 579 580 func (l *Lexer) NextItem(itemp *Item) { 581 l.scannedItem = false 582 l.itemp = itemp 583 584 if l.state != nil { 585 for !l.scannedItem { 586 l.state = l.state(l) 587 } 588 } else { 589 l.emit(EOF) 590 } 591 592 l.lastPos = l.itemp.Pos 593 } 594 595 func (l *Lexer) cur() string { 596 return l.input[l.start:l.pos] 597 } 598 599 func (l *Lexer) scanNumber() bool { 600 digs := Digits 601 if l.accept("0") && l.accept("xX") { 602 digs = HexDigits 603 } 604 605 l.acceptRun(digs) 606 if l.accept(".") { 607 l.acceptRun(digs) 608 } 609 610 if l.accept("eE") { // scientific notation 611 l.accept("+-") 612 l.acceptRun(Digits) 613 } 614 615 // next things should not be alphanumberic 616 if r := l.peek(); !isAlphaNumeric(r) { 617 return true 618 } 619 620 return false 621 } 622 623 func acceptRemainDuration(l *Lexer) bool { 624 if !l.accept("nusmhdwy") { 625 return false 626 } 627 628 // support for `ms/us/ns` unit, `hs`, `ys` will be caught and parse duration failed 629 l.accept("s") 630 for l.accept(Digits) { // next 2 chars can be another number then a unit: 3m47s 631 for l.accept(Digits) { 632 } 633 634 if !l.accept("nusmhdw") { // NOTE: `y` removed: `y` should always come first in duration string 635 return false 636 } 637 638 l.accept("s") 639 } 640 641 return !isAlphaNumeric(l.next()) 642 } 643 644 // helpers. 
// isAlphaNumeric reports whether r is an ASCII letter, an underscore or an
// ASCII digit.
func isAlphaNumeric(r rune) bool {
	if isDigit(r) {
		return true
	}
	return isAlpha(r)
}

// isAlpha reports whether r is an underscore or an ASCII letter.
func isAlpha(r rune) bool {
	switch {
	case r == '_':
		return true
	case r >= 'a' && r <= 'z':
		return true
	case r >= 'A' && r <= 'Z':
		return true
	default:
		return false
	}
}

// isDigit reports whether r is an ASCII decimal digit.
func isDigit(r rune) bool {
	return r >= '0' && r <= '9'
}

// isSpace reports whether r is a space, tab, line feed or carriage return.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}

// isEOL reports whether r is a line feed or a carriage return.
func isEOL(r rune) bool {
	return r == '\n' || r == '\r'
}

// digitVal returns the numeric value of the hexadecimal digit ch, or 16 when
// ch is not a hexadecimal digit.
func digitVal(ch rune) int {
	if ch >= '0' && ch <= '9' {
		return int(ch - '0')
	}
	if ch >= 'a' && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if ch >= 'A' && ch <= 'F' {
		return int(ch-'A') + 10
	}

	// larger than any legal digit val
	return 16 //nolint:gomnd
}