github.com/GuanceCloud/cliutils@v1.1.21/filter/lex.go (about)

     1  // Unless explicitly stated otherwise all files in this repository are licensed
     2  // under the MIT License.
     3  // This product includes software developed at Guance Cloud (https://www.guance.com/).
     4  // Copyright 2021-present Guance, Inc.
     5  
     6  package filter
     7  
     8  import (
     9  	"fmt"
    10  	"reflect"
    11  	"strings"
    12  	"unicode"
    13  	"unicode/utf8"
    14  )
    15  
// Item is a single lexical token produced by the Lexer.
type Item struct {
	// Typ is the kind of the token.
	Typ ItemType
	// Pos is the offset in the lexer input at which the token starts.
	Pos Pos
	// Val is the token text; for ERROR items it holds the error message.
	Val string
}
    21  
    22  func (i *Item) PositionRange() *PositionRange {
    23  	return &PositionRange{
    24  		Start: i.Pos,
    25  		End:   i.Pos + Pos(len(i.Val)),
    26  	}
    27  }
    28  
    29  func (i Item) String() string {
    30  	switch {
    31  	case i.Typ == EOF:
    32  		return "EOF"
    33  	case i.Typ == ERROR:
    34  		return i.Val
    35  	case i.Typ == ID:
    36  		return fmt.Sprintf("%q", i.Val)
    37  	case i.Typ.IsKeyword():
    38  		return fmt.Sprintf("<%s>", i.Val)
    39  	case i.Typ.IsOperator():
    40  		return fmt.Sprintf("<op:'%s'>", i.Val)
    41  	case len(i.Val) > 10:
    42  		return fmt.Sprintf("%.10q...", i.Val)
    43  	}
    44  	return fmt.Sprintf("%q", i.Val)
    45  }
    46  
    47  func (i ItemType) IsOperator() bool { return i > operatorsStart && i < operatorsEnd }
    48  func (i ItemType) IsKeyword() bool  { return i > keywordsStart && i < keywordsEnd }
    49  
// ItemType identifies the kind of a lexical Item.
type ItemType int
    51  
    52  func (i *ItemType) MarshalJSON() ([]byte, error) {
    53  	return []byte(fmt.Sprintf(`"%s"`, reflect.ValueOf(i))), nil
    54  }
    55  
// Lexer-wide constants.
const (
	// eof is the sentinel rune returned by Lexer.next once the input is exhausted.
	eof = -1
	// lineComment is the prefix that starts a line comment.
	lineComment = "#"
	// Digits is the set of decimal digits accepted while scanning numbers.
	Digits = "0123456789"
	// HexDigits is the set of digits accepted after a 0x/0X prefix.
	HexDigits = "0123456789abcdefABCDEF"
)
    62  
var (
	// keywords maps lower-case keyword spellings to their item types. The
	// lexer lower-cases a scanned word before looking it up here. init also
	// registers "inf" and "nan" as NUMBER.
	keywords = map[string]ItemType{
		// Keywords.
		"and":  AND,
		"as":   AS,
		"asc":  ASC,
		"auto": AUTO,
		"by":   BY,
		"desc": DESC,

		"match":    MATCH,
		"notmatch": NOT_MATCH,

		"false":      FALSE,
		"filter":     FILTER,
		"identifier": IDENTIFIER,

		"in": IN,

		"notin":  NOT_IN, // deprecated
		"not_in": NOT_IN, // same as notin

		"limit":   LIMIT,
		"link":    LINK,
		"nil":     NIL,
		"null":    NULL,
		"offset":  OFFSET,
		"with":    WITH,
		"or":      OR,
		"order":   ORDER,
		"re":      RE,
		"int":     INT,
		"float":   FLOAT,
		"slimit":  SLIMIT,
		"soffset": SOFFSET,
		"true":    TRUE,
		"tz":      TIMEZONE,
	}

	// ItemTypeStr maps item types to their display strings. init adds one
	// entry per keyword, which replaces the AND and OR entries below with
	// the keyword spellings "and" and "or".
	ItemTypeStr = map[ItemType]string{
		LEFT_PAREN:    "(",
		RIGHT_PAREN:   ")",
		LEFT_BRACE:    "{",
		RIGHT_BRACE:   "}",
		LEFT_BRACKET:  "[",
		RIGHT_BRACKET: "]",
		COMMA:         ",",
		EQ:            "=",
		COLON:         ":",
		SEMICOLON:     ";",
		SPACE:         "<space>",
		DOT:           ".",
		NAMESPACE:     "::",

		SUB: "-",
		ADD: "+",
		MUL: "*",
		MOD: "%",
		DIV: "/",
		NEQ: "!=",
		LTE: "<=",
		LT:  "<",
		GTE: ">=",
		GT:  ">",
		POW: "^",
		AND: "&&",
		OR:  "||",
	}
)
   132  
   133  func init() { //nolint:gochecknoinits
   134  	// Add keywords to Item type strings.
   135  	for s, ty := range keywords {
   136  		ItemTypeStr[ty] = s
   137  	}
   138  	// Special numbers.
   139  	keywords["inf"] = NUMBER
   140  	keywords["nan"] = NUMBER
   141  }
   142  
   143  func (i ItemType) String() string {
   144  	if s, ok := ItemTypeStr[i]; ok {
   145  		return s
   146  	}
   147  	return fmt.Sprintf("<Item %d>", i)
   148  }
   149  
   150  func (i Item) desc() string {
   151  	if _, ok := ItemTypeStr[i.Typ]; ok {
   152  		return i.String()
   153  	}
   154  	if i.Typ == EOF {
   155  		return i.Typ.desc()
   156  	}
   157  	return fmt.Sprintf("%s %s", i.Typ.desc(), i)
   158  }
   159  
   160  func (i ItemType) desc() string {
   161  	switch i {
   162  	case ERROR:
   163  		return "error"
   164  	case EOF:
   165  		return "end of input"
   166  	case COMMENT:
   167  		return "comment"
   168  	case ID:
   169  		return "id"
   170  	case STRING:
   171  		return "string"
   172  	case NUMBER:
   173  		return "number"
   174  	case DURATION:
   175  		return "duration"
   176  	}
   177  	return fmt.Sprintf("%q", i)
   178  }
   179  
// stateFn represents the state of the scanner as a function that returns the
// next state. A nil stateFn means scanning has stopped; NextItem then emits
// EOF on every further call.
type stateFn func(*Lexer) stateFn
   182  
// Pos is the position in a string, used as an index into the lexer input.
// Negative numbers indicate undefined positions.
type Pos int
   186  
// Lexer holds the state of the scanner. Create one with Lex and pull tokens
// from it with NextItem.
type Lexer struct {
	input       string  // The string being scanned.
	state       stateFn // The next lexing function to enter.
	pos         Pos     // Current position in the input.
	start       Pos     // Start position of this Item.
	width       Pos     // Width of last rune read from input.
	lastPos     Pos     // Position of most recent Item returned by NextItem.
	itemp       *Item   // Pointer to where the next scanned item should be placed.
	scannedItem bool    // Set to true every time an item is scanned.

	parenDepth   int // nested depth of () exprs.
	braceDepth   int // nested depth of {} exprs.
	bracketDepth int // nested depth of [] exprs.

	stringOpen    rune // Quote rune of the string currently being read.
	backquoteOpen rune // Quote rune of the raw (backquoted) string currently being read.

	// seriesDesc is set when a series description for the testing
	// language is lexed.
	// seriesDesc bool
}
   209  
   210  func Lex(input string) *Lexer {
   211  	l := &Lexer{
   212  		input: input,
   213  		state: lexStatements,
   214  	}
   215  	return l
   216  }
   217  
   218  // Lexer entry.
   219  func lexStatements(l *Lexer) stateFn {
   220  	if strings.HasPrefix(l.input[l.pos:], lineComment) {
   221  		return lexLineComment
   222  	}
   223  
   224  	switch r := l.next(); {
   225  	case r == '.':
   226  		l.emit(DOT)
   227  
   228  	case r == ',':
   229  		l.emit(COMMA)
   230  
   231  	case isSpace(r):
   232  		return lexSpace
   233  
   234  	case r == '*':
   235  		l.emit(MUL)
   236  
   237  	case r == '/':
   238  		l.emit(DIV)
   239  
   240  	case r == '%':
   241  		l.emit(MOD)
   242  
   243  	case r == '+':
   244  		l.emit(ADD)
   245  
   246  	case r == '-':
   247  		l.emit(SUB)
   248  
   249  	case r == '^':
   250  		l.emit(POW)
   251  
   252  	case r == '=':
   253  		l.emit(EQ)
   254  
   255  	case r == ';':
   256  		l.emit(SEMICOLON)
   257  
   258  	case r == '|':
   259  		if t := l.peek(); t == '|' {
   260  			l.next()
   261  			l.emit(OR)
   262  		} else {
   263  			// TODO: add bit-or operator
   264  			return l.errorf("unexpected character `%q' after `!'", r)
   265  		}
   266  
   267  	case r == '&':
   268  		if t := l.peek(); t == '&' {
   269  			l.next()
   270  			l.emit(AND)
   271  		} else {
   272  			// TODO: add bit-and operator
   273  			return l.errorf("unexpected character `%q' after `!'", r)
   274  		}
   275  
   276  	case r == ':':
   277  		if t := l.peek(); t == ':' && l.bracketDepth == 0 {
   278  			l.next()
   279  			l.emit(NAMESPACE)
   280  		} else {
   281  			l.emit(COLON)
   282  		}
   283  
   284  	case r == '!':
   285  		switch nr := l.next(); {
   286  		case nr == '=':
   287  			l.emit(NEQ)
   288  		default:
   289  			return l.errorf("unexpected character `%q' after `!'", nr)
   290  		}
   291  
   292  	case r == '<':
   293  		if t := l.peek(); t == '=' {
   294  			l.next()
   295  			l.emit(LTE)
   296  		} else {
   297  			l.emit(LT)
   298  		}
   299  
   300  	case r == '>':
   301  		if t := l.peek(); t == '=' {
   302  			l.next()
   303  			l.emit(GTE)
   304  		} else {
   305  			l.emit(GT)
   306  		}
   307  
   308  	case isDigit(r) || (r == '.' && isDigit(l.peek())):
   309  		l.backup()
   310  		return lexNumberOrDuration
   311  
   312  	case r == '"' || r == '\'':
   313  		l.stringOpen = r
   314  		return lexString
   315  
   316  	case r == '`':
   317  		l.backquoteOpen = r
   318  		return lexRawString
   319  
   320  	case isAlpha(r):
   321  		l.backup()
   322  		return lexKeywordOrIdentifier
   323  
   324  	case r == '(':
   325  		l.emit(LEFT_PAREN)
   326  		l.parenDepth++
   327  		return lexStatements
   328  
   329  	case r == ')':
   330  		l.emit(RIGHT_PAREN)
   331  		l.parenDepth--
   332  		if l.parenDepth < 0 {
   333  			return l.errorf("unexpected right parenthesis %q", r)
   334  		}
   335  		return lexStatements
   336  
   337  	case r == '{':
   338  		l.emit(LEFT_BRACE)
   339  		l.braceDepth++
   340  
   341  		return lexStatements
   342  
   343  	case r == '}':
   344  		l.braceDepth--
   345  
   346  		l.emit(RIGHT_BRACE)
   347  		return lexStatements
   348  
   349  	case r == '[':
   350  
   351  		l.bracketDepth++
   352  		l.emit(LEFT_BRACKET)
   353  
   354  	case r == ']':
   355  		l.bracketDepth--
   356  		l.emit(RIGHT_BRACKET)
   357  
   358  	case r == eof:
   359  		//nolint:gocritic
   360  		if l.parenDepth != 0 {
   361  			return l.errorf("unclosed left parenthesis")
   362  		} else if l.bracketDepth != 0 {
   363  			return l.errorf("unclosed left bracket")
   364  		} else if l.braceDepth != 0 {
   365  			return l.errorf("unclosed left brace")
   366  		}
   367  
   368  		l.emit(EOF)
   369  		return nil
   370  
   371  	default:
   372  		return l.errorf("unexpected character: %q", r)
   373  	}
   374  	return lexStatements
   375  }
   376  
   377  //
   378  // Other state functions
   379  //
   380  
   381  // scan alphanumberic identifier, maybe keyword.
   382  func lexKeywordOrIdentifier(l *Lexer) stateFn {
   383  __goon:
   384  	for {
   385  		switch r := l.next(); {
   386  		case isAlphaNumeric(r):
   387  			// absorb
   388  		default:
   389  			l.backup()
   390  			word := l.input[l.start:l.pos]
   391  
   392  			if kw, ok := keywords[strings.ToLower(word)]; ok {
   393  				l.emit(kw)
   394  			} else {
   395  				l.emit(ID)
   396  			}
   397  
   398  			break __goon
   399  		}
   400  	}
   401  
   402  	return lexStatements
   403  }
   404  
   405  func lexSpace(l *Lexer) stateFn {
   406  	for isSpace(l.peek()) {
   407  		l.next()
   408  	}
   409  
   410  	l.ignore()
   411  	return lexStatements
   412  }
   413  
   414  func lexNumberOrDuration(l *Lexer) stateFn {
   415  	if l.scanNumber() {
   416  		l.emit(NUMBER)
   417  		return lexStatements
   418  	}
   419  
   420  	if acceptRemainDuration(l) {
   421  		l.backup()
   422  		l.emit(DURATION)
   423  		return lexStatements
   424  	}
   425  
   426  	return l.errorf("bad duration: %q", l.cur())
   427  }
   428  
   429  func lexRawString(l *Lexer) stateFn {
   430  __goon:
   431  	for {
   432  		switch l.next() {
   433  		case utf8.RuneError:
   434  			l.errorf("invalid UTF-8 rune")
   435  			return lexRawString
   436  		case eof:
   437  			l.errorf("unterminated raw string")
   438  			return lexRawString
   439  		case l.backquoteOpen:
   440  			break __goon
   441  		}
   442  	}
   443  
   444  	l.emit(QUOTED_STRING)
   445  	return lexStatements
   446  }
   447  
   448  func lexLineComment(l *Lexer) stateFn {
   449  	l.pos += Pos(len(lineComment))
   450  	for r := l.next(); !isEOL(r) && r != eof; {
   451  		r = l.next()
   452  	}
   453  	l.backup()
   454  	l.emit(COMMENT)
   455  	return lexStatements
   456  }
   457  
   458  func lexEscape(l *Lexer) stateFn {
   459  	ch := l.next()
   460  	var n int
   461  	var base, max uint32
   462  
   463  	switch ch {
   464  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', l.stringOpen, l.backquoteOpen:
   465  		return lexString
   466  	case '0', '1', '2', '3', '4', '5', '6', '7':
   467  		n, base, max = 3, 8, 255
   468  	case 'x', 'X':
   469  		ch = l.next()
   470  		n, base, max = 2, 16, 255
   471  	case 'u':
   472  		ch = l.next()
   473  		n, base, max = 4, 16, unicode.MaxRune
   474  	case 'U':
   475  		ch = l.next()
   476  		n, base, max = 8, 16, unicode.MaxRune
   477  	case eof:
   478  		l.errorf("escape squence not terminated")
   479  		return lexString
   480  	default:
   481  		l.errorf("unknown escape sequence %#U", ch)
   482  		return lexString
   483  	}
   484  
   485  	var x uint32
   486  	for n > 0 {
   487  		d := uint32(digitVal(ch))
   488  		if d >= base {
   489  			if ch == eof {
   490  				l.errorf("escape sequence not terminated")
   491  			}
   492  			l.errorf("illegal character %#U in escape sequence", ch)
   493  			return lexString
   494  		}
   495  
   496  		x = x*base + d
   497  		ch = l.next()
   498  		n--
   499  	}
   500  
   501  	if x > max || 0xD800 <= x && x < 0xE000 {
   502  		l.errorf("escape sequence is an invalid Unicode code point")
   503  	}
   504  
   505  	return lexString
   506  }
   507  
   508  func lexString(l *Lexer) stateFn {
   509  __goon:
   510  	for {
   511  		switch l.next() {
   512  		case '\\':
   513  			return lexEscape
   514  		case utf8.RuneError:
   515  			l.errorf("invalid UTF-8 rune")
   516  		case eof, '\n':
   517  			return l.errorf("unterminated quoted string")
   518  		case l.stringOpen:
   519  			break __goon
   520  		}
   521  	}
   522  
   523  	l.emit(STRING)
   524  	return lexStatements
   525  }
   526  
   527  // lexer tool functions.
   528  func (l *Lexer) next() rune {
   529  	if int(l.pos) >= len(l.input) {
   530  		l.width = 0
   531  		return eof
   532  	}
   533  	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
   534  	l.width = Pos(w)
   535  	l.pos += l.width
   536  	return r
   537  }
   538  
   539  func (l *Lexer) peek() rune {
   540  	r := l.next()
   541  	l.backup()
   542  	return r
   543  }
   544  
   545  func (l *Lexer) emit(t ItemType) {
   546  	*l.itemp = Item{t, l.start, l.input[l.start:l.pos]}
   547  
   548  	l.start = l.pos
   549  	l.scannedItem = true
   550  }
   551  
   552  func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
   553  	*l.itemp = Item{ERROR, l.start, fmt.Sprintf(format, args...)}
   554  	l.scannedItem = true
   555  
   556  	return nil
   557  }
   558  
   559  func (l *Lexer) ignore() {
   560  	l.start = l.pos
   561  }
   562  
   563  func (l *Lexer) backup() { l.pos -= l.width }
   564  
   565  func (l *Lexer) accept(valid string) bool {
   566  	if strings.ContainsRune(valid, l.next()) {
   567  		return true
   568  	}
   569  	l.backup()
   570  	return false
   571  }
   572  
   573  func (l *Lexer) acceptRun(valid string) {
   574  	for strings.ContainsRune(valid, l.next()) {
   575  		/* consume */
   576  	}
   577  	l.backup()
   578  }
   579  
   580  func (l *Lexer) NextItem(itemp *Item) {
   581  	l.scannedItem = false
   582  	l.itemp = itemp
   583  
   584  	if l.state != nil {
   585  		for !l.scannedItem {
   586  			l.state = l.state(l)
   587  		}
   588  	} else {
   589  		l.emit(EOF)
   590  	}
   591  
   592  	l.lastPos = l.itemp.Pos
   593  }
   594  
   595  func (l *Lexer) cur() string {
   596  	return l.input[l.start:l.pos]
   597  }
   598  
   599  func (l *Lexer) scanNumber() bool {
   600  	digs := Digits
   601  	if l.accept("0") && l.accept("xX") {
   602  		digs = HexDigits
   603  	}
   604  
   605  	l.acceptRun(digs)
   606  	if l.accept(".") {
   607  		l.acceptRun(digs)
   608  	}
   609  
   610  	if l.accept("eE") { // scientific notation
   611  		l.accept("+-")
   612  		l.acceptRun(Digits)
   613  	}
   614  
   615  	// next things should not be alphanumberic
   616  	if r := l.peek(); !isAlphaNumeric(r) {
   617  		return true
   618  	}
   619  
   620  	return false
   621  }
   622  
   623  func acceptRemainDuration(l *Lexer) bool {
   624  	if !l.accept("nusmhdwy") {
   625  		return false
   626  	}
   627  
   628  	// support for `ms/us/ns` unit, `hs`, `ys` will be caught and parse duration failed
   629  	l.accept("s")
   630  	for l.accept(Digits) { // next 2 chars can be another number then a unit:  3m47s
   631  		for l.accept(Digits) {
   632  		}
   633  
   634  		if !l.accept("nusmhdw") { // NOTE: `y` removed: `y` should always come first in duration string
   635  			return false
   636  		}
   637  
   638  		l.accept("s")
   639  	}
   640  
   641  	return !isAlphaNumeric(l.next())
   642  }
   643  
   644  // helpers.
   645  func isAlphaNumeric(r rune) bool { return isAlpha(r) || isDigit(r) }
   646  func isAlpha(r rune) bool        { return r == '_' || ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') }
   647  func isDigit(r rune) bool        { return '0' <= r && r <= '9' }
   648  func isSpace(r rune) bool        { return r == ' ' || r == '\t' || r == '\n' || r == '\r' }
   649  func isEOL(r rune) bool          { return r == '\r' || r == '\n' }
   650  
   651  func digitVal(ch rune) int {
   652  	switch {
   653  	case '0' <= ch && ch <= '9':
   654  		return int(ch - '0')
   655  	case 'a' <= ch && ch <= 'f':
   656  		return int(ch - 'a' + 10)
   657  	case 'A' <= ch && ch <= 'F':
   658  		return int(ch - 'A' + 10)
   659  	}
   660  
   661  	// larger than any legal digit val
   662  	return 16 //nolint:gomnd
   663  }