github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/sql/parser.go (about)

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package sql
    19  
    20  import (
    21  	"strings"
    22  
    23  	"github.com/alecthomas/participle"
    24  	"github.com/alecthomas/participle/lexer"
    25  )
    26  
    27  // Types with custom Capture interface for parsing
    28  
    29  // Boolean is a type for a parsed Boolean literal
    30  type Boolean bool
    31  
    32  // Capture interface used by participle
    33  func (b *Boolean) Capture(values []string) error {
    34  	*b = Boolean(strings.EqualFold(values[0], "true"))
    35  	return nil
    36  }
    37  
    38  // LiteralString is a type for parsed SQL string literals
    39  type LiteralString string
    40  
    41  // Capture interface used by participle
    42  func (ls *LiteralString) Capture(values []string) error {
    43  	// Remove enclosing single quote
    44  	n := len(values[0])
    45  	r := values[0][1 : n-1]
    46  	// Translate doubled quotes
    47  	*ls = LiteralString(strings.ReplaceAll(r, "''", "'"))
    48  	return nil
    49  }
    50  
    51  // LiteralList is a type for parsed SQL lists literals
    52  type LiteralList []string
    53  
    54  // Capture interface used by participle
    55  func (ls *LiteralList) Capture(values []string) error {
    56  	// Remove enclosing parenthesis.
    57  	n := len(values[0])
    58  	r := values[0][1 : n-1]
    59  	// Translate doubled quotes
    60  	*ls = LiteralList(strings.Split(r, ","))
    61  	return nil
    62  }
    63  
    64  // ObjectKey is a type for parsed strings occurring in key paths
    65  type ObjectKey struct {
    66  	Lit *LiteralString `parser:" \"[\" @LitString \"]\""`
    67  	ID  *Identifier    `parser:"| \".\" @@"`
    68  }
    69  
    70  // QuotedIdentifier is a type for parsed strings that are double
    71  // quoted.
    72  type QuotedIdentifier string
    73  
    74  // Capture interface used by participle
    75  func (qi *QuotedIdentifier) Capture(values []string) error {
    76  	// Remove enclosing quotes
    77  	n := len(values[0])
    78  	r := values[0][1 : n-1]
    79  
    80  	// Translate doubled quotes
    81  	*qi = QuotedIdentifier(strings.ReplaceAll(r, `""`, `"`))
    82  	return nil
    83  }
    84  
    85  // Types representing AST of SQL statement. Only SELECT is supported.
    86  
    87  // Select is the top level AST node type
    88  type Select struct {
    89  	Expression *SelectExpression `parser:"\"SELECT\" @@"`
    90  	From       *TableExpression  `parser:"\"FROM\" @@"`
    91  	Where      *Expression       `parser:"( \"WHERE\" @@ )?"`
    92  	Limit      *LitValue         `parser:"( \"LIMIT\" @@ )?"`
    93  }
    94  
    95  // SelectExpression represents the items requested in the select
    96  // statement
    97  type SelectExpression struct {
    98  	All         bool                 `parser:"  @\"*\""`
    99  	Expressions []*AliasedExpression `parser:"| @@ { \",\" @@ }"`
   100  }
   101  
   102  // TableExpression represents the FROM clause
   103  type TableExpression struct {
   104  	Table *JSONPath `parser:"@@"`
   105  	As    string    `parser:"( \"AS\"? @Ident )?"`
   106  }
   107  
   108  // JSONPathElement represents a keypath component
   109  type JSONPathElement struct {
   110  	Key            *ObjectKey `parser:"  @@"`               // ['name'] and .name forms
   111  	Index          *int       `parser:"| \"[\" @Int \"]\""` // [3] form
   112  	ObjectWildcard bool       `parser:"| @\".*\""`          // .* form
   113  	ArrayWildcard  bool       `parser:"| @\"[*]\""`         // [*] form
   114  }
   115  
   116  // JSONPath represents a keypath.
   117  // Instances should be treated idempotent and not change once created.
   118  type JSONPath struct {
   119  	BaseKey  *Identifier        `parser:" @@"`
   120  	PathExpr []*JSONPathElement `parser:"(@@)*"`
   121  
   122  	// Cached values:
   123  	pathString         string
   124  	strippedTableAlias string
   125  	strippedPathExpr   []*JSONPathElement
   126  }
   127  
   128  // AliasedExpression is an expression that can be optionally named
   129  type AliasedExpression struct {
   130  	Expression *Expression `parser:"@@"`
   131  	As         string      `parser:"[ \"AS\" @Ident | \"AS\" @LitString ]"`
   132  }
   133  
   134  // Grammar for Expression
   135  //
   136  // Expression          → AndCondition ("OR" AndCondition)*
   137  // AndCondition        → Condition ("AND" Condition)*
   138  // Condition           → "NOT" Condition | ConditionExpression
   139  // ConditionExpression → ValueExpression ("=" | "<>" | "<=" | ">=" | "<" | ">") ValueExpression
   140  //                     | ValueExpression "LIKE" ValueExpression ("ESCAPE" LitString)?
   141  //                     | ValueExpression ("NOT"? "BETWEEN" ValueExpression "AND" ValueExpression)
   142  //                     | ValueExpression "IN" "(" Expression ("," Expression)* ")"
   143  //                     | ValueExpression
   144  // ValueExpression     → Operand
   145  //
   146  // Operand grammar follows below
   147  
   148  // Expression represents a logical disjunction of clauses
   149  type Expression struct {
   150  	And []*AndCondition `parser:"@@ ( \"OR\" @@ )*"`
   151  }
   152  
   153  // ListExpr represents a literal list with elements as expressions.
   154  type ListExpr struct {
   155  	Elements []*Expression `parser:"\"(\" @@ ( \",\" @@ )* \")\" | \"[\" @@ ( \",\" @@ )* \"]\""`
   156  }
   157  
   158  // AndCondition represents logical conjunction of clauses
   159  type AndCondition struct {
   160  	Condition []*Condition `parser:"@@ ( \"AND\" @@ )*"`
   161  }
   162  
   163  // Condition represents a negation or a condition operand
   164  type Condition struct {
   165  	Operand *ConditionOperand `parser:"  @@"`
   166  	Not     *Condition        `parser:"| \"NOT\" @@"`
   167  }
   168  
   169  // ConditionOperand is a operand followed by an optional operation expression.
   170  type ConditionOperand struct {
   171  	Operand      *Operand      `parser:"@@"`
   172  	ConditionRHS *ConditionRHS `parser:"@@?"`
   173  }
   174  
   175  // ConditionRHS represents the right-hand-side of Compare, Between, In
   176  // or Like expressions.
   177  type ConditionRHS struct {
   178  	Compare *Compare `parser:"  @@"`
   179  	Between *Between `parser:"| @@"`
   180  	In      *In      `parser:"| \"IN\" @@"`
   181  	Like    *Like    `parser:"| @@"`
   182  }
   183  
   184  // Compare represents the RHS of a comparison expression
   185  type Compare struct {
   186  	Operator string   `parser:"@( \"<>\" | \"<=\" | \">=\" | \"=\" | \"<\" | \">\" | \"!=\" | \"IS\" \"NOT\" | \"IS\")"`
   187  	Operand  *Operand `parser:"  @@"`
   188  }
   189  
   190  // Like represents the RHS of a LIKE expression
   191  type Like struct {
   192  	Not        bool     `parser:" @\"NOT\"? "`
   193  	Pattern    *Operand `parser:" \"LIKE\" @@ "`
   194  	EscapeChar *Operand `parser:" (\"ESCAPE\" @@)? "`
   195  }
   196  
   197  // Between represents the RHS of a BETWEEN expression
   198  type Between struct {
   199  	Not   bool     `parser:" @\"NOT\"? "`
   200  	Start *Operand `parser:" \"BETWEEN\" @@ "`
   201  	End   *Operand `parser:" \"AND\" @@ "`
   202  }
   203  
   204  // In represents the RHS of an IN expression. The RHS can be a list-literal
   205  // (i.e. enclosed in parentheses like `IN (1,2,4)`) or it could be a JSON path
   206  // expression (as in `8.5 IN s.nested[*][*]`). Specifically, it cannot be an
   207  // `Expression` as an expression can never evaluate to a list.
   208  type In struct {
   209  	JPathExpr *JSONPath `parser:"@@"`
   210  	ListExpr  *ListExpr `parser:"| @@"`
   211  }
   212  
   213  // Grammar for Operand:
   214  //
   215  // operand → multOp ( ("-" | "+") multOp )*
   216  // multOp  → unary ( ("/" | "*" | "%") unary )*
   217  // unary   → "-" unary | primary
   218  // primary → Value | Variable | "(" expression ")"
   219  //
   220  
   221  // An Operand is a single term followed by an optional sequence of
   222  // terms separated by +/-
   223  type Operand struct {
   224  	Left  *MultOp     `parser:"@@"`
   225  	Right []*OpFactor `parser:"(@@)*"`
   226  }
   227  
   228  // OpFactor represents the right-side of a +/- operation.
   229  type OpFactor struct {
   230  	Op    string  `parser:"@(\"+\" | \"-\")"`
   231  	Right *MultOp `parser:"@@"`
   232  }
   233  
   234  // MultOp represents a single term followed by an optional sequence of
   235  // terms separated by *, / or % operators.
   236  type MultOp struct {
   237  	Left  *UnaryTerm     `parser:"@@"`
   238  	Right []*OpUnaryTerm `parser:"(@@)*"`
   239  }
   240  
   241  // OpUnaryTerm represents the right side of *, / or % binary operations.
   242  type OpUnaryTerm struct {
   243  	Op    string     `parser:"@(\"*\" | \"/\" | \"%\")"`
   244  	Right *UnaryTerm `parser:"@@"`
   245  }
   246  
   247  // UnaryTerm represents a single negated term or a primary term
   248  type UnaryTerm struct {
   249  	Negated *NegatedTerm `parser:"  @@"`
   250  	Primary *PrimaryTerm `parser:"| @@"`
   251  }
   252  
   253  // NegatedTerm has a leading minus sign.
   254  type NegatedTerm struct {
   255  	Term *PrimaryTerm `parser:"\"-\" @@"`
   256  }
   257  
   258  // PrimaryTerm represents a Value, Path expression, a Sub-expression
   259  // or a function call.
   260  type PrimaryTerm struct {
   261  	Value         *LitValue   `parser:"  @@"`
   262  	JPathExpr     *JSONPath   `parser:"| @@"`
   263  	ListExpr      *ListExpr   `parser:"| @@"`
   264  	SubExpression *Expression `parser:"| \"(\" @@ \")\""`
   265  	// Include function expressions here.
   266  	FuncCall *FuncExpr `parser:"| @@"`
   267  }
   268  
   269  // FuncExpr represents a function call
   270  type FuncExpr struct {
   271  	SFunc     *SimpleArgFunc `parser:"  @@"`
   272  	Count     *CountFunc     `parser:"| @@"`
   273  	Cast      *CastFunc      `parser:"| @@"`
   274  	Substring *SubstringFunc `parser:"| @@"`
   275  	Extract   *ExtractFunc   `parser:"| @@"`
   276  	Trim      *TrimFunc      `parser:"| @@"`
   277  	DateAdd   *DateAddFunc   `parser:"| @@"`
   278  	DateDiff  *DateDiffFunc  `parser:"| @@"`
   279  
   280  	// Used during evaluation for aggregation funcs
   281  	aggregate *aggVal
   282  }
   283  
   284  // SimpleArgFunc represents functions with simple expression
   285  // arguments.
   286  type SimpleArgFunc struct {
   287  	FunctionName string `parser:" @(\"AVG\" | \"MAX\" | \"MIN\" | \"SUM\" |  \"COALESCE\" | \"NULLIF\" | \"TO_STRING\" | \"TO_TIMESTAMP\" | \"UTCNOW\" | \"CHAR_LENGTH\" | \"CHARACTER_LENGTH\" | \"LOWER\" | \"UPPER\") "`
   288  
   289  	ArgsList []*Expression `parser:"\"(\" (@@ (\",\" @@)*)?\")\""`
   290  }
   291  
   292  // CountFunc represents the COUNT sql function
   293  type CountFunc struct {
   294  	StarArg bool        `parser:" \"COUNT\" \"(\" ( @\"*\"?"`
   295  	ExprArg *Expression `parser:" @@? )! \")\""`
   296  }
   297  
   298  // CastFunc represents CAST sql function
   299  type CastFunc struct {
   300  	Expr     *Expression `parser:" \"CAST\" \"(\" @@ "`
   301  	CastType string      `parser:" \"AS\" @(\"BOOL\" | \"INT\" | \"INTEGER\" | \"STRING\" | \"FLOAT\" | \"DECIMAL\" | \"NUMERIC\" | \"TIMESTAMP\") \")\" "`
   302  }
   303  
   304  // SubstringFunc represents SUBSTRING sql function
   305  type SubstringFunc struct {
   306  	Expr *PrimaryTerm `parser:" \"SUBSTRING\" \"(\" @@ "`
   307  	From *Operand     `parser:" ( \"FROM\" @@ "`
   308  	For  *Operand     `parser:"   (\"FOR\" @@)? \")\" "`
   309  	Arg2 *Operand     `parser:" | \",\" @@ "`
   310  	Arg3 *Operand     `parser:"   (\",\" @@)? \")\" )"`
   311  }
   312  
   313  // ExtractFunc represents EXTRACT sql function
   314  type ExtractFunc struct {
   315  	Timeword string       `parser:" \"EXTRACT\" \"(\" @( \"YEAR\":Timeword | \"MONTH\":Timeword | \"DAY\":Timeword | \"HOUR\":Timeword | \"MINUTE\":Timeword | \"SECOND\":Timeword | \"TIMEZONE_HOUR\":Timeword | \"TIMEZONE_MINUTE\":Timeword ) "`
   316  	From     *PrimaryTerm `parser:" \"FROM\" @@ \")\" "`
   317  }
   318  
   319  // TrimFunc represents TRIM sql function
   320  type TrimFunc struct {
   321  	TrimWhere *string      `parser:" \"TRIM\" \"(\" ( @( \"LEADING\" | \"TRAILING\" | \"BOTH\" ) "`
   322  	TrimChars *PrimaryTerm `parser:"             @@?  "`
   323  	TrimFrom  *PrimaryTerm `parser:"             \"FROM\" )? @@ \")\" "`
   324  }
   325  
   326  // DateAddFunc represents the DATE_ADD function
   327  type DateAddFunc struct {
   328  	DatePart  string       `parser:" \"DATE_ADD\" \"(\" @( \"YEAR\":Timeword | \"MONTH\":Timeword | \"DAY\":Timeword | \"HOUR\":Timeword | \"MINUTE\":Timeword | \"SECOND\":Timeword ) \",\""`
   329  	Quantity  *Operand     `parser:" @@ \",\""`
   330  	Timestamp *PrimaryTerm `parser:" @@ \")\""`
   331  }
   332  
   333  // DateDiffFunc represents the DATE_DIFF function
   334  type DateDiffFunc struct {
   335  	DatePart   string       `parser:" \"DATE_DIFF\" \"(\" @( \"YEAR\":Timeword | \"MONTH\":Timeword | \"DAY\":Timeword | \"HOUR\":Timeword | \"MINUTE\":Timeword | \"SECOND\":Timeword ) \",\" "`
   336  	Timestamp1 *PrimaryTerm `parser:" @@ \",\" "`
   337  	Timestamp2 *PrimaryTerm `parser:" @@ \")\" "`
   338  }
   339  
   340  // LitValue represents a literal value parsed from the sql
   341  type LitValue struct {
   342  	Float   *float64       `parser:"(  @Float"`
   343  	Int     *float64       `parser:" | @Int"` // To avoid value out of range, use float64 instead
   344  	String  *LiteralString `parser:" | @LitString"`
   345  	Boolean *Boolean       `parser:" | @(\"TRUE\" | \"FALSE\")"`
   346  	Null    bool           `parser:" | @\"NULL\""`
   347  	Missing bool           `parser:" | @\"MISSING\")"`
   348  }
   349  
   350  // Identifier represents a parsed identifier
   351  type Identifier struct {
   352  	Unquoted *string           `parser:"  @Ident"`
   353  	Quoted   *QuotedIdentifier `parser:"| @QuotIdent"`
   354  }
   355  
   356  var (
   357  	sqlLexer = lexer.Must(lexer.Regexp(`(\s+)` +
   358  		`|(?P<Timeword>(?i)\b(?:YEAR|MONTH|DAY|HOUR|MINUTE|SECOND|TIMEZONE_HOUR|TIMEZONE_MINUTE)\b)` +
   359  		`|(?P<Keyword>(?i)\b(?:SELECT|FROM|TOP|DISTINCT|ALL|WHERE|GROUP|BY|HAVING|UNION|MINUS|EXCEPT|INTERSECT|ORDER|LIMIT|OFFSET|TRUE|FALSE|NULL|IS|NOT|ANY|SOME|BETWEEN|AND|OR|LIKE|ESCAPE|AS|IN|BOOL|INT|INTEGER|STRING|FLOAT|DECIMAL|NUMERIC|TIMESTAMP|AVG|COUNT|MAX|MIN|SUM|COALESCE|NULLIF|CAST|DATE_ADD|DATE_DIFF|EXTRACT|TO_STRING|TO_TIMESTAMP|UTCNOW|CHAR_LENGTH|CHARACTER_LENGTH|LOWER|SUBSTRING|TRIM|UPPER|LEADING|TRAILING|BOTH|FOR|MISSING)\b)` +
   360  		`|(?P<Ident>[a-zA-Z_][a-zA-Z0-9_]*)` +
   361  		`|(?P<QuotIdent>"([^"]*("")?)*")` +
   362  		`|(?P<Float>\d*\.\d+([eE][-+]?\d+)?)` +
   363  		`|(?P<Int>\d+)` +
   364  		`|(?P<LitString>'([^']*('')?)*')` +
   365  		`|(?P<Operators><>|!=|<=|>=|\.\*|\[\*\]|[-+*/%,.()=<>\[\]])`,
   366  	))
   367  
   368  	// SQLParser is used to parse SQL statements
   369  	SQLParser = participle.MustBuild(
   370  		&Select{},
   371  		participle.Lexer(sqlLexer),
   372  		participle.CaseInsensitive("Keyword"),
   373  		participle.CaseInsensitive("Timeword"),
   374  	)
   375  )