github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/sql/parser/lexer.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package parser

import (
	"bytes"
	"fmt"
	"strings"

	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/types"
	unimp "github.com/cockroachdb/cockroachdb-parser/pkg/util/errorutil/unimplemented"
	"github.com/cockroachdb/errors"
)

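// lexer sits between the token stream produced by the scanner and the
// yacc-generated parser: Lex hands out one (possibly rewritten) token at a
// time, and Error/setErr record the last error encountered.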
type lexer struct {
	in string
	// tokens contains tokens generated by the scanner.
	tokens []sqlSymType

	// The type that should be used when an INT or SERIAL is encountered.
	nakedIntType *types.T

	// lastPos is the position into the tokens slice of the last
	// token returned by Lex().
	lastPos int

	stmt tree.Statement
	// numPlaceholders is 1 + the highest placeholder index encountered.
	numPlaceholders int
	numAnnotations  tree.AnnotationIdx

	lastError error
}

func (l *lexer) init(sql string, tokens []sqlSymType, nakedIntType *types.T) {
	l.in = sql
	l.tokens = tokens
	l.lastPos = -1
	l.stmt = nil
	l.numPlaceholders = 0
	l.numAnnotations = 0
	l.lastError = nil

	l.nakedIntType = nakedIntType
}

// cleanup is used to avoid holding on to memory unnecessarily (for the cases
// where we reuse a scanner).
func (l *lexer) cleanup() {
	l.tokens = nil
	l.stmt = nil
	l.lastError = nil
}

// Lex lexes a token from input.
func (l *lexer) Lex(lval *sqlSymType) int {
	l.lastPos++
	// The core lexing takes place in the scanner. Here we do a small bit of
	// post-processing of the lexical tokens so that the grammar only requires
	// one-token lookahead despite SQL requiring multi-token lookahead in some
	// cases. These special cases are handled below and the returned tokens are
	// adjusted to reflect the lookahead (LA) that occurred.
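	// A token id of 0 signals end-of-input (EOF) to the yacc-generated
	// parser, which is what we return once we run past the token slice.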
	if l.lastPos >= len(l.tokens) {
		lval.id = 0
		lval.pos = int32(len(l.in))
		lval.str = "EOF"
		return 0
	}
	*lval = l.tokens[l.lastPos]

	switch lval.id {
	case NOTHING:
		// Introducing the "RETURNING NOTHING" syntax in CockroachDB
		// was a terrible idea, given that it is not even used any more!
		// We should really deprecate it and remove this special case.
		if l.lastPos > 0 && l.tokens[l.lastPos-1].id == RETURNING {
			lval.id = NOTHING_AFTER_RETURNING
		}
	case INDEX:
		// The following complex logic is a consternation, really.
		//
		// It flows from a profoundly mistaken decision to allow the INDEX
		// keyword inside the column definition list of CREATE, a place
		// where PostgreSQL did not allow it, for a very good reason:
		// applications legitimately want to name columns with the name
		// "index".
		//
		// After this mistaken decision was first made, the INDEX keyword
		// was also allowed in CockroachDB in another place where it is
		// partially ambiguous with other identifiers: ORDER BY
		// (`ORDER BY INDEX foo@bar`, ambiguous with `ORDER BY index`).
		//
		// Sadly it took a very long time before we realized this mistake,
		// and by that time these uses of INDEX had become legitimate
		// CockroachDB features.
		//
		// We are thus left with the need to disambiguate between:
		//
		// CREATE TABLE t(index a) -- column name "index", column type "a"
		// CREATE TABLE t(index (a)) -- keyword INDEX, column name "a"
		// CREATE TABLE t(index a (b)) -- keyword INDEX, index name "a", column name "b"
		//
		// Thankfully, a coldef for a column named "index" and an index
		// specification differ unambiguously, *given sufficient
		// lookahead*: an index specification always has an open '('
		// after INDEX, with or without an identifier in-between. A column
		// definition never has this.
		//
		// Likewise, between:
		//
		// ORDER BY index
		// ORDER BY index a@idx
		// ORDER BY index a.b@idx
		// ORDER BY index a.b.c@idx
		//
		// We can unambiguously distinguish by the presence of the '@' sign
		// with at most 6 tokens of lookahead.
		//
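		// For example (illustrative): in `CREATE TABLE t (INDEX (a))` the
		// token before INDEX is '(' and the token after it is '(', so the
		// rules below rewrite INDEX to INDEX_BEFORE_PAREN; in
		// `CREATE TABLE t (index a)` INDEX is followed by an identifier with
		// no '(' after it, so the token is left alone and parses as a column
		// named "index".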
		var pprevID, prevID int32
		if l.lastPos > 0 {
			prevID = l.tokens[l.lastPos-1].id
		}
		if l.lastPos > 1 {
			pprevID = l.tokens[l.lastPos-2].id
		}
		var nextID, secondID int32
		if l.lastPos+1 < len(l.tokens) {
			nextID = l.tokens[l.lastPos+1].id
		}
		if l.lastPos+2 < len(l.tokens) {
			secondID = l.tokens[l.lastPos+2].id
		}
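		// Note: token ids at or below 255 are single-character punctuation
		// tokens ('(', ',', '@', ...); keyword and identifier tokens have
		// ids above 255, per the usual yacc token numbering.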
		afterCommaOrParen := prevID == ',' || prevID == '('
		afterCommaOrOPTIONS := prevID == ',' || prevID == OPTIONS
		afterCommaOrParenThenINVERTED := prevID == INVERTED && (pprevID == ',' || pprevID == '(')
		followedByParen := nextID == '('
		followedByNonPunctThenParen := nextID > 255 /* non-punctuation */ && secondID == '('
		if //
		// CREATE ... (INDEX (
		// CREATE ... (x INT, y INT, INDEX (
		(afterCommaOrParen && followedByParen) ||
			// SCRUB ... WITH OPTIONS INDEX (...
			// SCRUB ... WITH OPTIONS a, INDEX (...
			(afterCommaOrOPTIONS && followedByParen) ||
			// CREATE ... (INVERTED INDEX (
			// CREATE ... (x INT, y INT, INVERTED INDEX (
			(afterCommaOrParenThenINVERTED && followedByParen) {
			lval.id = INDEX_BEFORE_PAREN
			break
		}
		if //
		// CREATE ... (INDEX abc (
		// CREATE ... (x INT, y INT, INDEX abc (
		(afterCommaOrParen && followedByNonPunctThenParen) ||
			// CREATE ... (INVERTED INDEX abc (
			// CREATE ... (x INT, y INT, INVERTED INDEX abc (
			(afterCommaOrParenThenINVERTED && followedByNonPunctThenParen) {
			lval.id = INDEX_BEFORE_NAME_THEN_PAREN
			break
		}
		// The rules above all require that the INDEX keyword be
		// followed ultimately by an open parenthesis, with no '@'
		// in-between. The rule below is strictly exclusive with this
		// situation.
		afterCommaOrOrderBy := prevID == ',' || (prevID == BY && pprevID == ORDER)
		if afterCommaOrOrderBy {
			// ORDER BY INDEX <objname> @
			// ORDER BY a, b, INDEX <objname> @
			atSignAfterObjectName := false
			// An object name has one of the following forms:
			//    name
			//    name.name
			//    name.name.name
			// So it is between 1 and 5 tokens in length.
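			// For example, for `ORDER BY INDEX a.b@idx` the loop below sees
			// IDENT '.' IDENT '@' and stops at the '@' within the 6-token
			// window, which triggers the INDEX_AFTER_ORDER_BY_BEFORE_AT
			// rewrite just after it.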
			for i := l.lastPos + 1; i < len(l.tokens) && i < l.lastPos+7; i++ {
				curToken := l.tokens[i].id
				// An object name can only contain keyword/identifiers, and
				// the punctuation '.'.
				if curToken < 255 /* not ident/keyword */ && curToken != '.' && curToken != '@' {
					// Definitely not object name.
					break
				}
				if curToken == '@' {
					if i == l.lastPos+1 {
						/* The '@' cannot follow the INDEX keyword directly. */
						break
					}
					atSignAfterObjectName = true
					break
				}
			}
			if atSignAfterObjectName {
				lval.id = INDEX_AFTER_ORDER_BY_BEFORE_AT
			}
		}

	case NOT, WITH, AS, GENERATED, NULLS, RESET, ROLE, USER, ON, TENANT, CLUSTER, SET:
		nextToken := sqlSymType{}
		if l.lastPos+1 < len(l.tokens) {
			nextToken = l.tokens[l.lastPos+1]
		}
		secondToken := sqlSymType{}
		if l.lastPos+2 < len(l.tokens) {
			secondToken = l.tokens[l.lastPos+2]
		}
		thirdToken := sqlSymType{}
		if l.lastPos+3 < len(l.tokens) {
			thirdToken = l.tokens[l.lastPos+3]
		}

		// If you update these cases, update lex.lookaheadKeywords.
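		// For example, in `SELECT ... AS OF SYSTEM TIME ...` the AS token is
		// rewritten to AS_LA below, so the grammar can tell it apart from a
		// plain column alias with only one token of lookahead.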
		switch lval.id {
		case AS:
			switch nextToken.id {
			case OF:
				switch secondToken.id {
				case SYSTEM:
					lval.id = AS_LA
				}
			}
		case NOT:
			switch nextToken.id {
			case BETWEEN, IN, LIKE, ILIKE, SIMILAR:
				lval.id = NOT_LA
			}
		case GENERATED:
			switch nextToken.id {
			case ALWAYS:
				lval.id = GENERATED_ALWAYS
			case BY:
				lval.id = GENERATED_BY_DEFAULT
			}

		case WITH:
			switch nextToken.id {
			case TIME, ORDINALITY, BUCKET_COUNT:
				lval.id = WITH_LA
			}
		case NULLS:
			switch nextToken.id {
			case FIRST, LAST:
				lval.id = NULLS_LA
			}
		case RESET:
			switch nextToken.id {
			case ALL:
				lval.id = RESET_ALL
			}
		case ROLE:
			switch nextToken.id {
			case ALL:
				lval.id = ROLE_ALL
			}
		case USER:
			switch nextToken.id {
			case ALL:
				lval.id = USER_ALL
			}
		case ON:
			switch nextToken.id {
			case DELETE:
				lval.id = ON_LA
			case UPDATE:
				switch secondToken.id {
				case NO, RESTRICT, CASCADE, SET:
					lval.id = ON_LA
				}
			}
		case TENANT:
			switch nextToken.id {
			case ALL:
				lval.id = TENANT_ALL
			}
		case CLUSTER:
			switch nextToken.id {
			case ALL:
				lval.id = CLUSTER_ALL
			}
		case SET:
			switch nextToken.id {
			case TRACING:
				// Do not use the lookahead rule for `SET tracing.custom ...`
				if secondToken.str != "." {
					lval.id = SET_TRACING
				}
			case SESSION:
				switch secondToken.id {
				case TRACING:
					// Do not use the lookahead rule for `SET SESSION tracing.custom ...`
					if thirdToken.str != "." {
						lval.id = SET_TRACING
					}
				}
			}
		}
	}

	return int(lval.id)
}

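// lastToken returns the last token handed out by Lex, or a synthesized EOF
// token if lexing has run past the end of the input.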
func (l *lexer) lastToken() sqlSymType {
	if l.lastPos < 0 {
		return sqlSymType{}
	}

	if l.lastPos >= len(l.tokens) {
		return sqlSymType{
			id:  0,
			pos: int32(len(l.in)),
			str: "EOF",
		}
	}
	return l.tokens[l.lastPos]
}

// NewAnnotation returns a new annotation index.
func (l *lexer) NewAnnotation() tree.AnnotationIdx {
	l.numAnnotations++
	return l.numAnnotations
}

// SetStmt is called from the parser when the statement is constructed.
func (l *lexer) SetStmt(stmt tree.Statement) {
	l.stmt = stmt
}

// UpdateNumPlaceholders is called from the parser when a placeholder is constructed.
func (l *lexer) UpdateNumPlaceholders(p *tree.Placeholder) {
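	// Note: p.Idx is expected to be the zero-based placeholder index (so $1
	// carries Idx 0), which makes numPlaceholders equal to 1 + the highest
	// placeholder index encountered, as documented on the struct field.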
	if n := int(p.Idx) + 1; l.numPlaceholders < n {
		l.numPlaceholders = n
	}
}

// PurposelyUnimplemented sets lastError to a purposely-unimplemented syntax error.
func (l *lexer) PurposelyUnimplemented(feature string, reason string) {
	// We purposely do not use unimp here, as it appends hints to suggest that
	// the error may be actively tracked as a bug.
	l.lastError = errors.WithHint(
		errors.WithTelemetry(
			pgerror.Newf(pgcode.Syntax, "unimplemented: this syntax"),
			fmt.Sprintf("sql.purposely_unimplemented.%s", feature),
		),
		reason,
	)
	l.populateErrorDetails()
	l.lastError = &tree.UnsupportedError{
		Err:         l.lastError,
		FeatureName: feature,
	}
}

// UnimplementedWithIssue sets lastError to an unimplemented error that
// references the given GitHub issue.
func (l *lexer) UnimplementedWithIssue(issue int) {
	l.lastError = unimp.NewWithIssue(issue, "this syntax")
	l.populateErrorDetails()
	l.lastError = &tree.UnsupportedError{
		Err:         l.lastError,
		FeatureName: fmt.Sprintf("https://github.com/cockroachdb/cockroachdb-parser/issues/%d", issue),
	}
}

// UnimplementedWithIssueDetail sets lastError to an unimplemented error that
// references the given GitHub issue and detail string.
func (l *lexer) UnimplementedWithIssueDetail(issue int, detail string) {
	l.lastError = unimp.NewWithIssueDetail(issue, detail, "this syntax")
	l.populateErrorDetails()
	l.lastError = &tree.UnsupportedError{
		Err:         l.lastError,
		FeatureName: detail,
	}
}

// Unimplemented sets lastError to an unimplemented error for the given feature.
func (l *lexer) Unimplemented(feature string) {
	l.lastError = unimp.New(feature, "this syntax")
	l.populateErrorDetails()
	l.lastError = &tree.UnsupportedError{
		Err:         l.lastError,
		FeatureName: feature,
	}
}

// setErr is called from parsing action rules to register an error observed
// while running the action. That error becomes the actual "cause" of the
// syntax error.
func (l *lexer) setErr(err error) {
	err = pgerror.WithCandidateCode(err, pgcode.Syntax)
	l.lastError = err
	l.populateErrorDetails()
}

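// Error is called by the yacc-generated parser when it encounters a syntax
// error; it records the error, with source context, in lastError.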
func (l *lexer) Error(e string) {
	e = strings.TrimPrefix(e, "syntax error: ") // we'll add it again below.
	l.lastError = pgerror.WithCandidateCode(errors.Newf("%s", e), pgcode.Syntax)
	l.populateErrorDetails()
}

// PopulateErrorDetails wraps the given error with a detail payload showing
// the source SQL up to the offending token and a caret marking where that
// token starts.
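//
// For example, given the input "SELECT * FRM t" with the last token "FRM"
// starting at position 9, the detail payload looks roughly like:
//
//	source SQL:
//	SELECT * FRM t
//	         ^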
func PopulateErrorDetails(
	tokID int32, lastTokStr string, lastTokPos int32, lastErr error, lIn string,
) error {
	var retErr error

	if tokID == ERROR {
		// This is a tokenizer (lexical) error: the scanner
		// will have stored the error message in the string field.
		err := pgerror.WithCandidateCode(errors.Newf("lexical error: %s", lastTokStr), pgcode.Syntax)
		retErr = errors.WithSecondaryError(err, lastErr)
	} else {
		// This is a contextual error. Print the provided error message
		// and the error context.
		if !strings.Contains(lastErr.Error(), "syntax error") {
			// "syntax error" is already prepended when the yacc-generated
			// parser encounters a parsing error.
			lastErr = errors.Wrap(lastErr, "syntax error")
		}
		retErr = errors.Wrapf(lastErr, "at or near \"%s\"", lastTokStr)
	}

	// Find the end of the line containing the last token.
	i := strings.IndexByte(lIn[lastTokPos:], '\n')
	if i == -1 {
		i = len(lIn)
	} else {
		i += int(lastTokPos)
	}
	// Find the beginning of the line containing the last token. Note that
	// LastIndexByte returns -1 if '\n' could not be found.
	j := strings.LastIndexByte(lIn[:lastTokPos], '\n') + 1
	// Output everything up to and including the line containing the last token.
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "source SQL:\n%s\n", lIn[:i])
	// Output a caret indicating where the last token starts.
	fmt.Fprintf(&buf, "%s^", strings.Repeat(" ", int(lastTokPos)-j))
	return errors.WithDetail(retErr, buf.String())
}

func (l *lexer) populateErrorDetails() {
	lastTok := l.lastToken()
	l.lastError = PopulateErrorDetails(lastTok.id, lastTok.str, lastTok.pos, l.lastError, l.in)
}

// SetHelp marks the "last error" field in the lexer to become a
// help text. This method is invoked in the error action of the
// parser, so the help text is only produced if the last token
// encountered was HELPTOKEN -- other cases are just syntax errors,
// and in that case we do not want the help text to overwrite the
// lastError field, which was set earlier to contain details about the
// syntax error.
func (l *lexer) SetHelp(msg HelpMessage) {
	if l.lastError == nil {
		l.lastError = pgerror.WithCandidateCode(errors.New("help request"), pgcode.Syntax)
	}

	if lastTok := l.lastToken(); lastTok.id == HELPTOKEN {
		l.populateHelpMsg(msg.String())
	} else {
		if msg.Command != "" {
			l.lastError = errors.WithHintf(l.lastError, `try \h %s`, msg.Command)
		} else {
			l.lastError = errors.WithHintf(l.lastError, `try \hf %s`, msg.Function)
		}
	}
}

// specialHelpErrorPrefix is a special prefix that must be present at
// the start of an error message to be considered a valid help
// response payload by the CLI shell.
const specialHelpErrorPrefix = "help token in input"

func (l *lexer) populateHelpMsg(msg string) {
	l.lastError = errors.WithHint(errors.Wrap(l.lastError, specialHelpErrorPrefix), msg)
}