github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/sql/parser/lexer.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package parser 12 13 import ( 14 "bytes" 15 "fmt" 16 "strings" 17 18 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgcode" 19 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgerror" 20 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/sem/tree" 21 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/types" 22 unimp "github.com/cockroachdb/cockroachdb-parser/pkg/util/errorutil/unimplemented" 23 "github.com/cockroachdb/errors" 24 ) 25 26 type lexer struct { 27 in string 28 // tokens contains tokens generated by the scanner. 29 tokens []sqlSymType 30 31 // The type that should be used when an INT or SERIAL is encountered. 32 nakedIntType *types.T 33 34 // lastPos is the position into the tokens slice of the last 35 // token returned by Lex(). 36 lastPos int 37 38 stmt tree.Statement 39 // numPlaceholders is 1 + the highest placeholder index encountered. 40 numPlaceholders int 41 numAnnotations tree.AnnotationIdx 42 43 lastError error 44 } 45 46 func (l *lexer) init(sql string, tokens []sqlSymType, nakedIntType *types.T) { 47 l.in = sql 48 l.tokens = tokens 49 l.lastPos = -1 50 l.stmt = nil 51 l.numPlaceholders = 0 52 l.numAnnotations = 0 53 l.lastError = nil 54 55 l.nakedIntType = nakedIntType 56 } 57 58 // cleanup is used to avoid holding on to memory unnecessarily (for the cases 59 // where we reuse a scanner). 60 func (l *lexer) cleanup() { 61 l.tokens = nil 62 l.stmt = nil 63 l.lastError = nil 64 } 65 66 // Lex lexes a token from input. 67 func (l *lexer) Lex(lval *sqlSymType) int { 68 l.lastPos++ 69 // The core lexing takes place in the scanner. Here we do a small bit of post 70 // processing of the lexical tokens so that the grammar only requires 71 // one-token lookahead despite SQL requiring multi-token lookahead in some 72 // cases. These special cases are handled below and the returned tokens are 73 // adjusted to reflect the lookahead (LA) that occurred. 74 if l.lastPos >= len(l.tokens) { 75 lval.id = 0 76 lval.pos = int32(len(l.in)) 77 lval.str = "EOF" 78 return 0 79 } 80 *lval = l.tokens[l.lastPos] 81 82 switch lval.id { 83 case NOTHING: 84 // Introducing the "RETURNING NOTHING" syntax in CockroachDB 85 // was a terrible idea, given that it is not even used any more! 86 // We should really deprecate it and remove this special case. 87 if l.lastPos > 0 && l.tokens[l.lastPos-1].id == RETURNING { 88 lval.id = NOTHING_AFTER_RETURNING 89 } 90 case INDEX: 91 // The following complex logic is a consternation, really. 92 // 93 // It flows from a profoundly mistaken decision to allow the INDEX 94 // keyword inside the column definition list of CREATE, a place 95 // where PostgreSQL did not allow it, for a very good reason: 96 // applications legitimately want to name columns with the name 97 // "index". 98 // 99 // After this mistaken decision was first made, the INDEX keyword 100 // was also allowed in CockroachDB in another place where it is 101 // partially ambiguous with other identifiers: ORDER BY 102 // (`ORDER BY INDEX foo@bar`, ambiguous with `ORDER BY index`). 103 // 104 // Sadly it took a very long time before we realized this mistake, 105 // and by that time these uses of INDEX have become legitimate 106 // CockroachDB features. 107 // 108 // We are thus left with the need to disambiguate between: 109 // 110 // CREATE TABLE t(index a) -- column name "index", column type "a" 111 // CREATE TABLE t(index (a)) -- keyword INDEX, column name "a" 112 // CREATE TABLE t(index a (b)) -- keyword INDEX, index name "a", column name "b" 113 // 114 // Thankfully, a coldef for a column named "index" and an index 115 // specification differ unambiguously, *given sufficient 116 // lookaheaed*: an index specification always has an open '(' 117 // after INDEX, with or without an identifier in-between. A column 118 // definition never has this. 119 // 120 // Likewise, between: 121 // 122 // ORDER BY index 123 // ORDER BY index a@idx 124 // ORDER BY index a.b@idx 125 // ORDER BY index a.b.c@idx 126 // 127 // We can unambiguously distinguish by the presence of the '@' sign 128 // with a maximum of 6 token lookahead. 129 // 130 var pprevID, prevID int32 131 if l.lastPos > 0 { 132 prevID = l.tokens[l.lastPos-1].id 133 } 134 if l.lastPos > 1 { 135 pprevID = l.tokens[l.lastPos-2].id 136 } 137 var nextID, secondID int32 138 if l.lastPos+1 < len(l.tokens) { 139 nextID = l.tokens[l.lastPos+1].id 140 } 141 if l.lastPos+2 < len(l.tokens) { 142 secondID = l.tokens[l.lastPos+2].id 143 } 144 afterCommaOrParen := prevID == ',' || prevID == '(' 145 afterCommaOrOPTIONS := prevID == ',' || prevID == OPTIONS 146 afterCommaOrParenThenINVERTED := prevID == INVERTED && (pprevID == ',' || pprevID == '(') 147 followedByParen := nextID == '(' 148 followedByNonPunctThenParen := nextID > 255 /* non-punctuation */ && secondID == '(' 149 if // 150 // CREATE ... (INDEX ( 151 // CREATE ... (x INT, y INT, INDEX ( 152 (afterCommaOrParen && followedByParen) || 153 // SCRUB ... WITH OPTIONS INDEX (... 154 // SCRUB ... WITH OPTIONS a, INDEX (... 155 (afterCommaOrOPTIONS && followedByParen) || 156 // CREATE ... (INVERTED INDEX ( 157 // CREATE ... (x INT, y INT, INVERTED INDEX ( 158 (afterCommaOrParenThenINVERTED && followedByParen) { 159 lval.id = INDEX_BEFORE_PAREN 160 break 161 } 162 if // 163 // CREATE ... (INDEX abc ( 164 // CREATE ... (x INT, y INT, INDEX abc ( 165 (afterCommaOrParen && followedByNonPunctThenParen) || 166 // CREATE ... (INVERTED INDEX abc ( 167 // CREATE ... (x INT, y INT, INVERTED INDEX abc ( 168 (afterCommaOrParenThenINVERTED && followedByNonPunctThenParen) { 169 lval.id = INDEX_BEFORE_NAME_THEN_PAREN 170 break 171 } 172 // The rules above all require that the INDEX keyword be 173 // followed ultimately by an open parenthesis, with no '@' 174 // in-between. The rule below is strictly exclusive with this 175 // situation. 176 afterCommaOrOrderBy := prevID == ',' || (prevID == BY && pprevID == ORDER) 177 if afterCommaOrOrderBy { 178 // SORT BY INDEX <objname> @ 179 // SORT BY a, b, INDEX <objname> @ 180 atSignAfterObjectName := false 181 // An object name has one of the following forms: 182 // name 183 // name.name 184 // name.name.name 185 // So it is between 1 and 5 tokens in length. 186 for i := l.lastPos + 1; i < len(l.tokens) && i < l.lastPos+7; i++ { 187 curToken := l.tokens[i].id 188 // An object name can only contain keyword/identifiers, and 189 // the punctuation '.'. 190 if curToken < 255 /* not ident/keyword */ && curToken != '.' && curToken != '@' { 191 // Definitely not object name. 192 break 193 } 194 if curToken == '@' { 195 if i == l.lastPos+1 { 196 /* The '@' cannot follow the INDEX keyword directly. */ 197 break 198 } 199 atSignAfterObjectName = true 200 break 201 } 202 } 203 if atSignAfterObjectName { 204 lval.id = INDEX_AFTER_ORDER_BY_BEFORE_AT 205 } 206 } 207 208 case NOT, WITH, AS, GENERATED, NULLS, RESET, ROLE, USER, ON, TENANT, CLUSTER, SET: 209 nextToken := sqlSymType{} 210 if l.lastPos+1 < len(l.tokens) { 211 nextToken = l.tokens[l.lastPos+1] 212 } 213 secondToken := sqlSymType{} 214 if l.lastPos+2 < len(l.tokens) { 215 secondToken = l.tokens[l.lastPos+2] 216 } 217 thirdToken := sqlSymType{} 218 if l.lastPos+3 < len(l.tokens) { 219 thirdToken = l.tokens[l.lastPos+3] 220 } 221 222 // If you update these cases, update lex.lookaheadKeywords. 223 switch lval.id { 224 case AS: 225 switch nextToken.id { 226 case OF: 227 switch secondToken.id { 228 case SYSTEM: 229 lval.id = AS_LA 230 } 231 } 232 case NOT: 233 switch nextToken.id { 234 case BETWEEN, IN, LIKE, ILIKE, SIMILAR: 235 lval.id = NOT_LA 236 } 237 case GENERATED: 238 switch nextToken.id { 239 case ALWAYS: 240 lval.id = GENERATED_ALWAYS 241 case BY: 242 lval.id = GENERATED_BY_DEFAULT 243 } 244 245 case WITH: 246 switch nextToken.id { 247 case TIME, ORDINALITY, BUCKET_COUNT: 248 lval.id = WITH_LA 249 } 250 case NULLS: 251 switch nextToken.id { 252 case FIRST, LAST: 253 lval.id = NULLS_LA 254 } 255 case RESET: 256 switch nextToken.id { 257 case ALL: 258 lval.id = RESET_ALL 259 } 260 case ROLE: 261 switch nextToken.id { 262 case ALL: 263 lval.id = ROLE_ALL 264 } 265 case USER: 266 switch nextToken.id { 267 case ALL: 268 lval.id = USER_ALL 269 } 270 case ON: 271 switch nextToken.id { 272 case DELETE: 273 lval.id = ON_LA 274 case UPDATE: 275 switch secondToken.id { 276 case NO, RESTRICT, CASCADE, SET: 277 lval.id = ON_LA 278 } 279 } 280 case TENANT: 281 switch nextToken.id { 282 case ALL: 283 lval.id = TENANT_ALL 284 } 285 case CLUSTER: 286 switch nextToken.id { 287 case ALL: 288 lval.id = CLUSTER_ALL 289 } 290 case SET: 291 switch nextToken.id { 292 case TRACING: 293 // Do not use the lookahead rule for `SET tracing.custom ...` 294 if secondToken.str != "." { 295 lval.id = SET_TRACING 296 } 297 case SESSION: 298 switch secondToken.id { 299 case TRACING: 300 // Do not use the lookahead rule for `SET SESSION tracing.custom ...` 301 if thirdToken.str != "." { 302 lval.id = SET_TRACING 303 } 304 } 305 } 306 } 307 } 308 309 return int(lval.id) 310 } 311 312 func (l *lexer) lastToken() sqlSymType { 313 if l.lastPos < 0 { 314 return sqlSymType{} 315 } 316 317 if l.lastPos >= len(l.tokens) { 318 return sqlSymType{ 319 id: 0, 320 pos: int32(len(l.in)), 321 str: "EOF", 322 } 323 } 324 return l.tokens[l.lastPos] 325 } 326 327 // NewAnnotation returns a new annotation index. 328 func (l *lexer) NewAnnotation() tree.AnnotationIdx { 329 l.numAnnotations++ 330 return l.numAnnotations 331 } 332 333 // SetStmt is called from the parser when the statement is constructed. 334 func (l *lexer) SetStmt(stmt tree.Statement) { 335 l.stmt = stmt 336 } 337 338 // UpdateNumPlaceholders is called from the parser when a placeholder is constructed. 339 func (l *lexer) UpdateNumPlaceholders(p *tree.Placeholder) { 340 if n := int(p.Idx) + 1; l.numPlaceholders < n { 341 l.numPlaceholders = n 342 } 343 } 344 345 // PurposelyUnimplemented wraps Error, setting lastUnimplementedError. 346 func (l *lexer) PurposelyUnimplemented(feature string, reason string) { 347 // We purposely do not use unimp here, as it appends hints to suggest that 348 // the error may be actively tracked as a bug. 349 l.lastError = errors.WithHint( 350 errors.WithTelemetry( 351 pgerror.Newf(pgcode.Syntax, "unimplemented: this syntax"), 352 fmt.Sprintf("sql.purposely_unimplemented.%s", feature), 353 ), 354 reason, 355 ) 356 l.populateErrorDetails() 357 l.lastError = &tree.UnsupportedError{ 358 Err: l.lastError, 359 FeatureName: feature, 360 } 361 } 362 363 // UnimplementedWithIssue wraps Error, setting lastUnimplementedError. 364 func (l *lexer) UnimplementedWithIssue(issue int) { 365 l.lastError = unimp.NewWithIssue(issue, "this syntax") 366 l.populateErrorDetails() 367 l.lastError = &tree.UnsupportedError{ 368 Err: l.lastError, 369 FeatureName: fmt.Sprintf("https://github.com/cockroachdb/cockroachdb-parser/issues/%d", issue), 370 } 371 } 372 373 // UnimplementedWithIssueDetail wraps Error, setting lastUnimplementedError. 374 func (l *lexer) UnimplementedWithIssueDetail(issue int, detail string) { 375 l.lastError = unimp.NewWithIssueDetail(issue, detail, "this syntax") 376 l.populateErrorDetails() 377 l.lastError = &tree.UnsupportedError{ 378 Err: l.lastError, 379 FeatureName: detail, 380 } 381 } 382 383 // Unimplemented wraps Error, setting lastUnimplementedError. 384 func (l *lexer) Unimplemented(feature string) { 385 l.lastError = unimp.New(feature, "this syntax") 386 l.populateErrorDetails() 387 l.lastError = &tree.UnsupportedError{ 388 Err: l.lastError, 389 FeatureName: feature, 390 } 391 } 392 393 // setErr is called from parsing action rules to register an error observed 394 // while running the action. That error becomes the actual "cause" of the 395 // syntax error. 396 func (l *lexer) setErr(err error) { 397 err = pgerror.WithCandidateCode(err, pgcode.Syntax) 398 l.lastError = err 399 l.populateErrorDetails() 400 } 401 402 func (l *lexer) Error(e string) { 403 e = strings.TrimPrefix(e, "syntax error: ") // we'll add it again below. 404 l.lastError = pgerror.WithCandidateCode(errors.Newf("%s", e), pgcode.Syntax) 405 l.populateErrorDetails() 406 } 407 408 // PopulateErrorDetails properly wraps the "last error" field in the lexer. 409 func PopulateErrorDetails( 410 tokID int32, lastTokStr string, lastTokPos int32, lastErr error, lIn string, 411 ) error { 412 var retErr error 413 414 if tokID == ERROR { 415 // This is a tokenizer (lexical) error: the scanner 416 // will have stored the error message in the string field. 417 err := pgerror.WithCandidateCode(errors.Newf("lexical error: %s", lastTokStr), pgcode.Syntax) 418 retErr = errors.WithSecondaryError(err, lastErr) 419 } else { 420 // This is a contextual error. Print the provided error message 421 // and the error context. 422 if !strings.Contains(lastErr.Error(), "syntax error") { 423 // "syntax error" is already prepended when the yacc-generated 424 // parser encounters a parsing error. 425 lastErr = errors.Wrap(lastErr, "syntax error") 426 } 427 retErr = errors.Wrapf(lastErr, "at or near \"%s\"", lastTokStr) 428 } 429 430 // Find the end of the line containing the last token. 431 i := strings.IndexByte(lIn[lastTokPos:], '\n') 432 if i == -1 { 433 i = len(lIn) 434 } else { 435 i += int(lastTokPos) 436 } 437 // Find the beginning of the line containing the last token. Note that 438 // LastIndexByte returns -1 if '\n' could not be found. 439 j := strings.LastIndexByte(lIn[:lastTokPos], '\n') + 1 440 // Output everything up to and including the line containing the last token. 441 var buf bytes.Buffer 442 fmt.Fprintf(&buf, "source SQL:\n%s\n", lIn[:i]) 443 // Output a caret indicating where the last token starts. 444 fmt.Fprintf(&buf, "%s^", strings.Repeat(" ", int(lastTokPos)-j)) 445 return errors.WithDetail(retErr, buf.String()) 446 } 447 448 func (l *lexer) populateErrorDetails() { 449 lastTok := l.lastToken() 450 l.lastError = PopulateErrorDetails(lastTok.id, lastTok.str, lastTok.pos, l.lastError, l.in) 451 } 452 453 // SetHelp marks the "last error" field in the lexer to become a 454 // help text. This method is invoked in the error action of the 455 // parser, so the help text is only produced if the last token 456 // encountered was HELPTOKEN -- other cases are just syntax errors, 457 // and in that case we do not want the help text to overwrite the 458 // lastError field, which was set earlier to contain details about the 459 // syntax error. 460 func (l *lexer) SetHelp(msg HelpMessage) { 461 if l.lastError == nil { 462 l.lastError = pgerror.WithCandidateCode(errors.New("help request"), pgcode.Syntax) 463 } 464 465 if lastTok := l.lastToken(); lastTok.id == HELPTOKEN { 466 l.populateHelpMsg(msg.String()) 467 } else { 468 if msg.Command != "" { 469 l.lastError = errors.WithHintf(l.lastError, `try \h %s`, msg.Command) 470 } else { 471 l.lastError = errors.WithHintf(l.lastError, `try \hf %s`, msg.Function) 472 } 473 } 474 } 475 476 // specialHelpErrorPrefix is a special prefix that must be present at 477 // the start of an error message to be considered a valid help 478 // response payload by the CLI shell. 479 const specialHelpErrorPrefix = "help token in input" 480 481 func (l *lexer) populateHelpMsg(msg string) { 482 l.lastError = errors.WithHint(errors.Wrap(l.lastError, specialHelpErrorPrefix), msg) 483 }