/*
Copyright 2019 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package sqlparser

import (
	"fmt"
	"strconv"
	"strings"

	"vitess.io/vitess/go/sqltypes"
)

const (
	// eofChar is the sentinel returned by cur()/peek() once the cursor is
	// past the end of buf. It is outside the byte range (0x00-0xFF), so it
	// can never collide with real input.
	eofChar = 0x100
)

// Tokenizer is the struct used to generate SQL
// tokens for the parser.
type Tokenizer struct {
	AllowComments       bool // when set, COMMENT tokens are surfaced to the parser instead of skipped
	SkipSpecialComments bool // when set, /*! ... */ comments are scanned as ordinary comments
	SkipToEnd           bool // when set, Lex discards tokens until the end of the current statement
	LastError           error
	ParseTree           Statement
	BindVars            map[string]struct{}

	lastToken   string // value of the most recently returned token, used for error context
	posVarIndex int    // counter used to name positional '?' bind vars (:v1, :v2, ...)
	partialDDL  Statement
	nesting     int
	multi       bool // when set, ';' acts as end-of-input instead of a token (multi-statement mode)

	// specialComment, when non-nil, is a sub-tokenizer over the contents of
	// a MySQL-specific /*! ... */ comment; Scan drains it before resuming
	// the outer buffer.
	specialComment *Tokenizer

	Pos int    // current scan offset into buf
	buf string // the SQL text being tokenized
}

// NewStringTokenizer creates a new Tokenizer for the
// sql string.
func NewStringTokenizer(sql string) *Tokenizer {
	checkParserVersionFlag()

	return &Tokenizer{
		buf:      sql,
		BindVars: make(map[string]struct{}),
	}
}

// Lex returns the next token from the Tokenizer.
// This function is used by go yacc.
func (tkn *Tokenizer) Lex(lval *yySymType) int {
	if tkn.SkipToEnd {
		return tkn.skipStatement()
	}

	typ, val := tkn.Scan()
	// Skip comments unless the caller asked for them.
	for typ == COMMENT {
		if tkn.AllowComments {
			break
		}
		typ, val = tkn.Scan()
	}
	if typ == 0 || typ == ';' || typ == LEX_ERROR {
		// If encounter end of statement or invalid token,
		// we should not accept partially parsed DDLs. They
		// should instead result in parser errors. See the
		// Parse function to see how this is handled.
		tkn.partialDDL = nil
	}
	lval.str = val
	tkn.lastToken = val
	return typ
}

// PositionedErr holds context related to parser errors
type PositionedErr struct {
	Err  string // the underlying error message
	Pos  int    // 1-based position in the input where the error occurred
	Near string // the token closest to the error, if any
}

func (p PositionedErr) Error() string {
	if p.Near != "" {
		return fmt.Sprintf("%s at position %v near '%s'", p.Err, p.Pos, p.Near)
	}
	return fmt.Sprintf("%s at position %v", p.Err, p.Pos)
}

// Error is called by go yacc if there's a parsing error.
func (tkn *Tokenizer) Error(err string) {
	tkn.LastError = PositionedErr{Err: err, Pos: tkn.Pos + 1, Near: tkn.lastToken}

	// Try and re-sync to the next statement
	tkn.skipStatement()
}

// Scan scans the tokenizer for the next token and returns
// the token type and an optional value.
func (tkn *Tokenizer) Scan() (int, string) {
	if tkn.specialComment != nil {
		// Enter specialComment scan mode.
		// for scanning such kind of comment: /*! MySQL-specific code */
		specialComment := tkn.specialComment
		tok, val := specialComment.Scan()
		if tok != 0 {
			// return the specialComment scan result as the result
			return tok, val
		}
		// leave specialComment scan mode after all stream consumed.
		tkn.specialComment = nil
	}

	tkn.skipBlank()
	switch ch := tkn.cur(); {
	case ch == '@':
		// @ident or @@ident (user / system variables).
		tokenID := AT_ID
		tkn.skip(1)
		if tkn.cur() == '@' {
			tokenID = AT_AT_ID
			tkn.skip(1)
		}
		var tID int
		var tBytes string
		if tkn.cur() == '`' {
			tkn.skip(1)
			tID, tBytes = tkn.scanLiteralIdentifier()
		} else if tkn.cur() == eofChar {
			return LEX_ERROR, ""
		} else {
			tID, tBytes = tkn.scanIdentifier(true)
		}
		if tID == LEX_ERROR {
			return tID, ""
		}
		return tokenID, tBytes
	case isLetter(ch):
		// x'..' hex literal.
		if ch == 'X' || ch == 'x' {
			if tkn.peek(1) == '\'' {
				tkn.skip(2)
				return tkn.scanHex()
			}
		}
		// b'..' bit literal.
		if ch == 'B' || ch == 'b' {
			if tkn.peek(1) == '\'' {
				tkn.skip(2)
				return tkn.scanBitLiteral()
			}
		}
		// N'literal' is used to create a string in the national character set
		if ch == 'N' || ch == 'n' {
			nxt := tkn.peek(1)
			if nxt == '\'' || nxt == '"' {
				tkn.skip(2)
				return tkn.scanString(nxt, NCHAR_STRING)
			}
		}
		return tkn.scanIdentifier(false)
	case isDigit(ch):
		return tkn.scanNumber()
	case ch == ':':
		return tkn.scanBindVar()
	case ch == ';':
		if tkn.multi {
			// In multi mode, ';' is treated as EOF. So, we don't advance.
			// Repeated calls to Scan will keep returning 0 until ParseNext
			// forces the advance.
			return 0, ""
		}
		tkn.skip(1)
		return ';', ""
	case ch == eofChar:
		return 0, ""
	default:
		// A leading '.' followed by a digit is a decimal literal like ".5".
		if ch == '.' && isDigit(tkn.peek(1)) {
			return tkn.scanNumber()
		}

		tkn.skip(1)
		switch ch {
		case '=', ',', '(', ')', '+', '*', '%', '^', '~':
			return int(ch), ""
		case '&':
			if tkn.cur() == '&' {
				tkn.skip(1)
				return AND, ""
			}
			return int(ch), ""
		case '|':
			if tkn.cur() == '|' {
				tkn.skip(1)
				return OR, ""
			}
			return int(ch), ""
		case '?':
			// Positional parameter: rewritten as a named bind var :vN.
			tkn.posVarIndex++
			buf := make([]byte, 0, 8)
			buf = append(buf, ":v"...)
			buf = strconv.AppendInt(buf, int64(tkn.posVarIndex), 10)
			return VALUE_ARG, string(buf)
		case '.':
			return int(ch), ""
		case '/':
			switch tkn.cur() {
			case '/':
				tkn.skip(1)
				return tkn.scanCommentType1(2)
			case '*':
				tkn.skip(1)
				if tkn.cur() == '!' && !tkn.SkipSpecialComments {
					tkn.skip(1)
					return tkn.scanMySQLSpecificComment()
				}
				return tkn.scanCommentType2()
			default:
				return int(ch), ""
			}
		case '#':
			return tkn.scanCommentType1(1)
		case '-':
			switch tkn.cur() {
			case '-':
				// Per MySQL, '--' starts a comment only when followed by
				// whitespace (or end of input).
				nextChar := tkn.peek(1)
				if nextChar == ' ' || nextChar == '\n' || nextChar == '\t' || nextChar == '\r' || nextChar == eofChar {
					tkn.skip(1)
					return tkn.scanCommentType1(2)
				}
			case '>':
				// JSON operators '->' and '->>'.
				tkn.skip(1)
				if tkn.cur() == '>' {
					tkn.skip(1)
					return JSON_UNQUOTE_EXTRACT_OP, ""
				}
				return JSON_EXTRACT_OP, ""
			}
			return int(ch), ""
		case '<':
			switch tkn.cur() {
			case '>':
				tkn.skip(1)
				return NE, ""
			case '<':
				tkn.skip(1)
				return SHIFT_LEFT, ""
			case '=':
				tkn.skip(1)
				switch tkn.cur() {
				case '>':
					tkn.skip(1)
					return NULL_SAFE_EQUAL, ""
				default:
					return LE, ""
				}
			default:
				return int(ch), ""
			}
		case '>':
			switch tkn.cur() {
			case '=':
				tkn.skip(1)
				return GE, ""
			case '>':
				tkn.skip(1)
				return SHIFT_RIGHT, ""
			default:
				return int(ch), ""
			}
		case '!':
			if tkn.cur() == '=' {
				tkn.skip(1)
				return NE, ""
			}
			return int(ch), ""
		case '\'', '"':
			return tkn.scanString(ch, STRING)
		case '`':
			return tkn.scanLiteralIdentifier()
		default:
			return LEX_ERROR, string(byte(ch))
		}
	}
}

// skipStatement scans until end of statement.
298 func (tkn *Tokenizer) skipStatement() int { 299 tkn.SkipToEnd = false 300 for { 301 typ, _ := tkn.Scan() 302 if typ == 0 || typ == ';' || typ == LEX_ERROR { 303 return typ 304 } 305 } 306 } 307 308 // skipBlank skips the cursor while it finds whitespace 309 func (tkn *Tokenizer) skipBlank() { 310 ch := tkn.cur() 311 for ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' { 312 tkn.skip(1) 313 ch = tkn.cur() 314 } 315 } 316 317 // scanIdentifier scans a language keyword or @-encased variable 318 func (tkn *Tokenizer) scanIdentifier(isVariable bool) (int, string) { 319 start := tkn.Pos 320 tkn.skip(1) 321 322 for { 323 ch := tkn.cur() 324 if !isLetter(ch) && !isDigit(ch) && !(isVariable && isCarat(ch)) { 325 break 326 } 327 tkn.skip(1) 328 } 329 keywordName := tkn.buf[start:tkn.Pos] 330 if keywordID, found := keywordLookupTable.LookupString(keywordName); found { 331 return keywordID, keywordName 332 } 333 // dual must always be case-insensitive 334 if keywordASCIIMatch(keywordName, "dual") { 335 return ID, "dual" 336 } 337 return ID, keywordName 338 } 339 340 // scanHex scans a hex numeral; assumes x' or X' has already been scanned 341 func (tkn *Tokenizer) scanHex() (int, string) { 342 start := tkn.Pos 343 tkn.scanMantissa(16) 344 hex := tkn.buf[start:tkn.Pos] 345 if tkn.cur() != '\'' { 346 return LEX_ERROR, hex 347 } 348 tkn.skip(1) 349 if len(hex)%2 != 0 { 350 return LEX_ERROR, hex 351 } 352 return HEX, hex 353 } 354 355 // scanBitLiteral scans a binary numeric literal; assumes b' or B' has already been scanned 356 func (tkn *Tokenizer) scanBitLiteral() (int, string) { 357 start := tkn.Pos 358 tkn.scanMantissa(2) 359 bit := tkn.buf[start:tkn.Pos] 360 if tkn.cur() != '\'' { 361 return LEX_ERROR, bit 362 } 363 tkn.skip(1) 364 return BIT_LITERAL, bit 365 } 366 367 // scanLiteralIdentifierSlow scans an identifier surrounded by backticks which may 368 // contain escape sequences instead of it. 
// This method is only called from
// scanLiteralIdentifier once the first escape sequence is found in the identifier.
// The provided `buf` contains the contents of the identifier that have been scanned
// so far.
func (tkn *Tokenizer) scanLiteralIdentifierSlow(buf *strings.Builder) (int, string) {
	// backTickSeen tracks whether the previous character was a backtick;
	// a backtick pair ("``") encodes one literal backtick.
	backTickSeen := true
	for {
		if backTickSeen {
			if tkn.cur() != '`' {
				// A lone backtick terminates the identifier.
				break
			}
			backTickSeen = false
			buf.WriteByte('`')
			tkn.skip(1)
			continue
		}
		// The previous char was not a backtick.
		switch tkn.cur() {
		case '`':
			backTickSeen = true
		case eofChar:
			// Premature EOF.
			return LEX_ERROR, buf.String()
		default:
			buf.WriteByte(byte(tkn.cur()))
			// keep scanning
		}
		tkn.skip(1)
	}
	return ID, buf.String()
}

// scanLiteralIdentifier scans an identifier enclosed by backticks. If the identifier
// is a simple literal, it'll be returned as a slice of the input buffer. If the identifier
// contains escape sequences, this function will fall back to scanLiteralIdentifierSlow
func (tkn *Tokenizer) scanLiteralIdentifier() (int, string) {
	start := tkn.Pos
	for {
		switch tkn.cur() {
		case '`':
			if tkn.peek(1) != '`' {
				// Unescaped closing backtick. An empty identifier (``) is invalid.
				if tkn.Pos == start {
					return LEX_ERROR, ""
				}
				tkn.skip(1)
				return ID, tkn.buf[start : tkn.Pos-1]
			}

			// Found an escaped backtick ("``"): switch to the slow path,
			// carrying over everything scanned so far.
			var buf strings.Builder
			buf.WriteString(tkn.buf[start:tkn.Pos])
			tkn.skip(1)
			return tkn.scanLiteralIdentifierSlow(&buf)
		case eofChar:
			// Premature EOF.
			return LEX_ERROR, tkn.buf[start:tkn.Pos]
		default:
			tkn.skip(1)
		}
	}
}

// scanBindVar scans a bind variable; assumes a ':' has been scanned right before
func (tkn *Tokenizer) scanBindVar() (int, string) {
	start := tkn.Pos
	token := VALUE_ARG

	tkn.skip(1)
	// If : is followed by a digit, then it is an offset value arg. Example - :1, :10
	if isDigit(tkn.cur()) {
		tkn.scanMantissa(10)
		// start+1 drops the leading ':' so only the digits are returned.
		return OFFSET_ARG, tkn.buf[start+1 : tkn.Pos]
	}
	// If : is followed by another : it is a list arg. Example ::v1, ::list
	if tkn.cur() == ':' {
		token = LIST_ARG
		tkn.skip(1)
	}
	if !isLetter(tkn.cur()) {
		return LEX_ERROR, tkn.buf[start:tkn.Pos]
	}
	// If : is followed by a letter, it is a bindvariable. Example :v1, :v2
	for {
		ch := tkn.cur()
		if !isLetter(ch) && !isDigit(ch) && ch != '.' {
			break
		}
		tkn.skip(1)
	}
	return token, tkn.buf[start:tkn.Pos]
}

// scanMantissa scans a sequence of numeric characters with the same base.
// This is a helper function only called from the numeric scanners
func (tkn *Tokenizer) scanMantissa(base int) {
	// digitVal returns 16 for non-digits, which is >= any supported base.
	for digitVal(tkn.cur()) < base {
		tkn.skip(1)
	}
}

// scanNumber scans any SQL numeric literal, either floating point or integer
func (tkn *Tokenizer) scanNumber() (int, string) {
	start := tkn.Pos
	token := INTEGRAL

	// Leading '.' means a decimal like ".5"; only an optional exponent can follow.
	if tkn.cur() == '.' {
		token = DECIMAL
		tkn.skip(1)
		tkn.scanMantissa(10)
		goto exponent
	}

	// 0x construct.
	if tkn.cur() == '0' {
		tkn.skip(1)
		if tkn.cur() == 'x' || tkn.cur() == 'X' {
			token = HEXNUM
			tkn.skip(1)
			tkn.scanMantissa(16)
			goto exit
		}
		if tkn.cur() == 'b' || tkn.cur() == 'B' {
			token = BITNUM
			tkn.skip(1)
			tkn.scanMantissa(2)
			goto exit
		}
	}

	tkn.scanMantissa(10)

	if tkn.cur() == '.' {
		token = DECIMAL
		tkn.skip(1)
		tkn.scanMantissa(10)
	}

exponent:
	if tkn.cur() == 'e' || tkn.cur() == 'E' {
		token = FLOAT
		tkn.skip(1)
		if tkn.cur() == '+' || tkn.cur() == '-' {
			tkn.skip(1)
		}
		tkn.scanMantissa(10)
	}

exit:
	if isLetter(tkn.cur()) {
		// A letter cannot immediately follow a float number.
		if token == FLOAT || token == DECIMAL {
			return LEX_ERROR, tkn.buf[start:tkn.Pos]
		}
		// A letter seen after a few numbers means that we should parse this
		// as an identifier and not a number.
		for {
			ch := tkn.cur()
			if !isLetter(ch) && !isDigit(ch) {
				break
			}
			tkn.skip(1)
		}
		return ID, tkn.buf[start:tkn.Pos]
	}

	return token, tkn.buf[start:tkn.Pos]
}

// scanString scans a string surrounded by the given `delim`, which can be
// either single or double quotes. Assumes that the given delimiter has just
// been scanned. If the string contains any escape sequences, this function
// will fall back to scanStringSlow
func (tkn *Tokenizer) scanString(delim uint16, typ int) (int, string) {
	start := tkn.Pos

	for {
		switch tkn.cur() {
		case delim:
			if tkn.peek(1) != delim {
				// Unescaped closing delimiter: return a slice of the input.
				tkn.skip(1)
				return typ, tkn.buf[start : tkn.Pos-1]
			}
			// A doubled delimiter is an escape sequence: take the slow path.
			fallthrough

		case '\\':
			var buffer strings.Builder
			buffer.WriteString(tkn.buf[start:tkn.Pos])
			return tkn.scanStringSlow(&buffer, delim, typ)

		case eofChar:
			// Unterminated string.
			return LEX_ERROR, tkn.buf[start:tkn.Pos]
		}

		tkn.skip(1)
	}
}

// scanStringSlow scans a string surrounded by the given `delim` and containing escape
// sequences. The given `buffer` contains the contents of the string that have
// been scanned so far.
func (tkn *Tokenizer) scanStringSlow(buffer *strings.Builder, delim uint16, typ int) (int, string) {
	for {
		ch := tkn.cur()
		if ch == eofChar {
			// Unterminated string.
			return LEX_ERROR, buffer.String()
		}

		if ch != delim && ch != '\\' {
			// Scan ahead to the next interesting character.
			start := tkn.Pos
			for ; tkn.Pos < len(tkn.buf); tkn.Pos++ {
				ch = uint16(tkn.buf[tkn.Pos])
				if ch == delim || ch == '\\' {
					break
				}
			}

			buffer.WriteString(tkn.buf[start:tkn.Pos])
			if tkn.Pos >= len(tkn.buf) {
				// Reached the end of the buffer without finding a delim or
				// escape character.
				tkn.skip(1)
				continue
			}
		}
		tkn.skip(1) // Read one past the delim or escape character.

		if ch == '\\' {
			if tkn.cur() == eofChar {
				// String terminates mid escape character.
				return LEX_ERROR, buffer.String()
			}
			// Preserve escaping of % and _
			if tkn.cur() == '%' || tkn.cur() == '_' {
				buffer.WriteByte('\\')
				ch = tkn.cur()
			} else if decodedChar := sqltypes.SQLDecodeMap[byte(tkn.cur())]; decodedChar == sqltypes.DontEscape {
				ch = tkn.cur()
			} else {
				ch = uint16(decodedChar)
			}
		} else if ch == delim && tkn.cur() != delim {
			// Correctly terminated string, which is not a double delim.
			break
		}

		buffer.WriteByte(byte(ch))
		tkn.skip(1)
	}

	return typ, buffer.String()
}

// scanCommentType1 scans a SQL line-comment, which is applied until the end
// of the line. The given prefix length varies based on whether the comment
// is started with '//', '--' or '#'.
func (tkn *Tokenizer) scanCommentType1(prefixLen int) (int, string) {
	// Include the already-consumed prefix ('//', '--' or '#') in the token.
	start := tkn.Pos - prefixLen
	for tkn.cur() != eofChar {
		if tkn.cur() == '\n' {
			tkn.skip(1)
			break
		}
		tkn.skip(1)
	}
	return COMMENT, tkn.buf[start:tkn.Pos]
}

// scanCommentType2 scans a '/*' delimited comment; assumes the opening
// prefix has already been scanned
func (tkn *Tokenizer) scanCommentType2() (int, string) {
	// Include the already-consumed '/*' prefix in the token.
	start := tkn.Pos - 2
	for {
		if tkn.cur() == '*' {
			tkn.skip(1)
			if tkn.cur() == '/' {
				tkn.skip(1)
				break
			}
			continue
		}
		if tkn.cur() == eofChar {
			// Unterminated comment.
			return LEX_ERROR, tkn.buf[start:tkn.Pos]
		}
		tkn.skip(1)
	}
	return COMMENT, tkn.buf[start:tkn.Pos]
}

// scanMySQLSpecificComment scans a MySQL comment pragma, which always starts with '/*!'
func (tkn *Tokenizer) scanMySQLSpecificComment() (int, string) {
	// Include the already-consumed '/*!' prefix in the comment text.
	start := tkn.Pos - 3
	for {
		if tkn.cur() == '*' {
			tkn.skip(1)
			if tkn.cur() == '/' {
				tkn.skip(1)
				break
			}
			continue
		}
		if tkn.cur() == eofChar {
			// Unterminated comment.
			return LEX_ERROR, tkn.buf[start:tkn.Pos]
		}
		tkn.skip(1)
	}

	commentVersion, sql := ExtractMysqlComment(tkn.buf[start:tkn.Pos])

	if mySQLParserVersion >= commentVersion {
		// Only add the special comment to the tokenizer if the version of MySQL is higher or equal to the comment version
		tkn.specialComment = NewStringTokenizer(sql)
	}

	// Recurse: either tokenize the comment's contents (via specialComment)
	// or resume scanning the outer buffer.
	return tkn.Scan()
}

// cur returns the character at the cursor without advancing.
func (tkn *Tokenizer) cur() uint16 {
	return tkn.peek(0)
}

// skip advances the cursor by dist characters.
func (tkn *Tokenizer) skip(dist int) {
	tkn.Pos += dist
}

// peek returns the character dist positions ahead of the cursor, or eofChar
// if that position is past the end of the buffer.
func (tkn *Tokenizer) peek(dist int) uint16 {
	if tkn.Pos+dist >= len(tkn.buf) {
		return eofChar
	}
	return uint16(tkn.buf[tkn.Pos+dist])
}

// reset clears any internal state.
701 func (tkn *Tokenizer) reset() { 702 tkn.ParseTree = nil 703 tkn.partialDDL = nil 704 tkn.specialComment = nil 705 tkn.posVarIndex = 0 706 tkn.nesting = 0 707 tkn.SkipToEnd = false 708 } 709 710 func isLetter(ch uint16) bool { 711 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch == '$' 712 } 713 714 func isCarat(ch uint16) bool { 715 return ch == '.' || ch == '\'' || ch == '"' || ch == '`' 716 } 717 718 func digitVal(ch uint16) int { 719 switch { 720 case '0' <= ch && ch <= '9': 721 return int(ch) - '0' 722 case 'a' <= ch && ch <= 'f': 723 return int(ch) - 'a' + 10 724 case 'A' <= ch && ch <= 'F': 725 return int(ch) - 'A' + 10 726 } 727 return 16 // larger than any legal digit val 728 } 729 730 func isDigit(ch uint16) bool { 731 return '0' <= ch && ch <= '9' 732 }