// github.com/bingoohuang/gg@v0.0.0-20240325092523-45da7dee9335/pkg/sqlparse/sqlparser/token.go

/*
Copyright 2017 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package sqlparser

import (
	"bytes"
	"fmt"
	"strings"

	"github.com/bingoohuang/gg/pkg/sqlparse/bytes2"
	"github.com/bingoohuang/gg/pkg/sqlparse/sqltypes"
)

const eofChar = 0x100

// Tokenizer is the struct used to generate SQL
// tokens for the parser.
type Tokenizer struct {
	InStream      *strings.Reader
	AllowComments bool
	ForceEOF      bool
	lastChar      uint16
	Position      int
	lastToken     []byte
	LastError     string
	posVarIndex   int
	ParseTree     Statement
	nesting       int
}

// NewStringTokenizer creates a new Tokenizer for the
// sql string.
func NewStringTokenizer(sql string) *Tokenizer {
	return &Tokenizer{InStream: strings.NewReader(sql)}
}
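
// exampleScanAll is an illustrative sketch, not part of the original file:
// it shows how a caller can drain a Tokenizer by hand. Scan returns the
// token type (a constant generated from sql.y, e.g. SELECT or ID) and an
// optional value, and returns 0 once the input is exhausted.
func exampleScanAll() {
	tkn := NewStringTokenizer("select a from t where id = 1")
	for {
		typ, val := tkn.Scan()
		if typ == 0 {
			break // Scan returns 0 at end of input
		}
		fmt.Printf("token %d: %q\n", typ, val)
	}
}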

// keywords is a map of mysql keywords that fall into two categories:
// 1) keywords considered reserved by MySQL
// 2) keywords for us to handle specially in sql.y
//
// Those marked as UNUSED are likely reserved keywords. We add them here so that,
// when rewriting queries, we can properly backtick-quote them so they don't cause issues.
//
// NOTE: If you add new keywords, add them also to the reserved_keywords or
// non_reserved_keywords grammar in sql.y -- this will allow the keyword to be used
// in identifiers. See the docs for each grammar to determine which one to put it into.
var keywords = map[string]int{
	"accessible": UNUSED,
	"add": UNUSED,
	"against": AGAINST,
	"all": ALL,
	"alter": ALTER,
	"analyze": ANALYZE,
	"and": AND,
	"as": AS,
	"asc": ASC,
	"asensitive": UNUSED,
	"before": UNUSED,
	"between": BETWEEN,
	"bigint": UNUSED,
	"binary": BINARY,
	"blob": UNUSED,
	"boolean": BOOLEAN,
	"both": UNUSED,
	"by": BY,
	"call": UNUSED,
	"cascade": UNUSED,
	"case": CASE,
	"cast": CAST,
	"change": UNUSED,
	"character": CHARACTER,
	"check": UNUSED,
	"collate": COLLATE,
	"column": UNUSED,
	"condition": UNUSED,
	"constraint": UNUSED,
	"continue": UNUSED,
	"convert": CONVERT,
	"create": CREATE,
	"cross": CROSS,
	"current_date": CURRENT_DATE,
	"current_time": CURRENT_TIME,
	"current_timestamp": CURRENT_TIMESTAMP,
	"current_user": UNUSED,
	"cursor": UNUSED,
	"database": DATABASE,
	"databases": DATABASES,
	"day_hour": UNUSED,
	"day_microsecond": UNUSED,
	"day_minute": UNUSED,
	"day_second": UNUSED,
	"date": DATE,
	"dec": UNUSED,
	"declare": UNUSED,
	"default": DEFAULT,
	"delayed": UNUSED,
	"delete": DELETE,
	"desc": DESC,
	"describe": DESCRIBE,
	"deterministic": UNUSED,
	"distinct": DISTINCT,
	"distinctrow": UNUSED,
	"div": DIV,
	"double": UNUSED,
	"drop": DROP,
	"duplicate": DUPLICATE,
	"each": UNUSED,
	"else": ELSE,
	"elseif": UNUSED,
	"enclosed": UNUSED,
	"end": END,
	"escape": ESCAPE,
	"escaped": UNUSED,
	"exists": EXISTS,
	"exit": UNUSED,
	"explain": EXPLAIN,
	"expansion": EXPANSION,
	"false": FALSE,
	"fetch": UNUSED,
	"float": UNUSED,
	"float4": UNUSED,
	"float8": UNUSED,
	"for": FOR,
	"force": FORCE,
	"foreign": UNUSED,
	"from": FROM,
	"fulltext": UNUSED,
	"generated": UNUSED,
	"get": UNUSED,
	"grant": UNUSED,
	"group": GROUP,
	"group_concat": GROUP_CONCAT,
	"having": HAVING,
	"high_priority": UNUSED,
	"hour_microsecond": UNUSED,
	"hour_minute": UNUSED,
	"hour_second": UNUSED,
	"if": IF,
	"ignore": IGNORE,
	"in": IN,
	"index": INDEX,
	"infile": UNUSED,
	"inout": UNUSED,
	"inner": INNER,
	"insensitive": UNUSED,
	"insert": INSERT,
	"int": UNUSED,
	"int1": UNUSED,
	"int2": UNUSED,
	"int3": UNUSED,
	"int4": UNUSED,
	"int8": UNUSED,
	"integer": INTEGER,
	"interval": INTERVAL,
	"into": INTO,
	"io_after_gtids": UNUSED,
	"is": IS,
	"iterate": UNUSED,
	"join": JOIN,
	"key": KEY,
	"keys": UNUSED,
	"kill": UNUSED,
	"language": LANGUAGE,
	"last_insert_id": LAST_INSERT_ID,
	"leading": UNUSED,
	"leave": UNUSED,
	"left": LEFT,
	"like": LIKE,
	"limit": LIMIT,
	"linear": UNUSED,
	"lines": UNUSED,
	"load": UNUSED,
	"localtime": LOCALTIME,
	"localtimestamp": LOCALTIMESTAMP,
	"lock": LOCK,
	"long": UNUSED,
	"longblob": UNUSED,
	"longtext": UNUSED,
	"loop": UNUSED,
	"low_priority": UNUSED,
	"master_bind": UNUSED,
	"match": MATCH,
	"maxvalue": UNUSED,
	"mediumblob": UNUSED,
	"mediumint": UNUSED,
	"mediumtext": UNUSED,
	"middleint": UNUSED,
	"minute_microsecond": UNUSED,
	"minute_second": UNUSED,
	"mod": MOD,
	"mode": MODE,
	"modifies": UNUSED,
	"natural": NATURAL,
	"next": NEXT,
	"not": NOT,
	"no_write_to_binlog": UNUSED,
	"null": NULL,
	"numeric": UNUSED,
	"offset": OFFSET,
	"on": ON,
	"optimize": OPTIMIZE,
	"optimizer_costs": UNUSED,
	"option": UNUSED,
"optionally": UNUSED, 219 "or": OR, 220 "order": ORDER, 221 "out": UNUSED, 222 "outer": OUTER, 223 "outfile": UNUSED, 224 "partition": UNUSED, 225 "precision": UNUSED, 226 "primary": UNUSED, 227 "procedure": UNUSED, 228 "query": QUERY, 229 "range": UNUSED, 230 "read": UNUSED, 231 "reads": UNUSED, 232 "read_write": UNUSED, 233 "real": UNUSED, 234 "references": UNUSED, 235 "regexp": REGEXP, 236 "release": UNUSED, 237 "rename": RENAME, 238 "repair": REPAIR, 239 "repeat": UNUSED, 240 "replace": REPLACE, 241 "require": UNUSED, 242 "resignal": UNUSED, 243 "restrict": UNUSED, 244 "return": UNUSED, 245 "revoke": UNUSED, 246 "right": RIGHT, 247 "rlike": REGEXP, 248 "schema": UNUSED, 249 "schemas": UNUSED, 250 "second_microsecond": UNUSED, 251 "select": SELECT, 252 "sensitive": UNUSED, 253 "separator": SEPARATOR, 254 "set": SET, 255 "share": SHARE, 256 "show": SHOW, 257 "signal": UNUSED, 258 "smallint": UNUSED, 259 "spatial": UNUSED, 260 "specific": UNUSED, 261 "sql": UNUSED, 262 "sqlexception": UNUSED, 263 "sqlstate": UNUSED, 264 "sqlwarning": UNUSED, 265 "sql_big_result": UNUSED, 266 "sql_cache": SQL_CACHE, 267 "sql_calc_found_rows": UNUSED, 268 "sql_no_cache": SQL_NO_CACHE, 269 "sql_small_result": UNUSED, 270 "ssl": UNUSED, 271 "starting": UNUSED, 272 "stored": UNUSED, 273 "straight_join": STRAIGHT_JOIN, 274 "table": TABLE, 275 "tables": TABLES, 276 "terminated": UNUSED, 277 "then": THEN, 278 "tinyblob": UNUSED, 279 "tinyint": UNUSED, 280 "tinytext": UNUSED, 281 "to": TO, 282 "trailing": UNUSED, 283 "trigger": UNUSED, 284 "true": TRUE, 285 "truncate": TRUNCATE, 286 "undo": UNUSED, 287 "union": UNION, 288 "unique": UNIQUE, 289 "unlock": UNUSED, 290 "update": UPDATE, 291 "usage": UNUSED, 292 "use": USE, 293 "using": USING, 294 "utc_date": UTC_DATE, 295 "utc_time": UTC_TIME, 296 "utc_timestamp": UTC_TIMESTAMP, 297 "values": VALUES, 298 "varbinary": UNUSED, 299 "varchar": UNUSED, 300 "varcharacter": UNUSED, 301 "varying": UNUSED, 302 "virtual": UNUSED, 303 "view": VIEW, 304 "vitess_keyspaces": VITESS_KEYSPACES, 305 "vitess_shards": VITESS_SHARDS, 306 "vschema_tables": VSCHEMA_TABLES, 307 "when": WHEN, 308 "where": WHERE, 309 "while": UNUSED, 310 "with": WITH, 311 "write": UNUSED, 312 "xor": UNUSED, 313 "year_month": UNUSED, 314 "zerofill": UNUSED, 315 } 316 317 // Lex returns the next token form the Tokenizer. 318 // This function is used by go yacc. 319 func (tkn *Tokenizer) Lex(lval *yySymType) int { 320 typ, val := tkn.Scan() 321 for typ == COMMENT { 322 if tkn.AllowComments { 323 break 324 } 325 typ, val = tkn.Scan() 326 } 327 lval.bytes = val 328 tkn.lastToken = val 329 return typ 330 } 331 332 // Error is called by go yacc if there's a parsing error. 333 func (tkn *Tokenizer) Error(err string) { 334 buf := &bytes2.Buffer{} 335 if tkn.lastToken != nil { 336 fmt.Fprintf(buf, "%s at position %v near '%s'", err, tkn.Position, tkn.lastToken) 337 } else { 338 fmt.Fprintf(buf, "%s at position %v", err, tkn.Position) 339 } 340 tkn.LastError = buf.String() 341 } 342 343 // Scan scans the tokenizer for the next token and returns 344 // the token type and an optional value. 

// Scan scans the tokenizer for the next token and returns
// the token type and an optional value.
func (tkn *Tokenizer) Scan() (int, []byte) {
	if tkn.ForceEOF {
		return 0, nil
	}

	if tkn.lastChar == 0 {
		tkn.next()
	}
	tkn.skipBlank()
	switch ch := tkn.lastChar; {
	case isLetter(ch):
		tkn.next()
		if ch == 'X' || ch == 'x' {
			if tkn.lastChar == '\'' {
				tkn.next()
				return tkn.scanHex()
			}
		}
		return tkn.scanIdentifier(byte(ch))
	case isDigit(ch):
		return tkn.scanNumber(false)
	case ch == ':':
		return tkn.scanBindVar()
	default:
		tkn.next()
		switch ch {
		case eofChar:
			return 0, nil
		case '=', ',', ';', '(', ')', '+', '*', '%', '^', '~':
			return int(ch), nil
		case '&':
			if tkn.lastChar == '&' {
				tkn.next()
				return AND, nil
			}
			return int(ch), nil
		case '|':
			if tkn.lastChar == '|' {
				tkn.next()
				return OR, nil
			}
			return int(ch), nil
		case '?':
			tkn.posVarIndex++
			// buf := new(bytes2.Buffer)
			// fmt.Fprintf(buf, ":v%d", tkn.posVarIndex)
			return VALUE_ARG, []byte("?")
		case '.':
			if isDigit(tkn.lastChar) {
				return tkn.scanNumber(true)
			}
			return int(ch), nil
		case '/':
			switch tkn.lastChar {
			case '/':
				tkn.next()
				return tkn.scanCommentType1("//")
			case '*':
				tkn.next()
				return tkn.scanCommentType2()
			default:
				return int(ch), nil
			}
		case '#':
			tkn.next()
			return tkn.scanCommentType1("#")
		case '-':
			switch tkn.lastChar {
			case '-':
				tkn.next()
				return tkn.scanCommentType1("--")
			case '>':
				tkn.next()
				if tkn.lastChar == '>' {
					tkn.next()
					return JSON_UNQUOTE_EXTRACT_OP, nil
				}
				return JSON_EXTRACT_OP, nil
			}
			return int(ch), nil
		case '<':
			switch tkn.lastChar {
			case '>':
				tkn.next()
				return NE, nil
			case '<':
				tkn.next()
				return SHIFT_LEFT, nil
			case '=':
				tkn.next()
				switch tkn.lastChar {
				case '>':
					tkn.next()
					return NULL_SAFE_EQUAL, nil
				default:
					return LE, nil
				}
			default:
				return int(ch), nil
			}
		case '>':
			switch tkn.lastChar {
			case '=':
				tkn.next()
				return GE, nil
			case '>':
				tkn.next()
				return SHIFT_RIGHT, nil
			default:
				return int(ch), nil
			}
		case '!':
			if tkn.lastChar == '=' {
				tkn.next()
				return NE, nil
			}
			return int(ch), nil
		case '\'', '"':
			return tkn.scanString(ch, STRING)
		case '`':
			return tkn.scanLiteralIdentifier()
		default:
			return LEX_ERROR, []byte{byte(ch)}
		}
	}
}

// skipBlank advances past whitespace (space, tab, CR, LF).
func (tkn *Tokenizer) skipBlank() {
	ch := tkn.lastChar
	for ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' {
		tkn.next()
		ch = tkn.lastChar
	}
}

// scanIdentifier reads an identifier or keyword; keywords are matched
// case-insensitively against the keywords map.
func (tkn *Tokenizer) scanIdentifier(firstByte byte) (int, []byte) {
	buffer := &bytes2.Buffer{}
	buffer.WriteByte(firstByte)
	for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) {
		buffer.WriteByte(byte(tkn.lastChar))
		tkn.next()
	}
	lowered := bytes.ToLower(buffer.Bytes())
	loweredStr := string(lowered)
	if keywordID, found := keywords[loweredStr]; found {
		return keywordID, lowered
	}
	// dual must always be case-insensitive
	if loweredStr == "dual" {
		return ID, lowered
	}
	return ID, buffer.Bytes()
}
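
// exampleKeywordFolding is an illustrative sketch, not part of the original
// file. Because scanIdentifier lower-cases the candidate before the keywords
// lookup, "SELECT", "Select" and "select" all yield the SELECT token, while
// an unknown name such as "MyTable" comes back as an ID with its case intact.
func exampleKeywordFolding() {
	for _, sql := range []string{"SELECT", "Select", "MyTable"} {
		typ, val := NewStringTokenizer(sql).Scan()
		fmt.Printf("%q -> type %d, value %q\n", sql, typ, val)
	}
}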

// scanHex reads the digits of an x'...' hex literal; the digit count must be
// even so the literal decodes to whole bytes.
func (tkn *Tokenizer) scanHex() (int, []byte) {
	buffer := &bytes2.Buffer{}
	tkn.scanMantissa(16, buffer)
	if tkn.lastChar != '\'' {
		return LEX_ERROR, buffer.Bytes()
	}
	tkn.next()
	if buffer.Len()%2 != 0 {
		return LEX_ERROR, buffer.Bytes()
	}
	return HEX, buffer.Bytes()
}

// scanLiteralIdentifier reads a backtick-quoted identifier; a doubled
// backtick escapes a literal backtick.
func (tkn *Tokenizer) scanLiteralIdentifier() (int, []byte) {
	buffer := &bytes2.Buffer{}
	backTickSeen := false
	for {
		if backTickSeen {
			if tkn.lastChar != '`' {
				break
			}
			backTickSeen = false
			buffer.WriteByte('`')
			tkn.next()
			continue
		}
		// The previous char was not a backtick.
		switch tkn.lastChar {
		case '`':
			backTickSeen = true
		case eofChar:
			// Premature EOF.
			return LEX_ERROR, buffer.Bytes()
		default:
			buffer.WriteByte(byte(tkn.lastChar))
		}
		tkn.next()
	}
	if buffer.Len() == 0 {
		return LEX_ERROR, buffer.Bytes()
	}
	return ID, buffer.Bytes()
}

// scanBindVar reads a :name bind variable or ::name list bind variable.
func (tkn *Tokenizer) scanBindVar() (int, []byte) {
	buffer := &bytes2.Buffer{}
	buffer.WriteByte(byte(tkn.lastChar))
	token := VALUE_ARG
	tkn.next()
	if tkn.lastChar == ':' {
		token = LIST_ARG
		buffer.WriteByte(byte(tkn.lastChar))
		tkn.next()
	}
	//if !isLetter(tkn.lastChar) {
	//	return LEX_ERROR, buffer.Bytes()
	//}
	for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '?' || tkn.lastChar == '.' {
		buffer.WriteByte(byte(tkn.lastChar))
		tkn.next()
	}
	return token, buffer.Bytes()
}

// scanMantissa consumes consecutive digits of the given base into buffer.
func (tkn *Tokenizer) scanMantissa(base int, buffer *bytes2.Buffer) {
	for digitVal(tkn.lastChar) < base {
		tkn.consumeNext(buffer)
	}
}

// scanNumber reads integer (INTEGRAL), decimal/exponent float (FLOAT),
// and 0x-prefixed hex (HEXNUM) literals.
func (tkn *Tokenizer) scanNumber(seenDecimalPoint bool) (int, []byte) {
	token := INTEGRAL
	buffer := &bytes2.Buffer{}
	if seenDecimalPoint {
		token = FLOAT
		buffer.WriteByte('.')
		tkn.scanMantissa(10, buffer)
		goto exponent
	}

	// 0x construct.
	if tkn.lastChar == '0' {
		tkn.consumeNext(buffer)
		if tkn.lastChar == 'x' || tkn.lastChar == 'X' {
			token = HEXNUM
			tkn.consumeNext(buffer)
			tkn.scanMantissa(16, buffer)
			goto exit
		}
	}

	tkn.scanMantissa(10, buffer)

	if tkn.lastChar == '.' {
		token = FLOAT
		tkn.consumeNext(buffer)
		tkn.scanMantissa(10, buffer)
	}

exponent:
	if tkn.lastChar == 'e' || tkn.lastChar == 'E' {
		token = FLOAT
		tkn.consumeNext(buffer)
		if tkn.lastChar == '+' || tkn.lastChar == '-' {
			tkn.consumeNext(buffer)
		}
		tkn.scanMantissa(10, buffer)
	}

exit:
	// A letter cannot immediately follow a number.
	if isLetter(tkn.lastChar) {
		return LEX_ERROR, buffer.Bytes()
	}

	return token, buffer.Bytes()
}
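
// exampleNumberForms is an illustrative sketch, not part of the original
// file. It exercises the shapes scanNumber recognizes: plain integers
// (INTEGRAL), decimals and exponents (FLOAT), and 0x-prefixed hex (HEXNUM).
// A trailing letter, as in "12a", is a LEX_ERROR because a letter may not
// immediately follow a number.
func exampleNumberForms() {
	for _, sql := range []string{"42", "3.14", "1e10", "0x1F", "12a"} {
		typ, val := NewStringTokenizer(sql).Scan()
		fmt.Printf("%q -> type %d, value %q\n", sql, typ, val)
	}
}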

// scanString reads a quoted string; the delimiter may be escaped by doubling
// it or with a backslash escape decoded through sqltypes.SQLDecodeMap.
func (tkn *Tokenizer) scanString(delim uint16, typ int) (int, []byte) {
	buffer := &bytes2.Buffer{}
	for {
		ch := tkn.lastChar
		tkn.next()
		if ch == delim {
			if tkn.lastChar == delim {
				tkn.next()
			} else {
				break
			}
		} else if ch == '\\' {
			if tkn.lastChar == eofChar {
				return LEX_ERROR, buffer.Bytes()
			}
			if decodedChar := sqltypes.SQLDecodeMap[byte(tkn.lastChar)]; decodedChar == sqltypes.DontEscape {
				ch = tkn.lastChar
			} else {
				ch = uint16(decodedChar)
			}
			tkn.next()
		}
		if ch == eofChar {
			return LEX_ERROR, buffer.Bytes()
		}
		buffer.WriteByte(byte(ch))
	}
	return typ, buffer.Bytes()
}

// scanCommentType1 reads a single-line comment through the end of the line.
func (tkn *Tokenizer) scanCommentType1(prefix string) (int, []byte) {
	buffer := &bytes2.Buffer{}
	buffer.WriteString(prefix)
	for tkn.lastChar != eofChar {
		if tkn.lastChar == '\n' {
			tkn.consumeNext(buffer)
			break
		}
		tkn.consumeNext(buffer)
	}
	return COMMENT, buffer.Bytes()
}

// scanCommentType2 reads a /* ... */ comment through the closing marker.
func (tkn *Tokenizer) scanCommentType2() (int, []byte) {
	buffer := &bytes2.Buffer{}
	buffer.WriteString("/*")
	for {
		if tkn.lastChar == '*' {
			tkn.consumeNext(buffer)
			if tkn.lastChar == '/' {
				tkn.consumeNext(buffer)
				break
			}
			continue
		}
		if tkn.lastChar == eofChar {
			return LEX_ERROR, buffer.Bytes()
		}
		tkn.consumeNext(buffer)
	}
	return COMMENT, buffer.Bytes()
}

// consumeNext copies the current character into buffer and advances.
func (tkn *Tokenizer) consumeNext(buffer *bytes2.Buffer) {
	if tkn.lastChar == eofChar {
		// This should never happen.
		panic("unexpected EOF")
	}
	buffer.WriteByte(byte(tkn.lastChar))
	tkn.next()
}

// next advances to the next input byte, recording eofChar at end of input.
func (tkn *Tokenizer) next() {
	if ch, err := tkn.InStream.ReadByte(); err != nil {
		// Only EOF is possible.
		tkn.lastChar = eofChar
	} else {
		tkn.lastChar = uint16(ch)
	}
	tkn.Position++
}

func isLetter(ch uint16) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch == '@'
}

func digitVal(ch uint16) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch) - '0'
	case 'a' <= ch && ch <= 'f':
		return int(ch) - 'a' + 10
	case 'A' <= ch && ch <= 'F':
		return int(ch) - 'A' + 10
	}
	return 16 // larger than any legal digit val
}

func isDigit(ch uint16) bool {
	return '0' <= ch && ch <= '9'
}
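
// exampleStringEscapes is an illustrative sketch, not part of the original
// file. It shows the two escape styles scanString accepts inside quoted
// strings: a doubled delimiter ('it''s') and a backslash escape ('a\nb'),
// the latter decoded through sqltypes.SQLDecodeMap (assumed to map 'n' to a
// newline, as in the vitess escape tables this code derives from).
func exampleStringEscapes() {
	for _, sql := range []string{`'it''s'`, `'a\nb'`} {
		typ, val := NewStringTokenizer(sql).Scan()
		fmt.Printf("type %d, value %q\n", typ, val)
	}
}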