github.com/bingoohuang/gg@v0.0.0-20240325092523-45da7dee9335/pkg/sqlparse/tidbparser/parser/lexer.go (about) 1 // Copyright 2016 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package parser 15 16 import ( 17 "bytes" 18 "fmt" 19 "strings" 20 "unicode" 21 "unicode/utf8" 22 23 "github.com/bingoohuang/gg/pkg/sqlparse/tidbparser/dependency/mysql" 24 ) 25 26 var _ = yyLexer(&Scanner{}) 27 28 // Pos represents the position of a token. 29 type Pos struct { 30 Line int 31 Col int 32 Offset int 33 } 34 35 // Scanner implements the yyLexer interface. 36 type Scanner struct { 37 r reader 38 buf bytes.Buffer 39 40 errs []error 41 stmtStartPos int 42 43 // For scanning such kind of comment: /*! MySQL-specific code */ or /*+ optimizer hint */ 44 specialComment specialCommentScanner 45 46 sqlMode mysql.SQLMode 47 } 48 49 type specialCommentScanner interface { 50 scan() (tok int, pos Pos, lit string) 51 } 52 53 type mysqlSpecificCodeScanner struct { 54 *Scanner 55 Pos 56 } 57 58 func (s *mysqlSpecificCodeScanner) scan() (tok int, pos Pos, lit string) { 59 tok, pos, lit = s.Scanner.scan() 60 pos.Line += s.Pos.Line 61 pos.Col += s.Pos.Col 62 pos.Offset += s.Pos.Offset 63 return 64 } 65 66 type optimizerHintScanner struct { 67 *Scanner 68 Pos 69 end bool 70 } 71 72 func (s *optimizerHintScanner) scan() (tok int, pos Pos, lit string) { 73 tok, pos, lit = s.Scanner.scan() 74 pos.Line += s.Pos.Line 75 pos.Col += s.Pos.Col 76 pos.Offset += s.Pos.Offset 77 if tok == 0 { 78 if !s.end { 79 tok = hintEnd 80 s.end = true 81 } 82 } 83 return 84 } 85 86 // Errors returns the errors during a scan. 87 func (s *Scanner) Errors() []error { 88 return s.errs 89 } 90 91 // reset resets the sql string to be scanned. 92 func (s *Scanner) reset(sql string) { 93 s.r = reader{s: sql, p: Pos{Line: 1}} 94 s.buf.Reset() 95 s.errs = s.errs[:0] 96 s.stmtStartPos = 0 97 } 98 99 func (s *Scanner) stmtText() string { 100 endPos := s.r.pos().Offset 101 if s.r.s[endPos-1] == '\n' { 102 endPos = endPos - 1 // trim new line 103 } 104 if s.r.s[s.stmtStartPos] == '\n' { 105 s.stmtStartPos++ 106 } 107 108 text := s.r.s[s.stmtStartPos:endPos] 109 110 s.stmtStartPos = endPos 111 return text 112 } 113 114 // Errorf tells scanner something is wrong. 115 // Scanner satisfies yyLexer interface which need this function. 116 func (s *Scanner) Errorf(format string, a ...interface{}) { 117 str := fmt.Sprintf(format, a...) 118 val := s.r.s[s.r.pos().Offset:] 119 if len(val) > 2048 { 120 val = val[:2048] 121 } 122 err := fmt.Errorf("line %d column %d near \"%s\"%s (total length %d)", s.r.p.Line, s.r.p.Col, val, str, len(s.r.s)) 123 s.errs = append(s.errs, err) 124 } 125 126 // Lex returns a token and store the token value in v. 127 // Scanner satisfies yyLexer interface. 128 // 0 and invalid are special token id this function would return: 129 // return 0 tells parser that scanner meets EOF, 130 // return invalid tells parser that scanner meets illegal character. 131 func (s *Scanner) Lex(v *yySymType) int { 132 tok, pos, lit := s.scan() 133 v.offset = pos.Offset 134 v.ident = lit 135 if tok == identifier { 136 tok = handleIdent(v) 137 } 138 if tok == identifier { 139 if tok1 := s.isTokenIdentifier(lit, pos.Offset); tok1 != 0 { 140 tok = tok1 141 } 142 } 143 if s.sqlMode.HasANSIQuotesMode() && 144 tok == stringLit && 145 s.r.s[v.offset] == '"' { 146 tok = identifier 147 } 148 149 if tok == pipes && !(s.sqlMode.HasPipesAsConcatMode()) { 150 return pipesAsOr 151 } 152 153 if tok == not && s.sqlMode.HasHighNotPrecedenceMode() { 154 return not2 155 } 156 157 switch tok { 158 case intLit: 159 return toInt(s, v, lit) 160 case floatLit: 161 return toFloat(s, v, lit) 162 case decLit: 163 return toDecimal(s, v, lit) 164 case hexLit: 165 return toHex(s, v, lit) 166 case bitLit: 167 return toBit(s, v, lit) 168 case singleAtIdentifier, doubleAtIdentifier, cast, extract: 169 v.item = lit 170 return tok 171 case null: 172 v.item = nil 173 case quotedIdentifier: 174 tok = identifier 175 } 176 if tok == unicode.ReplacementChar && s.r.eof() { 177 return 0 178 } 179 return tok 180 } 181 182 // SetSQLMode sets the SQL mode for scanner. 183 func (s *Scanner) SetSQLMode(mode mysql.SQLMode) { 184 s.sqlMode = mode 185 } 186 187 // GetSQLMode return the SQL mode of scanner. 188 func (s *Scanner) GetSQLMode() mysql.SQLMode { 189 return s.sqlMode 190 } 191 192 // NewScanner returns a new scanner object. 193 func NewScanner(s string) *Scanner { 194 return &Scanner{r: reader{s: s}} 195 } 196 197 func (s *Scanner) skipWhitespace() rune { 198 return s.r.incAsLongAs(unicode.IsSpace) 199 } 200 201 func (s *Scanner) scan() (tok int, pos Pos, lit string) { 202 if s.specialComment != nil { 203 // Enter specialComment scan mode. 204 // for scanning such kind of comment: /*! MySQL-specific code */ 205 specialComment := s.specialComment 206 tok, pos, lit = specialComment.scan() 207 if tok != 0 { 208 // return the specialComment scan result as the result 209 return 210 } 211 // leave specialComment scan mode after all stream consumed. 212 s.specialComment = nil 213 } 214 215 ch0 := s.r.peek() 216 if unicode.IsSpace(ch0) { 217 ch0 = s.skipWhitespace() 218 } 219 pos = s.r.pos() 220 if s.r.eof() { 221 // when scanner meets EOF, the returned token should be 0, 222 // because 0 is a special token id to remind the parser that stream is end. 223 return 0, pos, "" 224 } 225 226 if !s.r.eof() && isIdentExtend(ch0) { 227 return scanIdentifier(s) 228 } 229 230 // search a trie to get a token. 231 node := &ruleTable 232 for ch0 >= 0 && ch0 <= 255 { 233 if node.childs[ch0] == nil || s.r.eof() { 234 break 235 } 236 node = node.childs[ch0] 237 if node.fn != nil { 238 return node.fn(s) 239 } 240 s.r.inc() 241 ch0 = s.r.peek() 242 } 243 244 tok, lit = node.token, s.r.data(&pos) 245 return 246 } 247 248 func startWithXx(s *Scanner) (tok int, pos Pos, lit string) { 249 pos = s.r.pos() 250 s.r.inc() 251 if s.r.peek() == '\'' { 252 s.r.inc() 253 s.scanHex() 254 if s.r.peek() == '\'' { 255 s.r.inc() 256 tok, lit = hexLit, s.r.data(&pos) 257 } else { 258 tok = unicode.ReplacementChar 259 } 260 return 261 } 262 s.r.incAsLongAs(isIdentChar) 263 tok, lit = identifier, s.r.data(&pos) 264 return 265 } 266 267 func startWithNn(s *Scanner) (tok int, pos Pos, lit string) { 268 tok, pos, lit = scanIdentifier(s) 269 // The National Character Set, N'some text' or n'some test'. 270 // See https://dev.mysql.com/doc/refman/5.7/en/string-literals.html 271 // and https://dev.mysql.com/doc/refman/5.7/en/charset-national.html 272 if lit == "N" || lit == "n" { 273 if s.r.peek() == '\'' { 274 tok = underscoreCS 275 lit = "utf8" 276 } 277 } 278 return 279 } 280 281 func startWithBb(s *Scanner) (tok int, pos Pos, lit string) { 282 pos = s.r.pos() 283 s.r.inc() 284 if s.r.peek() == '\'' { 285 s.r.inc() 286 s.scanBit() 287 if s.r.peek() == '\'' { 288 s.r.inc() 289 tok, lit = bitLit, s.r.data(&pos) 290 } else { 291 tok = unicode.ReplacementChar 292 } 293 return 294 } 295 s.r.incAsLongAs(isIdentChar) 296 tok, lit = identifier, s.r.data(&pos) 297 return 298 } 299 300 func startWithSharp(s *Scanner) (tok int, pos Pos, lit string) { 301 s.r.incAsLongAs(func(ch rune) bool { 302 return ch != '\n' 303 }) 304 return s.scan() 305 } 306 307 func startWithDash(s *Scanner) (tok int, pos Pos, lit string) { 308 pos = s.r.pos() 309 if strings.HasPrefix(s.r.s[pos.Offset:], "-- ") { 310 s.r.incN(3) 311 s.r.incAsLongAs(func(ch rune) bool { 312 return ch != '\n' 313 }) 314 return s.scan() 315 } 316 if strings.HasPrefix(s.r.s[pos.Offset:], "->>") { 317 tok = juss 318 s.r.incN(3) 319 return 320 } 321 if strings.HasPrefix(s.r.s[pos.Offset:], "->") { 322 tok = jss 323 s.r.incN(2) 324 return 325 } 326 tok = int('-') 327 s.r.inc() 328 return 329 } 330 331 func startWithSlash(s *Scanner) (tok int, pos Pos, lit string) { 332 pos = s.r.pos() 333 s.r.inc() 334 ch0 := s.r.peek() 335 if ch0 == '*' { 336 s.r.inc() 337 for { 338 ch0 = s.r.readByte() 339 if ch0 == unicode.ReplacementChar && s.r.eof() { 340 // unclosed comment 341 s.errs = append(s.errs, ParseErrorWith(s.r.data(&pos), s.r.p.Line)) 342 return 343 } 344 if ch0 == '*' && s.r.readByte() == '/' { 345 break 346 } 347 } 348 349 comment := s.r.data(&pos) 350 351 // See https://dev.mysql.com/doc/refman/5.7/en/optimizer-hints.html 352 if strings.HasPrefix(comment, "/*+") { 353 begin := sqlOffsetInComment(comment) 354 end := len(comment) - 2 355 sql := comment[begin:end] 356 s.specialComment = &optimizerHintScanner{ 357 Scanner: NewScanner(sql), 358 Pos: Pos{ 359 pos.Line, 360 pos.Col, 361 pos.Offset + begin, 362 }, 363 } 364 365 tok = hintBegin 366 return 367 } 368 369 // See http://dev.mysql.com/doc/refman/5.7/en/comments.html 370 // Convert "/*!VersionNumber MySQL-specific-code */" to "MySQL-specific-code". 371 if strings.HasPrefix(comment, "/*!") { 372 sql := specCodePattern.ReplaceAllStringFunc(comment, TrimComment) 373 s.specialComment = &mysqlSpecificCodeScanner{ 374 Scanner: NewScanner(sql), 375 Pos: Pos{ 376 pos.Line, 377 pos.Col, 378 pos.Offset + sqlOffsetInComment(comment), 379 }, 380 } 381 } 382 383 return s.scan() 384 } 385 tok = int('/') 386 return 387 } 388 389 func sqlOffsetInComment(comment string) int { 390 // find the first SQL token offset in pattern like "/*!40101 mysql specific code */" 391 offset := 0 392 for i := 0; i < len(comment); i++ { 393 if unicode.IsSpace(rune(comment[i])) { 394 offset = i 395 break 396 } 397 } 398 for offset < len(comment) { 399 offset++ 400 if !unicode.IsSpace(rune(comment[offset])) { 401 break 402 } 403 } 404 return offset 405 } 406 407 func startWithAt(s *Scanner) (tok int, pos Pos, lit string) { 408 pos = s.r.pos() 409 s.r.inc() 410 ch1 := s.r.peek() 411 if isIdentFirstChar(ch1) { 412 s.r.incAsLongAs(isIdentChar) 413 tok, lit = singleAtIdentifier, s.r.data(&pos) 414 } else if ch1 == '@' { 415 s.r.inc() 416 stream := s.r.s[pos.Offset+2:] 417 for _, v := range []string{"global.", "session.", "local."} { 418 if len(v) > len(stream) { 419 continue 420 } 421 if strings.EqualFold(stream[:len(v)], v) { 422 s.r.incN(len(v)) 423 break 424 } 425 } 426 s.r.incAsLongAs(isIdentChar) 427 tok, lit = doubleAtIdentifier, s.r.data(&pos) 428 } else { 429 tok = int('@') 430 } 431 return 432 } 433 434 func scanIdentifier(s *Scanner) (int, Pos, string) { 435 pos := s.r.pos() 436 s.r.inc() 437 s.r.incAsLongAs(isIdentChar) 438 return identifier, pos, s.r.data(&pos) 439 } 440 441 var quotedIdentifier = -identifier 442 443 func scanQuotedIdent(s *Scanner) (tok int, pos Pos, lit string) { 444 pos = s.r.pos() 445 s.r.inc() 446 s.buf.Reset() 447 for { 448 ch := s.r.readByte() 449 if ch == unicode.ReplacementChar && s.r.eof() { 450 tok = unicode.ReplacementChar 451 return 452 } 453 if ch == '`' { 454 if s.r.peek() != '`' { 455 // don't return identifier in case that it's interpreted as keyword token later. 456 tok, lit = quotedIdentifier, s.buf.String() 457 return 458 } 459 s.r.inc() 460 } 461 s.buf.WriteRune(ch) 462 } 463 } 464 465 func startString(s *Scanner) (tok int, pos Pos, lit string) { 466 return s.scanString() 467 } 468 469 // lazyBuf is used to avoid allocation if possible. 470 // it has a useBuf field indicates whether bytes.Buffer is necessary. if 471 // useBuf is false, we can avoid calling bytes.Buffer.String(), which 472 // make a copy of data and cause allocation. 473 type lazyBuf struct { 474 useBuf bool 475 r *reader 476 b *bytes.Buffer 477 p *Pos 478 } 479 480 func (mb *lazyBuf) setUseBuf(str string) { 481 if !mb.useBuf { 482 mb.useBuf = true 483 mb.b.Reset() 484 mb.b.WriteString(str) 485 } 486 } 487 488 func (mb *lazyBuf) writeRune(r rune, w int) { 489 if mb.useBuf { 490 if w > 1 { 491 mb.b.WriteRune(r) 492 } else { 493 mb.b.WriteByte(byte(r)) 494 } 495 } 496 } 497 498 func (mb *lazyBuf) data() string { 499 var lit string 500 if mb.useBuf { 501 lit = mb.b.String() 502 } else { 503 lit = mb.r.data(mb.p) 504 lit = lit[1 : len(lit)-1] 505 } 506 return lit 507 } 508 509 func (s *Scanner) scanString() (tok int, pos Pos, lit string) { 510 tok, pos = stringLit, s.r.pos() 511 mb := lazyBuf{false, &s.r, &s.buf, &pos} 512 ending := s.r.readByte() 513 ch0 := s.r.peek() 514 for !s.r.eof() { 515 if ch0 == ending { 516 s.r.inc() 517 if s.r.peek() != ending { 518 lit = mb.data() 519 return 520 } 521 str := mb.r.data(&pos) 522 mb.setUseBuf(str[1 : len(str)-1]) 523 } else if ch0 == '\\' && !s.sqlMode.HasNoBackslashEscapesMode() { 524 mb.setUseBuf(mb.r.data(&pos)[1:]) 525 ch0 = handleEscape(s) 526 } 527 mb.writeRune(ch0, s.r.w) 528 if !s.r.eof() { 529 s.r.inc() 530 ch0 = s.r.peek() 531 } 532 } 533 534 tok = unicode.ReplacementChar 535 return 536 } 537 538 // handleEscape handles the case in scanString when previous char is '\'. 539 func handleEscape(s *Scanner) rune { 540 s.r.inc() 541 ch0 := s.r.peek() 542 /* 543 \" \' \\ \n \0 \b \Z \r \t ==> escape to one char 544 \% \_ ==> preserve both char 545 other ==> remove \ 546 */ 547 switch ch0 { 548 case 'n': 549 ch0 = '\n' 550 case '0': 551 ch0 = 0 552 case 'b': 553 ch0 = 8 554 case 'Z': 555 ch0 = 26 556 case 'r': 557 ch0 = '\r' 558 case 't': 559 ch0 = '\t' 560 case '%', '_': 561 s.buf.WriteByte('\\') 562 } 563 return ch0 564 } 565 566 func startWithNumber(s *Scanner) (tok int, pos Pos, lit string) { 567 pos = s.r.pos() 568 tok = intLit 569 ch0 := s.r.readByte() 570 if ch0 == '0' { 571 tok = intLit 572 ch1 := s.r.peek() 573 switch { 574 case ch1 >= '0' && ch1 <= '7': 575 s.r.inc() 576 s.scanOct() 577 case ch1 == 'x' || ch1 == 'X': 578 s.r.inc() 579 s.scanHex() 580 tok = hexLit 581 case ch1 == 'b': 582 s.r.inc() 583 s.scanBit() 584 tok = bitLit 585 case ch1 == '.': 586 return s.scanFloat(&pos) 587 case ch1 == 'B': 588 tok = unicode.ReplacementChar 589 return 590 } 591 } 592 593 s.scanDigits() 594 ch0 = s.r.peek() 595 if ch0 == '.' || ch0 == 'e' || ch0 == 'E' { 596 return s.scanFloat(&pos) 597 } 598 599 // Identifiers may begin with a digit but unless quoted may not consist solely of digits. 600 if !s.r.eof() && isIdentChar(ch0) { 601 s.r.incAsLongAs(isIdentChar) 602 return identifier, pos, s.r.data(&pos) 603 } 604 lit = s.r.data(&pos) 605 return 606 } 607 608 func startWithDot(s *Scanner) (tok int, pos Pos, lit string) { 609 pos = s.r.pos() 610 s.r.inc() 611 save := s.r.pos() 612 if isDigit(s.r.peek()) { 613 tok, _, lit = s.scanFloat(&pos) 614 if s.r.eof() || !isIdentChar(s.r.peek()) { 615 return 616 } 617 // Fail to parse a float, reset to dot. 618 s.r.p = save 619 } 620 tok, lit = int('.'), "." 621 return 622 } 623 624 func (s *Scanner) scanOct() { 625 s.r.incAsLongAs(func(ch rune) bool { 626 return ch >= '0' && ch <= '7' 627 }) 628 } 629 630 func (s *Scanner) scanHex() { 631 s.r.incAsLongAs(func(ch rune) bool { 632 return ch >= '0' && ch <= '9' || 633 ch >= 'a' && ch <= 'f' || 634 ch >= 'A' && ch <= 'F' 635 }) 636 } 637 638 func (s *Scanner) scanBit() { 639 s.r.incAsLongAs(func(ch rune) bool { 640 return ch == '0' || ch == '1' 641 }) 642 } 643 644 func (s *Scanner) scanFloat(beg *Pos) (tok int, pos Pos, lit string) { 645 s.r.p = *beg 646 // float = D1 . D2 e D3 647 s.scanDigits() 648 ch0 := s.r.peek() 649 if ch0 == '.' { 650 s.r.inc() 651 s.scanDigits() 652 ch0 = s.r.peek() 653 } 654 if ch0 == 'e' || ch0 == 'E' { 655 s.r.inc() 656 ch0 = s.r.peek() 657 if ch0 == '-' || ch0 == '+' { 658 s.r.inc() 659 } 660 s.scanDigits() 661 tok = floatLit 662 } else { 663 tok = decLit 664 } 665 pos, lit = *beg, s.r.data(beg) 666 return 667 } 668 669 func (s *Scanner) scanDigits() string { 670 pos := s.r.pos() 671 s.r.incAsLongAs(isDigit) 672 return s.r.data(&pos) 673 } 674 675 type reader struct { 676 s string 677 p Pos 678 w int 679 } 680 681 var eof = Pos{-1, -1, -1} 682 683 func (r *reader) eof() bool { 684 return r.p.Offset >= len(r.s) 685 } 686 687 // peek() peeks a rune from underlying reader. 688 // if reader meets EOF, it will return unicode.ReplacementChar. to distinguish from 689 // the real unicode.ReplacementChar, the caller should call r.eof() again to check. 690 func (r *reader) peek() rune { 691 if r.eof() { 692 return unicode.ReplacementChar 693 } 694 v, w := rune(r.s[r.p.Offset]), 1 695 switch { 696 case v == 0: 697 r.w = w 698 return v // illegal UTF-8 encoding 699 case v >= 0x80: 700 v, w = utf8.DecodeRuneInString(r.s[r.p.Offset:]) 701 if v == utf8.RuneError && w == 1 { 702 v = rune(r.s[r.p.Offset]) // illegal UTF-8 encoding 703 } 704 } 705 r.w = w 706 return v 707 } 708 709 // inc increase the position offset of the reader. 710 // peek must be called before calling inc! 711 func (r *reader) inc() { 712 if r.s[r.p.Offset] == '\n' { 713 r.p.Line++ 714 r.p.Col = 0 715 } 716 r.p.Offset += r.w 717 r.p.Col++ 718 } 719 720 func (r *reader) incN(n int) { 721 for i := 0; i < n; i++ { 722 r.inc() 723 } 724 } 725 726 func (r *reader) readByte() (ch rune) { 727 ch = r.peek() 728 if ch == unicode.ReplacementChar && r.eof() { 729 return 730 } 731 r.inc() 732 return 733 } 734 735 func (r *reader) pos() Pos { 736 return r.p 737 } 738 739 func (r *reader) data(from *Pos) string { 740 return r.s[from.Offset:r.p.Offset] 741 } 742 743 func (r *reader) incAsLongAs(fn func(rune) bool) rune { 744 for { 745 ch := r.peek() 746 if !fn(ch) { 747 return ch 748 } 749 if ch == unicode.ReplacementChar && r.eof() { 750 return 0 751 } 752 r.inc() 753 } 754 }