github.com/pingcap/tidb/parser@v0.0.0-20231013125129-93a834a6bf8d/lexer.go (about) 1 // Copyright 2016 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package parser 15 16 import ( 17 "bytes" 18 "fmt" 19 "strconv" 20 "strings" 21 "unicode" 22 23 "github.com/pingcap/tidb/parser/charset" 24 "github.com/pingcap/tidb/parser/mysql" 25 tidbfeature "github.com/pingcap/tidb/parser/tidb" 26 ) 27 28 var _ = yyLexer(&Scanner{}) 29 30 // Pos represents the position of a token. 31 type Pos struct { 32 Line int 33 Col int 34 Offset int 35 } 36 37 // Scanner implements the yyLexer interface. 38 type Scanner struct { 39 r reader 40 buf bytes.Buffer 41 42 client charset.Encoding 43 connection charset.Encoding 44 45 errs []error 46 warns []error 47 stmtStartPos int 48 49 // inBangComment is true if we are inside a `/*! ... */` block. 50 // It is used to ignore a stray `*/` when scanning. 51 inBangComment bool 52 53 sqlMode mysql.SQLMode 54 55 // If the lexer should recognize keywords for window function. 56 // It may break the compatibility when support those keywords, 57 // because some application may already use them as identifiers. 58 supportWindowFunc bool 59 60 // Whether record the original text keyword position to the AST node. 61 skipPositionRecording bool 62 63 // lastScanOffset indicates last offset returned by scan(). 64 // It's used to substring sql in syntax error message. 65 lastScanOffset int 66 67 // lastKeyword records the previous keyword returned by scan(). 68 // determine whether an optimizer hint should be parsed or ignored. 69 lastKeyword int 70 // lastKeyword2 records the keyword before lastKeyword, it is used 71 // to disambiguate hint after for update, which should be ignored. 72 lastKeyword2 int 73 // lastKeyword3 records the keyword before lastKeyword2, it is used 74 // to disambiguate hint after create binding for update, which should 75 // be pertained. 76 lastKeyword3 int 77 78 // hintPos records the start position of the previous optimizer hint. 79 lastHintPos Pos 80 81 // true if a dot follows an identifier 82 identifierDot bool 83 84 // keepHint, if true, Scanner will keep hint when normalizing . 85 keepHint bool 86 } 87 88 // Errors returns the errors and warns during a scan. 89 func (s *Scanner) Errors() (warns []error, errs []error) { 90 return s.warns, s.errs 91 } 92 93 // reset resets the sql string to be scanned. 94 func (s *Scanner) reset(sql string) { 95 s.client = charset.FindEncoding(mysql.DefaultCharset) 96 s.connection = charset.FindEncoding(mysql.DefaultCharset) 97 s.r = reader{s: sql, p: Pos{Line: 1}, l: len(sql)} 98 s.buf.Reset() 99 s.errs = s.errs[:0] 100 s.warns = s.warns[:0] 101 s.stmtStartPos = 0 102 s.inBangComment = false 103 s.lastKeyword = 0 104 s.identifierDot = false 105 } 106 107 func (s *Scanner) stmtText() string { 108 endPos := s.r.pos().Offset 109 if s.r.s[endPos-1] == '\n' { 110 endPos = endPos - 1 // trim new line 111 } 112 if s.r.s[s.stmtStartPos] == '\n' { 113 s.stmtStartPos++ 114 } 115 116 text := s.r.s[s.stmtStartPos:endPos] 117 118 s.stmtStartPos = endPos 119 return text 120 } 121 122 // Errorf tells scanner something is wrong. 123 // Scanner satisfies yyLexer interface which need this function. 124 func (s *Scanner) Errorf(format string, a ...interface{}) (err error) { 125 str := fmt.Sprintf(format, a...) 126 val := s.r.s[s.lastScanOffset:] 127 var lenStr = "" 128 if len(val) > 2048 { 129 lenStr = "(total length " + strconv.Itoa(len(val)) + ")" 130 val = val[:2048] 131 } 132 err = fmt.Errorf("line %d column %d near \"%s\"%s %s", 133 s.r.p.Line, s.r.p.Col, val, str, lenStr) 134 return 135 } 136 137 // AppendError sets error into scanner. 138 // Scanner satisfies yyLexer interface which need this function. 139 func (s *Scanner) AppendError(err error) { 140 if err == nil { 141 return 142 } 143 s.errs = append(s.errs, err) 144 } 145 146 // AppendWarn sets warning into scanner. 147 func (s *Scanner) AppendWarn(err error) { 148 if err == nil { 149 return 150 } 151 s.warns = append(s.warns, err) 152 } 153 154 // convert2System convert lit from client encoding to system encoding which is utf8mb4. 155 func (s *Scanner) convert2System(tok int, lit string) (int, string) { 156 utf8Lit, err := s.client.Transform(nil, charset.HackSlice(lit), charset.OpDecodeReplace) 157 if err != nil { 158 s.AppendWarn(err) 159 } 160 161 return tok, charset.HackString(utf8Lit) 162 } 163 164 // convert2Connection convert lit from client encoding to connection encoding. 165 func (s *Scanner) convert2Connection(tok int, lit string) (int, string) { 166 if mysql.IsUTF8Charset(s.client.Name()) { 167 return tok, lit 168 } 169 utf8Lit, err := s.client.Transform(nil, charset.HackSlice(lit), charset.OpDecodeReplace) 170 if err != nil { 171 s.AppendError(err) 172 if s.sqlMode.HasStrictMode() && s.client.Tp() == s.connection.Tp() { 173 return invalid, lit 174 } 175 s.lastErrorAsWarn() 176 } 177 178 // It is definitely valid if `client` is the same with `connection`, so just transform if they are not the same. 179 if s.client.Tp() != s.connection.Tp() { 180 utf8Lit, _ = s.connection.Transform(nil, utf8Lit, charset.OpReplaceNoErr) 181 } 182 return tok, charset.HackString(utf8Lit) 183 } 184 185 func (s *Scanner) getNextToken() int { 186 r := s.r 187 tok, pos, lit := s.scan() 188 if tok == identifier { 189 tok = s.handleIdent(&yySymType{}) 190 } 191 if tok == identifier { 192 if tok1 := s.isTokenIdentifier(lit, pos.Offset); tok1 != 0 { 193 tok = tok1 194 } 195 } 196 s.r = r 197 return tok 198 } 199 200 func (s *Scanner) getNextTwoTokens() (tok1 int, tok2 int) { 201 r := s.r 202 tok1, pos, lit := s.scan() 203 if tok1 == identifier { 204 tok1 = s.handleIdent(&yySymType{}) 205 } 206 if tok1 == identifier { 207 if tmpToken := s.isTokenIdentifier(lit, pos.Offset); tmpToken != 0 { 208 tok1 = tmpToken 209 } 210 } 211 tok2, pos, lit = s.scan() 212 if tok2 == identifier { 213 tok2 = s.handleIdent(&yySymType{}) 214 } 215 if tok2 == identifier { 216 if tmpToken := s.isTokenIdentifier(lit, pos.Offset); tmpToken != 0 { 217 tok2 = tmpToken 218 } 219 } 220 s.r = r 221 return tok1, tok2 222 } 223 224 // Lex returns a token and store the token value in v. 225 // Scanner satisfies yyLexer interface. 226 // 0 and invalid are special token id this function would return: 227 // return 0 tells parser that scanner meets EOF, 228 // return invalid tells parser that scanner meets illegal character. 229 func (s *Scanner) Lex(v *yySymType) int { 230 tok, pos, lit := s.scan() 231 s.lastScanOffset = pos.Offset 232 s.lastKeyword3 = s.lastKeyword2 233 s.lastKeyword2 = s.lastKeyword 234 s.lastKeyword = 0 235 v.offset = pos.Offset 236 v.ident = lit 237 if tok == identifier { 238 tok = s.handleIdent(v) 239 } 240 if tok == identifier { 241 if tok1 := s.isTokenIdentifier(lit, pos.Offset); tok1 != 0 { 242 tok = tok1 243 s.lastKeyword = tok1 244 } 245 } 246 if s.sqlMode.HasANSIQuotesMode() && 247 tok == stringLit && 248 s.r.s[v.offset] == '"' { 249 tok = identifier 250 } 251 252 if tok == pipes && !(s.sqlMode.HasPipesAsConcatMode()) { 253 return pipesAsOr 254 } 255 256 if tok == not && s.sqlMode.HasHighNotPrecedenceMode() { 257 return not2 258 } 259 if (tok == as || tok == member) && s.getNextToken() == of { 260 _, pos, lit = s.scan() 261 v.ident = fmt.Sprintf("%s %s", v.ident, lit) 262 s.lastScanOffset = pos.Offset 263 v.offset = pos.Offset 264 if tok == as { 265 s.lastKeyword = asof 266 return asof 267 } 268 s.lastKeyword = memberof 269 return memberof 270 } 271 if tok == to { 272 tok1, tok2 := s.getNextTwoTokens() 273 if tok1 == timestampType && tok2 == stringLit { 274 _, pos, lit = s.scan() 275 v.ident = fmt.Sprintf("%s %s", v.ident, lit) 276 s.lastKeyword = toTimestamp 277 s.lastScanOffset = pos.Offset 278 v.offset = pos.Offset 279 return toTimestamp 280 } 281 } 282 // fix shift/reduce conflict with DEFINED NULL BY xxx OPTIONALLY ENCLOSED 283 if tok == optionally { 284 tok1, tok2 := s.getNextTwoTokens() 285 if tok1 == enclosed && tok2 == by { 286 _, _, lit = s.scan() 287 _, pos2, lit2 := s.scan() 288 v.ident = fmt.Sprintf("%s %s %s", v.ident, lit, lit2) 289 s.lastKeyword = optionallyEnclosedBy 290 s.lastScanOffset = pos2.Offset 291 v.offset = pos2.Offset 292 return optionallyEnclosedBy 293 } 294 } 295 296 switch tok { 297 case intLit: 298 return toInt(s, v, lit) 299 case floatLit: 300 return toFloat(s, v, lit) 301 case decLit: 302 return toDecimal(s, v, lit) 303 case hexLit: 304 return toHex(s, v, lit) 305 case bitLit: 306 return toBit(s, v, lit) 307 case singleAtIdentifier, doubleAtIdentifier, cast, extract: 308 v.item = lit 309 return tok 310 case null: 311 v.item = nil 312 case quotedIdentifier, identifier: 313 tok = identifier 314 s.identifierDot = s.r.peek() == '.' 315 tok, v.ident = s.convert2System(tok, lit) 316 case stringLit: 317 tok, v.ident = s.convert2Connection(tok, lit) 318 } 319 320 return tok 321 } 322 323 // LexLiteral returns the value of the converted literal 324 func (s *Scanner) LexLiteral() interface{} { 325 symType := &yySymType{} 326 s.Lex(symType) 327 if symType.item == nil { 328 return symType.ident 329 } 330 return symType.item 331 } 332 333 // SetSQLMode sets the SQL mode for scanner. 334 func (s *Scanner) SetSQLMode(mode mysql.SQLMode) { 335 s.sqlMode = mode 336 } 337 338 // GetSQLMode return the SQL mode of scanner. 339 func (s *Scanner) GetSQLMode() mysql.SQLMode { 340 return s.sqlMode 341 } 342 343 // EnableWindowFunc controls whether the scanner recognize the keywords of window function. 344 func (s *Scanner) EnableWindowFunc(val bool) { 345 s.supportWindowFunc = val 346 } 347 348 // setKeepHint set the keepHint flag when normalizing. 349 func (s *Scanner) setKeepHint(val bool) { 350 s.keepHint = val 351 } 352 353 // InheritScanner returns a new scanner object which inherits configurations from the parent scanner. 354 func (s *Scanner) InheritScanner(sql string) *Scanner { 355 return &Scanner{ 356 r: reader{s: sql}, 357 client: s.client, 358 sqlMode: s.sqlMode, 359 supportWindowFunc: s.supportWindowFunc, 360 } 361 } 362 363 // NewScanner returns a new scanner object. 364 func NewScanner(s string) *Scanner { 365 lexer := &Scanner{r: reader{s: s}} 366 lexer.reset(s) 367 return lexer 368 } 369 370 func (*Scanner) handleIdent(lval *yySymType) int { 371 str := lval.ident 372 // A character string literal may have an optional character set introducer and COLLATE clause: 373 // [_charset_name]'string' [COLLATE collation_name] 374 // See https://dev.mysql.com/doc/refman/5.7/en/charset-literal.html 375 if !strings.HasPrefix(str, "_") { 376 return identifier 377 } 378 cs, _ := charset.GetCharsetInfo(str[1:]) 379 if cs == nil { 380 return identifier 381 } 382 lval.ident = cs.Name 383 return underscoreCS 384 } 385 386 func (s *Scanner) skipWhitespace() byte { 387 return s.r.incAsLongAs(func(b byte) bool { 388 return unicode.IsSpace(rune(b)) 389 }) 390 } 391 392 func (s *Scanner) scan() (tok int, pos Pos, lit string) { 393 ch0 := s.r.peek() 394 if unicode.IsSpace(rune(ch0)) { 395 ch0 = s.skipWhitespace() 396 } 397 pos = s.r.pos() 398 if s.r.eof() { 399 // when scanner meets EOF, the returned token should be 0, 400 // because 0 is a special token id to remind the parser that stream is end. 401 return 0, pos, "" 402 } 403 404 if isIdentExtend(ch0) { 405 return scanIdentifier(s) 406 } 407 408 // search a trie to get a token. 409 node := &ruleTable 410 for !(node.childs[ch0] == nil || s.r.eof()) { 411 node = node.childs[ch0] 412 if node.fn != nil { 413 return node.fn(s) 414 } 415 s.r.inc() 416 ch0 = s.r.peek() 417 } 418 419 tok, lit = node.token, s.r.data(&pos) 420 return 421 } 422 423 func startWithXx(s *Scanner) (tok int, pos Pos, lit string) { 424 pos = s.r.pos() 425 s.r.inc() 426 if s.r.peek() == '\'' { 427 s.r.inc() 428 s.scanHex() 429 if s.r.peek() == '\'' { 430 s.r.inc() 431 tok, lit = hexLit, s.r.data(&pos) 432 } else { 433 tok = invalid 434 } 435 return 436 } 437 s.r.updatePos(pos) 438 return scanIdentifier(s) 439 } 440 441 func startWithNn(s *Scanner) (tok int, pos Pos, lit string) { 442 tok, pos, lit = scanIdentifier(s) 443 // The National Character Set, N'some text' or n'some test'. 444 // See https://dev.mysql.com/doc/refman/5.7/en/string-literals.html 445 // and https://dev.mysql.com/doc/refman/5.7/en/charset-national.html 446 if lit == "N" || lit == "n" { 447 if s.r.peek() == '\'' { 448 tok = underscoreCS 449 lit = "utf8" 450 } 451 } 452 return 453 } 454 455 func startWithBb(s *Scanner) (tok int, pos Pos, lit string) { 456 pos = s.r.pos() 457 s.r.inc() 458 if s.r.peek() == '\'' { 459 s.r.inc() 460 s.scanBit() 461 if s.r.peek() == '\'' { 462 s.r.inc() 463 tok, lit = bitLit, s.r.data(&pos) 464 } else { 465 tok = invalid 466 } 467 return 468 } 469 s.r.updatePos(pos) 470 return scanIdentifier(s) 471 } 472 473 func startWithSharp(s *Scanner) (tok int, pos Pos, lit string) { 474 s.r.incAsLongAs(func(ch byte) bool { 475 return ch != '\n' 476 }) 477 return s.scan() 478 } 479 480 func startWithDash(s *Scanner) (tok int, pos Pos, lit string) { 481 pos = s.r.pos() 482 if strings.HasPrefix(s.r.s[pos.Offset:], "--") { 483 remainLen := len(s.r.s[pos.Offset:]) 484 if remainLen == 2 || (remainLen > 2 && unicode.IsSpace(rune(s.r.s[pos.Offset+2]))) { 485 s.r.incAsLongAs(func(ch byte) bool { 486 return ch != '\n' 487 }) 488 return s.scan() 489 } 490 } 491 if strings.HasPrefix(s.r.s[pos.Offset:], "->>") { 492 tok = juss 493 s.r.incN(3) 494 return 495 } 496 if strings.HasPrefix(s.r.s[pos.Offset:], "->") { 497 tok = jss 498 s.r.incN(2) 499 return 500 } 501 tok = int('-') 502 lit = "-" 503 s.r.inc() 504 return 505 } 506 507 func startWithSlash(s *Scanner) (tok int, pos Pos, lit string) { 508 pos = s.r.pos() 509 s.r.inc() 510 if s.r.peek() != '*' { 511 tok = int('/') 512 lit = "/" 513 return 514 } 515 516 isOptimizerHint := false 517 currentCharIsStar := false 518 519 s.r.inc() // we see '/*' so far. 520 switch s.r.readByte() { 521 case '!': // '/*!' MySQL-specific comments 522 // See http://dev.mysql.com/doc/refman/5.7/en/comments.html 523 // in '/*!', which we always recognize regardless of version. 524 s.scanVersionDigits(5, 5) 525 s.inBangComment = true 526 return s.scan() 527 528 case 'T': // '/*T' maybe TiDB-specific comments 529 if s.r.peek() != '!' { 530 // '/*TX' is just normal comment. 531 break 532 } 533 s.r.inc() 534 // in '/*T!', try to match the pattern '/*T![feature1,feature2,...]'. 535 features := s.scanFeatureIDs() 536 if tidbfeature.CanParseFeature(features...) { 537 s.inBangComment = true 538 return s.scan() 539 } 540 case 'M': // '/*M' maybe MariaDB-specific comments 541 // no special treatment for now. 542 543 case '+': // '/*+' optimizer hints 544 // See https://dev.mysql.com/doc/refman/5.7/en/optimizer-hints.html 545 if _, ok := hintedTokens[s.lastKeyword]; ok || s.keepHint { 546 // only recognize optimizers hints directly followed by certain 547 // keywords like SELECT, INSERT, etc., only a special case "FOR UPDATE" needs to be handled 548 // we will report a warning in order to match MySQL's behavior, but the hint content will be ignored 549 if s.lastKeyword2 == forKwd { 550 if s.lastKeyword3 == binding { 551 // special case of `create binding for update` 552 isOptimizerHint = true 553 } else { 554 s.warns = append(s.warns, ParseErrorWith(s.r.data(&pos), s.r.p.Line)) 555 } 556 } else { 557 isOptimizerHint = true 558 } 559 } else { 560 s.AppendWarn(ErrWarnOptimizerHintWrongPos) 561 } 562 563 case '*': // '/**' if the next char is '/' it would close the comment. 564 currentCharIsStar = true 565 566 default: 567 } 568 569 // standard C-like comment. read until we see '*/' then drop it. 570 for { 571 if currentCharIsStar || s.r.incAsLongAs(func(ch byte) bool { return ch != '*' }) == '*' { 572 switch s.r.readByte() { 573 case '/': 574 // Meets */, means comment end. 575 if isOptimizerHint { 576 s.lastHintPos = pos 577 return hintComment, pos, s.r.data(&pos) 578 } 579 return s.scan() 580 case '*': 581 currentCharIsStar = true 582 continue 583 default: 584 currentCharIsStar = false 585 continue 586 } 587 } 588 // unclosed comment or other errors. 589 s.errs = append(s.errs, ParseErrorWith(s.r.data(&pos), s.r.p.Line)) 590 return 591 } 592 } 593 594 func startWithStar(s *Scanner) (tok int, pos Pos, lit string) { 595 pos = s.r.pos() 596 s.r.inc() 597 598 // skip and exit '/*!' if we see '*/' 599 if s.inBangComment && s.r.peek() == '/' { 600 s.inBangComment = false 601 s.r.inc() 602 return s.scan() 603 } 604 // otherwise it is just a normal star. 605 s.identifierDot = false 606 return '*', pos, "*" 607 } 608 609 func startWithAt(s *Scanner) (tok int, pos Pos, lit string) { 610 pos = s.r.pos() 611 s.r.inc() 612 613 tok, lit = scanIdentifierOrString(s) 614 switch tok { 615 case '@': 616 s.r.inc() 617 stream := s.r.s[pos.Offset+2:] 618 var prefix string 619 for _, v := range []string{"global.", "session.", "local."} { 620 if len(v) > len(stream) { 621 continue 622 } 623 if strings.EqualFold(stream[:len(v)], v) { 624 prefix = v 625 s.r.incN(len(v)) 626 break 627 } 628 } 629 tok, lit = scanIdentifierOrString(s) 630 switch tok { 631 case stringLit, quotedIdentifier: 632 var sb strings.Builder 633 sb.WriteString("@@") 634 sb.WriteString(prefix) 635 sb.WriteString(lit) 636 tok, lit = doubleAtIdentifier, sb.String() 637 case identifier: 638 tok, lit = doubleAtIdentifier, s.r.data(&pos) 639 } 640 case invalid: 641 return 642 default: 643 tok = singleAtIdentifier 644 } 645 646 return 647 } 648 649 func scanIdentifier(s *Scanner) (int, Pos, string) { 650 pos := s.r.pos() 651 s.r.incAsLongAs(isIdentChar) 652 return identifier, pos, s.r.data(&pos) 653 } 654 655 func scanIdentifierOrString(s *Scanner) (tok int, lit string) { 656 ch1 := s.r.peek() 657 switch ch1 { 658 case '\'', '"': 659 tok, _, lit = startString(s) 660 case '`': 661 tok, _, lit = scanQuotedIdent(s) 662 default: 663 if isUserVarChar(ch1) { 664 pos := s.r.pos() 665 s.r.incAsLongAs(isUserVarChar) 666 tok, lit = identifier, s.r.data(&pos) 667 } else { 668 tok = int(ch1) 669 } 670 } 671 return 672 } 673 674 var ( 675 quotedIdentifier = -identifier 676 ) 677 678 func scanQuotedIdent(s *Scanner) (tok int, pos Pos, lit string) { 679 pos = s.r.pos() 680 s.r.inc() 681 s.buf.Reset() 682 for !s.r.eof() { 683 tPos := s.r.pos() 684 if s.r.skipRune(s.client) { 685 s.buf.WriteString(s.r.data(&tPos)) 686 continue 687 } 688 ch := s.r.readByte() 689 if ch == '`' { 690 if s.r.peek() != '`' { 691 // don't return identifier in case that it's interpreted as keyword token later. 692 tok, lit = quotedIdentifier, s.buf.String() 693 return 694 } 695 s.r.inc() 696 } 697 s.buf.WriteByte(ch) 698 } 699 tok = invalid 700 return 701 } 702 703 func startString(s *Scanner) (tok int, pos Pos, lit string) { 704 return s.scanString() 705 } 706 707 // lazyBuf is used to avoid allocation if possible. 708 // it has a useBuf field indicates whether bytes.Buffer is necessary. if 709 // useBuf is false, we can avoid calling bytes.Buffer.String(), which 710 // make a copy of data and cause allocation. 711 type lazyBuf struct { 712 useBuf bool 713 r *reader 714 b *bytes.Buffer 715 p *Pos 716 } 717 718 func (mb *lazyBuf) setUseBuf(str string) { 719 if !mb.useBuf { 720 mb.useBuf = true 721 mb.b.Reset() 722 mb.b.WriteString(str) 723 } 724 } 725 726 func (mb *lazyBuf) writeRune(r rune, w int) { 727 if mb.useBuf { 728 if w > 1 { 729 mb.b.WriteRune(r) 730 } else { 731 mb.b.WriteByte(byte(r)) 732 } 733 } 734 } 735 736 func (mb *lazyBuf) data() string { 737 var lit string 738 if mb.useBuf { 739 lit = mb.b.String() 740 } else { 741 lit = mb.r.data(mb.p) 742 lit = lit[1 : len(lit)-1] 743 } 744 return lit 745 } 746 747 func (s *Scanner) scanString() (tok int, pos Pos, lit string) { 748 tok, pos = stringLit, s.r.pos() 749 ending := s.r.readByte() 750 s.buf.Reset() 751 for !s.r.eof() { 752 tPos := s.r.pos() 753 if s.r.skipRune(s.client) { 754 s.buf.WriteString(s.r.data(&tPos)) 755 continue 756 } 757 ch0 := s.r.readByte() 758 if ch0 == ending { 759 if s.r.peek() != ending { 760 lit = s.buf.String() 761 return 762 } 763 s.r.inc() 764 s.buf.WriteByte(ch0) 765 } else if ch0 == '\\' && !s.sqlMode.HasNoBackslashEscapesMode() { 766 if s.r.eof() { 767 break 768 } 769 s.handleEscape(s.r.peek(), &s.buf) 770 s.r.inc() 771 } else { 772 s.buf.WriteByte(ch0) 773 } 774 } 775 776 tok = invalid 777 return 778 } 779 780 // handleEscape handles the case in scanString when previous char is '\'. 781 func (*Scanner) handleEscape(b byte, buf *bytes.Buffer) { 782 var ch0 byte 783 /* 784 \" \' \\ \n \0 \b \Z \r \t ==> escape to one char 785 \% \_ ==> preserve both char 786 other ==> remove \ 787 */ 788 switch b { 789 case 'n': 790 ch0 = '\n' 791 case '0': 792 ch0 = 0 793 case 'b': 794 ch0 = 8 795 case 'Z': 796 ch0 = 26 797 case 'r': 798 ch0 = '\r' 799 case 't': 800 ch0 = '\t' 801 case '%', '_': 802 buf.WriteByte('\\') 803 ch0 = b 804 default: 805 ch0 = b 806 } 807 buf.WriteByte(ch0) 808 } 809 810 func startWithNumber(s *Scanner) (tok int, pos Pos, lit string) { 811 if s.identifierDot { 812 return scanIdentifier(s) 813 } 814 pos = s.r.pos() 815 tok = intLit 816 ch0 := s.r.readByte() 817 if ch0 == '0' { 818 tok = intLit 819 ch1 := s.r.peek() 820 switch { 821 case ch1 >= '0' && ch1 <= '7': 822 s.r.inc() 823 s.scanOct() 824 case ch1 == 'x' || ch1 == 'X': 825 s.r.inc() 826 p1 := s.r.pos() 827 s.scanHex() 828 p2 := s.r.pos() 829 // 0x, 0x7fz3 are identifier 830 if p1 == p2 || isDigit(s.r.peek()) { 831 s.r.incAsLongAs(isIdentChar) 832 return identifier, pos, s.r.data(&pos) 833 } 834 tok = hexLit 835 case ch1 == 'b': 836 s.r.inc() 837 p1 := s.r.pos() 838 s.scanBit() 839 p2 := s.r.pos() 840 // 0b, 0b123, 0b1ab are identifier 841 if p1 == p2 || isDigit(s.r.peek()) { 842 s.r.incAsLongAs(isIdentChar) 843 return identifier, pos, s.r.data(&pos) 844 } 845 tok = bitLit 846 case ch1 == '.': 847 return s.scanFloat(&pos) 848 case ch1 == 'B': 849 s.r.incAsLongAs(isIdentChar) 850 return identifier, pos, s.r.data(&pos) 851 } 852 } 853 854 s.scanDigits() 855 ch0 = s.r.peek() 856 if ch0 == '.' || ch0 == 'e' || ch0 == 'E' { 857 return s.scanFloat(&pos) 858 } 859 860 // Identifiers may begin with a digit but unless quoted may not consist solely of digits. 861 if !s.r.eof() && isIdentChar(ch0) { 862 s.r.incAsLongAs(isIdentChar) 863 return identifier, pos, s.r.data(&pos) 864 } 865 lit = s.r.data(&pos) 866 return 867 } 868 869 func startWithDot(s *Scanner) (tok int, pos Pos, lit string) { 870 pos = s.r.pos() 871 s.r.inc() 872 if s.identifierDot { 873 return int('.'), pos, "." 874 } 875 if isDigit(s.r.peek()) { 876 tok, p, l := s.scanFloat(&pos) 877 if tok == identifier { 878 return invalid, p, l 879 } 880 return tok, p, l 881 } 882 tok, lit = int('.'), "." 883 return 884 } 885 886 func (s *Scanner) scanOct() { 887 s.r.incAsLongAs(func(ch byte) bool { 888 return ch >= '0' && ch <= '7' 889 }) 890 } 891 892 func (s *Scanner) scanHex() { 893 s.r.incAsLongAs(func(ch byte) bool { 894 return ch >= '0' && ch <= '9' || 895 ch >= 'a' && ch <= 'f' || 896 ch >= 'A' && ch <= 'F' 897 }) 898 } 899 900 func (s *Scanner) scanBit() { 901 s.r.incAsLongAs(func(ch byte) bool { 902 return ch == '0' || ch == '1' 903 }) 904 } 905 906 func (s *Scanner) scanFloat(beg *Pos) (tok int, pos Pos, lit string) { 907 s.r.updatePos(*beg) 908 // float = D1 . D2 e D3 909 s.scanDigits() 910 ch0 := s.r.peek() 911 if ch0 == '.' { 912 s.r.inc() 913 s.scanDigits() 914 ch0 = s.r.peek() 915 } 916 if ch0 == 'e' || ch0 == 'E' { 917 s.r.inc() 918 ch0 = s.r.peek() 919 if ch0 == '-' || ch0 == '+' { 920 s.r.inc() 921 } 922 if isDigit(s.r.peek()) { 923 s.scanDigits() 924 tok = floatLit 925 } else { 926 // D1 . D2 e XX when XX is not D3, parse the result to an identifier. 927 // 9e9e = 9e9(float) + e(identifier) 928 // 9est = 9est(identifier) 929 s.r.updatePos(*beg) 930 s.r.incAsLongAs(isIdentChar) 931 tok = identifier 932 } 933 } else { 934 tok = decLit 935 } 936 pos, lit = *beg, s.r.data(beg) 937 return 938 } 939 940 func (s *Scanner) scanDigits() string { 941 pos := s.r.pos() 942 s.r.incAsLongAs(isDigit) 943 return s.r.data(&pos) 944 } 945 946 // scanVersionDigits scans for `min` to `max` digits (range inclusive) used in 947 // `/*!12345 ... */` comments. 948 func (s *Scanner) scanVersionDigits(min, max int) { 949 pos := s.r.pos() 950 for i := 0; i < max; i++ { 951 ch := s.r.peek() 952 if isDigit(ch) { 953 s.r.inc() 954 } else if i < min { 955 s.r.updatePos(pos) 956 return 957 } else { 958 break 959 } 960 } 961 } 962 963 func (s *Scanner) scanFeatureIDs() (featureIDs []string) { 964 pos := s.r.pos() 965 const init, expectChar, obtainChar = 0, 1, 2 966 state := init 967 var b strings.Builder 968 for !s.r.eof() { 969 ch := s.r.peek() 970 s.r.inc() 971 switch state { 972 case init: 973 if ch == '[' { 974 state = expectChar 975 break 976 } 977 s.r.updatePos(pos) 978 return nil 979 case expectChar: 980 if isIdentChar(ch) { 981 b.WriteByte(ch) 982 state = obtainChar 983 break 984 } 985 s.r.updatePos(pos) 986 return nil 987 case obtainChar: 988 if isIdentChar(ch) { 989 b.WriteByte(ch) 990 state = obtainChar 991 break 992 } else if ch == ',' { 993 featureIDs = append(featureIDs, b.String()) 994 b.Reset() 995 state = expectChar 996 break 997 } else if ch == ']' { 998 featureIDs = append(featureIDs, b.String()) 999 return featureIDs 1000 } 1001 s.r.updatePos(pos) 1002 return nil 1003 } 1004 } 1005 s.r.updatePos(pos) 1006 return nil 1007 } 1008 1009 func (s *Scanner) lastErrorAsWarn() { 1010 if len(s.errs) == 0 { 1011 return 1012 } 1013 s.warns = append(s.warns, s.errs[len(s.errs)-1]) 1014 s.errs = s.errs[:len(s.errs)-1] 1015 } 1016 1017 type reader struct { 1018 s string 1019 p Pos 1020 l int 1021 } 1022 1023 var eof = Pos{-1, -1, -1} 1024 1025 func (r *reader) eof() bool { 1026 return r.p.Offset >= r.l 1027 } 1028 1029 // peek() peeks a rune from underlying reader. 1030 // if reader meets EOF, it will return 0. to distinguish from 1031 // the real 0, the caller should call r.eof() again to check. 1032 func (r *reader) peek() byte { 1033 if r.eof() { 1034 return 0 1035 } 1036 return r.s[r.p.Offset] 1037 } 1038 1039 // inc increase the position offset of the reader. 1040 // peek must be called before calling inc! 1041 func (r *reader) inc() { 1042 if r.s[r.p.Offset] == '\n' { 1043 r.p.Line++ 1044 r.p.Col = 0 1045 } 1046 r.p.Offset++ 1047 r.p.Col++ 1048 } 1049 1050 func (r *reader) incN(n int) { 1051 for i := 0; i < n; i++ { 1052 r.inc() 1053 } 1054 } 1055 1056 func (r *reader) readByte() (ch byte) { 1057 ch = r.peek() 1058 if r.eof() { 1059 return 1060 } 1061 r.inc() 1062 return 1063 } 1064 1065 func (r *reader) pos() Pos { 1066 return r.p 1067 } 1068 1069 func (r *reader) updatePos(pos Pos) { 1070 r.p = pos 1071 } 1072 1073 func (r *reader) data(from *Pos) string { 1074 return r.s[from.Offset:r.p.Offset] 1075 } 1076 1077 func (r *reader) incAsLongAs(fn func(b byte) bool) byte { 1078 for { 1079 ch := r.peek() 1080 if !fn(ch) { 1081 return ch 1082 } 1083 if r.eof() { 1084 return 0 1085 } 1086 r.inc() 1087 } 1088 } 1089 1090 // skipRune skip mb character, return true indicate something has been skipped. 1091 func (r *reader) skipRune(enc charset.Encoding) bool { 1092 if r.s[r.p.Offset] <= unicode.MaxASCII { 1093 return false 1094 } 1095 c := enc.MbLen(r.s[r.p.Offset:]) 1096 r.incN(c) 1097 return c > 0 1098 }