github.com/pingcap/tidb/parser@v0.0.0-20231013125129-93a834a6bf8d/lexer.go (about)

     1  // Copyright 2016 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package parser
    15  
    16  import (
    17  	"bytes"
    18  	"fmt"
    19  	"strconv"
    20  	"strings"
    21  	"unicode"
    22  
    23  	"github.com/pingcap/tidb/parser/charset"
    24  	"github.com/pingcap/tidb/parser/mysql"
    25  	tidbfeature "github.com/pingcap/tidb/parser/tidb"
    26  )
    27  
    28  var _ = yyLexer(&Scanner{})
    29  
    // Pos is the location of a token in the scanned text: the line and column
    // counters maintained by the reader, plus the byte offset into the input.
    type Pos struct {
    	Line, Col, Offset int
    }
    36  
    37  // Scanner implements the yyLexer interface.
    38  type Scanner struct {
    39  	r   reader
    40  	buf bytes.Buffer
    41  
    42  	client     charset.Encoding
    43  	connection charset.Encoding
    44  
    45  	errs         []error
    46  	warns        []error
    47  	stmtStartPos int
    48  
    49  	// inBangComment is true if we are inside a `/*! ... */` block.
    50  	// It is used to ignore a stray `*/` when scanning.
    51  	inBangComment bool
    52  
    53  	sqlMode mysql.SQLMode
    54  
    55  	// If the lexer should recognize keywords for window function.
    56  	// It may break the compatibility when support those keywords,
    57  	// because some application may already use them as identifiers.
    58  	supportWindowFunc bool
    59  
    60  	// Whether record the original text keyword position to the AST node.
    61  	skipPositionRecording bool
    62  
    63  	// lastScanOffset indicates last offset returned by scan().
    64  	// It's used to substring sql in syntax error message.
    65  	lastScanOffset int
    66  
    67  	// lastKeyword records the previous keyword returned by scan().
    68  	// determine whether an optimizer hint should be parsed or ignored.
    69  	lastKeyword int
    70  	// lastKeyword2 records the keyword before lastKeyword, it is used
    71  	// to disambiguate hint after for update, which should be ignored.
    72  	lastKeyword2 int
    73  	// lastKeyword3 records the keyword before lastKeyword2, it is used
    74  	// to disambiguate hint after create binding for update, which should
    75  	// be pertained.
    76  	lastKeyword3 int
    77  
    78  	// hintPos records the start position of the previous optimizer hint.
    79  	lastHintPos Pos
    80  
    81  	// true if a dot follows an identifier
    82  	identifierDot bool
    83  
    84  	// keepHint, if true, Scanner will keep hint when normalizing .
    85  	keepHint bool
    86  }
    87  
    88  // Errors returns the errors and warns during a scan.
    89  func (s *Scanner) Errors() (warns []error, errs []error) {
    90  	return s.warns, s.errs
    91  }
    92  
    93  // reset resets the sql string to be scanned.
    94  func (s *Scanner) reset(sql string) {
    95  	s.client = charset.FindEncoding(mysql.DefaultCharset)
    96  	s.connection = charset.FindEncoding(mysql.DefaultCharset)
    97  	s.r = reader{s: sql, p: Pos{Line: 1}, l: len(sql)}
    98  	s.buf.Reset()
    99  	s.errs = s.errs[:0]
   100  	s.warns = s.warns[:0]
   101  	s.stmtStartPos = 0
   102  	s.inBangComment = false
   103  	s.lastKeyword = 0
   104  	s.identifierDot = false
   105  }
   106  
   107  func (s *Scanner) stmtText() string {
   108  	endPos := s.r.pos().Offset
   109  	if s.r.s[endPos-1] == '\n' {
   110  		endPos = endPos - 1 // trim new line
   111  	}
   112  	if s.r.s[s.stmtStartPos] == '\n' {
   113  		s.stmtStartPos++
   114  	}
   115  
   116  	text := s.r.s[s.stmtStartPos:endPos]
   117  
   118  	s.stmtStartPos = endPos
   119  	return text
   120  }
   121  
   122  // Errorf tells scanner something is wrong.
   123  // Scanner satisfies yyLexer interface which need this function.
   124  func (s *Scanner) Errorf(format string, a ...interface{}) (err error) {
   125  	str := fmt.Sprintf(format, a...)
   126  	val := s.r.s[s.lastScanOffset:]
   127  	var lenStr = ""
   128  	if len(val) > 2048 {
   129  		lenStr = "(total length " + strconv.Itoa(len(val)) + ")"
   130  		val = val[:2048]
   131  	}
   132  	err = fmt.Errorf("line %d column %d near \"%s\"%s %s",
   133  		s.r.p.Line, s.r.p.Col, val, str, lenStr)
   134  	return
   135  }
   136  
   137  // AppendError sets error into scanner.
   138  // Scanner satisfies yyLexer interface which need this function.
   139  func (s *Scanner) AppendError(err error) {
   140  	if err == nil {
   141  		return
   142  	}
   143  	s.errs = append(s.errs, err)
   144  }
   145  
   146  // AppendWarn sets warning into scanner.
   147  func (s *Scanner) AppendWarn(err error) {
   148  	if err == nil {
   149  		return
   150  	}
   151  	s.warns = append(s.warns, err)
   152  }
   153  
   154  // convert2System convert lit from client encoding to system encoding which is utf8mb4.
   155  func (s *Scanner) convert2System(tok int, lit string) (int, string) {
   156  	utf8Lit, err := s.client.Transform(nil, charset.HackSlice(lit), charset.OpDecodeReplace)
   157  	if err != nil {
   158  		s.AppendWarn(err)
   159  	}
   160  
   161  	return tok, charset.HackString(utf8Lit)
   162  }
   163  
   164  // convert2Connection convert lit from client encoding to connection encoding.
   165  func (s *Scanner) convert2Connection(tok int, lit string) (int, string) {
   166  	if mysql.IsUTF8Charset(s.client.Name()) {
   167  		return tok, lit
   168  	}
   169  	utf8Lit, err := s.client.Transform(nil, charset.HackSlice(lit), charset.OpDecodeReplace)
   170  	if err != nil {
   171  		s.AppendError(err)
   172  		if s.sqlMode.HasStrictMode() && s.client.Tp() == s.connection.Tp() {
   173  			return invalid, lit
   174  		}
   175  		s.lastErrorAsWarn()
   176  	}
   177  
   178  	// It is definitely valid if `client` is the same with `connection`, so just transform if they are not the same.
   179  	if s.client.Tp() != s.connection.Tp() {
   180  		utf8Lit, _ = s.connection.Transform(nil, utf8Lit, charset.OpReplaceNoErr)
   181  	}
   182  	return tok, charset.HackString(utf8Lit)
   183  }
   184  
   185  func (s *Scanner) getNextToken() int {
   186  	r := s.r
   187  	tok, pos, lit := s.scan()
   188  	if tok == identifier {
   189  		tok = s.handleIdent(&yySymType{})
   190  	}
   191  	if tok == identifier {
   192  		if tok1 := s.isTokenIdentifier(lit, pos.Offset); tok1 != 0 {
   193  			tok = tok1
   194  		}
   195  	}
   196  	s.r = r
   197  	return tok
   198  }
   199  
   200  func (s *Scanner) getNextTwoTokens() (tok1 int, tok2 int) {
   201  	r := s.r
   202  	tok1, pos, lit := s.scan()
   203  	if tok1 == identifier {
   204  		tok1 = s.handleIdent(&yySymType{})
   205  	}
   206  	if tok1 == identifier {
   207  		if tmpToken := s.isTokenIdentifier(lit, pos.Offset); tmpToken != 0 {
   208  			tok1 = tmpToken
   209  		}
   210  	}
   211  	tok2, pos, lit = s.scan()
   212  	if tok2 == identifier {
   213  		tok2 = s.handleIdent(&yySymType{})
   214  	}
   215  	if tok2 == identifier {
   216  		if tmpToken := s.isTokenIdentifier(lit, pos.Offset); tmpToken != 0 {
   217  			tok2 = tmpToken
   218  		}
   219  	}
   220  	s.r = r
   221  	return tok1, tok2
   222  }
   223  
   224  // Lex returns a token and store the token value in v.
   225  // Scanner satisfies yyLexer interface.
   226  // 0 and invalid are special token id this function would return:
   227  // return 0 tells parser that scanner meets EOF,
   228  // return invalid tells parser that scanner meets illegal character.
   229  func (s *Scanner) Lex(v *yySymType) int {
   230  	tok, pos, lit := s.scan()
   231  	s.lastScanOffset = pos.Offset
   232  	s.lastKeyword3 = s.lastKeyword2
   233  	s.lastKeyword2 = s.lastKeyword
   234  	s.lastKeyword = 0
   235  	v.offset = pos.Offset
   236  	v.ident = lit
   237  	if tok == identifier {
   238  		tok = s.handleIdent(v)
   239  	}
   240  	if tok == identifier {
   241  		if tok1 := s.isTokenIdentifier(lit, pos.Offset); tok1 != 0 {
   242  			tok = tok1
   243  			s.lastKeyword = tok1
   244  		}
   245  	}
   246  	if s.sqlMode.HasANSIQuotesMode() &&
   247  		tok == stringLit &&
   248  		s.r.s[v.offset] == '"' {
   249  		tok = identifier
   250  	}
   251  
   252  	if tok == pipes && !(s.sqlMode.HasPipesAsConcatMode()) {
   253  		return pipesAsOr
   254  	}
   255  
   256  	if tok == not && s.sqlMode.HasHighNotPrecedenceMode() {
   257  		return not2
   258  	}
   259  	if (tok == as || tok == member) && s.getNextToken() == of {
   260  		_, pos, lit = s.scan()
   261  		v.ident = fmt.Sprintf("%s %s", v.ident, lit)
   262  		s.lastScanOffset = pos.Offset
   263  		v.offset = pos.Offset
   264  		if tok == as {
   265  			s.lastKeyword = asof
   266  			return asof
   267  		}
   268  		s.lastKeyword = memberof
   269  		return memberof
   270  	}
   271  	if tok == to {
   272  		tok1, tok2 := s.getNextTwoTokens()
   273  		if tok1 == timestampType && tok2 == stringLit {
   274  			_, pos, lit = s.scan()
   275  			v.ident = fmt.Sprintf("%s %s", v.ident, lit)
   276  			s.lastKeyword = toTimestamp
   277  			s.lastScanOffset = pos.Offset
   278  			v.offset = pos.Offset
   279  			return toTimestamp
   280  		}
   281  	}
   282  	// fix shift/reduce conflict with DEFINED NULL BY xxx OPTIONALLY ENCLOSED
   283  	if tok == optionally {
   284  		tok1, tok2 := s.getNextTwoTokens()
   285  		if tok1 == enclosed && tok2 == by {
   286  			_, _, lit = s.scan()
   287  			_, pos2, lit2 := s.scan()
   288  			v.ident = fmt.Sprintf("%s %s %s", v.ident, lit, lit2)
   289  			s.lastKeyword = optionallyEnclosedBy
   290  			s.lastScanOffset = pos2.Offset
   291  			v.offset = pos2.Offset
   292  			return optionallyEnclosedBy
   293  		}
   294  	}
   295  
   296  	switch tok {
   297  	case intLit:
   298  		return toInt(s, v, lit)
   299  	case floatLit:
   300  		return toFloat(s, v, lit)
   301  	case decLit:
   302  		return toDecimal(s, v, lit)
   303  	case hexLit:
   304  		return toHex(s, v, lit)
   305  	case bitLit:
   306  		return toBit(s, v, lit)
   307  	case singleAtIdentifier, doubleAtIdentifier, cast, extract:
   308  		v.item = lit
   309  		return tok
   310  	case null:
   311  		v.item = nil
   312  	case quotedIdentifier, identifier:
   313  		tok = identifier
   314  		s.identifierDot = s.r.peek() == '.'
   315  		tok, v.ident = s.convert2System(tok, lit)
   316  	case stringLit:
   317  		tok, v.ident = s.convert2Connection(tok, lit)
   318  	}
   319  
   320  	return tok
   321  }
   322  
   323  // LexLiteral returns the value of the converted literal
   324  func (s *Scanner) LexLiteral() interface{} {
   325  	symType := &yySymType{}
   326  	s.Lex(symType)
   327  	if symType.item == nil {
   328  		return symType.ident
   329  	}
   330  	return symType.item
   331  }
   332  
   333  // SetSQLMode sets the SQL mode that the scanner consults while lexing.
   334  func (s *Scanner) SetSQLMode(mode mysql.SQLMode) {
   335  	s.sqlMode = mode
   336  }
   337  
   338  // GetSQLMode returns the SQL mode of the scanner.
   339  func (s *Scanner) GetSQLMode() mysql.SQLMode {
   340  	return s.sqlMode
   341  }
   342  
   343  // EnableWindowFunc controls whether the scanner recognizes the keywords of window functions.
   344  func (s *Scanner) EnableWindowFunc(val bool) {
   345  	s.supportWindowFunc = val
   346  }
   347  
   348  // setKeepHint sets the keepHint flag, which makes the scanner keep hints when normalizing.
   349  func (s *Scanner) setKeepHint(val bool) {
   350  	s.keepHint = val
   351  }
   352  
   353  // InheritScanner returns a new scanner object which inherits configurations from the parent scanner.
   354  func (s *Scanner) InheritScanner(sql string) *Scanner {
   355  	return &Scanner{
   356  		r:                 reader{s: sql},
   357  		client:            s.client,
   358  		sqlMode:           s.sqlMode,
   359  		supportWindowFunc: s.supportWindowFunc,
   360  	}
   361  }
   362  
   363  // NewScanner returns a new scanner object.
   364  func NewScanner(s string) *Scanner {
   365  	lexer := &Scanner{r: reader{s: s}}
   366  	lexer.reset(s)
   367  	return lexer
   368  }
   369  
   370  func (*Scanner) handleIdent(lval *yySymType) int {
   371  	str := lval.ident
   372  	// A character string literal may have an optional character set introducer and COLLATE clause:
   373  	// [_charset_name]'string' [COLLATE collation_name]
   374  	// See https://dev.mysql.com/doc/refman/5.7/en/charset-literal.html
   375  	if !strings.HasPrefix(str, "_") {
   376  		return identifier
   377  	}
   378  	cs, _ := charset.GetCharsetInfo(str[1:])
   379  	if cs == nil {
   380  		return identifier
   381  	}
   382  	lval.ident = cs.Name
   383  	return underscoreCS
   384  }
   385  
   386  func (s *Scanner) skipWhitespace() byte {
   387  	return s.r.incAsLongAs(func(b byte) bool {
   388  		return unicode.IsSpace(rune(b))
   389  	})
   390  }
   391  
   392  func (s *Scanner) scan() (tok int, pos Pos, lit string) {
   393  	ch0 := s.r.peek()
   394  	if unicode.IsSpace(rune(ch0)) {
   395  		ch0 = s.skipWhitespace()
   396  	}
   397  	pos = s.r.pos()
   398  	if s.r.eof() {
   399  		// when scanner meets EOF, the returned token should be 0,
   400  		// because 0 is a special token id to remind the parser that stream is end.
   401  		return 0, pos, ""
   402  	}
   403  
   404  	if isIdentExtend(ch0) {
   405  		return scanIdentifier(s)
   406  	}
   407  
   408  	// search a trie to get a token.
   409  	node := &ruleTable
   410  	for !(node.childs[ch0] == nil || s.r.eof()) {
   411  		node = node.childs[ch0]
   412  		if node.fn != nil {
   413  			return node.fn(s)
   414  		}
   415  		s.r.inc()
   416  		ch0 = s.r.peek()
   417  	}
   418  
   419  	tok, lit = node.token, s.r.data(&pos)
   420  	return
   421  }
   422  
   423  func startWithXx(s *Scanner) (tok int, pos Pos, lit string) {
   424  	pos = s.r.pos()
   425  	s.r.inc()
   426  	if s.r.peek() == '\'' {
   427  		s.r.inc()
   428  		s.scanHex()
   429  		if s.r.peek() == '\'' {
   430  			s.r.inc()
   431  			tok, lit = hexLit, s.r.data(&pos)
   432  		} else {
   433  			tok = invalid
   434  		}
   435  		return
   436  	}
   437  	s.r.updatePos(pos)
   438  	return scanIdentifier(s)
   439  }
   440  
   441  func startWithNn(s *Scanner) (tok int, pos Pos, lit string) {
   442  	tok, pos, lit = scanIdentifier(s)
   443  	// The National Character Set, N'some text' or n'some test'.
   444  	// See https://dev.mysql.com/doc/refman/5.7/en/string-literals.html
   445  	// and https://dev.mysql.com/doc/refman/5.7/en/charset-national.html
   446  	if lit == "N" || lit == "n" {
   447  		if s.r.peek() == '\'' {
   448  			tok = underscoreCS
   449  			lit = "utf8"
   450  		}
   451  	}
   452  	return
   453  }
   454  
   455  func startWithBb(s *Scanner) (tok int, pos Pos, lit string) {
   456  	pos = s.r.pos()
   457  	s.r.inc()
   458  	if s.r.peek() == '\'' {
   459  		s.r.inc()
   460  		s.scanBit()
   461  		if s.r.peek() == '\'' {
   462  			s.r.inc()
   463  			tok, lit = bitLit, s.r.data(&pos)
   464  		} else {
   465  			tok = invalid
   466  		}
   467  		return
   468  	}
   469  	s.r.updatePos(pos)
   470  	return scanIdentifier(s)
   471  }
   472  
   473  func startWithSharp(s *Scanner) (tok int, pos Pos, lit string) {
   474  	s.r.incAsLongAs(func(ch byte) bool {
   475  		return ch != '\n'
   476  	})
   477  	return s.scan()
   478  }
   479  
   480  func startWithDash(s *Scanner) (tok int, pos Pos, lit string) {
   481  	pos = s.r.pos()
   482  	if strings.HasPrefix(s.r.s[pos.Offset:], "--") {
   483  		remainLen := len(s.r.s[pos.Offset:])
   484  		if remainLen == 2 || (remainLen > 2 && unicode.IsSpace(rune(s.r.s[pos.Offset+2]))) {
   485  			s.r.incAsLongAs(func(ch byte) bool {
   486  				return ch != '\n'
   487  			})
   488  			return s.scan()
   489  		}
   490  	}
   491  	if strings.HasPrefix(s.r.s[pos.Offset:], "->>") {
   492  		tok = juss
   493  		s.r.incN(3)
   494  		return
   495  	}
   496  	if strings.HasPrefix(s.r.s[pos.Offset:], "->") {
   497  		tok = jss
   498  		s.r.incN(2)
   499  		return
   500  	}
   501  	tok = int('-')
   502  	lit = "-"
   503  	s.r.inc()
   504  	return
   505  }
   506  
   507  func startWithSlash(s *Scanner) (tok int, pos Pos, lit string) {
   508  	pos = s.r.pos()
   509  	s.r.inc()
   510  	if s.r.peek() != '*' {
   511  		tok = int('/')
   512  		lit = "/"
   513  		return
   514  	}
   515  
   516  	isOptimizerHint := false
   517  	currentCharIsStar := false
   518  
   519  	s.r.inc() // we see '/*' so far.
   520  	switch s.r.readByte() {
   521  	case '!': // '/*!' MySQL-specific comments
   522  		// See http://dev.mysql.com/doc/refman/5.7/en/comments.html
   523  		// in '/*!', which we always recognize regardless of version.
   524  		s.scanVersionDigits(5, 5)
   525  		s.inBangComment = true
   526  		return s.scan()
   527  
   528  	case 'T': // '/*T' maybe TiDB-specific comments
   529  		if s.r.peek() != '!' {
   530  			// '/*TX' is just normal comment.
   531  			break
   532  		}
   533  		s.r.inc()
   534  		// in '/*T!', try to match the pattern '/*T![feature1,feature2,...]'.
   535  		features := s.scanFeatureIDs()
   536  		if tidbfeature.CanParseFeature(features...) {
   537  			s.inBangComment = true
   538  			return s.scan()
   539  		}
   540  	case 'M': // '/*M' maybe MariaDB-specific comments
   541  		// no special treatment for now.
   542  
   543  	case '+': // '/*+' optimizer hints
   544  		// See https://dev.mysql.com/doc/refman/5.7/en/optimizer-hints.html
   545  		if _, ok := hintedTokens[s.lastKeyword]; ok || s.keepHint {
   546  			// only recognize optimizers hints directly followed by certain
   547  			// keywords like SELECT, INSERT, etc., only a special case "FOR UPDATE" needs to be handled
   548  			// we will report a warning in order to match MySQL's behavior, but the hint content will be ignored
   549  			if s.lastKeyword2 == forKwd {
   550  				if s.lastKeyword3 == binding {
   551  					// special case of `create binding for update`
   552  					isOptimizerHint = true
   553  				} else {
   554  					s.warns = append(s.warns, ParseErrorWith(s.r.data(&pos), s.r.p.Line))
   555  				}
   556  			} else {
   557  				isOptimizerHint = true
   558  			}
   559  		} else {
   560  			s.AppendWarn(ErrWarnOptimizerHintWrongPos)
   561  		}
   562  
   563  	case '*': // '/**' if the next char is '/' it would close the comment.
   564  		currentCharIsStar = true
   565  
   566  	default:
   567  	}
   568  
   569  	// standard C-like comment. read until we see '*/' then drop it.
   570  	for {
   571  		if currentCharIsStar || s.r.incAsLongAs(func(ch byte) bool { return ch != '*' }) == '*' {
   572  			switch s.r.readByte() {
   573  			case '/':
   574  				// Meets */, means comment end.
   575  				if isOptimizerHint {
   576  					s.lastHintPos = pos
   577  					return hintComment, pos, s.r.data(&pos)
   578  				}
   579  				return s.scan()
   580  			case '*':
   581  				currentCharIsStar = true
   582  				continue
   583  			default:
   584  				currentCharIsStar = false
   585  				continue
   586  			}
   587  		}
   588  		// unclosed comment or other errors.
   589  		s.errs = append(s.errs, ParseErrorWith(s.r.data(&pos), s.r.p.Line))
   590  		return
   591  	}
   592  }
   593  
   594  func startWithStar(s *Scanner) (tok int, pos Pos, lit string) {
   595  	pos = s.r.pos()
   596  	s.r.inc()
   597  
   598  	// skip and exit '/*!' if we see '*/'
   599  	if s.inBangComment && s.r.peek() == '/' {
   600  		s.inBangComment = false
   601  		s.r.inc()
   602  		return s.scan()
   603  	}
   604  	// otherwise it is just a normal star.
   605  	s.identifierDot = false
   606  	return '*', pos, "*"
   607  }
   608  
   609  func startWithAt(s *Scanner) (tok int, pos Pos, lit string) {
   610  	pos = s.r.pos()
   611  	s.r.inc()
   612  
   613  	tok, lit = scanIdentifierOrString(s)
   614  	switch tok {
   615  	case '@':
   616  		s.r.inc()
   617  		stream := s.r.s[pos.Offset+2:]
   618  		var prefix string
   619  		for _, v := range []string{"global.", "session.", "local."} {
   620  			if len(v) > len(stream) {
   621  				continue
   622  			}
   623  			if strings.EqualFold(stream[:len(v)], v) {
   624  				prefix = v
   625  				s.r.incN(len(v))
   626  				break
   627  			}
   628  		}
   629  		tok, lit = scanIdentifierOrString(s)
   630  		switch tok {
   631  		case stringLit, quotedIdentifier:
   632  			var sb strings.Builder
   633  			sb.WriteString("@@")
   634  			sb.WriteString(prefix)
   635  			sb.WriteString(lit)
   636  			tok, lit = doubleAtIdentifier, sb.String()
   637  		case identifier:
   638  			tok, lit = doubleAtIdentifier, s.r.data(&pos)
   639  		}
   640  	case invalid:
   641  		return
   642  	default:
   643  		tok = singleAtIdentifier
   644  	}
   645  
   646  	return
   647  }
   648  
   649  func scanIdentifier(s *Scanner) (int, Pos, string) {
   650  	pos := s.r.pos()
   651  	s.r.incAsLongAs(isIdentChar)
   652  	return identifier, pos, s.r.data(&pos)
   653  }
   654  
   655  func scanIdentifierOrString(s *Scanner) (tok int, lit string) {
   656  	ch1 := s.r.peek()
   657  	switch ch1 {
   658  	case '\'', '"':
   659  		tok, _, lit = startString(s)
   660  	case '`':
   661  		tok, _, lit = scanQuotedIdent(s)
   662  	default:
   663  		if isUserVarChar(ch1) {
   664  			pos := s.r.pos()
   665  			s.r.incAsLongAs(isUserVarChar)
   666  			tok, lit = identifier, s.r.data(&pos)
   667  		} else {
   668  			tok = int(ch1)
   669  		}
   670  	}
   671  	return
   672  }
   673  
   674  var (
   675  	quotedIdentifier = -identifier
   676  )
   677  
   678  func scanQuotedIdent(s *Scanner) (tok int, pos Pos, lit string) {
   679  	pos = s.r.pos()
   680  	s.r.inc()
   681  	s.buf.Reset()
   682  	for !s.r.eof() {
   683  		tPos := s.r.pos()
   684  		if s.r.skipRune(s.client) {
   685  			s.buf.WriteString(s.r.data(&tPos))
   686  			continue
   687  		}
   688  		ch := s.r.readByte()
   689  		if ch == '`' {
   690  			if s.r.peek() != '`' {
   691  				// don't return identifier in case that it's interpreted as keyword token later.
   692  				tok, lit = quotedIdentifier, s.buf.String()
   693  				return
   694  			}
   695  			s.r.inc()
   696  		}
   697  		s.buf.WriteByte(ch)
   698  	}
   699  	tok = invalid
   700  	return
   701  }
   702  
   703  func startString(s *Scanner) (tok int, pos Pos, lit string) {
   704  	return s.scanString()
   705  }
   706  
   707  // lazyBuf is used to avoid allocation when possible.
   708  // Its useBuf field indicates whether the bytes.Buffer is needed. While
   709  // useBuf is false, data() slices the reader's input directly instead of
   710  // calling bytes.Buffer.String(), which copies the data and allocates.
   711  type lazyBuf struct {
   712  	useBuf bool // true once the content has been diverted into b
   713  	r      *reader // source that data() slices while useBuf is false
   714  	b      *bytes.Buffer // holds the content once useBuf is true
   715  	p      *Pos // start position handed to r.data by data()
   716  }
   717  
   718  func (mb *lazyBuf) setUseBuf(str string) {
   719  	if !mb.useBuf {
   720  		mb.useBuf = true
   721  		mb.b.Reset()
   722  		mb.b.WriteString(str)
   723  	}
   724  }
   725  
   726  func (mb *lazyBuf) writeRune(r rune, w int) {
   727  	if mb.useBuf {
   728  		if w > 1 {
   729  			mb.b.WriteRune(r)
   730  		} else {
   731  			mb.b.WriteByte(byte(r))
   732  		}
   733  	}
   734  }
   735  
   736  func (mb *lazyBuf) data() string {
   737  	var lit string
   738  	if mb.useBuf {
   739  		lit = mb.b.String()
   740  	} else {
   741  		lit = mb.r.data(mb.p)
   742  		lit = lit[1 : len(lit)-1]
   743  	}
   744  	return lit
   745  }
   746  
   747  func (s *Scanner) scanString() (tok int, pos Pos, lit string) {
   748  	tok, pos = stringLit, s.r.pos()
   749  	ending := s.r.readByte()
   750  	s.buf.Reset()
   751  	for !s.r.eof() {
   752  		tPos := s.r.pos()
   753  		if s.r.skipRune(s.client) {
   754  			s.buf.WriteString(s.r.data(&tPos))
   755  			continue
   756  		}
   757  		ch0 := s.r.readByte()
   758  		if ch0 == ending {
   759  			if s.r.peek() != ending {
   760  				lit = s.buf.String()
   761  				return
   762  			}
   763  			s.r.inc()
   764  			s.buf.WriteByte(ch0)
   765  		} else if ch0 == '\\' && !s.sqlMode.HasNoBackslashEscapesMode() {
   766  			if s.r.eof() {
   767  				break
   768  			}
   769  			s.handleEscape(s.r.peek(), &s.buf)
   770  			s.r.inc()
   771  		} else {
   772  			s.buf.WriteByte(ch0)
   773  		}
   774  	}
   775  
   776  	tok = invalid
   777  	return
   778  }
   779  
   780  // handleEscape handles the case in scanString when previous char is '\'.
   781  func (*Scanner) handleEscape(b byte, buf *bytes.Buffer) {
   782  	var ch0 byte
   783  	/*
   784  		\" \' \\ \n \0 \b \Z \r \t ==> escape to one char
   785  		\% \_ ==> preserve both char
   786  		other ==> remove \
   787  	*/
   788  	switch b {
   789  	case 'n':
   790  		ch0 = '\n'
   791  	case '0':
   792  		ch0 = 0
   793  	case 'b':
   794  		ch0 = 8
   795  	case 'Z':
   796  		ch0 = 26
   797  	case 'r':
   798  		ch0 = '\r'
   799  	case 't':
   800  		ch0 = '\t'
   801  	case '%', '_':
   802  		buf.WriteByte('\\')
   803  		ch0 = b
   804  	default:
   805  		ch0 = b
   806  	}
   807  	buf.WriteByte(ch0)
   808  }
   809  
   810  func startWithNumber(s *Scanner) (tok int, pos Pos, lit string) {
   811  	if s.identifierDot {
   812  		return scanIdentifier(s)
   813  	}
   814  	pos = s.r.pos()
   815  	tok = intLit
   816  	ch0 := s.r.readByte()
   817  	if ch0 == '0' {
   818  		tok = intLit
   819  		ch1 := s.r.peek()
   820  		switch {
   821  		case ch1 >= '0' && ch1 <= '7':
   822  			s.r.inc()
   823  			s.scanOct()
   824  		case ch1 == 'x' || ch1 == 'X':
   825  			s.r.inc()
   826  			p1 := s.r.pos()
   827  			s.scanHex()
   828  			p2 := s.r.pos()
   829  			// 0x, 0x7fz3 are identifier
   830  			if p1 == p2 || isDigit(s.r.peek()) {
   831  				s.r.incAsLongAs(isIdentChar)
   832  				return identifier, pos, s.r.data(&pos)
   833  			}
   834  			tok = hexLit
   835  		case ch1 == 'b':
   836  			s.r.inc()
   837  			p1 := s.r.pos()
   838  			s.scanBit()
   839  			p2 := s.r.pos()
   840  			// 0b, 0b123, 0b1ab are identifier
   841  			if p1 == p2 || isDigit(s.r.peek()) {
   842  				s.r.incAsLongAs(isIdentChar)
   843  				return identifier, pos, s.r.data(&pos)
   844  			}
   845  			tok = bitLit
   846  		case ch1 == '.':
   847  			return s.scanFloat(&pos)
   848  		case ch1 == 'B':
   849  			s.r.incAsLongAs(isIdentChar)
   850  			return identifier, pos, s.r.data(&pos)
   851  		}
   852  	}
   853  
   854  	s.scanDigits()
   855  	ch0 = s.r.peek()
   856  	if ch0 == '.' || ch0 == 'e' || ch0 == 'E' {
   857  		return s.scanFloat(&pos)
   858  	}
   859  
   860  	// Identifiers may begin with a digit but unless quoted may not consist solely of digits.
   861  	if !s.r.eof() && isIdentChar(ch0) {
   862  		s.r.incAsLongAs(isIdentChar)
   863  		return identifier, pos, s.r.data(&pos)
   864  	}
   865  	lit = s.r.data(&pos)
   866  	return
   867  }
   868  
   869  func startWithDot(s *Scanner) (tok int, pos Pos, lit string) {
   870  	pos = s.r.pos()
   871  	s.r.inc()
   872  	if s.identifierDot {
   873  		return int('.'), pos, "."
   874  	}
   875  	if isDigit(s.r.peek()) {
   876  		tok, p, l := s.scanFloat(&pos)
   877  		if tok == identifier {
   878  			return invalid, p, l
   879  		}
   880  		return tok, p, l
   881  	}
   882  	tok, lit = int('.'), "."
   883  	return
   884  }
   885  
   886  func (s *Scanner) scanOct() {
   887  	s.r.incAsLongAs(func(ch byte) bool {
   888  		return ch >= '0' && ch <= '7'
   889  	})
   890  }
   891  
   892  func (s *Scanner) scanHex() {
   893  	s.r.incAsLongAs(func(ch byte) bool {
   894  		return ch >= '0' && ch <= '9' ||
   895  			ch >= 'a' && ch <= 'f' ||
   896  			ch >= 'A' && ch <= 'F'
   897  	})
   898  }
   899  
   900  func (s *Scanner) scanBit() {
   901  	s.r.incAsLongAs(func(ch byte) bool {
   902  		return ch == '0' || ch == '1'
   903  	})
   904  }
   905  
   906  func (s *Scanner) scanFloat(beg *Pos) (tok int, pos Pos, lit string) {
   907  	s.r.updatePos(*beg)
   908  	// float = D1 . D2 e D3
   909  	s.scanDigits()
   910  	ch0 := s.r.peek()
   911  	if ch0 == '.' {
   912  		s.r.inc()
   913  		s.scanDigits()
   914  		ch0 = s.r.peek()
   915  	}
   916  	if ch0 == 'e' || ch0 == 'E' {
   917  		s.r.inc()
   918  		ch0 = s.r.peek()
   919  		if ch0 == '-' || ch0 == '+' {
   920  			s.r.inc()
   921  		}
   922  		if isDigit(s.r.peek()) {
   923  			s.scanDigits()
   924  			tok = floatLit
   925  		} else {
   926  			// D1 . D2 e XX when XX is not D3, parse the result to an identifier.
   927  			// 9e9e = 9e9(float) + e(identifier)
   928  			// 9est = 9est(identifier)
   929  			s.r.updatePos(*beg)
   930  			s.r.incAsLongAs(isIdentChar)
   931  			tok = identifier
   932  		}
   933  	} else {
   934  		tok = decLit
   935  	}
   936  	pos, lit = *beg, s.r.data(beg)
   937  	return
   938  }
   939  
   940  func (s *Scanner) scanDigits() string {
   941  	pos := s.r.pos()
   942  	s.r.incAsLongAs(isDigit)
   943  	return s.r.data(&pos)
   944  }
   945  
   946  // scanVersionDigits scans for `min` to `max` digits (range inclusive) used in
   947  // `/*!12345 ... */` comments.
   948  func (s *Scanner) scanVersionDigits(min, max int) {
   949  	pos := s.r.pos()
   950  	for i := 0; i < max; i++ {
   951  		ch := s.r.peek()
   952  		if isDigit(ch) {
   953  			s.r.inc()
   954  		} else if i < min {
   955  			s.r.updatePos(pos)
   956  			return
   957  		} else {
   958  			break
   959  		}
   960  	}
   961  }
   962  
   963  func (s *Scanner) scanFeatureIDs() (featureIDs []string) {
   964  	pos := s.r.pos()
   965  	const init, expectChar, obtainChar = 0, 1, 2
   966  	state := init
   967  	var b strings.Builder
   968  	for !s.r.eof() {
   969  		ch := s.r.peek()
   970  		s.r.inc()
   971  		switch state {
   972  		case init:
   973  			if ch == '[' {
   974  				state = expectChar
   975  				break
   976  			}
   977  			s.r.updatePos(pos)
   978  			return nil
   979  		case expectChar:
   980  			if isIdentChar(ch) {
   981  				b.WriteByte(ch)
   982  				state = obtainChar
   983  				break
   984  			}
   985  			s.r.updatePos(pos)
   986  			return nil
   987  		case obtainChar:
   988  			if isIdentChar(ch) {
   989  				b.WriteByte(ch)
   990  				state = obtainChar
   991  				break
   992  			} else if ch == ',' {
   993  				featureIDs = append(featureIDs, b.String())
   994  				b.Reset()
   995  				state = expectChar
   996  				break
   997  			} else if ch == ']' {
   998  				featureIDs = append(featureIDs, b.String())
   999  				return featureIDs
  1000  			}
  1001  			s.r.updatePos(pos)
  1002  			return nil
  1003  		}
  1004  	}
  1005  	s.r.updatePos(pos)
  1006  	return nil
  1007  }
  1008  
  1009  func (s *Scanner) lastErrorAsWarn() {
  1010  	if len(s.errs) == 0 {
  1011  		return
  1012  	}
  1013  	s.warns = append(s.warns, s.errs[len(s.errs)-1])
  1014  	s.errs = s.errs[:len(s.errs)-1]
  1015  }
  1016  
  // reader is a byte cursor over the text being scanned.
  1017  type reader struct {
  1018  	s string // text being scanned
  1019  	p Pos // current position in s
  1020  	l int // length bound checked by eof(); reset() sets it to len(s)
  1021  }
  1022  
  1023  var eof = Pos{-1, -1, -1}
  1024  
  1025  func (r *reader) eof() bool {
  1026  	return r.p.Offset >= r.l
  1027  }
  1028  
  1029  // peek() peeks a rune from underlying reader.
  1030  // if reader meets EOF, it will return 0. to distinguish from
  1031  // the real 0, the caller should call r.eof() again to check.
  1032  func (r *reader) peek() byte {
  1033  	if r.eof() {
  1034  		return 0
  1035  	}
  1036  	return r.s[r.p.Offset]
  1037  }
  1038  
  1039  // inc increase the position offset of the reader.
  1040  // peek must be called before calling inc!
  1041  func (r *reader) inc() {
  1042  	if r.s[r.p.Offset] == '\n' {
  1043  		r.p.Line++
  1044  		r.p.Col = 0
  1045  	}
  1046  	r.p.Offset++
  1047  	r.p.Col++
  1048  }
  1049  
  1050  func (r *reader) incN(n int) {
  1051  	for i := 0; i < n; i++ {
  1052  		r.inc()
  1053  	}
  1054  }
  1055  
  1056  func (r *reader) readByte() (ch byte) {
  1057  	ch = r.peek()
  1058  	if r.eof() {
  1059  		return
  1060  	}
  1061  	r.inc()
  1062  	return
  1063  }
  1064  
  // pos returns a copy of the reader's current position.
  1065  func (r *reader) pos() Pos {
  1066  	return r.p
  1067  }
  1068  
  // updatePos sets the reader's current position to pos; callers use it to rewind.
  1069  func (r *reader) updatePos(pos Pos) {
  1070  	r.p = pos
  1071  }
  1072  
  1073  func (r *reader) data(from *Pos) string {
  1074  	return r.s[from.Offset:r.p.Offset]
  1075  }
  1076  
  1077  func (r *reader) incAsLongAs(fn func(b byte) bool) byte {
  1078  	for {
  1079  		ch := r.peek()
  1080  		if !fn(ch) {
  1081  			return ch
  1082  		}
  1083  		if r.eof() {
  1084  			return 0
  1085  		}
  1086  		r.inc()
  1087  	}
  1088  }
  1089  
  1090  // skipRune skip mb character, return true indicate something has been skipped.
  1091  func (r *reader) skipRune(enc charset.Encoding) bool {
  1092  	if r.s[r.p.Offset] <= unicode.MaxASCII {
  1093  		return false
  1094  	}
  1095  	c := enc.MbLen(r.s[r.p.Offset:])
  1096  	r.incN(c)
  1097  	return c > 0
  1098  }