github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/parser.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/parser/mysql"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	"github.com/pingcap/br/pkg/lightning/config"
	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/lightning/metric"
	"github.com/pingcap/br/pkg/lightning/worker"
)

type blockParser struct {
	// states for the lexer
	reader      PooledReader
	buf         []byte
	blockBuf    []byte
	isLastChunk bool

	// The list of column names of the last INSERT statement.
	columns []string

	rowPool *sync.Pool
	lastRow Row
	// Current file offset.
	pos int64

	// cache
	remainBuf *bytes.Buffer
	appendBuf *bytes.Buffer

	// the Logger associated with this parser for reporting failures
	Logger log.Logger
}

func makeBlockParser(reader ReadSeekCloser, blockBufSize int64, ioWorkers *worker.Pool) blockParser {
	return blockParser{
		reader:    MakePooledReader(reader, ioWorkers),
		blockBuf:  make([]byte, blockBufSize*config.BufferSizeScale),
		remainBuf: &bytes.Buffer{},
		appendBuf: &bytes.Buffer{},
		Logger:    log.L(),
		rowPool: &sync.Pool{
			New: func() interface{} {
				return make([]types.Datum, 0, 16)
			},
		},
	}
}

// ChunkParser is a parser of the data files (the files containing only INSERT
// statements).
type ChunkParser struct {
	blockParser

	escFlavor backslashEscapeFlavor
}

// Chunk represents a portion of the data file.
type Chunk struct {
	Offset       int64
	EndOffset    int64
	PrevRowIDMax int64
	RowIDMax     int64
	Columns      []string
}
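
// A Chunk covers the half-open byte range [Offset, EndOffset) of the file,
// and the rows inside it are assigned the row IDs in the half-open range
// (PrevRowIDMax, RowIDMax]; see ReadChunks at the bottom of this file.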

// Row is the content of a row.
type Row struct {
	RowID  int64
	Row    []types.Datum
	Length int
}

// MarshalLogArray implements the zapcore.ArrayMarshaler interface.
func (row Row) MarshalLogArray(encoder zapcore.ArrayEncoder) error {
	for _, r := range row.Row {
		encoder.AppendString(r.String())
	}
	return nil
}

type backslashEscapeFlavor uint8

const (
	backslashEscapeFlavorNone backslashEscapeFlavor = iota
	backslashEscapeFlavorMySQL
	backslashEscapeFlavorMySQLWithNull
)
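
// backslashEscapeFlavorNone leaves backslashes untouched (matching the
// NO_BACKSLASH_ESCAPES SQL mode), while backslashEscapeFlavorMySQL decodes
// sequences such as \n, \t and \0. backslashEscapeFlavorMySQLWithNull is not
// used by this file; elsewhere in this package it additionally marks the
// escape sequence \N as the SQL NULL value.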

// Parser provides the common interface for reading rows out of a data file.
type Parser interface {
	Pos() (pos int64, rowID int64)
	SetPos(pos int64, rowID int64) error
	Close() error
	ReadRow() error
	LastRow() Row
	RecycleRow(row Row)

	// Columns returns the _lower-case_ column names corresponding to values in
	// the LastRow.
	Columns() []string
	// SetColumns sets the column names for the parser.
	SetColumns([]string)

	SetLogger(log.Logger)
}
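
// A typical caller drives a Parser with a loop like the following (a minimal
// sketch; the enclosing function and error propagation are illustrative only):
//
//	for {
//		err := parser.ReadRow()
//		if errors.Cause(err) == io.EOF {
//			break
//		}
//		if err != nil {
//			return errors.Trace(err)
//		}
//		row := parser.LastRow()
//		// ... consume row.Row, pairing values with parser.Columns() ...
//		parser.RecycleRow(row)
//	}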

// NewChunkParser creates a new parser which can read chunks out of a file.
func NewChunkParser(
	sqlMode mysql.SQLMode,
	reader ReadSeekCloser,
	blockBufSize int64,
	ioWorkers *worker.Pool,
) *ChunkParser {
	escFlavor := backslashEscapeFlavorMySQL
	if sqlMode.HasNoBackslashEscapesMode() {
		escFlavor = backslashEscapeFlavorNone
	}

	return &ChunkParser{
		blockParser: makeBlockParser(reader, blockBufSize, ioWorkers),
		escFlavor:   escFlavor,
	}
}
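
// A hypothetical construction sketch (the file name, context and pool size
// are placeholders, not part of this package's API):
//
//	file, err := os.Open("db.tbl.0001.sql") // *os.File satisfies ReadSeekCloser
//	if err != nil {
//		return err
//	}
//	ioWorkers := worker.NewPool(ctx, 5, "io")
//	parser := NewChunkParser(mysql.ModeNone, file, int64(config.ReadBlockSize), ioWorkers)
//	defer parser.Close()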

// SetPos changes the reported position and row ID.
func (parser *blockParser) SetPos(pos int64, rowID int64) error {
	p, err := parser.reader.Seek(pos, io.SeekStart)
	if err != nil {
		return errors.Trace(err)
	}
	if p != pos {
		return errors.Errorf("set pos failed, required position: %d, got: %d", pos, p)
	}
	parser.pos = pos
	parser.lastRow.RowID = rowID
	return nil
}

// Pos returns the current file offset.
func (parser *blockParser) Pos() (int64, int64) {
	return parser.pos, parser.lastRow.RowID
}

func (parser *blockParser) Close() error {
	return parser.reader.Close()
}

func (parser *blockParser) Columns() []string {
	return parser.columns
}

func (parser *blockParser) SetColumns(columns []string) {
	parser.columns = columns
}

func (parser *blockParser) logSyntaxError() {
	content := parser.buf
	if len(content) > 256 {
		content = content[:256]
	}
	parser.Logger.Error("syntax error",
		zap.Int64("pos", parser.pos),
		zap.ByteString("content", content),
	)
}

func (parser *blockParser) SetLogger(logger log.Logger) {
	parser.Logger = logger
}

type token byte

const (
	tokNil token = iota
	tokRowBegin
	tokRowEnd
	tokValues
	tokNull
	tokTrue
	tokFalse
	tokHexString
	tokBinString
	tokInteger
	tokSingleQuoted
	tokDoubleQuoted
	tokBackQuoted
	tokUnquoted
)

var tokenDescriptions = [...]string{
	tokNil:          "<Nil>",
	tokRowBegin:     "RowBegin",
	tokRowEnd:       "RowEnd",
	tokValues:       "Values",
	tokNull:         "Null",
	tokTrue:         "True",
	tokFalse:        "False",
	tokHexString:    "HexString",
	tokBinString:    "BinString",
	tokInteger:      "Integer",
	tokSingleQuoted: "SingleQuoted",
	tokDoubleQuoted: "DoubleQuoted",
	tokBackQuoted:   "BackQuoted",
	tokUnquoted:     "Unquoted",
}

// String implements the fmt.Stringer interface.
//
// Mainly used for debugging a token.
func (tok token) String() string {
	t := int(tok)
	if t >= 0 && t < len(tokenDescriptions) {
		if description := tokenDescriptions[t]; description != "" {
			return description
		}
	}
	return fmt.Sprintf("<Unknown(%d)>", t)
}

func (parser *blockParser) readBlock() error {
	startTime := time.Now()

	n, err := parser.reader.ReadFull(parser.blockBuf)

	switch err {
	case io.ErrUnexpectedEOF, io.EOF:
		parser.isLastChunk = true
		fallthrough
	case nil:
		// `parser.buf` aliases the backing array of `appendBuf`, so copy the
		// unconsumed tail into `remainBuf` first; otherwise rebuilding
		// `appendBuf` would read from and write to overlapping slices.
		parser.remainBuf.Reset()
		parser.remainBuf.Write(parser.buf)
		parser.appendBuf.Reset()
		parser.appendBuf.Write(parser.remainBuf.Bytes())
		parser.appendBuf.Write(parser.blockBuf[:n])
		parser.buf = parser.appendBuf.Bytes()
		metric.ChunkParserReadBlockSecondsHistogram.Observe(time.Since(startTime).Seconds())
		return nil
	default:
		return errors.Trace(err)
	}
}

var unescapeRegexp = regexp.MustCompile(`(?s)\\.`)

func unescape(
	input string,
	delim string,
	escFlavor backslashEscapeFlavor,
) string {
	if len(delim) > 0 {
		delim2 := delim + delim
		if strings.Contains(input, delim2) {
			input = strings.ReplaceAll(input, delim2, delim)
		}
	}
	if escFlavor != backslashEscapeFlavorNone && strings.IndexByte(input, '\\') != -1 {
		input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string {
			switch substr[1] {
			case '0':
				return "\x00"
			case 'b':
				return "\b"
			case 'n':
				return "\n"
			case 'r':
				return "\r"
			case 't':
				return "\t"
			case 'Z':
				return "\x1a"
			default:
				return substr[1:]
			}
		})
	}
	return input
}

func (parser *ChunkParser) unescapeString(input string) string {
	if len(input) >= 2 {
		switch input[0] {
		case '\'', '"':
			return unescape(input[1:len(input)-1], input[:1], parser.escFlavor)
		case '`':
			return unescape(input[1:len(input)-1], "`", backslashEscapeFlavorNone)
		}
	}
	return input
}
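
// For example, with the MySQL escape flavor:
//
//	unescape(`it''s`, "'", backslashEscapeFlavorMySQL) // "it's" (doubled delimiter collapsed)
//	unescape(`a\nb`, "'", backslashEscapeFlavorMySQL)  // "a" + "\n" + "b" (decoded newline)
//	parser.unescapeString(`'don''t'`)                  // "don't" (quotes stripped first)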

// ReadRow reads a row from the datafile.
func (parser *ChunkParser) ReadRow() error {
	// This parser will recognize contents like:
	//
	// 		`tableName` (...) VALUES (...) (...) (...)
	//
	// Keywords like INSERT, INTO and separators like ',' and ';' are treated
	// as comments and ignored. Therefore, this parser will accept some
	// nonsense input. The advantage is that the parser becomes extremely
	// simple, suitable for our use case, where we just want to quickly and
	// accurately split the file apart, not to validate the content.

	type state byte

	const (
		// the state after "INSERT INTO" before the column names or "VALUES"
		stateTableName state = iota

		// the state while reading the column names
		stateColumns

		// the state after reading "VALUES"
		stateValues

		// the state while reading row values
		stateRow
	)

	// Dry-run sample of the state machine, first row:
	//
	//              Input         Token             State
	//              ~~~~~         ~~~~~             ~~~~~
	//
	//                                              stateValues
	//              INSERT
	//              INTO
	//              `tableName`   tokBackQuoted
	//                                              stateTableName (reset columns)
	//              (             tokRowBegin
	//                                              stateColumns
	//              `a`           tokBackQuoted
	//                                              stateColumns (append column)
	//              ,
	//              `b`           tokBackQuoted
	//                                              stateColumns (append column)
	//              )             tokRowEnd
	//                                              stateValues
	//              VALUES
	//                                              stateValues (no-op)
	//              (             tokRowBegin
	//                                              stateRow (reset row)
	//              1             tokInteger
	//                                              stateRow (append value)
	//              ,
	//              2             tokInteger
	//                                              stateRow (append value)
	//              )             tokRowEnd
	//                                              return
	//
	//
	// Second row:
	//
	//              Input         Token             State
	//              ~~~~~         ~~~~~             ~~~~~
	//
	//                                              stateValues
	//              ,
	//              (             tokRowBegin
	//                                              stateRow (reset row)
	//              3             tokInteger
	//                                              stateRow (append value)
	//              )             tokRowEnd
	//                                              return
	//
	// Third row:
	//
	//              Input         Token             State
	//              ~~~~~         ~~~~~             ~~~~~
	//
	//              ;
	//              INSERT
	//              INTO
	//              `database`    tokBackQuoted
	//                                              stateTableName (reset columns)
	//              .
	//              `tableName`   tokBackQuoted
	//                                              stateTableName (no-op)
	//              VALUES
	//                                              stateValues
	//              (             tokRowBegin
	//                                              stateRow (reset row)
	//              4             tokInteger
	//                                              stateRow (append value)
	//              )             tokRowEnd
	//                                              return

	row := &parser.lastRow
	st := stateValues
	row.Length = 0

	for {
		tok, content, err := parser.lex()
		if err != nil {
			if err == io.EOF && st != stateValues {
				return errors.Errorf("syntax error: premature EOF at offset %d", parser.pos)
			}
			return errors.Trace(err)
		}
		row.Length += len(content)
		switch st {
		case stateTableName:
			switch tok {
			case tokRowBegin:
				st = stateColumns
			case tokValues:
				st = stateValues
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "table name",
				)
			}
		case stateColumns:
			switch tok {
			case tokRowEnd:
				st = stateValues
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
				columnName := strings.ToLower(parser.unescapeString(string(content)))
				parser.columns = append(parser.columns, columnName)
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "column list",
				)
			}
		case stateValues:
			switch tok {
			case tokRowBegin:
				row.RowID++
				row.Row = parser.acquireDatumSlice()
				st = stateRow
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
				parser.columns = nil
				st = stateTableName
			case tokValues:
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "start of row",
				)
			}
		case stateRow:
			var value types.Datum
			switch tok {
			case tokRowEnd:
				return nil
			case tokNull:
				value.SetNull()
			case tokTrue:
				value.SetInt64(1)
			case tokFalse:
				value.SetInt64(0)
			case tokInteger:
				c := string(content)
				if strings.HasPrefix(c, "-") {
					i, err := strconv.ParseInt(c, 10, 64)
					if err == nil {
						value.SetInt64(i)
						break
					}
				} else {
					u, err := strconv.ParseUint(c, 10, 64)
					if err == nil {
						value.SetUint64(u)
						break
					}
				}
				// if the integer is too long, fall back to treating it as a
				// string (all types that treat integers specially, like BIT,
				// can't handle integers wider than 64 bits anyway)
				fallthrough
			case tokUnquoted, tokSingleQuoted, tokDoubleQuoted:
				value.SetString(parser.unescapeString(string(content)), "utf8mb4_bin")
			case tokHexString:
				hexLit, err := types.ParseHexStr(string(content))
				if err != nil {
					return errors.Trace(err)
				}
				value.SetBinaryLiteral(hexLit)
			case tokBinString:
				binLit, err := types.ParseBitStr(string(content))
				if err != nil {
					return errors.Trace(err)
				}
				value.SetBinaryLiteral(binLit)
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "data literal",
				)
			}
			row.Row = append(row.Row, value)
		}
	}
}
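
// For instance, given the input
//
//	INSERT INTO `t` (`a`,`b`) VALUES (1,'x'),(2,'y');
//
// the first ReadRow() call leaves Columns() == {"a", "b"} and LastRow().Row
// holding the datums 1 and "x", the second call yields 2 and "y", and the
// third call returns an error wrapping io.EOF.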

// LastRow is the copy of the row parsed by the last call to ReadRow().
func (parser *blockParser) LastRow() Row {
	return parser.lastRow
}

// RecycleRow places the row object back into the allocation pool.
func (parser *blockParser) RecycleRow(row Row) {
	// We need further benchmarking to verify whether sending a pointer
	// (instead of a slice) here can improve performance.
	//nolint:staticcheck
	parser.rowPool.Put(row.Row[:0])
}

// acquireDatumSlice allocates an empty []types.Datum.
func (parser *blockParser) acquireDatumSlice() []types.Datum {
	return parser.rowPool.Get().([]types.Datum)
}
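
// LastRow and RecycleRow are intended to be used as a pair, e.g.:
//
//	row := parser.LastRow()
//	// ... copy or encode the values out of row.Row ...
//	parser.RecycleRow(row) // row.Row must not be touched after this point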

// ReadChunks parses the entire file and splits it into contiguous chunks of
// size >= minSize.
func ReadChunks(parser Parser, minSize int64) ([]Chunk, error) {
	var chunks []Chunk

	pos, lastRowID := parser.Pos()
	cur := Chunk{
		Offset:       pos,
		EndOffset:    pos,
		PrevRowIDMax: lastRowID,
		RowIDMax:     lastRowID,
	}

	for {
		switch err := parser.ReadRow(); errors.Cause(err) {
		case nil:
			cur.EndOffset, cur.RowIDMax = parser.Pos()
			if cur.EndOffset-cur.Offset >= minSize {
				chunks = append(chunks, cur)
				cur.Offset = cur.EndOffset
				cur.PrevRowIDMax = cur.RowIDMax
			}

		case io.EOF:
			if cur.Offset < cur.EndOffset {
				chunks = append(chunks, cur)
			}
			return chunks, nil

		default:
			return nil, errors.Trace(err)
		}
	}
}
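
// A hypothetical invocation splitting a data file into chunks of at least
// 256 MiB (the parser construction is as sketched after NewChunkParser):
//
//	chunks, err := ReadChunks(parser, 256<<20)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	for _, c := range chunks {
//		// ... dispatch [c.Offset, c.EndOffset) to an encoding worker ...
//	}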