github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/parser.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/parser/mysql"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	"github.com/pingcap/br/pkg/lightning/config"
	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/lightning/metric"
	"github.com/pingcap/br/pkg/lightning/worker"
)

type blockParser struct {
	// states for the lexer
	reader      PooledReader
	buf         []byte
	blockBuf    []byte
	isLastChunk bool

	// The list of column names of the last INSERT statement.
	columns []string

	rowPool *sync.Pool
	lastRow Row
	// Current file offset.
	pos int64

	// cache
	remainBuf *bytes.Buffer
	appendBuf *bytes.Buffer

	// the Logger associated with this parser for reporting failures
	Logger log.Logger
}

func makeBlockParser(reader ReadSeekCloser, blockBufSize int64, ioWorkers *worker.Pool) blockParser {
	return blockParser{
		reader:    MakePooledReader(reader, ioWorkers),
		blockBuf:  make([]byte, blockBufSize*config.BufferSizeScale),
		remainBuf: &bytes.Buffer{},
		appendBuf: &bytes.Buffer{},
		Logger:    log.L(),
		rowPool: &sync.Pool{
			New: func() interface{} {
				return make([]types.Datum, 0, 16)
			},
		},
	}
}

// ChunkParser is a parser of the data files (the files containing only INSERT
// statements).
type ChunkParser struct {
	blockParser

	escFlavor backslashEscapeFlavor
}

// Chunk represents a portion of the data file.
type Chunk struct {
	Offset       int64
	EndOffset    int64
	PrevRowIDMax int64
	RowIDMax     int64
	Columns      []string
}

// Row is the content of a row.
type Row struct {
	RowID  int64
	Row    []types.Datum
	Length int
}

// MarshalLogArray implements the zapcore.ArrayMarshaler interface
func (row Row) MarshalLogArray(encoder zapcore.ArrayEncoder) error {
	for _, r := range row.Row {
		encoder.AppendString(r.String())
	}
	return nil
}

type backslashEscapeFlavor uint8

const (
	backslashEscapeFlavorNone backslashEscapeFlavor = iota
	backslashEscapeFlavorMySQL
	backslashEscapeFlavorMySQLWithNull
)

// Parser provides a row-by-row interface for reading data files.
type Parser interface {
	Pos() (pos int64, rowID int64)
	SetPos(pos int64, rowID int64) error
	Close() error
	ReadRow() error
	LastRow() Row
	RecycleRow(row Row)

	// Columns returns the _lower-case_ column names corresponding to values in
	// the LastRow.
	Columns() []string
	// SetColumns sets the restored column names for the parser.
	SetColumns([]string)

	SetLogger(log.Logger)
}
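// The sketch below is an illustration added for this writeup, not part of the
// original file: it shows the intended calling pattern of the Parser
// interface. ReadRow advances the parser, LastRow exposes the freshly parsed
// row, and RecycleRow returns the row's Datum slice to the pool once the
// caller is done with it. The processRow callback is hypothetical.
func iterateRows(parser Parser, processRow func(Row) error) error {
	for {
		err := parser.ReadRow()
		switch errors.Cause(err) {
		case nil:
			row := parser.LastRow()
			if perr := processRow(row); perr != nil {
				return errors.Trace(perr)
			}
			// Hand the backing slice back to the pool; `row` must not be
			// used after this point.
			parser.RecycleRow(row)
		case io.EOF:
			// End of file: every row has been consumed.
			return nil
		default:
			return errors.Trace(err)
		}
	}
}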
// NewChunkParser creates a new parser which can read chunks out of a file.
func NewChunkParser(
	sqlMode mysql.SQLMode,
	reader ReadSeekCloser,
	blockBufSize int64,
	ioWorkers *worker.Pool,
) *ChunkParser {
	escFlavor := backslashEscapeFlavorMySQL
	if sqlMode.HasNoBackslashEscapesMode() {
		escFlavor = backslashEscapeFlavorNone
	}

	return &ChunkParser{
		blockParser: makeBlockParser(reader, blockBufSize, ioWorkers),
		escFlavor:   escFlavor,
	}
}

// SetPos changes the reported position and row ID.
func (parser *blockParser) SetPos(pos int64, rowID int64) error {
	p, err := parser.reader.Seek(pos, io.SeekStart)
	if err != nil {
		return errors.Trace(err)
	}
	if p != pos {
		return errors.Errorf("set pos failed, required position: %d, got: %d", pos, p)
	}
	parser.pos = pos
	parser.lastRow.RowID = rowID
	return nil
}

// Pos returns the current file offset.
func (parser *blockParser) Pos() (int64, int64) {
	return parser.pos, parser.lastRow.RowID
}

func (parser *blockParser) Close() error {
	return parser.reader.Close()
}

func (parser *blockParser) Columns() []string {
	return parser.columns
}

func (parser *blockParser) SetColumns(columns []string) {
	parser.columns = columns
}

func (parser *blockParser) logSyntaxError() {
	content := parser.buf
	if len(content) > 256 {
		content = content[:256]
	}
	parser.Logger.Error("syntax error",
		zap.Int64("pos", parser.pos),
		zap.ByteString("content", content),
	)
}

func (parser *blockParser) SetLogger(logger log.Logger) {
	parser.Logger = logger
}

type token byte

const (
	tokNil token = iota
	tokRowBegin
	tokRowEnd
	tokValues
	tokNull
	tokTrue
	tokFalse
	tokHexString
	tokBinString
	tokInteger
	tokSingleQuoted
	tokDoubleQuoted
	tokBackQuoted
	tokUnquoted
)

var tokenDescriptions = [...]string{
	tokNil:          "<Nil>",
	tokRowBegin:     "RowBegin",
	tokRowEnd:       "RowEnd",
	tokValues:       "Values",
	tokNull:         "Null",
	tokTrue:         "True",
	tokFalse:        "False",
	tokHexString:    "HexString",
	tokBinString:    "BinString",
	tokInteger:      "Integer",
	tokSingleQuoted: "SingleQuoted",
	tokDoubleQuoted: "DoubleQuoted",
	tokBackQuoted:   "BackQuoted",
	tokUnquoted:     "Unquoted",
}
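// Illustrative sketch (not in the original file): constructing a ChunkParser
// and resuming from a previously recorded checkpoint. The reader, worker pool,
// block-buffer size, and checkpoint values are assumed to come from the
// caller; only NewChunkParser and SetPos are real APIs of this file.
func resumeParser(
	sqlMode mysql.SQLMode,
	reader ReadSeekCloser,
	blockBufSize int64,
	ioWorkers *worker.Pool,
	ckptOffset, ckptRowID int64,
) (*ChunkParser, error) {
	parser := NewChunkParser(sqlMode, reader, blockBufSize, ioWorkers)
	// SetPos seeks the underlying reader and restores the row-ID counter, so
	// rows parsed after a restart continue numbering from the checkpoint.
	if err := parser.SetPos(ckptOffset, ckptRowID); err != nil {
		return nil, errors.Trace(err)
	}
	return parser, nil
}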
// String implements the fmt.Stringer interface
//
// Mainly used for debugging a token.
func (tok token) String() string {
	t := int(tok)
	if t >= 0 && t < len(tokenDescriptions) {
		if description := tokenDescriptions[t]; description != "" {
			return description
		}
	}
	return fmt.Sprintf("<Unknown(%d)>", t)
}

func (parser *blockParser) readBlock() error {
	startTime := time.Now()

	n, err := parser.reader.ReadFull(parser.blockBuf)

	switch err {
	case io.ErrUnexpectedEOF, io.EOF:
		parser.isLastChunk = true
		fallthrough
	case nil:
		// `parser.buf` references `appendBuf.Bytes`, so use `remainBuf` to
		// hold the remaining data of `parser.buf` and prevent the two slices
		// from overlapping.
		parser.remainBuf.Reset()
		parser.remainBuf.Write(parser.buf)
		parser.appendBuf.Reset()
		parser.appendBuf.Write(parser.remainBuf.Bytes())
		parser.appendBuf.Write(parser.blockBuf[:n])
		parser.buf = parser.appendBuf.Bytes()
		metric.ChunkParserReadBlockSecondsHistogram.Observe(time.Since(startTime).Seconds())
		return nil
	default:
		return errors.Trace(err)
	}
}

var unescapeRegexp = regexp.MustCompile(`(?s)\\.`)

func unescape(
	input string,
	delim string,
	escFlavor backslashEscapeFlavor,
) string {
	if len(delim) > 0 {
		delim2 := delim + delim
		if strings.Contains(input, delim2) {
			input = strings.ReplaceAll(input, delim2, delim)
		}
	}
	if escFlavor != backslashEscapeFlavorNone && strings.IndexByte(input, '\\') != -1 {
		input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string {
			switch substr[1] {
			case '0':
				return "\x00"
			case 'b':
				return "\b"
			case 'n':
				return "\n"
			case 'r':
				return "\r"
			case 't':
				return "\t"
			case 'Z':
				return "\x1a"
			default:
				return substr[1:]
			}
		})
	}
	return input
}

func (parser *ChunkParser) unescapeString(input string) string {
	if len(input) >= 2 {
		switch input[0] {
		case '\'', '"':
			return unescape(input[1:len(input)-1], input[:1], parser.escFlavor)
		case '`':
			return unescape(input[1:len(input)-1], "`", backslashEscapeFlavorNone)
		}
	}
	return input
}
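// Illustrative sketch (not in the original file): concrete inputs and outputs
// of unescape above. Doubled delimiters collapse to one, and under the MySQL
// flavor the usual backslash escapes are decoded; under the None flavor the
// backslash is left untouched.
func unescapeExamples() []string {
	return []string{
		unescape("it''s", "'", backslashEscapeFlavorMySQL), // "it's": the doubled quote collapses
		unescape(`a\nb`, "'", backslashEscapeFlavorMySQL),  // "a\nb" with a real newline byte
		unescape(`a\nb`, "'", backslashEscapeFlavorNone),   // unchanged: backslash kept verbatim
		unescape(`\Z`, "'", backslashEscapeFlavorMySQL),    // "\x1a" (ctrl-Z, the MySQL EOF escape)
	}
}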
// ReadRow reads a row from the data file.
func (parser *ChunkParser) ReadRow() error {
	// This parser will recognize contents like:
	//
	//		`tableName` (...) VALUES (...) (...) (...)
	//
	// Keywords like INSERT, INTO and separators like ',' and ';' are treated
	// like comments and ignored. Therefore, this parser will accept some
	// nonsense input. The advantage is that the parser becomes extremely
	// simple, which suits our use case: we just want to quickly and accurately
	// split the file apart, not to validate its content.

	type state byte

	const (
		// the state after "INSERT INTO" before the column names or "VALUES"
		stateTableName state = iota

		// the state while reading the column names
		stateColumns

		// the state after reading "VALUES"
		stateValues

		// the state while reading row values
		stateRow
	)

	// Dry-run sample of the state machine, first row:
	//
	//		Input         Token            State
	//		~~~~~         ~~~~~            ~~~~~
	//
	//		                               stateValues
	//		INSERT
	//		INTO
	//		`tableName`   tokBackQuoted
	//		                               stateTableName (reset columns)
	//		(             tokRowBegin
	//		                               stateColumns
	//		`a`           tokBackQuoted
	//		                               stateColumns (append column)
	//		,
	//		`b`           tokBackQuoted
	//		                               stateColumns (append column)
	//		)             tokRowEnd
	//		                               stateValues
	//		VALUES
	//		                               stateValues (no-op)
	//		(             tokRowBegin
	//		                               stateRow (reset row)
	//		1             tokInteger
	//		                               stateRow (append value)
	//		,
	//		2             tokInteger
	//		                               stateRow (append value)
	//		)             tokRowEnd
	//		                               return
	//
	// Second row:
	//
	//		Input         Token            State
	//		~~~~~         ~~~~~            ~~~~~
	//
	//		                               stateValues
	//		,
	//		(             tokRowBegin
	//		                               stateRow (reset row)
	//		3             tokInteger
	//		                               stateRow (append value)
	//		)             tokRowEnd
	//		                               return
	//
	// Third row:
	//
	//		Input         Token            State
	//		~~~~~         ~~~~~            ~~~~~
	//
	//		;
	//		INSERT
	//		INTO
	//		`database`    tokBackQuoted
	//		                               stateTableName (reset columns)
	//		.
	//		`tableName`   tokBackQuoted
	//		                               stateTableName (no-op)
	//		VALUES
	//		                               stateValues
	//		(             tokRowBegin
	//		                               stateRow (reset row)
	//		4             tokInteger
	//		                               stateRow (append value)
	//		)             tokRowEnd
	//		                               return

	row := &parser.lastRow
	st := stateValues
	row.Length = 0

	for {
		tok, content, err := parser.lex()
		if err != nil {
			if err == io.EOF && st != stateValues {
				return errors.Errorf("syntax error: premature EOF at offset %d", parser.pos)
			}
			return errors.Trace(err)
		}
		row.Length += len(content)
		switch st {
		case stateTableName:
			switch tok {
			case tokRowBegin:
				st = stateColumns
			case tokValues:
				st = stateValues
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "table name",
				)
			}
		case stateColumns:
			switch tok {
			case tokRowEnd:
				st = stateValues
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
				columnName := strings.ToLower(parser.unescapeString(string(content)))
				parser.columns = append(parser.columns, columnName)
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "column list",
				)
			}
		case stateValues:
			switch tok {
			case tokRowBegin:
				row.RowID++
				row.Row = parser.acquireDatumSlice()
				st = stateRow
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
				parser.columns = nil
				st = stateTableName
			case tokValues:
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "start of row",
				)
			}
		case stateRow:
			var value types.Datum
			switch tok {
			case tokRowEnd:
				return nil
			case tokNull:
				value.SetNull()
			case tokTrue:
				value.SetInt64(1)
			case tokFalse:
				value.SetInt64(0)
			case tokInteger:
				c := string(content)
				if strings.HasPrefix(c, "-") {
"-") { 485 i, err := strconv.ParseInt(c, 10, 64) 486 if err == nil { 487 value.SetInt64(i) 488 break 489 } 490 } else { 491 u, err := strconv.ParseUint(c, 10, 64) 492 if err == nil { 493 value.SetUint64(u) 494 break 495 } 496 } 497 // if the integer is too long, fallback to treating it as a 498 // string (all types that treats integer specially like BIT 499 // can't handle integers more than 64 bits anyway) 500 fallthrough 501 case tokUnquoted, tokSingleQuoted, tokDoubleQuoted: 502 value.SetString(parser.unescapeString(string(content)), "utf8mb4_bin") 503 case tokHexString: 504 hexLit, err := types.ParseHexStr(string(content)) 505 if err != nil { 506 return errors.Trace(err) 507 } 508 value.SetBinaryLiteral(hexLit) 509 case tokBinString: 510 binLit, err := types.ParseBitStr(string(content)) 511 if err != nil { 512 return errors.Trace(err) 513 } 514 value.SetBinaryLiteral(binLit) 515 default: 516 return errors.Errorf( 517 "syntax error: unexpected %s (%s) at offset %d, expecting %s", 518 tok, content, parser.pos, "data literal", 519 ) 520 } 521 row.Row = append(row.Row, value) 522 } 523 } 524 } 525 526 // LastRow is the copy of the row parsed by the last call to ReadRow(). 527 func (parser *blockParser) LastRow() Row { 528 return parser.lastRow 529 } 530 531 // RecycleRow places the row object back into the allocation pool. 532 func (parser *blockParser) RecycleRow(row Row) { 533 // We need farther benchmarking to make sure whether send a pointer 534 // (instead of a slice) here can improve performance. 535 //nolint:staticcheck 536 parser.rowPool.Put(row.Row[:0]) 537 } 538 539 // acquireDatumSlice allocates an empty []types.Datum 540 func (parser *blockParser) acquireDatumSlice() []types.Datum { 541 return parser.rowPool.Get().([]types.Datum) 542 } 543 544 // ReadChunks parses the entire file and splits it into continuous chunks of 545 // size >= minSize. 546 func ReadChunks(parser Parser, minSize int64) ([]Chunk, error) { 547 var chunks []Chunk 548 549 pos, lastRowID := parser.Pos() 550 cur := Chunk{ 551 Offset: pos, 552 EndOffset: pos, 553 PrevRowIDMax: lastRowID, 554 RowIDMax: lastRowID, 555 } 556 557 for { 558 switch err := parser.ReadRow(); errors.Cause(err) { 559 case nil: 560 cur.EndOffset, cur.RowIDMax = parser.Pos() 561 if cur.EndOffset-cur.Offset >= minSize { 562 chunks = append(chunks, cur) 563 cur.Offset = cur.EndOffset 564 cur.PrevRowIDMax = cur.RowIDMax 565 } 566 567 case io.EOF: 568 if cur.Offset < cur.EndOffset { 569 chunks = append(chunks, cur) 570 } 571 return chunks, nil 572 573 default: 574 return nil, errors.Trace(err) 575 } 576 } 577 }