github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/parser.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/parser/mysql"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	"github.com/pingcap/tidb-lightning/lightning/config"
	"github.com/pingcap/tidb-lightning/lightning/log"
	"github.com/pingcap/tidb-lightning/lightning/metric"
	"github.com/pingcap/tidb-lightning/lightning/worker"
)

type blockParser struct {
	// states for the lexer
	reader      PooledReader
	buf         []byte
	blockBuf    []byte
	isLastChunk bool

	// The list of column names of the last INSERT statement.
	columns []string

	rowPool *sync.Pool
	lastRow Row
	// Current file offset.
	pos int64

	// cache
	remainBuf *bytes.Buffer
	appendBuf *bytes.Buffer

	// the Logger associated with this parser for reporting failures
	Logger log.Logger
}

func makeBlockParser(reader ReadSeekCloser, blockBufSize int64, ioWorkers *worker.Pool) blockParser {
	return blockParser{
		reader:    MakePooledReader(reader, ioWorkers),
		blockBuf:  make([]byte, blockBufSize*config.BufferSizeScale),
		remainBuf: &bytes.Buffer{},
		appendBuf: &bytes.Buffer{},
		Logger:    log.L(),
		rowPool: &sync.Pool{
			New: func() interface{} {
				return make([]types.Datum, 0, 16)
			},
		},
	}
}

// ChunkParser is a parser of the data files (files containing only INSERT
// statements).
type ChunkParser struct {
	blockParser

	escFlavor backslashEscapeFlavor
}

// Chunk represents a portion of the data file.
type Chunk struct {
	Offset       int64
	EndOffset    int64
	PrevRowIDMax int64
	RowIDMax     int64
	Columns      []string
}

// Row is the content of a row.
type Row struct {
	RowID int64
	Row   []types.Datum
}

// MarshalLogArray implements the zapcore.ArrayMarshaler interface.
func (row Row) MarshalLogArray(encoder zapcore.ArrayEncoder) error {
	for _, r := range row.Row {
		encoder.AppendString(r.String())
	}
	return nil
}

type backslashEscapeFlavor uint8

const (
	backslashEscapeFlavorNone backslashEscapeFlavor = iota
	backslashEscapeFlavorMySQL
	backslashEscapeFlavorMySQLWithNull
)

// Parser is the common interface for reading rows out of a data file.
type Parser interface {
	Pos() (pos int64, rowID int64)
	SetPos(pos int64, rowID int64) error
	Close() error
	ReadRow() error
	LastRow() Row
	RecycleRow(row Row)

	// Columns returns the _lower-case_ column names corresponding to values in
	// the LastRow.
	Columns() []string
	// SetColumns sets the restored column names for the parser.
	SetColumns([]string)

	SetLogger(log.Logger)
}
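
// A typical consumption loop over a Parser looks like the following sketch
// (illustrative only, not part of the original file; `p` may be any
// implementation such as *ChunkParser, and `process` is a hypothetical
// consumer):
//
//	for {
//		if err := p.ReadRow(); err != nil {
//			break // err is io.EOF once the file is exhausted
//		}
//		row := p.LastRow()
//		process(row.RowID, row.Row)
//		p.RecycleRow(row) // return the datum slice to the pool
//	}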

// NewChunkParser creates a new parser which can read chunks out of a file.
func NewChunkParser(
	sqlMode mysql.SQLMode,
	reader ReadSeekCloser,
	blockBufSize int64,
	ioWorkers *worker.Pool,
) *ChunkParser {
	escFlavor := backslashEscapeFlavorMySQL
	if sqlMode.HasNoBackslashEscapesMode() {
		escFlavor = backslashEscapeFlavorNone
	}

	return &ChunkParser{
		blockParser: makeBlockParser(reader, blockBufSize, ioWorkers),
		escFlavor:   escFlavor,
	}
}

// SetPos seeks the underlying reader and changes the reported position and
// row ID.
func (parser *blockParser) SetPos(pos int64, rowID int64) error {
	p, err := parser.reader.Seek(pos, io.SeekStart)
	if err != nil {
		return errors.Trace(err)
	}
	if p != pos {
		return errors.Errorf("set pos failed, required position: %d, got: %d", pos, p)
	}
	parser.pos = pos
	parser.lastRow.RowID = rowID
	return nil
}

// Pos returns the current file offset and the row ID of the last parsed row.
func (parser *blockParser) Pos() (int64, int64) {
	return parser.pos, parser.lastRow.RowID
}

func (parser *blockParser) Close() error {
	return parser.reader.Close()
}

func (parser *blockParser) Columns() []string {
	return parser.columns
}

func (parser *blockParser) SetColumns(columns []string) {
	parser.columns = columns
}

func (parser *blockParser) logSyntaxError() {
	content := parser.buf
	if len(content) > 256 {
		content = content[:256]
	}
	parser.Logger.Error("syntax error",
		zap.Int64("pos", parser.pos),
		zap.ByteString("content", content),
	)
}

func (parser *blockParser) SetLogger(logger log.Logger) {
	parser.Logger = logger
}
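
// Pos and SetPos are symmetric, which makes checkpoint-based resumption a
// matter of persisting and restoring the pair (an illustrative sketch, not
// part of the original file):
//
//	pos, rowID := parser.Pos() // save these in a checkpoint
//	// ... after a restart ...
//	if err := parser.SetPos(pos, rowID); err != nil {
//		return errors.Trace(err)
//	}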

type token byte

const (
	tokNil token = iota
	tokRowBegin
	tokRowEnd
	tokValues
	tokNull
	tokTrue
	tokFalse
	tokHexString
	tokBinString
	tokInteger
	tokSingleQuoted
	tokDoubleQuoted
	tokBackQuoted
	tokUnquoted
)

var tokenDescriptions = [...]string{
	tokNil:          "<Nil>",
	tokRowBegin:     "RowBegin",
	tokRowEnd:       "RowEnd",
	tokValues:       "Values",
	tokNull:         "Null",
	tokTrue:         "True",
	tokFalse:        "False",
	tokHexString:    "HexString",
	tokBinString:    "BinString",
	tokInteger:      "Integer",
	tokSingleQuoted: "SingleQuoted",
	tokDoubleQuoted: "DoubleQuoted",
	tokBackQuoted:   "BackQuoted",
	tokUnquoted:     "Unquoted",
}

// String implements the fmt.Stringer interface.
//
// Mainly used for debugging a token.
func (tok token) String() string {
	t := int(tok)
	if t >= 0 && t < len(tokenDescriptions) {
		if description := tokenDescriptions[t]; description != "" {
			return description
		}
	}
	return fmt.Sprintf("<Unknown(%d)>", t)
}

func (parser *blockParser) readBlock() error {
	startTime := time.Now()

	n, err := parser.reader.ReadFull(parser.blockBuf)

	switch err {
	case io.ErrUnexpectedEOF, io.EOF:
		parser.isLastChunk = true
		fallthrough
	case nil:
		// `parser.buf` references `appendBuf.Bytes()`, so we must copy the
		// remaining data of `parser.buf` through `remainBuf` first to prevent
		// the two slices from overlapping.
		parser.remainBuf.Reset()
		parser.remainBuf.Write(parser.buf)
		parser.appendBuf.Reset()
		parser.appendBuf.Write(parser.remainBuf.Bytes())
		parser.appendBuf.Write(parser.blockBuf[:n])
		parser.buf = parser.appendBuf.Bytes()
		metric.ChunkParserReadBlockSecondsHistogram.Observe(time.Since(startTime).Seconds())
		return nil
	default:
		return errors.Trace(err)
	}
}

var unescapeRegexp = regexp.MustCompile(`(?s)\\.`)

func unescape(
	input string,
	delim string,
	escFlavor backslashEscapeFlavor,
) string {
	if len(delim) > 0 {
		delim2 := delim + delim
		if strings.Contains(input, delim2) {
			input = strings.ReplaceAll(input, delim2, delim)
		}
	}
	if escFlavor != backslashEscapeFlavorNone && strings.IndexByte(input, '\\') != -1 {
		input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string {
			switch substr[1] {
			case '0':
				return "\x00"
			case 'b':
				return "\b"
			case 'n':
				return "\n"
			case 'r':
				return "\r"
			case 't':
				return "\t"
			case 'Z':
				return "\x1a"
			default:
				return substr[1:]
			}
		})
	}
	return input
}

func (parser *ChunkParser) unescapeString(input string) string {
	if len(input) >= 2 {
		switch input[0] {
		case '\'', '"':
			return unescape(input[1:len(input)-1], input[:1], parser.escFlavor)
		case '`':
			return unescape(input[1:len(input)-1], "`", backslashEscapeFlavorNone)
		}
	}
	return input
}
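
// For illustration (not in the original file), how doubled-delimiter
// collapsing and backslash unescaping compose for a single-quoted SQL
// literal under the MySQL escape flavor:
//
//	parser.unescapeString(`'it''s a \t tab'`)
//	// → "it's a \t tab": the doubled '' collapses to a single quote,
//	// and \t becomes a real tab character.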

// ReadRow reads a row from the datafile.
func (parser *ChunkParser) ReadRow() error {
	// This parser will recognize contents like:
	//
	//	`tableName` (...) VALUES (...) (...) (...)
	//
	// Keywords like INSERT, INTO and separators like ',' and ';' are treated
	// like comments and ignored. Therefore, this parser will accept some
	// nonsense input. The advantage is that the parser becomes extremely
	// simple, which suits us here: we just want to quickly and accurately
	// split the file apart, not to validate its content.

	type state byte

	const (
		// the state after "INSERT INTO" before the column names or "VALUES"
		stateTableName state = iota

		// the state while reading the column names
		stateColumns

		// the state after reading "VALUES"
		stateValues

		// the state while reading row values
		stateRow
	)

	// Dry-run sample of the state machine, first row:
	//
	//	Input          Token            State
	//	~~~~~          ~~~~~            ~~~~~
	//
	//	                                stateValues
	//	INSERT
	//	INTO
	//	`tableName`    tokBackQuoted
	//	                                stateTableName (reset columns)
	//	(              tokRowBegin
	//	                                stateColumns
	//	`a`            tokBackQuoted
	//	                                stateColumns (append column)
	//	,
	//	`b`            tokBackQuoted
	//	                                stateColumns (append column)
	//	)              tokRowEnd
	//	                                stateValues
	//	VALUES
	//	                                stateValues (no-op)
	//	(              tokRowBegin
	//	                                stateRow (reset row)
	//	1              tokInteger
	//	                                stateRow (append value)
	//	,
	//	2              tokInteger
	//	                                stateRow (append value)
	//	)              tokRowEnd
	//	                                return
	//
	// Second row:
	//
	//	Input          Token            State
	//	~~~~~          ~~~~~            ~~~~~
	//
	//	                                stateValues
	//	,
	//	(              tokRowBegin
	//	                                stateRow (reset row)
	//	3              tokInteger
	//	                                stateRow (append value)
	//	)              tokRowEnd
	//	                                return
	//
	// Third row:
	//
	//	Input          Token            State
	//	~~~~~          ~~~~~            ~~~~~
	//
	//	;
	//	INSERT
	//	INTO
	//	`database`     tokBackQuoted
	//	                                stateTableName (reset columns)
	//	.
	//	`tableName`    tokBackQuoted
	//	                                stateTableName (no-op)
	//	VALUES
	//	                                stateValues
	//	(              tokRowBegin
	//	                                stateRow (reset row)
	//	4              tokInteger
	//	                                stateRow (append value)
	//	)              tokRowEnd
	//	                                return

	row := &parser.lastRow
	st := stateValues

	for {
		tok, content, err := parser.lex()
		if err != nil {
			if err == io.EOF && st != stateValues {
				return errors.Errorf("syntax error: premature EOF at offset %d", parser.pos)
			}
			return errors.Trace(err)
		}
		switch st {
		case stateTableName:
			switch tok {
			case tokRowBegin:
				st = stateColumns
			case tokValues:
				st = stateValues
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "table name",
				)
			}
		case stateColumns:
			switch tok {
			case tokRowEnd:
				st = stateValues
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
				columnName := strings.ToLower(parser.unescapeString(string(content)))
				parser.columns = append(parser.columns, columnName)
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "column list",
				)
			}
		case stateValues:
			switch tok {
			case tokRowBegin:
				row.RowID++
				row.Row = parser.acquireDatumSlice()
				st = stateRow
			case tokUnquoted, tokDoubleQuoted, tokBackQuoted:
				parser.columns = nil
				st = stateTableName
			case tokValues:
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "start of row",
				)
			}
		case stateRow:
			var value types.Datum
			switch tok {
			case tokRowEnd:
				return nil
			case tokNull:
				value.SetNull()
			case tokTrue:
				value.SetInt64(1)
			case tokFalse:
				value.SetInt64(0)
			case tokInteger:
				c := string(content)
				if strings.HasPrefix(c, "-") {
					i, err := strconv.ParseInt(c, 10, 64)
					if err == nil {
						value.SetInt64(i)
						break
					}
				} else {
					u, err := strconv.ParseUint(c, 10, 64)
					if err == nil {
						value.SetUint64(u)
						break
					}
				}
				// If the integer is too long, fall back to treating it as a
				// string (all types that treat integers specially, like BIT,
				// can't handle integers wider than 64 bits anyway).
				fallthrough
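			// Illustrative note (not in the original file): an oversized
			// literal such as 18446744073709551616 (2^64) fails both
			// ParseInt and ParseUint above and falls through to here,
			// where its digits are kept verbatim as a string.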
			case tokUnquoted, tokSingleQuoted, tokDoubleQuoted:
				value.SetString(parser.unescapeString(string(content)), "utf8mb4_bin")
			case tokHexString:
				hexLit, err := types.ParseHexStr(string(content))
				if err != nil {
					return err
				}
				value.SetBinaryLiteral(hexLit)
			case tokBinString:
				binLit, err := types.ParseBitStr(string(content))
				if err != nil {
					return err
				}
				value.SetBinaryLiteral(binLit)
			default:
				return errors.Errorf(
					"syntax error: unexpected %s (%s) at offset %d, expecting %s",
					tok, content, parser.pos, "data literal",
				)
			}
			row.Row = append(row.Row, value)
		}
	}
}

// LastRow returns the row parsed by the last call to ReadRow().
func (parser *blockParser) LastRow() Row {
	return parser.lastRow
}

// RecycleRow places the row object back into the allocation pool.
func (parser *blockParser) RecycleRow(row Row) {
	parser.rowPool.Put(row.Row[:0])
}

// acquireDatumSlice obtains an empty []types.Datum from the row pool.
func (parser *blockParser) acquireDatumSlice() []types.Datum {
	return parser.rowPool.Get().([]types.Datum)
}

// ReadChunks parses the entire file and splits it into contiguous chunks of
// size >= minSize.
func ReadChunks(parser Parser, minSize int64) ([]Chunk, error) {
	var chunks []Chunk

	pos, lastRowID := parser.Pos()
	cur := Chunk{
		Offset:       pos,
		EndOffset:    pos,
		PrevRowIDMax: lastRowID,
		RowIDMax:     lastRowID,
	}

	for {
		switch err := parser.ReadRow(); errors.Cause(err) {
		case nil:
			cur.EndOffset, cur.RowIDMax = parser.Pos()
			if cur.EndOffset-cur.Offset >= minSize {
				chunks = append(chunks, cur)
				cur.Offset = cur.EndOffset
				cur.PrevRowIDMax = cur.RowIDMax
			}

		case io.EOF:
			if cur.Offset < cur.EndOffset {
				chunks = append(chunks, cur)
			}
			return chunks, nil

		default:
			return nil, errors.Trace(err)
		}
	}
}
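
// splitDataFile is a hypothetical usage sketch, not part of the original
// file: it rewinds a parser to the start of the file and cuts it into
// chunks of at least minSize bytes using ReadChunks above.
func splitDataFile(parser Parser, minSize int64) ([]Chunk, error) {
	// Start from offset 0 with row ID 0 so chunk boundaries cover the
	// whole file; a caller resuming from a checkpoint would skip this.
	if err := parser.SetPos(0, 0); err != nil {
		return nil, errors.Trace(err)
	}
	return ReadChunks(parser, minSize)
}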