github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/csv_parser.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package mydump 15 16 import ( 17 "bytes" 18 "io" 19 "strings" 20 21 "github.com/pingcap/errors" 22 "github.com/pingcap/tidb/types" 23 24 "github.com/pingcap/br/pkg/lightning/config" 25 "github.com/pingcap/br/pkg/lightning/worker" 26 "github.com/pingcap/br/pkg/utils" 27 ) 28 29 var ( 30 errUnterminatedQuotedField = errors.NewNoStackError("syntax error: unterminated quoted field") 31 errDanglingBackslash = errors.NewNoStackError("syntax error: no character after backslash") 32 errUnexpectedQuoteField = errors.NewNoStackError("syntax error: cannot have consecutive fields without separator") 33 ) 34 35 // CSVParser is basically a copy of encoding/csv, but special-cased for MySQL-like input. 36 type CSVParser struct { 37 blockParser 38 cfg *config.CSVConfig 39 40 comma []byte 41 quote []byte 42 newLine []byte 43 44 // These variables are used with IndexAnyByte to search a byte slice for the 45 // first index which some special character may appear. 46 // quoteByteSet is used inside quoted fields (so the first characters of 47 // the closing delimiter and backslash are special). 48 // unquoteByteSet is used outside quoted fields (so the first characters 49 // of the opening delimiter, separator, terminator and backslash are 50 // special). 
51 // newLineByteSet is used in strict-format CSV dividing (so the first 52 // characters of the terminator are special). 53 quoteByteSet byteSet 54 unquoteByteSet byteSet 55 newLineByteSet byteSet 56 57 // recordBuffer holds the unescaped fields, one after another. 58 // The fields can be accessed by using the indexes in fieldIndexes. 59 // E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de` 60 // and fieldIndexes will contain the indexes [1, 2, 5, 6]. 61 recordBuffer []byte 62 63 // fieldIndexes is an index of fields inside recordBuffer. 64 // The i'th field ends at offset fieldIndexes[i] in recordBuffer. 65 fieldIndexes []int 66 67 lastRecord []string 68 69 escFlavor backslashEscapeFlavor 70 // if set to true, csv parser will treat the first non-empty line as header line 71 shouldParseHeader bool 72 } 73 74 func NewCSVParser( 75 cfg *config.CSVConfig, 76 reader ReadSeekCloser, 77 blockBufSize int64, 78 ioWorkers *worker.Pool, 79 shouldParseHeader bool, 80 ) *CSVParser { 81 escFlavor := backslashEscapeFlavorNone 82 var quoteStopSet, newLineStopSet []byte 83 unquoteStopSet := []byte{cfg.Separator[0]} 84 if len(cfg.Delimiter) > 0 { 85 quoteStopSet = []byte{cfg.Delimiter[0]} 86 unquoteStopSet = append(unquoteStopSet, cfg.Delimiter[0]) 87 } 88 if len(cfg.Terminator) > 0 { 89 newLineStopSet = []byte{cfg.Terminator[0]} 90 } else { 91 newLineStopSet = []byte{'\r', '\n'} 92 } 93 unquoteStopSet = append(unquoteStopSet, newLineStopSet...) 94 if cfg.BackslashEscape { 95 escFlavor = backslashEscapeFlavorMySQL 96 quoteStopSet = append(quoteStopSet, '\\') 97 unquoteStopSet = append(unquoteStopSet, '\\') 98 // we need special treatment of the NULL value \N, used by MySQL. 
99 if !cfg.NotNull && cfg.Null == `\N` { 100 escFlavor = backslashEscapeFlavorMySQLWithNull 101 } 102 } 103 104 return &CSVParser{ 105 blockParser: makeBlockParser(reader, blockBufSize, ioWorkers), 106 cfg: cfg, 107 comma: []byte(cfg.Separator), 108 quote: []byte(cfg.Delimiter), 109 newLine: []byte(cfg.Terminator), 110 escFlavor: escFlavor, 111 quoteByteSet: makeByteSet(quoteStopSet), 112 unquoteByteSet: makeByteSet(unquoteStopSet), 113 newLineByteSet: makeByteSet(newLineStopSet), 114 shouldParseHeader: shouldParseHeader, 115 } 116 } 117 118 func (parser *CSVParser) unescapeString(input string) (unescaped string, isNull bool) { 119 if parser.escFlavor == backslashEscapeFlavorMySQLWithNull && input == `\N` { 120 return input, true 121 } 122 unescaped = unescape(input, "", parser.escFlavor) 123 isNull = parser.escFlavor != backslashEscapeFlavorMySQLWithNull && 124 !parser.cfg.NotNull && 125 unescaped == parser.cfg.Null 126 return 127 } 128 129 // csvToken is a type representing either a normal byte or some CSV-specific 130 // tokens such as the separator (comma), delimiter (quote) and terminator (new 131 // line). 132 type csvToken int16 133 134 const ( 135 // csvTokenAnyUnquoted is a placeholder to represent any unquoted character. 136 csvTokenAnyUnquoted csvToken = 0 137 // csvTokenWithBackslash is a mask indicating an escaped character. 138 // The actual token is represented like `csvTokenWithBackslash | 'n'`. 139 csvTokenWithBackslash csvToken = 0x100 140 // csvTokenComma is the CSV separator token. 141 csvTokenComma csvToken = 0x200 142 // csvTokenNewLine is the CSV terminator token. 143 csvTokenNewLine csvToken = 0x400 144 // csvTokenDelimiter is the CSV delimiter token. 
145 csvTokenDelimiter csvToken = 0x800 146 ) 147 148 func (parser *CSVParser) readByte() (byte, error) { 149 if len(parser.buf) == 0 { 150 if err := parser.readBlock(); err != nil { 151 return 0, err 152 } 153 } 154 if len(parser.buf) == 0 { 155 return 0, io.EOF 156 } 157 b := parser.buf[0] 158 parser.buf = parser.buf[1:] 159 parser.pos++ 160 return b, nil 161 } 162 163 func (parser *CSVParser) peekBytes(cnt int) ([]byte, error) { 164 if len(parser.buf) < cnt { 165 if err := parser.readBlock(); err != nil { 166 return nil, err 167 } 168 } 169 if len(parser.buf) == 0 { 170 return nil, io.EOF 171 } 172 cnt = utils.MinInt(cnt, len(parser.buf)) 173 return parser.buf[:cnt], nil 174 } 175 176 func (parser *CSVParser) skipBytes(n int) { 177 parser.buf = parser.buf[n:] 178 parser.pos += int64(n) 179 } 180 181 // tryReadExact peeks the bytes ahead, and if it matches `content` exactly will 182 // consume it (advance the cursor) and return `true`. 183 func (parser *CSVParser) tryReadExact(content []byte) (bool, error) { 184 if len(content) == 0 { 185 return true, nil 186 } 187 bs, err := parser.peekBytes(len(content)) 188 if err == nil { 189 if bytes.Equal(bs, content) { 190 parser.skipBytes(len(content)) 191 return true, nil 192 } 193 } else if errors.Cause(err) == io.EOF { 194 err = nil 195 } 196 return false, err 197 } 198 199 func (parser *CSVParser) tryReadNewLine(b byte) (bool, error) { 200 if len(parser.newLine) == 0 { 201 return b == '\r' || b == '\n', nil 202 } 203 if b != parser.newLine[0] { 204 return false, nil 205 } 206 return parser.tryReadExact(parser.newLine[1:]) 207 } 208 209 func (parser *CSVParser) tryReadOpenDelimiter(b byte) (bool, error) { 210 if len(parser.quote) == 0 || parser.quote[0] != b { 211 return false, nil 212 } 213 return parser.tryReadExact(parser.quote[1:]) 214 } 215 216 // tryReadCloseDelimiter is currently equivalent to tryReadOpenDelimiter until 217 // we support asymmetric delimiters. 
218 func (parser *CSVParser) tryReadCloseDelimiter(b byte) (bool, error) { 219 if parser.quote[0] != b { 220 return false, nil 221 } 222 return parser.tryReadExact(parser.quote[1:]) 223 } 224 225 func (parser *CSVParser) tryReadComma(b byte) (bool, error) { 226 if parser.comma[0] != b { 227 return false, nil 228 } 229 return parser.tryReadExact(parser.comma[1:]) 230 } 231 232 func (parser *CSVParser) tryReadBackslashed(bs byte) (bool, byte, error) { 233 if bs != '\\' || parser.escFlavor == backslashEscapeFlavorNone { 234 return false, 0, nil 235 } 236 b, err := parser.readByte() 237 return true, b, parser.replaceEOF(err, errDanglingBackslash) 238 } 239 240 // readQuoteToken reads a token inside quoted fields. 241 func (parser *CSVParser) readQuotedToken(b byte) (csvToken, error) { 242 if ok, err := parser.tryReadCloseDelimiter(b); ok || err != nil { 243 return csvTokenDelimiter, err 244 } 245 if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil { 246 return csvTokenWithBackslash | csvToken(eb), err 247 } 248 return csvToken(b), nil 249 } 250 251 // readUnquoteToken reads a token outside quoted fields. 
func (parser *CSVParser) readUnquoteToken(b byte) (csvToken, error) {
	// Try the multi-byte tokens in priority order — terminator, separator,
	// opening delimiter, backslash escape — and fall through to a plain byte.
	if ok, err := parser.tryReadNewLine(b); ok || err != nil {
		return csvTokenNewLine, err
	}
	if ok, err := parser.tryReadComma(b); ok || err != nil {
		return csvTokenComma, err
	}
	if ok, err := parser.tryReadOpenDelimiter(b); ok || err != nil {
		return csvTokenDelimiter, err
	}
	if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil {
		return csvTokenWithBackslash | csvToken(eb), err
	}
	return csvToken(b), nil
}

// appendCSVTokenToRecordBuffer appends the byte carried by `token` to the
// record buffer, re-inserting the backslash for escaped tokens so that
// unescaping happens later, in unescapeString.
func (parser *CSVParser) appendCSVTokenToRecordBuffer(token csvToken) {
	if token&csvTokenWithBackslash != 0 {
		parser.recordBuffer = append(parser.recordBuffer, '\\')
	}
	parser.recordBuffer = append(parser.recordBuffer, byte(token))
}

// readUntil reads the buffer until any character from the `chars` set is found.
// that character is excluded from the final buffer.
// It returns the content before the stop byte, the stop byte itself (still
// unconsumed), and any error; on EOF the stop byte is 0.
func (parser *CSVParser) readUntil(chars *byteSet) ([]byte, byte, error) {
	index := IndexAnyByte(parser.buf, chars)
	if index >= 0 {
		// Fast path: the stop byte is inside the current block, so the
		// returned slice aliases parser.buf with no copying.
		ret := parser.buf[:index]
		parser.buf = parser.buf[index:]
		parser.pos += int64(index)
		return ret, parser.buf[0], nil
	}

	// not found in parser.buf, need allocate and loop.
	var buf []byte
	for {
		buf = append(buf, parser.buf...)
		parser.buf = nil
		if err := parser.readBlock(); err != nil || len(parser.buf) == 0 {
			if err == nil {
				err = io.EOF
			}
			// parser.pos was not advanced while accumulating, so add the
			// whole accumulated length in one go.
			parser.pos += int64(len(buf))
			return buf, 0, errors.Trace(err)
		}
		index := IndexAnyByte(parser.buf, chars)
		if index >= 0 {
			buf = append(buf, parser.buf[:index]...)
			parser.buf = parser.buf[index:]
			parser.pos += int64(len(buf))
			return buf, parser.buf[0], nil
		}
	}
}

// readRecord parses one CSV record (row) into `dst`, reusing its backing
// array when large enough. Empty lines and whitespace-only unquoted lines are
// skipped; a trailing EOF acts as a final terminator.
func (parser *CSVParser) readRecord(dst []string) ([]string, error) {
	parser.recordBuffer = parser.recordBuffer[:0]
	parser.fieldIndexes = parser.fieldIndexes[:0]

	// isEmptyLine: nothing at all seen on the current line yet.
	// whitespaceLine: no separator or quoted field seen yet (candidate for
	// the whitespace-only-line skip below).
	// prevToken tracks the previous token to detect adjacent quoted fields
	// with no separator between them (a syntax error).
	isEmptyLine := true
	whitespaceLine := true
	prevToken := csvTokenNewLine
	var firstToken csvToken

outside:
	for {
		content, firstByte, err := parser.readUntil(&parser.unquoteByteSet)

		if len(content) > 0 {
			isEmptyLine = false
			// Unquoted text immediately after a closing quote, e.g. `"a"b`,
			// is not allowed.
			if prevToken == csvTokenDelimiter {
				parser.logSyntaxError()
				return nil, errors.AddStack(errUnexpectedQuoteField)
			}
			parser.recordBuffer = append(parser.recordBuffer, content...)
			prevToken = csvTokenAnyUnquoted
		}

		if err != nil {
			if isEmptyLine || errors.Cause(err) != io.EOF {
				return nil, err
			}
			// treat EOF as the same as trailing \n.
			firstToken = csvTokenNewLine
		} else {
			parser.skipBytes(1)
			firstToken, err = parser.readUnquoteToken(firstByte)
			if err != nil {
				return nil, err
			}
		}

		switch firstToken {
		case csvTokenComma:
			whitespaceLine = false
			// A separator closes the current field at the current buffer end.
			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
		case csvTokenDelimiter:
			// A quoted field may only start at the beginning of a field.
			if prevToken != csvTokenComma && prevToken != csvTokenNewLine {
				parser.logSyntaxError()
				return nil, errors.AddStack(errUnexpectedQuoteField)
			}
			if err = parser.readQuotedField(); err != nil {
				return nil, err
			}
			whitespaceLine = false
		case csvTokenNewLine:
			// new line = end of record (ignore empty lines)
			prevToken = firstToken
			if isEmptyLine {
				continue
			}
			// skip lines only contain whitespaces
			if err == nil && whitespaceLine && len(bytes.TrimSpace(parser.recordBuffer)) == 0 {
				parser.recordBuffer = parser.recordBuffer[:0]
				continue
			}
			// Close the final field of the record and stop scanning.
			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
			break outside
		default:
			if prevToken == csvTokenDelimiter {
				parser.logSyntaxError()
				return nil, errors.AddStack(errUnexpectedQuoteField)
			}
			parser.appendCSVTokenToRecordBuffer(firstToken)
		}
		// NOTE: the `continue` paths above deliberately skip these two
		// assignments, keeping isEmptyLine true across skipped lines.
		prevToken = firstToken
		isEmptyLine = false
	}
	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(parser.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(parser.fieldIndexes) {
		dst = make([]string, len(parser.fieldIndexes))
	}
	dst = dst[:len(parser.fieldIndexes)]
	var preIdx int
	for i, idx := range parser.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	return dst, nil
}

// readQuotedField consumes a quoted field (the opening delimiter is already
// consumed) into recordBuffer, handling doubled delimiters (`""`) and
// backslash escapes. EOF inside the field is a syntax error.
func (parser *CSVParser) readQuotedField() error {
	for {
		content, terminator, err := parser.readUntil(&parser.quoteByteSet)
		err = parser.replaceEOF(err, errUnterminatedQuotedField)
		if err != nil {
			return err
		}
		parser.recordBuffer = append(parser.recordBuffer, content...)
		parser.skipBytes(1)

		token, err := parser.readQuotedToken(terminator)
		if err != nil {
			return err
		}

		switch token {
		case csvTokenDelimiter:
			// encountered '"' -> continue if we're seeing '""'.
			doubledDelimiter, err := parser.tryReadExact(parser.quote)
			if err != nil {
				return err
			}
			if doubledDelimiter {
				// consume the double quotation mark and continue
				parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
			} else {
				// the field is completed, exit.
				return nil
			}
		default:
			parser.appendCSVTokenToRecordBuffer(token)
		}
	}
}

// replaceEOF substitutes `replaced` (with a stack trace and a logged syntax
// error) for an io.EOF in `err`; any other error passes through unchanged.
func (parser *CSVParser) replaceEOF(err error, replaced error) error {
	if err == nil || errors.Cause(err) != io.EOF {
		return err
	}
	if replaced != nil {
		parser.logSyntaxError()
		replaced = errors.AddStack(replaced)
	}
	return replaced
}

// ReadRow reads a row from the datafile.
func (parser *CSVParser) ReadRow() error {
	row := &parser.lastRow
	row.Length = 0
	row.RowID++

	// skip the header first
	if parser.shouldParseHeader {
		err := parser.ReadColumns()
		if err != nil {
			return errors.Trace(err)
		}
		parser.shouldParseHeader = false
	}

	records, err := parser.readRecord(parser.lastRecord)
	if err != nil {
		return errors.Trace(err)
	}
	parser.lastRecord = records
	// remove the last empty value
	if parser.cfg.TrimLastSep {
		i := len(records) - 1
		if i >= 0 && len(records[i]) == 0 {
			records = records[:i]
		}
	}

	// Reuse the pooled datum slice when it is large enough.
	row.Row = parser.acquireDatumSlice()
	if cap(row.Row) >= len(records) {
		row.Row = row.Row[:len(records)]
	} else {
		row.Row = make([]types.Datum, len(records))
	}
	for i, record := range records {
		row.Length += len(record)
		unescaped, isNull := parser.unescapeString(record)
		if isNull {
			row.Row[i].SetNull()
		} else {
			row.Row[i].SetString(unescaped, "utf8mb4_bin")
		}
	}

	return nil
}

// ReadColumns reads one record and stores its lower-cased, unescaped values
// as the column names of this data file.
func (parser *CSVParser) ReadColumns() error {
	columns, err := parser.readRecord(nil)
	if err != nil {
		return errors.Trace(err)
	}
	parser.columns = make([]string, 0, len(columns))
	for _, colName := range columns {
		colName, _ = parser.unescapeString(colName)
		parser.columns = append(parser.columns, strings.ToLower(colName))
	}
	return nil
}

var newLineASCIISet = makeByteSet([]byte{'\r', '\n'})

// indexOfNewLine returns the index of the first CR or LF in b, or -1.
func indexOfNewLine(b []byte) int {
	return IndexAnyByte(b, &newLineASCIISet)
}

// ReadUntilTerminator seeks the file until the terminator token is found, and
// returns the file offset beyond the terminator.
// This function is used in strict-format dividing a CSV file.
func (parser *CSVParser) ReadUntilTerminator() (int64, error) {
	for {
		_, firstByte, err := parser.readUntil(&parser.newLineByteSet)
		if err != nil {
			return 0, err
		}
		parser.skipBytes(1)
		// A matching first byte is only a candidate; tryReadNewLine confirms
		// (and consumes) the full multi-byte terminator.
		if ok, err := parser.tryReadNewLine(firstByte); ok || err != nil {
			return parser.pos, err
		}
	}
}