github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/csv_parser_test.go

package mydump_test

import (
	"context"
	"encoding/csv"
	"io"
	"os"
	"path/filepath"
	"strings"

	. "github.com/pingcap/check"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"

	"github.com/pingcap/tidb-lightning/lightning/config"
	"github.com/pingcap/tidb-lightning/lightning/log"
	"github.com/pingcap/tidb-lightning/lightning/mydump"
	"github.com/pingcap/tidb-lightning/lightning/worker"
)

var _ = Suite(&testMydumpCSVParserSuite{})

type testMydumpCSVParserSuite struct {
	ioWorkers *worker.Pool
}

func (s *testMydumpCSVParserSuite) SetUpSuite(c *C) {
	s.ioWorkers = worker.NewPool(context.Background(), 5, "test_csv")
}
func (s *testMydumpCSVParserSuite) TearDownSuite(c *C) {}

type assertPosEq struct {
	*CheckerInfo
}

var posEq = &assertPosEq{
	&CheckerInfo{Name: "posEq", Params: []string{"parser", "pos", "rowID"}},
}

func (checker *assertPosEq) Check(params []interface{}, names []string) (result bool, error string) {
	parser := params[0].(mydump.Parser)
	pos, rowID := parser.Pos()
	expectedPos := int64(params[1].(int))
	expectedRowID := int64(params[2].(int))
	return pos == expectedPos && rowID == expectedRowID, ""
}

var nullDatum types.Datum

type testCase struct {
	input    string
	expected [][]types.Datum
}

func (s *testMydumpCSVParserSuite) runTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []testCase) {
	for _, tc := range cases {
		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc.input), blockBufSize, s.ioWorkers, false)
		for i, row := range tc.expected {
			comment := Commentf("input = %q, row = %d", tc.input, i+1)
			e := parser.ReadRow()
			c.Assert(e, IsNil, Commentf("input = %q, row = %d, error = %s", tc.input, i+1, errors.ErrorStack(e)))
			c.Assert(parser.LastRow(), DeepEquals, mydump.Row{RowID: int64(i) + 1, Row: row}, comment)
		}
		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF, Commentf("input = %q", tc.input))
	}
}

func (s *testMydumpCSVParserSuite) runFailingTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []string) {
	for _, tc := range cases {
		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc), blockBufSize, s.ioWorkers, false)
		e := parser.ReadRow()
		c.Assert(e, ErrorMatches, "syntax error.*", Commentf("input = %q / %s", tc, errors.ErrorStack(e)))
	}
}

func tpchDatums() [][]types.Datum {
	datums := make([][]types.Datum, 0, 3)
	datums = append(datums, []types.Datum{
		types.NewStringDatum("1"),
		types.NewStringDatum("goldenrod lavender spring chocolate lace"),
		types.NewStringDatum("Manufacturer#1"),
		types.NewStringDatum("Brand#13"),
		types.NewStringDatum("PROMO BURNISHED COPPER"),
		types.NewStringDatum("7"),
		types.NewStringDatum("JUMBO PKG"),
		types.NewStringDatum("901.00"),
		types.NewStringDatum("ly. slyly ironi"),
	})
	datums = append(datums, []types.Datum{
		types.NewStringDatum("2"),
		types.NewStringDatum("blush thistle blue yellow saddle"),
		types.NewStringDatum("Manufacturer#1"),
		types.NewStringDatum("Brand#13"),
		types.NewStringDatum("LARGE BRUSHED BRASS"),
		types.NewStringDatum("1"),
		types.NewStringDatum("LG CASE"),
		types.NewStringDatum("902.00"),
		types.NewStringDatum("lar accounts amo"),
	})
	datums = append(datums, []types.Datum{
		types.NewStringDatum("3"),
		types.NewStringDatum("spring green yellow purple cornsilk"),
		types.NewStringDatum("Manufacturer#4"),
		types.NewStringDatum("Brand#42"),
		types.NewStringDatum("STANDARD POLISHED BRASS"),
		types.NewStringDatum("21"),
		types.NewStringDatum("WRAP CASE"),
		types.NewStringDatum("903.00"),
		types.NewStringDatum("egular deposits hag"),
	})

	return datums
}

func datumsToString(datums [][]types.Datum, delimiter string, quote string, lastSep bool) string {
	var b strings.Builder
	doubleQuote := quote + quote
	for _, ds := range datums {
		for i, d := range ds {
			text := d.GetString()
			if len(quote) > 0 {
				b.WriteString(quote)
				b.WriteString(strings.ReplaceAll(text, quote, doubleQuote))
				b.WriteString(quote)
			} else {
				b.WriteString(text)
			}
			if lastSep || i < len(ds)-1 {
				b.WriteString(delimiter)
			}
		}
		b.WriteString("\r\n")
	}
	return b.String()
}

func (s *testMydumpCSVParserSuite) TestTPCH(c *C) {
	datums := tpchDatums()
	input := datumsToString(datums, "|", "", true)
	reader := mydump.NewStringReader(input)

	cfg := config.CSVConfig{
		Separator:   "|",
		Delimiter:   "",
		TrimLastSep: true,
	}

	parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row:   datums[0],
	})
	c.Assert(parser, posEq, 126, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row:   datums[1],
	})
	c.Assert(parser, posEq, 241, 2)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 3,
		Row:   datums[2],
	})
	c.Assert(parser, posEq, 369, 3)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestTPCHMultiBytes(c *C) {
	datums := tpchDatums()
	sepsAndQuotes := [][2]string{
		{",", ""},
		{"\000", ""},
		{",", ""},
		{"🤔", ""},
		{",", "。"},
		{"||", ""},
		{"|+|", ""},
		{"##", ""},
		{",", "'"},
		{",", `"`},
		{"🤔", `''`},
		{"🤔", `"'`},
		{"🤔", `"'`},
		{"🤔", "🌚"}, // these two emoji share the same prefix bytes
		{"##", "#-"},
		{"\\s", "\\q"},
		{",", "1"},
		{",", "ac"},
	}
	for _, SepAndQuote := range sepsAndQuotes {
		inputStr := datumsToString(datums, SepAndQuote[0], SepAndQuote[1], false)

		// Extract every index in the middle of a '\r\n' from inputStr.
		// They indicate where the parser stops after reading one row;
		// their count should equal the number of datum rows.
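		// None of the TPC-H datums contain '\r', so every '\r' found below belongs to a row terminator.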
		var allExpectedParserPos []int
		for {
			last := 0
			if len(allExpectedParserPos) > 0 {
				last = allExpectedParserPos[len(allExpectedParserPos)-1]
			}
			pos := strings.IndexByte(inputStr[last:], '\r')
			if pos < 0 {
				break
			}
			allExpectedParserPos = append(allExpectedParserPos, last+pos+1)
		}
		c.Assert(allExpectedParserPos, HasLen, len(datums))

		cfg := config.CSVConfig{
			Separator:   SepAndQuote[0],
			Delimiter:   SepAndQuote[1],
			TrimLastSep: false,
		}

		reader := mydump.NewStringReader(inputStr)
		parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)

		for i, expectedParserPos := range allExpectedParserPos {
			c.Assert(parser.ReadRow(), IsNil)
			c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
				RowID: int64(i + 1),
				Row:   datums[i],
			})
			c.Assert(parser, posEq, expectedParserPos, i+1)
		}

		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
	}
}

func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	// example 1, trailing new lines

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 12, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 24, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 2, no trailing new lines

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 12, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 23, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 5, quoted fields

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 18, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 29, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 6, line breaks within fields

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b
bb","ccc"
zzz,yyy,xxx`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("b\nbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 19, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 30, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 7, quote escaping

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b""bb","ccc"`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("b\"bb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 19, 1)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestMySQL(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
		NotNull:         false,
		Null:            `\N`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"\"","\\","\?"
"\
",\N,\\N`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum(`"`),
			types.NewStringDatum(`\`),
			types.NewStringDatum("?"),
		},
	})
	c.Assert(parser, posEq, 15, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("\n"),
			nullDatum,
			types.NewStringDatum(`\N`),
		},
	})
	c.Assert(parser, posEq, 26, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestSyntaxError(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
	}

	inputs := []string{
		`"???`,
		`\`,
		`"\`,
		`0"`,
		`0\`,
		"\"\v",
		`"""`,
		"\"\r",
		"\"\x01",
	}

	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), inputs)

	cfg.BackslashEscape = false
	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), []string{`"\`})
}

func (s *testMydumpCSVParserSuite) TestTSV(c *C) {
	cfg := config.CSVConfig{
		Separator:       "\t",
		Delimiter:       "",
		BackslashEscape: false,
		NotNull:         false,
		Null:            "",
		Header:          true,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`a	b	c	d	e	f
0				foo	0000-00-00
0				foo	0000-00-00
0	abc	def	ghi	bar	1999-12-31`), int64(config.ReadBlockSize), s.ioWorkers, true)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			nullDatum,
			nullDatum,
types.NewStringDatum("foo"), 464 types.NewStringDatum("0000-00-00"), 465 }, 466 }) 467 c.Assert(parser, posEq, 32, 1) 468 c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c", "d", "e", "f"}) 469 470 c.Assert(parser.ReadRow(), IsNil) 471 c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ 472 RowID: 2, 473 Row: []types.Datum{ 474 types.NewStringDatum("0"), 475 nullDatum, 476 nullDatum, 477 nullDatum, 478 types.NewStringDatum("foo"), 479 types.NewStringDatum("0000-00-00"), 480 }, 481 }) 482 c.Assert(parser, posEq, 52, 2) 483 484 c.Assert(parser.ReadRow(), IsNil) 485 c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ 486 RowID: 3, 487 Row: []types.Datum{ 488 types.NewStringDatum("0"), 489 types.NewStringDatum("abc"), 490 types.NewStringDatum("def"), 491 types.NewStringDatum("ghi"), 492 types.NewStringDatum("bar"), 493 types.NewStringDatum("1999-12-31"), 494 }, 495 }) 496 c.Assert(parser, posEq, 80, 3) 497 498 c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF) 499 } 500 501 func (s *testMydumpCSVParserSuite) TestCsvWithWhiteSpaceLine(c *C) { 502 cfg := config.CSVConfig{ 503 Separator: ",", 504 Delimiter: `"`, 505 } 506 data := " \r\n\r\n0,,abc\r\n \r\n123,1999-12-31,test\r\n" 507 parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, false) 508 c.Assert(parser.ReadRow(), IsNil) 509 c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ 510 RowID: 1, 511 Row: []types.Datum{ 512 types.NewStringDatum("0"), 513 nullDatum, 514 types.NewStringDatum("abc"), 515 }, 516 }) 517 518 c.Assert(parser, posEq, 12, 1) 519 c.Assert(parser.ReadRow(), IsNil) 520 c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ 521 RowID: 2, 522 Row: []types.Datum{ 523 types.NewStringDatum("123"), 524 types.NewStringDatum("1999-12-31"), 525 types.NewStringDatum("test"), 526 }, 527 }) 528 c.Assert(parser.Close(), IsNil) 529 530 cfg.Header = true 531 data = " \r\na,b,c\r\n0,,abc\r\n" 532 parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, true) 533 c.Assert(parser.ReadRow(), IsNil) 534 c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c"}) 535 c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ 536 RowID: 1, 537 Row: []types.Datum{ 538 types.NewStringDatum("0"), 539 nullDatum, 540 types.NewStringDatum("abc"), 541 }, 542 }) 543 544 c.Assert(parser, posEq, 17, 1) 545 c.Assert(parser.Close(), IsNil) 546 } 547 548 func (s *testMydumpCSVParserSuite) TestEmpty(c *C) { 549 cfg := config.CSVConfig{ 550 Separator: ",", 551 Delimiter: `"`, 552 } 553 554 parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, false) 555 c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF) 556 557 // Try again with headers. 

	cfg.Header = true

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("h\n"), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestCRLF(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}
	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("a\rb\r\nc\n\n\n\nd"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row:   []types.Datum{types.NewStringDatum("a")},
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row:   []types.Datum{types.NewStringDatum("b")},
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 3,
		Row:   []types.Datum{types.NewStringDatum("c")},
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 4,
		Row:   []types.Datum{types.NewStringDatum("d")},
	})

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestQuotedSeparator(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`",",','`), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum(","),
			types.NewStringDatum("'"),
			types.NewStringDatum("'"),
		},
	})

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestConsecutiveFields(c *C) {
	// Note: the behavior of reading `"xxx"yyy` here is undefined in RFC 4180.
	// Python's CSV module returns `xxxyyy`.
	// Rust's CSV package returns `xxxyyy`.
	// Go's CSV package returns a parse error.
	// NPM's CSV package returns a parse error.
	// MySQL's LOAD DATA statement returns `"xxx"yyy` as-is.
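	// This parser rejects such input with a syntax error, as the failing cases below verify.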

	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	testCases := []string{
		`"x"?`,
		"\"\"\x01",
		"\"\"\v",
	}

	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
}

func (s *testMydumpCSVParserSuite) TestSpecialChars(c *C) {
	cfg := config.CSVConfig{Separator: ",", Delimiter: `"`}
	testCases := []testCase{
		{
			input:    "\x00",
			expected: [][]types.Datum{{types.NewStringDatum("\x00")}},
		},
		{
			input:    `0\`,
			expected: [][]types.Datum{{types.NewStringDatum(`0\`)}},
		},
		{
			input:    `\`,
			expected: [][]types.Datum{{types.NewStringDatum(`\`)}},
		},
		{
			input:    "0\v",
			expected: [][]types.Datum{{types.NewStringDatum("0\v")}},
		},
		{
			input:    "0\x00",
			expected: [][]types.Datum{{types.NewStringDatum("0\x00")}},
		},
		{
			input:    "\n\r",
			expected: [][]types.Datum{},
		},
		{
			input:    `"""",0`,
			expected: [][]types.Datum{{types.NewStringDatum(`"`), types.NewStringDatum(`0`)}},
		},
	}

	s.runTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
}

func (s *testMydumpCSVParserSuite) TestContinuation(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
		TrimLastSep:     true,
	}

	testCases := []testCase{
		{
			input: `"abcdef",\njklm,nop` + "\r\n" + `"""""","\n",a,`,
			expected: [][]types.Datum{
				{
					types.NewStringDatum("abcdef"),
					types.NewStringDatum("\njklm"),
					types.NewStringDatum("nop"),
				},
				{
					types.NewStringDatum(`""`),
					types.NewStringDatum("\n"),
					types.NewStringDatum("a"),
				},
			},
		},
		{
			input:    `"VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca"`,
			expected: [][]types.Datum{{types.NewStringDatum("VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca")}},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)
}

func (s *testMydumpCSVParserSuite) TestBackslashAsSep(c *C) {
	cfg := config.CSVConfig{
		Separator: `\`,
		Delimiter: `"`,
	}

	testCases := []testCase{
		{
			input:    `0\`,
			expected: [][]types.Datum{{types.NewStringDatum("0"), nullDatum}},
		},
		{
			input:    `\`,
			expected: [][]types.Datum{{nullDatum, nullDatum}},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)

	failingInputs := []string{
		`"\`,
	}
	s.runFailingTestCases(c, &cfg, 1, failingInputs)
}

func (s *testMydumpCSVParserSuite) TestBackslashAsDelim(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `\`,
	}

	testCases := []testCase{
		{
			input:    `\\`,
			expected: [][]types.Datum{{nullDatum}},
		},
	}
	s.runTestCases(c, &cfg, 1, testCases)

	failingInputs := []string{
		`"\`,
	}
	s.runFailingTestCases(c, &cfg, 1, failingInputs)
}

// errorReader implements the Reader interface which always returns an error.
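// TestReadError uses it to check that ReadRow propagates errors from the underlying reader.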
type errorReader struct{}

func (*errorReader) Read(p []byte) (int, error) {
	return 0, errors.New("fake read error")
}

func (*errorReader) Seek(offset int64, whence int) (int64, error) {
	return 0, errors.New("fake seek error")
}

func (*errorReader) Close() error {
	return errors.New("fake close error")
}

func (s *testMydumpCSVParserSuite) TestReadError(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, &errorReader{}, int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), ErrorMatches, "fake read error")
}

// TestSyntaxErrorLog checks that a syntax error won't dump huge strings into the log.
func (s *testMydumpCSVParserSuite) TestSyntaxErrorLog(c *C) {
	cfg := config.CSVConfig{
		Separator: "\t",
		Delimiter: "'",
	}

	tc := mydump.NewStringReader("x'" + strings.Repeat("y", 50000))
	parser := mydump.NewCSVParser(&cfg, tc, 50000, s.ioWorkers, false)
	logger, buffer := log.MakeTestLogger()
	parser.SetLogger(logger)
	c.Assert(parser.ReadRow(), ErrorMatches, "syntax error.*")
	c.Assert(logger.Sync(), IsNil)

	c.Assert(
		buffer.Stripped(), Equals,
		`{"$lvl":"ERROR","$msg":"syntax error","pos":1,"content":"'`+strings.Repeat("y", 255)+`"}`,
	)
}

// TestTrimLastSep checks that setting `TrimLastSep` to true trims only the last empty field.
func (s *testMydumpCSVParserSuite) TestTrimLastSep(c *C) {
	cfg := config.CSVConfig{
		Separator:   ",",
		Delimiter:   `"`,
		TrimLastSep: true,
	}
	parser := mydump.NewCSVParser(
		&cfg,
		mydump.NewStringReader("123,456,789,\r\na,b,,\r\n,,,\r\n\"a\",\"\",\"\",\r\n"),
		int64(config.ReadBlockSize),
		s.ioWorkers,
		false,
	)
	for i := 0; i < 4; i++ {
		c.Assert(parser.ReadRow(), IsNil)
		c.Assert(len(parser.LastRow().Row), Equals, 3)
	}
}

// Run `go test github.com/pingcap/tidb-lightning/lightning/mydump -check.b -check.bmem -test.v` to get benchmark results.
// Please ensure your temporary storage has (c.N / 2) KiB of free space.
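// (setupTest writes one row of roughly half a KiB per benchmark iteration, which is where that estimate comes from.)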

type benchCSVParserSuite struct {
	csvPath   string
	ioWorkers *worker.Pool
}

var _ = Suite(&benchCSVParserSuite{})

func (s *benchCSVParserSuite) setupTest(c *C) {
	s.ioWorkers = worker.NewPool(context.Background(), 5, "bench_csv")

	dir := c.MkDir()
	s.csvPath = filepath.Join(dir, "input.csv")
	file, err := os.Create(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()
	for i := 0; i < c.N; i++ {
		_, err = file.WriteString("18,1,1,0.3650,GC,BARBARBAR,rw9AOV1AjoI1,50000.00,-10.00,10.00,1,1,djj3Q2XaIPoYVy1FuF,gc80Q2o82Au3C9xv,PYOolSxG3w,DI,265111111,7586538936787184,2020-02-26 20:06:00.193,OE,YCkSPBVqoJ2V5F8zWs87V5XzbaIY70aWCD4dgcB6bjUzCr5wOJCJ2TYH49J7yWyysbudJIxlTAEWSJahY7hswLtTsqyjEkrlsN8iDMAa9Poj29miJ08tnn2G8mL64IlyywvnRGbLbyGvWDdrOSF42RyUFTWVyqlDWc6Gr5wyMPYgvweKemzFDVD3kro5JsmBmJY08EK54nQoyfo2sScyb34zcM9GFo9ZQTwloINfPYQKXQm32m0XvU7jiNmYpFTFJQjdqA825SEvQqMMefG2WG4jVu9UPdhdUjRsFRd0Gw7YPKByOlcuY0eKxT7sAzMKXx2000RR6dqHNXe47oVYd\n")
		c.Assert(err, IsNil)
	}
	c.ResetTimer()
}

func (s *benchCSVParserSuite) BenchmarkReadRowUsingMydumpCSVParser(c *C) {
	s.setupTest(c)

	file, err := os.Open(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()

	cfg := config.CSVConfig{Separator: ","}
	parser := mydump.NewCSVParser(&cfg, file, 65536, s.ioWorkers, false)
	parser.SetLogger(log.Logger{Logger: zap.NewNop()})

	rowsCount := 0
	for {
		err := parser.ReadRow()
		if err == nil {
			parser.RecycleRow(parser.LastRow())
			rowsCount++
			continue
		}
		if errors.Cause(err) == io.EOF {
			break
		}
		c.Fatal(err)
	}
	c.Assert(rowsCount, Equals, c.N)
}

func (s *benchCSVParserSuite) BenchmarkReadRowUsingEncodingCSV(c *C) {
	s.setupTest(c)

	file, err := os.Open(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()

	csvParser := csv.NewReader(file)

	rowsCount := 0
	var datums []types.Datum
	for {
		records, err := csvParser.Read()
		if err == nil {
			// for fair comparison, we need to include the cost of conversion to Datum.
			for _, record := range records {
				datums = append(datums, types.NewStringDatum(record))
			}
			datums = datums[:0]
			rowsCount++
			continue
		}
		if errors.Cause(err) == io.EOF {
			break
		}
		c.Fatal(err)
	}
	c.Assert(rowsCount, Equals, c.N)
}