github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/csv_parser_test.go

package mydump_test

import (
	"context"
	"encoding/csv"
	"io"
	"os"
	"path/filepath"
	"strings"

	. "github.com/pingcap/check"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"

	"github.com/pingcap/br/pkg/lightning/config"
	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/lightning/mydump"
	"github.com/pingcap/br/pkg/lightning/worker"
)

var _ = Suite(&testMydumpCSVParserSuite{})

type testMydumpCSVParserSuite struct {
	ioWorkers *worker.Pool
}

func (s *testMydumpCSVParserSuite) SetUpSuite(c *C) {
	s.ioWorkers = worker.NewPool(context.Background(), 5, "test_csv")
}
func (s *testMydumpCSVParserSuite) TearDownSuite(c *C) {}

type assertPosEq struct {
	*CheckerInfo
}

var posEq = &assertPosEq{
	&CheckerInfo{Name: "posEq", Params: []string{"parser", "pos", "rowID"}},
}

func (checker *assertPosEq) Check(params []interface{}, names []string) (result bool, error string) {
	parser := params[0].(mydump.Parser)
	pos, rowID := parser.Pos()
	expectedPos := int64(params[1].(int))
	expectedRowID := int64(params[2].(int))
	return pos == expectedPos && rowID == expectedRowID, ""
}

var nullDatum types.Datum

type testCase struct {
	input    string
	expected [][]types.Datum
}

func (s *testMydumpCSVParserSuite) runTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []testCase) {
	for _, tc := range cases {
		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc.input), blockBufSize, s.ioWorkers, false)
		for i, row := range tc.expected {
			comment := Commentf("input = %q, row = %d", tc.input, i+1)
			e := parser.ReadRow()
			c.Assert(e, IsNil, Commentf("input = %q, row = %d, error = %s", tc.input, i+1, errors.ErrorStack(e)))
			c.Assert(parser.LastRow().RowID, DeepEquals, int64(i)+1, comment)
			c.Assert(parser.LastRow().Row, DeepEquals, row, comment)
		}
		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF, Commentf("input = %q", tc.input))
	}
}

func (s *testMydumpCSVParserSuite) runFailingTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []string) {
	for _, tc := range cases {
		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc), blockBufSize, s.ioWorkers, false)
		e := parser.ReadRow()
		c.Assert(e, ErrorMatches, "syntax error.*", Commentf("input = %q / %s", tc, errors.ErrorStack(e)))
	}
}

func tpchDatums() [][]types.Datum {
	datums := make([][]types.Datum, 0, 3)
	datums = append(datums, []types.Datum{
		types.NewStringDatum("1"),
		types.NewStringDatum("goldenrod lavender spring chocolate lace"),
		types.NewStringDatum("Manufacturer#1"),
		types.NewStringDatum("Brand#13"),
		types.NewStringDatum("PROMO BURNISHED COPPER"),
		types.NewStringDatum("7"),
		types.NewStringDatum("JUMBO PKG"),
		types.NewStringDatum("901.00"),
		types.NewStringDatum("ly. slyly ironi"),
	})
	datums = append(datums, []types.Datum{
		types.NewStringDatum("2"),
		types.NewStringDatum("blush thistle blue yellow saddle"),
		types.NewStringDatum("Manufacturer#1"),
		types.NewStringDatum("Brand#13"),
		types.NewStringDatum("LARGE BRUSHED BRASS"),
		types.NewStringDatum("1"),
		types.NewStringDatum("LG CASE"),
		types.NewStringDatum("902.00"),
		types.NewStringDatum("lar accounts amo"),
	})
	datums = append(datums, []types.Datum{
		types.NewStringDatum("3"),
		types.NewStringDatum("spring green yellow purple cornsilk"),
		types.NewStringDatum("Manufacturer#4"),
		types.NewStringDatum("Brand#42"),
		types.NewStringDatum("STANDARD POLISHED BRASS"),
		types.NewStringDatum("21"),
		types.NewStringDatum("WRAP CASE"),
		types.NewStringDatum("903.00"),
		types.NewStringDatum("egular deposits hag"),
	})

	return datums
}

func datumsToString(datums [][]types.Datum, delimiter string, quote string, lastSep bool) string {
	var b strings.Builder
	doubleQuote := quote + quote
	for _, ds := range datums {
		for i, d := range ds {
			text := d.GetString()
			if len(quote) > 0 {
				b.WriteString(quote)
				b.WriteString(strings.ReplaceAll(text, quote, doubleQuote))
				b.WriteString(quote)
			} else {
				b.WriteString(text)
			}
			if lastSep || i < len(ds)-1 {
				b.WriteString(delimiter)
			}
		}
		b.WriteString("\r\n")
	}
	return b.String()
}

func (s *testMydumpCSVParserSuite) TestTPCH(c *C) {
	datums := tpchDatums()
	input := datumsToString(datums, "|", "", true)
	reader := mydump.NewStringReader(input)

	cfg := config.CSVConfig{
		Separator:   "|",
		Delimiter:   "",
		TrimLastSep: true,
	}

	parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  1,
		Row:    datums[0],
		Length: 116,
	})
	c.Assert(parser, posEq, 126, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  2,
		Row:    datums[1],
		Length: 104,
	})
	c.Assert(parser, posEq, 241, 2)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  3,
		Row:    datums[2],
		Length: 117,
	})
	c.Assert(parser, posEq, 369, 3)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestTPCHMultiBytes(c *C) {
	datums := tpchDatums()
	sepsAndQuotes := [][2]string{
		{",", ""},
		{"\000", ""},
		{",", ""},
		{"🤔", ""},
		{",", "。"},
		{"||", ""},
		{"|+|", ""},
		{"##", ""},
		{",", "'"},
		{",", `"`},
		{"🤔", `''`},
		{"🤔", `"'`},
		{"🤔", `"'`},
		{"🤔", "🌚"}, // these two emoji share the same prefix bytes
		{"##", "#-"},
		{"\\s", "\\q"},
		{",", "1"},
		{",", "ac"},
	}
	for _, SepAndQuote := range sepsAndQuotes {
		inputStr := datumsToString(datums, SepAndQuote[0], SepAndQuote[1], false)

		// Extract every index in the middle of each "\r\n" in inputStr.
		// They indicate where the parser stops after reading one row.
		// The count should equal the number of rows in datums.
		var allExpectedParserPos []int
		for {
			last := 0
			if len(allExpectedParserPos) > 0 {
				last = allExpectedParserPos[len(allExpectedParserPos)-1]
			}
			pos := strings.IndexByte(inputStr[last:], '\r')
			if pos < 0 {
				break
			}
			allExpectedParserPos = append(allExpectedParserPos, last+pos+1)
		}
		c.Assert(allExpectedParserPos, HasLen, len(datums))

		cfg := config.CSVConfig{
			Separator:   SepAndQuote[0],
			Delimiter:   SepAndQuote[1],
			TrimLastSep: false,
		}

		reader := mydump.NewStringReader(inputStr)
		parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)

		for i, expectedParserPos := range allExpectedParserPos {
			c.Assert(parser.ReadRow(), IsNil)
			c.Assert(parser.LastRow().RowID, DeepEquals, int64(i+1))
			c.Assert(parser.LastRow().Row, DeepEquals, datums[i])

			c.Assert(parser, posEq, expectedParserPos, i+1)
		}

		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
	}
}

func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	// example 1, trailing new lines

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 12, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 24, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 2, no trailing new lines

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 12, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 23, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 5, quoted fields

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 18, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 29, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 6, line breaks within fields

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b
bb","ccc"
zzz,yyy,xxx`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("b\nbb"),
			types.NewStringDatum("ccc"),
		},
		Length: 10,
	})
	c.Assert(parser, posEq, 19, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
		Length: 9,
	})
	c.Assert(parser, posEq, 30, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 7, quote escaping

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b""bb","ccc"`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("b\"bb"),
			types.NewStringDatum("ccc"),
		},
		Length: 10,
	})
	c.Assert(parser, posEq, 19, 1)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestMySQL(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
		NotNull:         false,
		Null:            `\N`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"\"","\\","\?"
"\
",\N,\\N`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum(`"`),
			types.NewStringDatum(`\`),
			types.NewStringDatum("?"),
		},
		Length: 6,
	})
	c.Assert(parser, posEq, 15, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("\n"),
			nullDatum,
			types.NewStringDatum(`\N`),
		},
		Length: 7,
	})
	c.Assert(parser, posEq, 26, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestSyntaxError(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
	}

	inputs := []string{
		`"???`,
		`\`,
		`"\`,
		`0"`,
		`0\`,
		"\"\v",
		`"""`,
		"\"\r",
		"\"\x01",
	}

	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), inputs)

	cfg.BackslashEscape = false
	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), []string{`"\`})
}

func (s *testMydumpCSVParserSuite) TestTSV(c *C) {
	cfg := config.CSVConfig{
		Separator:       "\t",
		Delimiter:       "",
		BackslashEscape: false,
		NotNull:         false,
		Null:            "",
		Header:          true,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`a	b	c	d	e	f
0				foo	0000-00-00
0				foo	0000-00-00
0	abc	def	ghi	bar	1999-12-31`), int64(config.ReadBlockSize), s.ioWorkers, true)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			nullDatum,
			nullDatum,
			types.NewStringDatum("foo"),
			types.NewStringDatum("0000-00-00"),
		},
		Length: 14,
	})
	c.Assert(parser, posEq, 32, 1)
	c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c", "d", "e", "f"})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			nullDatum,
			nullDatum,
			types.NewStringDatum("foo"),
			types.NewStringDatum("0000-00-00"),
		},
		Length: 14,
	})
	c.Assert(parser, posEq, 52, 2)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 3,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			types.NewStringDatum("abc"),
			types.NewStringDatum("def"),
			types.NewStringDatum("ghi"),
			types.NewStringDatum("bar"),
			types.NewStringDatum("1999-12-31"),
		},
		Length: 23,
	})
	c.Assert(parser, posEq, 80, 3)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestCsvWithWhiteSpaceLine(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}
	data := " \r\n\r\n0,,abc\r\n \r\n123,1999-12-31,test\r\n"
	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			types.NewStringDatum("abc"),
		},
		Length: 4,
	})

	c.Assert(parser, posEq, 12, 1)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("123"),
			types.NewStringDatum("1999-12-31"),
			types.NewStringDatum("test"),
		},
		Length: 17,
	})
	c.Assert(parser.Close(), IsNil)

	cfg.Header = true
	data = " \r\na,b,c\r\n0,,abc\r\n"
	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c"})
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			types.NewStringDatum("abc"),
		},
		Length: 4,
	})

	c.Assert(parser, posEq, 17, 1)
	c.Assert(parser.Close(), IsNil)
}

func (s *testMydumpCSVParserSuite) TestEmpty(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// Try again with headers.

	cfg.Header = true

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("h\n"), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestCRLF(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}
	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("a\rb\r\nc\n\n\n\nd"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  1,
		Row:    []types.Datum{types.NewStringDatum("a")},
		Length: 1,
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  2,
		Row:    []types.Datum{types.NewStringDatum("b")},
		Length: 1,
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  3,
		Row:    []types.Datum{types.NewStringDatum("c")},
		Length: 1,
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID:  4,
		Row:    []types.Datum{types.NewStringDatum("d")},
		Length: 1,
	})

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestQuotedSeparator(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`",",','`), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum(","),
			types.NewStringDatum("'"),
			types.NewStringDatum("'"),
		},
		Length: 3,
	})

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestConsecutiveFields(c *C) {
	// Note: the behavior of reading `"xxx"yyy` here is undefined in RFC 4180.
	// Python's CSV module returns `xxxyyy`.
	// Rust's CSV package returns `xxxyyy`.
	// Go's CSV package returns a parse error.
	// NPM's CSV package returns a parse error.
	// MySQL's LOAD DATA statement returns `"xxx"yyy` as-is.

	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	testCases := []string{
		`"x"?`,
		"\"\"\x01",
		"\"\"\v",
		`abc""`,
	}

	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)

	cfg.Delimiter = "|+|"
	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), []string{
		"abc|1|+||+|\r\n",
	})
}

func (s *testMydumpCSVParserSuite) TestSpecialChars(c *C) {
	cfg := config.CSVConfig{Separator: ",", Delimiter: `"`}
	testCases := []testCase{
		{
			input:    "\x00",
			expected: [][]types.Datum{{types.NewStringDatum("\x00")}},
		},
		{
			input:    `0\`,
			expected: [][]types.Datum{{types.NewStringDatum(`0\`)}},
		},
		{
			input:    `\`,
			expected: [][]types.Datum{{types.NewStringDatum(`\`)}},
		},
		{
			input:    "0\v",
			expected: [][]types.Datum{{types.NewStringDatum("0\v")}},
		},
		{
			input:    "0\x00",
			expected: [][]types.Datum{{types.NewStringDatum("0\x00")}},
		},
		{
			input:    "\n\r",
			expected: [][]types.Datum{},
		},
		{
			input:    `"""",0`,
			expected: [][]types.Datum{{types.NewStringDatum(`"`), types.NewStringDatum(`0`)}},
		},
	}

	s.runTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
}

func (s *testMydumpCSVParserSuite) TestContinuation(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
		TrimLastSep:     true,
	}

	testCases := []testCase{
		{
			input: `"abcdef",\njklm,nop` + "\r\n" + `"""""","\n",a,`,
			expected: [][]types.Datum{
				{
					types.NewStringDatum("abcdef"),
					types.NewStringDatum("\njklm"),
					types.NewStringDatum("nop"),
				},
				{
					types.NewStringDatum(`""`),
					types.NewStringDatum("\n"),
					types.NewStringDatum("a"),
				},
			},
		},
		{
			input:    `"VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca"`,
			expected: [][]types.Datum{{types.NewStringDatum("VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca")}},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)
}

func (s *testMydumpCSVParserSuite) TestBackslashAsSep(c *C) {
	cfg := config.CSVConfig{
		Separator: `\`,
		Delimiter: `"`,
	}

	testCases := []testCase{
		{
			input:    `0\`,
			expected: [][]types.Datum{{types.NewStringDatum("0"), nullDatum}},
		},
		{
			input:    `\`,
			expected: [][]types.Datum{{nullDatum, nullDatum}},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)

	failingInputs := []string{
		`"\`,
	}
	s.runFailingTestCases(c, &cfg, 1, failingInputs)
}

func (s *testMydumpCSVParserSuite) TestBackslashAsDelim(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `\`,
	}

	testCases := []testCase{
		{
			input:    `\\`,
			expected: [][]types.Datum{{nullDatum}},
		},
	}
	s.runTestCases(c, &cfg, 1, testCases)

	failingInputs := []string{
		`"\`,
	}
	s.runFailingTestCases(c, &cfg, 1, failingInputs)
}

// errorReader implements a reader whose methods always return an error.
type errorReader struct{}

func (*errorReader) Read(p []byte) (int, error) {
	return 0, errors.New("fake read error")
}

func (*errorReader) Seek(offset int64, whence int) (int64, error) {
	return 0, errors.New("fake seek error")
}

func (*errorReader) Close() error {
	return errors.New("fake close error")
}

func (s *testMydumpCSVParserSuite) TestReadError(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, &errorReader{}, int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), ErrorMatches, "fake read error")
}

// TestSyntaxErrorLog checks that a syntax error won't dump huge strings into the log.
func (s *testMydumpCSVParserSuite) TestSyntaxErrorLog(c *C) {
	cfg := config.CSVConfig{
		Separator: "\t",
		Delimiter: "'",
	}

	tc := mydump.NewStringReader("x'" + strings.Repeat("y", 50000))
	parser := mydump.NewCSVParser(&cfg, tc, 50000, s.ioWorkers, false)
	logger, buffer := log.MakeTestLogger()
	parser.SetLogger(logger)
	c.Assert(parser.ReadRow(), ErrorMatches, "syntax error.*")
	c.Assert(logger.Sync(), IsNil)

	c.Assert(
		buffer.Stripped(), Equals,
		`{"$lvl":"ERROR","$msg":"syntax error","pos":2,"content":"`+strings.Repeat("y", 256)+`"}`,
	)
}

// TestTrimLastSep checks that setting `TrimLastSep` to true trims only the last empty field.
func (s *testMydumpCSVParserSuite) TestTrimLastSep(c *C) {
	cfg := config.CSVConfig{
		Separator:   ",",
		Delimiter:   `"`,
		TrimLastSep: true,
	}
	parser := mydump.NewCSVParser(
		&cfg,
		mydump.NewStringReader("123,456,789,\r\na,b,,\r\n,,,\r\n\"a\",\"\",\"\",\r\n"),
		int64(config.ReadBlockSize),
		s.ioWorkers,
		false,
	)
	for i := 0; i < 4; i++ {
		c.Assert(parser.ReadRow(), IsNil)
		c.Assert(len(parser.LastRow().Row), Equals, 3)
	}
}

// TestTerminator checks for customized terminators.
func (s *testMydumpCSVParserSuite) TestTerminator(c *C) {
	cfg := config.CSVConfig{
		Separator:  "|+|",
		Terminator: "|+|\n",
	}

	testCases := []testCase{
		{
			input: "5|+|abc\ndef\nghi|+|6|+|\n7|+|xy|+z|+|8|+|\n",
			expected: [][]types.Datum{
				{types.NewStringDatum("5"), types.NewStringDatum("abc\ndef\nghi"), types.NewStringDatum("6")},
				{types.NewStringDatum("7"), types.NewStringDatum("xy|+z"), types.NewStringDatum("8")},
			},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)

	cfg.Delimiter = "|+>"

	testCases = []testCase{
		{
			input: "xyz|+|+>|+|\n|+>|+|\n|+>|+|\r|+|\n",
			expected: [][]types.Datum{
				{types.NewStringDatum("xyz"), types.NewStringDatum("+>")},
				{types.NewStringDatum("|+|\n"), types.NewStringDatum("\r")},
			},
		},
	}
	s.runTestCases(c, &cfg, 1, testCases)
}

// Run `go test github.com/pingcap/br/pkg/lightning/mydump -check.b -check.bmem -test.v` to get benchmark results.
// Please ensure your temporary storage has (c.N / 2) KiB of free space.

type benchCSVParserSuite struct {
	csvPath   string
	ioWorkers *worker.Pool
}

var _ = Suite(&benchCSVParserSuite{})

func (s *benchCSVParserSuite) setupTest(c *C) {
	s.ioWorkers = worker.NewPool(context.Background(), 5, "bench_csv")

	dir := c.MkDir()
	s.csvPath = filepath.Join(dir, "input.csv")
	file, err := os.Create(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()
	for i := 0; i < c.N; i++ {
		_, err = file.WriteString("18,1,1,0.3650,GC,BARBARBAR,rw9AOV1AjoI1,50000.00,-10.00,10.00,1,1,djj3Q2XaIPoYVy1FuF,gc80Q2o82Au3C9xv,PYOolSxG3w,DI,265111111,7586538936787184,2020-02-26 20:06:00.193,OE,YCkSPBVqoJ2V5F8zWs87V5XzbaIY70aWCD4dgcB6bjUzCr5wOJCJ2TYH49J7yWyysbudJIxlTAEWSJahY7hswLtTsqyjEkrlsN8iDMAa9Poj29miJ08tnn2G8mL64IlyywvnRGbLbyGvWDdrOSF42RyUFTWVyqlDWc6Gr5wyMPYgvweKemzFDVD3kro5JsmBmJY08EK54nQoyfo2sScyb34zcM9GFo9ZQTwloINfPYQKXQm32m0XvU7jiNmYpFTFJQjdqA825SEvQqMMefG2WG4jVu9UPdhdUjRsFRd0Gw7YPKByOlcuY0eKxT7sAzMKXx2000RR6dqHNXe47oVYd\n")
		c.Assert(err, IsNil)
	}
	c.ResetTimer()
}

func (s *benchCSVParserSuite) BenchmarkReadRowUsingMydumpCSVParser(c *C) {
	s.setupTest(c)

	file, err := os.Open(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()

	cfg := config.CSVConfig{Separator: ","}
	parser := mydump.NewCSVParser(&cfg, file, 65536, s.ioWorkers, false)
	parser.SetLogger(log.Logger{Logger: zap.NewNop()})

	rowsCount := 0
	for {
		err := parser.ReadRow()
		if err == nil {
			parser.RecycleRow(parser.LastRow())
			rowsCount++
			continue
		}
		if errors.Cause(err) == io.EOF {
			break
		}
		c.Fatal(err)
	}
	c.Assert(rowsCount, Equals, c.N)
}

func (s *benchCSVParserSuite) BenchmarkReadRowUsingEncodingCSV(c *C) {
	s.setupTest(c)

	file, err := os.Open(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()

	csvParser := csv.NewReader(file)

	rowsCount := 0
	var datums []types.Datum
	for {
		records, err := csvParser.Read()
		if err == nil {
			// for fair comparison, we need to include the cost of conversion to Datum.
			for _, record := range records {
				datums = append(datums, types.NewStringDatum(record))
			}
			datums = datums[:0]
			rowsCount++
			continue
		}
		if errors.Cause(err) == io.EOF {
			break
		}
		c.Fatal(err)
	}
	c.Assert(rowsCount, Equals, c.N)
}
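
// The helper below is an illustrative sketch, not part of the original test
// suite: it factors out the ReadRow/RecycleRow loop that the benchmarks above
// repeat inline. The name readAllRows is hypothetical, and it assumes the
// mydump.Parser interface exposes ReadRow, LastRow, and RecycleRow as used
// throughout this file.
func readAllRows(parser mydump.Parser) (int, error) {
	count := 0
	for {
		if err := parser.ReadRow(); err != nil {
			if errors.Cause(err) == io.EOF {
				// io.EOF marks the end of input rather than a real failure.
				return count, nil
			}
			return count, err
		}
		// Return the row's backing buffers to the parser before reading the next row.
		parser.RecycleRow(parser.LastRow())
		count++
	}
}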