github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/csv_parser_test.go (about)

     1  package mydump_test
     2  
import (
	"context"
	"encoding/csv"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	. "github.com/pingcap/check"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"

	"github.com/pingcap/br/pkg/lightning/config"
	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/lightning/mydump"
	"github.com/pingcap/br/pkg/lightning/worker"
)
    21  
// Register the CSV parser test suite with the check framework.
var _ = Suite(&testMydumpCSVParserSuite{})

// testMydumpCSVParserSuite holds the state shared by the CSV parser tests.
type testMydumpCSVParserSuite struct {
	// ioWorkers is the worker pool passed to every parser built by the tests.
	ioWorkers *worker.Pool
}
    27  
    28  func (s *testMydumpCSVParserSuite) SetUpSuite(c *C) {
    29  	s.ioWorkers = worker.NewPool(context.Background(), 5, "test_csv")
    30  }
// TearDownSuite performs no work.
func (s *testMydumpCSVParserSuite) TearDownSuite(c *C) {}
    32  
// assertPosEq is a custom checker that compares the values returned by a
// parser's Pos() against an expected position and row ID.
type assertPosEq struct {
	*CheckerInfo
}

// posEq is the checker instance used by the tests, e.g.
// c.Assert(parser, posEq, expectedPos, expectedRowID).
var posEq = &assertPosEq{
	&CheckerInfo{Name: "posEq", Params: []string{"parser", "pos", "rowID"}},
}
    40  
    41  func (checker *assertPosEq) Check(params []interface{}, names []string) (result bool, error string) {
    42  	parser := params[0].(mydump.Parser)
    43  	pos, rowID := parser.Pos()
    44  	expectedPos := int64(params[1].(int))
    45  	expectedRowID := int64(params[2].(int))
    46  	return pos == expectedPos && rowID == expectedRowID, ""
    47  }
    48  
// nullDatum is a zero-value Datum; the tests use it as the expected value
// for fields the parser reports as NULL.
var nullDatum types.Datum
    50  
// testCase pairs a raw CSV input with the rows the parser is expected to
// produce from it, in order. It is consumed by runTestCases.
type testCase struct {
	// input is the CSV text fed to the parser.
	input string
	// expected holds one []types.Datum per row the parser should return.
	expected [][]types.Datum
}
    55  
    56  func (s *testMydumpCSVParserSuite) runTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []testCase) {
    57  	for _, tc := range cases {
    58  		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc.input), blockBufSize, s.ioWorkers, false)
    59  		for i, row := range tc.expected {
    60  			comment := Commentf("input = %q, row = %d", tc.input, i+1)
    61  			e := parser.ReadRow()
    62  			c.Assert(e, IsNil, Commentf("input = %q, row = %d, error = %s", tc.input, i+1, errors.ErrorStack(e)))
    63  			c.Assert(parser.LastRow().RowID, DeepEquals, int64(i)+1, comment)
    64  			c.Assert(parser.LastRow().Row, DeepEquals, row, comment)
    65  
    66  		}
    67  		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF, Commentf("input = %q", tc.input))
    68  	}
    69  }
    70  
    71  func (s *testMydumpCSVParserSuite) runFailingTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []string) {
    72  	for _, tc := range cases {
    73  		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc), blockBufSize, s.ioWorkers, false)
    74  		e := parser.ReadRow()
    75  		c.Assert(e, ErrorMatches, "syntax error.*", Commentf("input = %q / %s", tc, errors.ErrorStack(e)))
    76  	}
    77  }
    78  
    79  func tpchDatums() [][]types.Datum {
    80  	datums := make([][]types.Datum, 0, 3)
    81  	datums = append(datums, []types.Datum{
    82  		types.NewStringDatum("1"),
    83  		types.NewStringDatum("goldenrod lavender spring chocolate lace"),
    84  		types.NewStringDatum("Manufacturer#1"),
    85  		types.NewStringDatum("Brand#13"),
    86  		types.NewStringDatum("PROMO BURNISHED COPPER"),
    87  		types.NewStringDatum("7"),
    88  		types.NewStringDatum("JUMBO PKG"),
    89  		types.NewStringDatum("901.00"),
    90  		types.NewStringDatum("ly. slyly ironi"),
    91  	})
    92  	datums = append(datums, []types.Datum{
    93  		types.NewStringDatum("2"),
    94  		types.NewStringDatum("blush thistle blue yellow saddle"),
    95  		types.NewStringDatum("Manufacturer#1"),
    96  		types.NewStringDatum("Brand#13"),
    97  		types.NewStringDatum("LARGE BRUSHED BRASS"),
    98  		types.NewStringDatum("1"),
    99  		types.NewStringDatum("LG CASE"),
   100  		types.NewStringDatum("902.00"),
   101  		types.NewStringDatum("lar accounts amo"),
   102  	})
   103  	datums = append(datums, []types.Datum{
   104  		types.NewStringDatum("3"),
   105  		types.NewStringDatum("spring green yellow purple cornsilk"),
   106  		types.NewStringDatum("Manufacturer#4"),
   107  		types.NewStringDatum("Brand#42"),
   108  		types.NewStringDatum("STANDARD POLISHED BRASS"),
   109  		types.NewStringDatum("21"),
   110  		types.NewStringDatum("WRAP CASE"),
   111  		types.NewStringDatum("903.00"),
   112  		types.NewStringDatum("egular deposits hag"),
   113  	})
   114  
   115  	return datums
   116  }
   117  
   118  func datumsToString(datums [][]types.Datum, delimitor string, quote string, lastSep bool) string {
   119  	var b strings.Builder
   120  	doubleQuote := quote + quote
   121  	for _, ds := range datums {
   122  		for i, d := range ds {
   123  			text := d.GetString()
   124  			if len(quote) > 0 {
   125  				b.WriteString(quote)
   126  				b.WriteString(strings.ReplaceAll(text, quote, doubleQuote))
   127  				b.WriteString(quote)
   128  			} else {
   129  				b.WriteString(text)
   130  			}
   131  			if lastSep || i < len(ds)-1 {
   132  				b.WriteString(delimitor)
   133  			}
   134  		}
   135  		b.WriteString("\r\n")
   136  	}
   137  	return b.String()
   138  }
   139  
   140  func (s *testMydumpCSVParserSuite) TestTPCH(c *C) {
   141  	datums := tpchDatums()
   142  	input := datumsToString(datums, "|", "", true)
   143  	reader := mydump.NewStringReader(input)
   144  
   145  	cfg := config.CSVConfig{
   146  		Separator:   "|",
   147  		Delimiter:   "",
   148  		TrimLastSep: true,
   149  	}
   150  
   151  	parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)
   152  
   153  	c.Assert(parser.ReadRow(), IsNil)
   154  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   155  		RowID:  1,
   156  		Row:    datums[0],
   157  		Length: 116,
   158  	})
   159  	c.Assert(parser, posEq, 126, 1)
   160  
   161  	c.Assert(parser.ReadRow(), IsNil)
   162  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   163  		RowID:  2,
   164  		Row:    datums[1],
   165  		Length: 104,
   166  	})
   167  	c.Assert(parser, posEq, 241, 2)
   168  
   169  	c.Assert(parser.ReadRow(), IsNil)
   170  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   171  		RowID:  3,
   172  		Row:    datums[2],
   173  		Length: 117,
   174  	})
   175  	c.Assert(parser, posEq, 369, 3)
   176  
   177  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   178  }
   179  
   180  func (s *testMydumpCSVParserSuite) TestTPCHMultiBytes(c *C) {
   181  	datums := tpchDatums()
   182  	sepsAndQuotes := [][2]string{
   183  		{",", ""},
   184  		{"\000", ""},
   185  		{",", ""},
   186  		{"🤔", ""},
   187  		{",", "。"},
   188  		{"||", ""},
   189  		{"|+|", ""},
   190  		{"##", ""},
   191  		{",", "'"},
   192  		{",", `"`},
   193  		{"🤔", `''`},
   194  		{"🤔", `"'`},
   195  		{"🤔", `"'`},
   196  		{"🤔", "🌚"}, // this two emoji have same prefix bytes
   197  		{"##", "#-"},
   198  		{"\\s", "\\q"},
   199  		{",", "1"},
   200  		{",", "ac"},
   201  	}
   202  	for _, SepAndQuote := range sepsAndQuotes {
   203  		inputStr := datumsToString(datums, SepAndQuote[0], SepAndQuote[1], false)
   204  
   205  		// extract all index in the middle of '\r\n' from the inputStr.
   206  		// they indicate where the parser stops after reading one row.
   207  		// should be equals to the number of datums.
   208  		var allExpectedParserPos []int
   209  		for {
   210  			last := 0
   211  			if len(allExpectedParserPos) > 0 {
   212  				last = allExpectedParserPos[len(allExpectedParserPos)-1]
   213  			}
   214  			pos := strings.IndexByte(inputStr[last:], '\r')
   215  			if pos < 0 {
   216  				break
   217  			}
   218  			allExpectedParserPos = append(allExpectedParserPos, last+pos+1)
   219  		}
   220  		c.Assert(allExpectedParserPos, HasLen, len(datums))
   221  
   222  		cfg := config.CSVConfig{
   223  			Separator:   SepAndQuote[0],
   224  			Delimiter:   SepAndQuote[1],
   225  			TrimLastSep: false,
   226  		}
   227  
   228  		reader := mydump.NewStringReader(inputStr)
   229  		parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)
   230  
   231  		for i, expectedParserPos := range allExpectedParserPos {
   232  			c.Assert(parser.ReadRow(), IsNil)
   233  			c.Assert(parser.LastRow().RowID, DeepEquals, int64(i+1))
   234  			c.Assert(parser.LastRow().Row, DeepEquals, datums[i])
   235  
   236  			c.Assert(parser, posEq, expectedParserPos, i+1)
   237  		}
   238  
   239  		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   240  	}
   241  }
   242  
   243  func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) {
   244  	cfg := config.CSVConfig{
   245  		Separator: ",",
   246  		Delimiter: `"`,
   247  	}
   248  
   249  	// example 1, trailing new lines
   250  
   251  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), int64(config.ReadBlockSize), s.ioWorkers, false)
   252  
   253  	c.Assert(parser.ReadRow(), IsNil)
   254  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   255  		RowID: 1,
   256  		Row: []types.Datum{
   257  			types.NewStringDatum("aaa"),
   258  			types.NewStringDatum("bbb"),
   259  			types.NewStringDatum("ccc"),
   260  		},
   261  		Length: 9,
   262  	})
   263  	c.Assert(parser, posEq, 12, 1)
   264  
   265  	c.Assert(parser.ReadRow(), IsNil)
   266  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   267  		RowID: 2,
   268  		Row: []types.Datum{
   269  			types.NewStringDatum("zzz"),
   270  			types.NewStringDatum("yyy"),
   271  			types.NewStringDatum("xxx"),
   272  		},
   273  		Length: 9,
   274  	})
   275  	c.Assert(parser, posEq, 24, 2)
   276  
   277  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   278  
   279  	// example 2, no trailing new lines
   280  
   281  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)
   282  
   283  	c.Assert(parser.ReadRow(), IsNil)
   284  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   285  		RowID: 1,
   286  		Row: []types.Datum{
   287  			types.NewStringDatum("aaa"),
   288  			types.NewStringDatum("bbb"),
   289  			types.NewStringDatum("ccc"),
   290  		},
   291  		Length: 9,
   292  	})
   293  	c.Assert(parser, posEq, 12, 1)
   294  
   295  	c.Assert(parser.ReadRow(), IsNil)
   296  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   297  		RowID: 2,
   298  		Row: []types.Datum{
   299  			types.NewStringDatum("zzz"),
   300  			types.NewStringDatum("yyy"),
   301  			types.NewStringDatum("xxx"),
   302  		},
   303  		Length: 9,
   304  	})
   305  	c.Assert(parser, posEq, 23, 2)
   306  
   307  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   308  
   309  	// example 5, quoted fields
   310  
   311  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)
   312  
   313  	c.Assert(parser.ReadRow(), IsNil)
   314  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   315  		RowID: 1,
   316  		Row: []types.Datum{
   317  			types.NewStringDatum("aaa"),
   318  			types.NewStringDatum("bbb"),
   319  			types.NewStringDatum("ccc"),
   320  		},
   321  		Length: 9,
   322  	})
   323  	c.Assert(parser, posEq, 18, 1)
   324  
   325  	c.Assert(parser.ReadRow(), IsNil)
   326  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   327  		RowID: 2,
   328  		Row: []types.Datum{
   329  			types.NewStringDatum("zzz"),
   330  			types.NewStringDatum("yyy"),
   331  			types.NewStringDatum("xxx"),
   332  		},
   333  		Length: 9,
   334  	})
   335  	c.Assert(parser, posEq, 29, 2)
   336  
   337  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   338  
   339  	// example 6, line breaks within fields
   340  
   341  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b
   342  bb","ccc"
   343  zzz,yyy,xxx`), int64(config.ReadBlockSize), s.ioWorkers, false)
   344  
   345  	c.Assert(parser.ReadRow(), IsNil)
   346  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   347  		RowID: 1,
   348  		Row: []types.Datum{
   349  			types.NewStringDatum("aaa"),
   350  			types.NewStringDatum("b\nbb"),
   351  			types.NewStringDatum("ccc"),
   352  		},
   353  		Length: 10,
   354  	})
   355  	c.Assert(parser, posEq, 19, 1)
   356  
   357  	c.Assert(parser.ReadRow(), IsNil)
   358  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   359  		RowID: 2,
   360  		Row: []types.Datum{
   361  			types.NewStringDatum("zzz"),
   362  			types.NewStringDatum("yyy"),
   363  			types.NewStringDatum("xxx"),
   364  		},
   365  		Length: 9,
   366  	})
   367  	c.Assert(parser, posEq, 30, 2)
   368  
   369  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   370  
   371  	// example 7, quote escaping
   372  
   373  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b""bb","ccc"`), int64(config.ReadBlockSize), s.ioWorkers, false)
   374  
   375  	c.Assert(parser.ReadRow(), IsNil)
   376  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   377  		RowID: 1,
   378  		Row: []types.Datum{
   379  			types.NewStringDatum("aaa"),
   380  			types.NewStringDatum("b\"bb"),
   381  			types.NewStringDatum("ccc"),
   382  		},
   383  		Length: 10,
   384  	})
   385  	c.Assert(parser, posEq, 19, 1)
   386  
   387  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   388  }
   389  
   390  func (s *testMydumpCSVParserSuite) TestMySQL(c *C) {
   391  	cfg := config.CSVConfig{
   392  		Separator:       ",",
   393  		Delimiter:       `"`,
   394  		BackslashEscape: true,
   395  		NotNull:         false,
   396  		Null:            `\N`,
   397  	}
   398  
   399  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"\"","\\","\?"
   400  "\
   401  ",\N,\\N`), int64(config.ReadBlockSize), s.ioWorkers, false)
   402  
   403  	c.Assert(parser.ReadRow(), IsNil)
   404  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   405  		RowID: 1,
   406  		Row: []types.Datum{
   407  			types.NewStringDatum(`"`),
   408  			types.NewStringDatum(`\`),
   409  			types.NewStringDatum("?"),
   410  		},
   411  		Length: 6,
   412  	})
   413  	c.Assert(parser, posEq, 15, 1)
   414  
   415  	c.Assert(parser.ReadRow(), IsNil)
   416  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   417  		RowID: 2,
   418  		Row: []types.Datum{
   419  			types.NewStringDatum("\n"),
   420  			nullDatum,
   421  			types.NewStringDatum(`\N`),
   422  		},
   423  		Length: 7,
   424  	})
   425  	c.Assert(parser, posEq, 26, 2)
   426  
   427  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   428  }
   429  
   430  func (s *testMydumpCSVParserSuite) TestSyntaxError(c *C) {
   431  	cfg := config.CSVConfig{
   432  		Separator:       ",",
   433  		Delimiter:       `"`,
   434  		BackslashEscape: true,
   435  	}
   436  
   437  	inputs := []string{
   438  		`"???`,
   439  		`\`,
   440  		`"\`,
   441  		`0"`,
   442  		`0\`,
   443  		"\"\v",
   444  		`"""`,
   445  		"\"\r",
   446  		"\"\x01",
   447  	}
   448  
   449  	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), inputs)
   450  
   451  	cfg.BackslashEscape = false
   452  	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), []string{`"\`})
   453  }
   454  
   455  func (s *testMydumpCSVParserSuite) TestTSV(c *C) {
   456  	cfg := config.CSVConfig{
   457  		Separator:       "\t",
   458  		Delimiter:       "",
   459  		BackslashEscape: false,
   460  		NotNull:         false,
   461  		Null:            "",
   462  		Header:          true,
   463  	}
   464  
   465  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`a	b	c	d	e	f
   466  0				foo	0000-00-00
   467  0				foo	0000-00-00
   468  0	abc	def	ghi	bar	1999-12-31`), int64(config.ReadBlockSize), s.ioWorkers, true)
   469  
   470  	c.Assert(parser.ReadRow(), IsNil)
   471  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   472  		RowID: 1,
   473  		Row: []types.Datum{
   474  			types.NewStringDatum("0"),
   475  			nullDatum,
   476  			nullDatum,
   477  			nullDatum,
   478  			types.NewStringDatum("foo"),
   479  			types.NewStringDatum("0000-00-00"),
   480  		},
   481  		Length: 14,
   482  	})
   483  	c.Assert(parser, posEq, 32, 1)
   484  	c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c", "d", "e", "f"})
   485  
   486  	c.Assert(parser.ReadRow(), IsNil)
   487  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   488  		RowID: 2,
   489  		Row: []types.Datum{
   490  			types.NewStringDatum("0"),
   491  			nullDatum,
   492  			nullDatum,
   493  			nullDatum,
   494  			types.NewStringDatum("foo"),
   495  			types.NewStringDatum("0000-00-00"),
   496  		},
   497  		Length: 14,
   498  	})
   499  	c.Assert(parser, posEq, 52, 2)
   500  
   501  	c.Assert(parser.ReadRow(), IsNil)
   502  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   503  		RowID: 3,
   504  		Row: []types.Datum{
   505  			types.NewStringDatum("0"),
   506  			types.NewStringDatum("abc"),
   507  			types.NewStringDatum("def"),
   508  			types.NewStringDatum("ghi"),
   509  			types.NewStringDatum("bar"),
   510  			types.NewStringDatum("1999-12-31"),
   511  		},
   512  		Length: 23,
   513  	})
   514  	c.Assert(parser, posEq, 80, 3)
   515  
   516  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   517  }
   518  
   519  func (s *testMydumpCSVParserSuite) TestCsvWithWhiteSpaceLine(c *C) {
   520  	cfg := config.CSVConfig{
   521  		Separator: ",",
   522  		Delimiter: `"`,
   523  	}
   524  	data := " \r\n\r\n0,,abc\r\n \r\n123,1999-12-31,test\r\n"
   525  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, false)
   526  	c.Assert(parser.ReadRow(), IsNil)
   527  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   528  		RowID: 1,
   529  		Row: []types.Datum{
   530  			types.NewStringDatum("0"),
   531  			nullDatum,
   532  			types.NewStringDatum("abc"),
   533  		},
   534  		Length: 4,
   535  	})
   536  
   537  	c.Assert(parser, posEq, 12, 1)
   538  	c.Assert(parser.ReadRow(), IsNil)
   539  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   540  		RowID: 2,
   541  		Row: []types.Datum{
   542  			types.NewStringDatum("123"),
   543  			types.NewStringDatum("1999-12-31"),
   544  			types.NewStringDatum("test"),
   545  		},
   546  		Length: 17,
   547  	})
   548  	c.Assert(parser.Close(), IsNil)
   549  
   550  	cfg.Header = true
   551  	data = " \r\na,b,c\r\n0,,abc\r\n"
   552  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, true)
   553  	c.Assert(parser.ReadRow(), IsNil)
   554  	c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c"})
   555  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   556  		RowID: 1,
   557  		Row: []types.Datum{
   558  			types.NewStringDatum("0"),
   559  			nullDatum,
   560  			types.NewStringDatum("abc"),
   561  		},
   562  		Length: 4,
   563  	})
   564  
   565  	c.Assert(parser, posEq, 17, 1)
   566  	c.Assert(parser.Close(), IsNil)
   567  }
   568  
   569  func (s *testMydumpCSVParserSuite) TestEmpty(c *C) {
   570  	cfg := config.CSVConfig{
   571  		Separator: ",",
   572  		Delimiter: `"`,
   573  	}
   574  
   575  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, false)
   576  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   577  
   578  	// Try again with headers.
   579  
   580  	cfg.Header = true
   581  
   582  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, true)
   583  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   584  
   585  	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("h\n"), int64(config.ReadBlockSize), s.ioWorkers, true)
   586  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   587  }
   588  
   589  func (s *testMydumpCSVParserSuite) TestCRLF(c *C) {
   590  	cfg := config.CSVConfig{
   591  		Separator: ",",
   592  		Delimiter: `"`,
   593  	}
   594  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("a\rb\r\nc\n\n\n\nd"), int64(config.ReadBlockSize), s.ioWorkers, false)
   595  
   596  	c.Assert(parser.ReadRow(), IsNil)
   597  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   598  		RowID:  1,
   599  		Row:    []types.Datum{types.NewStringDatum("a")},
   600  		Length: 1,
   601  	})
   602  
   603  	c.Assert(parser.ReadRow(), IsNil)
   604  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   605  		RowID:  2,
   606  		Row:    []types.Datum{types.NewStringDatum("b")},
   607  		Length: 1,
   608  	})
   609  
   610  	c.Assert(parser.ReadRow(), IsNil)
   611  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   612  		RowID:  3,
   613  		Row:    []types.Datum{types.NewStringDatum("c")},
   614  		Length: 1,
   615  	})
   616  
   617  	c.Assert(parser.ReadRow(), IsNil)
   618  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   619  		RowID:  4,
   620  		Row:    []types.Datum{types.NewStringDatum("d")},
   621  		Length: 1,
   622  	})
   623  
   624  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   625  }
   626  
   627  func (s *testMydumpCSVParserSuite) TestQuotedSeparator(c *C) {
   628  	cfg := config.CSVConfig{
   629  		Separator: ",",
   630  		Delimiter: `"`,
   631  	}
   632  
   633  	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`",",','`), int64(config.ReadBlockSize), s.ioWorkers, false)
   634  	c.Assert(parser.ReadRow(), IsNil)
   635  	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
   636  		RowID: 1,
   637  		Row: []types.Datum{
   638  			types.NewStringDatum(","),
   639  			types.NewStringDatum("'"),
   640  			types.NewStringDatum("'"),
   641  		},
   642  		Length: 3,
   643  	})
   644  
   645  	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
   646  }
   647  
   648  func (s *testMydumpCSVParserSuite) TestConsecutiveFields(c *C) {
   649  	// Note: the behavior of reading `"xxx"yyy` here is undefined in RFC 4180.
   650  	// Python's CSV module returns `xxxyyy`.
   651  	// Rust's CSV package returns `xxxyyy`.
   652  	// Go's CSV package returns a parse error.
   653  	// NPM's CSV package returns a parse error.
   654  	// MySQL's LOAD DATA statement returns `"xxx"yyy` as-is.
   655  
   656  	cfg := config.CSVConfig{
   657  		Separator: ",",
   658  		Delimiter: `"`,
   659  	}
   660  
   661  	testCases := []string{
   662  		`"x"?`,
   663  		"\"\"\x01",
   664  		"\"\"\v",
   665  		`abc""`,
   666  	}
   667  
   668  	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
   669  
   670  	cfg.Delimiter = "|+|"
   671  	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), []string{
   672  		"abc|1|+||+|\r\n",
   673  	})
   674  }
   675  
   676  func (s *testMydumpCSVParserSuite) TestSpecialChars(c *C) {
   677  	cfg := config.CSVConfig{Separator: ",", Delimiter: `"`}
   678  	testCases := []testCase{
   679  		{
   680  			input:    "\x00",
   681  			expected: [][]types.Datum{{types.NewStringDatum("\x00")}},
   682  		},
   683  		{
   684  			input:    `0\`,
   685  			expected: [][]types.Datum{{types.NewStringDatum(`0\`)}},
   686  		},
   687  		{
   688  			input:    `\`,
   689  			expected: [][]types.Datum{{types.NewStringDatum(`\`)}},
   690  		},
   691  		{
   692  			input:    "0\v",
   693  			expected: [][]types.Datum{{types.NewStringDatum("0\v")}},
   694  		},
   695  		{
   696  			input:    "0\x00",
   697  			expected: [][]types.Datum{{types.NewStringDatum("0\x00")}},
   698  		},
   699  		{
   700  			input:    "\n\r",
   701  			expected: [][]types.Datum{},
   702  		},
   703  		{
   704  			input:    `"""",0`,
   705  			expected: [][]types.Datum{{types.NewStringDatum(`"`), types.NewStringDatum(`0`)}},
   706  		},
   707  	}
   708  
   709  	s.runTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
   710  }
   711  
   712  func (s *testMydumpCSVParserSuite) TestContinuation(c *C) {
   713  	cfg := config.CSVConfig{
   714  		Separator:       ",",
   715  		Delimiter:       `"`,
   716  		BackslashEscape: true,
   717  		TrimLastSep:     true,
   718  	}
   719  
   720  	testCases := []testCase{
   721  		{
   722  			input: `"abcdef",\njklm,nop` + "\r\n" + `"""""","\n",a,`,
   723  			expected: [][]types.Datum{
   724  				{
   725  					types.NewStringDatum("abcdef"),
   726  					types.NewStringDatum("\njklm"),
   727  					types.NewStringDatum("nop"),
   728  				},
   729  				{
   730  					types.NewStringDatum(`""`),
   731  					types.NewStringDatum("\n"),
   732  					types.NewStringDatum("a"),
   733  				},
   734  			},
   735  		},
   736  		{
   737  			input:    `"VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca"`,
   738  			expected: [][]types.Datum{{types.NewStringDatum("VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca")}},
   739  		},
   740  	}
   741  
   742  	s.runTestCases(c, &cfg, 1, testCases)
   743  }
   744  
   745  func (s *testMydumpCSVParserSuite) TestBackslashAsSep(c *C) {
   746  	cfg := config.CSVConfig{
   747  		Separator: `\`,
   748  		Delimiter: `"`,
   749  	}
   750  
   751  	testCases := []testCase{
   752  		{
   753  			input:    `0\`,
   754  			expected: [][]types.Datum{{types.NewStringDatum("0"), nullDatum}},
   755  		},
   756  		{
   757  			input:    `\`,
   758  			expected: [][]types.Datum{{nullDatum, nullDatum}},
   759  		},
   760  	}
   761  
   762  	s.runTestCases(c, &cfg, 1, testCases)
   763  
   764  	failingInputs := []string{
   765  		`"\`,
   766  	}
   767  	s.runFailingTestCases(c, &cfg, 1, failingInputs)
   768  }
   769  
   770  func (s *testMydumpCSVParserSuite) TestBackslashAsDelim(c *C) {
   771  	cfg := config.CSVConfig{
   772  		Separator: ",",
   773  		Delimiter: `\`,
   774  	}
   775  
   776  	testCases := []testCase{
   777  		{
   778  			input:    `\\`,
   779  			expected: [][]types.Datum{{nullDatum}},
   780  		},
   781  	}
   782  	s.runTestCases(c, &cfg, 1, testCases)
   783  
   784  	failingInputs := []string{
   785  		`"\`,
   786  	}
   787  	s.runFailingTestCases(c, &cfg, 1, failingInputs)
   788  }
   789  
   790  // errorReader implements the Reader interface which always returns an error.
   791  type errorReader struct{}
   792  
   793  func (*errorReader) Read(p []byte) (int, error) {
   794  	return 0, errors.New("fake read error")
   795  }
   796  
   797  func (*errorReader) Seek(offset int64, whence int) (int64, error) {
   798  	return 0, errors.New("fake seek error")
   799  }
   800  
   801  func (*errorReader) Close() error {
   802  	return errors.New("fake close error")
   803  }
   804  
   805  func (s *testMydumpCSVParserSuite) TestReadError(c *C) {
   806  	cfg := config.CSVConfig{
   807  		Separator: ",",
   808  		Delimiter: `"`,
   809  	}
   810  
   811  	parser := mydump.NewCSVParser(&cfg, &errorReader{}, int64(config.ReadBlockSize), s.ioWorkers, false)
   812  	c.Assert(parser.ReadRow(), ErrorMatches, "fake read error")
   813  }
   814  
   815  // TestSyntaxErrorLog checks that a syntax error won't dump huge strings into the log.
   816  func (s *testMydumpCSVParserSuite) TestSyntaxErrorLog(c *C) {
   817  	cfg := config.CSVConfig{
   818  		Separator: "\t",
   819  		Delimiter: "'",
   820  	}
   821  
   822  	tc := mydump.NewStringReader("x'" + strings.Repeat("y", 50000))
   823  	parser := mydump.NewCSVParser(&cfg, tc, 50000, s.ioWorkers, false)
   824  	logger, buffer := log.MakeTestLogger()
   825  	parser.SetLogger(logger)
   826  	c.Assert(parser.ReadRow(), ErrorMatches, "syntax error.*")
   827  	c.Assert(logger.Sync(), IsNil)
   828  
   829  	c.Assert(
   830  		buffer.Stripped(), Equals,
   831  		`{"$lvl":"ERROR","$msg":"syntax error","pos":2,"content":"`+strings.Repeat("y", 256)+`"}`,
   832  	)
   833  }
   834  
   835  // TestTrimLastSep checks that set `TrimLastSep` to true trim only the last empty filed.
   836  func (s *testMydumpCSVParserSuite) TestTrimLastSep(c *C) {
   837  	cfg := config.CSVConfig{
   838  		Separator:   ",",
   839  		Delimiter:   `"`,
   840  		TrimLastSep: true,
   841  	}
   842  	parser := mydump.NewCSVParser(
   843  		&cfg,
   844  		mydump.NewStringReader("123,456,789,\r\na,b,,\r\n,,,\r\n\"a\",\"\",\"\",\r\n"),
   845  		int64(config.ReadBlockSize),
   846  		s.ioWorkers,
   847  		false,
   848  	)
   849  	for i := 0; i < 4; i++ {
   850  		c.Assert(parser.ReadRow(), IsNil)
   851  		c.Assert(len(parser.LastRow().Row), Equals, 3)
   852  	}
   853  }
   854  
   855  // TestTerminator checks for customized terminators.
   856  func (s *testMydumpCSVParserSuite) TestTerminator(c *C) {
   857  	cfg := config.CSVConfig{
   858  		Separator:  "|+|",
   859  		Terminator: "|+|\n",
   860  	}
   861  
   862  	testCases := []testCase{
   863  		{
   864  			input: "5|+|abc\ndef\nghi|+|6|+|\n7|+|xy|+z|+|8|+|\n",
   865  			expected: [][]types.Datum{
   866  				{types.NewStringDatum("5"), types.NewStringDatum("abc\ndef\nghi"), types.NewStringDatum("6")},
   867  				{types.NewStringDatum("7"), types.NewStringDatum("xy|+z"), types.NewStringDatum("8")},
   868  			},
   869  		},
   870  	}
   871  
   872  	s.runTestCases(c, &cfg, 1, testCases)
   873  
   874  	cfg.Delimiter = "|+>"
   875  
   876  	testCases = []testCase{
   877  		{
   878  			input: "xyz|+|+>|+|\n|+>|+|\n|+>|+|\r|+|\n",
   879  			expected: [][]types.Datum{
   880  				{types.NewStringDatum("xyz"), types.NewStringDatum("+>")},
   881  				{types.NewStringDatum("|+|\n"), types.NewStringDatum("\r")},
   882  			},
   883  		},
   884  	}
   885  	s.runTestCases(c, &cfg, 1, testCases)
   886  }
   887  
// Run `go test github.com/pingcap/br/pkg/lightning/mydump -check.b -check.bmem -test.v` to get benchmark result.
// Please ensure your temporary storage has (c.N / 2) KiB of free space.

// benchCSVParserSuite holds the state shared by the CSV parsing benchmarks.
type benchCSVParserSuite struct {
	// csvPath is the path of the generated input file (set by setupTest).
	csvPath string
	// ioWorkers is the worker pool passed to the mydump CSV parser.
	ioWorkers *worker.Pool
}

// Register the benchmark suite with the check framework.
var _ = Suite(&benchCSVParserSuite{})
   897  
   898  func (s *benchCSVParserSuite) setupTest(c *C) {
   899  	s.ioWorkers = worker.NewPool(context.Background(), 5, "bench_csv")
   900  
   901  	dir := c.MkDir()
   902  	s.csvPath = filepath.Join(dir, "input.csv")
   903  	file, err := os.Create(s.csvPath)
   904  	c.Assert(err, IsNil)
   905  	defer func() {
   906  		c.Assert(file.Close(), IsNil)
   907  	}()
   908  	for i := 0; i < c.N; i++ {
   909  		_, err = file.WriteString("18,1,1,0.3650,GC,BARBARBAR,rw9AOV1AjoI1,50000.00,-10.00,10.00,1,1,djj3Q2XaIPoYVy1FuF,gc80Q2o82Au3C9xv,PYOolSxG3w,DI,265111111,7586538936787184,2020-02-26 20:06:00.193,OE,YCkSPBVqoJ2V5F8zWs87V5XzbaIY70aWCD4dgcB6bjUzCr5wOJCJ2TYH49J7yWyysbudJIxlTAEWSJahY7hswLtTsqyjEkrlsN8iDMAa9Poj29miJ08tnn2G8mL64IlyywvnRGbLbyGvWDdrOSF42RyUFTWVyqlDWc6Gr5wyMPYgvweKemzFDVD3kro5JsmBmJY08EK54nQoyfo2sScyb34zcM9GFo9ZQTwloINfPYQKXQm32m0XvU7jiNmYpFTFJQjdqA825SEvQqMMefG2WG4jVu9UPdhdUjRsFRd0Gw7YPKByOlcuY0eKxT7sAzMKXx2000RR6dqHNXe47oVYd\n")
   910  		c.Assert(err, IsNil)
   911  	}
   912  	c.ResetTimer()
   913  }
   914  
   915  func (s *benchCSVParserSuite) BenchmarkReadRowUsingMydumpCSVParser(c *C) {
   916  	s.setupTest(c)
   917  
   918  	file, err := os.Open(s.csvPath)
   919  	c.Assert(err, IsNil)
   920  	defer func() {
   921  		c.Assert(file.Close(), IsNil)
   922  	}()
   923  
   924  	cfg := config.CSVConfig{Separator: ","}
   925  	parser := mydump.NewCSVParser(&cfg, file, 65536, s.ioWorkers, false)
   926  	parser.SetLogger(log.Logger{Logger: zap.NewNop()})
   927  
   928  	rowsCount := 0
   929  	for {
   930  		err := parser.ReadRow()
   931  		if err == nil {
   932  			parser.RecycleRow(parser.LastRow())
   933  			rowsCount++
   934  			continue
   935  		}
   936  		if errors.Cause(err) == io.EOF {
   937  			break
   938  		}
   939  		c.Fatal(err)
   940  	}
   941  	c.Assert(rowsCount, Equals, c.N)
   942  }
   943  
   944  func (s *benchCSVParserSuite) BenchmarkReadRowUsingEncodingCSV(c *C) {
   945  	s.setupTest(c)
   946  
   947  	file, err := os.Open(s.csvPath)
   948  	c.Assert(err, IsNil)
   949  	defer func() {
   950  		c.Assert(file.Close(), IsNil)
   951  	}()
   952  
   953  	csvParser := csv.NewReader(file)
   954  
   955  	rowsCount := 0
   956  	var datums []types.Datum
   957  	for {
   958  		records, err := csvParser.Read()
   959  		if err == nil {
   960  			// for fair comparison, we need to include the cost of conversion to Datum.
   961  			for _, record := range records {
   962  				datums = append(datums, types.NewStringDatum(record))
   963  			}
   964  			datums = datums[:0]
   965  			rowsCount++
   966  			continue
   967  		}
   968  		if errors.Cause(err) == io.EOF {
   969  			break
   970  		}
   971  		c.Fatal(err)
   972  	}
   973  	c.Assert(rowsCount, Equals, c.N)
   974  }