github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/csv_parser_test.go

package mydump_test

import (
	"context"
	"encoding/csv"
	"io"
	"os"
	"path/filepath"
	"strings"

	. "github.com/pingcap/check"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/types"
	"go.uber.org/zap"

	"github.com/pingcap/tidb-lightning/lightning/config"
	"github.com/pingcap/tidb-lightning/lightning/log"
	"github.com/pingcap/tidb-lightning/lightning/mydump"
	"github.com/pingcap/tidb-lightning/lightning/worker"
)

var _ = Suite(&testMydumpCSVParserSuite{})

type testMydumpCSVParserSuite struct {
	ioWorkers *worker.Pool
}

func (s *testMydumpCSVParserSuite) SetUpSuite(c *C) {
	s.ioWorkers = worker.NewPool(context.Background(), 5, "test_csv")
}
func (s *testMydumpCSVParserSuite) TearDownSuite(c *C) {}

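// assertPosEq is a gocheck Checker verifying that a parser's Pos() returns the
// expected byte position and row ID, used as c.Assert(parser, posEq, pos, rowID).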
type assertPosEq struct {
	*CheckerInfo
}

var posEq = &assertPosEq{
	&CheckerInfo{Name: "posEq", Params: []string{"parser", "pos", "rowID"}},
}

func (checker *assertPosEq) Check(params []interface{}, names []string) (result bool, error string) {
	parser := params[0].(mydump.Parser)
	pos, rowID := parser.Pos()
	expectedPos := int64(params[1].(int))
	expectedRowID := int64(params[2].(int))
	return pos == expectedPos && rowID == expectedRowID, ""
}

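// nullDatum is the zero-value Datum; the parser is expected to produce it for NULL fields.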
var nullDatum types.Datum

type testCase struct {
	input    string
	expected [][]types.Datum
}

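// runTestCases parses each case's input with a fresh CSVParser and asserts that
// the rows match the expected datums, with ReadRow finally returning io.EOF.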
func (s *testMydumpCSVParserSuite) runTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []testCase) {
	for _, tc := range cases {
		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc.input), blockBufSize, s.ioWorkers, false)
		for i, row := range tc.expected {
			comment := Commentf("input = %q, row = %d", tc.input, i+1)
			e := parser.ReadRow()
			c.Assert(e, IsNil, Commentf("input = %q, row = %d, error = %s", tc.input, i+1, errors.ErrorStack(e)))
			c.Assert(parser.LastRow(), DeepEquals, mydump.Row{RowID: int64(i) + 1, Row: row}, comment)
		}
		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF, Commentf("input = %q", tc.input))
	}
}

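// runFailingTestCases asserts that reading the first row of each input fails with a syntax error.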
func (s *testMydumpCSVParserSuite) runFailingTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []string) {
	for _, tc := range cases {
		parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc), blockBufSize, s.ioWorkers, false)
		e := parser.ReadRow()
		c.Assert(e, ErrorMatches, "syntax error.*", Commentf("input = %q / %s", tc, errors.ErrorStack(e)))
	}
}

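// tpchDatums returns three sample rows in the style of the TPC-H `part` table,
// shared by the TPCH tests below.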
func tpchDatums() [][]types.Datum {
	datums := make([][]types.Datum, 0, 3)
	datums = append(datums, []types.Datum{
		types.NewStringDatum("1"),
		types.NewStringDatum("goldenrod lavender spring chocolate lace"),
		types.NewStringDatum("Manufacturer#1"),
		types.NewStringDatum("Brand#13"),
		types.NewStringDatum("PROMO BURNISHED COPPER"),
		types.NewStringDatum("7"),
		types.NewStringDatum("JUMBO PKG"),
		types.NewStringDatum("901.00"),
		types.NewStringDatum("ly. slyly ironi"),
	})
	datums = append(datums, []types.Datum{
		types.NewStringDatum("2"),
		types.NewStringDatum("blush thistle blue yellow saddle"),
		types.NewStringDatum("Manufacturer#1"),
		types.NewStringDatum("Brand#13"),
		types.NewStringDatum("LARGE BRUSHED BRASS"),
		types.NewStringDatum("1"),
		types.NewStringDatum("LG CASE"),
		types.NewStringDatum("902.00"),
		types.NewStringDatum("lar accounts amo"),
	})
	datums = append(datums, []types.Datum{
		types.NewStringDatum("3"),
		types.NewStringDatum("spring green yellow purple cornsilk"),
		types.NewStringDatum("Manufacturer#4"),
		types.NewStringDatum("Brand#42"),
		types.NewStringDatum("STANDARD POLISHED BRASS"),
		types.NewStringDatum("21"),
		types.NewStringDatum("WRAP CASE"),
		types.NewStringDatum("903.00"),
		types.NewStringDatum("egular deposits hag"),
	})

	return datums
}

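// datumsToString renders the datums as CSV text: each field is optionally wrapped
// in `quote` (with embedded quotes doubled), fields are joined by `delimitor`,
// a trailing separator is appended when lastSep is true, and every row ends with "\r\n".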
func datumsToString(datums [][]types.Datum, delimitor string, quote string, lastSep bool) string {
	var b strings.Builder
	doubleQuote := quote + quote
	for _, ds := range datums {
		for i, d := range ds {
			text := d.GetString()
			if len(quote) > 0 {
				b.WriteString(quote)
				b.WriteString(strings.ReplaceAll(text, quote, doubleQuote))
				b.WriteString(quote)
			} else {
				b.WriteString(text)
			}
			if lastSep || i < len(ds)-1 {
				b.WriteString(delimitor)
			}
		}
		b.WriteString("\r\n")
	}
	return b.String()
}

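// In CSVConfig, Separator is the field separator, an empty Delimiter disables quoting,
// and TrimLastSep drops the trailing separator appended by datumsToString.
// The pos values asserted below are byte offsets just past each row terminator
// (just past the '\r' when rows end in "\r\n").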
func (s *testMydumpCSVParserSuite) TestTPCH(c *C) {
	datums := tpchDatums()
	input := datumsToString(datums, "|", "", true)
	reader := mydump.NewStringReader(input)

	cfg := config.CSVConfig{
		Separator:   "|",
		Delimiter:   "",
		TrimLastSep: true,
	}

	parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row:   datums[0],
	})
	c.Assert(parser, posEq, 126, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row:   datums[1],
	})
	c.Assert(parser, posEq, 241, 2)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 3,
		Row:   datums[2],
	})
	c.Assert(parser, posEq, 369, 3)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestTPCHMultiBytes(c *C) {
	datums := tpchDatums()
	sepsAndQuotes := [][2]string{
		{",", ""},
		{"\000", ""},
		{",", ""},
		{"🤔", ""},
		{",", "。"},
		{"||", ""},
		{"|+|", ""},
		{"##", ""},
		{",", "'"},
		{",", `"`},
		{"🤔", `''`},
		{"🤔", `"'`},
		{"🤔", `"'`},
		{"🤔", "🌚"}, // these two emoji share the same prefix bytes
		{"##", "#-"},
		{"\\s", "\\q"},
		{",", "1"},
		{",", "ac"},
	}
	for _, SepAndQuote := range sepsAndQuotes {
		inputStr := datumsToString(datums, SepAndQuote[0], SepAndQuote[1], false)

		// Extract every index in the middle of each '\r\n' in inputStr.
		// These are the positions where the parser stops after reading a row,
		// so their count should equal the number of rows in datums.
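		// For example, with inputStr = "1,2\r\n3,4\r\n" the loop below yields {4, 9},
		// the offsets just past each '\r'.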
		var allExpectedParserPos []int
		for {
			last := 0
			if len(allExpectedParserPos) > 0 {
				last = allExpectedParserPos[len(allExpectedParserPos)-1]
			}
			pos := strings.IndexByte(inputStr[last:], '\r')
			if pos < 0 {
				break
			}
			allExpectedParserPos = append(allExpectedParserPos, last+pos+1)
		}
		c.Assert(allExpectedParserPos, HasLen, len(datums))

		cfg := config.CSVConfig{
			Separator:   SepAndQuote[0],
			Delimiter:   SepAndQuote[1],
			TrimLastSep: false,
		}

		reader := mydump.NewStringReader(inputStr)
		parser := mydump.NewCSVParser(&cfg, reader, int64(config.ReadBlockSize), s.ioWorkers, false)

		for i, expectedParserPos := range allExpectedParserPos {
			c.Assert(parser.ReadRow(), IsNil)
			c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
				RowID: int64(i + 1),
				Row:   datums[i],
			})
			c.Assert(parser, posEq, expectedParserPos, i+1)
		}

		c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
	}
}

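// The pos assertions below are byte offsets into the input: the first row
// "aaa,bbb,ccc\n" is 12 bytes long, so the parser reports pos 12 after reading it.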
func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	// example 1, trailing new lines

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 12, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 24, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 2, no trailing new lines

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 12, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 23, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 5, quoted fields

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("bbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 18, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 29, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 6, line breaks within fields

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b
bb","ccc"
zzz,yyy,xxx`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("b\nbb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 19, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("zzz"),
			types.NewStringDatum("yyy"),
			types.NewStringDatum("xxx"),
		},
	})
	c.Assert(parser, posEq, 30, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// example 7, quote escaping

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b""bb","ccc"`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("aaa"),
			types.NewStringDatum("b\"bb"),
			types.NewStringDatum("ccc"),
		},
	})
	c.Assert(parser, posEq, 19, 1)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

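// With BackslashEscape enabled, `\"` inside a quoted field unescapes to a double
// quote, `\\` to a single backslash, and an unrecognized escape such as `\?` simply
// drops the backslash. An unquoted `\N` matches cfg.Null and parses as NULL, while
// `\\N` escapes the backslash and yields the literal string `\N`.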
func (s *testMydumpCSVParserSuite) TestMySQL(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
		NotNull:         false,
		Null:            `\N`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"\"","\\","\?"
"\
",\N,\\N`), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum(`"`),
			types.NewStringDatum(`\`),
			types.NewStringDatum("?"),
		},
	})
	c.Assert(parser, posEq, 15, 1)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("\n"),
			nullDatum,
			types.NewStringDatum(`\N`),
		},
	})
	c.Assert(parser, posEq, 26, 2)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestSyntaxError(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
	}

	inputs := []string{
		`"???`,
		`\`,
		`"\`,
		`0"`,
		`0\`,
		"\"\v",
		`"""`,
		"\"\r",
		"\"\x01",
	}

	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), inputs)

	cfg.BackslashEscape = false
	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), []string{`"\`})
}

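// With Header enabled the first line supplies the column names returned by
// Columns(), and because Null is the empty string, empty fields parse as NULL.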
func (s *testMydumpCSVParserSuite) TestTSV(c *C) {
	cfg := config.CSVConfig{
		Separator:       "\t",
		Delimiter:       "",
		BackslashEscape: false,
		NotNull:         false,
		Null:            "",
		Header:          true,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`a	b	c	d	e	f
0				foo	0000-00-00
0				foo	0000-00-00
0	abc	def	ghi	bar	1999-12-31`), int64(config.ReadBlockSize), s.ioWorkers, true)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			nullDatum,
			nullDatum,
			types.NewStringDatum("foo"),
			types.NewStringDatum("0000-00-00"),
		},
	})
	c.Assert(parser, posEq, 32, 1)
	c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c", "d", "e", "f"})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			nullDatum,
			nullDatum,
			types.NewStringDatum("foo"),
			types.NewStringDatum("0000-00-00"),
		},
	})
	c.Assert(parser, posEq, 52, 2)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 3,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			types.NewStringDatum("abc"),
			types.NewStringDatum("def"),
			types.NewStringDatum("ghi"),
			types.NewStringDatum("bar"),
			types.NewStringDatum("1999-12-31"),
		},
	})
	c.Assert(parser, posEq, 80, 3)

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

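// Blank lines and whitespace-only lines in the input are skipped rather than
// returned as rows.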
func (s *testMydumpCSVParserSuite) TestCsvWithWhiteSpaceLine(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}
	data := " \r\n\r\n0,,abc\r\n \r\n123,1999-12-31,test\r\n"
	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			types.NewStringDatum("abc"),
		},
	})

	c.Assert(parser, posEq, 12, 1)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row: []types.Datum{
			types.NewStringDatum("123"),
			types.NewStringDatum("1999-12-31"),
			types.NewStringDatum("test"),
		},
	})
	c.Assert(parser.Close(), IsNil)

	cfg.Header = true
	data = " \r\na,b,c\r\n0,,abc\r\n"
	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.Columns(), DeepEquals, []string{"a", "b", "c"})
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum("0"),
			nullDatum,
			types.NewStringDatum("abc"),
		},
	})

	c.Assert(parser, posEq, 17, 1)
	c.Assert(parser.Close(), IsNil)
}

func (s *testMydumpCSVParserSuite) TestEmpty(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	// Try again with headers.

	cfg.Header = true

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)

	parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("h\n"), int64(config.ReadBlockSize), s.ioWorkers, true)
	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

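// "\r", "\n", and "\r\n" are all accepted as row terminators, and consecutive
// blank lines do not produce extra rows.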
func (s *testMydumpCSVParserSuite) TestCRLF(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}
	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("a\rb\r\nc\n\n\n\nd"), int64(config.ReadBlockSize), s.ioWorkers, false)

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row:   []types.Datum{types.NewStringDatum("a")},
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 2,
		Row:   []types.Datum{types.NewStringDatum("b")},
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 3,
		Row:   []types.Datum{types.NewStringDatum("c")},
	})

	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 4,
		Row:   []types.Datum{types.NewStringDatum("d")},
	})

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestQuotedSeparator(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`",",','`), int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), IsNil)
	c.Assert(parser.LastRow(), DeepEquals, mydump.Row{
		RowID: 1,
		Row: []types.Datum{
			types.NewStringDatum(","),
			types.NewStringDatum("'"),
			types.NewStringDatum("'"),
		},
	})

	c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF)
}

func (s *testMydumpCSVParserSuite) TestConsecutiveFields(c *C) {
	// Note: the behavior of reading `"xxx"yyy` here is undefined in RFC 4180.
	// Python's CSV module returns `xxxyyy`.
	// Rust's CSV package returns `xxxyyy`.
	// Go's CSV package returns a parse error.
	// NPM's CSV package returns a parse error.
	// MySQL's LOAD DATA statement returns `"xxx"yyy` as-is.
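	// TiDB Lightning sides with Go and NPM here and reports a syntax error,
	// as the failing cases below verify.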

	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	testCases := []string{
		`"x"?`,
		"\"\"\x01",
		"\"\"\v",
	}

	s.runFailingTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
}

func (s *testMydumpCSVParserSuite) TestSpecialChars(c *C) {
	cfg := config.CSVConfig{Separator: ",", Delimiter: `"`}
	testCases := []testCase{
		{
			input:    "\x00",
			expected: [][]types.Datum{{types.NewStringDatum("\x00")}},
		},
		{
			input:    `0\`,
			expected: [][]types.Datum{{types.NewStringDatum(`0\`)}},
		},
		{
			input:    `\`,
			expected: [][]types.Datum{{types.NewStringDatum(`\`)}},
		},
		{
			input:    "0\v",
			expected: [][]types.Datum{{types.NewStringDatum("0\v")}},
		},
		{
			input:    "0\x00",
			expected: [][]types.Datum{{types.NewStringDatum("0\x00")}},
		},
		{
			input:    "\n\r",
			expected: [][]types.Datum{},
		},
		{
			input:    `"""",0`,
			expected: [][]types.Datum{{types.NewStringDatum(`"`), types.NewStringDatum(`0`)}},
		},
	}

	s.runTestCases(c, &cfg, int64(config.ReadBlockSize), testCases)
}

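// A block buffer size of 1 byte forces the parser to refill its buffer after
// every read, exercising parsing state that must carry over across buffer boundaries.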
func (s *testMydumpCSVParserSuite) TestContinuation(c *C) {
	cfg := config.CSVConfig{
		Separator:       ",",
		Delimiter:       `"`,
		BackslashEscape: true,
		TrimLastSep:     true,
	}

	testCases := []testCase{
		{
			input: `"abcdef",\njklm,nop` + "\r\n" + `"""""","\n",a,`,
			expected: [][]types.Datum{
				{
					types.NewStringDatum("abcdef"),
					types.NewStringDatum("\njklm"),
					types.NewStringDatum("nop"),
				},
				{
					types.NewStringDatum(`""`),
					types.NewStringDatum("\n"),
					types.NewStringDatum("a"),
				},
			},
		},
		{
			input:    `"VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca"`,
			expected: [][]types.Datum{{types.NewStringDatum("VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca")}},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)
}

func (s *testMydumpCSVParserSuite) TestBackslashAsSep(c *C) {
	cfg := config.CSVConfig{
		Separator: `\`,
		Delimiter: `"`,
	}

	testCases := []testCase{
		{
			input:    `0\`,
			expected: [][]types.Datum{{types.NewStringDatum("0"), nullDatum}},
		},
		{
			input:    `\`,
			expected: [][]types.Datum{{nullDatum, nullDatum}},
		},
	}

	s.runTestCases(c, &cfg, 1, testCases)

	failingInputs := []string{
		`"\`,
	}
	s.runFailingTestCases(c, &cfg, 1, failingInputs)
}

func (s *testMydumpCSVParserSuite) TestBackslashAsDelim(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `\`,
	}

	testCases := []testCase{
		{
			input:    `\\`,
			expected: [][]types.Datum{{nullDatum}},
		},
	}
	s.runTestCases(c, &cfg, 1, testCases)

	failingInputs := []string{
		`"\`,
	}
	s.runFailingTestCases(c, &cfg, 1, failingInputs)
}

// errorReader implements the Reader interface; every method always returns an error.
type errorReader struct{}

func (*errorReader) Read(p []byte) (int, error) {
	return 0, errors.New("fake read error")
}

func (*errorReader) Seek(offset int64, whence int) (int64, error) {
	return 0, errors.New("fake seek error")
}

func (*errorReader) Close() error {
	return errors.New("fake close error")
}

func (s *testMydumpCSVParserSuite) TestReadError(c *C) {
	cfg := config.CSVConfig{
		Separator: ",",
		Delimiter: `"`,
	}

	parser := mydump.NewCSVParser(&cfg, &errorReader{}, int64(config.ReadBlockSize), s.ioWorkers, false)
	c.Assert(parser.ReadRow(), ErrorMatches, "fake read error")
}

// TestSyntaxErrorLog checks that a syntax error won't dump huge strings into the log.
func (s *testMydumpCSVParserSuite) TestSyntaxErrorLog(c *C) {
	cfg := config.CSVConfig{
		Separator: "\t",
		Delimiter: "'",
	}

	tc := mydump.NewStringReader("x'" + strings.Repeat("y", 50000))
	parser := mydump.NewCSVParser(&cfg, tc, 50000, s.ioWorkers, false)
	logger, buffer := log.MakeTestLogger()
	parser.SetLogger(logger)
	c.Assert(parser.ReadRow(), ErrorMatches, "syntax error.*")
	c.Assert(logger.Sync(), IsNil)

	c.Assert(
		buffer.Stripped(), Equals,
		`{"$lvl":"ERROR","$msg":"syntax error","pos":1,"content":"'`+strings.Repeat("y", 255)+`"}`,
	)
}

// TestTrimLastSep checks that setting `TrimLastSep` to true trims only the last empty field.
func (s *testMydumpCSVParserSuite) TestTrimLastSep(c *C) {
	cfg := config.CSVConfig{
		Separator:   ",",
		Delimiter:   `"`,
		TrimLastSep: true,
	}
	parser := mydump.NewCSVParser(
		&cfg,
		mydump.NewStringReader("123,456,789,\r\na,b,,\r\n,,,\r\n\"a\",\"\",\"\",\r\n"),
		int64(config.ReadBlockSize),
		s.ioWorkers,
		false,
	)
	for i := 0; i < 4; i++ {
		c.Assert(parser.ReadRow(), IsNil)
		c.Assert(len(parser.LastRow().Row), Equals, 3)
	}
}

// Run `go test github.com/pingcap/tidb-lightning/lightning/mydump -check.b -check.bmem -test.v` to get benchmark results.
// Please ensure your temporary storage has (c.N / 2) KiB of free space.
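// (Each sample row written by setupTest is roughly half a KiB, hence the estimate.)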

type benchCSVParserSuite struct {
	csvPath   string
	ioWorkers *worker.Pool
}

var _ = Suite(&benchCSVParserSuite{})

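// setupTest writes c.N copies of a wide sample row to a temporary CSV file and
// resets the benchmark timer, so each benchmark measures only the reading side.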
func (s *benchCSVParserSuite) setupTest(c *C) {
	s.ioWorkers = worker.NewPool(context.Background(), 5, "bench_csv")

	dir := c.MkDir()
	s.csvPath = filepath.Join(dir, "input.csv")
	file, err := os.Create(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()
	for i := 0; i < c.N; i++ {
		_, err = file.WriteString("18,1,1,0.3650,GC,BARBARBAR,rw9AOV1AjoI1,50000.00,-10.00,10.00,1,1,djj3Q2XaIPoYVy1FuF,gc80Q2o82Au3C9xv,PYOolSxG3w,DI,265111111,7586538936787184,2020-02-26 20:06:00.193,OE,YCkSPBVqoJ2V5F8zWs87V5XzbaIY70aWCD4dgcB6bjUzCr5wOJCJ2TYH49J7yWyysbudJIxlTAEWSJahY7hswLtTsqyjEkrlsN8iDMAa9Poj29miJ08tnn2G8mL64IlyywvnRGbLbyGvWDdrOSF42RyUFTWVyqlDWc6Gr5wyMPYgvweKemzFDVD3kro5JsmBmJY08EK54nQoyfo2sScyb34zcM9GFo9ZQTwloINfPYQKXQm32m0XvU7jiNmYpFTFJQjdqA825SEvQqMMefG2WG4jVu9UPdhdUjRsFRd0Gw7YPKByOlcuY0eKxT7sAzMKXx2000RR6dqHNXe47oVYd\n")
		c.Assert(err, IsNil)
	}
	c.ResetTimer()
}

func (s *benchCSVParserSuite) BenchmarkReadRowUsingMydumpCSVParser(c *C) {
	s.setupTest(c)

	file, err := os.Open(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()

	cfg := config.CSVConfig{Separator: ","}
	parser := mydump.NewCSVParser(&cfg, file, 65536, s.ioWorkers, false)
	parser.SetLogger(log.Logger{Logger: zap.NewNop()})

	rowsCount := 0
	for {
		err := parser.ReadRow()
		if err == nil {
			parser.RecycleRow(parser.LastRow())
			rowsCount++
			continue
		}
		if errors.Cause(err) == io.EOF {
			break
		}
		c.Fatal(err)
	}
	c.Assert(rowsCount, Equals, c.N)
}

func (s *benchCSVParserSuite) BenchmarkReadRowUsingEncodingCSV(c *C) {
	s.setupTest(c)

	file, err := os.Open(s.csvPath)
	c.Assert(err, IsNil)
	defer func() {
		c.Assert(file.Close(), IsNil)
	}()

	csvParser := csv.NewReader(file)

	rowsCount := 0
	var datums []types.Datum
	for {
		records, err := csvParser.Read()
		if err == nil {
			// For a fair comparison, we need to include the cost of converting each field to a Datum.
			for _, record := range records {
				datums = append(datums, types.NewStringDatum(record))
			}
			datums = datums[:0]
			rowsCount++
			continue
		}
		if errors.Cause(err) == io.EOF {
			break
		}
		c.Fatal(err)
	}
	c.Assert(rowsCount, Equals, c.N)
}