github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/util/csvparser/csv_parser_test.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package csvparser
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"strings"
    21  	"testing"
    22  
    23  	"github.com/stretchr/testify/require"
    24  )
    25  
    26  // TODO: rewrite test case
    27  
    28  func NewStringReader(str string) io.Reader {
    29  	return strings.NewReader(str)
    30  }
    31  
    32  func newStringField(val string, isNull bool) Field {
    33  	return Field{
    34  		Val:    val,
    35  		IsNull: isNull,
    36  	}
    37  }
    38  func assertPosEqual(t *testing.T, parser *CSVParser, pos int64) {
    39  	require.Equal(t, parser.Pos(), pos)
    40  }
    41  func tpchDatums() [][]Field {
    42  	datums := make([][]Field, 0, 3)
    43  	datums = append(datums, []Field{
    44  		newStringField("1", false),
    45  		newStringField("goldenrod lavender spring chocolate lace", false),
    46  		newStringField("Manufacturer#1", false),
    47  		newStringField("Brand#13", false),
    48  		newStringField("PROMO BURNISHED COPPER", false),
    49  		newStringField("7", false),
    50  		newStringField("JUMBO PKG", false),
    51  		newStringField("901.00", false),
    52  		newStringField("ly. slyly ironi", false),
    53  	})
    54  	datums = append(datums, []Field{
    55  		newStringField("2", false),
    56  		newStringField("blush thistle blue yellow saddle", false),
    57  		newStringField("Manufacturer#1", false),
    58  		newStringField("Brand#13", false),
    59  		newStringField("LARGE BRUSHED BRASS", false),
    60  		newStringField("1", false),
    61  		newStringField("LG CASE", false),
    62  		newStringField("902.00", false),
    63  		newStringField("lar accounts amo", false),
    64  	})
    65  	datums = append(datums, []Field{
    66  		newStringField("3", false),
    67  		newStringField("spring green yellow purple cornsilk", false),
    68  		newStringField("Manufacturer#4", false),
    69  		newStringField("Brand#42", false),
    70  		newStringField("STANDARD POLISHED BRASS", false),
    71  		newStringField("21", false),
    72  		newStringField("WRAP CASE", false),
    73  		newStringField("903.00", false),
    74  		newStringField("egular deposits hag", false),
    75  	})
    76  
    77  	return datums
    78  }
    79  
    80  func datumsToString(datums [][]Field, delimitor string, quote string, lastSep bool) string {
    81  	var b strings.Builder
    82  	doubleQuote := quote + quote
    83  	for _, ds := range datums {
    84  		for i, d := range ds {
    85  			text := d.Val
    86  			if len(quote) > 0 {
    87  				b.WriteString(quote)
    88  				b.WriteString(strings.ReplaceAll(text, quote, doubleQuote))
    89  				b.WriteString(quote)
    90  			} else {
    91  				b.WriteString(text)
    92  			}
    93  			if lastSep || i < len(ds)-1 {
    94  				b.WriteString(delimitor)
    95  			}
    96  		}
    97  		b.WriteString("\r\n")
    98  	}
    99  	return b.String()
   100  }
   101  
   102  func TestTPCH(t *testing.T) {
   103  	datums := tpchDatums()
   104  	input := datumsToString(datums, "|", "", true)
   105  	reader := strings.NewReader(input)
   106  
   107  	cfg := CSVConfig{
   108  		FieldsTerminatedBy: "|",
   109  		FieldsEnclosedBy:   "",
   110  		TrimLastSep:        true,
   111  	}
   112  
   113  	parser, err := NewCSVParser(&cfg, reader, int64(ReadBlockSize), false, false)
   114  	require.NoError(t, err)
   115  
   116  	var row []Field
   117  
   118  	row, err = parser.Read()
   119  	require.Nil(t, err)
   120  	require.Equal(t, datums[0], row)
   121  	require.Equal(t, parser.Pos(), int64(126))
   122  	assertPosEqual(t, parser, 126)
   123  
   124  	row, err = parser.Read()
   125  	require.Nil(t, err)
   126  	require.Equal(t, datums[1], row)
   127  	assertPosEqual(t, parser, 241)
   128  
   129  	row, err = parser.Read()
   130  	require.Nil(t, err)
   131  	require.Equal(t, datums[2], row)
   132  	assertPosEqual(t, parser, 369)
   133  
   134  }
   135  
   136  func TestTPCHMultiBytes(t *testing.T) {
   137  	datums := tpchDatums()
   138  	sepsAndQuotes := [][2]string{
   139  		{",", ""},
   140  		{"\000", ""},
   141  		{",", ""},
   142  		{"🤔", ""},
   143  		{",", "。"},
   144  		{"||", ""},
   145  		{"|+|", ""},
   146  		{"##", ""},
   147  		{",", "'"},
   148  		{",", `"`},
   149  		{"🤔", `''`},
   150  		{"🤔", `"'`},
   151  		{"🤔", `"'`},
   152  		{"🤔", "🌚"}, // this two emoji have same prefix bytes
   153  		{"##", "#-"},
   154  		{"\\s", "\\q"},
   155  		{",", "1"},
   156  		{",", "ac"},
   157  	}
   158  	for _, SepAndQuote := range sepsAndQuotes {
   159  		inputStr := datumsToString(datums, SepAndQuote[0], SepAndQuote[1], false)
   160  
   161  		// extract all index in the middle of '\r\n' from the inputStr.
   162  		// they indicate where the parser stops after reading one row.
   163  		// should be equals to the number of datums.
   164  		var allExpectedParserPos []int
   165  		for {
   166  			last := 0
   167  			if len(allExpectedParserPos) > 0 {
   168  				last = allExpectedParserPos[len(allExpectedParserPos)-1]
   169  			}
   170  			pos := strings.IndexByte(inputStr[last:], '\r')
   171  			if pos < 0 {
   172  				break
   173  			}
   174  			allExpectedParserPos = append(allExpectedParserPos, last+pos+1)
   175  		}
   176  		require.Len(t, allExpectedParserPos, len(datums))
   177  
   178  		cfg := CSVConfig{
   179  			FieldsTerminatedBy: SepAndQuote[0],
   180  			FieldsEnclosedBy:   SepAndQuote[1],
   181  			TrimLastSep:        false,
   182  		}
   183  
   184  		reader := NewStringReader(inputStr)
   185  		parser, err := NewCSVParser(&cfg, reader, int64(ReadBlockSize), false, false)
   186  		if fmt.Sprint(err) == "invalid input: invalid field or comment delimiter" {
   187  			continue
   188  		}
   189  		require.NoError(t, err)
   190  
   191  		for i, expectedParserPos := range allExpectedParserPos {
   192  			row, err := parser.Read()
   193  			require.Nil(t, err)
   194  			require.Equal(t, datums[i], row)
   195  			assertPosEqual(t, parser, int64(expectedParserPos))
   196  		}
   197  
   198  	}
   199  }
   200  
   201  func TestLinesTerminatedBy(t *testing.T) {
   202  	datums := tpchDatums()
   203  	input := datumsToString(datums, "|", "", true)
   204  	reader := strings.NewReader(input)
   205  
   206  	cfg := CSVConfig{
   207  		FieldsTerminatedBy: "|",
   208  		FieldsEnclosedBy:   "",
   209  		LinesTerminatedBy:  "\r\n",
   210  		TrimLastSep:        true,
   211  	}
   212  
   213  	parser, err := NewCSVParser(&cfg, reader, int64(ReadBlockSize), false, false)
   214  	require.NoError(t, err)
   215  
   216  	var row []Field
   217  
   218  	row, err = parser.Read()
   219  	require.Nil(t, err)
   220  	require.Equal(t, datums[0], row)
   221  	require.Equal(t, parser.Pos(), int64(127))
   222  	assertPosEqual(t, parser, 127)
   223  
   224  	row, err = parser.Read()
   225  	require.Nil(t, err)
   226  	require.Equal(t, datums[1], row)
   227  	assertPosEqual(t, parser, 242)
   228  
   229  	row, err = parser.Read()
   230  	require.Nil(t, err)
   231  	require.Equal(t, datums[2], row)
   232  	assertPosEqual(t, parser, 370)
   233  
   234  }
   235  
   236  func TestRFC4180(t *testing.T) {
   237  	cfg := CSVConfig{
   238  		FieldsTerminatedBy: ",",
   239  		FieldsEnclosedBy:   `"`,
   240  	}
   241  
   242  	// example 1, trailing new lines
   243  
   244  	parser, err := NewCSVParser(&cfg, NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), int64(ReadBlockSize), false, false)
   245  	require.NoError(t, err)
   246  
   247  	var row []Field
   248  
   249  	row, err = parser.Read()
   250  	require.Nil(t, err)
   251  	require.Equal(t, []Field{
   252  		newStringField("aaa", false),
   253  		newStringField("bbb", false),
   254  		newStringField("ccc", false),
   255  	}, row)
   256  	assertPosEqual(t, parser, 12)
   257  
   258  	row, err = parser.Read()
   259  	require.Nil(t, err)
   260  	require.Equal(t, []Field{
   261  		newStringField("zzz", false),
   262  		newStringField("yyy", false),
   263  		newStringField("xxx", false),
   264  	}, row)
   265  	assertPosEqual(t, parser, 24)
   266  
   267  	// example 2, no trailing new lines
   268  
   269  	parser, err = NewCSVParser(&cfg, NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), int64(ReadBlockSize), false, false)
   270  	require.NoError(t, err)
   271  
   272  	row, err = parser.Read()
   273  	require.Nil(t, err)
   274  	require.Equal(t, []Field{
   275  		newStringField("aaa", false),
   276  		newStringField("bbb", false),
   277  		newStringField("ccc", false),
   278  	}, row)
   279  	assertPosEqual(t, parser, 12)
   280  
   281  	row, err = parser.Read()
   282  	require.Nil(t, err)
   283  	require.Equal(t, []Field{
   284  		newStringField("zzz", false),
   285  		newStringField("yyy", false),
   286  		newStringField("xxx", false),
   287  	}, row)
   288  	assertPosEqual(t, parser, 23)
   289  
   290  	// example 5, quoted fields
   291  
   292  	parser, err = NewCSVParser(&cfg, NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), int64(ReadBlockSize), false, false)
   293  	require.NoError(t, err)
   294  
   295  	row, err = parser.Read()
   296  	require.Nil(t, err)
   297  	require.Equal(t, []Field{
   298  		newStringField("aaa", false),
   299  		newStringField("bbb", false),
   300  		newStringField("ccc", false),
   301  	}, row)
   302  	assertPosEqual(t, parser, 18)
   303  
   304  	row, err = parser.Read()
   305  	require.Nil(t, err)
   306  	require.Equal(t, []Field{
   307  		newStringField("zzz", false),
   308  		newStringField("yyy", false),
   309  		newStringField("xxx", false),
   310  	}, row)
   311  	assertPosEqual(t, parser, 29)
   312  
   313  	// example 6, line breaks within fields
   314  
   315  	parser, err = NewCSVParser(&cfg, NewStringReader(`"aaa","b
   316  bb","ccc"
   317  zzz,yyy,xxx`), int64(ReadBlockSize), false, false)
   318  	require.NoError(t, err)
   319  
   320  	row, err = parser.Read()
   321  	require.Nil(t, err)
   322  	require.Equal(t, []Field{
   323  		newStringField("aaa", false),
   324  		newStringField("b\nbb", false),
   325  		newStringField("ccc", false),
   326  	}, row)
   327  	assertPosEqual(t, parser, 19)
   328  
   329  	row, err = parser.Read()
   330  	require.Nil(t, err)
   331  	require.Equal(t, []Field{
   332  		newStringField("zzz", false),
   333  		newStringField("yyy", false),
   334  		newStringField("xxx", false),
   335  	}, row)
   336  	assertPosEqual(t, parser, 30)
   337  
   338  	// example 7, quote escaping
   339  
   340  	parser, err = NewCSVParser(&cfg, NewStringReader(`"aaa","b""bb","ccc"`), int64(ReadBlockSize), false, false)
   341  	require.NoError(t, err)
   342  
   343  	row, err = parser.Read()
   344  	require.Nil(t, err)
   345  	require.Equal(t, []Field{
   346  		newStringField("aaa", false),
   347  		newStringField("b\"bb", false),
   348  		newStringField("ccc", false),
   349  	}, row)
   350  	assertPosEqual(t, parser, 19)
   351  
   352  }
   353  
   354  func TestMySQL(t *testing.T) {
   355  	cfg := CSVConfig{
   356  		FieldsTerminatedBy: ",",
   357  		FieldsEnclosedBy:   `"`,
   358  		LinesTerminatedBy:  "\n",
   359  		FieldsEscapedBy:    `\`,
   360  		NotNull:            false,
   361  		Null:               []string{`\N`},
   362  	}
   363  
   364  	parser, err := NewCSVParser(&cfg, NewStringReader(`"\"","\\","\?"
   365  "\
   366  ",\N,\\N`), int64(ReadBlockSize), false, false)
   367  	require.NoError(t, err)
   368  
   369  	var row []Field
   370  
   371  	row, err = parser.Read()
   372  	require.NoError(t, err)
   373  	require.Equal(t, []Field{
   374  		newStringField(`"`, false),
   375  		newStringField(`\`, false),
   376  		newStringField("?", false),
   377  	}, row)
   378  
   379  	assertPosEqual(t, parser, 15)
   380  
   381  	row, err = parser.Read()
   382  	require.NoError(t, err)
   383  
   384  	require.Equal(t, []Field{
   385  		newStringField("\n", false),
   386  		newStringField("\\N", true),
   387  		newStringField(`\N`, false),
   388  	}, row)
   389  
   390  	assertPosEqual(t, parser, 26)
   391  
   392  	parser, err = NewCSVParser(
   393  		&cfg,
   394  		NewStringReader(`"\0\b\n\r\t\Z\\\  \c\'\""`),
   395  		int64(ReadBlockSize), false, false)
   396  	require.NoError(t, err)
   397  
   398  	row, err = parser.Read()
   399  	require.NoError(t, err)
   400  	require.Equal(t, []Field{
   401  		newStringField(string([]byte{0, '\b', '\n', '\r', '\t', 26, '\\', ' ', ' ', 'c', '\'', '"'}), false),
   402  	}, row)
   403  
   404  	cfg.UnescapedQuote = true
   405  	parser, err = NewCSVParser(
   406  		&cfg,
   407  		NewStringReader(`3,"a string containing a " quote",102.20
   408  `),
   409  		int64(ReadBlockSize), false, false)
   410  	require.NoError(t, err)
   411  
   412  	row, err = parser.Read()
   413  	require.NoError(t, err)
   414  	require.Equal(t, []Field{
   415  		newStringField("3", false),
   416  		newStringField(`a string containing a " quote`, false),
   417  		newStringField("102.20", false),
   418  	}, row)
   419  
   420  	parser, err = NewCSVParser(
   421  		&cfg,
   422  		NewStringReader(`3,"a string containing a " quote","102.20"`),
   423  		int64(ReadBlockSize), false, false)
   424  	require.NoError(t, err)
   425  
   426  	row, err = parser.Read()
   427  	require.NoError(t, err)
   428  	require.Equal(t, []Field{
   429  		newStringField("3", false),
   430  		newStringField(`a string containing a " quote`, false),
   431  		newStringField("102.20", false),
   432  	}, row)
   433  
   434  	parser, err = NewCSVParser(
   435  		&cfg,
   436  		NewStringReader(`"a"b",c"d"e`),
   437  		int64(ReadBlockSize), false, false)
   438  	require.NoError(t, err)
   439  
   440  	row, err = parser.Read()
   441  	require.NoError(t, err)
   442  	require.Equal(t, []Field{
   443  		newStringField(`a"b`, false),
   444  		newStringField(`c"d"e`, false),
   445  	}, row)
   446  }
   447  
   448  func TestCustomEscapeChar(t *testing.T) {
   449  	cfg := CSVConfig{
   450  		FieldsTerminatedBy: ",",
   451  		FieldsEnclosedBy:   `"`,
   452  		FieldsEscapedBy:    `!`,
   453  		NotNull:            false,
   454  		Null:               []string{`!N`},
   455  	}
   456  
   457  	parser, err := NewCSVParser(&cfg, NewStringReader(`"!"","!!","!\"
   458  "!
   459  ",!N,!!N`), int64(ReadBlockSize), false, false)
   460  	require.NoError(t, err)
   461  
   462  	var row []Field
   463  
   464  	row, err = parser.Read()
   465  	require.Nil(t, err)
   466  	require.Equal(t, []Field{
   467  		newStringField(`"`, false),
   468  		newStringField(`!`, false),
   469  		newStringField(`\`, false),
   470  	}, row)
   471  	assertPosEqual(t, parser, 15)
   472  
   473  	row, err = parser.Read()
   474  	require.Nil(t, err)
   475  	require.Equal(t, []Field{
   476  		newStringField("\n", false),
   477  		newStringField(`!N`, true),
   478  		newStringField(`!N`, false),
   479  	}, row)
   480  	assertPosEqual(t, parser, 26)
   481  
   482  	cfg = CSVConfig{
   483  		FieldsTerminatedBy: ",",
   484  		FieldsEnclosedBy:   `"`,
   485  		FieldsEscapedBy:    ``,
   486  		NotNull:            false,
   487  		Null:               []string{`NULL`},
   488  	}
   489  
   490  	parser, err = NewCSVParser(
   491  		&cfg,
   492  		NewStringReader(`"{""itemRangeType"":0,""itemContainType"":0,""shopRangeType"":1,""shopJson"":""[{\""id\"":\""A1234\"",\""shopName\"":\""AAAAAA\""}]""}"`),
   493  		int64(ReadBlockSize), false, false)
   494  	require.NoError(t, err)
   495  
   496  	row, err = parser.Read()
   497  	require.Nil(t, err)
   498  	require.Equal(t, []Field{
   499  		newStringField(`{"itemRangeType":0,"itemContainType":0,"shopRangeType":1,"shopJson":"[{\"id\":\"A1234\",\"shopName\":\"AAAAAA\"}]"}`, false),
   500  	}, row)
   501  }