github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/util/csvparser/csv_parser_test.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package csvparser 16 17 import ( 18 "fmt" 19 "io" 20 "strings" 21 "testing" 22 23 "github.com/stretchr/testify/require" 24 ) 25 26 // TODO: rewrite test case 27 28 func NewStringReader(str string) io.Reader { 29 return strings.NewReader(str) 30 } 31 32 func newStringField(val string, isNull bool) Field { 33 return Field{ 34 Val: val, 35 IsNull: isNull, 36 } 37 } 38 func assertPosEqual(t *testing.T, parser *CSVParser, pos int64) { 39 require.Equal(t, parser.Pos(), pos) 40 } 41 func tpchDatums() [][]Field { 42 datums := make([][]Field, 0, 3) 43 datums = append(datums, []Field{ 44 newStringField("1", false), 45 newStringField("goldenrod lavender spring chocolate lace", false), 46 newStringField("Manufacturer#1", false), 47 newStringField("Brand#13", false), 48 newStringField("PROMO BURNISHED COPPER", false), 49 newStringField("7", false), 50 newStringField("JUMBO PKG", false), 51 newStringField("901.00", false), 52 newStringField("ly. slyly ironi", false), 53 }) 54 datums = append(datums, []Field{ 55 newStringField("2", false), 56 newStringField("blush thistle blue yellow saddle", false), 57 newStringField("Manufacturer#1", false), 58 newStringField("Brand#13", false), 59 newStringField("LARGE BRUSHED BRASS", false), 60 newStringField("1", false), 61 newStringField("LG CASE", false), 62 newStringField("902.00", false), 63 newStringField("lar accounts amo", false), 64 }) 65 datums = append(datums, []Field{ 66 newStringField("3", false), 67 newStringField("spring green yellow purple cornsilk", false), 68 newStringField("Manufacturer#4", false), 69 newStringField("Brand#42", false), 70 newStringField("STANDARD POLISHED BRASS", false), 71 newStringField("21", false), 72 newStringField("WRAP CASE", false), 73 newStringField("903.00", false), 74 newStringField("egular deposits hag", false), 75 }) 76 77 return datums 78 } 79 80 func datumsToString(datums [][]Field, delimitor string, quote string, lastSep bool) string { 81 var b strings.Builder 82 doubleQuote := quote + quote 83 for _, ds := range datums { 84 for i, d := range ds { 85 text := d.Val 86 if len(quote) > 0 { 87 b.WriteString(quote) 88 b.WriteString(strings.ReplaceAll(text, quote, doubleQuote)) 89 b.WriteString(quote) 90 } else { 91 b.WriteString(text) 92 } 93 if lastSep || i < len(ds)-1 { 94 b.WriteString(delimitor) 95 } 96 } 97 b.WriteString("\r\n") 98 } 99 return b.String() 100 } 101 102 func TestTPCH(t *testing.T) { 103 datums := tpchDatums() 104 input := datumsToString(datums, "|", "", true) 105 reader := strings.NewReader(input) 106 107 cfg := CSVConfig{ 108 FieldsTerminatedBy: "|", 109 FieldsEnclosedBy: "", 110 TrimLastSep: true, 111 } 112 113 parser, err := NewCSVParser(&cfg, reader, int64(ReadBlockSize), false, false) 114 require.NoError(t, err) 115 116 var row []Field 117 118 row, err = parser.Read() 119 require.Nil(t, err) 120 require.Equal(t, datums[0], row) 121 require.Equal(t, parser.Pos(), int64(126)) 122 assertPosEqual(t, parser, 126) 123 124 row, err = parser.Read() 125 require.Nil(t, err) 126 require.Equal(t, datums[1], row) 127 assertPosEqual(t, parser, 241) 128 129 row, err = parser.Read() 130 require.Nil(t, err) 131 require.Equal(t, datums[2], row) 132 assertPosEqual(t, parser, 369) 133 134 } 135 136 func TestTPCHMultiBytes(t *testing.T) { 137 datums := tpchDatums() 138 sepsAndQuotes := [][2]string{ 139 {",", ""}, 140 {"\000", ""}, 141 {",", ""}, 142 {"🤔", ""}, 143 {",", "。"}, 144 {"||", ""}, 145 {"|+|", ""}, 146 {"##", ""}, 147 {",", "'"}, 148 {",", `"`}, 149 {"🤔", `''`}, 150 {"🤔", `"'`}, 151 {"🤔", `"'`}, 152 {"🤔", "🌚"}, // this two emoji have same prefix bytes 153 {"##", "#-"}, 154 {"\\s", "\\q"}, 155 {",", "1"}, 156 {",", "ac"}, 157 } 158 for _, SepAndQuote := range sepsAndQuotes { 159 inputStr := datumsToString(datums, SepAndQuote[0], SepAndQuote[1], false) 160 161 // extract all index in the middle of '\r\n' from the inputStr. 162 // they indicate where the parser stops after reading one row. 163 // should be equals to the number of datums. 164 var allExpectedParserPos []int 165 for { 166 last := 0 167 if len(allExpectedParserPos) > 0 { 168 last = allExpectedParserPos[len(allExpectedParserPos)-1] 169 } 170 pos := strings.IndexByte(inputStr[last:], '\r') 171 if pos < 0 { 172 break 173 } 174 allExpectedParserPos = append(allExpectedParserPos, last+pos+1) 175 } 176 require.Len(t, allExpectedParserPos, len(datums)) 177 178 cfg := CSVConfig{ 179 FieldsTerminatedBy: SepAndQuote[0], 180 FieldsEnclosedBy: SepAndQuote[1], 181 TrimLastSep: false, 182 } 183 184 reader := NewStringReader(inputStr) 185 parser, err := NewCSVParser(&cfg, reader, int64(ReadBlockSize), false, false) 186 if fmt.Sprint(err) == "invalid input: invalid field or comment delimiter" { 187 continue 188 } 189 require.NoError(t, err) 190 191 for i, expectedParserPos := range allExpectedParserPos { 192 row, err := parser.Read() 193 require.Nil(t, err) 194 require.Equal(t, datums[i], row) 195 assertPosEqual(t, parser, int64(expectedParserPos)) 196 } 197 198 } 199 } 200 201 func TestLinesTerminatedBy(t *testing.T) { 202 datums := tpchDatums() 203 input := datumsToString(datums, "|", "", true) 204 reader := strings.NewReader(input) 205 206 cfg := CSVConfig{ 207 FieldsTerminatedBy: "|", 208 FieldsEnclosedBy: "", 209 LinesTerminatedBy: "\r\n", 210 TrimLastSep: true, 211 } 212 213 parser, err := NewCSVParser(&cfg, reader, int64(ReadBlockSize), false, false) 214 require.NoError(t, err) 215 216 var row []Field 217 218 row, err = parser.Read() 219 require.Nil(t, err) 220 require.Equal(t, datums[0], row) 221 require.Equal(t, parser.Pos(), int64(127)) 222 assertPosEqual(t, parser, 127) 223 224 row, err = parser.Read() 225 require.Nil(t, err) 226 require.Equal(t, datums[1], row) 227 assertPosEqual(t, parser, 242) 228 229 row, err = parser.Read() 230 require.Nil(t, err) 231 require.Equal(t, datums[2], row) 232 assertPosEqual(t, parser, 370) 233 234 } 235 236 func TestRFC4180(t *testing.T) { 237 cfg := CSVConfig{ 238 FieldsTerminatedBy: ",", 239 FieldsEnclosedBy: `"`, 240 } 241 242 // example 1, trailing new lines 243 244 parser, err := NewCSVParser(&cfg, NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), int64(ReadBlockSize), false, false) 245 require.NoError(t, err) 246 247 var row []Field 248 249 row, err = parser.Read() 250 require.Nil(t, err) 251 require.Equal(t, []Field{ 252 newStringField("aaa", false), 253 newStringField("bbb", false), 254 newStringField("ccc", false), 255 }, row) 256 assertPosEqual(t, parser, 12) 257 258 row, err = parser.Read() 259 require.Nil(t, err) 260 require.Equal(t, []Field{ 261 newStringField("zzz", false), 262 newStringField("yyy", false), 263 newStringField("xxx", false), 264 }, row) 265 assertPosEqual(t, parser, 24) 266 267 // example 2, no trailing new lines 268 269 parser, err = NewCSVParser(&cfg, NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), int64(ReadBlockSize), false, false) 270 require.NoError(t, err) 271 272 row, err = parser.Read() 273 require.Nil(t, err) 274 require.Equal(t, []Field{ 275 newStringField("aaa", false), 276 newStringField("bbb", false), 277 newStringField("ccc", false), 278 }, row) 279 assertPosEqual(t, parser, 12) 280 281 row, err = parser.Read() 282 require.Nil(t, err) 283 require.Equal(t, []Field{ 284 newStringField("zzz", false), 285 newStringField("yyy", false), 286 newStringField("xxx", false), 287 }, row) 288 assertPosEqual(t, parser, 23) 289 290 // example 5, quoted fields 291 292 parser, err = NewCSVParser(&cfg, NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), int64(ReadBlockSize), false, false) 293 require.NoError(t, err) 294 295 row, err = parser.Read() 296 require.Nil(t, err) 297 require.Equal(t, []Field{ 298 newStringField("aaa", false), 299 newStringField("bbb", false), 300 newStringField("ccc", false), 301 }, row) 302 assertPosEqual(t, parser, 18) 303 304 row, err = parser.Read() 305 require.Nil(t, err) 306 require.Equal(t, []Field{ 307 newStringField("zzz", false), 308 newStringField("yyy", false), 309 newStringField("xxx", false), 310 }, row) 311 assertPosEqual(t, parser, 29) 312 313 // example 6, line breaks within fields 314 315 parser, err = NewCSVParser(&cfg, NewStringReader(`"aaa","b 316 bb","ccc" 317 zzz,yyy,xxx`), int64(ReadBlockSize), false, false) 318 require.NoError(t, err) 319 320 row, err = parser.Read() 321 require.Nil(t, err) 322 require.Equal(t, []Field{ 323 newStringField("aaa", false), 324 newStringField("b\nbb", false), 325 newStringField("ccc", false), 326 }, row) 327 assertPosEqual(t, parser, 19) 328 329 row, err = parser.Read() 330 require.Nil(t, err) 331 require.Equal(t, []Field{ 332 newStringField("zzz", false), 333 newStringField("yyy", false), 334 newStringField("xxx", false), 335 }, row) 336 assertPosEqual(t, parser, 30) 337 338 // example 7, quote escaping 339 340 parser, err = NewCSVParser(&cfg, NewStringReader(`"aaa","b""bb","ccc"`), int64(ReadBlockSize), false, false) 341 require.NoError(t, err) 342 343 row, err = parser.Read() 344 require.Nil(t, err) 345 require.Equal(t, []Field{ 346 newStringField("aaa", false), 347 newStringField("b\"bb", false), 348 newStringField("ccc", false), 349 }, row) 350 assertPosEqual(t, parser, 19) 351 352 } 353 354 func TestMySQL(t *testing.T) { 355 cfg := CSVConfig{ 356 FieldsTerminatedBy: ",", 357 FieldsEnclosedBy: `"`, 358 LinesTerminatedBy: "\n", 359 FieldsEscapedBy: `\`, 360 NotNull: false, 361 Null: []string{`\N`}, 362 } 363 364 parser, err := NewCSVParser(&cfg, NewStringReader(`"\"","\\","\?" 365 "\ 366 ",\N,\\N`), int64(ReadBlockSize), false, false) 367 require.NoError(t, err) 368 369 var row []Field 370 371 row, err = parser.Read() 372 require.NoError(t, err) 373 require.Equal(t, []Field{ 374 newStringField(`"`, false), 375 newStringField(`\`, false), 376 newStringField("?", false), 377 }, row) 378 379 assertPosEqual(t, parser, 15) 380 381 row, err = parser.Read() 382 require.NoError(t, err) 383 384 require.Equal(t, []Field{ 385 newStringField("\n", false), 386 newStringField("\\N", true), 387 newStringField(`\N`, false), 388 }, row) 389 390 assertPosEqual(t, parser, 26) 391 392 parser, err = NewCSVParser( 393 &cfg, 394 NewStringReader(`"\0\b\n\r\t\Z\\\ \c\'\""`), 395 int64(ReadBlockSize), false, false) 396 require.NoError(t, err) 397 398 row, err = parser.Read() 399 require.NoError(t, err) 400 require.Equal(t, []Field{ 401 newStringField(string([]byte{0, '\b', '\n', '\r', '\t', 26, '\\', ' ', ' ', 'c', '\'', '"'}), false), 402 }, row) 403 404 cfg.UnescapedQuote = true 405 parser, err = NewCSVParser( 406 &cfg, 407 NewStringReader(`3,"a string containing a " quote",102.20 408 `), 409 int64(ReadBlockSize), false, false) 410 require.NoError(t, err) 411 412 row, err = parser.Read() 413 require.NoError(t, err) 414 require.Equal(t, []Field{ 415 newStringField("3", false), 416 newStringField(`a string containing a " quote`, false), 417 newStringField("102.20", false), 418 }, row) 419 420 parser, err = NewCSVParser( 421 &cfg, 422 NewStringReader(`3,"a string containing a " quote","102.20"`), 423 int64(ReadBlockSize), false, false) 424 require.NoError(t, err) 425 426 row, err = parser.Read() 427 require.NoError(t, err) 428 require.Equal(t, []Field{ 429 newStringField("3", false), 430 newStringField(`a string containing a " quote`, false), 431 newStringField("102.20", false), 432 }, row) 433 434 parser, err = NewCSVParser( 435 &cfg, 436 NewStringReader(`"a"b",c"d"e`), 437 int64(ReadBlockSize), false, false) 438 require.NoError(t, err) 439 440 row, err = parser.Read() 441 require.NoError(t, err) 442 require.Equal(t, []Field{ 443 newStringField(`a"b`, false), 444 newStringField(`c"d"e`, false), 445 }, row) 446 } 447 448 func TestCustomEscapeChar(t *testing.T) { 449 cfg := CSVConfig{ 450 FieldsTerminatedBy: ",", 451 FieldsEnclosedBy: `"`, 452 FieldsEscapedBy: `!`, 453 NotNull: false, 454 Null: []string{`!N`}, 455 } 456 457 parser, err := NewCSVParser(&cfg, NewStringReader(`"!"","!!","!\" 458 "! 459 ",!N,!!N`), int64(ReadBlockSize), false, false) 460 require.NoError(t, err) 461 462 var row []Field 463 464 row, err = parser.Read() 465 require.Nil(t, err) 466 require.Equal(t, []Field{ 467 newStringField(`"`, false), 468 newStringField(`!`, false), 469 newStringField(`\`, false), 470 }, row) 471 assertPosEqual(t, parser, 15) 472 473 row, err = parser.Read() 474 require.Nil(t, err) 475 require.Equal(t, []Field{ 476 newStringField("\n", false), 477 newStringField(`!N`, true), 478 newStringField(`!N`, false), 479 }, row) 480 assertPosEqual(t, parser, 26) 481 482 cfg = CSVConfig{ 483 FieldsTerminatedBy: ",", 484 FieldsEnclosedBy: `"`, 485 FieldsEscapedBy: ``, 486 NotNull: false, 487 Null: []string{`NULL`}, 488 } 489 490 parser, err = NewCSVParser( 491 &cfg, 492 NewStringReader(`"{""itemRangeType"":0,""itemContainType"":0,""shopRangeType"":1,""shopJson"":""[{\""id\"":\""A1234\"",\""shopName\"":\""AAAAAA\""}]""}"`), 493 int64(ReadBlockSize), false, false) 494 require.NoError(t, err) 495 496 row, err = parser.Read() 497 require.Nil(t, err) 498 require.Equal(t, []Field{ 499 newStringField(`{"itemRangeType":0,"itemContainType":0,"shopRangeType":1,"shopJson":"[{\"id\":\"A1234\",\"shopName\":\"AAAAAA\"}]"}`, false), 500 }, row) 501 }