github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/util/encoding/csv/reader_test.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 // Copyright 2011 The Go Authors. All rights reserved. 12 // Use of this source code is governed by a BSD-style 13 // license that can be found in licenses/BSD-golang.txt. 14 15 package csv 16 17 import ( 18 "io" 19 "reflect" 20 "strings" 21 "testing" 22 "unicode/utf8" 23 ) 24 25 func TestRead(t *testing.T) { 26 tests := []struct { 27 Name string 28 Input string 29 Output [][]string 30 Error error 31 32 // These fields are copied into the Reader 33 Comma rune 34 Comment rune 35 UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1 36 FieldsPerRecord int 37 LazyQuotes bool 38 TrimLeadingSpace bool 39 ReuseRecord bool 40 }{{ 41 Name: "Simple", 42 Input: "a,b,c\n", 43 Output: [][]string{{"a", "b", "c"}}, 44 }, { 45 Name: "CRLF", 46 Input: "a,b\r\nc,d\r\n", 47 Output: [][]string{{"a", "b"}, {"c", "d"}}, 48 }, { 49 Name: "BareCR", 50 Input: "a,b\rc,d\r\n", 51 Output: [][]string{{"a", "b\rc", "d"}}, 52 }, { 53 Name: "RFC4180test", 54 Input: `#field1,field2,field3 55 "aaa","bb 56 b","ccc" 57 "a,a","b""bb","ccc" 58 zzz,yyy,xxx 59 `, 60 Output: [][]string{ 61 {"#field1", "field2", "field3"}, 62 {"aaa", "bb\nb", "ccc"}, 63 {"a,a", `b"bb`, "ccc"}, 64 {"zzz", "yyy", "xxx"}, 65 }, 66 UseFieldsPerRecord: true, 67 FieldsPerRecord: 0, 68 }, { 69 Name: "NoEOLTest", 70 Input: "a,b,c", 71 Output: [][]string{{"a", "b", "c"}}, 72 }, { 73 Name: "Semicolon", 74 Input: "a;b;c\n", 75 Output: [][]string{{"a", "b", "c"}}, 76 Comma: ';', 77 }, { 78 Name: "MultiLine", 79 Input: `"two 80 line","one line","three 81 line 82 field"`, 83 Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}}, 84 }, { 85 Name: "BlankLine", 86 Input: "a,b,c\n\nd,e,f\n\n", 87 Output: [][]string{ 88 {"a", "b", "c"}, 89 {"d", "e", "f"}, 90 }, 91 }, { 92 Name: "BlankLineFieldCount", 93 Input: "a,b,c\n\nd,e,f\n\n", 94 Output: [][]string{ 95 {"a", "b", "c"}, 96 {"d", "e", "f"}, 97 }, 98 UseFieldsPerRecord: true, 99 FieldsPerRecord: 0, 100 }, { 101 Name: "TrimSpace", 102 Input: " a, b, c\n", 103 Output: [][]string{{"a", "b", "c"}}, 104 TrimLeadingSpace: true, 105 }, { 106 Name: "LeadingSpace", 107 Input: " a, b, c\n", 108 Output: [][]string{{" a", " b", " c"}}, 109 }, { 110 Name: "Comment", 111 Input: "#1,2,3\na,b,c\n#comment", 112 Output: [][]string{{"a", "b", "c"}}, 113 Comment: '#', 114 }, { 115 Name: "NoComment", 116 Input: "#1,2,3\na,b,c", 117 Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}}, 118 }, { 119 Name: "LazyQuotes", 120 Input: `a "word","1"2",a","b`, 121 Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}}, 122 LazyQuotes: true, 123 }, { 124 Name: "BareQuotes", 125 Input: `a "word","1"2",a"`, 126 Output: [][]string{{`a "word"`, `1"2`, `a"`}}, 127 LazyQuotes: true, 128 }, { 129 Name: "BareDoubleQuotes", 130 Input: `a""b,c`, 131 Output: [][]string{{`a""b`, `c`}}, 132 LazyQuotes: true, 133 }, { 134 Name: "BadDoubleQuotes", 135 Input: `a""b,c`, 136 Error: &ParseError{StartLine: 1, Line: 1, Column: 1, Err: ErrBareQuote}, 137 }, { 138 Name: "TrimQuote", 139 Input: ` "a"," b",c`, 140 Output: [][]string{{"a", " b", "c"}}, 141 TrimLeadingSpace: true, 142 }, { 143 Name: "BadBareQuote", 144 Input: `a "word","b"`, 145 Error: &ParseError{StartLine: 1, Line: 1, Column: 2, Err: ErrBareQuote}, 146 }, { 147 Name: "BadTrailingQuote", 148 Input: `"a word",b"`, 149 Error: &ParseError{StartLine: 1, Line: 1, Column: 10, Err: ErrBareQuote}, 150 }, { 151 Name: "ExtraneousQuote", 152 Input: `"a "word","b"`, 153 Error: &ParseError{StartLine: 1, Line: 1, Column: 3, Err: ErrQuote}, 154 }, { 155 Name: "BadFieldCount", 156 Input: "a,b,c\nd,e", 157 Error: &ParseError{StartLine: 2, Line: 2, Err: ErrFieldCount}, 158 UseFieldsPerRecord: true, 159 FieldsPerRecord: 0, 160 }, { 161 Name: "BadFieldCount1", 162 Input: `a,b,c`, 163 Error: &ParseError{StartLine: 1, Line: 1, Err: ErrFieldCount}, 164 UseFieldsPerRecord: true, 165 FieldsPerRecord: 2, 166 }, { 167 Name: "FieldCount", 168 Input: "a,b,c\nd,e", 169 Output: [][]string{{"a", "b", "c"}, {"d", "e"}}, 170 }, { 171 Name: "TrailingCommaEOF", 172 Input: "a,b,c,", 173 Output: [][]string{{"a", "b", "c", ""}}, 174 }, { 175 Name: "TrailingCommaEOL", 176 Input: "a,b,c,\n", 177 Output: [][]string{{"a", "b", "c", ""}}, 178 }, { 179 Name: "TrailingCommaSpaceEOF", 180 Input: "a,b,c, ", 181 Output: [][]string{{"a", "b", "c", ""}}, 182 TrimLeadingSpace: true, 183 }, { 184 Name: "TrailingCommaSpaceEOL", 185 Input: "a,b,c, \n", 186 Output: [][]string{{"a", "b", "c", ""}}, 187 TrimLeadingSpace: true, 188 }, { 189 Name: "TrailingCommaLine3", 190 Input: "a,b,c\nd,e,f\ng,hi,", 191 Output: [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}}, 192 TrimLeadingSpace: true, 193 }, { 194 Name: "NotTrailingComma3", 195 Input: "a,b,c, \n", 196 Output: [][]string{{"a", "b", "c", " "}}, 197 }, { 198 Name: "CommaFieldTest", 199 Input: `x,y,z,w 200 x,y,z, 201 x,y,, 202 x,,, 203 ,,, 204 "x","y","z","w" 205 "x","y","z","" 206 "x","y","","" 207 "x","","","" 208 "","","","" 209 `, 210 Output: [][]string{ 211 {"x", "y", "z", "w"}, 212 {"x", "y", "z", ""}, 213 {"x", "y", "", ""}, 214 {"x", "", "", ""}, 215 {"", "", "", ""}, 216 {"x", "y", "z", "w"}, 217 {"x", "y", "z", ""}, 218 {"x", "y", "", ""}, 219 {"x", "", "", ""}, 220 {"", "", "", ""}, 221 }, 222 }, { 223 Name: "TrailingCommaIneffective1", 224 Input: "a,b,\nc,d,e", 225 Output: [][]string{ 226 {"a", "b", ""}, 227 {"c", "d", "e"}, 228 }, 229 TrimLeadingSpace: true, 230 }, { 231 Name: "ReadAllReuseRecord", 232 Input: "a,b\nc,d", 233 Output: [][]string{ 234 {"a", "b"}, 235 {"c", "d"}, 236 }, 237 ReuseRecord: true, 238 }, { 239 Name: "StartLine1", // Issue 19019 240 Input: "a,\"b\nc\"d,e", 241 Error: &ParseError{StartLine: 1, Line: 2, Column: 1, Err: ErrQuote}, 242 }, { 243 Name: "StartLine2", 244 Input: "a,b\n\"d\n\n,e", 245 Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote}, 246 }, { 247 Name: "CRLFInQuotedField", // Issue 21201 248 Input: "\"Hello\r\nHi\"", 249 Output: [][]string{ 250 {"Hello\r\nHi"}, 251 }, 252 }, { 253 Name: "BinaryBlobField", // Issue 19410 254 Input: "x09\x41\xb4\x1c,aktau", 255 Output: [][]string{{"x09A\xb4\x1c", "aktau"}}, 256 }, { 257 Name: "TrailingCR", 258 Input: "field1,field2\r", 259 Output: [][]string{{"field1", "field2"}}, 260 }, { 261 Name: "QuotedTrailingCR", 262 Input: "\"field\"\r", 263 Output: [][]string{{"field"}}, 264 }, { 265 Name: "QuotedTrailingCRCR", 266 Input: "\"field\"\r\r", 267 Error: &ParseError{StartLine: 1, Line: 1, Column: 6, Err: ErrQuote}, 268 }, { 269 Name: "FieldCR", 270 Input: "field\rfield\r", 271 Output: [][]string{{"field\rfield"}}, 272 }, { 273 Name: "FieldCRCR", 274 Input: "field\r\rfield\r\r", 275 Output: [][]string{{"field\r\rfield\r"}}, 276 }, { 277 Name: "FieldCRCRLF", 278 Input: "field\r\r\nfield\r\r\n", 279 Output: [][]string{{"field\r"}, {"field\r"}}, 280 }, { 281 Name: "FieldCRCRLFCR", 282 Input: "field\r\r\n\rfield\r\r\n\r", 283 Output: [][]string{{"field\r"}, {"\rfield\r"}}, 284 }, { 285 Name: "FieldCRCRLFCRCR", 286 Input: "field\r\r\n\r\rfield\r\r\n\r\r", 287 Output: [][]string{{"field\r"}, {"\r\rfield\r"}, {"\r"}}, 288 }, { 289 Name: "MultiFieldCRCRLFCRCR", 290 Input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,", 291 Output: [][]string{ 292 {"field1", "field2\r"}, 293 {"\r\rfield1", "field2\r"}, 294 {"\r\r", ""}, 295 }, 296 }, { 297 Name: "NonASCIICommaAndComment", 298 Input: "a£b,c£ \td,e\n€ comment\n", 299 Output: [][]string{{"a", "b,c", "d,e"}}, 300 TrimLeadingSpace: true, 301 Comma: '£', 302 Comment: '€', 303 }, { 304 Name: "NonASCIICommaAndCommentWithQuotes", 305 Input: "a€\" b,\"€ c\nλ comment\n", 306 Output: [][]string{{"a", " b,", " c"}}, 307 Comma: '€', 308 Comment: 'λ', 309 }, { 310 // λ and θ start with the same byte. 311 // This tests that the parser doesn't confuse such characters. 312 Name: "NonASCIICommaConfusion", 313 Input: "\"abθcd\"λefθgh", 314 Output: [][]string{{"abθcd", "efθgh"}}, 315 Comma: 'λ', 316 Comment: '€', 317 }, { 318 Name: "NonASCIICommentConfusion", 319 Input: "λ\nλ\nθ\nλ\n", 320 Output: [][]string{{"λ"}, {"λ"}, {"λ"}}, 321 Comment: 'θ', 322 }, { 323 Name: "QuotedFieldMultipleLF", 324 Input: "\"\n\n\n\n\"", 325 Output: [][]string{{"\n\n\n\n"}}, 326 }, { 327 Name: "MultipleCRLF", 328 Input: "\r\n\r\n\r\n\r\n", 329 }, { 330 // The implementation may read each line in several chunks if it doesn't fit entirely 331 // in the read buffer, so we should test the code to handle that condition. 332 Name: "HugeLines", 333 Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000), 334 Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}}, 335 Comment: '#', 336 }, { 337 Name: "QuoteWithTrailingCRLF", 338 Input: "\"foo\"bar\"\r\n", 339 Error: &ParseError{StartLine: 1, Line: 1, Column: 4, Err: ErrQuote}, 340 }, { 341 Name: "LazyQuoteWithTrailingCRLF", 342 Input: "\"foo\"bar\"\r\n", 343 Output: [][]string{{`foo"bar`}}, 344 LazyQuotes: true, 345 }, { 346 Name: "DoubleQuoteWithTrailingCRLF", 347 Input: "\"foo\"\"bar\"\r\n", 348 Output: [][]string{{`foo"bar`}}, 349 }, { 350 Name: "EvenQuotes", 351 Input: `""""""""`, 352 Output: [][]string{{`"""`}}, 353 }, { 354 Name: "OddQuotes", 355 Input: `"""""""`, 356 Error: &ParseError{StartLine: 1, Line: 1, Column: 7, Err: ErrQuote}, 357 }, { 358 Name: "LazyOddQuotes", 359 Input: `"""""""`, 360 Output: [][]string{{`"""`}}, 361 LazyQuotes: true, 362 }, { 363 Name: "BadComma1", 364 Comma: '\n', 365 Error: errInvalidDelim, 366 }, { 367 Name: "BadComma2", 368 Comma: '\r', 369 Error: errInvalidDelim, 370 }, { 371 Name: "BadComma3", 372 Comma: utf8.RuneError, 373 Error: errInvalidDelim, 374 }, { 375 Name: "BadComment1", 376 Comment: '\n', 377 Error: errInvalidDelim, 378 }, { 379 Name: "BadComment2", 380 Comment: '\r', 381 Error: errInvalidDelim, 382 }, { 383 Name: "BadComment3", 384 Comment: utf8.RuneError, 385 Error: errInvalidDelim, 386 }, { 387 Name: "BadCommaComment", 388 Comma: 'X', 389 Comment: 'X', 390 Error: errInvalidDelim, 391 }} 392 393 for _, tt := range tests { 394 t.Run(tt.Name, func(t *testing.T) { 395 r := NewReader(strings.NewReader(tt.Input)) 396 397 if tt.Comma != 0 { 398 r.Comma = tt.Comma 399 } 400 r.Comment = tt.Comment 401 if tt.UseFieldsPerRecord { 402 r.FieldsPerRecord = tt.FieldsPerRecord 403 } else { 404 r.FieldsPerRecord = -1 405 } 406 r.LazyQuotes = tt.LazyQuotes 407 r.TrimLeadingSpace = tt.TrimLeadingSpace 408 r.ReuseRecord = tt.ReuseRecord 409 410 out, err := r.ReadAll() 411 if !reflect.DeepEqual(err, tt.Error) { 412 t.Errorf("ReadAll() error:\ngot %v\nwant %v", err, tt.Error) 413 } else if !reflect.DeepEqual(out, tt.Output) { 414 t.Errorf("ReadAll() output:\ngot %q\nwant %q", out, tt.Output) 415 } 416 417 // Check that the error can be rendered. 418 if err != nil { 419 _ = err.Error() 420 } 421 }) 422 } 423 } 424 425 // nTimes is an io.Reader which yields the string s n times. 426 type nTimes struct { 427 s string 428 n int 429 off int 430 } 431 432 func (r *nTimes) Read(p []byte) (n int, err error) { 433 for { 434 if r.n <= 0 || r.s == "" { 435 return n, io.EOF 436 } 437 n0 := copy(p, r.s[r.off:]) 438 p = p[n0:] 439 n += n0 440 r.off += n0 441 if r.off == len(r.s) { 442 r.off = 0 443 r.n-- 444 } 445 if len(p) == 0 { 446 return 447 } 448 } 449 } 450 451 // benchmarkRead measures reading the provided CSV rows data. 452 // initReader, if non-nil, modifies the Reader before it's used. 453 func benchmarkRead(b *testing.B, initReader func(*Reader), rows string) { 454 b.ReportAllocs() 455 r := NewReader(&nTimes{s: rows, n: b.N}) 456 if initReader != nil { 457 initReader(r) 458 } 459 for { 460 _, err := r.Read() 461 if err == io.EOF { 462 break 463 } 464 if err != nil { 465 b.Fatal(err) 466 } 467 } 468 } 469 470 const benchmarkCSVData = `x,y,z,w 471 x,y,z, 472 x,y,, 473 x,,, 474 ,,, 475 "x","y","z","w" 476 "x","y","z","" 477 "x","y","","" 478 "x","","","" 479 "","","","" 480 ` 481 482 func BenchmarkRead(b *testing.B) { 483 benchmarkRead(b, nil, benchmarkCSVData) 484 } 485 486 func BenchmarkReadWithFieldsPerRecord(b *testing.B) { 487 benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = 4 }, benchmarkCSVData) 488 } 489 490 func BenchmarkReadWithoutFieldsPerRecord(b *testing.B) { 491 benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = -1 }, benchmarkCSVData) 492 } 493 494 func BenchmarkReadLargeFields(b *testing.B) { 495 benchmarkRead(b, nil, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 496 xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv 497 ,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 498 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 499 `, 3)) 500 } 501 502 func BenchmarkReadReuseRecord(b *testing.B) { 503 benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData) 504 } 505 506 func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) { 507 benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData) 508 } 509 510 func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) { 511 benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData) 512 } 513 514 func BenchmarkReadReuseRecordLargeFields(b *testing.B) { 515 benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 516 xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv 517 ,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 518 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv 519 `, 3)) 520 }