code.gitea.io/gitea@v1.19.3/modules/csv/csv_test.go (about) 1 // Copyright 2021 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package csv 5 6 import ( 7 "bytes" 8 "encoding/csv" 9 "io" 10 "strconv" 11 "strings" 12 "testing" 13 14 "code.gitea.io/gitea/modules/git" 15 "code.gitea.io/gitea/modules/markup" 16 17 "github.com/stretchr/testify/assert" 18 ) 19 20 func TestCreateReader(t *testing.T) { 21 rd := CreateReader(bytes.NewReader([]byte{}), ',') 22 assert.Equal(t, ',', rd.Comma) 23 } 24 25 func decodeSlashes(t *testing.T, s string) string { 26 s = strings.ReplaceAll(s, "\n", "\\n") 27 s = strings.ReplaceAll(s, "\"", "\\\"") 28 decoded, err := strconv.Unquote(`"` + s + `"`) 29 assert.NoError(t, err, "unable to decode string") 30 return decoded 31 } 32 33 func TestCreateReaderAndDetermineDelimiter(t *testing.T) { 34 cases := []struct { 35 csv string 36 expectedRows [][]string 37 expectedDelimiter rune 38 }{ 39 // case 0 - semicolon delimited 40 { 41 csv: `a;b;c 42 1;2;3 43 4;5;6`, 44 expectedRows: [][]string{ 45 {"a", "b", "c"}, 46 {"1", "2", "3"}, 47 {"4", "5", "6"}, 48 }, 49 expectedDelimiter: ';', 50 }, 51 // case 1 - tab delimited with empty fields 52 { 53 csv: `col1 col2 col3 54 a, b c 55 e f 56 g h i 57 j l 58 m n,\t 59 p q r 60 u 61 v w x 62 y\t\t 63 `, 64 expectedRows: [][]string{ 65 {"col1", "col2", "col3"}, 66 {"a,", "b", "c"}, 67 {"", "e", "f"}, 68 {"g", "h", "i"}, 69 {"j", "", "l"}, 70 {"m", "n,", ""}, 71 {"p", "q", "r"}, 72 {"", "", "u"}, 73 {"v", "w", "x"}, 74 {"y", "", ""}, 75 {"", "", ""}, 76 }, 77 expectedDelimiter: '\t', 78 }, 79 // case 2 - comma delimited with leading spaces 80 { 81 csv: ` col1,col2,col3 82 a, b, c 83 d,e,f 84 ,h, i 85 j, ,\x20 86 , , `, 87 expectedRows: [][]string{ 88 {"col1", "col2", "col3"}, 89 {"a", "b", "c"}, 90 {"d", "e", "f"}, 91 {"", "h", "i"}, 92 {"j", "", ""}, 93 {"", "", ""}, 94 }, 95 expectedDelimiter: ',', 96 }, 97 } 98 99 for n, c := range cases { 100 rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(decodeSlashes(t, c.csv))) 101 assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err) 102 assert.EqualValues(t, c.expectedDelimiter, rd.Comma, "case %d: delimiter should be '%c', got '%c'", n, c.expectedDelimiter, rd.Comma) 103 rows, err := rd.ReadAll() 104 assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err) 105 assert.EqualValues(t, c.expectedRows, rows, "case %d: rows should be equal", n) 106 } 107 } 108 109 type mockReader struct{} 110 111 func (r *mockReader) Read(buf []byte) (int, error) { 112 return 0, io.ErrShortBuffer 113 } 114 115 func TestDetermineDelimiterShortBufferError(t *testing.T) { 116 rd, err := CreateReaderAndDetermineDelimiter(nil, &mockReader{}) 117 assert.Error(t, err, "CreateReaderAndDetermineDelimiter() should throw an error") 118 assert.ErrorIs(t, err, io.ErrShortBuffer) 119 assert.Nil(t, rd, "CSV reader should be mnil") 120 } 121 122 func TestDetermineDelimiterReadAllError(t *testing.T) { 123 rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(`col1,col2 124 a;b 125 c@e 126 f g 127 h|i 128 jkl`)) 129 assert.NoError(t, err, "CreateReaderAndDetermineDelimiter() shouldn't throw error") 130 assert.NotNil(t, rd, "CSV reader should not be mnil") 131 rows, err := rd.ReadAll() 132 assert.Error(t, err, "RaadAll() should throw error") 133 assert.ErrorIs(t, err, csv.ErrFieldCount) 134 assert.Empty(t, rows, "rows should be empty") 135 } 136 137 func TestDetermineDelimiter(t *testing.T) { 138 cases := []struct { 139 csv string 140 filename string 141 expectedDelimiter rune 142 }{ 143 // case 0 - semicolon delmited 144 { 145 csv: "a", 146 filename: "test.csv", 147 expectedDelimiter: ',', 148 }, 149 // case 1 - single column/row CSV 150 { 151 csv: "a", 152 filename: "", 153 expectedDelimiter: ',', 154 }, 155 // case 2 - single column, single row CSV w/ tsv file extension (so is tabbed delimited) 156 { 157 csv: "1,2", 158 filename: "test.tsv", 159 expectedDelimiter: '\t', 160 }, 161 // case 3 - two column, single row CSV w/ no filename, so will guess comma as delimiter 162 { 163 csv: "1,2", 164 filename: "", 165 expectedDelimiter: ',', 166 }, 167 // case 4 - semi-colon delimited with csv extension 168 { 169 csv: "1;2", 170 filename: "test.csv", 171 expectedDelimiter: ';', 172 }, 173 // case 5 - tabbed delimited with tsv extension 174 { 175 csv: "1\t2", 176 filename: "test.tsv", 177 expectedDelimiter: '\t', 178 }, 179 // case 6 - tabbed delimited without any filename 180 { 181 csv: "1\t2", 182 filename: "", 183 expectedDelimiter: '\t', 184 }, 185 // case 7 - tabs won't work, only commas as every row has same amount of commas 186 { 187 csv: "col1,col2\nfirst\tval,seconed\tval", 188 filename: "", 189 expectedDelimiter: ',', 190 }, 191 // case 8 - While looks like comma delimited, has psv extension 192 { 193 csv: "1,2", 194 filename: "test.psv", 195 expectedDelimiter: '|', 196 }, 197 // case 9 - pipe delmiited with no extension 198 { 199 csv: "1|2", 200 filename: "", 201 expectedDelimiter: '|', 202 }, 203 // case 10 - semi-colon delimited with commas in values 204 { 205 csv: "1,2,3;4,5,6;7,8,9\na;b;c", 206 filename: "", 207 expectedDelimiter: ';', 208 }, 209 // case 11 - semi-colon delimited with newline in content 210 { 211 csv: `"1,2,3,4";"a 212 b";% 213 c;d;#`, 214 filename: "", 215 expectedDelimiter: ';', 216 }, 217 // case 12 - HTML as single value 218 { 219 csv: "<br/>", 220 filename: "", 221 expectedDelimiter: ',', 222 }, 223 // case 13 - tab delimited with commas in values 224 { 225 csv: `name email note 226 John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`, 227 filename: "", 228 expectedDelimiter: '\t', 229 }, 230 } 231 232 for n, c := range cases { 233 delimiter := determineDelimiter(&markup.RenderContext{ 234 Ctx: git.DefaultContext, 235 RelativePath: c.filename, 236 }, []byte(decodeSlashes(t, c.csv))) 237 assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter) 238 } 239 } 240 241 func TestRemoveQuotedString(t *testing.T) { 242 cases := []struct { 243 text string 244 expectedText string 245 }{ 246 // case 0 - quoted text with escaped quotes in 1st column 247 { 248 text: `col1,col2,col3 249 "quoted ""text"" with 250 new lines 251 in first column",b,c`, 252 expectedText: `col1,col2,col3 253 ,b,c`, 254 }, 255 // case 1 - quoted text with escaped quotes in 2nd column 256 { 257 text: `col1,col2,col3 258 a,"quoted ""text"" with 259 new lines 260 in second column",c`, 261 expectedText: `col1,col2,col3 262 a,,c`, 263 }, 264 // case 2 - quoted text with escaped quotes in last column 265 { 266 text: `col1,col2,col3 267 a,b,"quoted ""text"" with 268 new lines 269 in last column"`, 270 expectedText: `col1,col2,col3 271 a,b,`, 272 }, 273 // case 3 - csv with lots of quotes 274 { 275 text: `a,"b",c,d,"e 276 e 277 e",f 278 a,bb,c,d,ee ,"f 279 f" 280 a,b,"c "" 281 c",d,e,f`, 282 expectedText: `a,,c,d,,f 283 a,bb,c,d,ee , 284 a,b,,d,e,f`, 285 }, 286 // case 4 - csv with pipes and quotes 287 { 288 text: `Col1 | Col2 | Col3 289 abc | "Hello 290 World"|123 291 "de 292 293 f" | 4.56 | 789`, 294 expectedText: `Col1 | Col2 | Col3 295 abc | |123 296 | 4.56 | 789`, 297 }, 298 } 299 300 for n, c := range cases { 301 modifiedText := removeQuotedString(decodeSlashes(t, c.text)) 302 assert.EqualValues(t, c.expectedText, modifiedText, "case %d: modified text should be equal", n) 303 } 304 } 305 306 func TestGuessDelimiter(t *testing.T) { 307 cases := []struct { 308 csv string 309 expectedDelimiter rune 310 }{ 311 // case 0 - single cell, comma delmited 312 { 313 csv: "a", 314 expectedDelimiter: ',', 315 }, 316 // case 1 - two cells, comma delimited 317 { 318 csv: "1,2", 319 expectedDelimiter: ',', 320 }, 321 // case 2 - semicolon delimited 322 { 323 csv: "1;2", 324 expectedDelimiter: ';', 325 }, 326 // case 3 - tab delimited 327 { 328 csv: "1\t2", 329 expectedDelimiter: '\t', 330 }, 331 // case 4 - pipe delimited 332 { 333 csv: "1|2", 334 expectedDelimiter: '|', 335 }, 336 // case 5 - semicolon delimited with commas in text 337 { 338 csv: `1,2,3;4,5,6;7,8,9 339 a;b;c`, 340 expectedDelimiter: ';', 341 }, 342 // case 6 - semicolon delmited with commas in quoted text 343 { 344 csv: `"1,2,3,4";"a 345 b" 346 c;d`, 347 expectedDelimiter: ';', 348 }, 349 // case 7 - HTML 350 { 351 csv: "<br/>", 352 expectedDelimiter: ',', 353 }, 354 // case 8 - tab delimited with commas in value 355 { 356 csv: `name email note 357 John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`, 358 expectedDelimiter: '\t', 359 }, 360 // case 9 - tab delimited with new lines in values, commas in values 361 { 362 csv: `1 "some,""more 363 "" 364 quoted, 365 text," a 366 2 "some, 367 quoted,\t 368 text," b 369 3 "some, 370 quoted, 371 text" c 372 4 "some, 373 quoted, 374 text," d`, 375 expectedDelimiter: '\t', 376 }, 377 // case 10 - semicolon delmited with quotes and semicolon in value 378 { 379 csv: `col1;col2 380 "this has a literal "" in the text";"and an ; in the text"`, 381 expectedDelimiter: ';', 382 }, 383 // case 11 - pipe delimited with quotes 384 { 385 csv: `Col1 | Col2 | Col3 386 abc | "Hello 387 World"|123 388 "de 389 | 390 f" | 4.56 | 789`, 391 expectedDelimiter: '|', 392 }, 393 // case 12 - a tab delimited 6 column CSV, but the values are not quoted and have lots of commas. 394 // In the previous bestScore algorithm, this would have picked comma as the delimiter, but now it should guess tab 395 { 396 csv: `c1 c2 c3 c4 c5 c6 397 v,k,x,v ym,f,oa,qn,uqijh,n,s,wvygpo uj,kt,j,w,i,fvv,tm,f,ddt,b,mwt,e,t,teq,rd,p,a e,wfuae,t,h,q,im,ix,y h,mrlu,l,dz,ff,zi,af,emh ,gov,bmfelvb,axp,f,u,i,cni,x,z,v,sh,w,jo,,m,h 398 k,ohf,pgr,tde,m,s te,ek,,v,,ic,kqc,dv,w,oi,j,w,gojjr,ug,,l,j,zl g,qziq,bcajx,zfow,ka,j,re,ohbc k,nzm,qm,ts,auf th,elb,lx,l,q,e,qf asbr,z,k,y,tltobga 399 g,m,bu,el h,l,jwi,o,wge,fy,rure,c,g,lcxu,fxte,uns,cl,s,o,t,h,rsoy,f bq,s,uov,z,ikkhgyg,,sabs,c,hzue mc,b,,j,t,n sp,mn,,m,t,dysi,eq,pigb,rfa,z w,rfli,sg,,o,wjjjf,f,wxdzfk,x,t,p,zy,p,mg,r,l,h 400 e,ewbkc,nugd,jj,sf,ih,i,n,jo,b,poem,kw,q,i,x,t,e,uug,k j,xm,sch,ux,h,,fb,f,pq,,mh,,f,v,,oba,w,h,v,eiz,yzd,o,a,c,e,dhp,q a,pbef,epc,k,rdpuw,cw k,j,e,d xf,dz,sviv,w,sqnzew,t,b v,yg,f,cq,ti,g,m,ta,hm,ym,ii,hxy,p,z,r,e,ga,sfs,r,p,l,aar,w,kox,j 401 l,d,v,pp,q,j,bxip,w,i,im,qa,o e,o h,w,a,a,qzj,nt,qfn,ut,fvhu,ts hu,q,g,p,q,ofpje,fsqa,frp,p,vih,j,w,k,jx, ln,th,ka,l,b,vgk,rv,hkx rj,v,y,cwm,rao,e,l,wvr,ptc,lm,yg,u,k,i,b,zk,b,gv,fls 402 velxtnhlyuysbnlchosqlhkozkdapjaueexjwrndwb nglvnv kqiv pbshwlmcexdzipopxjyrxhvjalwp pydvipwlkkpdvbtepahskwuornbsb qwbacgq 403 l,y,u,bf,y,m,eals,n,cop,h,g,vs,jga,opt x,b,zwmn,hh,b,n,pdj,t,d px yn,vtd,u,y,b,ps,yo,qqnem,mxg,m,al,rd,c,k,d,q,f ilxdxa,m,y,,p,p,y,prgmg,q,n,etj,k,ns b,pl,z,jq,hk 404 p,gc jn,mzr,bw sb,e,r,dy,ur,wzy,r,c,n,yglr,jbdu,r,pqk,k q,d,,,p,l,euhl,dc,rwh,t,tq,z,h,p,s,t,x,fugr,h wi,zxb,jcig,o,t,k mfh,ym,h,e,p,cnvx,uv,zx,x,pq,blt,v,r,u,tr,g,g,xt 405 nri,p,,t,if,,y,ptlqq a,i w,ovli,um,w,f,re,k,sb,w,jy,zf i,g,p,q,mii,nr,jm,cc i,szl,k,eg,l,d ,ah,w,b,vh 406 ,,sh,wx,mn,xm,u,d,yy,u,t,m,j,s,b ogadq,g,y,y,i,h,ln,jda,g,cz,s,rv,r,s,s,le,r, y,nu,f,nagj o,h,,adfy,o,nf,ns,gvsvnub,k,b,xyz v,h,g,ef,y,gb c,x,cw,x,go,h,t,x,cu,u,qgrqzrcmn,kq,cd,g,rejp,zcq 407 skxg,t,vay,d,wug,d,xg,sexc rt g,ag,mjq,fjnyji,iwa,m,ml,b,ua,b,qjxeoc be,s,sh,n,jbzxs,g,n,i,h,y,r,be,mfo,u,p cw,r,,u,zn,eg,r,yac,m,l,edkr,ha,x,g,b,c,tg,c j,ye,u,ejd,maj,ea,bm,u,iy`, 408 expectedDelimiter: '\t', 409 }, 410 // case 13 - a CSV with more than 10 lines and since we only use the first 10 lines, it should still get the delimiter as semicolon 411 { 412 csv: `col1;col2;col3 413 1;1;1 414 2;2;2 415 3;3;3 416 4;4;4 417 5;5;5 418 6;6;6 419 7;7;7 420 8;8;8 421 9;9;9 422 10;10;10 423 11 11 11 424 12|12|12`, 425 expectedDelimiter: ';', 426 }, 427 // case 14 - a really long single line (over 10k) that will get truncated, but since it has commas and semicolons (but more semicolons) it will pick semicolon 428 { 429 csv: strings.Repeat("a;b,c;", 1700), 430 expectedDelimiter: ';', 431 }, 432 // case 15 - 2 lines that are well over 10k, but since the 2nd line is where this CSV will be truncated (10k sample), it will only use the first line, so semicolon will be picked 433 { 434 csv: "col1@col2@col3\na@b@" + strings.Repeat("c", 6000) + "\nd,e," + strings.Repeat("f", 4000), 435 expectedDelimiter: '@', 436 }, 437 // case 16 - has all delimiters so should return comma 438 { 439 csv: `col1,col2;col3@col4|col5 col6 440 a b|c@d;e,f`, 441 expectedDelimiter: ',', 442 }, 443 // case 16 - nothing works (bad csv) so returns comma by default 444 { 445 csv: `col1,col2 446 a;b 447 c@e 448 f g 449 h|i 450 jkl`, 451 expectedDelimiter: ',', 452 }, 453 } 454 455 for n, c := range cases { 456 delimiter := guessDelimiter([]byte(decodeSlashes(t, c.csv))) 457 assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter) 458 } 459 } 460 461 func TestGuessFromBeforeAfterQuotes(t *testing.T) { 462 cases := []struct { 463 csv string 464 expectedDelimiter rune 465 }{ 466 // case 0 - tab delimited with new lines in values, commas in values 467 { 468 csv: `1 "some,""more 469 "" 470 quoted, 471 text," a 472 2 "some, 473 quoted,\t 474 text," b 475 3 "some, 476 quoted, 477 text" c 478 4 "some, 479 quoted, 480 text," d`, 481 expectedDelimiter: '\t', 482 }, 483 // case 1 - semicolon delmited with quotes and semicolon in value 484 { 485 csv: `col1;col2 486 "this has a literal "" in the text";"and an ; in the text"`, 487 expectedDelimiter: ';', 488 }, 489 // case 2 - pipe delimited with quotes 490 { 491 csv: `Col1 | Col2 | Col3 492 abc | "Hello 493 World"|123 494 "de 495 | 496 f" | 4.56 | 789`, 497 expectedDelimiter: '|', 498 }, 499 // case 3 - a complicated quoted CSV that is semicolon delmiited 500 { 501 csv: `he; she 502 "he said, ""hey!"""; "she said, ""hey back!""" 503 but; "be"`, 504 expectedDelimiter: ';', 505 }, 506 // case 4 - no delimiter should be found 507 { 508 csv: `a,b`, 509 expectedDelimiter: 0, 510 }, 511 // case 5 - no limiter should be found 512 { 513 csv: `col1 514 "he said, ""here I am"""`, 515 expectedDelimiter: 0, 516 }, 517 // case 6 - delimiter before double quoted string with space 518 { 519 csv: `col1|col2 520 a| "he said, ""here I am"""`, 521 expectedDelimiter: '|', 522 }, 523 // case 7 - delimiter before double quoted string without space 524 { 525 csv: `col1|col2 526 a|"he said, ""here I am"""`, 527 expectedDelimiter: '|', 528 }, 529 // case 8 - delimiter after double quoted string with space 530 { 531 csv: `col1, col2 532 "abc\n 533 534 ", def`, 535 expectedDelimiter: ',', 536 }, 537 // case 9 - delimiter after double quoted string without space 538 { 539 csv: `col1,col2 540 "abc\n 541 542 ",def`, 543 expectedDelimiter: ',', 544 }, 545 } 546 547 for n, c := range cases { 548 delimiter := guessFromBeforeAfterQuotes([]byte(decodeSlashes(t, c.csv))) 549 assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter) 550 } 551 } 552 553 type mockLocale struct{} 554 555 func (l mockLocale) Language() string { 556 return "en" 557 } 558 559 func (l mockLocale) Tr(s string, _ ...interface{}) string { 560 return s 561 } 562 563 func (l mockLocale) TrN(_cnt interface{}, key1, _keyN string, _args ...interface{}) string { 564 return key1 565 } 566 567 func TestFormatError(t *testing.T) { 568 cases := []struct { 569 err error 570 expectedMessage string 571 expectsError bool 572 }{ 573 { 574 err: &csv.ParseError{ 575 Err: csv.ErrFieldCount, 576 }, 577 expectedMessage: "repo.error.csv.invalid_field_count", 578 expectsError: false, 579 }, 580 { 581 err: &csv.ParseError{ 582 Err: csv.ErrBareQuote, 583 }, 584 expectedMessage: "repo.error.csv.unexpected", 585 expectsError: false, 586 }, 587 { 588 err: bytes.ErrTooLarge, 589 expectsError: true, 590 }, 591 } 592 593 for n, c := range cases { 594 message, err := FormatError(c.err, mockLocale{}) 595 if c.expectsError { 596 assert.Error(t, err, "case %d: expected an error to be returned", n) 597 } else { 598 assert.NoError(t, err, "case %d: no error was expected, got error: %v", n, err) 599 assert.EqualValues(t, c.expectedMessage, message, "case %d: messages should be equal, expected '%s' got '%s'", n, c.expectedMessage, message) 600 } 601 } 602 }