code.gitea.io/gitea@v1.22.3/modules/csv/csv_test.go (about) 1 // Copyright 2021 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package csv 5 6 import ( 7 "bytes" 8 "encoding/csv" 9 "io" 10 "strconv" 11 "strings" 12 "testing" 13 14 "code.gitea.io/gitea/modules/git" 15 "code.gitea.io/gitea/modules/markup" 16 "code.gitea.io/gitea/modules/translation" 17 18 "github.com/stretchr/testify/assert" 19 ) 20 21 func TestCreateReader(t *testing.T) { 22 rd := CreateReader(bytes.NewReader([]byte{}), ',') 23 assert.Equal(t, ',', rd.Comma) 24 } 25 26 func decodeSlashes(t *testing.T, s string) string { 27 s = strings.ReplaceAll(s, "\n", "\\n") 28 s = strings.ReplaceAll(s, "\"", "\\\"") 29 decoded, err := strconv.Unquote(`"` + s + `"`) 30 assert.NoError(t, err, "unable to decode string") 31 return decoded 32 } 33 34 func TestCreateReaderAndDetermineDelimiter(t *testing.T) { 35 cases := []struct { 36 csv string 37 expectedRows [][]string 38 expectedDelimiter rune 39 }{ 40 // case 0 - semicolon delimited 41 { 42 csv: `a;b;c 43 1;2;3 44 4;5;6`, 45 expectedRows: [][]string{ 46 {"a", "b", "c"}, 47 {"1", "2", "3"}, 48 {"4", "5", "6"}, 49 }, 50 expectedDelimiter: ';', 51 }, 52 // case 1 - tab delimited with empty fields 53 { 54 csv: `col1 col2 col3 55 a, b c 56 e f 57 g h i 58 j l 59 m n,\t 60 p q r 61 u 62 v w x 63 y\t\t 64 `, 65 expectedRows: [][]string{ 66 {"col1", "col2", "col3"}, 67 {"a,", "b", "c"}, 68 {"", "e", "f"}, 69 {"g", "h", "i"}, 70 {"j", "", "l"}, 71 {"m", "n,", ""}, 72 {"p", "q", "r"}, 73 {"", "", "u"}, 74 {"v", "w", "x"}, 75 {"y", "", ""}, 76 {"", "", ""}, 77 }, 78 expectedDelimiter: '\t', 79 }, 80 // case 2 - comma delimited with leading spaces 81 { 82 csv: ` col1,col2,col3 83 a, b, c 84 d,e,f 85 ,h, i 86 j, ,\x20 87 , , `, 88 expectedRows: [][]string{ 89 {"col1", "col2", "col3"}, 90 {"a", "b", "c"}, 91 {"d", "e", "f"}, 92 {"", "h", "i"}, 93 {"j", "", ""}, 94 {"", "", ""}, 95 }, 96 expectedDelimiter: ',', 97 }, 98 } 99 100 for n, c := range cases { 101 rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(decodeSlashes(t, c.csv))) 102 assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err) 103 assert.EqualValues(t, c.expectedDelimiter, rd.Comma, "case %d: delimiter should be '%c', got '%c'", n, c.expectedDelimiter, rd.Comma) 104 rows, err := rd.ReadAll() 105 assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err) 106 assert.EqualValues(t, c.expectedRows, rows, "case %d: rows should be equal", n) 107 } 108 } 109 110 type mockReader struct{} 111 112 func (r *mockReader) Read(buf []byte) (int, error) { 113 return 0, io.ErrShortBuffer 114 } 115 116 func TestDetermineDelimiterShortBufferError(t *testing.T) { 117 rd, err := CreateReaderAndDetermineDelimiter(nil, &mockReader{}) 118 assert.Error(t, err, "CreateReaderAndDetermineDelimiter() should throw an error") 119 assert.ErrorIs(t, err, io.ErrShortBuffer) 120 assert.Nil(t, rd, "CSV reader should be mnil") 121 } 122 123 func TestDetermineDelimiterReadAllError(t *testing.T) { 124 rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(`col1,col2 125 a;b 126 c@e 127 f g 128 h|i 129 jkl`)) 130 assert.NoError(t, err, "CreateReaderAndDetermineDelimiter() shouldn't throw error") 131 assert.NotNil(t, rd, "CSV reader should not be mnil") 132 rows, err := rd.ReadAll() 133 assert.Error(t, err, "RaadAll() should throw error") 134 assert.ErrorIs(t, err, csv.ErrFieldCount) 135 assert.Empty(t, rows, "rows should be empty") 136 } 137 138 func TestDetermineDelimiter(t *testing.T) { 139 cases := []struct { 140 csv string 141 filename string 142 expectedDelimiter rune 143 }{ 144 // case 0 - semicolon delmited 145 { 146 csv: "a", 147 filename: "test.csv", 148 expectedDelimiter: ',', 149 }, 150 // case 1 - single column/row CSV 151 { 152 csv: "a", 153 filename: "", 154 expectedDelimiter: ',', 155 }, 156 // case 2 - single column, single row CSV w/ tsv file extension (so is tabbed delimited) 157 { 158 csv: "1,2", 159 filename: "test.tsv", 160 expectedDelimiter: '\t', 161 }, 162 // case 3 - two column, single row CSV w/ no filename, so will guess comma as delimiter 163 { 164 csv: "1,2", 165 filename: "", 166 expectedDelimiter: ',', 167 }, 168 // case 4 - semi-colon delimited with csv extension 169 { 170 csv: "1;2", 171 filename: "test.csv", 172 expectedDelimiter: ';', 173 }, 174 // case 5 - tabbed delimited with tsv extension 175 { 176 csv: "1\t2", 177 filename: "test.tsv", 178 expectedDelimiter: '\t', 179 }, 180 // case 6 - tabbed delimited without any filename 181 { 182 csv: "1\t2", 183 filename: "", 184 expectedDelimiter: '\t', 185 }, 186 // case 7 - tabs won't work, only commas as every row has same amount of commas 187 { 188 csv: "col1,col2\nfirst\tval,seconed\tval", 189 filename: "", 190 expectedDelimiter: ',', 191 }, 192 // case 8 - While looks like comma delimited, has psv extension 193 { 194 csv: "1,2", 195 filename: "test.psv", 196 expectedDelimiter: '|', 197 }, 198 // case 9 - pipe delmiited with no extension 199 { 200 csv: "1|2", 201 filename: "", 202 expectedDelimiter: '|', 203 }, 204 // case 10 - semi-colon delimited with commas in values 205 { 206 csv: "1,2,3;4,5,6;7,8,9\na;b;c", 207 filename: "", 208 expectedDelimiter: ';', 209 }, 210 // case 11 - semi-colon delimited with newline in content 211 { 212 csv: `"1,2,3,4";"a 213 b";% 214 c;d;#`, 215 filename: "", 216 expectedDelimiter: ';', 217 }, 218 // case 12 - HTML as single value 219 { 220 csv: "<br/>", 221 filename: "", 222 expectedDelimiter: ',', 223 }, 224 // case 13 - tab delimited with commas in values 225 { 226 csv: `name email note 227 John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`, 228 filename: "", 229 expectedDelimiter: '\t', 230 }, 231 } 232 233 for n, c := range cases { 234 delimiter := determineDelimiter(&markup.RenderContext{ 235 Ctx: git.DefaultContext, 236 RelativePath: c.filename, 237 }, []byte(decodeSlashes(t, c.csv))) 238 assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter) 239 } 240 } 241 242 func TestRemoveQuotedString(t *testing.T) { 243 cases := []struct { 244 text string 245 expectedText string 246 }{ 247 // case 0 - quoted text with escaped quotes in 1st column 248 { 249 text: `col1,col2,col3 250 "quoted ""text"" with 251 new lines 252 in first column",b,c`, 253 expectedText: `col1,col2,col3 254 ,b,c`, 255 }, 256 // case 1 - quoted text with escaped quotes in 2nd column 257 { 258 text: `col1,col2,col3 259 a,"quoted ""text"" with 260 new lines 261 in second column",c`, 262 expectedText: `col1,col2,col3 263 a,,c`, 264 }, 265 // case 2 - quoted text with escaped quotes in last column 266 { 267 text: `col1,col2,col3 268 a,b,"quoted ""text"" with 269 new lines 270 in last column"`, 271 expectedText: `col1,col2,col3 272 a,b,`, 273 }, 274 // case 3 - csv with lots of quotes 275 { 276 text: `a,"b",c,d,"e 277 e 278 e",f 279 a,bb,c,d,ee ,"f 280 f" 281 a,b,"c "" 282 c",d,e,f`, 283 expectedText: `a,,c,d,,f 284 a,bb,c,d,ee , 285 a,b,,d,e,f`, 286 }, 287 // case 4 - csv with pipes and quotes 288 { 289 text: `Col1 | Col2 | Col3 290 abc | "Hello 291 World"|123 292 "de 293 294 f" | 4.56 | 789`, 295 expectedText: `Col1 | Col2 | Col3 296 abc | |123 297 | 4.56 | 789`, 298 }, 299 } 300 301 for n, c := range cases { 302 modifiedText := removeQuotedString(decodeSlashes(t, c.text)) 303 assert.EqualValues(t, c.expectedText, modifiedText, "case %d: modified text should be equal", n) 304 } 305 } 306 307 func TestGuessDelimiter(t *testing.T) { 308 cases := []struct { 309 csv string 310 expectedDelimiter rune 311 }{ 312 // case 0 - single cell, comma delmited 313 { 314 csv: "a", 315 expectedDelimiter: ',', 316 }, 317 // case 1 - two cells, comma delimited 318 { 319 csv: "1,2", 320 expectedDelimiter: ',', 321 }, 322 // case 2 - semicolon delimited 323 { 324 csv: "1;2", 325 expectedDelimiter: ';', 326 }, 327 // case 3 - tab delimited 328 { 329 csv: "1\t2", 330 expectedDelimiter: '\t', 331 }, 332 // case 4 - pipe delimited 333 { 334 csv: "1|2", 335 expectedDelimiter: '|', 336 }, 337 // case 5 - semicolon delimited with commas in text 338 { 339 csv: `1,2,3;4,5,6;7,8,9 340 a;b;c`, 341 expectedDelimiter: ';', 342 }, 343 // case 6 - semicolon delmited with commas in quoted text 344 { 345 csv: `"1,2,3,4";"a 346 b" 347 c;d`, 348 expectedDelimiter: ';', 349 }, 350 // case 7 - HTML 351 { 352 csv: "<br/>", 353 expectedDelimiter: ',', 354 }, 355 // case 8 - tab delimited with commas in value 356 { 357 csv: `name email note 358 John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`, 359 expectedDelimiter: '\t', 360 }, 361 // case 9 - tab delimited with new lines in values, commas in values 362 { 363 csv: `1 "some,""more 364 "" 365 quoted, 366 text," a 367 2 "some, 368 quoted,\t 369 text," b 370 3 "some, 371 quoted, 372 text" c 373 4 "some, 374 quoted, 375 text," d`, 376 expectedDelimiter: '\t', 377 }, 378 // case 10 - semicolon delmited with quotes and semicolon in value 379 { 380 csv: `col1;col2 381 "this has a literal "" in the text";"and an ; in the text"`, 382 expectedDelimiter: ';', 383 }, 384 // case 11 - pipe delimited with quotes 385 { 386 csv: `Col1 | Col2 | Col3 387 abc | "Hello 388 World"|123 389 "de 390 | 391 f" | 4.56 | 789`, 392 expectedDelimiter: '|', 393 }, 394 // case 12 - a tab delimited 6 column CSV, but the values are not quoted and have lots of commas. 395 // In the previous bestScore algorithm, this would have picked comma as the delimiter, but now it should guess tab 396 { 397 csv: `c1 c2 c3 c4 c5 c6 398 v,k,x,v ym,f,oa,qn,uqijh,n,s,wvygpo uj,kt,j,w,i,fvv,tm,f,ddt,b,mwt,e,t,teq,rd,p,a e,wfuae,t,h,q,im,ix,y h,mrlu,l,dz,ff,zi,af,emh ,gov,bmfelvb,axp,f,u,i,cni,x,z,v,sh,w,jo,,m,h 399 k,ohf,pgr,tde,m,s te,ek,,v,,ic,kqc,dv,w,oi,j,w,gojjr,ug,,l,j,zl g,qziq,bcajx,zfow,ka,j,re,ohbc k,nzm,qm,ts,auf th,elb,lx,l,q,e,qf asbr,z,k,y,tltobga 400 g,m,bu,el h,l,jwi,o,wge,fy,rure,c,g,lcxu,fxte,uns,cl,s,o,t,h,rsoy,f bq,s,uov,z,ikkhgyg,,sabs,c,hzue mc,b,,j,t,n sp,mn,,m,t,dysi,eq,pigb,rfa,z w,rfli,sg,,o,wjjjf,f,wxdzfk,x,t,p,zy,p,mg,r,l,h 401 e,ewbkc,nugd,jj,sf,ih,i,n,jo,b,poem,kw,q,i,x,t,e,uug,k j,xm,sch,ux,h,,fb,f,pq,,mh,,f,v,,oba,w,h,v,eiz,yzd,o,a,c,e,dhp,q a,pbef,epc,k,rdpuw,cw k,j,e,d xf,dz,sviv,w,sqnzew,t,b v,yg,f,cq,ti,g,m,ta,hm,ym,ii,hxy,p,z,r,e,ga,sfs,r,p,l,aar,w,kox,j 402 l,d,v,pp,q,j,bxip,w,i,im,qa,o e,o h,w,a,a,qzj,nt,qfn,ut,fvhu,ts hu,q,g,p,q,ofpje,fsqa,frp,p,vih,j,w,k,jx, ln,th,ka,l,b,vgk,rv,hkx rj,v,y,cwm,rao,e,l,wvr,ptc,lm,yg,u,k,i,b,zk,b,gv,fls 403 velxtnhlyuysbnlchosqlhkozkdapjaueexjwrndwb nglvnv kqiv pbshwlmcexdzipopxjyrxhvjalwp pydvipwlkkpdvbtepahskwuornbsb qwbacgq 404 l,y,u,bf,y,m,eals,n,cop,h,g,vs,jga,opt x,b,zwmn,hh,b,n,pdj,t,d px yn,vtd,u,y,b,ps,yo,qqnem,mxg,m,al,rd,c,k,d,q,f ilxdxa,m,y,,p,p,y,prgmg,q,n,etj,k,ns b,pl,z,jq,hk 405 p,gc jn,mzr,bw sb,e,r,dy,ur,wzy,r,c,n,yglr,jbdu,r,pqk,k q,d,,,p,l,euhl,dc,rwh,t,tq,z,h,p,s,t,x,fugr,h wi,zxb,jcig,o,t,k mfh,ym,h,e,p,cnvx,uv,zx,x,pq,blt,v,r,u,tr,g,g,xt 406 nri,p,,t,if,,y,ptlqq a,i w,ovli,um,w,f,re,k,sb,w,jy,zf i,g,p,q,mii,nr,jm,cc i,szl,k,eg,l,d ,ah,w,b,vh 407 ,,sh,wx,mn,xm,u,d,yy,u,t,m,j,s,b ogadq,g,y,y,i,h,ln,jda,g,cz,s,rv,r,s,s,le,r, y,nu,f,nagj o,h,,adfy,o,nf,ns,gvsvnub,k,b,xyz v,h,g,ef,y,gb c,x,cw,x,go,h,t,x,cu,u,qgrqzrcmn,kq,cd,g,rejp,zcq 408 skxg,t,vay,d,wug,d,xg,sexc rt g,ag,mjq,fjnyji,iwa,m,ml,b,ua,b,qjxeoc be,s,sh,n,jbzxs,g,n,i,h,y,r,be,mfo,u,p cw,r,,u,zn,eg,r,yac,m,l,edkr,ha,x,g,b,c,tg,c j,ye,u,ejd,maj,ea,bm,u,iy`, 409 expectedDelimiter: '\t', 410 }, 411 // case 13 - a CSV with more than 10 lines and since we only use the first 10 lines, it should still get the delimiter as semicolon 412 { 413 csv: `col1;col2;col3 414 1;1;1 415 2;2;2 416 3;3;3 417 4;4;4 418 5;5;5 419 6;6;6 420 7;7;7 421 8;8;8 422 9;9;9 423 10;10;10 424 11 11 11 425 12|12|12`, 426 expectedDelimiter: ';', 427 }, 428 // case 14 - a really long single line (over 10k) that will get truncated, but since it has commas and semicolons (but more semicolons) it will pick semicolon 429 { 430 csv: strings.Repeat("a;b,c;", 1700), 431 expectedDelimiter: ';', 432 }, 433 // case 15 - 2 lines that are well over 10k, but since the 2nd line is where this CSV will be truncated (10k sample), it will only use the first line, so semicolon will be picked 434 { 435 csv: "col1@col2@col3\na@b@" + strings.Repeat("c", 6000) + "\nd,e," + strings.Repeat("f", 4000), 436 expectedDelimiter: '@', 437 }, 438 // case 16 - has all delimiters so should return comma 439 { 440 csv: `col1,col2;col3@col4|col5 col6 441 a b|c@d;e,f`, 442 expectedDelimiter: ',', 443 }, 444 // case 16 - nothing works (bad csv) so returns comma by default 445 { 446 csv: `col1,col2 447 a;b 448 c@e 449 f g 450 h|i 451 jkl`, 452 expectedDelimiter: ',', 453 }, 454 } 455 456 for n, c := range cases { 457 delimiter := guessDelimiter([]byte(decodeSlashes(t, c.csv))) 458 assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter) 459 } 460 } 461 462 func TestGuessFromBeforeAfterQuotes(t *testing.T) { 463 cases := []struct { 464 csv string 465 expectedDelimiter rune 466 }{ 467 // case 0 - tab delimited with new lines in values, commas in values 468 { 469 csv: `1 "some,""more 470 "" 471 quoted, 472 text," a 473 2 "some, 474 quoted,\t 475 text," b 476 3 "some, 477 quoted, 478 text" c 479 4 "some, 480 quoted, 481 text," d`, 482 expectedDelimiter: '\t', 483 }, 484 // case 1 - semicolon delmited with quotes and semicolon in value 485 { 486 csv: `col1;col2 487 "this has a literal "" in the text";"and an ; in the text"`, 488 expectedDelimiter: ';', 489 }, 490 // case 2 - pipe delimited with quotes 491 { 492 csv: `Col1 | Col2 | Col3 493 abc | "Hello 494 World"|123 495 "de 496 | 497 f" | 4.56 | 789`, 498 expectedDelimiter: '|', 499 }, 500 // case 3 - a complicated quoted CSV that is semicolon delmiited 501 { 502 csv: `he; she 503 "he said, ""hey!"""; "she said, ""hey back!""" 504 but; "be"`, 505 expectedDelimiter: ';', 506 }, 507 // case 4 - no delimiter should be found 508 { 509 csv: `a,b`, 510 expectedDelimiter: 0, 511 }, 512 // case 5 - no limiter should be found 513 { 514 csv: `col1 515 "he said, ""here I am"""`, 516 expectedDelimiter: 0, 517 }, 518 // case 6 - delimiter before double quoted string with space 519 { 520 csv: `col1|col2 521 a| "he said, ""here I am"""`, 522 expectedDelimiter: '|', 523 }, 524 // case 7 - delimiter before double quoted string without space 525 { 526 csv: `col1|col2 527 a|"he said, ""here I am"""`, 528 expectedDelimiter: '|', 529 }, 530 // case 8 - delimiter after double quoted string with space 531 { 532 csv: `col1, col2 533 "abc\n 534 535 ", def`, 536 expectedDelimiter: ',', 537 }, 538 // case 9 - delimiter after double quoted string without space 539 { 540 csv: `col1,col2 541 "abc\n 542 543 ",def`, 544 expectedDelimiter: ',', 545 }, 546 } 547 548 for n, c := range cases { 549 delimiter := guessFromBeforeAfterQuotes([]byte(decodeSlashes(t, c.csv))) 550 assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter) 551 } 552 } 553 554 func TestFormatError(t *testing.T) { 555 cases := []struct { 556 err error 557 expectedMessage string 558 expectsError bool 559 }{ 560 { 561 err: &csv.ParseError{ 562 Err: csv.ErrFieldCount, 563 }, 564 expectedMessage: "repo.error.csv.invalid_field_count:0", 565 expectsError: false, 566 }, 567 { 568 err: &csv.ParseError{ 569 Err: csv.ErrBareQuote, 570 }, 571 expectedMessage: "repo.error.csv.unexpected:0,0", 572 expectsError: false, 573 }, 574 { 575 err: bytes.ErrTooLarge, 576 expectsError: true, 577 }, 578 } 579 580 for n, c := range cases { 581 message, err := FormatError(c.err, &translation.MockLocale{}) 582 if c.expectsError { 583 assert.Error(t, err, "case %d: expected an error to be returned", n) 584 } else { 585 assert.NoError(t, err, "case %d: no error was expected, got error: %v", n, err) 586 assert.EqualValues(t, c.expectedMessage, message, "case %d: messages should be equal, expected '%s' got '%s'", n, c.expectedMessage, message) 587 } 588 } 589 }