code.gitea.io/gitea@v1.22.3/modules/csv/csv_test.go (about)

     1  // Copyright 2021 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package csv
     5  
     6  import (
     7  	"bytes"
     8  	"encoding/csv"
     9  	"io"
    10  	"strconv"
    11  	"strings"
    12  	"testing"
    13  
    14  	"code.gitea.io/gitea/modules/git"
    15  	"code.gitea.io/gitea/modules/markup"
    16  	"code.gitea.io/gitea/modules/translation"
    17  
    18  	"github.com/stretchr/testify/assert"
    19  )
    20  
    21  func TestCreateReader(t *testing.T) {
    22  	rd := CreateReader(bytes.NewReader([]byte{}), ',')
    23  	assert.Equal(t, ',', rd.Comma)
    24  }
    25  
    26  func decodeSlashes(t *testing.T, s string) string {
    27  	s = strings.ReplaceAll(s, "\n", "\\n")
    28  	s = strings.ReplaceAll(s, "\"", "\\\"")
    29  	decoded, err := strconv.Unquote(`"` + s + `"`)
    30  	assert.NoError(t, err, "unable to decode string")
    31  	return decoded
    32  }
    33  
    34  func TestCreateReaderAndDetermineDelimiter(t *testing.T) {
    35  	cases := []struct {
    36  		csv               string
    37  		expectedRows      [][]string
    38  		expectedDelimiter rune
    39  	}{
    40  		// case 0 - semicolon delimited
    41  		{
    42  			csv: `a;b;c
    43  1;2;3
    44  4;5;6`,
    45  			expectedRows: [][]string{
    46  				{"a", "b", "c"},
    47  				{"1", "2", "3"},
    48  				{"4", "5", "6"},
    49  			},
    50  			expectedDelimiter: ';',
    51  		},
    52  		// case 1 - tab delimited with empty fields
    53  		{
    54  			csv: `col1	col2	col3
    55  a,	b	c
    56  	e	f
    57  g	h	i
    58  j		l
    59  m	n,\t
    60  p	q	r
    61  		u
    62  v	w	x
    63  y\t\t
    64  		`,
    65  			expectedRows: [][]string{
    66  				{"col1", "col2", "col3"},
    67  				{"a,", "b", "c"},
    68  				{"", "e", "f"},
    69  				{"g", "h", "i"},
    70  				{"j", "", "l"},
    71  				{"m", "n,", ""},
    72  				{"p", "q", "r"},
    73  				{"", "", "u"},
    74  				{"v", "w", "x"},
    75  				{"y", "", ""},
    76  				{"", "", ""},
    77  			},
    78  			expectedDelimiter: '\t',
    79  		},
    80  		// case 2 - comma delimited with leading spaces
    81  		{
    82  			csv: ` col1,col2,col3
    83   a, b, c
    84  d,e,f
    85   ,h, i
    86  j, ,\x20
    87   , , `,
    88  			expectedRows: [][]string{
    89  				{"col1", "col2", "col3"},
    90  				{"a", "b", "c"},
    91  				{"d", "e", "f"},
    92  				{"", "h", "i"},
    93  				{"j", "", ""},
    94  				{"", "", ""},
    95  			},
    96  			expectedDelimiter: ',',
    97  		},
    98  	}
    99  
   100  	for n, c := range cases {
   101  		rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(decodeSlashes(t, c.csv)))
   102  		assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
   103  		assert.EqualValues(t, c.expectedDelimiter, rd.Comma, "case %d: delimiter should be '%c', got '%c'", n, c.expectedDelimiter, rd.Comma)
   104  		rows, err := rd.ReadAll()
   105  		assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
   106  		assert.EqualValues(t, c.expectedRows, rows, "case %d: rows should be equal", n)
   107  	}
   108  }
   109  
   110  type mockReader struct{}
   111  
   112  func (r *mockReader) Read(buf []byte) (int, error) {
   113  	return 0, io.ErrShortBuffer
   114  }
   115  
   116  func TestDetermineDelimiterShortBufferError(t *testing.T) {
   117  	rd, err := CreateReaderAndDetermineDelimiter(nil, &mockReader{})
   118  	assert.Error(t, err, "CreateReaderAndDetermineDelimiter() should throw an error")
   119  	assert.ErrorIs(t, err, io.ErrShortBuffer)
   120  	assert.Nil(t, rd, "CSV reader should be mnil")
   121  }
   122  
   123  func TestDetermineDelimiterReadAllError(t *testing.T) {
   124  	rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(`col1,col2
   125  	a;b
   126  	c@e
   127  	f	g
   128  	h|i
   129  	jkl`))
   130  	assert.NoError(t, err, "CreateReaderAndDetermineDelimiter() shouldn't throw error")
   131  	assert.NotNil(t, rd, "CSV reader should not be mnil")
   132  	rows, err := rd.ReadAll()
   133  	assert.Error(t, err, "RaadAll() should throw error")
   134  	assert.ErrorIs(t, err, csv.ErrFieldCount)
   135  	assert.Empty(t, rows, "rows should be empty")
   136  }
   137  
   138  func TestDetermineDelimiter(t *testing.T) {
   139  	cases := []struct {
   140  		csv               string
   141  		filename          string
   142  		expectedDelimiter rune
   143  	}{
   144  		// case 0 - semicolon delmited
   145  		{
   146  			csv:               "a",
   147  			filename:          "test.csv",
   148  			expectedDelimiter: ',',
   149  		},
   150  		// case 1 - single column/row CSV
   151  		{
   152  			csv:               "a",
   153  			filename:          "",
   154  			expectedDelimiter: ',',
   155  		},
   156  		// case 2 - single column, single row CSV w/ tsv file extension (so is tabbed delimited)
   157  		{
   158  			csv:               "1,2",
   159  			filename:          "test.tsv",
   160  			expectedDelimiter: '\t',
   161  		},
   162  		// case 3 - two column, single row CSV w/ no filename, so will guess comma as delimiter
   163  		{
   164  			csv:               "1,2",
   165  			filename:          "",
   166  			expectedDelimiter: ',',
   167  		},
   168  		// case 4 - semi-colon delimited with csv extension
   169  		{
   170  			csv:               "1;2",
   171  			filename:          "test.csv",
   172  			expectedDelimiter: ';',
   173  		},
   174  		// case 5 - tabbed delimited with tsv extension
   175  		{
   176  			csv:               "1\t2",
   177  			filename:          "test.tsv",
   178  			expectedDelimiter: '\t',
   179  		},
   180  		// case 6 - tabbed delimited without any filename
   181  		{
   182  			csv:               "1\t2",
   183  			filename:          "",
   184  			expectedDelimiter: '\t',
   185  		},
   186  		// case 7 - tabs won't work, only commas as every row has same amount of commas
   187  		{
   188  			csv:               "col1,col2\nfirst\tval,seconed\tval",
   189  			filename:          "",
   190  			expectedDelimiter: ',',
   191  		},
   192  		// case 8 - While looks like comma delimited, has psv extension
   193  		{
   194  			csv:               "1,2",
   195  			filename:          "test.psv",
   196  			expectedDelimiter: '|',
   197  		},
   198  		// case 9 - pipe delmiited with no extension
   199  		{
   200  			csv:               "1|2",
   201  			filename:          "",
   202  			expectedDelimiter: '|',
   203  		},
   204  		// case 10 - semi-colon delimited with commas in values
   205  		{
   206  			csv:               "1,2,3;4,5,6;7,8,9\na;b;c",
   207  			filename:          "",
   208  			expectedDelimiter: ';',
   209  		},
   210  		// case 11 - semi-colon delimited with newline in content
   211  		{
   212  			csv: `"1,2,3,4";"a
   213  b";%
   214  c;d;#`,
   215  			filename:          "",
   216  			expectedDelimiter: ';',
   217  		},
   218  		// case 12 - HTML as single value
   219  		{
   220  			csv:               "<br/>",
   221  			filename:          "",
   222  			expectedDelimiter: ',',
   223  		},
   224  		// case 13 - tab delimited with commas in values
   225  		{
   226  			csv: `name	email	note
   227  John Doe	john@doe.com	This,note,had,a,lot,of,commas,to,test,delimiters`,
   228  			filename:          "",
   229  			expectedDelimiter: '\t',
   230  		},
   231  	}
   232  
   233  	for n, c := range cases {
   234  		delimiter := determineDelimiter(&markup.RenderContext{
   235  			Ctx:          git.DefaultContext,
   236  			RelativePath: c.filename,
   237  		}, []byte(decodeSlashes(t, c.csv)))
   238  		assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
   239  	}
   240  }
   241  
   242  func TestRemoveQuotedString(t *testing.T) {
   243  	cases := []struct {
   244  		text         string
   245  		expectedText string
   246  	}{
   247  		// case 0 - quoted text with escaped quotes in 1st column
   248  		{
   249  			text: `col1,col2,col3
   250  "quoted ""text"" with
   251  new lines
   252  in first column",b,c`,
   253  			expectedText: `col1,col2,col3
   254  ,b,c`,
   255  		},
   256  		// case 1 - quoted text with escaped quotes in 2nd column
   257  		{
   258  			text: `col1,col2,col3
   259  a,"quoted ""text"" with
   260  new lines
   261  in second column",c`,
   262  			expectedText: `col1,col2,col3
   263  a,,c`,
   264  		},
   265  		// case 2 - quoted text with escaped quotes in last column
   266  		{
   267  			text: `col1,col2,col3
   268  a,b,"quoted ""text"" with
   269  new lines
   270  in last column"`,
   271  			expectedText: `col1,col2,col3
   272  a,b,`,
   273  		},
   274  		// case 3 - csv with lots of quotes
   275  		{
   276  			text: `a,"b",c,d,"e
   277  e
   278  e",f
   279  a,bb,c,d,ee ,"f
   280  f"
   281  a,b,"c ""
   282  c",d,e,f`,
   283  			expectedText: `a,,c,d,,f
   284  a,bb,c,d,ee ,
   285  a,b,,d,e,f`,
   286  		},
   287  		// case 4 - csv with pipes and quotes
   288  		{
   289  			text: `Col1 | Col2 | Col3
   290  abc   | "Hello
   291  World"|123
   292  "de
   293  
   294  f" | 4.56 | 789`,
   295  			expectedText: `Col1 | Col2 | Col3
   296  abc   | |123
   297   | 4.56 | 789`,
   298  		},
   299  	}
   300  
   301  	for n, c := range cases {
   302  		modifiedText := removeQuotedString(decodeSlashes(t, c.text))
   303  		assert.EqualValues(t, c.expectedText, modifiedText, "case %d: modified text should be equal", n)
   304  	}
   305  }
   306  
   307  func TestGuessDelimiter(t *testing.T) {
   308  	cases := []struct {
   309  		csv               string
   310  		expectedDelimiter rune
   311  	}{
   312  		// case 0 - single cell, comma delmited
   313  		{
   314  			csv:               "a",
   315  			expectedDelimiter: ',',
   316  		},
   317  		// case 1 - two cells, comma delimited
   318  		{
   319  			csv:               "1,2",
   320  			expectedDelimiter: ',',
   321  		},
   322  		// case 2 - semicolon delimited
   323  		{
   324  			csv:               "1;2",
   325  			expectedDelimiter: ';',
   326  		},
   327  		// case 3 - tab delimited
   328  		{
   329  			csv:               "1\t2",
   330  			expectedDelimiter: '\t',
   331  		},
   332  		// case 4 - pipe delimited
   333  		{
   334  			csv:               "1|2",
   335  			expectedDelimiter: '|',
   336  		},
   337  		// case 5 - semicolon delimited with commas in text
   338  		{
   339  			csv: `1,2,3;4,5,6;7,8,9
   340  a;b;c`,
   341  			expectedDelimiter: ';',
   342  		},
   343  		// case 6 - semicolon delmited with commas in quoted text
   344  		{
   345  			csv: `"1,2,3,4";"a
   346  b"
   347  c;d`,
   348  			expectedDelimiter: ';',
   349  		},
   350  		// case 7 - HTML
   351  		{
   352  			csv:               "<br/>",
   353  			expectedDelimiter: ',',
   354  		},
   355  		// case 8 - tab delimited with commas in value
   356  		{
   357  			csv: `name	email	note
   358  John Doe	john@doe.com	This,note,had,a,lot,of,commas,to,test,delimiters`,
   359  			expectedDelimiter: '\t',
   360  		},
   361  		// case 9 - tab delimited with new lines in values, commas in values
   362  		{
   363  			csv: `1	"some,""more
   364  ""
   365  	quoted,
   366  text,"	a
   367  2	"some,
   368  quoted,\t
   369  	text,"	b
   370  3	"some,
   371  quoted,
   372  	text"	c
   373  4	"some,
   374  quoted,
   375  text,"	d`,
   376  			expectedDelimiter: '\t',
   377  		},
   378  		// case 10 - semicolon delmited with quotes and semicolon in value
   379  		{
   380  			csv: `col1;col2
   381  "this has a literal "" in the text";"and an ; in the text"`,
   382  			expectedDelimiter: ';',
   383  		},
   384  		// case 11 - pipe delimited with quotes
   385  		{
   386  			csv: `Col1 | Col2 | Col3
   387  abc   | "Hello
   388  World"|123
   389  "de
   390  |
   391  f" | 4.56 | 789`,
   392  			expectedDelimiter: '|',
   393  		},
   394  		// case 12 - a tab delimited 6 column CSV, but the values are not quoted and have lots of commas.
   395  		// In the previous bestScore algorithm, this would have picked comma as the delimiter, but now it should guess tab
   396  		{
   397  			csv: `c1	c2	c3	c4	c5	c6
   398  v,k,x,v	ym,f,oa,qn,uqijh,n,s,wvygpo	uj,kt,j,w,i,fvv,tm,f,ddt,b,mwt,e,t,teq,rd,p,a	e,wfuae,t,h,q,im,ix,y	h,mrlu,l,dz,ff,zi,af,emh	,gov,bmfelvb,axp,f,u,i,cni,x,z,v,sh,w,jo,,m,h
   399  k,ohf,pgr,tde,m,s	te,ek,,v,,ic,kqc,dv,w,oi,j,w,gojjr,ug,,l,j,zl	g,qziq,bcajx,zfow,ka,j,re,ohbc	k,nzm,qm,ts,auf	th,elb,lx,l,q,e,qf	asbr,z,k,y,tltobga
   400  g,m,bu,el	h,l,jwi,o,wge,fy,rure,c,g,lcxu,fxte,uns,cl,s,o,t,h,rsoy,f	bq,s,uov,z,ikkhgyg,,sabs,c,hzue	mc,b,,j,t,n	sp,mn,,m,t,dysi,eq,pigb,rfa,z	w,rfli,sg,,o,wjjjf,f,wxdzfk,x,t,p,zy,p,mg,r,l,h
   401  e,ewbkc,nugd,jj,sf,ih,i,n,jo,b,poem,kw,q,i,x,t,e,uug,k	j,xm,sch,ux,h,,fb,f,pq,,mh,,f,v,,oba,w,h,v,eiz,yzd,o,a,c,e,dhp,q	a,pbef,epc,k,rdpuw,cw	k,j,e,d	xf,dz,sviv,w,sqnzew,t,b	v,yg,f,cq,ti,g,m,ta,hm,ym,ii,hxy,p,z,r,e,ga,sfs,r,p,l,aar,w,kox,j
   402  l,d,v,pp,q,j,bxip,w,i,im,qa,o	e,o	h,w,a,a,qzj,nt,qfn,ut,fvhu,ts	hu,q,g,p,q,ofpje,fsqa,frp,p,vih,j,w,k,jx,	ln,th,ka,l,b,vgk,rv,hkx	rj,v,y,cwm,rao,e,l,wvr,ptc,lm,yg,u,k,i,b,zk,b,gv,fls
   403  velxtnhlyuysbnlchosqlhkozkdapjaueexjwrndwb	nglvnv	kqiv	pbshwlmcexdzipopxjyrxhvjalwp	pydvipwlkkpdvbtepahskwuornbsb	qwbacgq
   404  l,y,u,bf,y,m,eals,n,cop,h,g,vs,jga,opt	x,b,zwmn,hh,b,n,pdj,t,d	px	yn,vtd,u,y,b,ps,yo,qqnem,mxg,m,al,rd,c,k,d,q,f	ilxdxa,m,y,,p,p,y,prgmg,q,n,etj,k,ns	b,pl,z,jq,hk
   405  p,gc	jn,mzr,bw	sb,e,r,dy,ur,wzy,r,c,n,yglr,jbdu,r,pqk,k	q,d,,,p,l,euhl,dc,rwh,t,tq,z,h,p,s,t,x,fugr,h	wi,zxb,jcig,o,t,k	mfh,ym,h,e,p,cnvx,uv,zx,x,pq,blt,v,r,u,tr,g,g,xt
   406  nri,p,,t,if,,y,ptlqq	a,i	w,ovli,um,w,f,re,k,sb,w,jy,zf	i,g,p,q,mii,nr,jm,cc	i,szl,k,eg,l,d	,ah,w,b,vh
   407  ,,sh,wx,mn,xm,u,d,yy,u,t,m,j,s,b	ogadq,g,y,y,i,h,ln,jda,g,cz,s,rv,r,s,s,le,r,	y,nu,f,nagj	o,h,,adfy,o,nf,ns,gvsvnub,k,b,xyz	v,h,g,ef,y,gb	c,x,cw,x,go,h,t,x,cu,u,qgrqzrcmn,kq,cd,g,rejp,zcq
   408  skxg,t,vay,d,wug,d,xg,sexc	rt	g,ag,mjq,fjnyji,iwa,m,ml,b,ua,b,qjxeoc	be,s,sh,n,jbzxs,g,n,i,h,y,r,be,mfo,u,p	cw,r,,u,zn,eg,r,yac,m,l,edkr,ha,x,g,b,c,tg,c	j,ye,u,ejd,maj,ea,bm,u,iy`,
   409  			expectedDelimiter: '\t',
   410  		},
   411  		// case 13 - a CSV with more than 10 lines and since we only use the first 10 lines, it should still get the delimiter as semicolon
   412  		{
   413  			csv: `col1;col2;col3
   414  1;1;1
   415  2;2;2
   416  3;3;3
   417  4;4;4
   418  5;5;5
   419  6;6;6
   420  7;7;7
   421  8;8;8
   422  9;9;9
   423  10;10;10
   424  11	11	11
   425  12|12|12`,
   426  			expectedDelimiter: ';',
   427  		},
   428  		// case 14 - a really long single line (over 10k) that will get truncated, but since it has commas and semicolons (but more semicolons) it will pick semicolon
   429  		{
   430  			csv:               strings.Repeat("a;b,c;", 1700),
   431  			expectedDelimiter: ';',
   432  		},
   433  		// case 15 - 2 lines that are well over 10k, but since the 2nd line is where this CSV will be truncated (10k sample), it will only use the first line, so semicolon will be picked
   434  		{
   435  			csv:               "col1@col2@col3\na@b@" + strings.Repeat("c", 6000) + "\nd,e," + strings.Repeat("f", 4000),
   436  			expectedDelimiter: '@',
   437  		},
   438  		// case 16 - has all delimiters so should return comma
   439  		{
   440  			csv: `col1,col2;col3@col4|col5	col6
   441  a	b|c@d;e,f`,
   442  			expectedDelimiter: ',',
   443  		},
   444  		// case 16 - nothing works (bad csv) so returns comma by default
   445  		{
   446  			csv: `col1,col2
   447  a;b
   448  c@e
   449  f	g
   450  h|i
   451  jkl`,
   452  			expectedDelimiter: ',',
   453  		},
   454  	}
   455  
   456  	for n, c := range cases {
   457  		delimiter := guessDelimiter([]byte(decodeSlashes(t, c.csv)))
   458  		assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
   459  	}
   460  }
   461  
   462  func TestGuessFromBeforeAfterQuotes(t *testing.T) {
   463  	cases := []struct {
   464  		csv               string
   465  		expectedDelimiter rune
   466  	}{
   467  		// case 0 - tab delimited with new lines in values, commas in values
   468  		{
   469  			csv: `1	"some,""more
   470  ""
   471  	quoted,
   472  text,"	a
   473  2	"some,
   474  quoted,\t
   475  	text,"	b
   476  3	"some,
   477  quoted,
   478  	text"	c
   479  4	"some,
   480  quoted,
   481  text,"	d`,
   482  			expectedDelimiter: '\t',
   483  		},
   484  		// case 1 - semicolon delmited with quotes and semicolon in value
   485  		{
   486  			csv: `col1;col2
   487  "this has a literal "" in the text";"and an ; in the text"`,
   488  			expectedDelimiter: ';',
   489  		},
   490  		// case 2 - pipe delimited with quotes
   491  		{
   492  			csv: `Col1 | Col2 | Col3
   493  abc   | "Hello
   494  World"|123
   495  "de
   496  |
   497  f" | 4.56 | 789`,
   498  			expectedDelimiter: '|',
   499  		},
   500  		// case 3 - a complicated quoted CSV that is semicolon delmiited
   501  		{
   502  			csv: `he; she
   503  "he said, ""hey!"""; "she said, ""hey back!"""
   504  but; "be"`,
   505  			expectedDelimiter: ';',
   506  		},
   507  		// case 4 - no delimiter should be found
   508  		{
   509  			csv:               `a,b`,
   510  			expectedDelimiter: 0,
   511  		},
   512  		// case 5 - no limiter should be found
   513  		{
   514  			csv: `col1
   515  "he said, ""here I am"""`,
   516  			expectedDelimiter: 0,
   517  		},
   518  		// case 6 - delimiter before double quoted string with space
   519  		{
   520  			csv: `col1|col2
   521  a| "he said, ""here I am"""`,
   522  			expectedDelimiter: '|',
   523  		},
   524  		// case 7 - delimiter before double quoted string without space
   525  		{
   526  			csv: `col1|col2
   527  a|"he said, ""here I am"""`,
   528  			expectedDelimiter: '|',
   529  		},
   530  		// case 8 - delimiter after double quoted string with space
   531  		{
   532  			csv: `col1, col2
   533  "abc\n
   534  
   535  ", def`,
   536  			expectedDelimiter: ',',
   537  		},
   538  		// case 9 - delimiter after double quoted string without space
   539  		{
   540  			csv: `col1,col2
   541  "abc\n
   542  
   543  ",def`,
   544  			expectedDelimiter: ',',
   545  		},
   546  	}
   547  
   548  	for n, c := range cases {
   549  		delimiter := guessFromBeforeAfterQuotes([]byte(decodeSlashes(t, c.csv)))
   550  		assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
   551  	}
   552  }
   553  
   554  func TestFormatError(t *testing.T) {
   555  	cases := []struct {
   556  		err             error
   557  		expectedMessage string
   558  		expectsError    bool
   559  	}{
   560  		{
   561  			err: &csv.ParseError{
   562  				Err: csv.ErrFieldCount,
   563  			},
   564  			expectedMessage: "repo.error.csv.invalid_field_count:0",
   565  			expectsError:    false,
   566  		},
   567  		{
   568  			err: &csv.ParseError{
   569  				Err: csv.ErrBareQuote,
   570  			},
   571  			expectedMessage: "repo.error.csv.unexpected:0,0",
   572  			expectsError:    false,
   573  		},
   574  		{
   575  			err:          bytes.ErrTooLarge,
   576  			expectsError: true,
   577  		},
   578  	}
   579  
   580  	for n, c := range cases {
   581  		message, err := FormatError(c.err, &translation.MockLocale{})
   582  		if c.expectsError {
   583  			assert.Error(t, err, "case %d: expected an error to be returned", n)
   584  		} else {
   585  			assert.NoError(t, err, "case %d: no error was expected, got error: %v", n, err)
   586  			assert.EqualValues(t, c.expectedMessage, message, "case %d: messages should be equal, expected '%s' got '%s'", n, c.expectedMessage, message)
   587  		}
   588  	}
   589  }