github.com/fraugster/parquet-go@v0.12.0/type_bytearray_test.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"testing"
     6  
     7  	"github.com/fraugster/parquet-go/parquet"
     8  	"github.com/stretchr/testify/assert"
     9  
    10  	"github.com/fraugster/parquet-go/parquetschema"
    11  	"github.com/stretchr/testify/require"
    12  )
    13  
    14  func TestFuzzCrashByteArrayPlainDecoderNext(t *testing.T) {
    15  	data := []byte("PAR1\x15\x00\x15\xac\x02\x15\xac\x02,\x150\x15\x00\x15\x06\x15" +
    16  		"00\x01\x15\x02\x19,H\f00000000000" +
    17  		"0\x1500\x15\x0e\x15\x1d\x150\x18\x0500000%0\x15" +
    18  		"0\x1500\x160\x19\x1c\x19\x1c&0\x1c\x15\x0e\x190000" +
    19  		"\x19\x18\x0500000\x15\x00\x160\x16\xfa0\x16\xfa\x02&\b" +
    20  		"<\x18\x06000000\x18\x06000000\x1600" +
    21  		"\x19\x1c\x150\x150\x150000\x16\xfa0\x1600000" +
    22  		"00000000000000000000" +
    23  		"00000000000000000000" +
    24  		"00000000000000000000" +
    25  		"00000000000000000000" +
    26  		"00000000000000000000" +
    27  		"00000000000000000000" +
    28  		"00000000000000000000" +
    29  		"00000000000000000000" +
    30  		"00000000000000000000" +
    31  		"00000000000000000000" +
    32  		"0000000000000000000P" +
    33  		"\x01\x00\x00PAR1")
    34  
    35  	readAllData(t, data)
    36  }
    37  
    38  func TestRepeatedBinaryWithNil(t *testing.T) {
    39  	// this is here to somehow reproduce the issue discussed in https://github.com/fraugster/parquet-go/pull/8
    40  	sd, err := parquetschema.ParseSchemaDefinition(`message msg {
    41  		repeated binary foo;
    42  	}`)
    43  	require.NoError(t, err)
    44  
    45  	var buf bytes.Buffer
    46  	fw := NewFileWriter(&buf, WithSchemaDefinition(sd))
    47  
    48  	err = fw.AddData(map[string]interface{}{
    49  		"foo": [][]byte{
    50  			[]byte("hello"),
    51  			nil,
    52  			[]byte("world!"),
    53  		},
    54  	})
    55  	require.NoError(t, err)
    56  
    57  	require.NoError(t, fw.Close())
    58  
    59  	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
    60  	require.NoError(t, err)
    61  
    62  	row, err := r.NextRow()
    63  	require.NoError(t, err)
    64  
    65  	// here's a problem: we added nil, but got a []byte{}.
    66  	require.Equal(t, [][]byte{
    67  		[]byte("hello"),
    68  		{},
    69  		[]byte("world!"),
    70  	}, row["foo"])
    71  }
    72  
    73  func TestByteArrayStore(t *testing.T) {
    74  	buf := &bytes.Buffer{}
    75  	pq := NewFileWriter(buf)
    76  	s1, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
    77  	require.NoError(t, err)
    78  	s2, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
    79  	require.NoError(t, err)
    80  	require.NoError(t, pq.AddColumnByPath([]string{"s1"}, NewDataColumn(s1, parquet.FieldRepetitionType_REQUIRED)))
    81  	require.NoError(t, pq.AddColumnByPath([]string{"s2"}, NewDataColumn(s2, parquet.FieldRepetitionType_REPEATED)))
    82  
    83  	// The old way is not effected
    84  	err = pq.AddData(map[string]interface{}{
    85  		"s1": []byte("abc"),
    86  		"s2": [][]byte{
    87  			[]byte("a"),
    88  			[]byte("b"),
    89  			[]byte("c"),
    90  		},
    91  	})
    92  	assert.NoError(t, err)
    93  	// The new string data
    94  	err = pq.AddData(map[string]interface{}{
    95  		"s1": "cba",
    96  		"s2": []string{
    97  			"1",
    98  			"2",
    99  			"3",
   100  		},
   101  	})
   102  	assert.NoError(t, err)
   103  	require.NoError(t, pq.Close())
   104  
   105  	pqr, err := NewFileReader(bytes.NewReader(buf.Bytes()))
   106  	assert.NoError(t, err)
   107  
   108  	r1, err := pqr.NextRow()
   109  	// The first one is equal to the input, since it is the proper type
   110  	require.Equal(t, r1, map[string]interface{}{
   111  		"s1": []byte("abc"),
   112  		"s2": [][]byte{
   113  			[]byte("a"),
   114  			[]byte("b"),
   115  			[]byte("c"),
   116  		}})
   117  	assert.NoError(t, err)
   118  
   119  	r2, err := pqr.NextRow()
   120  	// But since parquet do not keep the string type, the returned value here is []byte
   121  	require.Equal(t, r2, map[string]interface{}{
   122  		"s1": []byte("cba"),
   123  		"s2": [][]byte{
   124  			[]byte("1"),
   125  			[]byte("2"),
   126  			[]byte("3"),
   127  		}})
   128  	assert.NoError(t, err)
   129  
   130  	// There should be nothing left in the file
   131  	_, err = pqr.NextRow()
   132  	require.Error(t, err)
   133  }