github.com/fraugster/parquet-go@v0.12.0/readwrite_test.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"math"
    10  	"math/rand"
    11  	"os"
    12  	"testing"
    13  	"time"
    14  
    15  	"github.com/fraugster/parquet-go/parquet"
    16  	"github.com/fraugster/parquet-go/parquetschema"
    17  	"github.com/stretchr/testify/assert"
    18  	"github.com/stretchr/testify/require"
    19  )
    20  
    21  func TestWriteThenReadFile(t *testing.T) {
    22  	ctx := context.Background()
    23  
    24  	testFunc := func(t *testing.T, name string, opts []FileWriterOption, ropts []FileReaderOption) {
    25  		_ = os.Mkdir("files", 0755)
    26  
    27  		filename := fmt.Sprintf("files/test1_%s.parquet", name)
    28  
    29  		wf, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
    30  		require.NoError(t, err, "creating file failed")
    31  
    32  		w := NewFileWriter(wf, opts...)
    33  
    34  		fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
    35  		require.NoError(t, err, "failed to create fooStore")
    36  
    37  		barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
    38  		require.NoError(t, err, "failed to create barStore")
    39  
    40  		bazStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
    41  		require.NoError(t, err, "failed to create bazStore")
    42  
    43  		require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_REQUIRED)))
    44  		require.NoError(t, w.AddColumn("bar", NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL)))
    45  		require.NoError(t, w.AddColumn("baz", NewDataColumn(bazStore, parquet.FieldRepetitionType_OPTIONAL)))
    46  
    47  		const (
    48  			numRecords = 10000
    49  			flushLimit = 1000
    50  		)
    51  
    52  		for idx := 0; idx < numRecords; idx++ {
    53  			if idx > 0 && idx%flushLimit == 0 {
    54  				require.NoError(t, w.FlushRowGroup(), "%d. AddData failed", idx)
    55  			}
    56  
    57  			data := map[string]interface{}{"foo": int64(idx), "bar": []byte("value" + fmt.Sprint(idx))}
    58  			if idx%20 != 0 {
    59  				data["baz"] = int32(idx % 16)
    60  			}
    61  
    62  			require.NoError(t, w.AddData(data), "%d. AddData failed", idx)
    63  		}
    64  
    65  		assert.NoError(t, w.Close(), "Close failed")
    66  
    67  		require.NoError(t, wf.Close())
    68  
    69  		rf, err := os.Open(filename)
    70  		require.NoError(t, err, "opening file failed")
    71  		defer rf.Close()
    72  
    73  		r, err := NewFileReaderWithOptions(rf, ropts...)
    74  		require.NoError(t, err, "creating file reader failed")
    75  
    76  		cols := r.Columns()
    77  		require.Len(t, cols, 3, "got %d column", len(cols))
    78  		require.Equal(t, "foo", cols[0].Name())
    79  		require.Equal(t, "foo", cols[0].FlatName())
    80  		require.Equal(t, "bar", cols[1].Name())
    81  		require.Equal(t, "bar", cols[1].FlatName())
    82  		require.Equal(t, "baz", cols[2].Name())
    83  		require.Equal(t, "baz", cols[2].FlatName())
    84  		for g := 0; g < r.RowGroupCount(); g++ {
    85  			require.NoError(t, r.readRowGroup(ctx), "Reading row group failed")
    86  			for i := 0; i < int(r.schemaReader.rowGroupNumRecords()); i++ {
    87  				data, err := r.schemaReader.getData()
    88  				require.NoError(t, err)
    89  				_, ok := data["foo"]
    90  				require.True(t, ok)
    91  			}
    92  		}
    93  	}
    94  
    95  	tests := []struct {
    96  		Name      string
    97  		WriteOpts []FileWriterOption
    98  		ReadOpts  []FileReaderOption
    99  	}{
   100  		{
   101  			Name: "datapagev1",
   102  			WriteOpts: []FileWriterOption{
   103  				WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
   104  				WithCreator("parquet-go-unittest"),
   105  			},
   106  			ReadOpts: []FileReaderOption{},
   107  		},
   108  		{
   109  			Name: "datapagev2",
   110  			WriteOpts: []FileWriterOption{
   111  				WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
   112  				WithCreator("parquet-go-unittest"), WithDataPageV2(),
   113  			},
   114  			ReadOpts: []FileReaderOption{},
   115  		},
   116  		{
   117  			Name: "datapagev1_crc",
   118  			WriteOpts: []FileWriterOption{
   119  				WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
   120  				WithCreator("parquet-go-unittest"),
   121  				WithCRC(true),
   122  			},
   123  			ReadOpts: []FileReaderOption{WithCRC32Validation(true)},
   124  		},
   125  		{
   126  			Name: "datapagev2_crc",
   127  			WriteOpts: []FileWriterOption{
   128  				WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
   129  				WithCreator("parquet-go-unittest"),
   130  				WithDataPageV2(),
   131  				WithCRC(true),
   132  			},
   133  			ReadOpts: []FileReaderOption{WithCRC32Validation(true)},
   134  		},
   135  	}
   136  
   137  	for _, tt := range tests {
   138  		t.Run(tt.Name, func(t *testing.T) {
   139  			testFunc(t, tt.Name, tt.WriteOpts, tt.ReadOpts)
   140  		})
   141  	}
   142  }
   143  
   144  func TestWriteThenReadFileRepeated(t *testing.T) {
   145  	ctx := context.Background()
   146  
   147  	_ = os.Mkdir("files", 0755)
   148  
   149  	wf, err := os.OpenFile("files/test2.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
   150  	require.NoError(t, err, "creating file failed")
   151  
   152  	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))
   153  
   154  	fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   155  	require.NoError(t, err, "failed to create fooStore")
   156  
   157  	require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_REPEATED)))
   158  
   159  	data := []map[string]interface{}{
   160  		{"foo": []int64{1}},
   161  		{"foo": []int64{1, 2, 3, 1}},
   162  		{},
   163  		{"foo": []int64{1, 3, 1, 1}},
   164  		{},
   165  		{"foo": []int64{1, 2, 2, 1}},
   166  	}
   167  
   168  	for i := range data {
   169  		require.NoError(t, w.AddData(data[i]))
   170  	}
   171  
   172  	assert.NoError(t, w.Close(), "Close failed")
   173  
   174  	require.NoError(t, wf.Close())
   175  
   176  	rf, err := os.Open("files/test2.parquet")
   177  	require.NoError(t, err, "opening file failed")
   178  	defer rf.Close()
   179  
   180  	r, err := NewFileReader(rf)
   181  	require.NoError(t, err, "creating file reader failed")
   182  	require.NoError(t, r.readRowGroup(ctx))
   183  
   184  	require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords())
   185  	for i := range data {
   186  		d, err := r.schemaReader.getData()
   187  		require.NoError(t, err)
   188  		require.Equal(t, data[i], d)
   189  	}
   190  }
   191  
// TestWriteThenReadFileOptional round-trips a single OPTIONAL byte-array
// column and additionally inspects the repetition/definition levels the
// reader reports for each record before consuming it.
func TestWriteThenReadFileOptional(t *testing.T) {
	ctx := context.Background()
	_ = os.Mkdir("files", 0755)

	wf, err := os.OpenFile("files/test3.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	require.NoError(t, err, "creating file failed")

	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))

	fooStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
	require.NoError(t, err, "failed to create fooStore")

	require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_OPTIONAL)))

	// Alternate present and absent values so both definition levels occur.
	data := []map[string]interface{}{
		{"foo": []byte("1")},
		{"foo": []byte("2")},
		{},
		{"foo": []byte("3")},
		{},
		{"foo": []byte("4")},
	}

	for i := range data {
		require.NoError(t, w.AddData(data[i]))
	}

	assert.NoError(t, w.Close(), "Close failed")

	require.NoError(t, wf.Close())

	rf, err := os.Open("files/test3.parquet")
	require.NoError(t, err, "opening file failed")
	defer rf.Close()

	r, err := NewFileReader(rf)
	require.NoError(t, err, "creating file reader failed")
	require.NoError(t, r.readRowGroup(ctx))

	require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords())
	root := r.schemaReader.root
	for i := range data {
		_, ok := data[i]["foo"]
		// NOTE: getFirstRDLevel must be queried BEFORE getData below, since
		// getData advances the reader past the current record.
		rL, dL, b := root.getFirstRDLevel()
		if ok {
			// Value present: definition level 1, repetition level 0.
			assert.False(t, b)
			assert.Equal(t, int32(0), rL)
			assert.Equal(t, int32(1), dL)
		} else {
			// Value absent (NULL): definition level 0.
			assert.False(t, b)
			assert.Equal(t, int32(0), rL)
			assert.Equal(t, int32(0), dL)
		}

		get, err := r.schemaReader.getData()
		require.NoError(t, err)
		require.Equal(t, data[i], get)
	}
}
   251  
   252  func TestWriteThenReadFileNested(t *testing.T) {
   253  	ctx := context.Background()
   254  	_ = os.Mkdir("files", 0755)
   255  
   256  	wf, err := os.OpenFile("files/test4.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
   257  	require.NoError(t, err, "creating file failed")
   258  
   259  	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))
   260  
   261  	fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   262  	require.NoError(t, err, "failed to create fooStore")
   263  	barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   264  	require.NoError(t, err, "failed to create barStore")
   265  
   266  	require.NoError(t, w.AddGroupByPath(ColumnPath{"baz"}, parquet.FieldRepetitionType_REPEATED))
   267  	require.NoError(t, w.AddColumnByPath(ColumnPath{"baz", "foo"}, NewDataColumn(fooStore, parquet.FieldRepetitionType_REQUIRED)))
   268  	require.NoError(t, w.AddColumnByPath(ColumnPath{"baz", "bar"}, NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL)))
   269  
   270  	data := []map[string]interface{}{
   271  		{
   272  			"baz": []map[string]interface{}{
   273  				{"foo": int64(10)},
   274  			},
   275  		},
   276  	}
   277  
   278  	for i := range data {
   279  		require.NoError(t, w.AddData(data[i]))
   280  	}
   281  
   282  	assert.NoError(t, w.Close(), "Close failed")
   283  
   284  	require.NoError(t, wf.Close())
   285  
   286  	rf, err := os.Open("files/test4.parquet")
   287  	require.NoError(t, err, "opening file failed")
   288  	defer rf.Close()
   289  
   290  	r, err := NewFileReader(rf)
   291  	require.NoError(t, err, "creating file reader failed")
   292  	require.NoError(t, r.readRowGroup(ctx))
   293  
   294  	require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords())
   295  	for i := range data {
   296  		d, err := r.schemaReader.getData()
   297  		require.NoError(t, err)
   298  		require.Equal(t, data[i], d)
   299  	}
   300  }
   301  
   302  func TestWriteThenReadFileNested2(t *testing.T) {
   303  	ctx := context.Background()
   304  	_ = os.Mkdir("files", 0755)
   305  
   306  	wf, err := os.OpenFile("files/test5.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
   307  	require.NoError(t, err, "creating file failed")
   308  
   309  	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))
   310  
   311  	blaStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   312  	require.NoError(t, err, "failed to create fooStore")
   313  	barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   314  	require.NoError(t, err, "failed to create barStore")
   315  
   316  	require.NoError(t, w.AddGroup("foo", parquet.FieldRepetitionType_REPEATED))
   317  	require.NoError(t, w.AddColumn("foo.bla", NewDataColumn(blaStore, parquet.FieldRepetitionType_REQUIRED)))
   318  	require.NoError(t, w.AddColumn("foo.bar", NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL)))
   319  
   320  	data := []map[string]interface{}{
   321  		{
   322  			"foo": []map[string]interface{}{
   323  				{
   324  					"bla": int64(23),
   325  					"bar": []byte("foobar"),
   326  				},
   327  			},
   328  		},
   329  		{
   330  			"foo": []map[string]interface{}{
   331  				{
   332  					"bla": int64(24),
   333  					"bar": []byte("hello"),
   334  				},
   335  			},
   336  		},
   337  		{
   338  			"foo": []map[string]interface{}{
   339  				{
   340  					"bla": int64(25),
   341  				},
   342  				{
   343  					"bla": int64(26),
   344  					"bar": []byte("bye!"),
   345  				},
   346  				{
   347  					"bla": int64(27),
   348  				},
   349  			},
   350  		},
   351  	}
   352  	for i := range data {
   353  		require.NoError(t, w.AddData(data[i]))
   354  	}
   355  
   356  	assert.NoError(t, w.Close(), "Close failed")
   357  
   358  	require.NoError(t, wf.Close())
   359  
   360  	rf, err := os.Open("files/test5.parquet")
   361  	require.NoError(t, err, "opening file failed")
   362  	defer rf.Close()
   363  
   364  	r, err := NewFileReader(rf)
   365  	require.NoError(t, err, "creating file reader failed")
   366  	require.NoError(t, r.readRowGroup(ctx))
   367  
   368  	require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords())
   369  	for i := range data {
   370  		d, err := r.schemaReader.getData()
   371  		require.NoError(t, err)
   372  		require.Equal(t, data[i], d)
   373  	}
   374  }
   375  
// TestWriteThenReadFileMap round-trips a schema mixing a required int64, an
// optional byte array, an optional LIST of required int32 elements, and an
// optional int32 DECIMAL column.
// NOTE(review): despite the name, this test exercises a LIST (and a DECIMAL)
// rather than a MAP logical type.
func TestWriteThenReadFileMap(t *testing.T) {
	ctx := context.Background()
	_ = os.Mkdir("files", 0755)

	wf, err := os.OpenFile("files/test6.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	require.NoError(t, err, "creating file failed")

	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))

	fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
	require.NoError(t, err, "failed to create fooStore")
	barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
	require.NoError(t, err, "failed to create barStore")
	elementStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
	require.NoError(t, err, "failed to create elementStore")

	elementCol := NewDataColumn(elementStore, parquet.FieldRepetitionType_REQUIRED)
	list, err := NewListColumn(elementCol, parquet.FieldRepetitionType_OPTIONAL)
	require.NoError(t, err)

	// quux is an int32 column carrying a DECIMAL logical type with
	// precision 5 and scale 3.
	quuxParams := &ColumnParameters{
		LogicalType: parquet.NewLogicalType(),
	}
	quuxParams.LogicalType.DECIMAL = parquet.NewDecimalType()
	quuxParams.LogicalType.DECIMAL.Scale = 3
	quuxParams.LogicalType.DECIMAL.Precision = 5

	quuxStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, quuxParams)
	require.NoError(t, err)

	require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_REQUIRED)))
	require.NoError(t, w.AddColumn("bar", NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL)))
	require.NoError(t, w.AddColumn("baz", list))
	require.NoError(t, w.AddColumn("quux", NewDataColumn(quuxStore, parquet.FieldRepetitionType_OPTIONAL)))

	/* The schema written above corresponds to:
	`message test_msg {
		required int64 foo;
		optional binary bar;
		optional group baz (LIST) {
			repeated group list {
				required int32 element;
			}
		}
		optional int32 quux (DECIMAL(5, 3));
	}`
	(DECIMAL notation is (precision, scale); the code sets Precision=5, Scale=3.) */
	data := []map[string]interface{}{
		{
			"foo": int64(500),
		},
		{
			"foo": int64(23),
			"bar": []byte("hello!"),
			"baz": map[string]interface{}{
				"list": []map[string]interface{}{
					{"element": int32(23)},
				},
			},
			"quux": int32(123456),
		},
		{
			"foo": int64(42),
			"bar": []byte("world!"),
			"baz": map[string]interface{}{
				"list": []map[string]interface{}{
					{"element": int32(1)},
					{"element": int32(1)},
					{"element": int32(2)},
					{"element": int32(3)},
					{"element": int32(5)},
				},
			},
		},
		{
			"foo": int64(1000),
			"bar": []byte("bye!"),
			"baz": map[string]interface{}{
				"list": []map[string]interface{}{
					{"element": int32(2)},
					{"element": int32(3)},
					{"element": int32(5)},
					{"element": int32(7)},
					{"element": int32(11)},
				},
			},
		},
	}

	for i := range data {
		require.NoError(t, w.AddData(data[i]))
	}

	assert.NoError(t, w.Close(), "Close failed")

	require.NoError(t, wf.Close())

	rf, err := os.Open("files/test6.parquet")
	require.NoError(t, err, "opening file failed")
	defer rf.Close()

	r, err := NewFileReader(rf)
	require.NoError(t, err, "creating file reader failed")
	require.NoError(t, r.readRowGroup(ctx))

	require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords())
	for i := range data {
		d, err := r.schemaReader.getData()
		require.NoError(t, err)
		require.Equal(t, data[i], d)
	}
}
   486  
   487  func TestWriteThenReadFileNested3(t *testing.T) {
   488  	ctx := context.Background()
   489  	_ = os.Mkdir("files", 0755)
   490  
   491  	wf, err := os.OpenFile("files/test7.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
   492  	require.NoError(t, err, "creating file failed")
   493  
   494  	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))
   495  	valueStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   496  	require.NoError(t, err, "failed to create valueStore")
   497  	require.NoError(t, w.AddGroup("baz", parquet.FieldRepetitionType_OPTIONAL))
   498  	require.NoError(t, w.AddColumn("baz.value", NewDataColumn(valueStore, parquet.FieldRepetitionType_REQUIRED)))
   499  
   500  	data := []map[string]interface{}{
   501  		{
   502  			"baz": map[string]interface{}{
   503  				"value": int64(9001),
   504  			},
   505  		},
   506  		{},
   507  		{},
   508  	}
   509  
   510  	for i := range data {
   511  		require.NoError(t, w.AddData(data[i]))
   512  	}
   513  
   514  	assert.NoError(t, w.Close(), "Close failed")
   515  
   516  	require.NoError(t, wf.Close())
   517  
   518  	rf, err := os.Open("files/test7.parquet")
   519  	require.NoError(t, err, "opening file failed")
   520  	defer rf.Close()
   521  
   522  	r, err := NewFileReader(rf)
   523  	require.NoError(t, err, "creating file reader failed")
   524  	require.NoError(t, r.readRowGroup(ctx))
   525  
   526  	require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords())
   527  	for i := range data {
   528  		d, err := r.schemaReader.getData()
   529  		require.NoError(t, err)
   530  		require.Equal(t, data[i], d)
   531  	}
   532  }
   533  
   534  func TestWriteEmptyDict(t *testing.T) {
   535  	ctx := context.Background()
   536  	_ = os.Mkdir("files", 0755)
   537  
   538  	wf, err := os.OpenFile("files/test8.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
   539  	require.NoError(t, err, "creating file failed")
   540  
   541  	w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest"))
   542  	valueStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
   543  	require.NoError(t, err, "failed to create valueStore")
   544  	require.NoError(t, w.AddColumn("value", NewDataColumn(valueStore, parquet.FieldRepetitionType_OPTIONAL)))
   545  
   546  	for i := 0; i < 1000; i++ {
   547  		require.NoError(t, w.AddData(nil))
   548  	}
   549  
   550  	assert.NoError(t, w.Close(), "Close failed")
   551  
   552  	require.NoError(t, wf.Close())
   553  
   554  	rf, err := os.Open("files/test8.parquet")
   555  	require.NoError(t, err, "opening file failed")
   556  	defer rf.Close()
   557  
   558  	r, err := NewFileReader(rf)
   559  	require.NoError(t, err, "creating file reader failed")
   560  	require.NoError(t, r.readRowGroup(ctx))
   561  
   562  	require.Equal(t, int64(1000), r.schemaReader.rowGroupNumRecords())
   563  	for i := 0; i < 1000; i++ {
   564  		d, err := r.schemaReader.getData()
   565  		require.NoError(t, err)
   566  		require.Equal(t, map[string]interface{}{}, d)
   567  	}
   568  }
   569  
// TestWriteTimeData writes two rows covering the TIMESTAMP, DATE and TIME
// logical types at nano/micro/milli precision (plus one always-NULL TIME
// column), then verifies the per-column statistics (min/max/null count/
// distinct count) recorded in the row group metadata.
func TestWriteTimeData(t *testing.T) {
	_ = os.Mkdir("files", 0755)

	wf, err := os.OpenFile("files/test9.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	require.NoError(t, err, "creating file failed")

	sd, err := parquetschema.ParseSchemaDefinition(`
		message foo {
			required int64 ts_nanos (TIMESTAMP(NANOS, true));
			required int64 ts_micros (TIMESTAMP(MICROS, true));
			required int64 ts_millis (TIMESTAMP(MILLIS, true));
			required int32 date (DATE);
			required int64 t_nanos (TIME(NANOS, false));
			required int64 t_micros (TIME(MICROS, false));
			required int32 t_millis (TIME(MILLIS, false));
			optional int32 t_alwaysnull (TIME(MILLIS, false));
		}
	`)
	require.NoError(t, err)

	w := NewFileWriter(wf, WithSchemaDefinition(sd), WithCompressionCodec(parquet.CompressionCodec_GZIP))
	testData := []time.Time{
		time.Date(2015, 5, 9, 14, 15, 45, 666777888, time.UTC),
		time.Date(1983, 10, 18, 11, 45, 16, 123456789, time.UTC),
	}

	for _, tt := range testData {
		// Derive each column's value from the same instant:
		// timestamps since epoch at three precisions, days since epoch for
		// DATE, and time-of-day since midnight for the TIME columns.
		// t_alwaysnull is deliberately never set.
		require.NoError(t, w.AddData(map[string]interface{}{
			"ts_nanos":  tt.UnixNano(),
			"ts_micros": tt.UnixNano() / 1000,
			"ts_millis": tt.UnixNano() / 1000000,
			"date":      int32(tt.UnixNano() / (86400 * 1000000000)),
			"t_nanos":   int64((tt.Hour()*3600+tt.Minute()*60+tt.Second())*1000000000 + tt.Nanosecond()),
			"t_micros":  int64((tt.Hour()*3600+tt.Minute()*60+tt.Second())*1000000 + tt.Nanosecond()/1000),
			"t_millis":  int32((tt.Hour()*3600+tt.Minute()*60+tt.Second())*1000 + tt.Nanosecond()/1000000),
		}))
	}

	require.NoError(t, w.FlushRowGroup())
	require.NoError(t, w.Close())
	require.NoError(t, wf.Close())

	rf, err := os.Open("files/test9.parquet")
	require.NoError(t, err, "opening file failed")
	defer rf.Close()

	r, err := NewFileReader(rf)
	require.NoError(t, err, "creating file reader failed")

	require.NoError(t, r.PreLoad())

	rg := r.CurrentRowGroup()

	// Expected column statistics, in schema order. The min/max values are
	// the raw statistics bytes from the file — presumably the little-endian
	// plain encodings of the two written values (nil for the all-NULL
	// column); TODO confirm against the parquet statistics encoding.
	verificationData := []struct {
		pathInSchema  []string
		maxValue      []byte
		minValue      []byte
		nullCount     int64
		distinctCount int64
	}{
		{
			[]string{"ts_nanos"},
			[]byte{0x20, 0xa3, 0xc6, 0xc3, 0x7c, 0x93, 0xdc, 0x13},
			[]byte{0x15, 0xc5, 0x33, 0x1e, 0x40, 0x96, 0xa, 0x6},
			0,
			2,
		},
		{
			[]string{"ts_micros"},
			[]byte{0xd9, 0x32, 0xe0, 0xc7, 0xa6, 0x15, 0x5, 0x0},
			[]byte{0x40, 0xd, 0xc0, 0x1e, 0xed, 0x8b, 0x1, 0x0},
			0,
			2,
		},
		{
			[]string{"ts_millis"},
			[]byte{0x2, 0x29, 0x8, 0x39, 0x4d, 0x1, 0x0, 0x0},
			[]byte{0x5b, 0x39, 0x6c, 0x5b, 0x65, 0x0, 0x0, 0x0},
			0,
			2,
		},
		{
			[]string{"date"},
			[]byte{0xb4, 0x40, 0x0, 0x0},
			[]byte{0xae, 0x13, 0x0, 0x0},
			0,
			2,
		},
		{
			[]string{"t_nanos"},
			[]byte{0x20, 0xa3, 0x3a, 0xd8, 0xb2, 0x2e, 0x0, 0x0},
			[]byte{0x15, 0xc5, 0x81, 0x7d, 0x7c, 0x26, 0x0, 0x0},
			0,
			2,
		},
		{
			[]string{"t_micros"},
			[]byte{0xd9, 0xb2, 0x70, 0xf4, 0xb, 0x0, 0x0, 0x0},
			[]byte{0x40, 0xcd, 0x3c, 0xda, 0x9, 0x0, 0x0, 0x0},
			0,
			2,
		},
		{
			[]string{"t_millis"},
			[]byte{0x2, 0x79, 0xf, 0x3},
			[]byte{0x5b, 0xb1, 0x85, 0x2},
			0,
			2,
		},
		{
			// Never written: both rows count as NULL, no min/max recorded.
			[]string{"t_alwaysnull"},
			nil,
			nil,
			2,
			0,
		},
	}

	for idx, tt := range verificationData {
		assert.Equal(t, tt.pathInSchema, rg.Columns[idx].MetaData.PathInSchema, "%d. path in schema doesn't match", idx)
		assert.Equal(t, tt.maxValue, rg.Columns[idx].MetaData.Statistics.MaxValue, "%d. max value doesn't match", idx)
		assert.Equal(t, tt.minValue, rg.Columns[idx].MetaData.Statistics.MinValue, "%d. min value doesn't match", idx)
		assert.Equal(t, tt.nullCount, rg.Columns[idx].MetaData.Statistics.GetNullCount(), "%d. null count doesn't match", idx)
		assert.Equal(t, tt.distinctCount, rg.Columns[idx].MetaData.Statistics.GetDistinctCount(), "%d. distinct count doesn't match", idx)
	}
}
   696  
   697  func TestReadWriteMultiLevel(t *testing.T) {
   698  	sc := `message txn {
   699    optional group cluster (LIST) {
   700      repeated group list {
   701        required group element {
   702          optional group cluster_step (LIST) {
   703              repeated group list {
   704                required group element {
   705                  optional group story_point {
   706                    required binary type (STRING);
   707                  }
   708                }
   709              }
   710            }
   711        }
   712      }
   713    }
   714  }
   715  `
   716  	buf := &bytes.Buffer{}
   717  	sd, err := parquetschema.ParseSchemaDefinition(sc)
   718  	require.NoError(t, err)
   719  	w := NewFileWriter(buf, WithSchemaDefinition(sd))
   720  
   721  	require.NoError(t, w.AddData(map[string]interface{}{}))
   722  	require.NoError(t, w.Close())
   723  	buf2 := bytes.NewReader(buf.Bytes())
   724  	r, err := NewFileReader(buf2)
   725  	require.NoError(t, err)
   726  	data, err := r.NextRow()
   727  	require.NoError(t, err)
   728  	require.Equal(t, map[string]interface{}{}, data)
   729  
   730  	_, err = r.NextRow()
   731  	require.Equal(t, io.EOF, err)
   732  }
   733  
   734  func TestWriteFileWithMarshallerThenReadWithUnmarshaller(t *testing.T) {
   735  	sd, err := parquetschema.ParseSchemaDefinition(
   736  		`message test_msg {
   737  			required group baz (LIST) {
   738  				repeated group list {
   739  					required group element {
   740  						required int64 quux;
   741  					}
   742  				}
   743  			}
   744  		}`)
   745  
   746  	require.NoError(t, err, "parsing schema definition failed")
   747  
   748  	buf := &bytes.Buffer{}
   749  	hlWriter := NewFileWriter(
   750  		buf,
   751  		WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
   752  		WithCreator("floor-unittest"),
   753  		WithSchemaDefinition(sd),
   754  	)
   755  
   756  	require.NoError(t, err, "creating new file writer failed")
   757  
   758  	testData := map[string]interface{}{
   759  		"baz": map[string]interface{}{
   760  			"list": []map[string]interface{}{
   761  				{
   762  					"element": map[string]interface{}{
   763  						"quux": int64(23),
   764  					},
   765  				},
   766  				{
   767  					"element": map[string]interface{}{
   768  						"quux": int64(42),
   769  					},
   770  				},
   771  			},
   772  		},
   773  	}
   774  
   775  	require.NoError(t, hlWriter.AddData(testData), "writing object using marshaller failed")
   776  
   777  	require.NoError(t, hlWriter.Close())
   778  
   779  	hlReader, err := NewFileReader(bytes.NewReader(buf.Bytes()))
   780  	require.NoError(t, err, "opening file failed")
   781  
   782  	readData, err := hlReader.NextRow()
   783  	require.NoError(t, err)
   784  	require.Equal(t, testData, readData, "written and read data don't match")
   785  }
   786  
   787  func TestWriteWithFlushGroupMetaDataThenRead(t *testing.T) {
   788  	sd, err := parquetschema.ParseSchemaDefinition(
   789  		`message test_msg {
   790  			required int64 foo;
   791  			required group x {
   792  				required int64 bar;
   793  			}
   794  		}`)
   795  
   796  	require.NoError(t, err, "parsing schema definition failed")
   797  
   798  	buf := &bytes.Buffer{}
   799  	hlWriter := NewFileWriter(
   800  		buf,
   801  		WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
   802  		WithCreator("floor-unittest"),
   803  		WithSchemaDefinition(sd),
   804  		WithMetaData(map[string]string{"global": "metadata"}),
   805  	)
   806  
   807  	require.NoError(t, err, "creating new file writer failed")
   808  
   809  	testData := map[string]interface{}{
   810  		"foo": int64(23),
   811  		"x": map[string]interface{}{
   812  			"bar": int64(42),
   813  		},
   814  	}
   815  
   816  	require.NoError(t, hlWriter.AddData(testData), "writing object using marshaller failed")
   817  
   818  	require.NoError(t, hlWriter.Close(
   819  		WithRowGroupMetaData(map[string]string{"a": "hello", "b": "world"}),
   820  		WithRowGroupMetaDataForColumn("foo", map[string]string{"b": "friendo", "c": "!"}),
   821  		WithRowGroupMetaDataForColumn("x.bar", map[string]string{"a": "goodbye"}),
   822  	))
   823  
   824  	hlReader, err := NewFileReader(bytes.NewReader(buf.Bytes()))
   825  	require.NoError(t, err)
   826  
   827  	require.Equal(t, map[string]string{"global": "metadata"}, hlReader.MetaData())
   828  
   829  	require.NoError(t, hlReader.PreLoad())
   830  
   831  	// the low-level way of inspecting column metadata:
   832  	rg := hlReader.CurrentRowGroup()
   833  	cols := rg.GetColumns()
   834  	require.Equal(t, 2, len(cols))
   835  
   836  	require.Equal(t, []string{"foo"}, cols[0].MetaData.PathInSchema)
   837  	require.Equal(t, []*parquet.KeyValue{
   838  		{Key: "a", Value: strPtr("hello")},
   839  		{Key: "b", Value: strPtr("friendo")},
   840  		{Key: "c", Value: strPtr("!")},
   841  	}, cols[0].MetaData.KeyValueMetadata)
   842  
   843  	require.Equal(t, []string{"x", "bar"}, cols[1].MetaData.PathInSchema)
   844  	require.Equal(t, []*parquet.KeyValue{
   845  		{Key: "a", Value: strPtr("goodbye")},
   846  		{Key: "b", Value: strPtr("world")},
   847  	}, cols[1].MetaData.KeyValueMetadata)
   848  
   849  	// the high-level way of inspecting column metadata:
   850  	fooMetaData, err := hlReader.ColumnMetaData("foo")
   851  	require.NoError(t, err)
   852  	require.Equal(t, map[string]string{"a": "hello", "b": "friendo", "c": "!"}, fooMetaData)
   853  
   854  	xbarMetaData, err := hlReader.ColumnMetaData("x.bar")
   855  	require.NoError(t, err)
   856  	require.Equal(t, map[string]string{"a": "goodbye", "b": "world"}, xbarMetaData)
   857  
   858  	_, err = hlReader.ColumnMetaData("does.not.exist")
   859  	require.Error(t, err)
   860  }
   861  
   862  func TestReadWriteColumeEncodings(t *testing.T) {
   863  	buf := &bytes.Buffer{}
   864  
   865  	w := NewFileWriter(buf)
   866  
   867  	s, err := NewBooleanStore(parquet.Encoding_RLE, &ColumnParameters{})
   868  	require.NoError(t, err)
   869  	require.NoError(t, w.AddColumn("a", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED)))
   870  
   871  	s, err = NewBooleanStore(parquet.Encoding_PLAIN, &ColumnParameters{})
   872  	require.NoError(t, err)
   873  	require.NoError(t, w.AddColumn("b", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED)))
   874  
   875  	s, err = NewByteArrayStore(parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, false, &ColumnParameters{})
   876  	require.NoError(t, err)
   877  	require.NoError(t, w.AddColumn("c", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED)))
   878  
   879  	s, err = NewByteArrayStore(parquet.Encoding_DELTA_BYTE_ARRAY, false, &ColumnParameters{})
   880  	require.NoError(t, err)
   881  	require.NoError(t, w.AddColumn("d", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED)))
   882  
   883  	s, err = NewFloatStore(parquet.Encoding_PLAIN, false, &ColumnParameters{})
   884  	require.NoError(t, err)
   885  	require.NoError(t, w.AddColumn("e", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED)))
   886  
   887  	s, err = NewDoubleStore(parquet.Encoding_PLAIN, false, &ColumnParameters{})
   888  	require.NoError(t, err)
   889  	require.NoError(t, w.AddColumn("f", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED)))
   890  
   891  	testData := map[string]interface{}{
   892  		"a": true,
   893  		"b": false,
   894  		"c": []byte("hello"),
   895  		"d": []byte("world"),
   896  		"e": float32(23.0),
   897  		"f": float64(42.0),
   898  	}
   899  
   900  	require.NoError(t, w.AddData(testData))
   901  
   902  	require.NoError(t, w.Close())
   903  
   904  	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
   905  	require.NoError(t, err)
   906  
   907  	data, err := r.NextRow()
   908  	require.NoError(t, err)
   909  
   910  	require.Equal(t, testData, data)
   911  
   912  	_, err = r.NextRow()
   913  	require.Equal(t, io.EOF, err)
   914  }
   915  
   916  func strPtr(s string) *string {
   917  	return &s
   918  }
   919  
   920  func TestWriteThenReadFileUnsetOptional(t *testing.T) {
   921  	sd, err := parquetschema.ParseSchemaDefinition(`
   922  		message foo {
   923  			optional group a (LIST) {
   924  				repeated group list {
   925  					optional group element {
   926  						optional int64 b;
   927  					}
   928  				}
   929  			}
   930  		}`)
   931  	require.NoError(t, err)
   932  
   933  	var buf bytes.Buffer
   934  	require.NoError(t, err)
   935  	w := NewFileWriter(&buf, WithSchemaDefinition(sd))
   936  	testData := map[string]interface{}{
   937  		"a": map[string]interface{}{
   938  			"list": []map[string]interface{}{
   939  				{},
   940  				{
   941  					"element": map[string]interface{}{},
   942  				},
   943  				{
   944  					"element": map[string]interface{}{
   945  						"b": int64(2),
   946  					},
   947  				},
   948  			},
   949  		},
   950  	}
   951  	require.NoError(t, w.AddData(testData))
   952  	require.NoError(t, w.Close())
   953  
   954  	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
   955  	require.NoError(t, err)
   956  
   957  	data, err := r.NextRow()
   958  	require.NoError(t, err)
   959  	require.Equal(t, testData, data)
   960  
   961  	_, err = r.NextRow()
   962  	require.Equal(t, io.EOF, err)
   963  }
   964  
   965  func TestReadWriteFixedLenByteArrayEncodings(t *testing.T) {
   966  	testData := []struct {
   967  		name    string
   968  		enc     parquet.Encoding
   969  		useDict bool
   970  		input   []byte
   971  	}{
   972  		{name: "delta_byte_array_with_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: true, input: []byte{1, 3, 2, 14, 99, 42}},
   973  		{name: "delta_byte_array_no_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: false, input: []byte{7, 5, 254, 127, 42, 23}},
   974  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: []byte{9, 8, 7, 6, 5, 4}},
   975  	}
   976  
   977  	for _, tt := range testData {
   978  		t.Run(tt.name, func(t *testing.T) {
   979  			var buf bytes.Buffer
   980  			wr := NewFileWriter(&buf)
   981  
   982  			l := int32(len(tt.input))
   983  			store, err := NewFixedByteArrayStore(tt.enc, tt.useDict, &ColumnParameters{TypeLength: &l})
   984  			require.NoError(t, err)
   985  
   986  			require.NoError(t, wr.AddColumn("value", NewDataColumn(store, parquet.FieldRepetitionType_REQUIRED)))
   987  
   988  			inputRow := map[string]interface{}{"value": tt.input}
   989  
   990  			require.NoError(t, wr.AddData(inputRow))
   991  
   992  			require.NoError(t, wr.Close())
   993  
   994  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
   995  			require.NoError(t, err)
   996  
   997  			outputRow, err := rd.NextRow()
   998  			require.NoError(t, err)
   999  
  1000  			require.Equal(t, inputRow, outputRow)
  1001  
  1002  			_, err = rd.NextRow()
  1003  			require.Error(t, err)
  1004  			require.True(t, errors.Is(err, io.EOF))
  1005  		})
  1006  	}
  1007  }
  1008  
  1009  func TestReadWriteByteArrayEncodings(t *testing.T) {
  1010  	testData := []struct {
  1011  		name    string
  1012  		enc     parquet.Encoding
  1013  		useDict bool
  1014  		input   []byte
  1015  	}{
  1016  		{name: "delta_byte_array_with_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: true, input: []byte{1, 3, 2, 14, 99, 42}},
  1017  		{name: "delta_byte_array_no_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: false, input: []byte{7, 5, 254, 127, 42, 23}},
  1018  		{name: "delta_length_byte_array_with_dict", enc: parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, useDict: true, input: []byte{1, 5, 15, 25, 35, 75}},
  1019  		{name: "delta_length_byte_array_no_dict", enc: parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, useDict: false, input: []byte{75, 25, 5, 35, 15, 1}},
  1020  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: []byte{9, 8, 7, 6, 5, 4}},
  1021  	}
  1022  
  1023  	for _, tt := range testData {
  1024  		t.Run(tt.name, func(t *testing.T) {
  1025  			var buf bytes.Buffer
  1026  			wr := NewFileWriter(&buf)
  1027  
  1028  			store, err := NewByteArrayStore(tt.enc, tt.useDict, &ColumnParameters{})
  1029  			require.NoError(t, err)
  1030  
  1031  			require.NoError(t, wr.AddColumn("value", NewDataColumn(store, parquet.FieldRepetitionType_REQUIRED)))
  1032  
  1033  			inputRow := map[string]interface{}{"value": tt.input}
  1034  
  1035  			require.NoError(t, wr.AddData(inputRow))
  1036  
  1037  			require.NoError(t, wr.Close())
  1038  
  1039  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1040  			require.NoError(t, err)
  1041  
  1042  			outputRow, err := rd.NextRow()
  1043  			require.NoError(t, err)
  1044  
  1045  			require.Equal(t, inputRow, outputRow)
  1046  
  1047  			_, err = rd.NextRow()
  1048  			require.Error(t, err)
  1049  			require.True(t, errors.Is(err, io.EOF))
  1050  		})
  1051  	}
  1052  }
  1053  
  1054  func TestReadWriteInt64Encodings(t *testing.T) {
  1055  	testData := []struct {
  1056  		name    string
  1057  		enc     parquet.Encoding
  1058  		useDict bool
  1059  		input   int64
  1060  	}{
  1061  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 87743737636726},
  1062  		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 42},
  1063  		{name: "delta_binary_packed", enc: parquet.Encoding_DELTA_BINARY_PACKED, useDict: false, input: 6363228832},
  1064  	}
  1065  
  1066  	for _, tt := range testData {
  1067  		t.Run(tt.name, func(t *testing.T) {
  1068  			var buf bytes.Buffer
  1069  
  1070  			wr := NewFileWriter(&buf)
  1071  
  1072  			bas, err := NewInt64Store(tt.enc, tt.useDict, &ColumnParameters{})
  1073  			require.NoError(t, err)
  1074  
  1075  			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1076  			require.NoError(t, wr.AddColumn("number", col))
  1077  
  1078  			inputRow := map[string]interface{}{
  1079  				"number": tt.input,
  1080  			}
  1081  
  1082  			require.NoError(t, wr.AddData(inputRow))
  1083  
  1084  			require.NoError(t, wr.Close())
  1085  
  1086  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1087  			if err != nil {
  1088  				t.Fatal(err)
  1089  			}
  1090  
  1091  			outputRow, err := rd.NextRow()
  1092  			require.NoError(t, err)
  1093  
  1094  			require.Equal(t, inputRow, outputRow)
  1095  
  1096  			_, err = rd.NextRow()
  1097  			require.True(t, errors.Is(err, io.EOF))
  1098  		})
  1099  	}
  1100  }
  1101  
  1102  func TestReadWriteInt32Encodings(t *testing.T) {
  1103  	testData := []struct {
  1104  		name    string
  1105  		enc     parquet.Encoding
  1106  		useDict bool
  1107  		input   int32
  1108  	}{
  1109  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 3628282},
  1110  		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 23},
  1111  		{name: "delta_binary_packed", enc: parquet.Encoding_DELTA_BINARY_PACKED, useDict: false, input: 9361082},
  1112  	}
  1113  
  1114  	for _, tt := range testData {
  1115  		t.Run(tt.name, func(t *testing.T) {
  1116  			var buf bytes.Buffer
  1117  
  1118  			wr := NewFileWriter(&buf)
  1119  
  1120  			bas, err := NewInt32Store(tt.enc, tt.useDict, &ColumnParameters{})
  1121  			require.NoError(t, err)
  1122  
  1123  			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1124  			require.NoError(t, wr.AddColumn("number", col))
  1125  
  1126  			inputRow := map[string]interface{}{
  1127  				"number": tt.input,
  1128  			}
  1129  
  1130  			require.NoError(t, wr.AddData(inputRow))
  1131  
  1132  			require.NoError(t, wr.Close())
  1133  
  1134  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1135  			if err != nil {
  1136  				t.Fatal(err)
  1137  			}
  1138  
  1139  			outputRow, err := rd.NextRow()
  1140  			require.NoError(t, err)
  1141  
  1142  			require.Equal(t, inputRow, outputRow)
  1143  
  1144  			_, err = rd.NextRow()
  1145  			require.True(t, errors.Is(err, io.EOF))
  1146  		})
  1147  	}
  1148  }
  1149  
  1150  func TestReadWriteInt96Encodings(t *testing.T) {
  1151  	testData := []struct {
  1152  		name    string
  1153  		enc     parquet.Encoding
  1154  		useDict bool
  1155  		input   [12]byte
  1156  	}{
  1157  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: TimeToInt96(time.Date(2020, 3, 16, 14, 30, 0, 0, time.UTC))},
  1158  		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: TimeToInt96(time.Now())},
  1159  	}
  1160  
  1161  	for _, tt := range testData {
  1162  		t.Run(tt.name, func(t *testing.T) {
  1163  			var buf bytes.Buffer
  1164  
  1165  			wr := NewFileWriter(&buf)
  1166  
  1167  			bas, err := NewInt96Store(tt.enc, tt.useDict, &ColumnParameters{})
  1168  			require.NoError(t, err)
  1169  
  1170  			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1171  			require.NoError(t, wr.AddColumn("ts", col))
  1172  
  1173  			inputRow := map[string]interface{}{
  1174  				"ts": tt.input,
  1175  			}
  1176  
  1177  			require.NoError(t, wr.AddData(inputRow))
  1178  
  1179  			require.NoError(t, wr.Close())
  1180  
  1181  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1182  			if err != nil {
  1183  				t.Fatal(err)
  1184  			}
  1185  
  1186  			outputRow, err := rd.NextRow()
  1187  			require.NoError(t, err)
  1188  
  1189  			require.Equal(t, inputRow, outputRow)
  1190  
  1191  			_, err = rd.NextRow()
  1192  			require.True(t, errors.Is(err, io.EOF))
  1193  		})
  1194  	}
  1195  }
  1196  
  1197  func TestReadWriteFloatEncodings(t *testing.T) {
  1198  	testData := []struct {
  1199  		name    string
  1200  		enc     parquet.Encoding
  1201  		useDict bool
  1202  		input   float32
  1203  	}{
  1204  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 1.1111},
  1205  		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 2.2222},
  1206  	}
  1207  
  1208  	for _, tt := range testData {
  1209  		t.Run(tt.name, func(t *testing.T) {
  1210  			var buf bytes.Buffer
  1211  
  1212  			wr := NewFileWriter(&buf)
  1213  
  1214  			bas, err := NewFloatStore(tt.enc, tt.useDict, &ColumnParameters{})
  1215  			require.NoError(t, err)
  1216  
  1217  			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1218  			require.NoError(t, wr.AddColumn("number", col))
  1219  
  1220  			inputRow := map[string]interface{}{
  1221  				"number": tt.input,
  1222  			}
  1223  
  1224  			require.NoError(t, wr.AddData(inputRow))
  1225  
  1226  			require.NoError(t, wr.Close())
  1227  
  1228  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1229  			if err != nil {
  1230  				t.Fatal(err)
  1231  			}
  1232  
  1233  			outputRow, err := rd.NextRow()
  1234  			require.NoError(t, err)
  1235  
  1236  			require.Equal(t, inputRow, outputRow)
  1237  
  1238  			_, err = rd.NextRow()
  1239  			require.True(t, errors.Is(err, io.EOF))
  1240  		})
  1241  	}
  1242  }
  1243  
  1244  func TestReadWriteDoubleEncodings(t *testing.T) {
  1245  	testData := []struct {
  1246  		name    string
  1247  		enc     parquet.Encoding
  1248  		useDict bool
  1249  		input   float64
  1250  	}{
  1251  		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 42.123456},
  1252  		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 32.98765},
  1253  	}
  1254  
  1255  	for _, tt := range testData {
  1256  		t.Run(tt.name, func(t *testing.T) {
  1257  			var buf bytes.Buffer
  1258  
  1259  			wr := NewFileWriter(&buf)
  1260  
  1261  			bas, err := NewDoubleStore(tt.enc, tt.useDict, &ColumnParameters{})
  1262  			require.NoError(t, err)
  1263  
  1264  			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1265  			require.NoError(t, wr.AddColumn("number", col))
  1266  
  1267  			inputRow := map[string]interface{}{
  1268  				"number": tt.input,
  1269  			}
  1270  
  1271  			require.NoError(t, wr.AddData(inputRow))
  1272  
  1273  			require.NoError(t, wr.Close())
  1274  
  1275  			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1276  			if err != nil {
  1277  				t.Fatal(err)
  1278  			}
  1279  
  1280  			outputRow, err := rd.NextRow()
  1281  			require.NoError(t, err)
  1282  
  1283  			require.Equal(t, inputRow, outputRow)
  1284  
  1285  			_, err = rd.NextRow()
  1286  			require.True(t, errors.Is(err, io.EOF))
  1287  		})
  1288  	}
  1289  }
  1290  
  1291  func TestWriteThenReadMultiplePages(t *testing.T) {
  1292  	const mySchema = `message msg {
  1293  		required binary ts_str (STRING);
  1294  	}`
  1295  
  1296  	sd, err := parquetschema.ParseSchemaDefinition(mySchema)
  1297  	require.NoError(t, err)
  1298  
  1299  	testData := []struct {
  1300  		name    string
  1301  		options []FileWriterOption
  1302  	}{
  1303  
  1304  		{
  1305  			name: "snappy",
  1306  			options: []FileWriterOption{
  1307  				WithSchemaDefinition(sd), WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
  1308  			},
  1309  		},
  1310  		{
  1311  			name: "snappy_1kb_page",
  1312  			options: []FileWriterOption{
  1313  				WithSchemaDefinition(sd), WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithMaxPageSize(1 * 1024),
  1314  			},
  1315  		},
  1316  	}
  1317  
  1318  	for _, tt := range testData {
  1319  		t.Run(tt.name, func(t *testing.T) {
  1320  			f := new(bytes.Buffer)
  1321  
  1322  			fw := NewFileWriter(f, tt.options...)
  1323  			defer fw.Close()
  1324  
  1325  			const numRows = 75
  1326  
  1327  			records := []map[string]interface{}{}
  1328  
  1329  			for i := 0; i < numRows; i++ {
  1330  				tsStr := time.Now().Add(time.Duration(1+rand.Int63n(300)) * time.Second).Format(time.RFC3339)
  1331  				rec := map[string]interface{}{"ts_str": []byte(tsStr)}
  1332  				records = append(records, rec)
  1333  				require.NoError(t, fw.AddData(rec))
  1334  			}
  1335  
  1336  			require.NoError(t, fw.Close())
  1337  
  1338  			r, err := NewFileReader(bytes.NewReader(f.Bytes()))
  1339  			require.NoError(t, err)
  1340  
  1341  			rowCount := r.NumRows()
  1342  			require.Equal(t, int64(numRows), rowCount)
  1343  
  1344  			for i := int64(0); i < rowCount; i++ {
  1345  				data, err := r.NextRow()
  1346  				require.NoError(t, err)
  1347  				require.Equal(t, records[i], data, "%d. records don't match", i)
  1348  				//fmt.Printf("in %d. %s\n", i, string(data["ts_str"].([]byte)))
  1349  			}
  1350  		})
  1351  	}
  1352  }
  1353  
  1354  func TestReadWriteDoubleNaN(t *testing.T) {
  1355  	var buf bytes.Buffer
  1356  
  1357  	wr := NewFileWriter(&buf)
  1358  
  1359  	bas, err := NewDoubleStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
  1360  	require.NoError(t, err)
  1361  
  1362  	col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1363  	require.NoError(t, wr.AddColumn("value", col))
  1364  
  1365  	data := []float64{42.23, math.NaN(), math.NaN(), 23.42, math.Inf(1), math.Inf(-1), 1.111}
  1366  
  1367  	for _, f := range data {
  1368  		require.NoError(t, wr.AddData(map[string]interface{}{
  1369  			"value": f,
  1370  		}))
  1371  	}
  1372  
  1373  	require.NoError(t, wr.Close())
  1374  
  1375  	rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1376  	if err != nil {
  1377  		t.Fatal(err)
  1378  	}
  1379  
  1380  	for i := range data {
  1381  		outputRow, err := rd.NextRow()
  1382  		require.NoError(t, err)
  1383  		if math.IsNaN(data[i]) {
  1384  			require.True(t, math.IsNaN(outputRow["value"].(float64)))
  1385  		} else {
  1386  			require.Equal(t, data[i], outputRow["value"].(float64))
  1387  		}
  1388  	}
  1389  
  1390  	_, err = rd.NextRow()
  1391  	require.True(t, errors.Is(err, io.EOF))
  1392  }
  1393  
  1394  func TestReadWriteFloatNaN(t *testing.T) {
  1395  	var buf bytes.Buffer
  1396  
  1397  	wr := NewFileWriter(&buf)
  1398  
  1399  	bas, err := NewFloatStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
  1400  	require.NoError(t, err)
  1401  
  1402  	col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
  1403  	require.NoError(t, wr.AddColumn("value", col))
  1404  
  1405  	data := []float32{42.23, float32(math.NaN()), float32(math.NaN()), 23.42, float32(math.Inf(1)), float32(math.Inf(-1)), 1.111}
  1406  
  1407  	for _, f := range data {
  1408  		require.NoError(t, wr.AddData(map[string]interface{}{
  1409  			"value": f,
  1410  		}))
  1411  	}
  1412  
  1413  	require.NoError(t, wr.Close())
  1414  
  1415  	rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1416  	if err != nil {
  1417  		t.Fatal(err)
  1418  	}
  1419  
  1420  	for i := range data {
  1421  		outputRow, err := rd.NextRow()
  1422  		require.NoError(t, err)
  1423  		if math.IsNaN(float64(data[i])) {
  1424  			require.True(t, math.IsNaN(float64(outputRow["value"].(float32))))
  1425  		} else {
  1426  			require.Equal(t, data[i], outputRow["value"].(float32))
  1427  		}
  1428  	}
  1429  
  1430  	_, err = rd.NextRow()
  1431  	require.True(t, errors.Is(err, io.EOF))
  1432  }
  1433  
  1434  func TestWriteThenReadSetSchemaDefinition(t *testing.T) {
  1435  	var buf bytes.Buffer
  1436  
  1437  	wr := NewFileWriter(&buf)
  1438  
  1439  	sd, err := parquetschema.ParseSchemaDefinition(`message msg { required int64 foo; }`)
  1440  	require.NoError(t, err)
  1441  
  1442  	require.NoError(t, wr.SetSchemaDefinition(sd))
  1443  
  1444  	require.NoError(t, wr.AddData(map[string]interface{}{"foo": int64(23)}))
  1445  
  1446  	require.NoError(t, wr.Close())
  1447  
  1448  	require.Equal(t, sd.String(), wr.GetSchemaDefinition().String())
  1449  
  1450  	require.Equal(t, 1, len(wr.Columns()))
  1451  	require.Equal(t, parquet.TypePtr(parquet.Type_INT64), wr.GetColumnByName("foo").Type())
  1452  	require.Nil(t, wr.GetColumnByName("bar"))
  1453  	require.Nil(t, wr.GetColumnByPath(ColumnPath{"bar"}))
  1454  	require.NotNil(t, wr.GetColumnByPath(ColumnPath{"foo"}))
  1455  
  1456  	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
  1457  	require.NoError(t, err)
  1458  
  1459  	sd2 := r.GetSchemaDefinition()
  1460  
  1461  	require.Equal(t, sd.String(), sd2.String())
  1462  
  1463  	row, err := r.NextRow()
  1464  	require.NoError(t, err)
  1465  	require.Equal(t, map[string]interface{}{"foo": int64(23)}, row)
  1466  
  1467  	_, err = r.NextRow()
  1468  	require.True(t, errors.Is(err, io.EOF))
  1469  }
  1470  
// TestRepeatedInt32 writes and reads back a top-level repeated int32 field.
func TestRepeatedInt32(t *testing.T) {
	// this is here to somehow reproduce the issue discussed in https://github.com/fraugster/parquet-go/pull/8
	sd, err := parquetschema.ParseSchemaDefinition(`message msg {
		repeated int32 foo;
	}`)
	require.NoError(t, err)

	var buf bytes.Buffer
	fw := NewFileWriter(&buf, WithSchemaDefinition(sd))

	err = fw.AddData(map[string]interface{}{
		"foo": []int32{
			int32(23),
			int32(42),
			int32(9001),
		},
	})
	require.NoError(t, err)

	require.NoError(t, fw.Close())

	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	row, err := r.NextRow()
	require.NoError(t, err)

	// the repeated field must come back as the same []int32 slice that was
	// written. NOTE(review): the earlier comment here about "added nil, got
	// []byte{}" did not match this code and looked like a stale copy-paste
	// from another test.
	require.Equal(t, []int32{
		int32(23),
		int32(42),
		int32(9001),
	}, row["foo"])
}