github.com/parquet-go/parquet-go@v0.20.0/buffer_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"math"
    10  	"math/rand"
    11  	"reflect"
    12  	"sort"
    13  	"strconv"
    14  	"testing"
    15  
    16  	"github.com/parquet-go/parquet-go"
    17  	"github.com/parquet-go/parquet-go/encoding"
    18  )
    19  
// TestGenericBuffer runs the generic buffer round-trip property test against
// every row type exercised by the test suite, covering primitive columns,
// logical types, optional/repeated fields, and nested structures.
func TestGenericBuffer(t *testing.T) {
	testGenericBuffer[booleanColumn](t)
	testGenericBuffer[int32Column](t)
	testGenericBuffer[int64Column](t)
	testGenericBuffer[int96Column](t)
	testGenericBuffer[floatColumn](t)
	testGenericBuffer[doubleColumn](t)
	testGenericBuffer[byteArrayColumn](t)
	testGenericBuffer[fixedLenByteArrayColumn](t)
	testGenericBuffer[stringColumn](t)
	testGenericBuffer[indexedStringColumn](t)
	testGenericBuffer[uuidColumn](t)
	testGenericBuffer[timeColumn](t)
	testGenericBuffer[timeInMillisColumn](t)
	testGenericBuffer[mapColumn](t)
	testGenericBuffer[decimalColumn](t)
	testGenericBuffer[addressBook](t)
	testGenericBuffer[contact](t)
	testGenericBuffer[listColumn2](t)
	testGenericBuffer[listColumn1](t)
	testGenericBuffer[listColumn0](t)
	testGenericBuffer[nestedListColumn1](t)
	testGenericBuffer[nestedListColumn](t)
	testGenericBuffer[*contact](t)
	testGenericBuffer[paddedBooleanColumn](t)
	testGenericBuffer[optionalInt32Column](t)
	testGenericBuffer[repeatedInt32Column](t)
}
    48  
    49  func testGenericBuffer[Row any](t *testing.T) {
    50  	var model Row
    51  	t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {
    52  		err := quickCheck(func(rows []Row) bool {
    53  			if len(rows) == 0 {
    54  				return true // TODO: fix support for parquet files with zero rows
    55  			}
    56  			if err := testGenericBufferRows(rows); err != nil {
    57  				t.Error(err)
    58  				return false
    59  			}
    60  			return true
    61  		})
    62  		if err != nil {
    63  			t.Error(err)
    64  		}
    65  	})
    66  }
    67  
    68  func testGenericBufferRows[Row any](rows []Row) error {
    69  	setNullPointers(rows)
    70  	buffer := parquet.NewGenericBuffer[Row]()
    71  	_, err := buffer.Write(rows)
    72  	if err != nil {
    73  		return err
    74  	}
    75  	reader := parquet.NewGenericRowGroupReader[Row](buffer)
    76  	result := make([]Row, len(rows))
    77  	n, err := reader.Read(result)
    78  	if err != nil && !errors.Is(err, io.EOF) {
    79  		return err
    80  	}
    81  	if n < len(rows) {
    82  		return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n)
    83  	}
    84  	if !reflect.DeepEqual(rows, result) {
    85  		return fmt.Errorf("rows mismatch:\nwant: %#v\ngot:  %#v", rows, result)
    86  	}
    87  	return nil
    88  }
    89  
    90  func setNullPointers[Row any](rows []Row) {
    91  	if len(rows) > 0 && reflect.TypeOf(rows[0]).Kind() == reflect.Pointer {
    92  		for i := range rows {
    93  			v := reflect.ValueOf(&rows[i]).Elem()
    94  			if v.IsNil() {
    95  				v.Set(reflect.New(v.Type().Elem()))
    96  			}
    97  		}
    98  	}
    99  }
   100  
// generator is implemented by benchmark row types that can produce random
// instances of themselves from the given random source.
type generator[T any] interface {
	generate(*rand.Rand) T
}
   104  
// BenchmarkGenericBuffer measures buffer write throughput for every
// benchmarkable row type, comparing the reflection-based and generics-based
// implementations (see benchmarkGenericBuffer).
func BenchmarkGenericBuffer(b *testing.B) {
	benchmarkGenericBuffer[benchmarkRowType](b)
	benchmarkGenericBuffer[booleanColumn](b)
	benchmarkGenericBuffer[int32Column](b)
	benchmarkGenericBuffer[int64Column](b)
	benchmarkGenericBuffer[floatColumn](b)
	benchmarkGenericBuffer[doubleColumn](b)
	benchmarkGenericBuffer[byteArrayColumn](b)
	benchmarkGenericBuffer[fixedLenByteArrayColumn](b)
	benchmarkGenericBuffer[stringColumn](b)
	benchmarkGenericBuffer[indexedStringColumn](b)
	benchmarkGenericBuffer[uuidColumn](b)
	benchmarkGenericBuffer[timeColumn](b)
	benchmarkGenericBuffer[timeInMillisColumn](b)
	benchmarkGenericBuffer[mapColumn](b)
	benchmarkGenericBuffer[decimalColumn](b)
	benchmarkGenericBuffer[contact](b)
	benchmarkGenericBuffer[paddedBooleanColumn](b)
	benchmarkGenericBuffer[optionalInt32Column](b)
	benchmarkGenericBuffer[repeatedInt32Column](b)
}
   126  
   127  func benchmarkGenericBuffer[Row generator[Row]](b *testing.B) {
   128  	var model Row
   129  	b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {
   130  		prng := rand.New(rand.NewSource(0))
   131  		rows := make([]Row, benchmarkNumRows)
   132  		for i := range rows {
   133  			rows[i] = rows[i].generate(prng)
   134  		}
   135  
   136  		b.Run("go1.17", func(b *testing.B) {
   137  			buffer := parquet.NewBuffer(parquet.SchemaOf(rows[0]))
   138  			i := 0
   139  			benchmarkRowsPerSecond(b, func() int {
   140  				for j := 0; j < benchmarkRowsPerStep; j++ {
   141  					if err := buffer.Write(&rows[i]); err != nil {
   142  						b.Fatal(err)
   143  					}
   144  				}
   145  
   146  				i += benchmarkRowsPerStep
   147  				i %= benchmarkNumRows
   148  
   149  				if i == 0 {
   150  					buffer.Reset()
   151  				}
   152  				return benchmarkRowsPerStep
   153  			})
   154  		})
   155  
   156  		b.Run("go1.18", func(b *testing.B) {
   157  			buffer := parquet.NewGenericBuffer[Row]()
   158  			i := 0
   159  			benchmarkRowsPerSecond(b, func() int {
   160  				n, err := buffer.Write(rows[i : i+benchmarkRowsPerStep])
   161  				if err != nil {
   162  					b.Fatal(err)
   163  				}
   164  
   165  				i += benchmarkRowsPerStep
   166  				i %= benchmarkNumRows
   167  
   168  				if i == 0 {
   169  					buffer.Reset()
   170  				}
   171  				return n
   172  			})
   173  		})
   174  	})
   175  }
   176  
   177  func TestIssue327(t *testing.T) {
   178  	t.Run("untagged nested lists should panic", func(t *testing.T) {
   179  		type testType struct {
   180  			ListOfLists [][]int
   181  		}
   182  
   183  		defer func() {
   184  			if r := recover(); r == nil {
   185  				t.Errorf("Nested lists without the list tag should panic")
   186  			}
   187  		}()
   188  
   189  		_ = parquet.NewGenericBuffer[testType]()
   190  	})
   191  }
   192  
   193  func TestIssue346(t *testing.T) {
   194  	type TestType struct {
   195  		Key int
   196  	}
   197  
   198  	schema := parquet.SchemaOf(TestType{})
   199  	buffer := parquet.NewGenericBuffer[any](schema)
   200  
   201  	data := make([]any, 1)
   202  	data[0] = TestType{Key: 0}
   203  	_, _ = buffer.Write(data)
   204  }
   205  
   206  func TestIssue347(t *testing.T) {
   207  	type TestType struct {
   208  		Key int
   209  	}
   210  
   211  	// instantiating with concrete type shouldn't panic
   212  	_ = parquet.NewGenericBuffer[TestType]()
   213  
   214  	// instantiating with schema and interface type parameter shouldn't panic
   215  	schema := parquet.SchemaOf(TestType{})
   216  	_ = parquet.NewGenericBuffer[any](schema)
   217  
   218  	defer func() {
   219  		if r := recover(); r == nil {
   220  			t.Errorf("instantiating generic buffer without schema and with interface " +
   221  				"type parameter should panic")
   222  		}
   223  	}()
   224  	_ = parquet.NewGenericBuffer[any]()
   225  }
   226  
   227  func BenchmarkSortGenericBuffer(b *testing.B) {
   228  	type Row struct {
   229  		I0 int64
   230  		I1 int64
   231  		I2 int64
   232  		I3 int64
   233  		I4 int64
   234  		I5 int64
   235  		I6 int64
   236  		I7 int64
   237  		I8 int64
   238  		I9 int64
   239  		ID [16]byte
   240  	}
   241  
   242  	buf := parquet.NewGenericBuffer[Row](
   243  		parquet.SortingRowGroupConfig(
   244  			parquet.SortingColumns(
   245  				parquet.Ascending("ID"),
   246  			),
   247  		),
   248  	)
   249  
   250  	rows := make([]Row, 10e3)
   251  	prng := rand.New(rand.NewSource(0))
   252  
   253  	for i := range rows {
   254  		binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i))
   255  		binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i))
   256  	}
   257  
   258  	buf.Write(rows)
   259  	b.ResetTimer()
   260  
   261  	for i := 0; i < b.N; i++ {
   262  		for j := 0; j < 10; j++ {
   263  			buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows)))
   264  		}
   265  
   266  		sort.Sort(buf)
   267  	}
   268  }
   269  
// bufferTests lists the scenarios exercised by TestBuffer: one entry per
// physical/logical type, each with value sets of increasing size (empty,
// single value, and a batch that includes boundary values for the type).
var bufferTests = [...]struct {
	scenario string
	typ      parquet.Type
	values   [][]interface{}
}{
	{
		scenario: "boolean",
		typ:      parquet.BooleanType,
		values: [][]interface{}{
			{},
			{false},
			{true},
			{
				false, true, false, false, true, true,
				false, false, false, true, false, true,
			},
		},
	},

	{
		scenario: "int32",
		typ:      parquet.Int32Type,
		values: [][]interface{}{
			{},
			{int32(0)},
			{int32(1)},
			{
				int32(1), int32(2), int32(3), int32(4), int32(5), int32(6),
				int32(math.MaxInt8), int32(math.MaxInt16), int32(math.MaxInt32),
				int32(7), int32(9), int32(9), int32(0),
			},
		},
	},

	{
		scenario: "int64",
		typ:      parquet.Int64Type,
		values: [][]interface{}{
			{},
			{int64(0)},
			{int64(1)},
			{
				int64(1), int64(2), int64(3), int64(4), int64(5), int64(6),
				int64(math.MaxInt8), int64(math.MaxInt16), int64(math.MaxInt64), int64(7),
				int64(9), int64(9), int64(0),
			},
		},
	},

	{
		scenario: "float",
		typ:      parquet.FloatType,
		values: [][]interface{}{
			{},
			{float32(0)},
			{float32(1)},
			{
				float32(1), float32(2), float32(3), float32(4), float32(5), float32(6),
				float32(0.5), float32(math.SmallestNonzeroFloat32), float32(math.MaxFloat32), float32(7),
				float32(9), float32(9), float32(0),
			},
		},
	},

	{
		scenario: "double",
		typ:      parquet.DoubleType,
		values: [][]interface{}{
			{},
			{float64(0)},
			{float64(1)},
			{
				float64(1), float64(2), float64(3), float64(4), float64(5), float64(6),
				float64(0.5), float64(math.SmallestNonzeroFloat64), float64(math.MaxFloat64), float64(7),
				float64(9), float64(9), float64(0),
			},
		},
	},

	{
		scenario: "string",
		typ:      parquet.ByteArrayType,
		values: [][]interface{}{
			{},
			{""},
			{"Hello World!"},
			{
				"ABCDEFG", "HIJKLMN", "OPQRSTU", "VWXZY01", "2345678",
				"90!@#$%", "^&*()_+", "Hello World!", "Answer=42", "ABCEDFG",
				"HIJKLMN", "OPQRSTU", "VWXYZ",
			},
		},
	},

	{
		scenario: "fixed length byte array",
		typ:      parquet.FixedLenByteArrayType(10),
		values: [][]interface{}{
			{},
			{[10]byte{}},
			{[10]byte{0: 1}},
			{
				[10]byte{0: 0}, [10]byte{0: 2}, [10]byte{0: 1}, [10]byte{0: 4}, [10]byte{0: 3},
				[10]byte{0: 6}, [10]byte{0: 5}, [10]byte{0: 8}, [10]byte{0: 7}, [10]byte{0: 10},
				[10]byte{0: 11}, [10]byte{0: 12}, [10]byte{9: 0xFF},
			},
		},
	},

	{
		scenario: "uuid",
		typ:      parquet.UUID().Type(),
		values: [][]interface{}{
			{},
			{[16]byte{}},
			{[16]byte{0: 1}},
			{
				[16]byte{0: 0}, [16]byte{0: 2}, [16]byte{0: 1}, [16]byte{0: 4}, [16]byte{0: 3},
				[16]byte{0: 6}, [16]byte{0: 5}, [16]byte{0: 8}, [16]byte{0: 7}, [16]byte{0: 10},
				[16]byte{0: 11}, [16]byte{0: 12}, [16]byte{15: 0xFF},
			},
		},
	},

	{
		scenario: "uint32",
		typ:      parquet.Uint(32).Type(),
		values: [][]interface{}{
			{},
			{uint32(0)},
			{uint32(1)},
			{
				uint32(1), uint32(2), uint32(3), uint32(4), uint32(5), uint32(6),
				uint32(math.MaxInt8), uint32(math.MaxInt16), uint32(math.MaxUint32), uint32(7),
				uint32(9), uint32(9), uint32(0),
			},
		},
	},

	{
		scenario: "uint64",
		typ:      parquet.Uint(64).Type(),
		values: [][]interface{}{
			{},
			{uint64(0)},
			{uint64(1)},
			{
				uint64(1), uint64(2), uint64(3), uint64(4), uint64(5), uint64(6),
				uint64(math.MaxInt8), uint64(math.MaxInt16), uint64(math.MaxUint64),
				uint64(7), uint64(9), uint64(9), uint64(0),
			},
		},
	},
}
   424  
// TestBuffer runs testBuffer over the cross product of: every scenario in
// bufferTests, plain vs dictionary-indexed column types, optional/repeated/
// required repetition, and unordered/ascending/descending sorting orders.
func TestBuffer(t *testing.T) {
	for _, test := range bufferTests {
		t.Run(test.scenario, func(t *testing.T) {
			for _, config := range [...]struct {
				scenario string
				typ      parquet.Type
			}{
				{scenario: "plain", typ: test.typ},
				{scenario: "indexed", typ: test.typ.NewDictionary(0, 0, test.typ.NewValues(nil, nil)).Type()},
			} {
				t.Run(config.scenario, func(t *testing.T) {
					for _, mod := range [...]struct {
						scenario string
						function func(parquet.Node) parquet.Node
					}{
						{scenario: "optional", function: parquet.Optional},
						{scenario: "repeated", function: parquet.Repeated},
						{scenario: "required", function: parquet.Required},
					} {
						t.Run(mod.scenario, func(t *testing.T) {
							for _, ordering := range [...]struct {
								scenario string
								sorting  parquet.SortingColumn
								sortFunc func(parquet.Type, []parquet.Value)
							}{
								{scenario: "unordered", sorting: nil, sortFunc: unordered},
								{scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending},
								{scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending},
							} {
								t.Run(ordering.scenario, func(t *testing.T) {
									// Single-column schema named "data" with the
									// repetition modifier under test applied.
									schema := parquet.NewSchema("test", parquet.Group{
										"data": mod.function(parquet.Leaf(config.typ)),
									})

									options := []parquet.RowGroupOption{
										schema,
										parquet.ColumnBufferCapacity(100),
									}
									if ordering.sorting != nil {
										options = append(options,
											parquet.SortingRowGroupConfig(
												parquet.SortingColumns(ordering.sorting),
											),
										)
									}

									// NOTE(review): content is reset after each
									// sub-test but never written to; it appears
									// vestigial.
									content := new(bytes.Buffer)
									buffer := parquet.NewBuffer(options...)

									// The buffer is reused across value sets and
									// reset between them.
									for _, values := range test.values {
										t.Run("", func(t *testing.T) {
											defer content.Reset()
											defer buffer.Reset()
											fields := schema.Fields()
											testBuffer(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc)
										})
									}
								})
							}
						})
					}
				})
			}
		})
	}
}
   491  
// sortFunc sorts a slice of values using the ordering defined by the given
// parquet type; implementations mirror the sorting applied to the buffer so
// expected and actual orders can be compared.
type sortFunc func(parquet.Type, []parquet.Value)
   493  
// unordered is a no-op sortFunc used for the "unordered" test scenario.
func unordered(typ parquet.Type, values []parquet.Value) {}
   495  
   496  func ascending(typ parquet.Type, values []parquet.Value) {
   497  	sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) < 0 })
   498  }
   499  
   500  func descending(typ parquet.Type, values []parquet.Value) {
   501  	sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) > 0 })
   502  }
   503  
   504  func testBuffer(t *testing.T, node parquet.Node, buffer *parquet.Buffer, encoding encoding.Encoding, values []interface{}, sortFunc sortFunc) {
   505  	repetitionLevel := 0
   506  	definitionLevel := 0
   507  	if !node.Required() {
   508  		definitionLevel = 1
   509  	}
   510  
   511  	minValue := parquet.Value{}
   512  	maxValue := parquet.Value{}
   513  	batch := make([]parquet.Value, len(values))
   514  	for i := range values {
   515  		batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0)
   516  	}
   517  
   518  	for i := range batch {
   519  		_, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]})
   520  		if err != nil {
   521  			t.Fatalf("writing value to row group: %v", err)
   522  		}
   523  	}
   524  
   525  	numRows := buffer.NumRows()
   526  	if numRows != int64(len(batch)) {
   527  		t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows)
   528  	}
   529  
   530  	typ := node.Type()
   531  	for _, value := range batch {
   532  		if minValue.IsNull() || typ.Compare(value, minValue) < 0 {
   533  			minValue = value
   534  		}
   535  		if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 {
   536  			maxValue = value
   537  		}
   538  	}
   539  
   540  	sortFunc(typ, batch)
   541  	sort.Sort(buffer)
   542  
   543  	page := buffer.ColumnBuffers()[0].Page()
   544  	numValues := page.NumValues()
   545  	if numValues != int64(len(batch)) {
   546  		t.Fatalf("number of values mistmatch: want=%d got=%d", len(batch), numValues)
   547  	}
   548  
   549  	numNulls := page.NumNulls()
   550  	if numNulls != 0 {
   551  		t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls)
   552  	}
   553  
   554  	min, max, hasBounds := page.Bounds()
   555  	if !hasBounds && numRows > 0 {
   556  		t.Fatal("page bounds are missing")
   557  	}
   558  	if !parquet.Equal(min, minValue) {
   559  		t.Fatalf("min value mismatch: want=%v got=%v", minValue, min)
   560  	}
   561  	if !parquet.Equal(max, maxValue) {
   562  		t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max)
   563  	}
   564  
   565  	// We write a single value per row, so num values = num rows for all pages
   566  	// including repeated ones, which makes it OK to slice the pages using the
   567  	// number of values as a proxy for the row indexes.
   568  	halfValues := numValues / 2
   569  
   570  	for _, test := range [...]struct {
   571  		scenario string
   572  		values   []parquet.Value
   573  		reader   parquet.ValueReader
   574  	}{
   575  		{"page", batch, page.Values()},
   576  		{"head", batch[:halfValues], page.Slice(0, halfValues).Values()},
   577  		{"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()},
   578  	} {
   579  		v := [1]parquet.Value{}
   580  		i := 0
   581  
   582  		for {
   583  			n, err := test.reader.ReadValues(v[:])
   584  			if n > 0 {
   585  				if n != 1 {
   586  					t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n)
   587  				}
   588  				if i < len(test.values) {
   589  					if !parquet.Equal(v[0], test.values[i]) {
   590  						t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0])
   591  					}
   592  				}
   593  				i++
   594  			}
   595  			if err != nil {
   596  				if err == io.EOF {
   597  					break
   598  				}
   599  				t.Fatalf("reading value from %q reader: %v", test.scenario, err)
   600  			}
   601  		}
   602  
   603  		if i != len(test.values) {
   604  			t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i)
   605  		}
   606  	}
   607  }
   608  
   609  func TestBufferGenerateBloomFilters(t *testing.T) {
   610  	type Point3D struct {
   611  		X float64
   612  		Y float64
   613  		Z float64
   614  	}
   615  
   616  	f := func(rows []Point3D) bool {
   617  		if len(rows) == 0 { // TODO: support writing files with no rows
   618  			return true
   619  		}
   620  
   621  		output := new(bytes.Buffer)
   622  		buffer := parquet.NewBuffer()
   623  		writer := parquet.NewWriter(output,
   624  			parquet.BloomFilters(
   625  				parquet.SplitBlockFilter(10, "X"),
   626  				parquet.SplitBlockFilter(10, "Y"),
   627  				parquet.SplitBlockFilter(10, "Z"),
   628  			),
   629  		)
   630  		for i := range rows {
   631  			buffer.Write(&rows[i])
   632  		}
   633  		_, err := copyRowsAndClose(writer, buffer.Rows())
   634  		if err != nil {
   635  			t.Error(err)
   636  			return false
   637  		}
   638  		if err := writer.Close(); err != nil {
   639  			t.Error(err)
   640  			return false
   641  		}
   642  
   643  		reader := bytes.NewReader(output.Bytes())
   644  		f, err := parquet.OpenFile(reader, reader.Size())
   645  		if err != nil {
   646  			t.Error(err)
   647  			return false
   648  		}
   649  		rowGroup := f.RowGroups()[0]
   650  		columns := rowGroup.ColumnChunks()
   651  		x := columns[0]
   652  		y := columns[1]
   653  		z := columns[2]
   654  
   655  		for i, col := range []parquet.ColumnChunk{x, y, z} {
   656  			if col.BloomFilter() == nil {
   657  				t.Errorf("column %d has no bloom filter despite being configured to have one", i)
   658  				return false
   659  			}
   660  		}
   661  
   662  		fx := x.BloomFilter()
   663  		fy := y.BloomFilter()
   664  		fz := z.BloomFilter()
   665  
   666  		test := func(f parquet.BloomFilter, v float64) bool {
   667  			if ok, err := f.Check(parquet.ValueOf(v)); err != nil {
   668  				t.Errorf("unexpected error checking bloom filter: %v", err)
   669  				return false
   670  			} else if !ok {
   671  				t.Errorf("bloom filter does not contain value %g", v)
   672  				return false
   673  			}
   674  			return true
   675  		}
   676  
   677  		for _, row := range rows {
   678  			if !test(fx, row.X) || !test(fy, row.Y) || !test(fz, row.Z) {
   679  				return false
   680  			}
   681  		}
   682  
   683  		return true
   684  	}
   685  
   686  	if err := quickCheck(f); err != nil {
   687  		t.Error(err)
   688  	}
   689  }
   690  
   691  func TestBufferRoundtripNestedRepeated(t *testing.T) {
   692  	type C struct {
   693  		D int
   694  	}
   695  	type B struct {
   696  		C []C
   697  	}
   698  	type A struct {
   699  		B []B
   700  	}
   701  
   702  	// Write enough objects to exceed first page
   703  	buffer := parquet.NewBuffer()
   704  	var objs []A
   705  	for i := 0; i < 6; i++ {
   706  		o := A{[]B{{[]C{
   707  			{i},
   708  			{i},
   709  		}}}}
   710  		buffer.Write(&o)
   711  		objs = append(objs, o)
   712  	}
   713  
   714  	buf := new(bytes.Buffer)
   715  	w := parquet.NewWriter(buf, parquet.PageBufferSize(100))
   716  	w.WriteRowGroup(buffer)
   717  	w.Flush()
   718  	w.Close()
   719  
   720  	file := bytes.NewReader(buf.Bytes())
   721  	r := parquet.NewReader(file)
   722  	for i := 0; ; i++ {
   723  		o := new(A)
   724  		err := r.Read(o)
   725  		if errors.Is(err, io.EOF) {
   726  			if i < len(objs) {
   727  				t.Errorf("too few rows were read: %d<%d", i, len(objs))
   728  			}
   729  			break
   730  		}
   731  		if !reflect.DeepEqual(*o, objs[i]) {
   732  			t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o)
   733  		}
   734  	}
   735  }
   736  
   737  func TestBufferRoundtripNestedRepeatedPointer(t *testing.T) {
   738  	type C struct {
   739  		D *int
   740  	}
   741  	type B struct {
   742  		C []C
   743  	}
   744  	type A struct {
   745  		B []B
   746  	}
   747  
   748  	// Write enough objects to exceed first page
   749  	buffer := parquet.NewBuffer()
   750  	var objs []A
   751  	for i := 0; i < 6; i++ {
   752  		j := i
   753  		o := A{[]B{{[]C{
   754  			{&j},
   755  			{nil},
   756  		}}}}
   757  		buffer.Write(&o)
   758  		objs = append(objs, o)
   759  	}
   760  
   761  	buf := new(bytes.Buffer)
   762  	w := parquet.NewWriter(buf, parquet.PageBufferSize(100))
   763  	w.WriteRowGroup(buffer)
   764  	w.Flush()
   765  	w.Close()
   766  
   767  	file := bytes.NewReader(buf.Bytes())
   768  	r := parquet.NewReader(file)
   769  	for i := 0; ; i++ {
   770  		o := new(A)
   771  		err := r.Read(o)
   772  		if err == io.EOF {
   773  			break
   774  		}
   775  		if !reflect.DeepEqual(*o, objs[i]) {
   776  			t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o)
   777  		}
   778  	}
   779  }
   780  
   781  func TestRoundtripNestedRepeatedBytes(t *testing.T) {
   782  	type B struct {
   783  		C []byte
   784  	}
   785  	type A struct {
   786  		A string
   787  		B []B
   788  	}
   789  
   790  	var objs []A
   791  	for i := 0; i < 2; i++ {
   792  		o := A{
   793  			"test" + strconv.Itoa(i),
   794  			[]B{
   795  				{[]byte{byte(i)}},
   796  			},
   797  		}
   798  		objs = append(objs, o)
   799  	}
   800  
   801  	buf := new(bytes.Buffer)
   802  	w := parquet.NewWriter(buf, parquet.PageBufferSize(100))
   803  	for _, o := range objs {
   804  		w.Write(&o)
   805  	}
   806  	w.Close()
   807  
   808  	file := bytes.NewReader(buf.Bytes())
   809  
   810  	r := parquet.NewReader(file)
   811  	for i := 0; ; i++ {
   812  		o := new(A)
   813  		err := r.Read(o)
   814  		if errors.Is(err, io.EOF) {
   815  			if i < len(objs) {
   816  				t.Errorf("too few rows were read: %d<%d", i, len(objs))
   817  			}
   818  			break
   819  		}
   820  		if !reflect.DeepEqual(*o, objs[i]) {
   821  			t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o)
   822  		}
   823  	}
   824  }
   825  
   826  func TestBufferSeekToRow(t *testing.T) {
   827  	type B struct {
   828  		I int
   829  		C []string
   830  	}
   831  	type A struct {
   832  		B []B
   833  	}
   834  
   835  	buffer := parquet.NewBuffer()
   836  	var objs []A
   837  	for i := 0; i < 2; i++ {
   838  		o := A{
   839  			B: []B{
   840  				{I: i, C: []string{"foo", strconv.Itoa(i)}},
   841  				{I: i + 1, C: []string{"bar", strconv.Itoa(i + 1)}},
   842  			},
   843  		}
   844  		buffer.Write(&o)
   845  		objs = append(objs, o)
   846  	}
   847  
   848  	buf := new(bytes.Buffer)
   849  	w := parquet.NewWriter(buf)
   850  	w.WriteRowGroup(buffer)
   851  	w.Flush()
   852  	w.Close()
   853  
   854  	file := bytes.NewReader(buf.Bytes())
   855  	r := parquet.NewReader(file)
   856  
   857  	i := 1
   858  	o := new(A)
   859  	if err := r.SeekToRow(int64(i)); err != nil {
   860  		t.Fatal(err)
   861  	}
   862  	if err := r.Read(o); err != nil {
   863  		t.Fatal(err)
   864  	}
   865  	if !reflect.DeepEqual(*o, objs[i]) {
   866  		t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o)
   867  	}
   868  }
   869  
// TestStruct is a shared fixture with a single optional, dictionary-encoded
// string column, used by TestOptionalDictWriteRowGroup and TestNullsSortFirst.
type TestStruct struct {
	A *string `parquet:"a,optional,dict"`
}
   873  
   874  func TestOptionalDictWriteRowGroup(t *testing.T) {
   875  	s := parquet.SchemaOf(&TestStruct{})
   876  
   877  	str1 := "test1"
   878  	str2 := "test2"
   879  	records := []*TestStruct{
   880  		{A: nil},
   881  		{A: &str1},
   882  		{A: nil},
   883  		{A: &str2},
   884  		{A: nil},
   885  	}
   886  
   887  	buf := parquet.NewBuffer(s)
   888  	for _, rec := range records {
   889  		row := s.Deconstruct(nil, rec)
   890  		_, err := buf.WriteRows([]parquet.Row{row})
   891  		if err != nil {
   892  			t.Fatal(err)
   893  		}
   894  	}
   895  
   896  	b := bytes.NewBuffer(nil)
   897  	w := parquet.NewWriter(b)
   898  	_, err := w.WriteRowGroup(buf)
   899  	if err != nil {
   900  		t.Fatal(err)
   901  	}
   902  }
   903  
   904  func TestNullsSortFirst(t *testing.T) {
   905  	s := parquet.SchemaOf(&TestStruct{})
   906  
   907  	str1 := "test1"
   908  	str2 := "test2"
   909  	records := []*TestStruct{
   910  		{A: &str1},
   911  		{A: nil},
   912  		{A: &str2},
   913  	}
   914  	buf := parquet.NewBuffer(
   915  		s,
   916  		parquet.SortingRowGroupConfig(parquet.SortingColumns(parquet.NullsFirst(parquet.Ascending(s.Columns()[0][0])))),
   917  	)
   918  	for _, rec := range records {
   919  		row := s.Deconstruct(nil, rec)
   920  		_, err := buf.WriteRows([]parquet.Row{row})
   921  		if err != nil {
   922  			t.Fatal(err)
   923  		}
   924  	}
   925  
   926  	sort.Sort(buf)
   927  
   928  	rows := buf.Rows()
   929  	defer rows.Close()
   930  	rowBuf := make([]parquet.Row, len(records))
   931  	if _, err := rows.ReadRows(rowBuf); err != nil {
   932  		t.Fatal(err)
   933  	}
   934  
   935  	resultRecords := make([]TestStruct, len(records))
   936  	for i, r := range rowBuf {
   937  		if err := s.Reconstruct(&resultRecords[i], r); err != nil {
   938  			t.Fatal(err)
   939  		}
   940  	}
   941  
   942  	if resultRecords[0].A != nil {
   943  		t.Fatal("expected null to sort first, but found", resultRecords)
   944  	}
   945  }
   946  
   947  func generateBenchmarkBufferRows(n int) (*parquet.Schema, []parquet.Row) {
   948  	model := new(benchmarkRowType)
   949  	schema := parquet.SchemaOf(model)
   950  	prng := rand.New(rand.NewSource(0))
   951  	rows := make([]parquet.Row, n)
   952  
   953  	for i := range rows {
   954  		io.ReadFull(prng, model.ID[:])
   955  		model.Value = prng.Float64()
   956  		rows[i] = make(parquet.Row, 0, 2)
   957  		rows[i] = schema.Deconstruct(rows[i], model)
   958  	}
   959  
   960  	return schema, rows
   961  }
   962  
   963  func BenchmarkBufferReadRows100x(b *testing.B) {
   964  	schema, rows := generateBenchmarkBufferRows(benchmarkNumRows)
   965  	buffer := parquet.NewBuffer(schema)
   966  
   967  	for i := 0; i < len(rows); i += benchmarkRowsPerStep {
   968  		j := i + benchmarkRowsPerStep
   969  		if _, err := buffer.WriteRows(rows[i:j]); err != nil {
   970  			b.Fatal(err)
   971  		}
   972  	}
   973  
   974  	bufferRows := buffer.Rows()
   975  	defer bufferRows.Close()
   976  
   977  	benchmarkRowsPerSecond(b, func() int {
   978  		n, err := bufferRows.ReadRows(rows[:benchmarkRowsPerStep])
   979  		if err != nil {
   980  			if errors.Is(err, io.EOF) {
   981  				err = bufferRows.SeekToRow(0)
   982  			}
   983  			if err != nil {
   984  				b.Fatal(err)
   985  			}
   986  		}
   987  		return n
   988  	})
   989  }
   990  
   991  func BenchmarkBufferWriteRows100x(b *testing.B) {
   992  	schema, rows := generateBenchmarkBufferRows(benchmarkNumRows)
   993  	buffer := parquet.NewBuffer(schema)
   994  
   995  	i := 0
   996  	benchmarkRowsPerSecond(b, func() int {
   997  		n, err := buffer.WriteRows(rows[i : i+benchmarkRowsPerStep])
   998  		if err != nil {
   999  			b.Fatal(err)
  1000  		}
  1001  
  1002  		i += benchmarkRowsPerStep
  1003  		i %= benchmarkNumRows
  1004  
  1005  		if i == 0 {
  1006  			buffer.Reset()
  1007  		}
  1008  		return n
  1009  	})
  1010  }