github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/row_buffer_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"math/rand"
    10  	"reflect"
    11  	"sort"
    12  	"testing"
    13  
    14  	"github.com/parquet-go/parquet-go"
    15  	"github.com/parquet-go/parquet-go/encoding"
    16  )
    17  
    18  func TestRowBuffer(t *testing.T) {
    19  	testRowBuffer[booleanColumn](t)
    20  	testRowBuffer[int32Column](t)
    21  	testRowBuffer[int64Column](t)
    22  	testRowBuffer[int96Column](t)
    23  	testRowBuffer[floatColumn](t)
    24  	testRowBuffer[doubleColumn](t)
    25  	testRowBuffer[byteArrayColumn](t)
    26  	testRowBuffer[fixedLenByteArrayColumn](t)
    27  	testRowBuffer[stringColumn](t)
    28  	testRowBuffer[indexedStringColumn](t)
    29  	testRowBuffer[uuidColumn](t)
    30  	testRowBuffer[timeColumn](t)
    31  	testRowBuffer[timeInMillisColumn](t)
    32  	testRowBuffer[mapColumn](t)
    33  	testRowBuffer[decimalColumn](t)
    34  	testRowBuffer[addressBook](t)
    35  	testRowBuffer[contact](t)
    36  	testRowBuffer[listColumn2](t)
    37  	testRowBuffer[listColumn1](t)
    38  	testRowBuffer[listColumn0](t)
    39  	testRowBuffer[nestedListColumn1](t)
    40  	testRowBuffer[nestedListColumn](t)
    41  	testRowBuffer[*contact](t)
    42  	testRowBuffer[paddedBooleanColumn](t)
    43  	testRowBuffer[optionalInt32Column](t)
    44  	testRowBuffer[repeatedInt32Column](t)
    45  
    46  	for _, test := range bufferTests {
    47  		t.Run(test.scenario, func(t *testing.T) {
    48  			for _, mod := range [...]struct {
    49  				scenario string
    50  				function func(parquet.Node) parquet.Node
    51  			}{
    52  				{scenario: "optional", function: parquet.Optional},
    53  				{scenario: "repeated", function: parquet.Repeated},
    54  				{scenario: "required", function: parquet.Required},
    55  			} {
    56  				t.Run(mod.scenario, func(t *testing.T) {
    57  					for _, ordering := range [...]struct {
    58  						scenario string
    59  						sorting  parquet.SortingColumn
    60  						sortFunc func(parquet.Type, []parquet.Value)
    61  					}{
    62  						{scenario: "unordered", sorting: nil, sortFunc: unordered},
    63  						{scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending},
    64  						{scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending},
    65  					} {
    66  						t.Run(ordering.scenario, func(t *testing.T) {
    67  							schema := parquet.NewSchema("test", parquet.Group{
    68  								"data": mod.function(parquet.Leaf(test.typ)),
    69  							})
    70  
    71  							options := []parquet.RowGroupOption{
    72  								schema,
    73  							}
    74  
    75  							if ordering.sorting != nil {
    76  								options = append(options,
    77  									parquet.SortingRowGroupConfig(
    78  										parquet.SortingColumns(ordering.sorting),
    79  									),
    80  								)
    81  							}
    82  
    83  							content := new(bytes.Buffer)
    84  							buffer := parquet.NewRowBuffer[any](options...)
    85  
    86  							for _, values := range test.values {
    87  								t.Run("", func(t *testing.T) {
    88  									defer content.Reset()
    89  									defer buffer.Reset()
    90  									fields := schema.Fields()
    91  									testRowBufferAny(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc)
    92  								})
    93  							}
    94  						})
    95  					}
    96  				})
    97  			}
    98  		})
    99  	}
   100  }
   101  
   102  func testRowBuffer[Row any](t *testing.T) {
   103  	var model Row
   104  	t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {
   105  		err := quickCheck(func(rows []Row) bool {
   106  			if len(rows) == 0 {
   107  				return true // TODO: fix support for parquet files with zero rows
   108  			}
   109  			if err := testRowBufferRows(rows); err != nil {
   110  				t.Error(err)
   111  				return false
   112  			}
   113  			return true
   114  		})
   115  		if err != nil {
   116  			t.Error(err)
   117  		}
   118  	})
   119  }
   120  
   121  func testRowBufferRows[Row any](rows []Row) error {
   122  	setNullPointers(rows)
   123  	buffer := parquet.NewRowBuffer[Row]()
   124  	_, err := buffer.Write(rows)
   125  	if err != nil {
   126  		return err
   127  	}
   128  	reader := parquet.NewGenericRowGroupReader[Row](buffer)
   129  	result := make([]Row, len(rows))
   130  	n, err := reader.Read(result)
   131  	if err != nil && !errors.Is(err, io.EOF) {
   132  		return err
   133  	}
   134  	if n < len(rows) {
   135  		return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n)
   136  	}
   137  	if !reflect.DeepEqual(rows, result) {
   138  		return fmt.Errorf("rows mismatch:\nwant: %#v\ngot:  %#v", rows, result)
   139  	}
   140  	return nil
   141  }
   142  
   143  func testRowBufferAny(t *testing.T, node parquet.Node, buffer *parquet.RowBuffer[any], encoding encoding.Encoding, values []any, sortFunc sortFunc) {
   144  	repetitionLevel := 0
   145  	definitionLevel := 0
   146  	if !node.Required() {
   147  		definitionLevel = 1
   148  	}
   149  
   150  	minValue := parquet.Value{}
   151  	maxValue := parquet.Value{}
   152  	batch := make([]parquet.Value, len(values))
   153  	for i := range values {
   154  		batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0)
   155  	}
   156  
   157  	for i := range batch {
   158  		_, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]})
   159  		if err != nil {
   160  			t.Fatalf("writing value to row group: %v", err)
   161  		}
   162  	}
   163  
   164  	numRows := buffer.NumRows()
   165  	if numRows != int64(len(batch)) {
   166  		t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows)
   167  	}
   168  
   169  	typ := node.Type()
   170  	for _, value := range batch {
   171  		if minValue.IsNull() || typ.Compare(value, minValue) < 0 {
   172  			minValue = value
   173  		}
   174  		if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 {
   175  			maxValue = value
   176  		}
   177  	}
   178  
   179  	sortFunc(typ, batch)
   180  	sort.Sort(buffer)
   181  
   182  	pages := buffer.ColumnChunks()[0].Pages()
   183  	page, err := pages.ReadPage()
   184  	defer pages.Close()
   185  
   186  	if err == io.EOF {
   187  		if numRows != 0 {
   188  			t.Fatalf("no pages found in row buffer despite having %d rows", numRows)
   189  		} else {
   190  			return
   191  		}
   192  	}
   193  
   194  	numValues := page.NumValues()
   195  	if numValues != int64(len(batch)) {
   196  		t.Fatalf("number of values mistmatch: want=%d got=%d", len(batch), numValues)
   197  	}
   198  
   199  	numNulls := page.NumNulls()
   200  	if numNulls != 0 {
   201  		t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls)
   202  	}
   203  
   204  	min, max, hasBounds := page.Bounds()
   205  	if !hasBounds && numRows > 0 {
   206  		t.Fatal("page bounds are missing")
   207  	}
   208  	if !parquet.Equal(min, minValue) {
   209  		t.Fatalf("min value mismatch: want=%v got=%v", minValue, min)
   210  	}
   211  	if !parquet.Equal(max, maxValue) {
   212  		t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max)
   213  	}
   214  
   215  	// We write a single value per row, so num values = num rows for all pages
   216  	// including repeated ones, which makes it OK to slice the pages using the
   217  	// number of values as a proxy for the row indexes.
   218  	halfValues := numValues / 2
   219  
   220  	for _, test := range [...]struct {
   221  		scenario string
   222  		values   []parquet.Value
   223  		reader   parquet.ValueReader
   224  	}{
   225  		{"page", batch, page.Values()},
   226  		{"head", batch[:halfValues], page.Slice(0, halfValues).Values()},
   227  		{"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()},
   228  	} {
   229  		v := [1]parquet.Value{}
   230  		i := 0
   231  
   232  		for {
   233  			n, err := test.reader.ReadValues(v[:])
   234  			if n > 0 {
   235  				if n != 1 {
   236  					t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n)
   237  				}
   238  				if i < len(test.values) {
   239  					if !parquet.Equal(v[0], test.values[i]) {
   240  						t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0])
   241  					}
   242  				}
   243  				i++
   244  			}
   245  			if err != nil {
   246  				if err == io.EOF {
   247  					break
   248  				}
   249  				t.Fatalf("reading value from %q reader: %v", test.scenario, err)
   250  			}
   251  		}
   252  
   253  		if i != len(test.values) {
   254  			t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i)
   255  		}
   256  	}
   257  }
   258  
   259  func BenchmarkSortRowBuffer(b *testing.B) {
   260  	type Row struct {
   261  		I0 int64
   262  		I1 int64
   263  		I2 int64
   264  		I3 int64
   265  		I4 int64
   266  		I5 int64
   267  		I6 int64
   268  		I7 int64
   269  		I8 int64
   270  		I9 int64
   271  		ID [16]byte
   272  	}
   273  
   274  	buf := parquet.NewRowBuffer[Row](
   275  		parquet.SortingRowGroupConfig(
   276  			parquet.SortingColumns(
   277  				parquet.Ascending("ID"),
   278  			),
   279  		),
   280  	)
   281  
   282  	rows := make([]Row, 10e3)
   283  	prng := rand.New(rand.NewSource(0))
   284  
   285  	for i := range rows {
   286  		binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i))
   287  		binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i))
   288  	}
   289  
   290  	buf.Write(rows)
   291  	b.ResetTimer()
   292  
   293  	for i := 0; i < b.N; i++ {
   294  		for j := 0; j < 10; j++ {
   295  			buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows)))
   296  		}
   297  
   298  		sort.Sort(buf)
   299  	}
   300  }
   301  
   302  func BenchmarkMergeRowBuffers(b *testing.B) {
   303  	type Row struct {
   304  		ID int64 `parquet:"id"`
   305  	}
   306  
   307  	const (
   308  		numBuffers       = 100
   309  		numRowsPerBuffer = 10e3
   310  	)
   311  
   312  	rows := [numBuffers][numRowsPerBuffer]Row{}
   313  	nextID := int64(0)
   314  	for i := 0; i < numRowsPerBuffer; i++ {
   315  		for j := 0; j < numBuffers; j++ {
   316  			rows[j][i].ID = nextID
   317  			nextID++
   318  		}
   319  	}
   320  
   321  	options := []parquet.RowGroupOption{
   322  		parquet.SortingRowGroupConfig(
   323  			parquet.SortingColumns(
   324  				parquet.Ascending("id"),
   325  			),
   326  		),
   327  	}
   328  
   329  	rowGroups := make([]parquet.RowGroup, numBuffers)
   330  	for i := range rowGroups {
   331  		buffer := parquet.NewRowBuffer[Row](options...)
   332  		buffer.Write(rows[i][:])
   333  		rowGroups[i] = buffer
   334  	}
   335  
   336  	merge, err := parquet.MergeRowGroups(rowGroups, options...)
   337  	if err != nil {
   338  		b.Fatal(err)
   339  	}
   340  
   341  	b.ResetTimer()
   342  
   343  	for i := 0; i < b.N; i++ {
   344  		rows := merge.Rows()
   345  		_, err := parquet.CopyRows(discardRows{}, rows)
   346  		rows.Close()
   347  		if err != nil {
   348  			b.Fatal(err)
   349  		}
   350  	}
   351  }
   352  
   353  type discardRows struct{}
   354  
   355  func (discardRows) WriteRows(rows []parquet.Row) (int, error) {
   356  	return len(rows), nil
   357  }