github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/buffer_go18_test.go (about)

     1  //go:build go1.18
     2  
     3  package parquet_test
     4  
     5  import (
     6  	"encoding/binary"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"math/rand"
    11  	"reflect"
    12  	"sort"
    13  	"testing"
    14  
    15  	"github.com/segmentio/parquet-go"
    16  )
    17  
// TestGenericBuffer runs the testGenericBuffer round-trip check once for each
// row type listed below. The row types themselves are declared elsewhere in
// the test package.
//
// Each row type needs its own explicit instantiation, because type arguments
// cannot be looped over.
func TestGenericBuffer(t *testing.T) {
	testGenericBuffer[booleanColumn](t)
	testGenericBuffer[int32Column](t)
	testGenericBuffer[int64Column](t)
	testGenericBuffer[int96Column](t)
	testGenericBuffer[floatColumn](t)
	testGenericBuffer[doubleColumn](t)
	testGenericBuffer[byteArrayColumn](t)
	testGenericBuffer[fixedLenByteArrayColumn](t)
	testGenericBuffer[stringColumn](t)
	testGenericBuffer[indexedStringColumn](t)
	testGenericBuffer[uuidColumn](t)
	testGenericBuffer[timeColumn](t)
	testGenericBuffer[timeInMillisColumn](t)
	testGenericBuffer[mapColumn](t)
	testGenericBuffer[decimalColumn](t)
	testGenericBuffer[addressBook](t)
	testGenericBuffer[contact](t)
	testGenericBuffer[listColumn2](t)
	testGenericBuffer[listColumn1](t)
	testGenericBuffer[listColumn0](t)
	testGenericBuffer[nestedListColumn1](t)
	testGenericBuffer[nestedListColumn](t)
	// Pointer row type: exercises the pointer-to-struct code path.
	testGenericBuffer[*contact](t)
	testGenericBuffer[paddedBooleanColumn](t)
	testGenericBuffer[optionalInt32Column](t)
	testGenericBuffer[repeatedInt32Column](t)
}
    46  
    47  func testGenericBuffer[Row any](t *testing.T) {
    48  	var model Row
    49  	t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {
    50  		err := quickCheck(func(rows []Row) bool {
    51  			if len(rows) == 0 {
    52  				return true // TODO: fix support for parquet files with zero rows
    53  			}
    54  			if err := testGenericBufferRows(rows); err != nil {
    55  				t.Error(err)
    56  				return false
    57  			}
    58  			return true
    59  		})
    60  		if err != nil {
    61  			t.Error(err)
    62  		}
    63  	})
    64  }
    65  
    66  func testGenericBufferRows[Row any](rows []Row) error {
    67  	setNullPointers(rows)
    68  	buffer := parquet.NewGenericBuffer[Row]()
    69  	_, err := buffer.Write(rows)
    70  	if err != nil {
    71  		return err
    72  	}
    73  	reader := parquet.NewGenericRowGroupReader[Row](buffer)
    74  	result := make([]Row, len(rows))
    75  	n, err := reader.Read(result)
    76  	if err != nil && !errors.Is(err, io.EOF) {
    77  		return err
    78  	}
    79  	if n < len(rows) {
    80  		return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n)
    81  	}
    82  	if !reflect.DeepEqual(rows, result) {
    83  		return fmt.Errorf("rows mismatch:\nwant: %#v\ngot:  %#v", rows, result)
    84  	}
    85  	return nil
    86  }
    87  
    88  func setNullPointers[Row any](rows []Row) {
    89  	if len(rows) > 0 && reflect.TypeOf(rows[0]).Kind() == reflect.Pointer {
    90  		for i := range rows {
    91  			v := reflect.ValueOf(&rows[i]).Elem()
    92  			if v.IsNil() {
    93  				v.Set(reflect.New(v.Type().Elem()))
    94  			}
    95  		}
    96  	}
    97  }
    98  
// generator is the constraint used by the benchmarks below: a type T that can
// produce a value of its own type from a pseudo-random source.
type generator[T any] interface {
	// generate returns a T derived from the given random source.
	generate(*rand.Rand) T
}
   102  
// BenchmarkGenericBuffer runs benchmarkGenericBuffer once for each row type
// listed below. Every type used here must satisfy the generator constraint;
// the row types are declared elsewhere in the test package.
func BenchmarkGenericBuffer(b *testing.B) {
	benchmarkGenericBuffer[benchmarkRowType](b)
	benchmarkGenericBuffer[booleanColumn](b)
	benchmarkGenericBuffer[int32Column](b)
	benchmarkGenericBuffer[int64Column](b)
	benchmarkGenericBuffer[floatColumn](b)
	benchmarkGenericBuffer[doubleColumn](b)
	benchmarkGenericBuffer[byteArrayColumn](b)
	benchmarkGenericBuffer[fixedLenByteArrayColumn](b)
	benchmarkGenericBuffer[stringColumn](b)
	benchmarkGenericBuffer[indexedStringColumn](b)
	benchmarkGenericBuffer[uuidColumn](b)
	benchmarkGenericBuffer[timeColumn](b)
	benchmarkGenericBuffer[timeInMillisColumn](b)
	benchmarkGenericBuffer[mapColumn](b)
	benchmarkGenericBuffer[decimalColumn](b)
	benchmarkGenericBuffer[contact](b)
	benchmarkGenericBuffer[paddedBooleanColumn](b)
	benchmarkGenericBuffer[optionalInt32Column](b)
	benchmarkGenericBuffer[repeatedInt32Column](b)
}
   124  
   125  func benchmarkGenericBuffer[Row generator[Row]](b *testing.B) {
   126  	var model Row
   127  	b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {
   128  		prng := rand.New(rand.NewSource(0))
   129  		rows := make([]Row, benchmarkNumRows)
   130  		for i := range rows {
   131  			rows[i] = rows[i].generate(prng)
   132  		}
   133  
   134  		b.Run("go1.17", func(b *testing.B) {
   135  			buffer := parquet.NewBuffer(parquet.SchemaOf(rows[0]))
   136  			i := 0
   137  			benchmarkRowsPerSecond(b, func() int {
   138  				for j := 0; j < benchmarkRowsPerStep; j++ {
   139  					if err := buffer.Write(&rows[i]); err != nil {
   140  						b.Fatal(err)
   141  					}
   142  				}
   143  
   144  				i += benchmarkRowsPerStep
   145  				i %= benchmarkNumRows
   146  
   147  				if i == 0 {
   148  					buffer.Reset()
   149  				}
   150  				return benchmarkRowsPerStep
   151  			})
   152  		})
   153  
   154  		b.Run("go1.18", func(b *testing.B) {
   155  			buffer := parquet.NewGenericBuffer[Row]()
   156  			i := 0
   157  			benchmarkRowsPerSecond(b, func() int {
   158  				n, err := buffer.Write(rows[i : i+benchmarkRowsPerStep])
   159  				if err != nil {
   160  					b.Fatal(err)
   161  				}
   162  
   163  				i += benchmarkRowsPerStep
   164  				i %= benchmarkNumRows
   165  
   166  				if i == 0 {
   167  					buffer.Reset()
   168  				}
   169  				return n
   170  			})
   171  		})
   172  	})
   173  }
   174  
   175  func TestIssue327(t *testing.T) {
   176  	t.Run("untagged nested lists should panic", func(t *testing.T) {
   177  		type testType struct {
   178  			ListOfLists [][]int
   179  		}
   180  
   181  		defer func() {
   182  			if r := recover(); r == nil {
   183  				t.Errorf("Nested lists without the list tag should panic")
   184  			}
   185  		}()
   186  
   187  		_ = parquet.NewGenericBuffer[testType]()
   188  	})
   189  }
   190  
   191  func TestIssue346(t *testing.T) {
   192  	type TestType struct {
   193  		Key int
   194  	}
   195  
   196  	schema := parquet.SchemaOf(TestType{})
   197  	buffer := parquet.NewGenericBuffer[any](schema)
   198  
   199  	data := make([]any, 1)
   200  	data[0] = TestType{Key: 0}
   201  	_, _ = buffer.Write(data)
   202  }
   203  
   204  func TestIssue347(t *testing.T) {
   205  	type TestType struct {
   206  		Key int
   207  	}
   208  
   209  	// instantiating with concrete type shouldn't panic
   210  	_ = parquet.NewGenericBuffer[TestType]()
   211  
   212  	// instantiating with schema and interface type parameter shouldn't panic
   213  	schema := parquet.SchemaOf(TestType{})
   214  	_ = parquet.NewGenericBuffer[any](schema)
   215  
   216  	defer func() {
   217  		if r := recover(); r == nil {
   218  			t.Errorf("instantiating generic buffer without schema and with interface " +
   219  				"type parameter should panic")
   220  		}
   221  	}()
   222  	_ = parquet.NewGenericBuffer[any]()
   223  }
   224  
   225  func BenchmarkSortGenericBuffer(b *testing.B) {
   226  	type Row struct {
   227  		I0 int64
   228  		I1 int64
   229  		I2 int64
   230  		I3 int64
   231  		I4 int64
   232  		I5 int64
   233  		I6 int64
   234  		I7 int64
   235  		I8 int64
   236  		I9 int64
   237  		ID [16]byte
   238  	}
   239  
   240  	buf := parquet.NewGenericBuffer[Row](
   241  		parquet.SortingRowGroupConfig(
   242  			parquet.SortingColumns(
   243  				parquet.Ascending("ID"),
   244  			),
   245  		),
   246  	)
   247  
   248  	rows := make([]Row, 10e3)
   249  	prng := rand.New(rand.NewSource(0))
   250  
   251  	for i := range rows {
   252  		binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i))
   253  		binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i))
   254  	}
   255  
   256  	buf.Write(rows)
   257  	b.ResetTimer()
   258  
   259  	for i := 0; i < b.N; i++ {
   260  		for j := 0; j < 10; j++ {
   261  			buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows)))
   262  		}
   263  
   264  		sort.Sort(buf)
   265  	}
   266  }