github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/parquet_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"math/rand"
     8  	"reflect"
     9  	"strings"
    10  	"testing"
    11  	"time"
    12  
    13  	"github.com/google/uuid"
    14  	"github.com/vc42/parquet-go"
    15  	"github.com/vc42/parquet-go/deprecated"
    16  	"github.com/vc42/parquet-go/internal/quick"
    17  )
    18  
    19  const (
    20  	benchmarkNumRows     = 10_000
    21  	benchmarkRowsPerStep = 1000
    22  )
    23  
    24  type benchmarkRowType struct {
    25  	ID    [16]byte `parquet:"id,uuid"`
    26  	Value float64  `parquet:"value"`
    27  }
    28  
    29  func (row benchmarkRowType) generate(prng *rand.Rand) benchmarkRowType {
    30  	prng.Read(row.ID[:])
    31  	row.Value = prng.Float64()
    32  	return row
    33  }
    34  
    35  type paddedBooleanColumn struct {
    36  	Value bool
    37  	_     [3]byte
    38  }
    39  
    40  func (row paddedBooleanColumn) generate(prng *rand.Rand) paddedBooleanColumn {
    41  	return paddedBooleanColumn{Value: prng.Int()%2 == 0}
    42  }
    43  
    44  type booleanColumn struct {
    45  	Value bool
    46  }
    47  
    48  func (row booleanColumn) generate(prng *rand.Rand) booleanColumn {
    49  	return booleanColumn{Value: prng.Int()%2 == 0}
    50  }
    51  
    52  type int32Column struct {
    53  	Value int32 `parquet:",delta"`
    54  }
    55  
    56  func (row int32Column) generate(prng *rand.Rand) int32Column {
    57  	return int32Column{Value: prng.Int31n(100)}
    58  }
    59  
    60  type int64Column struct {
    61  	Value int64 `parquet:",delta"`
    62  }
    63  
    64  func (row int64Column) generate(prng *rand.Rand) int64Column {
    65  	return int64Column{Value: prng.Int63n(100)}
    66  }
    67  
    68  type int96Column struct {
    69  	Value deprecated.Int96
    70  }
    71  
    72  func (row int96Column) generate(prng *rand.Rand) int96Column {
    73  	row.Value[0] = prng.Uint32()
    74  	row.Value[1] = prng.Uint32()
    75  	row.Value[2] = prng.Uint32()
    76  	return row
    77  }
    78  
    79  type floatColumn struct {
    80  	Value float32
    81  }
    82  
    83  func (row floatColumn) generate(prng *rand.Rand) floatColumn {
    84  	return floatColumn{Value: prng.Float32()}
    85  }
    86  
    87  type doubleColumn struct {
    88  	Value float64
    89  }
    90  
    91  func (row doubleColumn) generate(prng *rand.Rand) doubleColumn {
    92  	return doubleColumn{Value: prng.Float64()}
    93  }
    94  
    95  type byteArrayColumn struct {
    96  	Value []byte
    97  }
    98  
    99  func (row byteArrayColumn) generate(prng *rand.Rand) byteArrayColumn {
   100  	row.Value = make([]byte, prng.Intn(10))
   101  	prng.Read(row.Value)
   102  	return row
   103  }
   104  
   105  type fixedLenByteArrayColumn struct {
   106  	Value [10]byte
   107  }
   108  
   109  func (row fixedLenByteArrayColumn) generate(prng *rand.Rand) fixedLenByteArrayColumn {
   110  	prng.Read(row.Value[:])
   111  	return row
   112  }
   113  
   114  type stringColumn struct {
   115  	Value string
   116  }
   117  
   118  func (row stringColumn) generate(prng *rand.Rand) stringColumn {
   119  	return stringColumn{Value: generateString(prng, 10)}
   120  }
   121  
   122  type indexedStringColumn struct {
   123  	Value string `parquet:",dict"`
   124  }
   125  
   126  func (row indexedStringColumn) generate(prng *rand.Rand) indexedStringColumn {
   127  	return indexedStringColumn{Value: generateString(prng, 10)}
   128  }
   129  
   130  type uuidColumn struct {
   131  	Value uuid.UUID `parquet:",delta"`
   132  }
   133  
   134  func (row uuidColumn) generate(prng *rand.Rand) uuidColumn {
   135  	prng.Read(row.Value[:])
   136  	return row
   137  }
   138  
   139  type decimalColumn struct {
   140  	Value int64 `parquet:",decimal(0:3)"`
   141  }
   142  
   143  func (row decimalColumn) generate(prng *rand.Rand) decimalColumn {
   144  	return decimalColumn{Value: prng.Int63()}
   145  }
   146  
   147  type mapColumn struct {
   148  	Value map[utf8string]int
   149  }
   150  
   151  func (row mapColumn) generate(prng *rand.Rand) mapColumn {
   152  	n := prng.Intn(10)
   153  	row.Value = make(map[utf8string]int, n)
   154  	for i := 0; i < n; i++ {
   155  		row.Value[utf8string(generateString(prng, 8))] = prng.Intn(100)
   156  	}
   157  	return row
   158  }
   159  
   160  type addressBook struct {
   161  	Owner             utf8string   `parquet:",plain"`
   162  	OwnerPhoneNumbers []utf8string `parquet:",plain"`
   163  	Contacts          []contact
   164  }
   165  
   166  type contact struct {
   167  	Name        utf8string `parquet:",plain"`
   168  	PhoneNumber utf8string `parquet:",plain"`
   169  }
   170  
   171  func (row contact) generate(prng *rand.Rand) contact {
   172  	return contact{
   173  		Name:        utf8string(generateString(prng, 16)),
   174  		PhoneNumber: utf8string(generateString(prng, 10)),
   175  	}
   176  }
   177  
   178  type optionalInt32Column struct {
   179  	Value int32 `parquet:",optional"`
   180  }
   181  
   182  func (row optionalInt32Column) generate(prng *rand.Rand) optionalInt32Column {
   183  	return optionalInt32Column{Value: prng.Int31n(100)}
   184  }
   185  
   186  type repeatedInt32Column struct {
   187  	Values []int32
   188  }
   189  
   190  func (row repeatedInt32Column) generate(prng *rand.Rand) repeatedInt32Column {
   191  	row.Values = make([]int32, prng.Intn(10))
   192  	for i := range row.Values {
   193  		row.Values[i] = prng.Int31n(10)
   194  	}
   195  	return row
   196  }
   197  
   198  type listColumn2 struct {
   199  	Value utf8string `parquet:",optional"`
   200  }
   201  
   202  type listColumn1 struct {
   203  	List2 []listColumn2 `parquet:",list"`
   204  }
   205  
   206  type listColumn0 struct {
   207  	List1 []listColumn1 `parquet:",list"`
   208  }
   209  
   210  type nestedListColumn1 struct {
   211  	Level3 []utf8string `parquet:"level3"`
   212  }
   213  
   214  type nestedListColumn struct {
   215  	Level1 []nestedListColumn1 `parquet:"level1"`
   216  	Level2 []utf8string        `parquet:"level2"`
   217  }
   218  
   219  type utf8string string
   220  
   221  func (utf8string) Generate(rand *rand.Rand, size int) reflect.Value {
   222  	const characters = "abcdefghijklmnopqrstuvwxyz1234567890"
   223  	const maxSize = 10
   224  	if size > maxSize {
   225  		size = maxSize
   226  	}
   227  	n := rand.Intn(size)
   228  	b := make([]byte, n)
   229  	for i := range b {
   230  		b[i] = characters[rand.Intn(len(characters))]
   231  	}
   232  	return reflect.ValueOf(utf8string(b))
   233  }
   234  
   235  type Contact struct {
   236  	Name        string `parquet:"name"`
   237  	PhoneNumber string `parquet:"phoneNumber,optional,zstd"`
   238  }
   239  
   240  type AddressBook struct {
   241  	Owner             string    `parquet:"owner,zstd"`
   242  	OwnerPhoneNumbers []string  `parquet:"ownerPhoneNumbers,gzip"`
   243  	Contacts          []Contact `parquet:"contacts"`
   244  }
   245  
   246  func forEachLeafColumn(col *parquet.Column, do func(*parquet.Column) error) error {
   247  	children := col.Columns()
   248  
   249  	if len(children) == 0 {
   250  		return do(col)
   251  	}
   252  
   253  	for _, child := range children {
   254  		if err := forEachLeafColumn(child, do); err != nil {
   255  			return err
   256  		}
   257  	}
   258  
   259  	return nil
   260  }
   261  
   262  func forEachPage(pages parquet.PageReader, do func(parquet.Page) error) error {
   263  	for {
   264  		p, err := pages.ReadPage()
   265  		if err != nil {
   266  			if err == io.EOF {
   267  				err = nil
   268  			}
   269  			return err
   270  		}
   271  		if err := do(p); err != nil {
   272  			return err
   273  		}
   274  	}
   275  }
   276  
   277  func forEachValue(values parquet.ValueReader, do func(parquet.Value) error) error {
   278  	buffer := [3]parquet.Value{}
   279  	for {
   280  		n, err := values.ReadValues(buffer[:])
   281  		for _, v := range buffer[:n] {
   282  			if err := do(v); err != nil {
   283  				return err
   284  			}
   285  		}
   286  		if err != nil {
   287  			if err == io.EOF {
   288  				err = nil
   289  			}
   290  			return err
   291  		}
   292  	}
   293  }
   294  
   295  func forEachColumnPage(col *parquet.Column, do func(*parquet.Column, parquet.Page) error) error {
   296  	return forEachLeafColumn(col, func(leaf *parquet.Column) error {
   297  		pages := leaf.Pages()
   298  		defer pages.Close()
   299  		return forEachPage(pages, func(page parquet.Page) error { return do(leaf, page) })
   300  	})
   301  }
   302  
   303  func forEachColumnValue(col *parquet.Column, do func(*parquet.Column, parquet.Value) error) error {
   304  	return forEachColumnPage(col, func(leaf *parquet.Column, page parquet.Page) error {
   305  		return forEachValue(page.Values(), func(value parquet.Value) error { return do(leaf, value) })
   306  	})
   307  }
   308  
   309  func forEachColumnChunk(file *parquet.File, do func(*parquet.Column, parquet.ColumnChunk) error) error {
   310  	return forEachLeafColumn(file.Root(), func(leaf *parquet.Column) error {
   311  		for _, rowGroup := range file.RowGroups() {
   312  			if err := do(leaf, rowGroup.ColumnChunks()[leaf.Index()]); err != nil {
   313  				return err
   314  			}
   315  		}
   316  		return nil
   317  	})
   318  }
   319  
   320  func createParquetFile(rows rows, options ...parquet.WriterOption) (*parquet.File, error) {
   321  	buffer := new(bytes.Buffer)
   322  
   323  	if err := writeParquetFile(buffer, rows, options...); err != nil {
   324  		return nil, err
   325  	}
   326  
   327  	reader := bytes.NewReader(buffer.Bytes())
   328  	return parquet.OpenFile(reader, reader.Size())
   329  }
   330  
   331  func writeParquetFile(w io.Writer, rows rows, options ...parquet.WriterOption) error {
   332  	writer := parquet.NewWriter(w, options...)
   333  
   334  	for _, row := range rows {
   335  		if err := writer.Write(row); err != nil {
   336  			return err
   337  		}
   338  	}
   339  
   340  	return writer.Close()
   341  }
   342  
   343  func writeParquetFileWithBuffer(w io.Writer, rows rows, options ...parquet.WriterOption) error {
   344  	buffer := parquet.NewBuffer()
   345  	for _, row := range rows {
   346  		if err := buffer.Write(row); err != nil {
   347  			return err
   348  		}
   349  	}
   350  
   351  	writer := parquet.NewWriter(w, options...)
   352  	numRows, err := copyRowsAndClose(writer, buffer.Rows())
   353  	if err != nil {
   354  		return err
   355  	}
   356  	if numRows != int64(len(rows)) {
   357  		return fmt.Errorf("wrong number of rows written from buffer to file: want=%d got=%d", len(rows), numRows)
   358  	}
   359  	return writer.Close()
   360  }
   361  
   362  type rows []interface{}
   363  
   364  func makeRows(any interface{}) rows {
   365  	if v, ok := any.([]interface{}); ok {
   366  		return rows(v)
   367  	}
   368  	value := reflect.ValueOf(any)
   369  	slice := make([]interface{}, value.Len())
   370  	for i := range slice {
   371  		slice[i] = value.Index(i).Interface()
   372  	}
   373  	return rows(slice)
   374  }
   375  
   376  func randValueFuncOf(t parquet.Type) func(*rand.Rand) parquet.Value {
   377  	switch k := t.Kind(); k {
   378  	case parquet.Boolean:
   379  		return func(r *rand.Rand) parquet.Value {
   380  			return parquet.ValueOf(r.Float64() < 0.5)
   381  		}
   382  
   383  	case parquet.Int32:
   384  		return func(r *rand.Rand) parquet.Value {
   385  			return parquet.ValueOf(r.Int31())
   386  		}
   387  
   388  	case parquet.Int64:
   389  		return func(r *rand.Rand) parquet.Value {
   390  			return parquet.ValueOf(r.Int63())
   391  		}
   392  
   393  	case parquet.Int96:
   394  		return func(r *rand.Rand) parquet.Value {
   395  			return parquet.ValueOf(deprecated.Int96{
   396  				0: r.Uint32(),
   397  				1: r.Uint32(),
   398  				2: r.Uint32(),
   399  			})
   400  		}
   401  
   402  	case parquet.Float:
   403  		return func(r *rand.Rand) parquet.Value {
   404  			return parquet.ValueOf(r.Float32())
   405  		}
   406  
   407  	case parquet.Double:
   408  		return func(r *rand.Rand) parquet.Value {
   409  			return parquet.ValueOf(r.Float64())
   410  		}
   411  
   412  	case parquet.ByteArray:
   413  		return func(r *rand.Rand) parquet.Value {
   414  			n := r.Intn(49) + 1
   415  			b := make([]byte, n)
   416  			const characters = "1234567890qwertyuiopasdfghjklzxcvbnm "
   417  			for i := range b {
   418  				b[i] = characters[r.Intn(len(characters))]
   419  			}
   420  			return parquet.ValueOf(b)
   421  		}
   422  
   423  	case parquet.FixedLenByteArray:
   424  		arrayType := reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0)))
   425  		return func(r *rand.Rand) parquet.Value {
   426  			b := make([]byte, arrayType.Len())
   427  			r.Read(b)
   428  			v := reflect.New(arrayType).Elem()
   429  			reflect.Copy(v, reflect.ValueOf(b))
   430  			return parquet.ValueOf(v.Interface())
   431  		}
   432  
   433  	default:
   434  		panic("NOT IMPLEMENTED")
   435  	}
   436  }
   437  
   438  func copyRowsAndClose(w parquet.RowWriter, r parquet.Rows) (int64, error) {
   439  	defer r.Close()
   440  	return parquet.CopyRows(w, r)
   441  }
   442  
   443  func benchmarkRowsPerSecond(b *testing.B, f func() int) {
   444  	b.ResetTimer()
   445  	start := time.Now()
   446  	numRows := int64(0)
   447  
   448  	for i := 0; i < b.N; i++ {
   449  		n := f()
   450  		numRows += int64(n)
   451  	}
   452  
   453  	seconds := time.Since(start).Seconds()
   454  	b.ReportMetric(float64(numRows)/seconds, "row/s")
   455  }
   456  
   457  func generateString(r *rand.Rand, n int) string {
   458  	const characters = "1234567890qwertyuiopasdfghjklzxcvbnm"
   459  	b := new(strings.Builder)
   460  	for i := 0; i < n; i++ {
   461  		b.WriteByte(characters[r.Intn(len(characters))])
   462  	}
   463  	return b.String()
   464  }
   465  
   466  var quickCheckConfig = quick.Config{
   467  	Sizes: []int{
   468  		0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
   469  		10, 20, 30, 40, 50, 123,
   470  	},
   471  }
   472  
   473  func quickCheck(f interface{}) error {
   474  	return quickCheckConfig.Check(f)
   475  }