github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/parquet_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"math/rand"
     8  	"reflect"
     9  	"strings"
    10  	"testing"
    11  	"time"
    12  
    13  	"github.com/google/uuid"
    14  
    15  	"github.com/segmentio/parquet-go"
    16  	"github.com/segmentio/parquet-go/deprecated"
    17  	"github.com/segmentio/parquet-go/internal/quick"
    18  )
    19  
// Sizing constants for benchmarks. Neither is referenced in this part of the
// file.
// NOTE(review): presumably benchmarkNumRows is the total number of rows a
// benchmark generates and benchmarkRowsPerStep the batch size per read/write
// call — confirm against the benchmarks that use them.
const (
	benchmarkNumRows     = 10_000
	benchmarkRowsPerStep = 1000
)
    24  
    25  type benchmarkRowType struct {
    26  	ID    [16]byte `parquet:"id,uuid"`
    27  	Value float64  `parquet:"value"`
    28  }
    29  
    30  func (row benchmarkRowType) generate(prng *rand.Rand) benchmarkRowType {
    31  	prng.Read(row.ID[:])
    32  	row.Value = prng.Float64()
    33  	return row
    34  }
    35  
    36  type paddedBooleanColumn struct {
    37  	Value bool
    38  	_     [3]byte
    39  }
    40  
    41  func (row paddedBooleanColumn) generate(prng *rand.Rand) paddedBooleanColumn {
    42  	return paddedBooleanColumn{Value: prng.Int()%2 == 0}
    43  }
    44  
    45  type booleanColumn struct {
    46  	Value bool
    47  }
    48  
    49  func (row booleanColumn) generate(prng *rand.Rand) booleanColumn {
    50  	return booleanColumn{Value: prng.Int()%2 == 0}
    51  }
    52  
    53  type int32Column struct {
    54  	Value int32 `parquet:",delta"`
    55  }
    56  
    57  func (row int32Column) generate(prng *rand.Rand) int32Column {
    58  	return int32Column{Value: prng.Int31n(100)}
    59  }
    60  
    61  type int64Column struct {
    62  	Value int64 `parquet:",delta"`
    63  }
    64  
    65  func (row int64Column) generate(prng *rand.Rand) int64Column {
    66  	return int64Column{Value: prng.Int63n(100)}
    67  }
    68  
    69  type int96Column struct {
    70  	Value deprecated.Int96
    71  }
    72  
    73  func (row int96Column) generate(prng *rand.Rand) int96Column {
    74  	row.Value[0] = prng.Uint32()
    75  	row.Value[1] = prng.Uint32()
    76  	row.Value[2] = prng.Uint32()
    77  	return row
    78  }
    79  
    80  type floatColumn struct {
    81  	Value float32
    82  }
    83  
    84  func (row floatColumn) generate(prng *rand.Rand) floatColumn {
    85  	return floatColumn{Value: prng.Float32()}
    86  }
    87  
    88  type doubleColumn struct {
    89  	Value float64
    90  }
    91  
    92  func (row doubleColumn) generate(prng *rand.Rand) doubleColumn {
    93  	return doubleColumn{Value: prng.Float64()}
    94  }
    95  
    96  type byteArrayColumn struct {
    97  	Value []byte
    98  }
    99  
   100  func (row byteArrayColumn) generate(prng *rand.Rand) byteArrayColumn {
   101  	row.Value = make([]byte, prng.Intn(10))
   102  	prng.Read(row.Value)
   103  	return row
   104  }
   105  
   106  type fixedLenByteArrayColumn struct {
   107  	Value [10]byte
   108  }
   109  
   110  func (row fixedLenByteArrayColumn) generate(prng *rand.Rand) fixedLenByteArrayColumn {
   111  	prng.Read(row.Value[:])
   112  	return row
   113  }
   114  
   115  type stringColumn struct {
   116  	Value string
   117  }
   118  
   119  func (row stringColumn) generate(prng *rand.Rand) stringColumn {
   120  	return stringColumn{Value: generateString(prng, 10)}
   121  }
   122  
   123  type indexedStringColumn struct {
   124  	Value string `parquet:",dict"`
   125  }
   126  
   127  func (row indexedStringColumn) generate(prng *rand.Rand) indexedStringColumn {
   128  	return indexedStringColumn{Value: generateString(prng, 10)}
   129  }
   130  
   131  type uuidColumn struct {
   132  	Value uuid.UUID `parquet:",delta"`
   133  }
   134  
   135  func (row uuidColumn) generate(prng *rand.Rand) uuidColumn {
   136  	prng.Read(row.Value[:])
   137  	return row
   138  }
   139  
   140  type timeColumn struct {
   141  	Value time.Time
   142  }
   143  
   144  func (row timeColumn) generate(prng *rand.Rand) timeColumn {
   145  	t := time.Unix(0, prng.Int63()).UTC()
   146  	return timeColumn{Value: t}
   147  }
   148  
   149  type timeInMillisColumn struct {
   150  	Value time.Time `parquet:",timestamp(millisecond)"`
   151  }
   152  
   153  func (row timeInMillisColumn) generate(prng *rand.Rand) timeInMillisColumn {
   154  	t := time.Unix(0, prng.Int63()).UTC()
   155  	return timeInMillisColumn{Value: t}
   156  }
   157  
   158  type decimalColumn struct {
   159  	Value int64 `parquet:",decimal(0:3)"`
   160  }
   161  
   162  func (row decimalColumn) generate(prng *rand.Rand) decimalColumn {
   163  	return decimalColumn{Value: prng.Int63()}
   164  }
   165  
   166  type mapColumn struct {
   167  	Value map[utf8string]int
   168  }
   169  
   170  func (row mapColumn) generate(prng *rand.Rand) mapColumn {
   171  	n := prng.Intn(10)
   172  	row.Value = make(map[utf8string]int, n)
   173  	for i := 0; i < n; i++ {
   174  		row.Value[utf8string(generateString(prng, 8))] = prng.Intn(100)
   175  	}
   176  	return row
   177  }
   178  
   179  type addressBook struct {
   180  	Owner             utf8string   `parquet:",plain"`
   181  	OwnerPhoneNumbers []utf8string `parquet:",plain"`
   182  	Contacts          []contact
   183  }
   184  
   185  type contact struct {
   186  	Name        utf8string `parquet:",plain"`
   187  	PhoneNumber utf8string `parquet:",plain"`
   188  }
   189  
   190  func (row contact) generate(prng *rand.Rand) contact {
   191  	return contact{
   192  		Name:        utf8string(generateString(prng, 16)),
   193  		PhoneNumber: utf8string(generateString(prng, 10)),
   194  	}
   195  }
   196  
   197  type optionalInt32Column struct {
   198  	Value int32 `parquet:",optional"`
   199  }
   200  
   201  func (row optionalInt32Column) generate(prng *rand.Rand) optionalInt32Column {
   202  	return optionalInt32Column{Value: prng.Int31n(100)}
   203  }
   204  
   205  type repeatedInt32Column struct {
   206  	Values []int32
   207  }
   208  
   209  func (row repeatedInt32Column) generate(prng *rand.Rand) repeatedInt32Column {
   210  	row.Values = make([]int32, prng.Intn(10))
   211  	for i := range row.Values {
   212  		row.Values[i] = prng.Int31n(10)
   213  	}
   214  	return row
   215  }
   216  
// listColumn2 is the innermost element of the nested list types below: a
// single utf8string field tagged "optional".
type listColumn2 struct {
	Value utf8string `parquet:",optional"`
}

// listColumn1 is the middle level: a slice of listColumn2 tagged "list".
type listColumn1 struct {
	List2 []listColumn2 `parquet:",list"`
}

// listColumn0 is the outermost level: a slice of listColumn1 tagged "list",
// giving a list of lists of optional strings.
type listColumn0 struct {
	List1 []listColumn1 `parquet:",list"`
}
   228  
// nestedListColumn1 is the element type of nestedListColumn.Level1; it holds
// a slice of strings in a column named "level3".
type nestedListColumn1 struct {
	Level3 []utf8string `parquet:"level3"`
}

// nestedListColumn has two sibling repeated fields: "level1", a slice of
// nestedListColumn1 (each with its own nested slice), and "level2", a plain
// slice of strings.
type nestedListColumn struct {
	Level1 []nestedListColumn1 `parquet:"level1"`
	Level2 []utf8string        `parquet:"level2"`
}
   237  
   238  type utf8string string
   239  
   240  func (utf8string) Generate(rand *rand.Rand, size int) reflect.Value {
   241  	const characters = "abcdefghijklmnopqrstuvwxyz1234567890"
   242  	const maxSize = 10
   243  	if size > maxSize {
   244  		size = maxSize
   245  	}
   246  	n := rand.Intn(size)
   247  	b := make([]byte, n)
   248  	for i := range b {
   249  		b[i] = characters[rand.Intn(len(characters))]
   250  	}
   251  	return reflect.ValueOf(utf8string(b))
   252  }
   253  
// Contact is one entry of AddressBook.Contacts. Its columns are named "name"
// and "phoneNumber"; the latter is tagged optional with the "zstd" option.
type Contact struct {
	Name        string `parquet:"name"`
	PhoneNumber string `parquet:"phoneNumber,optional,zstd"`
}

// AddressBook is a nested row type with explicit parquet column names: an
// owner (tagged "zstd"), a repeated list of the owner's phone numbers (tagged
// "gzip"), and a repeated list of contacts.
type AddressBook struct {
	Owner             string    `parquet:"owner,zstd"`
	OwnerPhoneNumbers []string  `parquet:"ownerPhoneNumbers,gzip"`
	Contacts          []Contact `parquet:"contacts"`
}
   264  
   265  func forEachLeafColumn(col *parquet.Column, do func(*parquet.Column) error) error {
   266  	children := col.Columns()
   267  
   268  	if len(children) == 0 {
   269  		return do(col)
   270  	}
   271  
   272  	for _, child := range children {
   273  		if err := forEachLeafColumn(child, do); err != nil {
   274  			return err
   275  		}
   276  	}
   277  
   278  	return nil
   279  }
   280  
   281  func forEachPage(pages parquet.PageReader, do func(parquet.Page) error) error {
   282  	doAndReleasePage := func(page parquet.Page) error {
   283  		defer parquet.Release(page)
   284  		return do(page)
   285  	}
   286  
   287  	for {
   288  		p, err := pages.ReadPage()
   289  		if err != nil {
   290  			if err == io.EOF {
   291  				err = nil
   292  			}
   293  			return err
   294  		}
   295  		if err := doAndReleasePage(p); err != nil {
   296  			return err
   297  		}
   298  	}
   299  }
   300  
   301  func forEachValue(values parquet.ValueReader, do func(parquet.Value) error) error {
   302  	buffer := [3]parquet.Value{}
   303  	for {
   304  		n, err := values.ReadValues(buffer[:])
   305  		for _, v := range buffer[:n] {
   306  			if err := do(v); err != nil {
   307  				return err
   308  			}
   309  		}
   310  		if err != nil {
   311  			if err == io.EOF {
   312  				err = nil
   313  			}
   314  			return err
   315  		}
   316  	}
   317  }
   318  
   319  func forEachColumnPage(col *parquet.Column, do func(*parquet.Column, parquet.Page) error) error {
   320  	return forEachLeafColumn(col, func(leaf *parquet.Column) error {
   321  		pages := leaf.Pages()
   322  		defer pages.Close()
   323  		return forEachPage(pages, func(page parquet.Page) error { return do(leaf, page) })
   324  	})
   325  }
   326  
   327  func forEachColumnValue(col *parquet.Column, do func(*parquet.Column, parquet.Value) error) error {
   328  	return forEachColumnPage(col, func(leaf *parquet.Column, page parquet.Page) error {
   329  		return forEachValue(page.Values(), func(value parquet.Value) error { return do(leaf, value) })
   330  	})
   331  }
   332  
   333  func forEachColumnChunk(file *parquet.File, do func(*parquet.Column, parquet.ColumnChunk) error) error {
   334  	return forEachLeafColumn(file.Root(), func(leaf *parquet.Column) error {
   335  		for _, rowGroup := range file.RowGroups() {
   336  			if err := do(leaf, rowGroup.ColumnChunks()[leaf.Index()]); err != nil {
   337  				return err
   338  			}
   339  		}
   340  		return nil
   341  	})
   342  }
   343  
   344  func createParquetFile(rows rows, options ...parquet.WriterOption) (*parquet.File, error) {
   345  	buffer := new(bytes.Buffer)
   346  
   347  	if err := writeParquetFile(buffer, rows, options...); err != nil {
   348  		return nil, err
   349  	}
   350  
   351  	reader := bytes.NewReader(buffer.Bytes())
   352  	return parquet.OpenFile(reader, reader.Size())
   353  }
   354  
   355  func writeParquetFile(w io.Writer, rows rows, options ...parquet.WriterOption) error {
   356  	writer := parquet.NewWriter(w, options...)
   357  
   358  	for _, row := range rows {
   359  		if err := writer.Write(row); err != nil {
   360  			return err
   361  		}
   362  	}
   363  
   364  	return writer.Close()
   365  }
   366  
   367  func writeParquetFileWithBuffer(w io.Writer, rows rows, options ...parquet.WriterOption) error {
   368  	buffer := parquet.NewBuffer()
   369  	for _, row := range rows {
   370  		if err := buffer.Write(row); err != nil {
   371  			return err
   372  		}
   373  	}
   374  
   375  	writer := parquet.NewWriter(w, options...)
   376  	numRows, err := copyRowsAndClose(writer, buffer.Rows())
   377  	if err != nil {
   378  		return err
   379  	}
   380  	if numRows != int64(len(rows)) {
   381  		return fmt.Errorf("wrong number of rows written from buffer to file: want=%d got=%d", len(rows), numRows)
   382  	}
   383  	return writer.Close()
   384  }
   385  
   386  type rows []interface{}
   387  
   388  func makeRows(any interface{}) rows {
   389  	if v, ok := any.([]interface{}); ok {
   390  		return rows(v)
   391  	}
   392  	value := reflect.ValueOf(any)
   393  	slice := make([]interface{}, value.Len())
   394  	for i := range slice {
   395  		slice[i] = value.Index(i).Interface()
   396  	}
   397  	return rows(slice)
   398  }
   399  
   400  func randValueFuncOf(t parquet.Type) func(*rand.Rand) parquet.Value {
   401  	switch k := t.Kind(); k {
   402  	case parquet.Boolean:
   403  		return func(r *rand.Rand) parquet.Value {
   404  			return parquet.ValueOf(r.Float64() < 0.5)
   405  		}
   406  
   407  	case parquet.Int32:
   408  		return func(r *rand.Rand) parquet.Value {
   409  			return parquet.ValueOf(r.Int31())
   410  		}
   411  
   412  	case parquet.Int64:
   413  		return func(r *rand.Rand) parquet.Value {
   414  			return parquet.ValueOf(r.Int63())
   415  		}
   416  
   417  	case parquet.Int96:
   418  		return func(r *rand.Rand) parquet.Value {
   419  			return parquet.ValueOf(deprecated.Int96{
   420  				0: r.Uint32(),
   421  				1: r.Uint32(),
   422  				2: r.Uint32(),
   423  			})
   424  		}
   425  
   426  	case parquet.Float:
   427  		return func(r *rand.Rand) parquet.Value {
   428  			return parquet.ValueOf(r.Float32())
   429  		}
   430  
   431  	case parquet.Double:
   432  		return func(r *rand.Rand) parquet.Value {
   433  			return parquet.ValueOf(r.Float64())
   434  		}
   435  
   436  	case parquet.ByteArray:
   437  		return func(r *rand.Rand) parquet.Value {
   438  			n := r.Intn(49) + 1
   439  			b := make([]byte, n)
   440  			const characters = "1234567890qwertyuiopasdfghjklzxcvbnm "
   441  			for i := range b {
   442  				b[i] = characters[r.Intn(len(characters))]
   443  			}
   444  			return parquet.ValueOf(b)
   445  		}
   446  
   447  	case parquet.FixedLenByteArray:
   448  		arrayType := reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0)))
   449  		return func(r *rand.Rand) parquet.Value {
   450  			b := make([]byte, arrayType.Len())
   451  			r.Read(b)
   452  			v := reflect.New(arrayType).Elem()
   453  			reflect.Copy(v, reflect.ValueOf(b))
   454  			return parquet.ValueOf(v.Interface())
   455  		}
   456  
   457  	default:
   458  		panic("NOT IMPLEMENTED")
   459  	}
   460  }
   461  
   462  func copyRowsAndClose(w parquet.RowWriter, r parquet.Rows) (int64, error) {
   463  	defer r.Close()
   464  	return parquet.CopyRows(w, r)
   465  }
   466  
   467  func benchmarkRowsPerSecond(b *testing.B, f func() int) {
   468  	b.ResetTimer()
   469  	start := time.Now()
   470  	numRows := int64(0)
   471  
   472  	for i := 0; i < b.N; i++ {
   473  		n := f()
   474  		numRows += int64(n)
   475  	}
   476  
   477  	seconds := time.Since(start).Seconds()
   478  	b.ReportMetric(float64(numRows)/seconds, "row/s")
   479  }
   480  
   481  func generateString(r *rand.Rand, n int) string {
   482  	const characters = "1234567890qwertyuiopasdfghjklzxcvbnm"
   483  	b := new(strings.Builder)
   484  	for i := 0; i < n; i++ {
   485  		b.WriteByte(characters[r.Intn(len(characters))])
   486  	}
   487  	return b.String()
   488  }
   489  
// quickCheckConfig is the quick.Config shared by quickCheck. Sizes lists the
// input sizes to exercise: every value from 0 through 9, a few larger round
// numbers, 123, and 4096 + 1.
// NOTE(review): presumably each size is the number of generated values passed
// to the checked function in one run, and 4096 + 1 is meant to step just past
// a 4096-sized boundary — confirm against internal/quick.
var quickCheckConfig = quick.Config{
	Sizes: []int{
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
		10, 20, 30, 40, 50, 123,
		4096 + 1,
	},
}

// quickCheck runs quickCheckConfig.Check on f and returns its error.
func quickCheck(f interface{}) error {
	return quickCheckConfig.Check(f)
}