github.com/apache/arrow/go/v14@v14.0.2/parquet/pqarrow/reader_writer_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow_test
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"testing"
    23  	"unsafe"
    24  
    25  	"github.com/apache/arrow/go/v14/arrow"
    26  	"github.com/apache/arrow/go/v14/arrow/array"
    27  	"github.com/apache/arrow/go/v14/arrow/memory"
    28  	"github.com/apache/arrow/go/v14/parquet"
    29  	"github.com/apache/arrow/go/v14/parquet/file"
    30  	"github.com/apache/arrow/go/v14/parquet/pqarrow"
    31  	"golang.org/x/exp/rand"
    32  	"gonum.org/v1/gonum/stat/distuv"
    33  )
    34  
    35  const alternateOrNA = -1
    36  const SIZELEN = 1024 * 1024
    37  
    38  func randomUint8(size, truePct int, sampleVals [2]uint8, seed uint64) []uint8 {
    39  	ret := make([]uint8, size)
    40  	if truePct == alternateOrNA {
    41  		for idx := range ret {
    42  			ret[idx] = uint8(idx % 2)
    43  		}
    44  		return ret
    45  	}
    46  
    47  	dist := distuv.Bernoulli{
    48  		P:   float64(truePct) / 100.0,
    49  		Src: rand.NewSource(seed),
    50  	}
    51  
    52  	for idx := range ret {
    53  		ret[idx] = sampleVals[int(dist.Rand())]
    54  	}
    55  	return ret
    56  }
    57  
    58  func randomInt32(size, truePct int, sampleVals [2]int32, seed uint64) []int32 {
    59  	ret := make([]int32, size)
    60  	if truePct == alternateOrNA {
    61  		for idx := range ret {
    62  			ret[idx] = int32(idx % 2)
    63  		}
    64  		return ret
    65  	}
    66  
    67  	dist := distuv.Bernoulli{
    68  		P:   float64(truePct) / 100.0,
    69  		Src: rand.NewSource(seed),
    70  	}
    71  
    72  	for idx := range ret {
    73  		ret[idx] = sampleVals[int(dist.Rand())]
    74  	}
    75  	return ret
    76  }
    77  
    78  func tableFromVec(dt arrow.DataType, size int, data interface{}, nullable bool, nullPct int) arrow.Table {
    79  	if !nullable && nullPct != alternateOrNA {
    80  		panic("bad check")
    81  	}
    82  
    83  	var valid []bool
    84  	if nullable {
    85  		// true values select index 1 of sample values
    86  		validBytes := randomUint8(size, nullPct, [2]uint8{1, 0}, 500)
    87  		valid = *(*[]bool)(unsafe.Pointer(&validBytes))
    88  	}
    89  
    90  	bldr := array.NewBuilder(memory.DefaultAllocator, dt)
    91  	defer bldr.Release()
    92  
    93  	switch v := data.(type) {
    94  	case []int32:
    95  		bldr.(*array.Int32Builder).AppendValues(v, valid)
    96  	case []int64:
    97  		bldr.(*array.Int64Builder).AppendValues(v, valid)
    98  	case []float32:
    99  		bldr.(*array.Float32Builder).AppendValues(v, valid)
   100  	case []float64:
   101  		bldr.(*array.Float64Builder).AppendValues(v, valid)
   102  	}
   103  
   104  	arr := bldr.NewArray()
   105  
   106  	field := arrow.Field{Name: "column", Type: dt, Nullable: nullable}
   107  	sc := arrow.NewSchema([]arrow.Field{field}, nil)
   108  	col := arrow.NewColumnFromArr(field, arr)
   109  	defer col.Release()
   110  	return array.NewTable(sc, []arrow.Column{col}, int64(size))
   111  }
   112  
   113  func BenchmarkWriteColumn(b *testing.B) {
   114  	int32Values := make([]int32, SIZELEN)
   115  	int64Values := make([]int64, SIZELEN)
   116  	float32Values := make([]float32, SIZELEN)
   117  	float64Values := make([]float64, SIZELEN)
   118  	for i := 0; i < SIZELEN; i++ {
   119  		int32Values[i] = 128
   120  		int64Values[i] = 128
   121  		float32Values[i] = 128
   122  		float64Values[i] = 128
   123  	}
   124  
   125  	tests := []struct {
   126  		name     string
   127  		dt       arrow.DataType
   128  		values   interface{}
   129  		nullable bool
   130  		nbytes   int64
   131  	}{
   132  		{"int32 not nullable", arrow.PrimitiveTypes.Int32, int32Values, false, int64(arrow.Int32Traits.BytesRequired(SIZELEN))},
   133  		{"int32 nullable", arrow.PrimitiveTypes.Int32, int32Values, true, int64(arrow.Int32Traits.BytesRequired(SIZELEN))},
   134  		{"int64 not nullable", arrow.PrimitiveTypes.Int64, int64Values, false, int64(arrow.Int64Traits.BytesRequired(SIZELEN))},
   135  		{"int64 nullable", arrow.PrimitiveTypes.Int64, int64Values, true, int64(arrow.Int64Traits.BytesRequired(SIZELEN))},
   136  		{"float32 not nullable", arrow.PrimitiveTypes.Float32, float32Values, false, int64(arrow.Float32Traits.BytesRequired(SIZELEN))},
   137  		{"float32 nullable", arrow.PrimitiveTypes.Float32, float32Values, true, int64(arrow.Float32Traits.BytesRequired(SIZELEN))},
   138  		{"float64 not nullable", arrow.PrimitiveTypes.Float64, float64Values, false, int64(arrow.Float64Traits.BytesRequired(SIZELEN))},
   139  		{"float64 nullable", arrow.PrimitiveTypes.Float64, float64Values, true, int64(arrow.Float64Traits.BytesRequired(SIZELEN))},
   140  	}
   141  
   142  	props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))
   143  	arrProps := pqarrow.DefaultWriterProps()
   144  
   145  	for _, tt := range tests {
   146  		b.Run(tt.name, func(b *testing.B) {
   147  			tbl := tableFromVec(tt.dt, SIZELEN, tt.values, tt.nullable, alternateOrNA)
   148  			b.Cleanup(func() { tbl.Release() })
   149  			var buf bytes.Buffer
   150  			buf.Grow(int(tt.nbytes))
   151  			b.ResetTimer()
   152  			b.SetBytes(tt.nbytes)
   153  
   154  			for i := 0; i < b.N; i++ {
   155  				buf.Reset()
   156  				err := pqarrow.WriteTable(tbl, &buf, SIZELEN, props, arrProps)
   157  				if err != nil {
   158  					b.Error(err)
   159  				}
   160  			}
   161  		})
   162  	}
   163  }
   164  
   165  func benchReadTable(b *testing.B, name string, tbl arrow.Table, nbytes int64) {
   166  	props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))
   167  	arrProps := pqarrow.DefaultWriterProps()
   168  
   169  	var buf bytes.Buffer
   170  	if err := pqarrow.WriteTable(tbl, &buf, SIZELEN, props, arrProps); err != nil {
   171  		b.Error(err)
   172  	}
   173  	ctx := context.Background()
   174  
   175  	b.ResetTimer()
   176  	b.Run(name, func(b *testing.B) {
   177  		b.SetBytes(nbytes)
   178  
   179  		for i := 0; i < b.N; i++ {
   180  			pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
   181  			if err != nil {
   182  				b.Error(err)
   183  			}
   184  
   185  			reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
   186  			if err != nil {
   187  				b.Error(err)
   188  			}
   189  
   190  			tbl, err := reader.ReadTable(ctx)
   191  			if err != nil {
   192  				b.Error(err)
   193  			}
   194  			defer tbl.Release()
   195  		}
   196  	})
   197  }
   198  
   199  func BenchmarkReadColumnInt32(b *testing.B) {
   200  	tests := []struct {
   201  		name     string
   202  		nullable bool
   203  		nullPct  int
   204  		fvPct    int
   205  	}{
   206  		{"int32 not null 1pct", false, alternateOrNA, 1},
   207  		{"int32 not null 10pct", false, alternateOrNA, 10},
   208  		{"int32 not null 50pct", false, alternateOrNA, 50},
   209  		{"int32 nullable alt", true, alternateOrNA, 0},
   210  		{"int32 nullable 1pct 1pct", true, 1, 1},
   211  		{"int32 nullable 10pct 10pct", true, 10, 10},
   212  		{"int32 nullable 25pct 5pct", true, 25, 5},
   213  		{"int32 nullable 50pct 50pct", true, 50, 50},
   214  		{"int32 nullable 50pct 0pct", true, 50, 0},
   215  		{"int32 nullable 99pct 50pct", true, 99, 50},
   216  		{"int32 nullable 99pct 0pct", true, 99, 0},
   217  	}
   218  
   219  	for _, tt := range tests {
   220  		values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500)
   221  		tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct)
   222  		benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN)))
   223  	}
   224  }
   225  
   226  func BenchmarkReadColumnInt64(b *testing.B) {
   227  	tests := []struct {
   228  		name     string
   229  		nullable bool
   230  		nullPct  int
   231  		fvPct    int
   232  	}{
   233  		{"int64 not null 1pct", false, alternateOrNA, 1},
   234  		{"int64 not null 10pct", false, alternateOrNA, 10},
   235  		{"int64 not null 50pct", false, alternateOrNA, 50},
   236  		{"int64 nullable alt", true, alternateOrNA, 0},
   237  		{"int64 nullable 1pct 1pct", true, 1, 1},
   238  		{"int64 nullable 5pct 5pct", true, 5, 5},
   239  		{"int64 nullable 10pct 5pct", true, 10, 5},
   240  		{"int64 nullable 25pct 10pct", true, 25, 10},
   241  		{"int64 nullable 30pct 10pct", true, 30, 10},
   242  		{"int64 nullable 35pct 10pct", true, 35, 10},
   243  		{"int64 nullable 45pct 25pct", true, 45, 25},
   244  		{"int64 nullable 50pct 50pct", true, 50, 50},
   245  		{"int64 nullable 50pct 1pct", true, 50, 1},
   246  		{"int64 nullable 75pct 1pct", true, 75, 1},
   247  		{"int64 nullable 99pct 50pct", true, 99, 50},
   248  		{"int64 nullable 99pct 0pct", true, 99, 0},
   249  	}
   250  
   251  	for _, tt := range tests {
   252  		values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500)
   253  		tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct)
   254  		benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN)))
   255  	}
   256  }
   257  
   258  func BenchmarkReadColumnFloat64(b *testing.B) {
   259  	tests := []struct {
   260  		name     string
   261  		nullable bool
   262  		nullPct  int
   263  		fvPct    int
   264  	}{
   265  		{"double not null 1pct", false, alternateOrNA, 0},
   266  		{"double not null 20pct", false, alternateOrNA, 20},
   267  		{"double nullable alt", true, alternateOrNA, 0},
   268  		{"double nullable 10pct 50pct", true, 10, 50},
   269  		{"double nullable 25pct 25pct", true, 25, 25},
   270  	}
   271  
   272  	for _, tt := range tests {
   273  		values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500)
   274  		tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct)
   275  		benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN)))
   276  	}
   277  }