github.com/apache/arrow/go/v14@v14.0.2/parquet/pqarrow/reader_writer_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow_test 18 19 import ( 20 "bytes" 21 "context" 22 "testing" 23 "unsafe" 24 25 "github.com/apache/arrow/go/v14/arrow" 26 "github.com/apache/arrow/go/v14/arrow/array" 27 "github.com/apache/arrow/go/v14/arrow/memory" 28 "github.com/apache/arrow/go/v14/parquet" 29 "github.com/apache/arrow/go/v14/parquet/file" 30 "github.com/apache/arrow/go/v14/parquet/pqarrow" 31 "golang.org/x/exp/rand" 32 "gonum.org/v1/gonum/stat/distuv" 33 ) 34 35 const alternateOrNA = -1 36 const SIZELEN = 1024 * 1024 37 38 func randomUint8(size, truePct int, sampleVals [2]uint8, seed uint64) []uint8 { 39 ret := make([]uint8, size) 40 if truePct == alternateOrNA { 41 for idx := range ret { 42 ret[idx] = uint8(idx % 2) 43 } 44 return ret 45 } 46 47 dist := distuv.Bernoulli{ 48 P: float64(truePct) / 100.0, 49 Src: rand.NewSource(seed), 50 } 51 52 for idx := range ret { 53 ret[idx] = sampleVals[int(dist.Rand())] 54 } 55 return ret 56 } 57 58 func randomInt32(size, truePct int, sampleVals [2]int32, seed uint64) []int32 { 59 ret := make([]int32, size) 60 if truePct == alternateOrNA { 61 for idx := range ret { 62 ret[idx] = int32(idx % 2) 63 } 64 return ret 65 } 66 67 dist := distuv.Bernoulli{ 68 P: float64(truePct) / 100.0, 69 Src: rand.NewSource(seed), 70 } 71 72 for idx := range ret { 73 ret[idx] = sampleVals[int(dist.Rand())] 74 } 75 return ret 76 } 77 78 func tableFromVec(dt arrow.DataType, size int, data interface{}, nullable bool, nullPct int) arrow.Table { 79 if !nullable && nullPct != alternateOrNA { 80 panic("bad check") 81 } 82 83 var valid []bool 84 if nullable { 85 // true values select index 1 of sample values 86 validBytes := randomUint8(size, nullPct, [2]uint8{1, 0}, 500) 87 valid = *(*[]bool)(unsafe.Pointer(&validBytes)) 88 } 89 90 bldr := array.NewBuilder(memory.DefaultAllocator, dt) 91 defer bldr.Release() 92 93 switch v := data.(type) { 94 case []int32: 95 bldr.(*array.Int32Builder).AppendValues(v, valid) 96 case []int64: 97 bldr.(*array.Int64Builder).AppendValues(v, valid) 98 case []float32: 99 bldr.(*array.Float32Builder).AppendValues(v, valid) 100 case []float64: 101 bldr.(*array.Float64Builder).AppendValues(v, valid) 102 } 103 104 arr := bldr.NewArray() 105 106 field := arrow.Field{Name: "column", Type: dt, Nullable: nullable} 107 sc := arrow.NewSchema([]arrow.Field{field}, nil) 108 col := arrow.NewColumnFromArr(field, arr) 109 defer col.Release() 110 return array.NewTable(sc, []arrow.Column{col}, int64(size)) 111 } 112 113 func BenchmarkWriteColumn(b *testing.B) { 114 int32Values := make([]int32, SIZELEN) 115 int64Values := make([]int64, SIZELEN) 116 float32Values := make([]float32, SIZELEN) 117 float64Values := make([]float64, SIZELEN) 118 for i := 0; i < SIZELEN; i++ { 119 int32Values[i] = 128 120 int64Values[i] = 128 121 float32Values[i] = 128 122 float64Values[i] = 128 123 } 124 125 tests := []struct { 126 name string 127 dt arrow.DataType 128 values interface{} 129 nullable bool 130 nbytes int64 131 }{ 132 {"int32 not nullable", arrow.PrimitiveTypes.Int32, int32Values, false, int64(arrow.Int32Traits.BytesRequired(SIZELEN))}, 133 {"int32 nullable", arrow.PrimitiveTypes.Int32, int32Values, true, int64(arrow.Int32Traits.BytesRequired(SIZELEN))}, 134 {"int64 not nullable", arrow.PrimitiveTypes.Int64, int64Values, false, int64(arrow.Int64Traits.BytesRequired(SIZELEN))}, 135 {"int64 nullable", arrow.PrimitiveTypes.Int64, int64Values, true, int64(arrow.Int64Traits.BytesRequired(SIZELEN))}, 136 {"float32 not nullable", arrow.PrimitiveTypes.Float32, float32Values, false, int64(arrow.Float32Traits.BytesRequired(SIZELEN))}, 137 {"float32 nullable", arrow.PrimitiveTypes.Float32, float32Values, true, int64(arrow.Float32Traits.BytesRequired(SIZELEN))}, 138 {"float64 not nullable", arrow.PrimitiveTypes.Float64, float64Values, false, int64(arrow.Float64Traits.BytesRequired(SIZELEN))}, 139 {"float64 nullable", arrow.PrimitiveTypes.Float64, float64Values, true, int64(arrow.Float64Traits.BytesRequired(SIZELEN))}, 140 } 141 142 props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false)) 143 arrProps := pqarrow.DefaultWriterProps() 144 145 for _, tt := range tests { 146 b.Run(tt.name, func(b *testing.B) { 147 tbl := tableFromVec(tt.dt, SIZELEN, tt.values, tt.nullable, alternateOrNA) 148 b.Cleanup(func() { tbl.Release() }) 149 var buf bytes.Buffer 150 buf.Grow(int(tt.nbytes)) 151 b.ResetTimer() 152 b.SetBytes(tt.nbytes) 153 154 for i := 0; i < b.N; i++ { 155 buf.Reset() 156 err := pqarrow.WriteTable(tbl, &buf, SIZELEN, props, arrProps) 157 if err != nil { 158 b.Error(err) 159 } 160 } 161 }) 162 } 163 } 164 165 func benchReadTable(b *testing.B, name string, tbl arrow.Table, nbytes int64) { 166 props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false)) 167 arrProps := pqarrow.DefaultWriterProps() 168 169 var buf bytes.Buffer 170 if err := pqarrow.WriteTable(tbl, &buf, SIZELEN, props, arrProps); err != nil { 171 b.Error(err) 172 } 173 ctx := context.Background() 174 175 b.ResetTimer() 176 b.Run(name, func(b *testing.B) { 177 b.SetBytes(nbytes) 178 179 for i := 0; i < b.N; i++ { 180 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) 181 if err != nil { 182 b.Error(err) 183 } 184 185 reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator) 186 if err != nil { 187 b.Error(err) 188 } 189 190 tbl, err := reader.ReadTable(ctx) 191 if err != nil { 192 b.Error(err) 193 } 194 defer tbl.Release() 195 } 196 }) 197 } 198 199 func BenchmarkReadColumnInt32(b *testing.B) { 200 tests := []struct { 201 name string 202 nullable bool 203 nullPct int 204 fvPct int 205 }{ 206 {"int32 not null 1pct", false, alternateOrNA, 1}, 207 {"int32 not null 10pct", false, alternateOrNA, 10}, 208 {"int32 not null 50pct", false, alternateOrNA, 50}, 209 {"int32 nullable alt", true, alternateOrNA, 0}, 210 {"int32 nullable 1pct 1pct", true, 1, 1}, 211 {"int32 nullable 10pct 10pct", true, 10, 10}, 212 {"int32 nullable 25pct 5pct", true, 25, 5}, 213 {"int32 nullable 50pct 50pct", true, 50, 50}, 214 {"int32 nullable 50pct 0pct", true, 50, 0}, 215 {"int32 nullable 99pct 50pct", true, 99, 50}, 216 {"int32 nullable 99pct 0pct", true, 99, 0}, 217 } 218 219 for _, tt := range tests { 220 values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500) 221 tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct) 222 benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN))) 223 } 224 } 225 226 func BenchmarkReadColumnInt64(b *testing.B) { 227 tests := []struct { 228 name string 229 nullable bool 230 nullPct int 231 fvPct int 232 }{ 233 {"int64 not null 1pct", false, alternateOrNA, 1}, 234 {"int64 not null 10pct", false, alternateOrNA, 10}, 235 {"int64 not null 50pct", false, alternateOrNA, 50}, 236 {"int64 nullable alt", true, alternateOrNA, 0}, 237 {"int64 nullable 1pct 1pct", true, 1, 1}, 238 {"int64 nullable 5pct 5pct", true, 5, 5}, 239 {"int64 nullable 10pct 5pct", true, 10, 5}, 240 {"int64 nullable 25pct 10pct", true, 25, 10}, 241 {"int64 nullable 30pct 10pct", true, 30, 10}, 242 {"int64 nullable 35pct 10pct", true, 35, 10}, 243 {"int64 nullable 45pct 25pct", true, 45, 25}, 244 {"int64 nullable 50pct 50pct", true, 50, 50}, 245 {"int64 nullable 50pct 1pct", true, 50, 1}, 246 {"int64 nullable 75pct 1pct", true, 75, 1}, 247 {"int64 nullable 99pct 50pct", true, 99, 50}, 248 {"int64 nullable 99pct 0pct", true, 99, 0}, 249 } 250 251 for _, tt := range tests { 252 values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500) 253 tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct) 254 benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN))) 255 } 256 } 257 258 func BenchmarkReadColumnFloat64(b *testing.B) { 259 tests := []struct { 260 name string 261 nullable bool 262 nullPct int 263 fvPct int 264 }{ 265 {"double not null 1pct", false, alternateOrNA, 0}, 266 {"double not null 20pct", false, alternateOrNA, 20}, 267 {"double nullable alt", true, alternateOrNA, 0}, 268 {"double nullable 10pct 50pct", true, 10, 50}, 269 {"double nullable 25pct 25pct", true, 25, 25}, 270 } 271 272 for _, tt := range tests { 273 values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500) 274 tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct) 275 benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN))) 276 } 277 }