github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/encoding_benchmarks_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding_test 18 19 import ( 20 "fmt" 21 "math" 22 "testing" 23 24 "github.com/apache/arrow/go/v14/arrow" 25 "github.com/apache/arrow/go/v14/arrow/array" 26 "github.com/apache/arrow/go/v14/arrow/memory" 27 "github.com/apache/arrow/go/v14/internal/hashing" 28 "github.com/apache/arrow/go/v14/parquet" 29 "github.com/apache/arrow/go/v14/parquet/internal/encoding" 30 "github.com/apache/arrow/go/v14/parquet/internal/testutils" 31 "github.com/apache/arrow/go/v14/parquet/schema" 32 ) 33 34 const ( 35 MINSIZE = 1024 36 MAXSIZE = 65536 37 ) 38 39 func BenchmarkPlainEncodingBoolean(b *testing.B) { 40 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 41 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 42 values := make([]bool, sz) 43 for idx := range values { 44 values[idx] = true 45 } 46 encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, 47 false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) 48 b.ResetTimer() 49 b.SetBytes(int64(len(values))) 50 for n := 0; n < b.N; n++ { 51 encoder.Put(values) 52 buf, _ := encoder.FlushValues() 53 buf.Release() 54 } 55 }) 56 } 57 } 58 59 func BenchmarkPlainEncodingInt32(b *testing.B) { 60 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 61 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 62 values := make([]int32, sz) 63 for idx := range values { 64 values[idx] = 64 65 } 66 encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, 67 false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) 68 b.ResetTimer() 69 b.SetBytes(int64(len(values) * arrow.Int32SizeBytes)) 70 for n := 0; n < b.N; n++ { 71 encoder.Put(values) 72 buf, _ := encoder.FlushValues() 73 buf.Release() 74 } 75 }) 76 } 77 } 78 79 func BenchmarkPlainEncodingInt64(b *testing.B) { 80 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 81 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 82 values := make([]int64, sz) 83 for idx := range values { 84 values[idx] = 64 85 } 86 encoder := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.Plain, 87 false, nil, memory.DefaultAllocator).(encoding.Int64Encoder) 88 b.ResetTimer() 89 b.SetBytes(int64(len(values) * arrow.Int64SizeBytes)) 90 for n := 0; n < b.N; n++ { 91 encoder.Put(values) 92 buf, _ := encoder.FlushValues() 93 buf.Release() 94 } 95 }) 96 } 97 } 98 99 func BenchmarkPlainEncodingFloat32(b *testing.B) { 100 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 101 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 102 values := make([]float32, sz) 103 for idx := range values { 104 values[idx] = 64.0 105 } 106 encoder := encoding.NewEncoder(parquet.Types.Float, parquet.Encodings.Plain, 107 false, nil, memory.DefaultAllocator).(encoding.Float32Encoder) 108 b.ResetTimer() 109 b.SetBytes(int64(len(values) * arrow.Float32SizeBytes)) 110 for n := 0; n < b.N; n++ { 111 encoder.Put(values) 112 buf, _ := encoder.FlushValues() 113 buf.Release() 114 } 115 }) 116 } 117 } 118 119 func BenchmarkPlainEncodingFloat64(b *testing.B) { 120 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 121 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 122 values := make([]float64, sz) 123 for idx := range values { 124 values[idx] = 64 125 } 126 encoder := encoding.NewEncoder(parquet.Types.Double, parquet.Encodings.Plain, 127 false, nil, memory.DefaultAllocator).(encoding.Float64Encoder) 128 b.ResetTimer() 129 b.SetBytes(int64(len(values) * arrow.Float64SizeBytes)) 130 for n := 0; n < b.N; n++ { 131 encoder.Put(values) 132 buf, _ := encoder.FlushValues() 133 buf.Release() 134 } 135 }) 136 } 137 } 138 139 func BenchmarkPlainDecodingBoolean(b *testing.B) { 140 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 141 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 142 output := make([]bool, sz) 143 values := make([]bool, sz) 144 for idx := range values { 145 values[idx] = true 146 } 147 encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, 148 false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder) 149 encoder.Put(values) 150 buf, _ := encoder.FlushValues() 151 defer buf.Release() 152 153 decoder := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, nil, memory.DefaultAllocator) 154 b.ResetTimer() 155 b.SetBytes(int64(len(values))) 156 for n := 0; n < b.N; n++ { 157 decoder.SetData(sz, buf.Bytes()) 158 decoder.(encoding.BooleanDecoder).Decode(output) 159 } 160 }) 161 } 162 } 163 164 func BenchmarkPlainDecodingInt32(b *testing.B) { 165 for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { 166 b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { 167 output := make([]int32, sz) 168 values := make([]int32, sz) 169 for idx := range values { 170 values[idx] = 64 171 } 172 encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain, 173 false, nil, memory.DefaultAllocator).(encoding.Int32Encoder) 174 encoder.Put(values) 175 buf, _ := encoder.FlushValues() 176 defer buf.Release() 177 178 decoder := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.Plain, nil, memory.DefaultAllocator) 179 b.ResetTimer() 180 b.SetBytes(int64(len(values))) 181 for n := 0; n < b.N; n++ { 182 decoder.SetData(sz, buf.Bytes()) 183 decoder.(encoding.Int32Decoder).Decode(output) 184 } 185 }) 186 } 187 } 188 189 func BenchmarkMemoTableFloat64(b *testing.B) { 190 tests := []struct { 191 nunique int32 192 nvalues int64 193 }{ 194 {100, 65535}, 195 {1000, 65535}, 196 {5000, 65535}, 197 } 198 199 for _, tt := range tests { 200 b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { 201 rag := testutils.NewRandomArrayGenerator(0) 202 dict := rag.Float64(int64(tt.nunique), 0) 203 indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) 204 205 values := make([]float64, tt.nvalues) 206 for idx := range values { 207 values[idx] = dict.Value(int(indices.Value(idx))) 208 } 209 210 b.ResetTimer() 211 b.Run("go map", func(b *testing.B) { 212 for i := 0; i < b.N; i++ { 213 tbl := encoding.NewFloat64MemoTable(memory.DefaultAllocator) 214 for _, v := range values { 215 tbl.GetOrInsert(v) 216 } 217 if tbl.Size() != int(tt.nunique) { 218 b.Fatal(tbl.Size(), tt.nunique) 219 } 220 } 221 }) 222 b.ResetTimer() 223 b.Run("xxh3", func(b *testing.B) { 224 for i := 0; i < b.N; i++ { 225 tbl := hashing.NewFloat64MemoTable(0) 226 for _, v := range values { 227 tbl.GetOrInsert(v) 228 } 229 if tbl.Size() != int(tt.nunique) { 230 b.Fatal(tbl.Size(), tt.nunique) 231 } 232 } 233 }) 234 }) 235 } 236 } 237 238 func BenchmarkMemoTableInt32(b *testing.B) { 239 tests := []struct { 240 nunique int32 241 nvalues int64 242 }{ 243 {100, 65535}, 244 {1000, 65535}, 245 {5000, 65535}, 246 } 247 248 for _, tt := range tests { 249 b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) { 250 rag := testutils.NewRandomArrayGenerator(0) 251 dict := rag.Int32(int64(tt.nunique), 0, math.MaxInt32-1, 0) 252 indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) 253 254 values := make([]int32, tt.nvalues) 255 for idx := range values { 256 values[idx] = dict.Value(int(indices.Value(idx))) 257 } 258 b.ResetTimer() 259 b.Run("xxh3", func(b *testing.B) { 260 for i := 0; i < b.N; i++ { 261 tbl := hashing.NewInt32MemoTable(0) 262 for _, v := range values { 263 tbl.GetOrInsert(v) 264 } 265 if tbl.Size() != int(tt.nunique) { 266 b.Fatal(tbl.Size(), tt.nunique) 267 } 268 } 269 }) 270 271 b.Run("go map", func(b *testing.B) { 272 for i := 0; i < b.N; i++ { 273 tbl := encoding.NewInt32MemoTable(memory.DefaultAllocator) 274 for _, v := range values { 275 tbl.GetOrInsert(v) 276 } 277 if tbl.Size() != int(tt.nunique) { 278 b.Fatal(tbl.Size(), tt.nunique) 279 } 280 } 281 }) 282 }) 283 } 284 } 285 286 func BenchmarkMemoTable(b *testing.B) { 287 tests := []struct { 288 nunique int32 289 minLen int32 290 maxLen int32 291 nvalues int64 292 }{ 293 {100, 32, 32, 65535}, 294 {100, 8, 32, 65535}, 295 {1000, 32, 32, 65535}, 296 {1000, 8, 32, 65535}, 297 {5000, 32, 32, 65535}, 298 {5000, 8, 32, 65535}, 299 } 300 301 for _, tt := range tests { 302 b.Run(fmt.Sprintf("%d unique len %d-%d n %d", tt.nunique, tt.minLen, tt.maxLen, tt.nvalues), func(b *testing.B) { 303 304 rag := testutils.NewRandomArrayGenerator(0) 305 dict := rag.ByteArray(int64(tt.nunique), tt.minLen, tt.maxLen, 0).(*array.String) 306 indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0) 307 308 values := make([]parquet.ByteArray, tt.nvalues) 309 for idx := range values { 310 values[idx] = []byte(dict.Value(int(indices.Value(idx)))) 311 } 312 313 b.ResetTimer() 314 315 b.Run("xxh3", func(b *testing.B) { 316 for i := 0; i < b.N; i++ { 317 tbl := hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)) 318 for _, v := range values { 319 tbl.GetOrInsert(v) 320 } 321 if tbl.Size() != int(tt.nunique) { 322 b.Fatal(tbl.Size(), tt.nunique) 323 } 324 tbl.Release() 325 } 326 }) 327 b.ResetTimer() 328 b.Run("go map", func(b *testing.B) { 329 for i := 0; i < b.N; i++ { 330 tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator) 331 for _, v := range values { 332 tbl.GetOrInsert(v) 333 } 334 if tbl.Size() != int(tt.nunique) { 335 b.Fatal(tbl.Size(), tt.nunique) 336 } 337 tbl.Release() 338 } 339 }) 340 }) 341 } 342 } 343 344 func BenchmarkMemoTableAllUnique(b *testing.B) { 345 tests := []struct { 346 minLen int32 347 maxLen int32 348 nvalues int64 349 }{ 350 {32, 32, 1024}, 351 {8, 32, 1024}, 352 {32, 32, 32767}, 353 {8, 32, 32767}, 354 {32, 32, 65535}, 355 {8, 32, 65535}, 356 } 357 for _, tt := range tests { 358 b.Run(fmt.Sprintf("values %d len %d-%d", tt.nvalues, tt.minLen, tt.maxLen), func(b *testing.B) { 359 360 rag := testutils.NewRandomArrayGenerator(0) 361 dict := rag.ByteArray(tt.nvalues, tt.minLen, tt.maxLen, 0).(*array.String) 362 363 values := make([]parquet.ByteArray, tt.nvalues) 364 for idx := range values { 365 values[idx] = []byte(dict.Value(idx)) 366 } 367 368 b.ResetTimer() 369 b.Run("go map", func(b *testing.B) { 370 for i := 0; i < b.N; i++ { 371 tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator) 372 for _, v := range values { 373 tbl.GetOrInsert(v) 374 } 375 if tbl.Size() != int(tt.nvalues) { 376 b.Fatal(tbl.Size(), tt.nvalues) 377 } 378 tbl.Release() 379 } 380 }) 381 382 b.Run("xxh3", func(b *testing.B) { 383 for i := 0; i < b.N; i++ { 384 tbl := hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary)) 385 for _, v := range values { 386 tbl.GetOrInsert(v) 387 } 388 if tbl.Size() != int(tt.nvalues) { 389 b.Fatal(tbl.Size(), tt.nvalues) 390 } 391 tbl.Release() 392 } 393 }) 394 }) 395 } 396 397 } 398 399 func BenchmarkEncodeDictByteArray(b *testing.B) { 400 const ( 401 nunique = 100 402 minLen = 8 403 maxLen = 32 404 nvalues = 65535 405 ) 406 407 rag := testutils.NewRandomArrayGenerator(0) 408 dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String) 409 indices := rag.Int32(nvalues, 0, nunique-1, 0) 410 411 values := make([]parquet.ByteArray, nvalues) 412 for idx := range values { 413 values[idx] = []byte(dict.Value(int(indices.Value(idx)))) 414 } 415 col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) 416 417 out := make([]byte, nunique*(maxLen+arrow.Uint32SizeBytes)) 418 b.ResetTimer() 419 for i := 0; i < b.N; i++ { 420 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder) 421 enc.Put(values) 422 enc.WriteDict(out) 423 } 424 } 425 426 func BenchmarkDecodeDictByteArray(b *testing.B) { 427 const ( 428 nunique = 100 429 minLen = 32 430 maxLen = 32 431 nvalues = 65535 432 ) 433 434 rag := testutils.NewRandomArrayGenerator(0) 435 dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String) 436 indices := rag.Int32(nvalues, 0, nunique-1, 0) 437 438 values := make([]parquet.ByteArray, nvalues) 439 for idx := range values { 440 values[idx] = []byte(dict.Value(int(indices.Value(idx)))) 441 } 442 443 col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) 444 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder) 445 enc.Put(values) 446 447 dictBuf := make([]byte, enc.DictEncodedSize()) 448 enc.WriteDict(dictBuf) 449 450 idxBuf := make([]byte, enc.EstimatedDataEncodedSize()) 451 enc.WriteIndices(idxBuf) 452 453 out := make([]parquet.ByteArray, nvalues) 454 455 b.ResetTimer() 456 457 for i := 0; i < b.N; i++ { 458 dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.Plain, col, memory.DefaultAllocator) 459 dec.SetData(nunique, dictBuf) 460 dictDec := encoding.NewDictDecoder(parquet.Types.ByteArray, col, memory.DefaultAllocator).(*encoding.DictByteArrayDecoder) 461 dictDec.SetDict(dec) 462 dictDec.SetData(nvalues, idxBuf) 463 464 dictDec.Decode(out) 465 } 466 }