github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/encoding_benchmarks_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding_test
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"testing"
    23  
    24  	"github.com/apache/arrow/go/v14/arrow"
    25  	"github.com/apache/arrow/go/v14/arrow/array"
    26  	"github.com/apache/arrow/go/v14/arrow/memory"
    27  	"github.com/apache/arrow/go/v14/internal/hashing"
    28  	"github.com/apache/arrow/go/v14/parquet"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
    30  	"github.com/apache/arrow/go/v14/parquet/internal/testutils"
    31  	"github.com/apache/arrow/go/v14/parquet/schema"
    32  )
    33  
    34  const (
    35  	MINSIZE = 1024
    36  	MAXSIZE = 65536
    37  )
    38  
    39  func BenchmarkPlainEncodingBoolean(b *testing.B) {
    40  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
    41  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
    42  			values := make([]bool, sz)
    43  			for idx := range values {
    44  				values[idx] = true
    45  			}
    46  			encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain,
    47  				false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder)
    48  			b.ResetTimer()
    49  			b.SetBytes(int64(len(values)))
    50  			for n := 0; n < b.N; n++ {
    51  				encoder.Put(values)
    52  				buf, _ := encoder.FlushValues()
    53  				buf.Release()
    54  			}
    55  		})
    56  	}
    57  }
    58  
    59  func BenchmarkPlainEncodingInt32(b *testing.B) {
    60  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
    61  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
    62  			values := make([]int32, sz)
    63  			for idx := range values {
    64  				values[idx] = 64
    65  			}
    66  			encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain,
    67  				false, nil, memory.DefaultAllocator).(encoding.Int32Encoder)
    68  			b.ResetTimer()
    69  			b.SetBytes(int64(len(values) * arrow.Int32SizeBytes))
    70  			for n := 0; n < b.N; n++ {
    71  				encoder.Put(values)
    72  				buf, _ := encoder.FlushValues()
    73  				buf.Release()
    74  			}
    75  		})
    76  	}
    77  }
    78  
    79  func BenchmarkPlainEncodingInt64(b *testing.B) {
    80  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
    81  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
    82  			values := make([]int64, sz)
    83  			for idx := range values {
    84  				values[idx] = 64
    85  			}
    86  			encoder := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.Plain,
    87  				false, nil, memory.DefaultAllocator).(encoding.Int64Encoder)
    88  			b.ResetTimer()
    89  			b.SetBytes(int64(len(values) * arrow.Int64SizeBytes))
    90  			for n := 0; n < b.N; n++ {
    91  				encoder.Put(values)
    92  				buf, _ := encoder.FlushValues()
    93  				buf.Release()
    94  			}
    95  		})
    96  	}
    97  }
    98  
    99  func BenchmarkPlainEncodingFloat32(b *testing.B) {
   100  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
   101  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
   102  			values := make([]float32, sz)
   103  			for idx := range values {
   104  				values[idx] = 64.0
   105  			}
   106  			encoder := encoding.NewEncoder(parquet.Types.Float, parquet.Encodings.Plain,
   107  				false, nil, memory.DefaultAllocator).(encoding.Float32Encoder)
   108  			b.ResetTimer()
   109  			b.SetBytes(int64(len(values) * arrow.Float32SizeBytes))
   110  			for n := 0; n < b.N; n++ {
   111  				encoder.Put(values)
   112  				buf, _ := encoder.FlushValues()
   113  				buf.Release()
   114  			}
   115  		})
   116  	}
   117  }
   118  
   119  func BenchmarkPlainEncodingFloat64(b *testing.B) {
   120  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
   121  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
   122  			values := make([]float64, sz)
   123  			for idx := range values {
   124  				values[idx] = 64
   125  			}
   126  			encoder := encoding.NewEncoder(parquet.Types.Double, parquet.Encodings.Plain,
   127  				false, nil, memory.DefaultAllocator).(encoding.Float64Encoder)
   128  			b.ResetTimer()
   129  			b.SetBytes(int64(len(values) * arrow.Float64SizeBytes))
   130  			for n := 0; n < b.N; n++ {
   131  				encoder.Put(values)
   132  				buf, _ := encoder.FlushValues()
   133  				buf.Release()
   134  			}
   135  		})
   136  	}
   137  }
   138  
   139  func BenchmarkPlainDecodingBoolean(b *testing.B) {
   140  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
   141  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
   142  			output := make([]bool, sz)
   143  			values := make([]bool, sz)
   144  			for idx := range values {
   145  				values[idx] = true
   146  			}
   147  			encoder := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain,
   148  				false, nil, memory.DefaultAllocator).(encoding.BooleanEncoder)
   149  			encoder.Put(values)
   150  			buf, _ := encoder.FlushValues()
   151  			defer buf.Release()
   152  
   153  			decoder := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, nil, memory.DefaultAllocator)
   154  			b.ResetTimer()
   155  			b.SetBytes(int64(len(values)))
   156  			for n := 0; n < b.N; n++ {
   157  				decoder.SetData(sz, buf.Bytes())
   158  				decoder.(encoding.BooleanDecoder).Decode(output)
   159  			}
   160  		})
   161  	}
   162  }
   163  
   164  func BenchmarkPlainDecodingInt32(b *testing.B) {
   165  	for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 {
   166  		b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) {
   167  			output := make([]int32, sz)
   168  			values := make([]int32, sz)
   169  			for idx := range values {
   170  				values[idx] = 64
   171  			}
   172  			encoder := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.Plain,
   173  				false, nil, memory.DefaultAllocator).(encoding.Int32Encoder)
   174  			encoder.Put(values)
   175  			buf, _ := encoder.FlushValues()
   176  			defer buf.Release()
   177  
   178  			decoder := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.Plain, nil, memory.DefaultAllocator)
   179  			b.ResetTimer()
   180  			b.SetBytes(int64(len(values)))
   181  			for n := 0; n < b.N; n++ {
   182  				decoder.SetData(sz, buf.Bytes())
   183  				decoder.(encoding.Int32Decoder).Decode(output)
   184  			}
   185  		})
   186  	}
   187  }
   188  
   189  func BenchmarkMemoTableFloat64(b *testing.B) {
   190  	tests := []struct {
   191  		nunique int32
   192  		nvalues int64
   193  	}{
   194  		{100, 65535},
   195  		{1000, 65535},
   196  		{5000, 65535},
   197  	}
   198  
   199  	for _, tt := range tests {
   200  		b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) {
   201  			rag := testutils.NewRandomArrayGenerator(0)
   202  			dict := rag.Float64(int64(tt.nunique), 0)
   203  			indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)
   204  
   205  			values := make([]float64, tt.nvalues)
   206  			for idx := range values {
   207  				values[idx] = dict.Value(int(indices.Value(idx)))
   208  			}
   209  
   210  			b.ResetTimer()
   211  			b.Run("go map", func(b *testing.B) {
   212  				for i := 0; i < b.N; i++ {
   213  					tbl := encoding.NewFloat64MemoTable(memory.DefaultAllocator)
   214  					for _, v := range values {
   215  						tbl.GetOrInsert(v)
   216  					}
   217  					if tbl.Size() != int(tt.nunique) {
   218  						b.Fatal(tbl.Size(), tt.nunique)
   219  					}
   220  				}
   221  			})
   222  			b.ResetTimer()
   223  			b.Run("xxh3", func(b *testing.B) {
   224  				for i := 0; i < b.N; i++ {
   225  					tbl := hashing.NewFloat64MemoTable(0)
   226  					for _, v := range values {
   227  						tbl.GetOrInsert(v)
   228  					}
   229  					if tbl.Size() != int(tt.nunique) {
   230  						b.Fatal(tbl.Size(), tt.nunique)
   231  					}
   232  				}
   233  			})
   234  		})
   235  	}
   236  }
   237  
   238  func BenchmarkMemoTableInt32(b *testing.B) {
   239  	tests := []struct {
   240  		nunique int32
   241  		nvalues int64
   242  	}{
   243  		{100, 65535},
   244  		{1000, 65535},
   245  		{5000, 65535},
   246  	}
   247  
   248  	for _, tt := range tests {
   249  		b.Run(fmt.Sprintf("%d unique n %d", tt.nunique, tt.nvalues), func(b *testing.B) {
   250  			rag := testutils.NewRandomArrayGenerator(0)
   251  			dict := rag.Int32(int64(tt.nunique), 0, math.MaxInt32-1, 0)
   252  			indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)
   253  
   254  			values := make([]int32, tt.nvalues)
   255  			for idx := range values {
   256  				values[idx] = dict.Value(int(indices.Value(idx)))
   257  			}
   258  			b.ResetTimer()
   259  			b.Run("xxh3", func(b *testing.B) {
   260  				for i := 0; i < b.N; i++ {
   261  					tbl := hashing.NewInt32MemoTable(0)
   262  					for _, v := range values {
   263  						tbl.GetOrInsert(v)
   264  					}
   265  					if tbl.Size() != int(tt.nunique) {
   266  						b.Fatal(tbl.Size(), tt.nunique)
   267  					}
   268  				}
   269  			})
   270  
   271  			b.Run("go map", func(b *testing.B) {
   272  				for i := 0; i < b.N; i++ {
   273  					tbl := encoding.NewInt32MemoTable(memory.DefaultAllocator)
   274  					for _, v := range values {
   275  						tbl.GetOrInsert(v)
   276  					}
   277  					if tbl.Size() != int(tt.nunique) {
   278  						b.Fatal(tbl.Size(), tt.nunique)
   279  					}
   280  				}
   281  			})
   282  		})
   283  	}
   284  }
   285  
   286  func BenchmarkMemoTable(b *testing.B) {
   287  	tests := []struct {
   288  		nunique int32
   289  		minLen  int32
   290  		maxLen  int32
   291  		nvalues int64
   292  	}{
   293  		{100, 32, 32, 65535},
   294  		{100, 8, 32, 65535},
   295  		{1000, 32, 32, 65535},
   296  		{1000, 8, 32, 65535},
   297  		{5000, 32, 32, 65535},
   298  		{5000, 8, 32, 65535},
   299  	}
   300  
   301  	for _, tt := range tests {
   302  		b.Run(fmt.Sprintf("%d unique len %d-%d n %d", tt.nunique, tt.minLen, tt.maxLen, tt.nvalues), func(b *testing.B) {
   303  
   304  			rag := testutils.NewRandomArrayGenerator(0)
   305  			dict := rag.ByteArray(int64(tt.nunique), tt.minLen, tt.maxLen, 0).(*array.String)
   306  			indices := rag.Int32(tt.nvalues, 0, int32(tt.nunique)-1, 0)
   307  
   308  			values := make([]parquet.ByteArray, tt.nvalues)
   309  			for idx := range values {
   310  				values[idx] = []byte(dict.Value(int(indices.Value(idx))))
   311  			}
   312  
   313  			b.ResetTimer()
   314  
   315  			b.Run("xxh3", func(b *testing.B) {
   316  				for i := 0; i < b.N; i++ {
   317  					tbl := hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary))
   318  					for _, v := range values {
   319  						tbl.GetOrInsert(v)
   320  					}
   321  					if tbl.Size() != int(tt.nunique) {
   322  						b.Fatal(tbl.Size(), tt.nunique)
   323  					}
   324  					tbl.Release()
   325  				}
   326  			})
   327  			b.ResetTimer()
   328  			b.Run("go map", func(b *testing.B) {
   329  				for i := 0; i < b.N; i++ {
   330  					tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator)
   331  					for _, v := range values {
   332  						tbl.GetOrInsert(v)
   333  					}
   334  					if tbl.Size() != int(tt.nunique) {
   335  						b.Fatal(tbl.Size(), tt.nunique)
   336  					}
   337  					tbl.Release()
   338  				}
   339  			})
   340  		})
   341  	}
   342  }
   343  
   344  func BenchmarkMemoTableAllUnique(b *testing.B) {
   345  	tests := []struct {
   346  		minLen  int32
   347  		maxLen  int32
   348  		nvalues int64
   349  	}{
   350  		{32, 32, 1024},
   351  		{8, 32, 1024},
   352  		{32, 32, 32767},
   353  		{8, 32, 32767},
   354  		{32, 32, 65535},
   355  		{8, 32, 65535},
   356  	}
   357  	for _, tt := range tests {
   358  		b.Run(fmt.Sprintf("values %d len %d-%d", tt.nvalues, tt.minLen, tt.maxLen), func(b *testing.B) {
   359  
   360  			rag := testutils.NewRandomArrayGenerator(0)
   361  			dict := rag.ByteArray(tt.nvalues, tt.minLen, tt.maxLen, 0).(*array.String)
   362  
   363  			values := make([]parquet.ByteArray, tt.nvalues)
   364  			for idx := range values {
   365  				values[idx] = []byte(dict.Value(idx))
   366  			}
   367  
   368  			b.ResetTimer()
   369  			b.Run("go map", func(b *testing.B) {
   370  				for i := 0; i < b.N; i++ {
   371  					tbl := encoding.NewBinaryMemoTable(memory.DefaultAllocator)
   372  					for _, v := range values {
   373  						tbl.GetOrInsert(v)
   374  					}
   375  					if tbl.Size() != int(tt.nvalues) {
   376  						b.Fatal(tbl.Size(), tt.nvalues)
   377  					}
   378  					tbl.Release()
   379  				}
   380  			})
   381  
   382  			b.Run("xxh3", func(b *testing.B) {
   383  				for i := 0; i < b.N; i++ {
   384  					tbl := hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary))
   385  					for _, v := range values {
   386  						tbl.GetOrInsert(v)
   387  					}
   388  					if tbl.Size() != int(tt.nvalues) {
   389  						b.Fatal(tbl.Size(), tt.nvalues)
   390  					}
   391  					tbl.Release()
   392  				}
   393  			})
   394  		})
   395  	}
   396  
   397  }
   398  
   399  func BenchmarkEncodeDictByteArray(b *testing.B) {
   400  	const (
   401  		nunique = 100
   402  		minLen  = 8
   403  		maxLen  = 32
   404  		nvalues = 65535
   405  	)
   406  
   407  	rag := testutils.NewRandomArrayGenerator(0)
   408  	dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String)
   409  	indices := rag.Int32(nvalues, 0, nunique-1, 0)
   410  
   411  	values := make([]parquet.ByteArray, nvalues)
   412  	for idx := range values {
   413  		values[idx] = []byte(dict.Value(int(indices.Value(idx))))
   414  	}
   415  	col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
   416  
   417  	out := make([]byte, nunique*(maxLen+arrow.Uint32SizeBytes))
   418  	b.ResetTimer()
   419  	for i := 0; i < b.N; i++ {
   420  		enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder)
   421  		enc.Put(values)
   422  		enc.WriteDict(out)
   423  	}
   424  }
   425  
   426  func BenchmarkDecodeDictByteArray(b *testing.B) {
   427  	const (
   428  		nunique = 100
   429  		minLen  = 32
   430  		maxLen  = 32
   431  		nvalues = 65535
   432  	)
   433  
   434  	rag := testutils.NewRandomArrayGenerator(0)
   435  	dict := rag.ByteArray(nunique, minLen, maxLen, 0).(*array.String)
   436  	indices := rag.Int32(nvalues, 0, nunique-1, 0)
   437  
   438  	values := make([]parquet.ByteArray, nvalues)
   439  	for idx := range values {
   440  		values[idx] = []byte(dict.Value(int(indices.Value(idx))))
   441  	}
   442  
   443  	col := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
   444  	enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.PlainDict, true, col, memory.DefaultAllocator).(*encoding.DictByteArrayEncoder)
   445  	enc.Put(values)
   446  
   447  	dictBuf := make([]byte, enc.DictEncodedSize())
   448  	enc.WriteDict(dictBuf)
   449  
   450  	idxBuf := make([]byte, enc.EstimatedDataEncodedSize())
   451  	enc.WriteIndices(idxBuf)
   452  
   453  	out := make([]parquet.ByteArray, nvalues)
   454  
   455  	b.ResetTimer()
   456  
   457  	for i := 0; i < b.N; i++ {
   458  		dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.Plain, col, memory.DefaultAllocator)
   459  		dec.SetData(nunique, dictBuf)
   460  		dictDec := encoding.NewDictDecoder(parquet.Types.ByteArray, col, memory.DefaultAllocator).(*encoding.DictByteArrayDecoder)
   461  		dictDec.SetDict(dec)
   462  		dictDec.SetData(nvalues, idxBuf)
   463  
   464  		dictDec.Decode(out)
   465  	}
   466  }