github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/encoding_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding_test
    18  
    19  import (
    20  	"fmt"
    21  	"reflect"
    22  	"testing"
    23  	"unsafe"
    24  
    25  	"github.com/apache/arrow/go/v7/arrow"
    26  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    27  	"github.com/apache/arrow/go/v7/arrow/memory"
    28  	"github.com/apache/arrow/go/v7/parquet"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
    30  	"github.com/apache/arrow/go/v7/parquet/internal/testutils"
    31  	"github.com/apache/arrow/go/v7/parquet/schema"
    32  	"github.com/stretchr/testify/assert"
    33  	"github.com/stretchr/testify/suite"
    34  )
    35  
    36  type nodeFactory func(string, parquet.Repetition, int32) *schema.PrimitiveNode
    37  
    38  func createNodeFactory(t reflect.Type) nodeFactory {
    39  	switch t {
    40  	case reflect.TypeOf(true):
    41  		return schema.NewBooleanNode
    42  	case reflect.TypeOf(int32(0)):
    43  		return schema.NewInt32Node
    44  	case reflect.TypeOf(int64(0)):
    45  		return schema.NewInt64Node
    46  	case reflect.TypeOf(parquet.Int96{}):
    47  		return schema.NewInt96Node
    48  	case reflect.TypeOf(float32(0)):
    49  		return schema.NewFloat32Node
    50  	case reflect.TypeOf(float64(0)):
    51  		return schema.NewFloat64Node
    52  	case reflect.TypeOf(parquet.ByteArray{}):
    53  		return schema.NewByteArrayNode
    54  	case reflect.TypeOf(parquet.FixedLenByteArray{}):
    55  		return func(name string, rep parquet.Repetition, field int32) *schema.PrimitiveNode {
    56  			return schema.NewFixedLenByteArrayNode(name, rep, 12, field)
    57  		}
    58  	}
    59  	return nil
    60  }
    61  
    62  func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, heap *memory.Buffer) (interface{}, interface{}) {
    63  	switch t {
    64  	case reflect.TypeOf(true):
    65  		draws := *(*[]bool)(unsafe.Pointer(&drawbuf))
    66  		decode := *(*[]bool)(unsafe.Pointer(&decodebuf))
    67  		testutils.InitValues(draws[:nvals], heap)
    68  
    69  		for j := 1; j < repeats; j++ {
    70  			for k := 0; k < nvals; k++ {
    71  				draws[nvals*j+k] = draws[k]
    72  			}
    73  		}
    74  
    75  		return draws[:nvals*repeats], decode[:nvals*repeats]
    76  	case reflect.TypeOf(int32(0)):
    77  		draws := arrow.Int32Traits.CastFromBytes(drawbuf)
    78  		decode := arrow.Int32Traits.CastFromBytes(decodebuf)
    79  		testutils.InitValues(draws[:nvals], heap)
    80  
    81  		for j := 1; j < repeats; j++ {
    82  			for k := 0; k < nvals; k++ {
    83  				draws[nvals*j+k] = draws[k]
    84  			}
    85  		}
    86  
    87  		return draws[:nvals*repeats], decode[:nvals*repeats]
    88  	case reflect.TypeOf(int64(0)):
    89  		draws := arrow.Int64Traits.CastFromBytes(drawbuf)
    90  		decode := arrow.Int64Traits.CastFromBytes(decodebuf)
    91  		testutils.InitValues(draws[:nvals], heap)
    92  
    93  		for j := 1; j < repeats; j++ {
    94  			for k := 0; k < nvals; k++ {
    95  				draws[nvals*j+k] = draws[k]
    96  			}
    97  		}
    98  
    99  		return draws[:nvals*repeats], decode[:nvals*repeats]
   100  	case reflect.TypeOf(parquet.Int96{}):
   101  		draws := parquet.Int96Traits.CastFromBytes(drawbuf)
   102  		decode := parquet.Int96Traits.CastFromBytes(decodebuf)
   103  		testutils.InitValues(draws[:nvals], heap)
   104  
   105  		for j := 1; j < repeats; j++ {
   106  			for k := 0; k < nvals; k++ {
   107  				draws[nvals*j+k] = draws[k]
   108  			}
   109  		}
   110  
   111  		return draws[:nvals*repeats], decode[:nvals*repeats]
   112  	case reflect.TypeOf(float32(0)):
   113  		draws := arrow.Float32Traits.CastFromBytes(drawbuf)
   114  		decode := arrow.Float32Traits.CastFromBytes(decodebuf)
   115  		testutils.InitValues(draws[:nvals], heap)
   116  
   117  		for j := 1; j < repeats; j++ {
   118  			for k := 0; k < nvals; k++ {
   119  				draws[nvals*j+k] = draws[k]
   120  			}
   121  		}
   122  
   123  		return draws[:nvals*repeats], decode[:nvals*repeats]
   124  	case reflect.TypeOf(float64(0)):
   125  		draws := arrow.Float64Traits.CastFromBytes(drawbuf)
   126  		decode := arrow.Float64Traits.CastFromBytes(decodebuf)
   127  		testutils.InitValues(draws[:nvals], heap)
   128  
   129  		for j := 1; j < repeats; j++ {
   130  			for k := 0; k < nvals; k++ {
   131  				draws[nvals*j+k] = draws[k]
   132  			}
   133  		}
   134  
   135  		return draws[:nvals*repeats], decode[:nvals*repeats]
   136  	case reflect.TypeOf(parquet.ByteArray{}):
   137  		draws := make([]parquet.ByteArray, nvals*repeats)
   138  		decode := make([]parquet.ByteArray, nvals*repeats)
   139  		testutils.InitValues(draws[:nvals], heap)
   140  
   141  		for j := 1; j < repeats; j++ {
   142  			for k := 0; k < nvals; k++ {
   143  				draws[nvals*j+k] = draws[k]
   144  			}
   145  		}
   146  
   147  		return draws[:nvals*repeats], decode[:nvals*repeats]
   148  	case reflect.TypeOf(parquet.FixedLenByteArray{}):
   149  		draws := make([]parquet.FixedLenByteArray, nvals*repeats)
   150  		decode := make([]parquet.FixedLenByteArray, nvals*repeats)
   151  		testutils.InitValues(draws[:nvals], heap)
   152  
   153  		for j := 1; j < repeats; j++ {
   154  			for k := 0; k < nvals; k++ {
   155  				draws[nvals*j+k] = draws[k]
   156  			}
   157  		}
   158  
   159  		return draws[:nvals*repeats], decode[:nvals*repeats]
   160  	}
   161  	return nil, nil
   162  }
   163  
   164  func encode(enc encoding.TypedEncoder, vals interface{}) {
   165  	switch v := vals.(type) {
   166  	case []bool:
   167  		enc.(encoding.BooleanEncoder).Put(v)
   168  	case []int32:
   169  		enc.(encoding.Int32Encoder).Put(v)
   170  	case []int64:
   171  		enc.(encoding.Int64Encoder).Put(v)
   172  	case []parquet.Int96:
   173  		enc.(encoding.Int96Encoder).Put(v)
   174  	case []float32:
   175  		enc.(encoding.Float32Encoder).Put(v)
   176  	case []float64:
   177  		enc.(encoding.Float64Encoder).Put(v)
   178  	case []parquet.ByteArray:
   179  		enc.(encoding.ByteArrayEncoder).Put(v)
   180  	case []parquet.FixedLenByteArray:
   181  		enc.(encoding.FixedLenByteArrayEncoder).Put(v)
   182  	}
   183  }
   184  
   185  func encodeSpaced(enc encoding.TypedEncoder, vals interface{}, validBits []byte, validBitsOffset int64) {
   186  	switch v := vals.(type) {
   187  	case []bool:
   188  		enc.(encoding.BooleanEncoder).PutSpaced(v, validBits, validBitsOffset)
   189  	case []int32:
   190  		enc.(encoding.Int32Encoder).PutSpaced(v, validBits, validBitsOffset)
   191  	case []int64:
   192  		enc.(encoding.Int64Encoder).PutSpaced(v, validBits, validBitsOffset)
   193  	case []parquet.Int96:
   194  		enc.(encoding.Int96Encoder).PutSpaced(v, validBits, validBitsOffset)
   195  	case []float32:
   196  		enc.(encoding.Float32Encoder).PutSpaced(v, validBits, validBitsOffset)
   197  	case []float64:
   198  		enc.(encoding.Float64Encoder).PutSpaced(v, validBits, validBitsOffset)
   199  	case []parquet.ByteArray:
   200  		enc.(encoding.ByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset)
   201  	case []parquet.FixedLenByteArray:
   202  		enc.(encoding.FixedLenByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset)
   203  	}
   204  }
   205  
   206  func decode(dec encoding.TypedDecoder, out interface{}) (int, error) {
   207  	switch v := out.(type) {
   208  	case []bool:
   209  		return dec.(encoding.BooleanDecoder).Decode(v)
   210  	case []int32:
   211  		return dec.(encoding.Int32Decoder).Decode(v)
   212  	case []int64:
   213  		return dec.(encoding.Int64Decoder).Decode(v)
   214  	case []parquet.Int96:
   215  		return dec.(encoding.Int96Decoder).Decode(v)
   216  	case []float32:
   217  		return dec.(encoding.Float32Decoder).Decode(v)
   218  	case []float64:
   219  		return dec.(encoding.Float64Decoder).Decode(v)
   220  	case []parquet.ByteArray:
   221  		return dec.(encoding.ByteArrayDecoder).Decode(v)
   222  	case []parquet.FixedLenByteArray:
   223  		return dec.(encoding.FixedLenByteArrayDecoder).Decode(v)
   224  	}
   225  	return 0, nil
   226  }
   227  
   228  func decodeSpaced(dec encoding.TypedDecoder, out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   229  	switch v := out.(type) {
   230  	case []bool:
   231  		return dec.(encoding.BooleanDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   232  	case []int32:
   233  		return dec.(encoding.Int32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   234  	case []int64:
   235  		return dec.(encoding.Int64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   236  	case []parquet.Int96:
   237  		return dec.(encoding.Int96Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   238  	case []float32:
   239  		return dec.(encoding.Float32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   240  	case []float64:
   241  		return dec.(encoding.Float64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   242  	case []parquet.ByteArray:
   243  		return dec.(encoding.ByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   244  	case []parquet.FixedLenByteArray:
   245  		return dec.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   246  	}
   247  	return 0, nil
   248  }
   249  
   250  type BaseEncodingTestSuite struct {
   251  	suite.Suite
   252  
   253  	descr   *schema.Column
   254  	typeLen int
   255  	mem     memory.Allocator
   256  	typ     reflect.Type
   257  
   258  	nvalues     int
   259  	heap        *memory.Buffer
   260  	inputBytes  *memory.Buffer
   261  	outputBytes *memory.Buffer
   262  	nodeFactory nodeFactory
   263  
   264  	draws     interface{}
   265  	decodeBuf interface{}
   266  }
   267  
   268  func (b *BaseEncodingTestSuite) SetupSuite() {
   269  	b.mem = memory.DefaultAllocator
   270  	b.inputBytes = memory.NewResizableBuffer(b.mem)
   271  	b.outputBytes = memory.NewResizableBuffer(b.mem)
   272  	b.heap = memory.NewResizableBuffer(b.mem)
   273  	b.nodeFactory = createNodeFactory(b.typ)
   274  }
   275  
   276  func (b *BaseEncodingTestSuite) TearDownSuite() {
   277  	b.inputBytes.Release()
   278  	b.outputBytes.Release()
   279  	b.heap.Release()
   280  }
   281  
   282  func (b *BaseEncodingTestSuite) SetupTest() {
   283  	b.descr = schema.NewColumn(b.nodeFactory("name", parquet.Repetitions.Optional, -1), 0, 0)
   284  	b.typeLen = int(b.descr.TypeLength())
   285  }
   286  
   287  func (b *BaseEncodingTestSuite) initData(nvalues, repeats int) {
   288  	b.nvalues = nvalues * repeats
   289  	b.inputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size()))
   290  	b.outputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size()))
   291  	memory.Set(b.inputBytes.Buf(), 0)
   292  	memory.Set(b.outputBytes.Buf(), 0)
   293  
   294  	b.draws, b.decodeBuf = initdata(b.typ, b.inputBytes.Buf(), b.outputBytes.Buf(), nvalues, repeats, b.heap)
   295  }
   296  
   297  func (b *BaseEncodingTestSuite) encodeTestData(e parquet.Encoding) (encoding.Buffer, error) {
   298  	enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator)
   299  	b.Equal(e, enc.Encoding())
   300  	b.Equal(b.descr.PhysicalType(), enc.Type())
   301  	encode(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface())
   302  	return enc.FlushValues()
   303  }
   304  
   305  func (b *BaseEncodingTestSuite) decodeTestData(e parquet.Encoding, buf []byte) {
   306  	dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem)
   307  	b.Equal(e, dec.Encoding())
   308  	b.Equal(b.descr.PhysicalType(), dec.Type())
   309  
   310  	dec.SetData(b.nvalues, buf)
   311  	decoded, _ := decode(dec, b.decodeBuf)
   312  	b.Equal(b.nvalues, decoded)
   313  	b.Equal(reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), reflect.ValueOf(b.decodeBuf).Slice(0, b.nvalues).Interface())
   314  }
   315  
   316  func (b *BaseEncodingTestSuite) encodeTestDataSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (encoding.Buffer, error) {
   317  	enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator)
   318  	encodeSpaced(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), validBits, validBitsOffset)
   319  	return enc.FlushValues()
   320  }
   321  
   322  func (b *BaseEncodingTestSuite) decodeTestDataSpaced(e parquet.Encoding, nullCount int, buf []byte, validBits []byte, validBitsOffset int64) {
   323  	dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem)
   324  	dec.SetData(b.nvalues-nullCount, buf)
   325  	decoded, _ := decodeSpaced(dec, b.decodeBuf, nullCount, validBits, validBitsOffset)
   326  	b.Equal(b.nvalues, decoded)
   327  
   328  	drawval := reflect.ValueOf(b.draws)
   329  	decodeval := reflect.ValueOf(b.decodeBuf)
   330  	for j := 0; j < b.nvalues; j++ {
   331  		if bitutil.BitIsSet(validBits, int(validBitsOffset)+j) {
   332  			b.Equal(drawval.Index(j).Interface(), decodeval.Index(j).Interface())
   333  		}
   334  	}
   335  }
   336  
   337  func (b *BaseEncodingTestSuite) checkRoundTrip(e parquet.Encoding) {
   338  	buf, _ := b.encodeTestData(e)
   339  	defer buf.Release()
   340  	b.decodeTestData(e, buf.Bytes())
   341  }
   342  
   343  func (b *BaseEncodingTestSuite) checkRoundTripSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) {
   344  	buf, _ := b.encodeTestDataSpaced(e, validBits, validBitsOffset)
   345  	defer buf.Release()
   346  
   347  	nullCount := 0
   348  	for i := 0; i < b.nvalues; i++ {
   349  		if bitutil.BitIsNotSet(validBits, int(validBitsOffset)+i) {
   350  			nullCount++
   351  		}
   352  	}
   353  	b.decodeTestDataSpaced(e, nullCount, buf.Bytes(), validBits, validBitsOffset)
   354  }
   355  
   356  func (b *BaseEncodingTestSuite) TestBasicRoundTrip() {
   357  	b.initData(10000, 1)
   358  	b.checkRoundTrip(parquet.Encodings.Plain)
   359  }
   360  
   361  func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() {
   362  	b.initData(10000, 1)
   363  
   364  	switch b.typ {
   365  	case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)):
   366  		b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked)
   367  	default:
   368  		b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) })
   369  	}
   370  }
   371  
   372  func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() {
   373  	b.initData(10000, 1)
   374  
   375  	switch b.typ {
   376  	case reflect.TypeOf(parquet.ByteArray{}):
   377  		b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray)
   378  	default:
   379  		b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) })
   380  	}
   381  }
   382  
   383  func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() {
   384  	b.initData(10000, 1)
   385  
   386  	switch b.typ {
   387  	case reflect.TypeOf(parquet.ByteArray{}):
   388  		b.checkRoundTrip(parquet.Encodings.DeltaByteArray)
   389  	default:
   390  		b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) })
   391  	}
   392  }
   393  
   394  func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() {
   395  	exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) {
   396  		b.Run(fmt.Sprintf("%d vals %d repeats %d offset %0.3f null", vals, repeats, validBitsOffset, 1-nullProb), func() {
   397  			b.initData(vals, repeats)
   398  
   399  			size := int64(b.nvalues) + validBitsOffset
   400  			r := testutils.NewRandomArrayGenerator(1923)
   401  			arr := r.Uint8(size, 0, 100, 1-nullProb)
   402  			validBits := arr.NullBitmapBytes()
   403  			if validBits != nil {
   404  				b.checkRoundTripSpaced(parquet.Encodings.Plain, validBits, validBitsOffset)
   405  				switch b.typ {
   406  				case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)):
   407  					b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset)
   408  				case reflect.TypeOf(parquet.ByteArray{}):
   409  					b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset)
   410  					b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset)
   411  				}
   412  			}
   413  		})
   414  	}
   415  
   416  	const (
   417  		avx512Size    = 64
   418  		simdSize      = avx512Size
   419  		multiSimdSize = simdSize * 33
   420  	)
   421  
   422  	for _, nullProb := range []float64{0.001, 0.1, 0.5, 0.9, 0.999} {
   423  		// Test with both size and offset up to 3 simd block
   424  		for i := 1; i < simdSize*3; i++ {
   425  			exec(i, 1, 0, nullProb)
   426  			exec(i, 1, int64(i+1), nullProb)
   427  		}
   428  		// large block and offset
   429  		exec(multiSimdSize, 1, 0, nullProb)
   430  		exec(multiSimdSize+33, 1, 0, nullProb)
   431  		exec(multiSimdSize, 1, 33, nullProb)
   432  		exec(multiSimdSize+33, 1, 33, nullProb)
   433  	}
   434  }
   435  
   436  func TestEncoding(t *testing.T) {
   437  	tests := []struct {
   438  		name string
   439  		typ  reflect.Type
   440  	}{
   441  		{"Bool", reflect.TypeOf(true)},
   442  		{"Int32", reflect.TypeOf(int32(0))},
   443  		{"Int64", reflect.TypeOf(int64(0))},
   444  		{"Float32", reflect.TypeOf(float32(0))},
   445  		{"Float64", reflect.TypeOf(float64(0))},
   446  		{"Int96", reflect.TypeOf(parquet.Int96{})},
   447  		{"ByteArray", reflect.TypeOf(parquet.ByteArray{})},
   448  		{"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})},
   449  	}
   450  
   451  	for _, tt := range tests {
   452  		t.Run(tt.name, func(t *testing.T) {
   453  			suite.Run(t, &BaseEncodingTestSuite{typ: tt.typ})
   454  		})
   455  	}
   456  }
   457  
   458  type DictionaryEncodingTestSuite struct {
   459  	BaseEncodingTestSuite
   460  }
   461  
   462  func (d *DictionaryEncodingTestSuite) encodeTestDataDict(e parquet.Encoding) (dictBuffer, indices encoding.Buffer, numEntries int) {
   463  	enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder)
   464  
   465  	d.Equal(parquet.Encodings.PlainDict, enc.Encoding())
   466  	d.Equal(d.descr.PhysicalType(), enc.Type())
   467  	encode(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface())
   468  	dictBuffer = memory.NewResizableBuffer(d.mem)
   469  	dictBuffer.Resize(enc.DictEncodedSize())
   470  	enc.WriteDict(dictBuffer.Bytes())
   471  	indices, _ = enc.FlushValues()
   472  	numEntries = enc.NumEntries()
   473  	return
   474  }
   475  
   476  func (d *DictionaryEncodingTestSuite) encodeTestDataDictSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (dictBuffer, indices encoding.Buffer, numEntries int) {
   477  	enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder)
   478  	d.Equal(d.descr.PhysicalType(), enc.Type())
   479  
   480  	encodeSpaced(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), validBits, validBitsOffset)
   481  	dictBuffer = memory.NewResizableBuffer(d.mem)
   482  	dictBuffer.Resize(enc.DictEncodedSize())
   483  	enc.WriteDict(dictBuffer.Bytes())
   484  	indices, _ = enc.FlushValues()
   485  	numEntries = enc.NumEntries()
   486  	return
   487  }
   488  
   489  func (d *DictionaryEncodingTestSuite) checkRoundTrip() {
   490  	dictBuffer, indices, numEntries := d.encodeTestDataDict(parquet.Encodings.Plain)
   491  	defer dictBuffer.Release()
   492  	defer indices.Release()
   493  	validBits := make([]byte, int(bitutil.BytesForBits(int64(d.nvalues)))+1)
   494  	memory.Set(validBits, 255)
   495  
   496  	spacedBuffer, indicesSpaced, _ := d.encodeTestDataDictSpaced(parquet.Encodings.Plain, validBits, 0)
   497  	defer spacedBuffer.Release()
   498  	defer indicesSpaced.Release()
   499  	d.Equal(indices.Bytes(), indicesSpaced.Bytes())
   500  
   501  	dictDecoder := encoding.NewDecoder(testutils.TypeToParquetType(d.typ), parquet.Encodings.Plain, d.descr, d.mem)
   502  	d.Equal(d.descr.PhysicalType(), dictDecoder.Type())
   503  	dictDecoder.SetData(numEntries, dictBuffer.Bytes())
   504  	decoder := encoding.NewDictDecoder(testutils.TypeToParquetType(d.typ), d.descr, d.mem)
   505  	decoder.SetDict(dictDecoder)
   506  	decoder.SetData(d.nvalues, indices.Bytes())
   507  
   508  	decoded, _ := decode(decoder, d.decodeBuf)
   509  	d.Equal(d.nvalues, decoded)
   510  	d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface())
   511  
   512  	decoder.SetData(d.nvalues, indices.Bytes())
   513  	decoded, _ = decodeSpaced(decoder, d.decodeBuf, 0, validBits, 0)
   514  	d.Equal(d.nvalues, decoded)
   515  	d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface())
   516  }
   517  
   518  func (d *DictionaryEncodingTestSuite) TestBasicRoundTrip() {
   519  	d.initData(2500, 2)
   520  	d.checkRoundTrip()
   521  }
   522  
   523  func TestDictEncoding(t *testing.T) {
   524  	tests := []struct {
   525  		name string
   526  		typ  reflect.Type
   527  	}{
   528  		{"Int32", reflect.TypeOf(int32(0))},
   529  		{"Int64", reflect.TypeOf(int64(0))},
   530  		{"Float32", reflect.TypeOf(float32(0))},
   531  		{"Float64", reflect.TypeOf(float64(0))},
   532  		{"ByteArray", reflect.TypeOf(parquet.ByteArray{})},
   533  		{"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})},
   534  	}
   535  
   536  	for _, tt := range tests {
   537  		t.Run(tt.name, func(t *testing.T) {
   538  			suite.Run(t, &DictionaryEncodingTestSuite{BaseEncodingTestSuite{typ: tt.typ}})
   539  		})
   540  	}
   541  }
   542  
   543  func TestWriteDeltaBitPackedInt32(t *testing.T) {
   544  	column := schema.NewColumn(schema.NewInt32Node("int32", parquet.Repetitions.Required, -1), 0, 0)
   545  
   546  	tests := []struct {
   547  		name     string
   548  		toencode []int32
   549  		expected []byte
   550  	}{
   551  		{"simple 12345", []int32{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}},
   552  		{"odd vals", []int32{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}},
   553  	}
   554  
   555  	for _, tt := range tests {
   556  		t.Run(tt.name, func(t *testing.T) {
   557  			enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   558  
   559  			enc.(encoding.Int32Encoder).Put(tt.toencode)
   560  			buf, _ := enc.FlushValues()
   561  			defer buf.Release()
   562  
   563  			assert.Equal(t, tt.expected, buf.Bytes())
   564  
   565  			dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   566  
   567  			dec.(encoding.Int32Decoder).SetData(len(tt.toencode), tt.expected)
   568  			out := make([]int32, len(tt.toencode))
   569  			dec.(encoding.Int32Decoder).Decode(out)
   570  			assert.Equal(t, tt.toencode, out)
   571  		})
   572  	}
   573  
   574  	t.Run("test progressive decoding", func(t *testing.T) {
   575  		values := make([]int32, 1000)
   576  		testutils.FillRandomInt32(0, values)
   577  
   578  		enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   579  		enc.(encoding.Int32Encoder).Put(values)
   580  		buf, _ := enc.FlushValues()
   581  		defer buf.Release()
   582  
   583  		dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   584  		dec.(encoding.Int32Decoder).SetData(len(values), buf.Bytes())
   585  
   586  		valueBuf := make([]int32, 100)
   587  		for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) {
   588  			dec.(encoding.Int32Decoder).Decode(valueBuf)
   589  			assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j)
   590  		}
   591  	})
   592  }
   593  
   594  func TestWriteDeltaBitPackedInt64(t *testing.T) {
   595  	column := schema.NewColumn(schema.NewInt64Node("int64", parquet.Repetitions.Required, -1), 0, 0)
   596  
   597  	tests := []struct {
   598  		name     string
   599  		toencode []int64
   600  		expected []byte
   601  	}{
   602  		{"simple 12345", []int64{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}},
   603  		{"odd vals", []int64{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}},
   604  	}
   605  
   606  	for _, tt := range tests {
   607  		t.Run(tt.name, func(t *testing.T) {
   608  			enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   609  
   610  			enc.(encoding.Int64Encoder).Put(tt.toencode)
   611  			buf, _ := enc.FlushValues()
   612  			defer buf.Release()
   613  
   614  			assert.Equal(t, tt.expected, buf.Bytes())
   615  
   616  			dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   617  
   618  			dec.(encoding.Int64Decoder).SetData(len(tt.toencode), tt.expected)
   619  			out := make([]int64, len(tt.toencode))
   620  			dec.(encoding.Int64Decoder).Decode(out)
   621  			assert.Equal(t, tt.toencode, out)
   622  		})
   623  	}
   624  
   625  	t.Run("test progressive decoding", func(t *testing.T) {
   626  		values := make([]int64, 1000)
   627  		testutils.FillRandomInt64(0, values)
   628  
   629  		enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   630  		enc.(encoding.Int64Encoder).Put(values)
   631  		buf, _ := enc.FlushValues()
   632  		defer buf.Release()
   633  
   634  		dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   635  		dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes())
   636  
   637  		valueBuf := make([]int64, 100)
   638  		for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) {
   639  			decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf)
   640  			assert.Equal(t, len(valueBuf), decoded)
   641  			assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j)
   642  		}
   643  	})
   644  }
   645  
   646  func TestDeltaLengthByteArrayEncoding(t *testing.T) {
   647  	column := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
   648  
   649  	test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")}
   650  	expected := []byte{128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70}
   651  
   652  	enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, false, column, memory.DefaultAllocator)
   653  	enc.(encoding.ByteArrayEncoder).Put(test)
   654  	buf, _ := enc.FlushValues()
   655  	defer buf.Release()
   656  
   657  	assert.Equal(t, expected, buf.Bytes())
   658  
   659  	dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, column, nil)
   660  	dec.SetData(len(test), expected)
   661  	out := make([]parquet.ByteArray, len(test))
   662  	decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out)
   663  	assert.Equal(t, len(test), decoded)
   664  	assert.Equal(t, test, out)
   665  }
   666  
   667  func TestDeltaByteArrayEncoding(t *testing.T) {
   668  	test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")}
   669  	expected := []byte{128, 1, 4, 4, 0, 0, 0, 0, 0, 0, 128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70}
   670  
   671  	enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, false, nil, nil)
   672  	enc.(encoding.ByteArrayEncoder).Put(test)
   673  	buf, _ := enc.FlushValues()
   674  	defer buf.Release()
   675  
   676  	assert.Equal(t, expected, buf.Bytes())
   677  
   678  	dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, nil, nil)
   679  	dec.SetData(len(test), expected)
   680  	out := make([]parquet.ByteArray, len(test))
   681  	decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out)
   682  	assert.Equal(t, len(test), decoded)
   683  	assert.Equal(t, test, out)
   684  }