github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/encoding_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding_test
    18  
    19  import (
    20  	"bufio"
    21  	"fmt"
    22  	"os"
    23  	"path"
    24  	"reflect"
    25  	"strconv"
    26  	"testing"
    27  	"unsafe"
    28  
    29  	"github.com/apache/arrow/go/v14/arrow"
    30  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    31  	"github.com/apache/arrow/go/v14/arrow/memory"
    32  	"github.com/apache/arrow/go/v14/parquet"
    33  	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
    34  	"github.com/apache/arrow/go/v14/parquet/internal/testutils"
    35  	"github.com/apache/arrow/go/v14/parquet/schema"
    36  	"github.com/stretchr/testify/assert"
    37  	"github.com/stretchr/testify/require"
    38  	"github.com/stretchr/testify/suite"
    39  )
    40  
// nodeFactory is the common signature of the constructors returned by
// createNodeFactory: it takes a string, a repetition and an int32 (named
// name, rep and field in the FixedLenByteArray wrapper) and returns a
// primitive schema node.
type nodeFactory func(string, parquet.Repetition, int32) *schema.PrimitiveNode
    42  
    43  func createNodeFactory(t reflect.Type) nodeFactory {
    44  	switch t {
    45  	case reflect.TypeOf(true):
    46  		return schema.NewBooleanNode
    47  	case reflect.TypeOf(int32(0)):
    48  		return schema.NewInt32Node
    49  	case reflect.TypeOf(int64(0)):
    50  		return schema.NewInt64Node
    51  	case reflect.TypeOf(parquet.Int96{}):
    52  		return schema.NewInt96Node
    53  	case reflect.TypeOf(float32(0)):
    54  		return schema.NewFloat32Node
    55  	case reflect.TypeOf(float64(0)):
    56  		return schema.NewFloat64Node
    57  	case reflect.TypeOf(parquet.ByteArray{}):
    58  		return schema.NewByteArrayNode
    59  	case reflect.TypeOf(parquet.FixedLenByteArray{}):
    60  		return func(name string, rep parquet.Repetition, field int32) *schema.PrimitiveNode {
    61  			return schema.NewFixedLenByteArrayNode(name, rep, 12, field)
    62  		}
    63  	}
    64  	return nil
    65  }
    66  
    67  func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, heap *memory.Buffer) (interface{}, interface{}) {
    68  	switch t {
    69  	case reflect.TypeOf(true):
    70  		draws := *(*[]bool)(unsafe.Pointer(&drawbuf))
    71  		decode := *(*[]bool)(unsafe.Pointer(&decodebuf))
    72  		testutils.InitValues(draws[:nvals], heap)
    73  
    74  		for j := 1; j < repeats; j++ {
    75  			for k := 0; k < nvals; k++ {
    76  				draws[nvals*j+k] = draws[k]
    77  			}
    78  		}
    79  
    80  		return draws[:nvals*repeats], decode[:nvals*repeats]
    81  	case reflect.TypeOf(int32(0)):
    82  		draws := arrow.Int32Traits.CastFromBytes(drawbuf)
    83  		decode := arrow.Int32Traits.CastFromBytes(decodebuf)
    84  		testutils.InitValues(draws[:nvals], heap)
    85  
    86  		for j := 1; j < repeats; j++ {
    87  			for k := 0; k < nvals; k++ {
    88  				draws[nvals*j+k] = draws[k]
    89  			}
    90  		}
    91  
    92  		return draws[:nvals*repeats], decode[:nvals*repeats]
    93  	case reflect.TypeOf(int64(0)):
    94  		draws := arrow.Int64Traits.CastFromBytes(drawbuf)
    95  		decode := arrow.Int64Traits.CastFromBytes(decodebuf)
    96  		testutils.InitValues(draws[:nvals], heap)
    97  
    98  		for j := 1; j < repeats; j++ {
    99  			for k := 0; k < nvals; k++ {
   100  				draws[nvals*j+k] = draws[k]
   101  			}
   102  		}
   103  
   104  		return draws[:nvals*repeats], decode[:nvals*repeats]
   105  	case reflect.TypeOf(parquet.Int96{}):
   106  		draws := parquet.Int96Traits.CastFromBytes(drawbuf)
   107  		decode := parquet.Int96Traits.CastFromBytes(decodebuf)
   108  		testutils.InitValues(draws[:nvals], heap)
   109  
   110  		for j := 1; j < repeats; j++ {
   111  			for k := 0; k < nvals; k++ {
   112  				draws[nvals*j+k] = draws[k]
   113  			}
   114  		}
   115  
   116  		return draws[:nvals*repeats], decode[:nvals*repeats]
   117  	case reflect.TypeOf(float32(0)):
   118  		draws := arrow.Float32Traits.CastFromBytes(drawbuf)
   119  		decode := arrow.Float32Traits.CastFromBytes(decodebuf)
   120  		testutils.InitValues(draws[:nvals], heap)
   121  
   122  		for j := 1; j < repeats; j++ {
   123  			for k := 0; k < nvals; k++ {
   124  				draws[nvals*j+k] = draws[k]
   125  			}
   126  		}
   127  
   128  		return draws[:nvals*repeats], decode[:nvals*repeats]
   129  	case reflect.TypeOf(float64(0)):
   130  		draws := arrow.Float64Traits.CastFromBytes(drawbuf)
   131  		decode := arrow.Float64Traits.CastFromBytes(decodebuf)
   132  		testutils.InitValues(draws[:nvals], heap)
   133  
   134  		for j := 1; j < repeats; j++ {
   135  			for k := 0; k < nvals; k++ {
   136  				draws[nvals*j+k] = draws[k]
   137  			}
   138  		}
   139  
   140  		return draws[:nvals*repeats], decode[:nvals*repeats]
   141  	case reflect.TypeOf(parquet.ByteArray{}):
   142  		draws := make([]parquet.ByteArray, nvals*repeats)
   143  		decode := make([]parquet.ByteArray, nvals*repeats)
   144  		testutils.InitValues(draws[:nvals], heap)
   145  
   146  		for j := 1; j < repeats; j++ {
   147  			for k := 0; k < nvals; k++ {
   148  				draws[nvals*j+k] = draws[k]
   149  			}
   150  		}
   151  
   152  		return draws[:nvals*repeats], decode[:nvals*repeats]
   153  	case reflect.TypeOf(parquet.FixedLenByteArray{}):
   154  		draws := make([]parquet.FixedLenByteArray, nvals*repeats)
   155  		decode := make([]parquet.FixedLenByteArray, nvals*repeats)
   156  		testutils.InitValues(draws[:nvals], heap)
   157  
   158  		for j := 1; j < repeats; j++ {
   159  			for k := 0; k < nvals; k++ {
   160  				draws[nvals*j+k] = draws[k]
   161  			}
   162  		}
   163  
   164  		return draws[:nvals*repeats], decode[:nvals*repeats]
   165  	}
   166  	return nil, nil
   167  }
   168  
   169  func encode(enc encoding.TypedEncoder, vals interface{}) {
   170  	switch v := vals.(type) {
   171  	case []bool:
   172  		enc.(encoding.BooleanEncoder).Put(v)
   173  	case []int32:
   174  		enc.(encoding.Int32Encoder).Put(v)
   175  	case []int64:
   176  		enc.(encoding.Int64Encoder).Put(v)
   177  	case []parquet.Int96:
   178  		enc.(encoding.Int96Encoder).Put(v)
   179  	case []float32:
   180  		enc.(encoding.Float32Encoder).Put(v)
   181  	case []float64:
   182  		enc.(encoding.Float64Encoder).Put(v)
   183  	case []parquet.ByteArray:
   184  		enc.(encoding.ByteArrayEncoder).Put(v)
   185  	case []parquet.FixedLenByteArray:
   186  		enc.(encoding.FixedLenByteArrayEncoder).Put(v)
   187  	}
   188  }
   189  
   190  func encodeSpaced(enc encoding.TypedEncoder, vals interface{}, validBits []byte, validBitsOffset int64) {
   191  	switch v := vals.(type) {
   192  	case []bool:
   193  		enc.(encoding.BooleanEncoder).PutSpaced(v, validBits, validBitsOffset)
   194  	case []int32:
   195  		enc.(encoding.Int32Encoder).PutSpaced(v, validBits, validBitsOffset)
   196  	case []int64:
   197  		enc.(encoding.Int64Encoder).PutSpaced(v, validBits, validBitsOffset)
   198  	case []parquet.Int96:
   199  		enc.(encoding.Int96Encoder).PutSpaced(v, validBits, validBitsOffset)
   200  	case []float32:
   201  		enc.(encoding.Float32Encoder).PutSpaced(v, validBits, validBitsOffset)
   202  	case []float64:
   203  		enc.(encoding.Float64Encoder).PutSpaced(v, validBits, validBitsOffset)
   204  	case []parquet.ByteArray:
   205  		enc.(encoding.ByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset)
   206  	case []parquet.FixedLenByteArray:
   207  		enc.(encoding.FixedLenByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset)
   208  	}
   209  }
   210  
   211  func decode(dec encoding.TypedDecoder, out interface{}) (int, error) {
   212  	switch v := out.(type) {
   213  	case []bool:
   214  		return dec.(encoding.BooleanDecoder).Decode(v)
   215  	case []int32:
   216  		return dec.(encoding.Int32Decoder).Decode(v)
   217  	case []int64:
   218  		return dec.(encoding.Int64Decoder).Decode(v)
   219  	case []parquet.Int96:
   220  		return dec.(encoding.Int96Decoder).Decode(v)
   221  	case []float32:
   222  		return dec.(encoding.Float32Decoder).Decode(v)
   223  	case []float64:
   224  		return dec.(encoding.Float64Decoder).Decode(v)
   225  	case []parquet.ByteArray:
   226  		return dec.(encoding.ByteArrayDecoder).Decode(v)
   227  	case []parquet.FixedLenByteArray:
   228  		return dec.(encoding.FixedLenByteArrayDecoder).Decode(v)
   229  	}
   230  	return 0, nil
   231  }
   232  
   233  func decodeSpaced(dec encoding.TypedDecoder, out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   234  	switch v := out.(type) {
   235  	case []bool:
   236  		return dec.(encoding.BooleanDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   237  	case []int32:
   238  		return dec.(encoding.Int32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   239  	case []int64:
   240  		return dec.(encoding.Int64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   241  	case []parquet.Int96:
   242  		return dec.(encoding.Int96Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   243  	case []float32:
   244  		return dec.(encoding.Float32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   245  	case []float64:
   246  		return dec.(encoding.Float64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   247  	case []parquet.ByteArray:
   248  		return dec.(encoding.ByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   249  	case []parquet.FixedLenByteArray:
   250  		return dec.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset)
   251  	}
   252  	return 0, nil
   253  }
   254  
// BaseEncodingTestSuite is a testify suite that round-trips values of a
// single Go element type (typ) through the parquet encoders and decoders.
type BaseEncodingTestSuite struct {
	suite.Suite

	// descr is the column descriptor rebuilt by SetupTest from nodeFactory.
	descr *schema.Column
	// typeLen caches descr.TypeLength() as an int.
	typeLen int
	// mem is the allocator assigned in SetupSuite.
	mem memory.Allocator
	// typ is the Go element type under test; it is set by the test driver.
	typ reflect.Type

	// nvalues is the total element count (nvalues*repeats) set by initData.
	nvalues int
	// heap is the scratch buffer handed to initdata.
	heap *memory.Buffer
	// inputBytes and outputBytes are resized by initData and their bytes are
	// passed to initdata as the draw and decode buffers.
	inputBytes  *memory.Buffer
	outputBytes *memory.Buffer
	// nodeFactory is the result of createNodeFactory(typ).
	nodeFactory nodeFactory

	// draws holds the input values and decodeBuf the decode destination,
	// both as typed slices boxed in an interface (see initdata).
	draws     interface{}
	decodeBuf interface{}
}
   272  
   273  func (b *BaseEncodingTestSuite) SetupSuite() {
   274  	b.mem = memory.DefaultAllocator
   275  	b.inputBytes = memory.NewResizableBuffer(b.mem)
   276  	b.outputBytes = memory.NewResizableBuffer(b.mem)
   277  	b.heap = memory.NewResizableBuffer(b.mem)
   278  	b.nodeFactory = createNodeFactory(b.typ)
   279  }
   280  
   281  func (b *BaseEncodingTestSuite) TearDownSuite() {
   282  	b.inputBytes.Release()
   283  	b.outputBytes.Release()
   284  	b.heap.Release()
   285  }
   286  
   287  func (b *BaseEncodingTestSuite) SetupTest() {
   288  	b.descr = schema.NewColumn(b.nodeFactory("name", parquet.Repetitions.Optional, -1), 0, 0)
   289  	b.typeLen = int(b.descr.TypeLength())
   290  }
   291  
   292  func (b *BaseEncodingTestSuite) initData(nvalues, repeats int) {
   293  	b.nvalues = nvalues * repeats
   294  	b.inputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size()))
   295  	b.outputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size()))
   296  	memory.Set(b.inputBytes.Buf(), 0)
   297  	memory.Set(b.outputBytes.Buf(), 0)
   298  
   299  	b.draws, b.decodeBuf = initdata(b.typ, b.inputBytes.Buf(), b.outputBytes.Buf(), nvalues, repeats, b.heap)
   300  }
   301  
   302  func (b *BaseEncodingTestSuite) encodeTestData(e parquet.Encoding) (encoding.Buffer, error) {
   303  	enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator)
   304  	b.Equal(e, enc.Encoding())
   305  	b.Equal(b.descr.PhysicalType(), enc.Type())
   306  	encode(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface())
   307  	return enc.FlushValues()
   308  }
   309  
   310  func (b *BaseEncodingTestSuite) decodeTestData(e parquet.Encoding, buf []byte) {
   311  	dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem)
   312  	b.Equal(e, dec.Encoding())
   313  	b.Equal(b.descr.PhysicalType(), dec.Type())
   314  
   315  	dec.SetData(b.nvalues, buf)
   316  	decoded, _ := decode(dec, b.decodeBuf)
   317  	b.Equal(b.nvalues, decoded)
   318  	b.Equal(reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), reflect.ValueOf(b.decodeBuf).Slice(0, b.nvalues).Interface())
   319  }
   320  
   321  func (b *BaseEncodingTestSuite) encodeTestDataSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (encoding.Buffer, error) {
   322  	enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator)
   323  	encodeSpaced(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), validBits, validBitsOffset)
   324  	return enc.FlushValues()
   325  }
   326  
   327  func (b *BaseEncodingTestSuite) decodeTestDataSpaced(e parquet.Encoding, nullCount int, buf []byte, validBits []byte, validBitsOffset int64) {
   328  	dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem)
   329  	dec.SetData(b.nvalues-nullCount, buf)
   330  	decoded, _ := decodeSpaced(dec, b.decodeBuf, nullCount, validBits, validBitsOffset)
   331  	b.Equal(b.nvalues, decoded)
   332  
   333  	drawval := reflect.ValueOf(b.draws)
   334  	decodeval := reflect.ValueOf(b.decodeBuf)
   335  	for j := 0; j < b.nvalues; j++ {
   336  		if bitutil.BitIsSet(validBits, int(validBitsOffset)+j) {
   337  			b.Equal(drawval.Index(j).Interface(), decodeval.Index(j).Interface())
   338  		}
   339  	}
   340  }
   341  
   342  func (b *BaseEncodingTestSuite) checkRoundTrip(e parquet.Encoding) {
   343  	buf, _ := b.encodeTestData(e)
   344  	defer buf.Release()
   345  	b.decodeTestData(e, buf.Bytes())
   346  }
   347  
   348  func (b *BaseEncodingTestSuite) checkRoundTripSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) {
   349  	buf, _ := b.encodeTestDataSpaced(e, validBits, validBitsOffset)
   350  	defer buf.Release()
   351  
   352  	nullCount := 0
   353  	for i := 0; i < b.nvalues; i++ {
   354  		if bitutil.BitIsNotSet(validBits, int(validBitsOffset)+i) {
   355  			nullCount++
   356  		}
   357  	}
   358  	b.decodeTestDataSpaced(e, nullCount, buf.Bytes(), validBits, validBitsOffset)
   359  }
   360  
   361  func (b *BaseEncodingTestSuite) TestBasicRoundTrip() {
   362  	b.initData(10000, 1)
   363  	b.checkRoundTrip(parquet.Encodings.Plain)
   364  }
   365  
   366  func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() {
   367  	b.initData(10000, 1)
   368  
   369  	switch b.typ {
   370  	case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)):
   371  		b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked)
   372  	default:
   373  		b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) })
   374  	}
   375  }
   376  
   377  func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() {
   378  	b.initData(10000, 1)
   379  
   380  	switch b.typ {
   381  	case reflect.TypeOf(parquet.ByteArray{}):
   382  		b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray)
   383  	default:
   384  		b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) })
   385  	}
   386  }
   387  
   388  func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() {
   389  	b.initData(10000, 1)
   390  
   391  	switch b.typ {
   392  	case reflect.TypeOf(parquet.ByteArray{}):
   393  		b.checkRoundTrip(parquet.Encodings.DeltaByteArray)
   394  	default:
   395  		b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) })
   396  	}
   397  }
   398  
   399  func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() {
   400  	exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) {
   401  		b.Run(fmt.Sprintf("%d vals %d repeats %d offset %0.3f null", vals, repeats, validBitsOffset, 1-nullProb), func() {
   402  			b.initData(vals, repeats)
   403  
   404  			size := int64(b.nvalues) + validBitsOffset
   405  			r := testutils.NewRandomArrayGenerator(1923)
   406  			arr := r.Uint8(size, 0, 100, 1-nullProb)
   407  			validBits := arr.NullBitmapBytes()
   408  			if validBits != nil {
   409  				b.checkRoundTripSpaced(parquet.Encodings.Plain, validBits, validBitsOffset)
   410  				switch b.typ {
   411  				case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)):
   412  					b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset)
   413  				case reflect.TypeOf(parquet.ByteArray{}):
   414  					b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset)
   415  					b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset)
   416  				}
   417  			}
   418  		})
   419  	}
   420  
   421  	const (
   422  		avx512Size    = 64
   423  		simdSize      = avx512Size
   424  		multiSimdSize = simdSize * 33
   425  	)
   426  
   427  	for _, nullProb := range []float64{0.001, 0.1, 0.5, 0.9, 0.999} {
   428  		// Test with both size and offset up to 3 simd block
   429  		for i := 1; i < simdSize*3; i++ {
   430  			exec(i, 1, 0, nullProb)
   431  			exec(i, 1, int64(i+1), nullProb)
   432  		}
   433  		// large block and offset
   434  		exec(multiSimdSize, 1, 0, nullProb)
   435  		exec(multiSimdSize+33, 1, 0, nullProb)
   436  		exec(multiSimdSize, 1, 33, nullProb)
   437  		exec(multiSimdSize+33, 1, 33, nullProb)
   438  	}
   439  }
   440  
   441  func TestEncoding(t *testing.T) {
   442  	tests := []struct {
   443  		name string
   444  		typ  reflect.Type
   445  	}{
   446  		{"Bool", reflect.TypeOf(true)},
   447  		{"Int32", reflect.TypeOf(int32(0))},
   448  		{"Int64", reflect.TypeOf(int64(0))},
   449  		{"Float32", reflect.TypeOf(float32(0))},
   450  		{"Float64", reflect.TypeOf(float64(0))},
   451  		{"Int96", reflect.TypeOf(parquet.Int96{})},
   452  		{"ByteArray", reflect.TypeOf(parquet.ByteArray{})},
   453  		{"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})},
   454  	}
   455  
   456  	for _, tt := range tests {
   457  		t.Run(tt.name, func(t *testing.T) {
   458  			suite.Run(t, &BaseEncodingTestSuite{typ: tt.typ})
   459  		})
   460  	}
   461  }
   462  
// DictionaryEncodingTestSuite embeds BaseEncodingTestSuite, reusing its
// setup and data generation, and adds dictionary-encoding round-trip checks.
type DictionaryEncodingTestSuite struct {
	BaseEncodingTestSuite
}
   466  
   467  func (d *DictionaryEncodingTestSuite) encodeTestDataDict(e parquet.Encoding) (dictBuffer, indices encoding.Buffer, numEntries int) {
   468  	enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder)
   469  
   470  	d.Equal(parquet.Encodings.PlainDict, enc.Encoding())
   471  	d.Equal(d.descr.PhysicalType(), enc.Type())
   472  	encode(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface())
   473  	dictBuffer = memory.NewResizableBuffer(d.mem)
   474  	dictBuffer.Resize(enc.DictEncodedSize())
   475  	enc.WriteDict(dictBuffer.Bytes())
   476  	indices, _ = enc.FlushValues()
   477  	numEntries = enc.NumEntries()
   478  	return
   479  }
   480  
   481  func (d *DictionaryEncodingTestSuite) encodeTestDataDictSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (dictBuffer, indices encoding.Buffer, numEntries int) {
   482  	enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder)
   483  	d.Equal(d.descr.PhysicalType(), enc.Type())
   484  
   485  	encodeSpaced(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), validBits, validBitsOffset)
   486  	dictBuffer = memory.NewResizableBuffer(d.mem)
   487  	dictBuffer.Resize(enc.DictEncodedSize())
   488  	enc.WriteDict(dictBuffer.Bytes())
   489  	indices, _ = enc.FlushValues()
   490  	numEntries = enc.NumEntries()
   491  	return
   492  }
   493  
   494  func (d *DictionaryEncodingTestSuite) checkRoundTrip() {
   495  	dictBuffer, indices, numEntries := d.encodeTestDataDict(parquet.Encodings.Plain)
   496  	defer dictBuffer.Release()
   497  	defer indices.Release()
   498  	validBits := make([]byte, int(bitutil.BytesForBits(int64(d.nvalues)))+1)
   499  	memory.Set(validBits, 255)
   500  
   501  	spacedBuffer, indicesSpaced, _ := d.encodeTestDataDictSpaced(parquet.Encodings.Plain, validBits, 0)
   502  	defer spacedBuffer.Release()
   503  	defer indicesSpaced.Release()
   504  	d.Equal(indices.Bytes(), indicesSpaced.Bytes())
   505  
   506  	dictDecoder := encoding.NewDecoder(testutils.TypeToParquetType(d.typ), parquet.Encodings.Plain, d.descr, d.mem)
   507  	d.Equal(d.descr.PhysicalType(), dictDecoder.Type())
   508  	dictDecoder.SetData(numEntries, dictBuffer.Bytes())
   509  	decoder := encoding.NewDictDecoder(testutils.TypeToParquetType(d.typ), d.descr, d.mem)
   510  	decoder.SetDict(dictDecoder)
   511  	decoder.SetData(d.nvalues, indices.Bytes())
   512  
   513  	decoded, _ := decode(decoder, d.decodeBuf)
   514  	d.Equal(d.nvalues, decoded)
   515  	d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface())
   516  
   517  	decoder.SetData(d.nvalues, indices.Bytes())
   518  	decoded, _ = decodeSpaced(decoder, d.decodeBuf, 0, validBits, 0)
   519  	d.Equal(d.nvalues, decoded)
   520  	d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface())
   521  }
   522  
   523  func (d *DictionaryEncodingTestSuite) TestBasicRoundTrip() {
   524  	d.initData(2500, 2)
   525  	d.checkRoundTrip()
   526  }
   527  
   528  func TestDictEncoding(t *testing.T) {
   529  	tests := []struct {
   530  		name string
   531  		typ  reflect.Type
   532  	}{
   533  		{"Int32", reflect.TypeOf(int32(0))},
   534  		{"Int64", reflect.TypeOf(int64(0))},
   535  		{"Float32", reflect.TypeOf(float32(0))},
   536  		{"Float64", reflect.TypeOf(float64(0))},
   537  		{"ByteArray", reflect.TypeOf(parquet.ByteArray{})},
   538  		{"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})},
   539  	}
   540  
   541  	for _, tt := range tests {
   542  		t.Run(tt.name, func(t *testing.T) {
   543  			suite.Run(t, &DictionaryEncodingTestSuite{BaseEncodingTestSuite{typ: tt.typ}})
   544  		})
   545  	}
   546  }
   547  
   548  func TestWriteDeltaBitPackedInt32(t *testing.T) {
   549  	column := schema.NewColumn(schema.NewInt32Node("int32", parquet.Repetitions.Required, -1), 0, 0)
   550  
   551  	tests := []struct {
   552  		name     string
   553  		toencode []int32
   554  		expected []byte
   555  	}{
   556  		{"simple 12345", []int32{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}},
   557  		{"odd vals", []int32{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}},
   558  	}
   559  
   560  	for _, tt := range tests {
   561  		t.Run(tt.name, func(t *testing.T) {
   562  			enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   563  
   564  			enc.(encoding.Int32Encoder).Put(tt.toencode)
   565  			buf, _ := enc.FlushValues()
   566  			defer buf.Release()
   567  
   568  			assert.Equal(t, tt.expected, buf.Bytes())
   569  
   570  			dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   571  
   572  			dec.(encoding.Int32Decoder).SetData(len(tt.toencode), tt.expected)
   573  			out := make([]int32, len(tt.toencode))
   574  			dec.(encoding.Int32Decoder).Decode(out)
   575  			assert.Equal(t, tt.toencode, out)
   576  		})
   577  	}
   578  
   579  	t.Run("test progressive decoding", func(t *testing.T) {
   580  		values := make([]int32, 1000)
   581  		testutils.FillRandomInt32(0, values)
   582  
   583  		enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   584  		enc.(encoding.Int32Encoder).Put(values)
   585  		buf, _ := enc.FlushValues()
   586  		defer buf.Release()
   587  
   588  		dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   589  		dec.(encoding.Int32Decoder).SetData(len(values), buf.Bytes())
   590  
   591  		valueBuf := make([]int32, 100)
   592  		for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) {
   593  			dec.(encoding.Int32Decoder).Decode(valueBuf)
   594  			assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j)
   595  		}
   596  	})
   597  }
   598  
   599  func TestWriteDeltaBitPackedInt64(t *testing.T) {
   600  	column := schema.NewColumn(schema.NewInt64Node("int64", parquet.Repetitions.Required, -1), 0, 0)
   601  
   602  	tests := []struct {
   603  		name     string
   604  		toencode []int64
   605  		expected []byte
   606  	}{
   607  		{"simple 12345", []int64{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}},
   608  		{"odd vals", []int64{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}},
   609  	}
   610  
   611  	for _, tt := range tests {
   612  		t.Run(tt.name, func(t *testing.T) {
   613  			enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   614  
   615  			enc.(encoding.Int64Encoder).Put(tt.toencode)
   616  			buf, _ := enc.FlushValues()
   617  			defer buf.Release()
   618  
   619  			assert.Equal(t, tt.expected, buf.Bytes())
   620  
   621  			dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   622  
   623  			dec.(encoding.Int64Decoder).SetData(len(tt.toencode), tt.expected)
   624  			out := make([]int64, len(tt.toencode))
   625  			dec.(encoding.Int64Decoder).Decode(out)
   626  			assert.Equal(t, tt.toencode, out)
   627  		})
   628  	}
   629  
   630  	t.Run("test progressive decoding", func(t *testing.T) {
   631  		values := make([]int64, 1000)
   632  		testutils.FillRandomInt64(0, values)
   633  
   634  		enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   635  		enc.(encoding.Int64Encoder).Put(values)
   636  		buf, _ := enc.FlushValues()
   637  		defer buf.Release()
   638  
   639  		dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   640  		dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes())
   641  
   642  		valueBuf := make([]int64, 100)
   643  		for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) {
   644  			decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf)
   645  			assert.Equal(t, len(valueBuf), decoded)
   646  			assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j)
   647  		}
   648  	})
   649  
   650  	t.Run("GH-37102", func(t *testing.T) {
   651  		values := []int64{
   652  			0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   653  			0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   654  			0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   655  			0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   656  			0, 0,
   657  		}
   658  
   659  		enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator)
   660  		enc.(encoding.Int64Encoder).Put(values)
   661  		buf, _ := enc.FlushValues()
   662  		defer buf.Release()
   663  
   664  		dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator)
   665  		dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes())
   666  
   667  		valueBuf := make([]int64, len(values))
   668  
   669  		decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf)
   670  		assert.Equal(t, len(valueBuf), decoded)
   671  		assert.Equal(t, values, valueBuf)
   672  	})
   673  }
   674  
   675  func TestDeltaLengthByteArrayEncoding(t *testing.T) {
   676  	column := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0)
   677  
   678  	test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")}
   679  	expected := []byte{128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70}
   680  
   681  	enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, false, column, memory.DefaultAllocator)
   682  	enc.(encoding.ByteArrayEncoder).Put(test)
   683  	buf, _ := enc.FlushValues()
   684  	defer buf.Release()
   685  
   686  	assert.Equal(t, expected, buf.Bytes())
   687  
   688  	dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, column, nil)
   689  	dec.SetData(len(test), expected)
   690  	out := make([]parquet.ByteArray, len(test))
   691  	decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out)
   692  	assert.Equal(t, len(test), decoded)
   693  	assert.Equal(t, test, out)
   694  }
   695  
   696  func TestDeltaByteArrayEncoding(t *testing.T) {
   697  	test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")}
   698  	expected := []byte{128, 1, 4, 4, 0, 0, 0, 0, 0, 0, 128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70}
   699  
   700  	enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, false, nil, nil)
   701  	enc.(encoding.ByteArrayEncoder).Put(test)
   702  	buf, _ := enc.FlushValues()
   703  	defer buf.Release()
   704  
   705  	assert.Equal(t, expected, buf.Bytes())
   706  
   707  	dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, nil, nil)
   708  	dec.SetData(len(test), expected)
   709  	out := make([]parquet.ByteArray, len(test))
   710  	decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out)
   711  	assert.Equal(t, len(test), decoded)
   712  	assert.Equal(t, test, out)
   713  }
   714  
   715  func TestDeltaBitPacking(t *testing.T) {
   716  	datadir := os.Getenv("ARROW_TEST_DATA")
   717  	if datadir == "" {
   718  		return
   719  	}
   720  
   721  	fname := path.Join(datadir, "parquet/timestamp.data")
   722  	require.FileExists(t, fname)
   723  	f, err := os.Open(fname)
   724  	if err != nil {
   725  		t.Fatal(err)
   726  	}
   727  	defer f.Close()
   728  
   729  	values := make([]int64, 0)
   730  
   731  	scanner := bufio.NewScanner(f)
   732  	for scanner.Scan() {
   733  		v, err := strconv.ParseInt(scanner.Text(), 10, 64)
   734  		if err != nil {
   735  			t.Fatal(err)
   736  		}
   737  		values = append(values, v)
   738  	}
   739  
   740  	if err := scanner.Err(); err != nil {
   741  		t.Fatal(err)
   742  	}
   743  
   744  	col := schema.NewColumn(schema.MustPrimitive(schema.NewPrimitiveNode("foo", parquet.Repetitions.Required,
   745  		parquet.Types.Int64, -1, -1)), 0, 0)
   746  	enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, col, memory.DefaultAllocator).(encoding.Int64Encoder)
   747  
   748  	enc.Put(values)
   749  	buf, err := enc.FlushValues()
   750  	if err != nil {
   751  		t.Fatal(err)
   752  	}
   753  	defer buf.Release()
   754  
   755  	dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, col, memory.DefaultAllocator).(encoding.Int64Decoder)
   756  	dec.SetData(len(values), buf.Bytes())
   757  
   758  	ll := len(values)
   759  	for i := 0; i < ll; i += 1024 {
   760  		out := make([]int64, 1024)
   761  		n, err := dec.Decode(out)
   762  		if err != nil {
   763  			t.Fatal(err)
   764  		}
   765  		assert.Equal(t, values[:n], out[:n])
   766  		values = values[n:]
   767  	}
   768  	assert.Equal(t, dec.ValuesLeft(), 0)
   769  }
   770  
   771  func TestBooleanPlainDecoderAfterFlushing(t *testing.T) {
   772  	descr := schema.NewColumn(schema.NewBooleanNode("bool", parquet.Repetitions.Optional, -1), 0, 0)
   773  	enc := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, false, descr, memory.DefaultAllocator)
   774  	benc := enc.(encoding.BooleanEncoder)
   775  
   776  	dec := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, descr, memory.DefaultAllocator)
   777  	decSlice := make([]bool, 1)
   778  	bdec := dec.(encoding.BooleanDecoder)
   779  
   780  	// Write and extract two different values
   781  	// This is validating that `FlushValues` wholly
   782  	// resets the encoder state.
   783  	benc.Put([]bool{true})
   784  	buf1, err := benc.FlushValues()
   785  	assert.NoError(t, err)
   786  
   787  	benc.Put([]bool{false})
   788  	buf2, err := benc.FlushValues()
   789  	assert.NoError(t, err)
   790  
   791  	// Decode buf1, expect true
   792  	err = bdec.SetData(1, buf1.Buf())
   793  	assert.NoError(t, err)
   794  	n, err := bdec.Decode(decSlice)
   795  	assert.NoError(t, err)
   796  	assert.Equal(t, n, 1)
   797  	assert.Equal(t, decSlice[0], true)
   798  
   799  	// Decode buf2, expect false
   800  	err = bdec.SetData(1, buf2.Buf())
   801  	assert.NoError(t, err)
   802  	n, err = bdec.Decode(decSlice)
   803  	assert.NoError(t, err)
   804  	assert.Equal(t, n, 1)
   805  	assert.Equal(t, decSlice[0], false)
   806  }