github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/delta_bit_packing.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"bytes"
    21  	"math"
    22  	"math/bits"
    23  	"reflect"
    24  
    25  	"github.com/apache/arrow/go/v14/arrow"
    26  	"github.com/apache/arrow/go/v14/arrow/memory"
    27  	shared_utils "github.com/apache/arrow/go/v14/internal/utils"
    28  	"github.com/apache/arrow/go/v14/parquet"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/utils"
    30  	"golang.org/x/xerrors"
    31  )
    32  
// deltaBitPackDecoder holds the decoding state shared by the typed delta
// bit-packed decoders. See the deltaBitPack encoder for a description of the
// encoding format that is used for delta-bitpacking.
type deltaBitPackDecoder struct {
	decoder

	// mem is the allocator returned by Allocator and used to back deltaBitWidths.
	mem memory.Allocator

	// usedFirst is set once Decode has emitted the first value stored in the header.
	usedFirst bool
	// bitdecoder reads the data passed to SetData.
	bitdecoder *utils.BitReader
	// blockSize is the first header field; initBlock starts each block with this many values.
	blockSize uint64
	// currentBlockVals counts the values of the current block not yet handed out by Decode.
	currentBlockVals uint32
	// miniBlocks is the second header field: the number of miniblocks (and bit widths) per block.
	miniBlocks uint64
	// valsPerMini is blockSize / miniBlocks, computed in SetData.
	valsPerMini uint32
	// currentMiniBlockVals counts the values of the unpacked miniblock not yet handed out by Decode.
	currentMiniBlockVals uint32
	// minDelta is read at the start of each block and added to every delta in that block.
	minDelta int64
	// miniBlockIdx is the index of the next miniblock to unpack within the current block.
	miniBlockIdx uint64

	// deltaBitWidths holds one bit-width byte per miniblock of the current block.
	deltaBitWidths *memory.Buffer
	// deltaBitWidth is the bit width of the miniblock most recently selected for unpacking.
	deltaBitWidth byte

	// totalValues is the third header field read in SetData.
	totalValues uint64
	// lastVal starts as the first value from the header and then tracks the most
	// recently reconstructed value, to which the next delta is applied.
	lastVal int64
}
    56  
// bytesRead returns the number of bytes read so far, as reported by the
// underlying bit reader's CurOffset.
func (d *deltaBitPackDecoder) bytesRead() int64 {
	return d.bitdecoder.CurOffset()
}
    61  
// Allocator returns the memory allocator this decoder was configured with.
func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem }
    63  
    64  // SetData sets the bytes and the expected number of values to decode
    65  // into the decoder, updating the decoder and allowing it to be reused.
    66  func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) error {
    67  	// set our data into the underlying decoder for the type
    68  	if err := d.decoder.SetData(nvalues, data); err != nil {
    69  		return err
    70  	}
    71  	// create a bit reader for our decoder's values
    72  	d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data))
    73  	d.currentBlockVals = 0
    74  	d.currentMiniBlockVals = 0
    75  	if d.deltaBitWidths == nil {
    76  		d.deltaBitWidths = memory.NewResizableBuffer(d.mem)
    77  	}
    78  
    79  	var ok bool
    80  	d.blockSize, ok = d.bitdecoder.GetVlqInt()
    81  	if !ok {
    82  		return xerrors.New("parquet: eof exception")
    83  	}
    84  
    85  	if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok {
    86  		return xerrors.New("parquet: eof exception")
    87  	}
    88  
    89  	if d.totalValues, ok = d.bitdecoder.GetVlqInt(); !ok {
    90  		return xerrors.New("parquet: eof exception")
    91  	}
    92  
    93  	if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok {
    94  		return xerrors.New("parquet: eof exception")
    95  	}
    96  
    97  	if d.miniBlocks != 0 {
    98  		d.valsPerMini = uint32(d.blockSize / d.miniBlocks)
    99  	}
   100  	return nil
   101  }
   102  
   103  // initialize a block to decode
   104  func (d *deltaBitPackDecoder) initBlock() error {
   105  	// first we grab the min delta value that we'll start from
   106  	var ok bool
   107  	if d.minDelta, ok = d.bitdecoder.GetZigZagVlqInt(); !ok {
   108  		return xerrors.New("parquet: eof exception")
   109  	}
   110  
   111  	// ensure we have enough space for our miniblocks to decode the widths
   112  	d.deltaBitWidths.Resize(int(d.miniBlocks))
   113  
   114  	var err error
   115  	for i := uint64(0); i < d.miniBlocks; i++ {
   116  		if d.deltaBitWidths.Bytes()[i], err = d.bitdecoder.ReadByte(); err != nil {
   117  			return err
   118  		}
   119  	}
   120  
   121  	d.miniBlockIdx = 0
   122  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[0]
   123  	d.currentBlockVals = uint32(d.blockSize)
   124  	return nil
   125  }
   126  
// DeltaBitPackInt32Decoder decodes Int32 values which are packed using the Delta BitPacking algorithm.
type DeltaBitPackInt32Decoder struct {
	*deltaBitPackDecoder

	// miniBlockValues holds the reconstructed values of the most recently
	// unpacked miniblock; unpackNextMini refills it and Decode copies out of it.
	miniBlockValues []int32
}
   133  
   134  func (d *DeltaBitPackInt32Decoder) unpackNextMini() error {
   135  	if d.miniBlockValues == nil {
   136  		d.miniBlockValues = make([]int32, 0, int(d.valsPerMini))
   137  	} else {
   138  		d.miniBlockValues = d.miniBlockValues[:0]
   139  	}
   140  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
   141  	d.currentMiniBlockVals = d.valsPerMini
   142  
   143  	for j := 0; j < int(d.valsPerMini); j++ {
   144  		delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
   145  		if !ok {
   146  			return xerrors.New("parquet: eof exception")
   147  		}
   148  
   149  		d.lastVal += int64(delta) + int64(d.minDelta)
   150  		d.miniBlockValues = append(d.miniBlockValues, int32(d.lastVal))
   151  	}
   152  	d.miniBlockIdx++
   153  	return nil
   154  }
   155  
   156  // Decode retrieves min(remaining values, len(out)) values from the data and returns the number
   157  // of values actually decoded and any errors encountered.
   158  func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) {
   159  	max := shared_utils.MinInt(len(out), int(d.totalValues))
   160  	if max == 0 {
   161  		return 0, nil
   162  	}
   163  
   164  	out = out[:max]
   165  	if !d.usedFirst { // starting value to calculate deltas against
   166  		out[0] = int32(d.lastVal)
   167  		out = out[1:]
   168  		d.usedFirst = true
   169  	}
   170  
   171  	var err error
   172  	for len(out) > 0 { // unpack mini blocks until we get all the values we need
   173  		if d.currentBlockVals == 0 {
   174  			err = d.initBlock()
   175  		}
   176  		if d.currentMiniBlockVals == 0 {
   177  			err = d.unpackNextMini()
   178  		}
   179  		if err != nil {
   180  			return 0, err
   181  		}
   182  
   183  		// copy as many values from our mini block as we can into out
   184  		start := int(d.valsPerMini - d.currentMiniBlockVals)
   185  		numCopied := copy(out, d.miniBlockValues[start:])
   186  
   187  		out = out[numCopied:]
   188  		d.currentBlockVals -= uint32(numCopied)
   189  		d.currentMiniBlockVals -= uint32(numCopied)
   190  	}
   191  	d.nvals -= max
   192  	return max, nil
   193  }
   194  
   195  // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap
   196  func (d *DeltaBitPackInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   197  	toread := len(out) - nullCount
   198  	values, err := d.Decode(out[:toread])
   199  	if err != nil {
   200  		return values, err
   201  	}
   202  	if values != toread {
   203  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   204  	}
   205  
   206  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   207  }
   208  
// Type returns the physical parquet type that this decoder decodes, in this case Int32.
func (DeltaBitPackInt32Decoder) Type() parquet.Type {
	return parquet.Types.Int32
}
   213  
// DeltaBitPackInt64Decoder decodes a delta bit packed int64 column of data.
type DeltaBitPackInt64Decoder struct {
	*deltaBitPackDecoder

	// miniBlockValues holds the reconstructed values of the most recently
	// unpacked miniblock; unpackNextMini refills it and Decode copies out of it.
	miniBlockValues []int64
}
   220  
   221  func (d *DeltaBitPackInt64Decoder) unpackNextMini() error {
   222  	if d.miniBlockValues == nil {
   223  		d.miniBlockValues = make([]int64, 0, int(d.valsPerMini))
   224  	} else {
   225  		d.miniBlockValues = d.miniBlockValues[:0]
   226  	}
   227  
   228  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
   229  	d.currentMiniBlockVals = d.valsPerMini
   230  
   231  	for j := 0; j < int(d.valsPerMini); j++ {
   232  		delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
   233  		if !ok {
   234  			return xerrors.New("parquet: eof exception")
   235  		}
   236  
   237  		d.lastVal += int64(delta) + int64(d.minDelta)
   238  		d.miniBlockValues = append(d.miniBlockValues, d.lastVal)
   239  	}
   240  	d.miniBlockIdx++
   241  	return nil
   242  }
   243  
   244  // Decode retrieves min(remaining values, len(out)) values from the data and returns the number
   245  // of values actually decoded and any errors encountered.
   246  func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) {
   247  	max := shared_utils.MinInt(len(out), d.nvals)
   248  	if max == 0 {
   249  		return 0, nil
   250  	}
   251  
   252  	out = out[:max]
   253  	if !d.usedFirst {
   254  		out[0] = d.lastVal
   255  		out = out[1:]
   256  		d.usedFirst = true
   257  	}
   258  
   259  	var err error
   260  	for len(out) > 0 {
   261  		if d.currentBlockVals == 0 {
   262  			err = d.initBlock()
   263  		}
   264  		if d.currentMiniBlockVals == 0 {
   265  			err = d.unpackNextMini()
   266  		}
   267  
   268  		if err != nil {
   269  			return 0, err
   270  		}
   271  
   272  		start := int(d.valsPerMini - d.currentMiniBlockVals)
   273  		numCopied := copy(out, d.miniBlockValues[start:])
   274  
   275  		out = out[numCopied:]
   276  		d.currentBlockVals -= uint32(numCopied)
   277  		d.currentMiniBlockVals -= uint32(numCopied)
   278  	}
   279  	d.nvals -= max
   280  	return max, nil
   281  }
   282  
// Type returns the physical parquet type that this decoder decodes, in this case Int64.
func (DeltaBitPackInt64Decoder) Type() parquet.Type {
	return parquet.Types.Int64
}
   287  
   288  // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap
   289  func (d DeltaBitPackInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   290  	toread := len(out) - nullCount
   291  	values, err := d.Decode(out[:toread])
   292  	if err != nil {
   293  		return values, err
   294  	}
   295  	if values != toread {
   296  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   297  	}
   298  
   299  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   300  }
   301  
// Default layout parameters used by the delta bit pack encoder.
const (
	// number of values per block; block size must be a multiple of 128
	defaultBlockSize     = 128
	defaultNumMiniBlocks = 4
	// values per miniblock; block size / number of mini blocks must result in a multiple of 32
	defaultNumValuesPerMini = 32
	// max size of the header for the delta blocks; FlushValues allocates this
	// many bytes for the header writer
	maxHeaderWriterSize = 32
)
   311  
// deltaBitPackEncoder is an encoder for the DeltaBinary Packing format
// as per the parquet spec.
//
// Consists of a header followed by blocks of delta encoded values binary packed.
//
//	Format
//		[header] [block 1] [block 2] ... [block N]
//
//	Header
//		[block size] [number of mini blocks per block] [total value count] [first value]
//
//	Block
//		[min delta] [list of bitwidths of the miniblocks] [miniblocks...]
//
// Blocks are written to the sink as they fill up; the header is only built
// when FlushValues is called, which prepends it to the flushed block data
// before returning the result.
type deltaBitPackEncoder struct {
	encoder

	// bitWriter writes encoded blocks to the sink; it is nil until the first value is put.
	bitWriter *utils.BitWriter
	// totalVals is the number of values put since the last FlushValues; it is written to the header.
	totalVals uint64
	// firstVal is the first value put, stored verbatim in the header.
	firstVal int64
	// currentVal is the most recently put value, against which the next delta is computed.
	currentVal int64

	// blockSize is the number of deltas accumulated before a block is flushed.
	blockSize uint64
	// miniBlockSize is the number of values written per miniblock.
	miniBlockSize uint64
	// numMiniBlocks is the number of miniblocks (and bit-width bytes) per block.
	numMiniBlocks uint64
	// deltas buffers the deltas of the block currently being built.
	deltas []int64
}
   341  
   342  // flushBlock flushes out a finished block for writing to the underlying encoder
   343  func (enc *deltaBitPackEncoder) flushBlock() {
   344  	if len(enc.deltas) == 0 {
   345  		return
   346  	}
   347  
   348  	// determine the minimum delta value
   349  	minDelta := int64(math.MaxInt64)
   350  	for _, delta := range enc.deltas {
   351  		if delta < minDelta {
   352  			minDelta = delta
   353  		}
   354  	}
   355  
   356  	enc.bitWriter.WriteZigZagVlqInt(minDelta)
   357  	// reserve enough bytes to write out our miniblock deltas
   358  	offset, _ := enc.bitWriter.SkipBytes(int(enc.numMiniBlocks))
   359  
   360  	valuesToWrite := int64(len(enc.deltas))
   361  	for i := 0; i < int(enc.numMiniBlocks); i++ {
   362  		n := shared_utils.Min(int64(enc.miniBlockSize), valuesToWrite)
   363  		if n == 0 {
   364  			break
   365  		}
   366  
   367  		maxDelta := int64(math.MinInt64)
   368  		start := i * int(enc.miniBlockSize)
   369  		for _, val := range enc.deltas[start : start+int(n)] {
   370  			maxDelta = shared_utils.Max(maxDelta, val)
   371  		}
   372  
   373  		// compute bit width to store (max_delta - min_delta)
   374  		width := uint(bits.Len64(uint64(maxDelta - minDelta)))
   375  		// write out the bit width we used into the bytes we reserved earlier
   376  		enc.bitWriter.WriteAt([]byte{byte(width)}, int64(offset+i))
   377  
   378  		// write out our deltas
   379  		for _, val := range enc.deltas[start : start+int(n)] {
   380  			enc.bitWriter.WriteValue(uint64(val-minDelta), width)
   381  		}
   382  
   383  		valuesToWrite -= n
   384  
   385  		// pad the last block if n < miniBlockSize
   386  		for ; n < int64(enc.miniBlockSize); n++ {
   387  			enc.bitWriter.WriteValue(0, width)
   388  		}
   389  	}
   390  	enc.deltas = enc.deltas[:0]
   391  }
   392  
   393  // putInternal is the implementation for actually writing data which must be
   394  // integral data as int, int8, int32, or int64.
   395  func (enc *deltaBitPackEncoder) putInternal(data interface{}) {
   396  	v := reflect.ValueOf(data)
   397  	if v.Len() == 0 {
   398  		return
   399  	}
   400  
   401  	idx := 0
   402  	if enc.totalVals == 0 {
   403  		enc.blockSize = defaultBlockSize
   404  		enc.numMiniBlocks = defaultNumMiniBlocks
   405  		enc.miniBlockSize = defaultNumValuesPerMini
   406  
   407  		enc.firstVal = v.Index(0).Int()
   408  		enc.currentVal = enc.firstVal
   409  		idx = 1
   410  
   411  		enc.bitWriter = utils.NewBitWriter(enc.sink)
   412  	}
   413  
   414  	enc.totalVals += uint64(v.Len())
   415  	for ; idx < v.Len(); idx++ {
   416  		val := v.Index(idx).Int()
   417  		enc.deltas = append(enc.deltas, val-enc.currentVal)
   418  		enc.currentVal = val
   419  		if len(enc.deltas) == int(enc.blockSize) {
   420  			enc.flushBlock()
   421  		}
   422  	}
   423  }
   424  
   425  // FlushValues flushes any remaining data and returns the finished encoded buffer
   426  // or returns nil and any error encountered during flushing.
   427  func (enc *deltaBitPackEncoder) FlushValues() (Buffer, error) {
   428  	if enc.bitWriter != nil {
   429  		// write any remaining values
   430  		enc.flushBlock()
   431  		enc.bitWriter.Flush(true)
   432  	} else {
   433  		enc.blockSize = defaultBlockSize
   434  		enc.numMiniBlocks = defaultNumMiniBlocks
   435  		enc.miniBlockSize = defaultNumValuesPerMini
   436  	}
   437  
   438  	buffer := make([]byte, maxHeaderWriterSize)
   439  	headerWriter := utils.NewBitWriter(utils.NewWriterAtBuffer(buffer))
   440  
   441  	headerWriter.WriteVlqInt(uint64(enc.blockSize))
   442  	headerWriter.WriteVlqInt(uint64(enc.numMiniBlocks))
   443  	headerWriter.WriteVlqInt(uint64(enc.totalVals))
   444  	headerWriter.WriteZigZagVlqInt(int64(enc.firstVal))
   445  	headerWriter.Flush(false)
   446  
   447  	buffer = buffer[:headerWriter.Written()]
   448  	enc.totalVals = 0
   449  
   450  	if enc.bitWriter != nil {
   451  		flushed := enc.sink.Finish()
   452  		defer flushed.Release()
   453  
   454  		buffer = append(buffer, flushed.Buf()[:enc.bitWriter.Written()]...)
   455  	}
   456  	return poolBuffer{memory.NewBufferBytes(buffer)}, nil
   457  }
   458  
   459  // EstimatedDataEncodedSize returns the current amount of data actually flushed out and written
   460  func (enc *deltaBitPackEncoder) EstimatedDataEncodedSize() int64 {
   461  	return int64(enc.bitWriter.Written())
   462  }
   463  
// DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data.
// It embeds the shared deltaBitPackEncoder, which holds all encoding state.
type DeltaBitPackInt32Encoder struct {
	*deltaBitPackEncoder
}
   468  
// Put writes the values from the provided slice of int32 to the encoder
// by handing them to the shared putInternal implementation.
func (enc DeltaBitPackInt32Encoder) Put(in []int32) {
	enc.putInternal(in)
}
   473  
   474  // PutSpaced takes a slice of int32 along with a bitmap that describes the nulls and an offset into the bitmap
   475  // in order to write spaced data to the encoder.
   476  func (enc DeltaBitPackInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) {
   477  	buffer := memory.NewResizableBuffer(enc.mem)
   478  	buffer.Reserve(arrow.Int32Traits.BytesRequired(len(in)))
   479  	defer buffer.Release()
   480  
   481  	data := arrow.Int32Traits.CastFromBytes(buffer.Buf())
   482  	nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   483  	enc.Put(data[:nvalid])
   484  }
   485  
// Type returns the underlying physical type this encoder works with, in this case Int32.
func (DeltaBitPackInt32Encoder) Type() parquet.Type {
	return parquet.Types.Int32
}
   490  
// DeltaBitPackInt64Encoder is an encoder for the delta bitpacking encoding for int64 data.
// It embeds the shared deltaBitPackEncoder, which holds all encoding state.
type DeltaBitPackInt64Encoder struct {
	*deltaBitPackEncoder
}
   495  
// Put writes the values from the provided slice of int64 to the encoder
// by handing them to the shared putInternal implementation.
func (enc DeltaBitPackInt64Encoder) Put(in []int64) {
	enc.putInternal(in)
}
   500  
   501  // PutSpaced takes a slice of int64 along with a bitmap that describes the nulls and an offset into the bitmap
   502  // in order to write spaced data to the encoder.
   503  func (enc DeltaBitPackInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) {
   504  	buffer := memory.NewResizableBuffer(enc.mem)
   505  	buffer.Reserve(arrow.Int64Traits.BytesRequired(len(in)))
   506  	defer buffer.Release()
   507  
   508  	data := arrow.Int64Traits.CastFromBytes(buffer.Buf())
   509  	nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   510  	enc.Put(data[:nvalid])
   511  }
   512  
// Type returns the underlying physical type this encoder works with, in this case Int64.
func (DeltaBitPackInt64Encoder) Type() parquet.Type {
	return parquet.Types.Int64
}