github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/delta_bit_packing.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"bytes"
    21  	"math"
    22  	"math/bits"
    23  	"reflect"
    24  
    25  	"github.com/apache/arrow/go/v10/arrow"
    26  	"github.com/apache/arrow/go/v10/arrow/memory"
    27  	shared_utils "github.com/apache/arrow/go/v10/internal/utils"
    28  	"github.com/apache/arrow/go/v10/parquet"
    29  	"github.com/apache/arrow/go/v10/parquet/internal/utils"
    30  	"golang.org/x/xerrors"
    31  )
    32  
// deltaBitPackDecoder holds the state shared by the int32 and int64
// delta bit-packed decoders.
//
// see the deltaBitPack encoder for a description of the encoding format that is
// used for delta-bitpacking.
type deltaBitPackDecoder struct {
	decoder

	// allocator used to create the deltaBitWidths buffer
	mem memory.Allocator

	// usedFirst is true once the first value from the header has been emitted by Decode
	usedFirst bool
	// bitdecoder reads the header, block headers and packed deltas from the data
	bitdecoder *utils.BitReader
	// blockSize is the block size read from the header
	blockSize uint64
	// currentBlockVals counts the values of the current block not yet copied out
	currentBlockVals uint32
	// miniBlocks is the number of miniblocks per block read from the header
	miniBlocks uint64
	// valsPerMini is blockSize / miniBlocks
	valsPerMini uint32
	// currentMiniBlockVals counts the unpacked miniblock values not yet copied out
	currentMiniBlockVals uint32
	// minDelta is the min delta read at the start of the current block
	minDelta int64
	// miniBlockIdx is the index of the next miniblock to unpack in the current block
	miniBlockIdx uint64

	// deltaBitWidths holds one bit width byte per miniblock of the current block
	deltaBitWidths *memory.Buffer
	// deltaBitWidth is the bit width of the miniblock most recently selected
	deltaBitWidth byte

	// totalValues is the total value count read from the header
	totalValues uint64
	// lastVal is the most recently decoded value; it starts as the header's first value
	lastVal int64
}
    56  
    57  // returns the number of bytes read so far
    58  func (d *deltaBitPackDecoder) bytesRead() int64 {
    59  	return d.bitdecoder.CurOffset()
    60  }
    61  
    62  func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem }
    63  
    64  // SetData sets the bytes and the expected number of values to decode
    65  // into the decoder, updating the decoder and allowing it to be reused.
    66  func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) error {
    67  	// set our data into the underlying decoder for the type
    68  	if err := d.decoder.SetData(nvalues, data); err != nil {
    69  		return err
    70  	}
    71  	// create a bit reader for our decoder's values
    72  	d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data))
    73  	d.currentBlockVals = 0
    74  	d.currentMiniBlockVals = 0
    75  	if d.deltaBitWidths == nil {
    76  		d.deltaBitWidths = memory.NewResizableBuffer(d.mem)
    77  	}
    78  
    79  	var ok bool
    80  	d.blockSize, ok = d.bitdecoder.GetVlqInt()
    81  	if !ok {
    82  		return xerrors.New("parquet: eof exception")
    83  	}
    84  
    85  	if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok {
    86  		return xerrors.New("parquet: eof exception")
    87  	}
    88  
    89  	if d.totalValues, ok = d.bitdecoder.GetVlqInt(); !ok {
    90  		return xerrors.New("parquet: eof exception")
    91  	}
    92  
    93  	if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok {
    94  		return xerrors.New("parquet: eof exception")
    95  	}
    96  
    97  	if d.miniBlocks != 0 {
    98  		d.valsPerMini = uint32(d.blockSize / d.miniBlocks)
    99  	}
   100  	return nil
   101  }
   102  
   103  // initialize a block to decode
   104  func (d *deltaBitPackDecoder) initBlock() error {
   105  	// first we grab the min delta value that we'll start from
   106  	var ok bool
   107  	if d.minDelta, ok = d.bitdecoder.GetZigZagVlqInt(); !ok {
   108  		return xerrors.New("parquet: eof exception")
   109  	}
   110  
   111  	// ensure we have enough space for our miniblocks to decode the widths
   112  	d.deltaBitWidths.Resize(int(d.miniBlocks))
   113  
   114  	var err error
   115  	for i := uint64(0); i < d.miniBlocks; i++ {
   116  		if d.deltaBitWidths.Bytes()[i], err = d.bitdecoder.ReadByte(); err != nil {
   117  			return err
   118  		}
   119  	}
   120  
   121  	d.miniBlockIdx = 0
   122  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[0]
   123  	d.currentBlockVals = uint32(d.blockSize)
   124  	return nil
   125  }
   126  
// DeltaBitPackInt32Decoder decodes Int32 values which are packed using the Delta BitPacking algorithm.
type DeltaBitPackInt32Decoder struct {
	*deltaBitPackDecoder

	// miniBlockValues holds the values unpacked from the current miniblock
	miniBlockValues []int32
}
   133  
   134  func (d *DeltaBitPackInt32Decoder) unpackNextMini() error {
   135  	if d.miniBlockValues == nil {
   136  		d.miniBlockValues = make([]int32, 0, int(d.valsPerMini))
   137  	} else {
   138  		d.miniBlockValues = d.miniBlockValues[:0]
   139  	}
   140  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
   141  	d.currentMiniBlockVals = d.valsPerMini
   142  
   143  	for j := 0; j < int(d.valsPerMini); j++ {
   144  		delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
   145  		if !ok {
   146  			return xerrors.New("parquet: eof exception")
   147  		}
   148  
   149  		d.lastVal += int64(delta) + int64(d.minDelta)
   150  		d.miniBlockValues = append(d.miniBlockValues, int32(d.lastVal))
   151  	}
   152  	d.miniBlockIdx++
   153  	return nil
   154  }
   155  
   156  // Decode retrieves min(remaining values, len(out)) values from the data and returns the number
   157  // of values actually decoded and any errors encountered.
   158  func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) {
   159  	max := shared_utils.MinInt(len(out), d.nvals)
   160  	if max == 0 {
   161  		return 0, nil
   162  	}
   163  
   164  	out = out[:max]
   165  	if !d.usedFirst { // starting value to calculate deltas against
   166  		out[0] = int32(d.lastVal)
   167  		out = out[1:]
   168  		d.usedFirst = true
   169  	}
   170  
   171  	var err error
   172  	for len(out) > 0 { // unpack mini blocks until we get all the values we need
   173  		if d.currentBlockVals == 0 {
   174  			err = d.initBlock()
   175  		}
   176  		if d.currentMiniBlockVals == 0 {
   177  			err = d.unpackNextMini()
   178  		}
   179  		if err != nil {
   180  			return 0, err
   181  		}
   182  
   183  		// copy as many values from our mini block as we can into out
   184  		start := int(d.valsPerMini - d.currentMiniBlockVals)
   185  		end := shared_utils.MinInt(int(d.valsPerMini), len(out))
   186  		copy(out, d.miniBlockValues[start:end])
   187  
   188  		numCopied := end - start
   189  		out = out[numCopied:]
   190  		d.currentBlockVals -= uint32(numCopied)
   191  		d.currentMiniBlockVals -= uint32(numCopied)
   192  	}
   193  	return max, nil
   194  }
   195  
   196  // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap
   197  func (d *DeltaBitPackInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   198  	toread := len(out) - nullCount
   199  	values, err := d.Decode(out[:toread])
   200  	if err != nil {
   201  		return values, err
   202  	}
   203  	if values != toread {
   204  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   205  	}
   206  
   207  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   208  }
   209  
   210  // Type returns the physical parquet type that this decoder decodes, in this case Int32
   211  func (DeltaBitPackInt32Decoder) Type() parquet.Type {
   212  	return parquet.Types.Int32
   213  }
   214  
// DeltaBitPackInt64Decoder decodes a delta bit packed int64 column of data.
type DeltaBitPackInt64Decoder struct {
	*deltaBitPackDecoder

	// miniBlockValues holds the values unpacked from the current miniblock
	miniBlockValues []int64
}
   221  
   222  func (d *DeltaBitPackInt64Decoder) unpackNextMini() error {
   223  	if d.miniBlockValues == nil {
   224  		d.miniBlockValues = make([]int64, 0, int(d.valsPerMini))
   225  	} else {
   226  		d.miniBlockValues = d.miniBlockValues[:0]
   227  	}
   228  
   229  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
   230  	d.currentMiniBlockVals = d.valsPerMini
   231  
   232  	for j := 0; j < int(d.valsPerMini); j++ {
   233  		delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
   234  		if !ok {
   235  			return xerrors.New("parquet: eof exception")
   236  		}
   237  
   238  		d.lastVal += int64(delta) + int64(d.minDelta)
   239  		d.miniBlockValues = append(d.miniBlockValues, d.lastVal)
   240  	}
   241  	d.miniBlockIdx++
   242  	return nil
   243  }
   244  
   245  // Decode retrieves min(remaining values, len(out)) values from the data and returns the number
   246  // of values actually decoded and any errors encountered.
   247  func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) {
   248  	max := shared_utils.MinInt(len(out), d.nvals)
   249  	if max == 0 {
   250  		return 0, nil
   251  	}
   252  
   253  	out = out[:max]
   254  	if !d.usedFirst {
   255  		out[0] = d.lastVal
   256  		out = out[1:]
   257  		d.usedFirst = true
   258  	}
   259  
   260  	var err error
   261  	for len(out) > 0 {
   262  		if d.currentBlockVals == 0 {
   263  			err = d.initBlock()
   264  		}
   265  		if d.currentMiniBlockVals == 0 {
   266  			err = d.unpackNextMini()
   267  		}
   268  
   269  		if err != nil {
   270  			return 0, err
   271  		}
   272  
   273  		start := int(d.valsPerMini - d.currentMiniBlockVals)
   274  		end := shared_utils.MinInt(int(d.valsPerMini), len(out))
   275  		copy(out, d.miniBlockValues[start:end])
   276  
   277  		numCopied := end - start
   278  		out = out[numCopied:]
   279  		d.currentBlockVals -= uint32(numCopied)
   280  		d.currentMiniBlockVals -= uint32(numCopied)
   281  	}
   282  	return max, nil
   283  }
   284  
   285  // Type returns the physical parquet type that this decoder decodes, in this case Int64
   286  func (DeltaBitPackInt64Decoder) Type() parquet.Type {
   287  	return parquet.Types.Int64
   288  }
   289  
   290  // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap
   291  func (d DeltaBitPackInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   292  	toread := len(out) - nullCount
   293  	values, err := d.Decode(out[:toread])
   294  	if err != nil {
   295  		return values, err
   296  	}
   297  	if values != toread {
   298  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   299  	}
   300  
   301  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   302  }
   303  
// Defaults applied by the delta bit pack encoder, plus the size of the
// scratch buffer used to write the header.
const (
	// block size must be a multiple of 128
	defaultBlockSize     = 128
	defaultNumMiniBlocks = 4
	// block size / number of mini blocks must result in a multiple of 32
	defaultNumValuesPerMini = 32
	// max size of the header for the delta blocks
	maxHeaderWriterSize = 32
)
   313  
// deltaBitPackEncoder is an encoder for the DeltaBinary Packing format
// as per the parquet spec.
//
// Consists of a header followed by blocks of delta encoded values binary packed.
//
//	Format
// 		[header] [block 1] [block 2] ... [block N]
//
//	Header
//		[block size] [number of mini blocks per block] [total value count] [first value]
//
//	Block
//		[min delta] [list of bitwidths of the miniblocks] [miniblocks...]
//
// Blocks are written to the sink as they fill up. The header is only built
// when FlushValues is called, which places it in front of the flushed blocks
// before returning the result.
type deltaBitPackEncoder struct {
	encoder

	// bitWriter writes blocks to the sink; it is nil until the first value is put
	bitWriter *utils.BitWriter
	// totalVals counts the values put since the last FlushValues
	totalVals uint64
	// firstVal is the first value put, which is stored in the header
	firstVal int64
	// currentVal is the most recently put value, used to compute the next delta
	currentVal int64

	// blockSize is the number of deltas collected before a block is flushed
	blockSize uint64
	// miniBlockSize is the number of values written per miniblock
	miniBlockSize uint64
	// numMiniBlocks is the number of miniblocks per block
	numMiniBlocks uint64
	// deltas buffers the deltas of the block currently being built
	deltas []int64
}
   343  
   344  // flushBlock flushes out a finished block for writing to the underlying encoder
   345  func (enc *deltaBitPackEncoder) flushBlock() {
   346  	if len(enc.deltas) == 0 {
   347  		return
   348  	}
   349  
   350  	// determine the minimum delta value
   351  	minDelta := int64(math.MaxInt64)
   352  	for _, delta := range enc.deltas {
   353  		if delta < minDelta {
   354  			minDelta = delta
   355  		}
   356  	}
   357  
   358  	enc.bitWriter.WriteZigZagVlqInt(minDelta)
   359  	// reserve enough bytes to write out our miniblock deltas
   360  	offset := enc.bitWriter.ReserveBytes(int(enc.numMiniBlocks))
   361  
   362  	valuesToWrite := int64(len(enc.deltas))
   363  	for i := 0; i < int(enc.numMiniBlocks); i++ {
   364  		n := shared_utils.Min(int64(enc.miniBlockSize), valuesToWrite)
   365  		if n == 0 {
   366  			break
   367  		}
   368  
   369  		maxDelta := int64(math.MinInt64)
   370  		start := i * int(enc.miniBlockSize)
   371  		for _, val := range enc.deltas[start : start+int(n)] {
   372  			maxDelta = shared_utils.Max(maxDelta, val)
   373  		}
   374  
   375  		// compute bit width to store (max_delta - min_delta)
   376  		width := uint(bits.Len64(uint64(maxDelta - minDelta)))
   377  		// write out the bit width we used into the bytes we reserved earlier
   378  		enc.bitWriter.WriteAt([]byte{byte(width)}, int64(offset+i))
   379  
   380  		// write out our deltas
   381  		for _, val := range enc.deltas[start : start+int(n)] {
   382  			enc.bitWriter.WriteValue(uint64(val-minDelta), width)
   383  		}
   384  
   385  		valuesToWrite -= n
   386  
   387  		// pad the last block if n < miniBlockSize
   388  		for ; n < int64(enc.miniBlockSize); n++ {
   389  			enc.bitWriter.WriteValue(0, width)
   390  		}
   391  	}
   392  	enc.deltas = enc.deltas[:0]
   393  }
   394  
   395  // putInternal is the implementation for actually writing data which must be
   396  // integral data as int, int8, int32, or int64.
   397  func (enc *deltaBitPackEncoder) putInternal(data interface{}) {
   398  	v := reflect.ValueOf(data)
   399  	if v.Len() == 0 {
   400  		return
   401  	}
   402  
   403  	idx := 0
   404  	if enc.totalVals == 0 {
   405  		enc.blockSize = defaultBlockSize
   406  		enc.numMiniBlocks = defaultNumMiniBlocks
   407  		enc.miniBlockSize = defaultNumValuesPerMini
   408  
   409  		enc.firstVal = v.Index(0).Int()
   410  		enc.currentVal = enc.firstVal
   411  		idx = 1
   412  
   413  		enc.bitWriter = utils.NewBitWriter(enc.sink)
   414  	}
   415  
   416  	enc.totalVals += uint64(v.Len())
   417  	for ; idx < v.Len(); idx++ {
   418  		val := v.Index(idx).Int()
   419  		enc.deltas = append(enc.deltas, val-enc.currentVal)
   420  		enc.currentVal = val
   421  		if len(enc.deltas) == int(enc.blockSize) {
   422  			enc.flushBlock()
   423  		}
   424  	}
   425  }
   426  
   427  // FlushValues flushes any remaining data and returns the finished encoded buffer
   428  // or returns nil and any error encountered during flushing.
   429  func (enc *deltaBitPackEncoder) FlushValues() (Buffer, error) {
   430  	if enc.bitWriter != nil {
   431  		// write any remaining values
   432  		enc.flushBlock()
   433  		enc.bitWriter.Flush(true)
   434  	} else {
   435  		enc.blockSize = defaultBlockSize
   436  		enc.numMiniBlocks = defaultNumMiniBlocks
   437  		enc.miniBlockSize = defaultNumValuesPerMini
   438  	}
   439  
   440  	buffer := make([]byte, maxHeaderWriterSize)
   441  	headerWriter := utils.NewBitWriter(utils.NewWriterAtBuffer(buffer))
   442  
   443  	headerWriter.WriteVlqInt(uint64(enc.blockSize))
   444  	headerWriter.WriteVlqInt(uint64(enc.numMiniBlocks))
   445  	headerWriter.WriteVlqInt(uint64(enc.totalVals))
   446  	headerWriter.WriteZigZagVlqInt(int64(enc.firstVal))
   447  	headerWriter.Flush(false)
   448  
   449  	buffer = buffer[:headerWriter.Written()]
   450  	enc.totalVals = 0
   451  
   452  	if enc.bitWriter != nil {
   453  		flushed := enc.sink.Finish()
   454  		defer flushed.Release()
   455  
   456  		buffer = append(buffer, flushed.Buf()[:enc.bitWriter.Written()]...)
   457  	}
   458  	return poolBuffer{memory.NewBufferBytes(buffer)}, nil
   459  }
   460  
   461  // EstimatedDataEncodedSize returns the current amount of data actually flushed out and written
   462  func (enc *deltaBitPackEncoder) EstimatedDataEncodedSize() int64 {
   463  	return int64(enc.bitWriter.Written())
   464  }
   465  
// DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data.
// All of the encoding work is done by the embedded deltaBitPackEncoder.
type DeltaBitPackInt32Encoder struct {
	*deltaBitPackEncoder
}
   470  
   471  // Put writes the values from the provided slice of int32 to the encoder
   472  func (enc DeltaBitPackInt32Encoder) Put(in []int32) {
   473  	enc.putInternal(in)
   474  }
   475  
   476  // PutSpaced takes a slice of int32 along with a bitmap that describes the nulls and an offset into the bitmap
   477  // in order to write spaced data to the encoder.
   478  func (enc DeltaBitPackInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) {
   479  	buffer := memory.NewResizableBuffer(enc.mem)
   480  	buffer.Reserve(arrow.Int32Traits.BytesRequired(len(in)))
   481  	defer buffer.Release()
   482  
   483  	data := arrow.Int32Traits.CastFromBytes(buffer.Buf())
   484  	nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   485  	enc.Put(data[:nvalid])
   486  }
   487  
   488  // Type returns the underlying physical type this encoder works with, in this case Int32
   489  func (DeltaBitPackInt32Encoder) Type() parquet.Type {
   490  	return parquet.Types.Int32
   491  }
   492  
// DeltaBitPackInt64Encoder is an encoder for the delta bitpacking encoding for int64 data.
// All of the encoding work is done by the embedded deltaBitPackEncoder.
type DeltaBitPackInt64Encoder struct {
	*deltaBitPackEncoder
}
   497  
   498  // Put writes the values from the provided slice of int64 to the encoder
   499  func (enc DeltaBitPackInt64Encoder) Put(in []int64) {
   500  	enc.putInternal(in)
   501  }
   502  
   503  // PutSpaced takes a slice of int64 along with a bitmap that describes the nulls and an offset into the bitmap
   504  // in order to write spaced data to the encoder.
   505  func (enc DeltaBitPackInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) {
   506  	buffer := memory.NewResizableBuffer(enc.mem)
   507  	buffer.Reserve(arrow.Int64Traits.BytesRequired(len(in)))
   508  	defer buffer.Release()
   509  
   510  	data := arrow.Int64Traits.CastFromBytes(buffer.Buf())
   511  	nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   512  	enc.Put(data[:nvalid])
   513  }
   514  
   515  // Type returns the underlying physical type this encoder works with, in this case Int64
   516  func (DeltaBitPackInt64Encoder) Type() parquet.Type {
   517  	return parquet.Types.Int64
   518  }