github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/delta_bit_packing.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"bytes"
    21  	"math"
    22  	"math/bits"
    23  	"reflect"
    24  
    25  	"github.com/apache/arrow/go/v7/arrow"
    26  	"github.com/apache/arrow/go/v7/arrow/memory"
    27  	"github.com/apache/arrow/go/v7/parquet"
    28  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    29  	"golang.org/x/xerrors"
    30  )
    31  
// deltaBitPackDecoder holds the state shared by the typed delta bit-packing
// decoders in this file.
//
// see the deltaBitPack encoder for a description of the encoding format that is
// used for delta-bitpacking.
type deltaBitPackDecoder struct {
	decoder

	// mem is the allocator used for the deltaBitWidths buffer and returned by Allocator.
	mem memory.Allocator

	// usedFirst records whether the header's first value (held in lastVal after
	// SetData) has already been handed out by Decode.
	usedFirst            bool
	// bitdecoder reads the raw encoded bytes passed to SetData.
	bitdecoder           *utils.BitReader
	// blockSize is the block size read from the header.
	blockSize            uint64
	// currentBlockVals counts the values not yet consumed from the current block;
	// zero means the next block has to be initialized.
	currentBlockVals     uint32
	// miniBlocks is the number of miniblocks per block read from the header.
	miniBlocks           uint64
	// valsPerMini is blockSize / miniBlocks, computed in SetData.
	valsPerMini          uint32
	// currentMiniBlockVals counts the values not yet consumed from the unpacked
	// miniblock; zero means the next miniblock has to be unpacked.
	currentMiniBlockVals uint32
	// minDelta is the minimum delta read at the start of the current block; it is
	// added back to every unpacked delta.
	minDelta             int64
	// miniBlockIdx is the index into deltaBitWidths of the next miniblock to unpack.
	miniBlockIdx         uint64

	// deltaBitWidths holds one bit-width byte per miniblock of the current block.
	deltaBitWidths *memory.Buffer
	// deltaBitWidth is the bit width of the miniblock most recently selected.
	deltaBitWidth  byte

	// lastVal is the most recently reconstructed value; deltas are accumulated onto it.
	lastVal int64
}
    54  
    55  // returns the number of bytes read so far
    56  func (d *deltaBitPackDecoder) bytesRead() int64 {
    57  	return d.bitdecoder.CurOffset()
    58  }
    59  
    60  func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem }
    61  
    62  // SetData sets the bytes and the expected number of values to decode
    63  // into the decoder, updating the decoder and allowing it to be reused.
    64  func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) error {
    65  	// set our data into the underlying decoder for the type
    66  	if err := d.decoder.SetData(nvalues, data); err != nil {
    67  		return err
    68  	}
    69  	// create a bit reader for our decoder's values
    70  	d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data))
    71  	d.currentBlockVals = 0
    72  	d.currentMiniBlockVals = 0
    73  	if d.deltaBitWidths == nil {
    74  		d.deltaBitWidths = memory.NewResizableBuffer(d.mem)
    75  	}
    76  
    77  	var ok bool
    78  	d.blockSize, ok = d.bitdecoder.GetVlqInt()
    79  	if !ok {
    80  		return xerrors.New("parquet: eof exception")
    81  	}
    82  
    83  	if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok {
    84  		return xerrors.New("parquet: eof exception")
    85  	}
    86  
    87  	var totalValues uint64
    88  	if totalValues, ok = d.bitdecoder.GetVlqInt(); !ok {
    89  		return xerrors.New("parquet: eof exception")
    90  	}
    91  
    92  	if int(totalValues) != d.nvals {
    93  		return xerrors.New("parquet: mismatch between number of values and count in data header")
    94  	}
    95  
    96  	if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok {
    97  		return xerrors.New("parquet: eof exception")
    98  	}
    99  
   100  	if d.miniBlocks != 0 {
   101  		d.valsPerMini = uint32(d.blockSize / d.miniBlocks)
   102  	}
   103  	return nil
   104  }
   105  
   106  // initialize a block to decode
   107  func (d *deltaBitPackDecoder) initBlock() error {
   108  	// first we grab the min delta value that we'll start from
   109  	var ok bool
   110  	if d.minDelta, ok = d.bitdecoder.GetZigZagVlqInt(); !ok {
   111  		return xerrors.New("parquet: eof exception")
   112  	}
   113  
   114  	// ensure we have enough space for our miniblocks to decode the widths
   115  	d.deltaBitWidths.Resize(int(d.miniBlocks))
   116  
   117  	var err error
   118  	for i := uint64(0); i < d.miniBlocks; i++ {
   119  		if d.deltaBitWidths.Bytes()[i], err = d.bitdecoder.ReadByte(); err != nil {
   120  			return err
   121  		}
   122  	}
   123  
   124  	d.miniBlockIdx = 0
   125  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[0]
   126  	d.currentBlockVals = uint32(d.blockSize)
   127  	return nil
   128  }
   129  
// DeltaBitPackInt32Decoder decodes Int32 values which are packed using the Delta BitPacking algorithm.
type DeltaBitPackInt32Decoder struct {
	*deltaBitPackDecoder

	// miniBlockValues holds the reconstructed values of the most recently
	// unpacked miniblock; Decode copies out of it.
	miniBlockValues []int32
}
   136  
   137  func (d *DeltaBitPackInt32Decoder) unpackNextMini() error {
   138  	if d.miniBlockValues == nil {
   139  		d.miniBlockValues = make([]int32, 0, int(d.valsPerMini))
   140  	} else {
   141  		d.miniBlockValues = d.miniBlockValues[:0]
   142  	}
   143  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
   144  	d.currentMiniBlockVals = d.valsPerMini
   145  
   146  	for j := 0; j < int(d.valsPerMini); j++ {
   147  		delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
   148  		if !ok {
   149  			return xerrors.New("parquet: eof exception")
   150  		}
   151  
   152  		d.lastVal += int64(delta) + int64(d.minDelta)
   153  		d.miniBlockValues = append(d.miniBlockValues, int32(d.lastVal))
   154  	}
   155  	d.miniBlockIdx++
   156  	return nil
   157  }
   158  
   159  // Decode retrieves min(remaining values, len(out)) values from the data and returns the number
   160  // of values actually decoded and any errors encountered.
   161  func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) {
   162  	max := utils.MinInt(len(out), d.nvals)
   163  	if max == 0 {
   164  		return 0, nil
   165  	}
   166  
   167  	out = out[:max]
   168  	if !d.usedFirst { // starting value to calculate deltas against
   169  		out[0] = int32(d.lastVal)
   170  		out = out[1:]
   171  		d.usedFirst = true
   172  	}
   173  
   174  	var err error
   175  	for len(out) > 0 { // unpack mini blocks until we get all the values we need
   176  		if d.currentBlockVals == 0 {
   177  			err = d.initBlock()
   178  		}
   179  		if d.currentMiniBlockVals == 0 {
   180  			err = d.unpackNextMini()
   181  		}
   182  		if err != nil {
   183  			return 0, err
   184  		}
   185  
   186  		// copy as many values from our mini block as we can into out
   187  		start := int(d.valsPerMini - d.currentMiniBlockVals)
   188  		end := utils.MinInt(int(d.valsPerMini), len(out))
   189  		copy(out, d.miniBlockValues[start:end])
   190  
   191  		numCopied := end - start
   192  		out = out[numCopied:]
   193  		d.currentBlockVals -= uint32(numCopied)
   194  		d.currentMiniBlockVals -= uint32(numCopied)
   195  	}
   196  	return max, nil
   197  }
   198  
   199  // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap
   200  func (d *DeltaBitPackInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   201  	toread := len(out) - nullCount
   202  	values, err := d.Decode(out[:toread])
   203  	if err != nil {
   204  		return values, err
   205  	}
   206  	if values != toread {
   207  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   208  	}
   209  
   210  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   211  }
   212  
   213  // Type returns the physical parquet type that this decoder decodes, in this case Int32
   214  func (DeltaBitPackInt32Decoder) Type() parquet.Type {
   215  	return parquet.Types.Int32
   216  }
   217  
// DeltaBitPackInt64Decoder decodes a delta bit packed int64 column of data.
type DeltaBitPackInt64Decoder struct {
	*deltaBitPackDecoder

	// miniBlockValues holds the reconstructed values of the most recently
	// unpacked miniblock; Decode copies out of it.
	miniBlockValues []int64
}
   224  
   225  func (d *DeltaBitPackInt64Decoder) unpackNextMini() error {
   226  	if d.miniBlockValues == nil {
   227  		d.miniBlockValues = make([]int64, 0, int(d.valsPerMini))
   228  	} else {
   229  		d.miniBlockValues = d.miniBlockValues[:0]
   230  	}
   231  
   232  	d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
   233  	d.currentMiniBlockVals = d.valsPerMini
   234  
   235  	for j := 0; j < int(d.valsPerMini); j++ {
   236  		delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
   237  		if !ok {
   238  			return xerrors.New("parquet: eof exception")
   239  		}
   240  
   241  		d.lastVal += int64(delta) + int64(d.minDelta)
   242  		d.miniBlockValues = append(d.miniBlockValues, d.lastVal)
   243  	}
   244  	d.miniBlockIdx++
   245  	return nil
   246  }
   247  
   248  // Decode retrieves min(remaining values, len(out)) values from the data and returns the number
   249  // of values actually decoded and any errors encountered.
   250  func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) {
   251  	max := utils.MinInt(len(out), d.nvals)
   252  	if max == 0 {
   253  		return 0, nil
   254  	}
   255  
   256  	out = out[:max]
   257  	if !d.usedFirst {
   258  		out[0] = d.lastVal
   259  		out = out[1:]
   260  		d.usedFirst = true
   261  	}
   262  
   263  	var err error
   264  	for len(out) > 0 {
   265  		if d.currentBlockVals == 0 {
   266  			err = d.initBlock()
   267  		}
   268  		if d.currentMiniBlockVals == 0 {
   269  			err = d.unpackNextMini()
   270  		}
   271  
   272  		if err != nil {
   273  			return 0, err
   274  		}
   275  
   276  		start := int(d.valsPerMini - d.currentMiniBlockVals)
   277  		end := utils.MinInt(int(d.valsPerMini), len(out))
   278  		copy(out, d.miniBlockValues[start:end])
   279  
   280  		numCopied := end - start
   281  		out = out[numCopied:]
   282  		d.currentBlockVals -= uint32(numCopied)
   283  		d.currentMiniBlockVals -= uint32(numCopied)
   284  	}
   285  	return max, nil
   286  }
   287  
   288  // Type returns the physical parquet type that this decoder decodes, in this case Int64
   289  func (DeltaBitPackInt64Decoder) Type() parquet.Type {
   290  	return parquet.Types.Int64
   291  }
   292  
   293  // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap
   294  func (d DeltaBitPackInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   295  	toread := len(out) - nullCount
   296  	values, err := d.Decode(out[:toread])
   297  	if err != nil {
   298  		return values, err
   299  	}
   300  	if values != toread {
   301  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   302  	}
   303  
   304  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   305  }
   306  
// Defaults applied by deltaBitPackEncoder when it starts encoding, plus the
// size of the scratch buffer used for the header.
const (
	// block size must be a multiple of 128
	defaultBlockSize     = 128
	// number of miniblocks each block is split into
	defaultNumMiniBlocks = 4
	// block size / number of mini blocks must result in a multiple of 32
	defaultNumValuesPerMini = 32
	// max size of the header for the delta blocks; FlushValues allocates a
	// buffer of this many bytes to write the header into
	maxHeaderWriterSize = 32
)
   316  
// deltaBitPackEncoder is an encoder for the DeltaBinary Packing format
// as per the parquet spec.
//
// Consists of a header followed by blocks of delta encoded values binary packed.
//
//	Format
//		[header] [block 1] [block 2] ... [block N]
//
//	Header
//		[block size] [number of mini blocks per block] [total value count] [first value]
//
//	Block
//		[min delta] [list of bitwidths of the miniblocks] [miniblocks...]
//
// Blocks are written to the encoder's sink as they fill up. The header is only
// produced when FlushValues is called: it is written into a separate buffer
// and the encoded blocks are appended after it before the result is returned.
type deltaBitPackEncoder struct {
	encoder

	// bitWriter writes blocks into the encoder's sink; it is created when the
	// first value is put and is nil before that.
	bitWriter  *utils.BitWriter
	// totalVals counts the values put since the last FlushValues.
	totalVals  uint64
	// firstVal is the first value put, stored in the header.
	firstVal   int64
	// currentVal is the most recent value seen; the next delta is taken against it.
	currentVal int64

	// blockSize, miniBlockSize and numMiniBlocks are set from the package defaults.
	blockSize     uint64
	miniBlockSize uint64
	numMiniBlocks uint64
	// deltas buffers the deltas of the block currently being filled.
	deltas        []int64
}
   346  
   347  // flushBlock flushes out a finished block for writing to the underlying encoder
   348  func (enc *deltaBitPackEncoder) flushBlock() {
   349  	if len(enc.deltas) == 0 {
   350  		return
   351  	}
   352  
   353  	// determine the minimum delta value
   354  	minDelta := int64(math.MaxInt64)
   355  	for _, delta := range enc.deltas {
   356  		if delta < minDelta {
   357  			minDelta = delta
   358  		}
   359  	}
   360  
   361  	enc.bitWriter.WriteZigZagVlqInt(minDelta)
   362  	// reserve enough bytes to write out our miniblock deltas
   363  	offset := enc.bitWriter.ReserveBytes(int(enc.numMiniBlocks))
   364  
   365  	valuesToWrite := int64(len(enc.deltas))
   366  	for i := 0; i < int(enc.numMiniBlocks); i++ {
   367  		n := utils.Min(int64(enc.miniBlockSize), valuesToWrite)
   368  		if n == 0 {
   369  			break
   370  		}
   371  
   372  		maxDelta := int64(math.MinInt64)
   373  		start := i * int(enc.miniBlockSize)
   374  		for _, val := range enc.deltas[start : start+int(n)] {
   375  			maxDelta = utils.Max(maxDelta, val)
   376  		}
   377  
   378  		// compute bit width to store (max_delta - min_delta)
   379  		width := uint(bits.Len64(uint64(maxDelta - minDelta)))
   380  		// write out the bit width we used into the bytes we reserved earlier
   381  		enc.bitWriter.WriteAt([]byte{byte(width)}, int64(offset+i))
   382  
   383  		// write out our deltas
   384  		for _, val := range enc.deltas[start : start+int(n)] {
   385  			enc.bitWriter.WriteValue(uint64(val-minDelta), width)
   386  		}
   387  
   388  		valuesToWrite -= n
   389  
   390  		// pad the last block if n < miniBlockSize
   391  		for ; n < int64(enc.miniBlockSize); n++ {
   392  			enc.bitWriter.WriteValue(0, width)
   393  		}
   394  	}
   395  	enc.deltas = enc.deltas[:0]
   396  }
   397  
   398  // putInternal is the implementation for actually writing data which must be
   399  // integral data as int, int8, int32, or int64.
   400  func (enc *deltaBitPackEncoder) putInternal(data interface{}) {
   401  	v := reflect.ValueOf(data)
   402  	if v.Len() == 0 {
   403  		return
   404  	}
   405  
   406  	idx := 0
   407  	if enc.totalVals == 0 {
   408  		enc.blockSize = defaultBlockSize
   409  		enc.numMiniBlocks = defaultNumMiniBlocks
   410  		enc.miniBlockSize = defaultNumValuesPerMini
   411  
   412  		enc.firstVal = v.Index(0).Int()
   413  		enc.currentVal = enc.firstVal
   414  		idx = 1
   415  
   416  		enc.bitWriter = utils.NewBitWriter(enc.sink)
   417  	}
   418  
   419  	enc.totalVals += uint64(v.Len())
   420  	for ; idx < v.Len(); idx++ {
   421  		val := v.Index(idx).Int()
   422  		enc.deltas = append(enc.deltas, val-enc.currentVal)
   423  		enc.currentVal = val
   424  		if len(enc.deltas) == int(enc.blockSize) {
   425  			enc.flushBlock()
   426  		}
   427  	}
   428  }
   429  
   430  // FlushValues flushes any remaining data and returns the finished encoded buffer
   431  // or returns nil and any error encountered during flushing.
   432  func (enc *deltaBitPackEncoder) FlushValues() (Buffer, error) {
   433  	if enc.bitWriter != nil {
   434  		// write any remaining values
   435  		enc.flushBlock()
   436  		enc.bitWriter.Flush(true)
   437  	} else {
   438  		enc.blockSize = defaultBlockSize
   439  		enc.numMiniBlocks = defaultNumMiniBlocks
   440  		enc.miniBlockSize = defaultNumValuesPerMini
   441  	}
   442  
   443  	buffer := make([]byte, maxHeaderWriterSize)
   444  	headerWriter := utils.NewBitWriter(utils.NewWriterAtBuffer(buffer))
   445  
   446  	headerWriter.WriteVlqInt(uint64(enc.blockSize))
   447  	headerWriter.WriteVlqInt(uint64(enc.numMiniBlocks))
   448  	headerWriter.WriteVlqInt(uint64(enc.totalVals))
   449  	headerWriter.WriteZigZagVlqInt(int64(enc.firstVal))
   450  	headerWriter.Flush(false)
   451  
   452  	buffer = buffer[:headerWriter.Written()]
   453  	enc.totalVals = 0
   454  
   455  	if enc.bitWriter != nil {
   456  		flushed := enc.sink.Finish()
   457  		defer flushed.Release()
   458  
   459  		buffer = append(buffer, flushed.Buf()[:enc.bitWriter.Written()]...)
   460  	}
   461  	return poolBuffer{memory.NewBufferBytes(buffer)}, nil
   462  }
   463  
   464  // EstimatedDataEncodedSize returns the current amount of data actually flushed out and written
   465  func (enc *deltaBitPackEncoder) EstimatedDataEncodedSize() int64 {
   466  	return int64(enc.bitWriter.Written())
   467  }
   468  
// DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data.
// It embeds the shared deltaBitPackEncoder, which does the actual encoding.
type DeltaBitPackInt32Encoder struct {
	*deltaBitPackEncoder
}
   473  
   474  // Put writes the values from the provided slice of int32 to the encoder
   475  func (enc DeltaBitPackInt32Encoder) Put(in []int32) {
   476  	enc.putInternal(in)
   477  }
   478  
   479  // PutSpaced takes a slice of int32 along with a bitmap that describes the nulls and an offset into the bitmap
   480  // in order to write spaced data to the encoder.
   481  func (enc DeltaBitPackInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) {
   482  	buffer := memory.NewResizableBuffer(enc.mem)
   483  	buffer.Reserve(arrow.Int32Traits.BytesRequired(len(in)))
   484  	defer buffer.Release()
   485  
   486  	data := arrow.Int32Traits.CastFromBytes(buffer.Buf())
   487  	nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   488  	enc.Put(data[:nvalid])
   489  }
   490  
   491  // Type returns the underlying physical type this encoder works with, in this case Int32
   492  func (DeltaBitPackInt32Encoder) Type() parquet.Type {
   493  	return parquet.Types.Int32
   494  }
   495  
// DeltaBitPackInt64Encoder is an encoder for the delta bitpacking encoding for int64 data.
// It embeds the shared deltaBitPackEncoder, which does the actual encoding.
type DeltaBitPackInt64Encoder struct {
	*deltaBitPackEncoder
}
   500  
   501  // Put writes the values from the provided slice of int64 to the encoder
   502  func (enc DeltaBitPackInt64Encoder) Put(in []int64) {
   503  	enc.putInternal(in)
   504  }
   505  
   506  // PutSpaced takes a slice of int64 along with a bitmap that describes the nulls and an offset into the bitmap
   507  // in order to write spaced data to the encoder.
   508  func (enc DeltaBitPackInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) {
   509  	buffer := memory.NewResizableBuffer(enc.mem)
   510  	buffer.Reserve(arrow.Int64Traits.BytesRequired(len(in)))
   511  	defer buffer.Release()
   512  
   513  	data := arrow.Int64Traits.CastFromBytes(buffer.Buf())
   514  	nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   515  	enc.Put(data[:nvalid])
   516  }
   517  
   518  // Type returns the underlying physical type this encoder works with, in this case Int64
   519  func (DeltaBitPackInt64Encoder) Type() parquet.Type {
   520  	return parquet.Types.Int64
   521  }