github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/binary_packed.go (about)

     1  package delta
     2  
     3  import (
     4  	"encoding/binary"
     5  	"fmt"
     6  	"io"
     7  	"math"
     8  	"math/bits"
     9  
    10  	"github.com/vc42/parquet-go/encoding"
    11  	"github.com/vc42/parquet-go/format"
    12  	"github.com/vc42/parquet-go/internal/bitpack"
    13  	"github.com/vc42/parquet-go/internal/unsafecast"
    14  )
    15  
    16  type BinaryPackedEncoding struct {
    17  	encoding.NotSupported
    18  }
    19  
    20  func (e *BinaryPackedEncoding) String() string {
    21  	return "DELTA_BINARY_PACKED"
    22  }
    23  
    24  func (e *BinaryPackedEncoding) Encoding() format.Encoding {
    25  	return format.DeltaBinaryPacked
    26  }
    27  
    28  func (e *BinaryPackedEncoding) EncodeInt32(dst, src []byte) ([]byte, error) {
    29  	if (len(src) % 4) != 0 {
    30  		return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "INT64", len(src))
    31  	}
    32  	return encodeInt32(dst[:0], bytesToInt32(src)), nil
    33  }
    34  
    35  func (e *BinaryPackedEncoding) EncodeInt64(dst, src []byte) ([]byte, error) {
    36  	if (len(src) % 8) != 0 {
    37  		return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "INT64", len(src))
    38  	}
    39  	return encodeInt64(dst[:0], bytesToInt64(src)), nil
    40  }
    41  
    42  func (e *BinaryPackedEncoding) DecodeInt32(dst, src []byte) ([]byte, error) {
    43  	dst, _, err := decodeInt32(dst[:0], src)
    44  	return dst, e.wrap(err)
    45  }
    46  
    47  func (e *BinaryPackedEncoding) DecodeInt64(dst, src []byte) ([]byte, error) {
    48  	dst, _, err := decodeInt64(dst[:0], src)
    49  	return dst, e.wrap(err)
    50  }
    51  
    52  func (e *BinaryPackedEncoding) wrap(err error) error {
    53  	if err != nil {
    54  		err = encoding.Error(e, err)
    55  	}
    56  	return err
    57  }
    58  
    59  const (
    60  	blockSize     = 128
    61  	numMiniBlocks = 4
    62  	miniBlockSize = blockSize / numMiniBlocks
    63  	// The parquet spec does not enforce a limit to the block size, but we need
    64  	// one otherwise invalid inputs may result in unbounded memory allocations.
    65  	//
    66  	// 65K+ values should be enough for any valid use case.
    67  	maxSupportedBlockSize = 65536
    68  
    69  	maxHeaderLength32    = 4 * binary.MaxVarintLen64
    70  	maxMiniBlockLength32 = binary.MaxVarintLen64 + numMiniBlocks + (4 * blockSize)
    71  
    72  	maxHeaderLength64    = 8 * binary.MaxVarintLen64
    73  	maxMiniBlockLength64 = binary.MaxVarintLen64 + numMiniBlocks + (8 * blockSize)
    74  )
    75  
    76  var (
    77  	encodeInt32 = encodeInt32Default
    78  	encodeInt64 = encodeInt64Default
    79  )
    80  
    81  func encodeInt32Default(dst []byte, src []int32) []byte {
    82  	totalValues := len(src)
    83  	firstValue := int32(0)
    84  	if totalValues > 0 {
    85  		firstValue = src[0]
    86  	}
    87  
    88  	n := len(dst)
    89  	dst = resize(dst, n+maxHeaderLength32)
    90  	dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, int64(firstValue))]
    91  
    92  	if totalValues < 2 {
    93  		return dst
    94  	}
    95  
    96  	lastValue := firstValue
    97  	for i := 1; i < len(src); i += blockSize {
    98  		block := [blockSize]int32{}
    99  		blockLength := copy(block[:], src[i:])
   100  
   101  		lastValue = blockDeltaInt32(&block, lastValue)
   102  		minDelta := blockMinInt32(&block)
   103  		blockSubInt32(&block, minDelta)
   104  		blockClearInt32(&block, blockLength)
   105  
   106  		bitWidths := [numMiniBlocks]byte{}
   107  		blockBitWidthsInt32(&bitWidths, &block)
   108  
   109  		n := len(dst)
   110  		dst = resize(dst, n+maxMiniBlockLength32+4)
   111  		n += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths)
   112  
   113  		for i, bitWidth := range bitWidths {
   114  			if bitWidth != 0 {
   115  				miniBlock := (*[miniBlockSize]int32)(block[i*miniBlockSize:])
   116  				encodeMiniBlockInt32(dst[n:], miniBlock, uint(bitWidth))
   117  				n += (miniBlockSize * int(bitWidth)) / 8
   118  			}
   119  		}
   120  
   121  		dst = dst[:n]
   122  	}
   123  
   124  	return dst
   125  }
   126  
   127  func encodeInt64Default(dst []byte, src []int64) []byte {
   128  	totalValues := len(src)
   129  	firstValue := int64(0)
   130  	if totalValues > 0 {
   131  		firstValue = src[0]
   132  	}
   133  
   134  	n := len(dst)
   135  	dst = resize(dst, n+maxHeaderLength64)
   136  	dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, firstValue)]
   137  
   138  	if totalValues < 2 {
   139  		return dst
   140  	}
   141  
   142  	lastValue := firstValue
   143  	for i := 1; i < len(src); i += blockSize {
   144  		block := [blockSize]int64{}
   145  		blockLength := copy(block[:], src[i:])
   146  
   147  		lastValue = blockDeltaInt64(&block, lastValue)
   148  		minDelta := blockMinInt64(&block)
   149  		blockSubInt64(&block, minDelta)
   150  		blockClearInt64(&block, blockLength)
   151  
   152  		bitWidths := [numMiniBlocks]byte{}
   153  		blockBitWidthsInt64(&bitWidths, &block)
   154  
   155  		n := len(dst)
   156  		dst = resize(dst, n+maxMiniBlockLength64+8)
   157  		n += encodeBlockHeader(dst[n:], minDelta, bitWidths)
   158  
   159  		for i, bitWidth := range bitWidths {
   160  			if bitWidth != 0 {
   161  				miniBlock := (*[miniBlockSize]int64)(block[i*miniBlockSize:])
   162  				encodeMiniBlockInt64(dst[n:], miniBlock, uint(bitWidth))
   163  				n += (miniBlockSize * int(bitWidth)) / 8
   164  			}
   165  		}
   166  
   167  		dst = dst[:n]
   168  	}
   169  
   170  	return dst
   171  }
   172  
   173  func encodeBinaryPackedHeader(dst []byte, blockSize, numMiniBlocks, totalValues int, firstValue int64) (n int) {
   174  	n += binary.PutUvarint(dst[n:], uint64(blockSize))
   175  	n += binary.PutUvarint(dst[n:], uint64(numMiniBlocks))
   176  	n += binary.PutUvarint(dst[n:], uint64(totalValues))
   177  	n += binary.PutVarint(dst[n:], firstValue)
   178  	return n
   179  }
   180  
   181  func encodeBlockHeader(dst []byte, minDelta int64, bitWidths [numMiniBlocks]byte) (n int) {
   182  	n += binary.PutVarint(dst, int64(minDelta))
   183  	n += copy(dst[n:], bitWidths[:])
   184  	return n
   185  }
   186  
   187  func blockClearInt32(block *[blockSize]int32, blockLength int) {
   188  	if blockLength < blockSize {
   189  		clear := block[blockLength:]
   190  		for i := range clear {
   191  			clear[i] = 0
   192  		}
   193  	}
   194  }
   195  
   196  func blockDeltaInt32(block *[blockSize]int32, lastValue int32) int32 {
   197  	for i, v := range block {
   198  		block[i], lastValue = v-lastValue, v
   199  	}
   200  	return lastValue
   201  }
   202  
   203  func blockMinInt32(block *[blockSize]int32) int32 {
   204  	min := block[0]
   205  	for _, v := range block[1:] {
   206  		if v < min {
   207  			min = v
   208  		}
   209  	}
   210  	return min
   211  }
   212  
   213  func blockSubInt32(block *[blockSize]int32, value int32) {
   214  	for i := range block {
   215  		block[i] -= value
   216  	}
   217  }
   218  
   219  func blockBitWidthsInt32(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32) {
   220  	for i := range bitWidths {
   221  		j := (i + 0) * miniBlockSize
   222  		k := (i + 1) * miniBlockSize
   223  		bitWidth := 0
   224  
   225  		for _, v := range block[j:k] {
   226  			if n := bits.Len32(uint32(v)); n > bitWidth {
   227  				bitWidth = n
   228  			}
   229  		}
   230  
   231  		bitWidths[i] = byte(bitWidth)
   232  	}
   233  }
   234  
   235  func blockClearInt64(block *[blockSize]int64, blockLength int) {
   236  	if blockLength < blockSize {
   237  		clear := block[blockLength:]
   238  		for i := range clear {
   239  			clear[i] = 0
   240  		}
   241  	}
   242  }
   243  
   244  func blockDeltaInt64(block *[blockSize]int64, lastValue int64) int64 {
   245  	for i, v := range block {
   246  		block[i], lastValue = v-lastValue, v
   247  	}
   248  	return lastValue
   249  }
   250  
   251  func blockMinInt64(block *[blockSize]int64) int64 {
   252  	min := block[0]
   253  	for _, v := range block[1:] {
   254  		if v < min {
   255  			min = v
   256  		}
   257  	}
   258  	return min
   259  }
   260  
   261  func blockSubInt64(block *[blockSize]int64, value int64) {
   262  	for i := range block {
   263  		block[i] -= value
   264  	}
   265  }
   266  
   267  func blockBitWidthsInt64(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64) {
   268  	for i := range bitWidths {
   269  		j := (i + 0) * miniBlockSize
   270  		k := (i + 1) * miniBlockSize
   271  		bitWidth := 0
   272  
   273  		for _, v := range block[j:k] {
   274  			if n := bits.Len64(uint64(v)); n > bitWidth {
   275  				bitWidth = n
   276  			}
   277  		}
   278  
   279  		bitWidths[i] = byte(bitWidth)
   280  	}
   281  }
   282  
   283  func decodeInt32(dst, src []byte) ([]byte, []byte, error) {
   284  	blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src)
   285  	if err != nil {
   286  		return dst, src, err
   287  	}
   288  	if totalValues == 0 {
   289  		return dst, src, nil
   290  	}
   291  	if firstValue < math.MinInt32 || firstValue > math.MaxInt32 {
   292  		return dst, src, fmt.Errorf("first value out of range: %d", firstValue)
   293  	}
   294  
   295  	writeOffset := len(dst)
   296  	dst = resize(dst, len(dst)+4*totalValues)
   297  	out := unsafecast.BytesToInt32(dst)
   298  	out[writeOffset] = int32(firstValue)
   299  	writeOffset++
   300  	totalValues--
   301  	lastValue := int32(firstValue)
   302  	numValuesInMiniBlock := blockSize / numMiniBlocks
   303  
   304  	const padding = 16
   305  	miniBlockTemp := make([]byte, 256+padding)
   306  
   307  	for totalValues > 0 && len(src) > 0 {
   308  		var minDelta int64
   309  		var bitWidths []byte
   310  		minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks)
   311  		if err != nil {
   312  			return dst, src, err
   313  		}
   314  
   315  		blockOffset := writeOffset
   316  
   317  		for _, bitWidth := range bitWidths {
   318  			n := min(numValuesInMiniBlock, totalValues)
   319  			if bitWidth != 0 {
   320  				miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8
   321  				miniBlockData := src
   322  				if miniBlockSize <= len(src) {
   323  					miniBlockData = miniBlockData[:miniBlockSize]
   324  				}
   325  				src = src[len(miniBlockData):]
   326  				if cap(miniBlockData) < miniBlockSize+bitpack.PaddingInt32 {
   327  					miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt32)
   328  					miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)]
   329  				}
   330  				miniBlockData = miniBlockData[:miniBlockSize]
   331  				bitpack.UnpackInt32(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth))
   332  			}
   333  			writeOffset += n
   334  			totalValues -= n
   335  			if totalValues == 0 {
   336  				break
   337  			}
   338  		}
   339  
   340  		lastValue = decodeBlockInt32(out[blockOffset:writeOffset], int32(minDelta), lastValue)
   341  	}
   342  
   343  	if totalValues > 0 {
   344  		return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF)
   345  	}
   346  
   347  	return dst, src, nil
   348  }
   349  
   350  func decodeInt64(dst, src []byte) ([]byte, []byte, error) {
   351  	blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src)
   352  	if err != nil {
   353  		return dst, src, err
   354  	}
   355  	if totalValues == 0 {
   356  		return dst, src, nil
   357  	}
   358  
   359  	writeOffset := len(dst)
   360  	dst = resize(dst, len(dst)+8*totalValues)
   361  	out := unsafecast.BytesToInt64(dst)
   362  	out[writeOffset] = firstValue
   363  	writeOffset++
   364  	totalValues--
   365  	lastValue := firstValue
   366  	numValuesInMiniBlock := blockSize / numMiniBlocks
   367  
   368  	const padding = 16
   369  	miniBlockTemp := make([]byte, 512+padding)
   370  
   371  	for totalValues > 0 && len(src) > 0 {
   372  		var minDelta int64
   373  		var bitWidths []byte
   374  		minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks)
   375  		if err != nil {
   376  			return dst, src, err
   377  		}
   378  		blockOffset := writeOffset
   379  
   380  		for _, bitWidth := range bitWidths {
   381  			n := min(numValuesInMiniBlock, totalValues)
   382  			if bitWidth != 0 {
   383  				miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8
   384  				miniBlockData := src
   385  				if miniBlockSize <= len(src) {
   386  					miniBlockData = src[:miniBlockSize]
   387  				}
   388  				src = src[len(miniBlockData):]
   389  				if len(miniBlockData) < miniBlockSize+bitpack.PaddingInt64 {
   390  					miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt64)
   391  					miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)]
   392  				}
   393  				miniBlockData = miniBlockData[:miniBlockSize]
   394  				bitpack.UnpackInt64(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth))
   395  			}
   396  			writeOffset += n
   397  			totalValues -= n
   398  			if totalValues == 0 {
   399  				break
   400  			}
   401  		}
   402  
   403  		lastValue = decodeBlockInt64(out[blockOffset:writeOffset], minDelta, lastValue)
   404  	}
   405  
   406  	if totalValues > 0 {
   407  		return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF)
   408  	}
   409  
   410  	return dst, src, nil
   411  }
   412  
   413  func decodeBinaryPackedHeader(src []byte) (blockSize, numMiniBlocks, totalValues int, firstValue int64, next []byte, err error) {
   414  	u := uint64(0)
   415  	n := 0
   416  	i := 0
   417  
   418  	if u, n, err = decodeUvarint(src[i:], "block size"); err != nil {
   419  		return
   420  	}
   421  	i += n
   422  	blockSize = int(u)
   423  
   424  	if u, n, err = decodeUvarint(src[i:], "number of mini-blocks"); err != nil {
   425  		return
   426  	}
   427  	i += n
   428  	numMiniBlocks = int(u)
   429  
   430  	if u, n, err = decodeUvarint(src[i:], "total values"); err != nil {
   431  		return
   432  	}
   433  	i += n
   434  	totalValues = int(u)
   435  
   436  	if firstValue, n, err = decodeVarint(src[i:], "first value"); err != nil {
   437  		return
   438  	}
   439  	i += n
   440  
   441  	if numMiniBlocks == 0 {
   442  		err = fmt.Errorf("invalid number of mini block (%d)", numMiniBlocks)
   443  	} else if (blockSize <= 0) || (blockSize%128) != 0 {
   444  		err = fmt.Errorf("invalid block size is not a multiple of 128 (%d)", blockSize)
   445  	} else if blockSize > maxSupportedBlockSize {
   446  		err = fmt.Errorf("invalid block size is too large (%d)", blockSize)
   447  	} else if miniBlockSize := blockSize / numMiniBlocks; (numMiniBlocks <= 0) || (miniBlockSize%32) != 0 {
   448  		err = fmt.Errorf("invalid mini block size is not a multiple of 32 (%d)", miniBlockSize)
   449  	} else if totalValues < 0 {
   450  		err = fmt.Errorf("invalid total number of values is negative (%d)", totalValues)
   451  	} else if totalValues > math.MaxInt32 {
   452  		err = fmt.Errorf("too many values: %d", totalValues)
   453  	}
   454  
   455  	return blockSize, numMiniBlocks, totalValues, firstValue, src[i:], err
   456  }
   457  
   458  func decodeBinaryPackedBlock(src []byte, numMiniBlocks int) (minDelta int64, bitWidths, next []byte, err error) {
   459  	minDelta, n, err := decodeVarint(src, "min delta")
   460  	if err != nil {
   461  		return 0, nil, src, err
   462  	}
   463  	src = src[n:]
   464  	if len(src) < numMiniBlocks {
   465  		bitWidths, next = src, nil
   466  	} else {
   467  		bitWidths, next = src[:numMiniBlocks], src[numMiniBlocks:]
   468  	}
   469  	return minDelta, bitWidths, next, nil
   470  }
   471  
   472  func decodeUvarint(buf []byte, what string) (u uint64, n int, err error) {
   473  	u, n = binary.Uvarint(buf)
   474  	if n == 0 {
   475  		return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF)
   476  	}
   477  	if n < 0 {
   478  		return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf))
   479  	}
   480  	return u, n, nil
   481  }
   482  
   483  func decodeVarint(buf []byte, what string) (v int64, n int, err error) {
   484  	v, n = binary.Varint(buf)
   485  	if n == 0 {
   486  		return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF)
   487  	}
   488  	if n < 0 {
   489  		return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf))
   490  	}
   491  	return v, n, nil
   492  }