github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding/delta/binary_packed.go (about)

     1  package delta
     2  
     3  import (
     4  	"encoding/binary"
     5  	"fmt"
     6  	"io"
     7  	"math"
     8  	"math/bits"
     9  
    10  	"github.com/segmentio/parquet-go/encoding"
    11  	"github.com/segmentio/parquet-go/format"
    12  	"github.com/segmentio/parquet-go/internal/bitpack"
    13  	"github.com/segmentio/parquet-go/internal/unsafecast"
    14  )
    15  
    16  type BinaryPackedEncoding struct {
    17  	encoding.NotSupported
    18  }
    19  
    20  func (e *BinaryPackedEncoding) String() string {
    21  	return "DELTA_BINARY_PACKED"
    22  }
    23  
    24  func (e *BinaryPackedEncoding) Encoding() format.Encoding {
    25  	return format.DeltaBinaryPacked
    26  }
    27  
    28  func (e *BinaryPackedEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {
    29  	return encodeInt32(dst[:0], src), nil
    30  }
    31  
    32  func (e *BinaryPackedEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) {
    33  	return encodeInt64(dst[:0], src), nil
    34  }
    35  
    36  func (e *BinaryPackedEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {
    37  	buf := unsafecast.Int32ToBytes(dst)
    38  	buf, _, err := decodeInt32(buf[:0], src)
    39  	return unsafecast.BytesToInt32(buf), e.wrap(err)
    40  }
    41  
    42  func (e *BinaryPackedEncoding) DecodeInt64(dst []int64, src []byte) ([]int64, error) {
    43  	buf := unsafecast.Int64ToBytes(dst)
    44  	buf, _, err := decodeInt64(buf[:0], src)
    45  	return unsafecast.BytesToInt64(buf), e.wrap(err)
    46  }
    47  
    48  func (e *BinaryPackedEncoding) wrap(err error) error {
    49  	if err != nil {
    50  		err = encoding.Error(e, err)
    51  	}
    52  	return err
    53  }
    54  
    55  const (
    56  	blockSize     = 128
    57  	numMiniBlocks = 4
    58  	miniBlockSize = blockSize / numMiniBlocks
    59  	// The parquet spec does not enforce a limit to the block size, but we need
    60  	// one otherwise invalid inputs may result in unbounded memory allocations.
    61  	//
    62  	// 65K+ values should be enough for any valid use case.
    63  	maxSupportedBlockSize = 65536
    64  
    65  	maxHeaderLength32    = 4 * binary.MaxVarintLen64
    66  	maxMiniBlockLength32 = binary.MaxVarintLen64 + numMiniBlocks + (4 * blockSize)
    67  
    68  	maxHeaderLength64    = 8 * binary.MaxVarintLen64
    69  	maxMiniBlockLength64 = binary.MaxVarintLen64 + numMiniBlocks + (8 * blockSize)
    70  )
    71  
    72  var (
    73  	encodeInt32 = encodeInt32Default
    74  	encodeInt64 = encodeInt64Default
    75  )
    76  
    77  func encodeInt32Default(dst []byte, src []int32) []byte {
    78  	totalValues := len(src)
    79  	firstValue := int32(0)
    80  	if totalValues > 0 {
    81  		firstValue = src[0]
    82  	}
    83  
    84  	n := len(dst)
    85  	dst = resize(dst, n+maxHeaderLength32)
    86  	dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, int64(firstValue))]
    87  
    88  	if totalValues < 2 {
    89  		return dst
    90  	}
    91  
    92  	lastValue := firstValue
    93  	for i := 1; i < len(src); i += blockSize {
    94  		block := [blockSize]int32{}
    95  		blockLength := copy(block[:], src[i:])
    96  
    97  		lastValue = blockDeltaInt32(&block, lastValue)
    98  		minDelta := blockMinInt32(&block)
    99  		blockSubInt32(&block, minDelta)
   100  		blockClearInt32(&block, blockLength)
   101  
   102  		bitWidths := [numMiniBlocks]byte{}
   103  		blockBitWidthsInt32(&bitWidths, &block)
   104  
   105  		n := len(dst)
   106  		dst = resize(dst, n+maxMiniBlockLength32+4)
   107  		n += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths)
   108  
   109  		for i, bitWidth := range bitWidths {
   110  			if bitWidth != 0 {
   111  				miniBlock := (*[miniBlockSize]int32)(block[i*miniBlockSize:])
   112  				encodeMiniBlockInt32(dst[n:], miniBlock, uint(bitWidth))
   113  				n += (miniBlockSize * int(bitWidth)) / 8
   114  			}
   115  		}
   116  
   117  		dst = dst[:n]
   118  	}
   119  
   120  	return dst
   121  }
   122  
   123  func encodeInt64Default(dst []byte, src []int64) []byte {
   124  	totalValues := len(src)
   125  	firstValue := int64(0)
   126  	if totalValues > 0 {
   127  		firstValue = src[0]
   128  	}
   129  
   130  	n := len(dst)
   131  	dst = resize(dst, n+maxHeaderLength64)
   132  	dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, firstValue)]
   133  
   134  	if totalValues < 2 {
   135  		return dst
   136  	}
   137  
   138  	lastValue := firstValue
   139  	for i := 1; i < len(src); i += blockSize {
   140  		block := [blockSize]int64{}
   141  		blockLength := copy(block[:], src[i:])
   142  
   143  		lastValue = blockDeltaInt64(&block, lastValue)
   144  		minDelta := blockMinInt64(&block)
   145  		blockSubInt64(&block, minDelta)
   146  		blockClearInt64(&block, blockLength)
   147  
   148  		bitWidths := [numMiniBlocks]byte{}
   149  		blockBitWidthsInt64(&bitWidths, &block)
   150  
   151  		n := len(dst)
   152  		dst = resize(dst, n+maxMiniBlockLength64+8)
   153  		n += encodeBlockHeader(dst[n:], minDelta, bitWidths)
   154  
   155  		for i, bitWidth := range bitWidths {
   156  			if bitWidth != 0 {
   157  				miniBlock := (*[miniBlockSize]int64)(block[i*miniBlockSize:])
   158  				encodeMiniBlockInt64(dst[n:], miniBlock, uint(bitWidth))
   159  				n += (miniBlockSize * int(bitWidth)) / 8
   160  			}
   161  		}
   162  
   163  		dst = dst[:n]
   164  	}
   165  
   166  	return dst
   167  }
   168  
   169  func encodeBinaryPackedHeader(dst []byte, blockSize, numMiniBlocks, totalValues int, firstValue int64) (n int) {
   170  	n += binary.PutUvarint(dst[n:], uint64(blockSize))
   171  	n += binary.PutUvarint(dst[n:], uint64(numMiniBlocks))
   172  	n += binary.PutUvarint(dst[n:], uint64(totalValues))
   173  	n += binary.PutVarint(dst[n:], firstValue)
   174  	return n
   175  }
   176  
   177  func encodeBlockHeader(dst []byte, minDelta int64, bitWidths [numMiniBlocks]byte) (n int) {
   178  	n += binary.PutVarint(dst, int64(minDelta))
   179  	n += copy(dst[n:], bitWidths[:])
   180  	return n
   181  }
   182  
   183  func blockClearInt32(block *[blockSize]int32, blockLength int) {
   184  	if blockLength < blockSize {
   185  		clear := block[blockLength:]
   186  		for i := range clear {
   187  			clear[i] = 0
   188  		}
   189  	}
   190  }
   191  
   192  func blockDeltaInt32(block *[blockSize]int32, lastValue int32) int32 {
   193  	for i, v := range block {
   194  		block[i], lastValue = v-lastValue, v
   195  	}
   196  	return lastValue
   197  }
   198  
   199  func blockMinInt32(block *[blockSize]int32) int32 {
   200  	min := block[0]
   201  	for _, v := range block[1:] {
   202  		if v < min {
   203  			min = v
   204  		}
   205  	}
   206  	return min
   207  }
   208  
   209  func blockSubInt32(block *[blockSize]int32, value int32) {
   210  	for i := range block {
   211  		block[i] -= value
   212  	}
   213  }
   214  
   215  func blockBitWidthsInt32(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32) {
   216  	for i := range bitWidths {
   217  		j := (i + 0) * miniBlockSize
   218  		k := (i + 1) * miniBlockSize
   219  		bitWidth := 0
   220  
   221  		for _, v := range block[j:k] {
   222  			if n := bits.Len32(uint32(v)); n > bitWidth {
   223  				bitWidth = n
   224  			}
   225  		}
   226  
   227  		bitWidths[i] = byte(bitWidth)
   228  	}
   229  }
   230  
   231  func blockClearInt64(block *[blockSize]int64, blockLength int) {
   232  	if blockLength < blockSize {
   233  		clear := block[blockLength:]
   234  		for i := range clear {
   235  			clear[i] = 0
   236  		}
   237  	}
   238  }
   239  
   240  func blockDeltaInt64(block *[blockSize]int64, lastValue int64) int64 {
   241  	for i, v := range block {
   242  		block[i], lastValue = v-lastValue, v
   243  	}
   244  	return lastValue
   245  }
   246  
   247  func blockMinInt64(block *[blockSize]int64) int64 {
   248  	min := block[0]
   249  	for _, v := range block[1:] {
   250  		if v < min {
   251  			min = v
   252  		}
   253  	}
   254  	return min
   255  }
   256  
   257  func blockSubInt64(block *[blockSize]int64, value int64) {
   258  	for i := range block {
   259  		block[i] -= value
   260  	}
   261  }
   262  
   263  func blockBitWidthsInt64(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64) {
   264  	for i := range bitWidths {
   265  		j := (i + 0) * miniBlockSize
   266  		k := (i + 1) * miniBlockSize
   267  		bitWidth := 0
   268  
   269  		for _, v := range block[j:k] {
   270  			if n := bits.Len64(uint64(v)); n > bitWidth {
   271  				bitWidth = n
   272  			}
   273  		}
   274  
   275  		bitWidths[i] = byte(bitWidth)
   276  	}
   277  }
   278  
   279  func decodeInt32(dst, src []byte) ([]byte, []byte, error) {
   280  	blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src)
   281  	if err != nil {
   282  		return dst, src, err
   283  	}
   284  	if totalValues == 0 {
   285  		return dst, src, nil
   286  	}
   287  	if firstValue < math.MinInt32 || firstValue > math.MaxInt32 {
   288  		return dst, src, fmt.Errorf("first value out of range: %d", firstValue)
   289  	}
   290  
   291  	writeOffset := len(dst)
   292  	dst = resize(dst, len(dst)+4*totalValues)
   293  	out := unsafecast.BytesToInt32(dst)
   294  	out[writeOffset] = int32(firstValue)
   295  	writeOffset++
   296  	totalValues--
   297  	lastValue := int32(firstValue)
   298  	numValuesInMiniBlock := blockSize / numMiniBlocks
   299  
   300  	const padding = 16
   301  	miniBlockTemp := make([]byte, 256+padding)
   302  
   303  	for totalValues > 0 && len(src) > 0 {
   304  		var minDelta int64
   305  		var bitWidths []byte
   306  		minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks)
   307  		if err != nil {
   308  			return dst, src, err
   309  		}
   310  
   311  		blockOffset := writeOffset
   312  
   313  		for _, bitWidth := range bitWidths {
   314  			n := min(numValuesInMiniBlock, totalValues)
   315  			if bitWidth != 0 {
   316  				miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8
   317  				miniBlockData := src
   318  				if miniBlockSize <= len(src) {
   319  					miniBlockData = miniBlockData[:miniBlockSize]
   320  				}
   321  				src = src[len(miniBlockData):]
   322  				if cap(miniBlockData) < miniBlockSize+bitpack.PaddingInt32 {
   323  					miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt32)
   324  					miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)]
   325  				}
   326  				miniBlockData = miniBlockData[:miniBlockSize]
   327  				bitpack.UnpackInt32(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth))
   328  			}
   329  			writeOffset += n
   330  			totalValues -= n
   331  			if totalValues == 0 {
   332  				break
   333  			}
   334  		}
   335  
   336  		lastValue = decodeBlockInt32(out[blockOffset:writeOffset], int32(minDelta), lastValue)
   337  	}
   338  
   339  	if totalValues > 0 {
   340  		return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF)
   341  	}
   342  
   343  	return dst, src, nil
   344  }
   345  
   346  func decodeInt64(dst, src []byte) ([]byte, []byte, error) {
   347  	blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src)
   348  	if err != nil {
   349  		return dst, src, err
   350  	}
   351  	if totalValues == 0 {
   352  		return dst, src, nil
   353  	}
   354  
   355  	writeOffset := len(dst)
   356  	dst = resize(dst, len(dst)+8*totalValues)
   357  	out := unsafecast.BytesToInt64(dst)
   358  	out[writeOffset] = firstValue
   359  	writeOffset++
   360  	totalValues--
   361  	lastValue := firstValue
   362  	numValuesInMiniBlock := blockSize / numMiniBlocks
   363  
   364  	const padding = 16
   365  	miniBlockTemp := make([]byte, 512+padding)
   366  
   367  	for totalValues > 0 && len(src) > 0 {
   368  		var minDelta int64
   369  		var bitWidths []byte
   370  		minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks)
   371  		if err != nil {
   372  			return dst, src, err
   373  		}
   374  		blockOffset := writeOffset
   375  
   376  		for _, bitWidth := range bitWidths {
   377  			n := min(numValuesInMiniBlock, totalValues)
   378  			if bitWidth != 0 {
   379  				miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8
   380  				miniBlockData := src
   381  				if miniBlockSize <= len(src) {
   382  					miniBlockData = src[:miniBlockSize]
   383  				}
   384  				src = src[len(miniBlockData):]
   385  				if len(miniBlockData) < miniBlockSize+bitpack.PaddingInt64 {
   386  					miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt64)
   387  					miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)]
   388  				}
   389  				miniBlockData = miniBlockData[:miniBlockSize]
   390  				bitpack.UnpackInt64(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth))
   391  			}
   392  			writeOffset += n
   393  			totalValues -= n
   394  			if totalValues == 0 {
   395  				break
   396  			}
   397  		}
   398  
   399  		lastValue = decodeBlockInt64(out[blockOffset:writeOffset], minDelta, lastValue)
   400  	}
   401  
   402  	if totalValues > 0 {
   403  		return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF)
   404  	}
   405  
   406  	return dst, src, nil
   407  }
   408  
   409  func decodeBinaryPackedHeader(src []byte) (blockSize, numMiniBlocks, totalValues int, firstValue int64, next []byte, err error) {
   410  	u := uint64(0)
   411  	n := 0
   412  	i := 0
   413  
   414  	if u, n, err = decodeUvarint(src[i:], "block size"); err != nil {
   415  		return
   416  	}
   417  	i += n
   418  	blockSize = int(u)
   419  
   420  	if u, n, err = decodeUvarint(src[i:], "number of mini-blocks"); err != nil {
   421  		return
   422  	}
   423  	i += n
   424  	numMiniBlocks = int(u)
   425  
   426  	if u, n, err = decodeUvarint(src[i:], "total values"); err != nil {
   427  		return
   428  	}
   429  	i += n
   430  	totalValues = int(u)
   431  
   432  	if firstValue, n, err = decodeVarint(src[i:], "first value"); err != nil {
   433  		return
   434  	}
   435  	i += n
   436  
   437  	if numMiniBlocks == 0 {
   438  		err = fmt.Errorf("invalid number of mini block (%d)", numMiniBlocks)
   439  	} else if (blockSize <= 0) || (blockSize%128) != 0 {
   440  		err = fmt.Errorf("invalid block size is not a multiple of 128 (%d)", blockSize)
   441  	} else if blockSize > maxSupportedBlockSize {
   442  		err = fmt.Errorf("invalid block size is too large (%d)", blockSize)
   443  	} else if miniBlockSize := blockSize / numMiniBlocks; (numMiniBlocks <= 0) || (miniBlockSize%32) != 0 {
   444  		err = fmt.Errorf("invalid mini block size is not a multiple of 32 (%d)", miniBlockSize)
   445  	} else if totalValues < 0 {
   446  		err = fmt.Errorf("invalid total number of values is negative (%d)", totalValues)
   447  	} else if totalValues > math.MaxInt32 {
   448  		err = fmt.Errorf("too many values: %d", totalValues)
   449  	}
   450  
   451  	return blockSize, numMiniBlocks, totalValues, firstValue, src[i:], err
   452  }
   453  
   454  func decodeBinaryPackedBlock(src []byte, numMiniBlocks int) (minDelta int64, bitWidths, next []byte, err error) {
   455  	minDelta, n, err := decodeVarint(src, "min delta")
   456  	if err != nil {
   457  		return 0, nil, src, err
   458  	}
   459  	src = src[n:]
   460  	if len(src) < numMiniBlocks {
   461  		bitWidths, next = src, nil
   462  	} else {
   463  		bitWidths, next = src[:numMiniBlocks], src[numMiniBlocks:]
   464  	}
   465  	return minDelta, bitWidths, next, nil
   466  }
   467  
   468  func decodeUvarint(buf []byte, what string) (u uint64, n int, err error) {
   469  	u, n = binary.Uvarint(buf)
   470  	if n == 0 {
   471  		return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF)
   472  	}
   473  	if n < 0 {
   474  		return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf))
   475  	}
   476  	return u, n, nil
   477  }
   478  
   479  func decodeVarint(buf []byte, what string) (v int64, n int, err error) {
   480  	v, n = binary.Varint(buf)
   481  	if n == 0 {
   482  		return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF)
   483  	}
   484  	if n < 0 {
   485  		return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf))
   486  	}
   487  	return v, n, nil
   488  }