github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/rle/rle.go (about)

     1  // Package rle implements the hybrid RLE/Bit-Packed encoding employed in
     2  // repetition and definition levels, dictionary indexed data pages, and
     3  // boolean values in the PLAIN encoding.
     4  //
     5  // https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3
     6  package rle
     7  
     8  import (
     9  	"encoding/binary"
    10  	"fmt"
    11  	"io"
    12  	"unsafe"
    13  
    14  	"github.com/vc42/parquet-go/encoding"
    15  	"github.com/vc42/parquet-go/format"
    16  	"github.com/vc42/parquet-go/internal/bitpack"
    17  	"github.com/vc42/parquet-go/internal/bytealg"
    18  	"github.com/vc42/parquet-go/internal/unsafecast"
    19  )
    20  
    21  const (
    22  	// This limit is intended to prevent unbounded memory allocations when
    23  	// decoding runs.
    24  	//
    25  	// We use a generous limit which allows for over 16 million values per page
    26  	// if there is only one run to encode the repetition or definition levels
    27  	// (this should be uncommon).
    28  	maxSupportedValueCount = 16 * 1024 * 1024
    29  )
    30  
// Encoding is the hybrid RLE/bit-packed encoding.
//
// The embedded encoding.NotSupported presumably supplies the methods this
// type does not implement itself (it is declared in the encoding package;
// confirm there).
type Encoding struct {
	encoding.NotSupported
	// BitWidth is the number of bits used per value. It is converted to uint
	// and handed to the level and INT32 codecs; the boolean methods do not
	// read it.
	BitWidth int
}
    35  
// String returns the name of the encoding, "RLE".
func (e *Encoding) String() string {
	return "RLE"
}
    39  
// Encoding returns the parquet format identifier of this encoding,
// format.RLE.
func (e *Encoding) Encoding() format.Encoding {
	return format.RLE
}
    43  
    44  func (e *Encoding) EncodeLevels(dst, src []byte) ([]byte, error) {
    45  	dst, err := encodeBytes(dst[:0], src, uint(e.BitWidth))
    46  	return dst, e.wrap(err)
    47  }
    48  
    49  func (e *Encoding) EncodeBoolean(dst, src []byte) ([]byte, error) {
    50  	// In the case of encoding a boolean values, the 4 bytes length of the
    51  	// output is expected by the parquet format. We add the bytes as placeholder
    52  	// before appending the encoded data.
    53  	dst = append(dst[:0], 0, 0, 0, 0)
    54  	dst, err := encodeBits(dst, src)
    55  	binary.LittleEndian.PutUint32(dst, uint32(len(dst))-4)
    56  	return dst, e.wrap(err)
    57  }
    58  
    59  func (e *Encoding) EncodeInt32(dst, src []byte) ([]byte, error) {
    60  	if (len(src) % 4) != 0 {
    61  		return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "INT32", len(src))
    62  	}
    63  	dst, err := encodeInt32(dst[:0], unsafecast.BytesToInt32(src), uint(e.BitWidth))
    64  	return dst, e.wrap(err)
    65  }
    66  
    67  func (e *Encoding) DecodeLevels(dst, src []byte) ([]byte, error) {
    68  	dst, err := decodeBytes(dst[:0], src, uint(e.BitWidth))
    69  	return dst, e.wrap(err)
    70  }
    71  
    72  func (e *Encoding) DecodeBoolean(dst, src []byte) ([]byte, error) {
    73  	if len(src) == 4 {
    74  		return dst[:0], nil
    75  	}
    76  	if len(src) < 4 {
    77  		return dst[:0], fmt.Errorf("input shorter than 4 bytes: %w", io.ErrUnexpectedEOF)
    78  	}
    79  	n := int(binary.LittleEndian.Uint32(src))
    80  	src = src[4:]
    81  	if n > len(src) {
    82  		return dst[:0], fmt.Errorf("input shorter than length prefix: %d < %d: %w", len(src), n, io.ErrUnexpectedEOF)
    83  	}
    84  	dst, err := decodeBits(dst[:0], src[:n])
    85  	return dst, e.wrap(err)
    86  }
    87  
    88  func (e *Encoding) DecodeInt32(dst, src []byte) ([]byte, error) {
    89  	dst, err := decodeInt32(dst[:0], src, uint(e.BitWidth))
    90  	return dst, e.wrap(err)
    91  }
    92  
    93  func (e *Encoding) wrap(err error) error {
    94  	if err != nil {
    95  		err = encoding.Error(e, err)
    96  	}
    97  	return err
    98  }
    99  
   100  func encodeBits(dst, src []byte) ([]byte, error) {
   101  	if len(src) == 0 || isZero(src) || isOnes(src) {
   102  		dst = appendUvarint(dst, uint64(8*len(src))<<1)
   103  		if len(src) > 0 {
   104  			dst = append(dst, src[0])
   105  		}
   106  		return dst, nil
   107  	}
   108  
   109  	for i := 0; i < len(src); {
   110  		j := i + 1
   111  
   112  		// Look for contiguous sections of 8 bits, all zeros or ones; these
   113  		// are run-length encoded as it only takes 2 or 3 bytes to store these
   114  		// sequences.
   115  		if src[i] == 0 || src[i] == 0xFF {
   116  			for j < len(src) && src[i] == src[j] {
   117  				j++
   118  			}
   119  
   120  			if n := j - i; n > 1 {
   121  				dst = appendRunLengthBits(dst, 8*n, src[i])
   122  				i = j
   123  				continue
   124  			}
   125  		}
   126  
   127  		// Sequences of bits that are neither all zeroes or ones are bit-packed,
   128  		// which is a simple copy of the input to the output preceded with the
   129  		// bit-pack header.
   130  		for j < len(src) && (src[j-1] != src[j] || (src[j] != 0 && src[j] == 0xFF)) {
   131  			j++
   132  		}
   133  
   134  		if (j-i) > 1 && j < len(src) {
   135  			j--
   136  		}
   137  
   138  		dst = appendBitPackedBits(dst, src[i:j])
   139  		i = j
   140  	}
   141  	return dst, nil
   142  }
   143  
   144  func encodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) {
   145  	if bitWidth > 8 {
   146  		return dst, errEncodeInvalidBitWidth("INT8", bitWidth)
   147  	}
   148  	if bitWidth == 0 {
   149  		if !isZero(src) {
   150  			return dst, errEncodeInvalidBitWidth("INT8", bitWidth)
   151  		}
   152  		return appendUvarint(dst, uint64(len(src))<<1), nil
   153  	}
   154  
   155  	if len(src) >= 8 {
   156  		words := unsafe.Slice((*uint64)(unsafe.Pointer(&src[0])), len(src)/8)
   157  
   158  		for i := 0; i < len(words); {
   159  			j := i
   160  			pattern := broadcast8x1(words[i])
   161  
   162  			for j < len(words) && words[j] == pattern {
   163  				j++
   164  			}
   165  
   166  			if i < j {
   167  				dst = appendRunLengthBytes(dst, 8*(j-i), byte(pattern))
   168  			} else {
   169  				j++
   170  
   171  				for j < len(words) && words[j] != broadcast8x1(words[j-1]) {
   172  					j++
   173  				}
   174  
   175  				dst = appendBitPackedBytes(dst, words[i:j], bitWidth)
   176  			}
   177  
   178  			i = j
   179  		}
   180  	}
   181  
   182  	for i := (len(src) / 8) * 8; i < len(src); {
   183  		j := i + 1
   184  
   185  		for j < len(src) && src[i] == src[j] {
   186  			j++
   187  		}
   188  
   189  		dst = appendRunLengthBytes(dst, j-i, src[i])
   190  		i = j
   191  	}
   192  
   193  	return dst, nil
   194  }
   195  
   196  func encodeInt32(dst []byte, src []int32, bitWidth uint) ([]byte, error) {
   197  	if bitWidth > 32 {
   198  		return dst, errEncodeInvalidBitWidth("INT32", bitWidth)
   199  	}
   200  	if bitWidth == 0 {
   201  		if !isZero(unsafecast.Int32ToBytes(src)) {
   202  			return dst, errEncodeInvalidBitWidth("INT32", bitWidth)
   203  		}
   204  		return appendUvarint(dst, uint64(len(src))<<1), nil
   205  	}
   206  
   207  	if len(src) >= 8 {
   208  		words := unsafe.Slice((*[8]int32)(unsafe.Pointer(&src[0])), len(src)/8)
   209  
   210  		for i := 0; i < len(words); {
   211  			j := i
   212  			pattern := broadcast8x4(words[i][0])
   213  
   214  			for j < len(words) && words[j] == pattern {
   215  				j++
   216  			}
   217  
   218  			if i < j {
   219  				dst = appendRunLengthInt32(dst, 8*(j-i), pattern[0], bitWidth)
   220  			} else {
   221  				j += 1
   222  				j += encodeInt32IndexEqual8Contiguous(words[j:])
   223  				dst = appendBitPackedInt32(dst, words[i:j], bitWidth)
   224  			}
   225  
   226  			i = j
   227  		}
   228  	}
   229  
   230  	for i := (len(src) / 8) * 8; i < len(src); {
   231  		j := i + 1
   232  
   233  		for j < len(src) && src[i] == src[j] {
   234  			j++
   235  		}
   236  
   237  		dst = appendRunLengthInt32(dst, j-i, src[i], bitWidth)
   238  		i = j
   239  	}
   240  
   241  	return dst, nil
   242  }
   243  
   244  func decodeBits(dst, src []byte) ([]byte, error) {
   245  	for i := 0; i < len(src); {
   246  		u, n := binary.Uvarint(src[i:])
   247  		if n == 0 {
   248  			return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF)
   249  		}
   250  		if n < 0 {
   251  			return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src))
   252  		}
   253  		i += n
   254  
   255  		count, bitpacked := uint(u>>1), (u&1) != 0
   256  		if count > maxSupportedValueCount {
   257  			return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount)
   258  		}
   259  		if bitpacked {
   260  			n := int(count)
   261  			j := i + n
   262  
   263  			if j > len(src) {
   264  				return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", n, io.ErrUnexpectedEOF)
   265  			}
   266  
   267  			dst = append(dst, src[i:j]...)
   268  			i = j
   269  		} else {
   270  			word := byte(0)
   271  			if i < len(src) {
   272  				word = src[i]
   273  				i++
   274  			}
   275  
   276  			offset := len(dst)
   277  			length := bitpack.ByteCount(count)
   278  			dst = resize(dst, offset+length)
   279  			bytealg.Broadcast(dst[offset:], word)
   280  		}
   281  	}
   282  	return dst, nil
   283  }
   284  
   285  func decodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) {
   286  	if bitWidth > 8 {
   287  		return dst, errDecodeInvalidBitWidth("INT8", bitWidth)
   288  	}
   289  
   290  	for i := 0; i < len(src); {
   291  		u, n := binary.Uvarint(src[i:])
   292  		if n == 0 {
   293  			return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF)
   294  		}
   295  		if n < 0 {
   296  			return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src))
   297  		}
   298  		i += n
   299  
   300  		count, bitpacked := uint(u>>1), (u&1) != 0
   301  		if count > maxSupportedValueCount {
   302  			return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount)
   303  		}
   304  		if bitpacked {
   305  			count *= 8
   306  			j := i + bitpack.ByteCount(count*bitWidth)
   307  
   308  			if j > len(src) {
   309  				return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", 8*count, io.ErrUnexpectedEOF)
   310  			}
   311  
   312  			offset := len(dst)
   313  			length := int(count)
   314  			dst = resize(dst, offset+length)
   315  			decodeBytesBitpack(dst[offset:], src[i:j], count, bitWidth)
   316  
   317  			i = j
   318  		} else {
   319  			if bitWidth != 0 && (i+1) > len(src) {
   320  				return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF)
   321  			}
   322  
   323  			word := byte(0)
   324  			if bitWidth != 0 {
   325  				word = src[i]
   326  				i++
   327  			}
   328  
   329  			offset := len(dst)
   330  			length := int(count)
   331  			dst = resize(dst, offset+length)
   332  			bytealg.Broadcast(dst[offset:], word)
   333  		}
   334  	}
   335  
   336  	return dst, nil
   337  }
   338  
   339  func decodeInt32(dst, src []byte, bitWidth uint) ([]byte, error) {
   340  	if bitWidth > 32 {
   341  		return dst, errDecodeInvalidBitWidth("INT32", bitWidth)
   342  	}
   343  
   344  	buf := make([]byte, 2*bitpack.PaddingInt32)
   345  
   346  	for i := 0; i < len(src); {
   347  		u, n := binary.Uvarint(src[i:])
   348  		if n == 0 {
   349  			return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF)
   350  		}
   351  		if n < 0 {
   352  			return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src))
   353  		}
   354  		i += n
   355  
   356  		count, bitpacked := uint(u>>1), (u&1) != 0
   357  		if count > maxSupportedValueCount {
   358  			return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount)
   359  		}
   360  		if bitpacked {
   361  			offset := len(dst)
   362  			length := int(count * bitWidth)
   363  			dst = resize(dst, offset+4*8*int(count))
   364  
   365  			// The bitpack.UnpackInt32 function requires the input to be padded
   366  			// or the function panics. If there is enough room in the input
   367  			// buffer we can use it, otherwise we have to copy it to a larger
   368  			// location (which should rarely happen).
   369  			in := src[i : i+length]
   370  			if (cap(in) - len(in)) >= bitpack.PaddingInt32 {
   371  				in = in[:cap(in)]
   372  			} else {
   373  				buf = resize(buf, len(in)+bitpack.PaddingInt32)
   374  				copy(buf, in)
   375  				in = buf
   376  			}
   377  
   378  			out := unsafecast.BytesToInt32(dst[offset:])
   379  			bitpack.UnpackInt32(out, in, bitWidth)
   380  			i += length
   381  		} else {
   382  			j := i + bitpack.ByteCount(bitWidth)
   383  
   384  			if j > len(src) {
   385  				return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF)
   386  			}
   387  
   388  			bits := [4]byte{}
   389  			copy(bits[:], src[i:j])
   390  			dst = appendRepeat(dst, bits[:], count)
   391  			i = j
   392  		}
   393  	}
   394  
   395  	return dst, nil
   396  }
   397  
// errEncodeInvalidBitWidth returns the error reported when values of type typ
// cannot be encoded with the given bit width.
func errEncodeInvalidBitWidth(typ string, bitWidth uint) error {
	return errInvalidBitWidth("encode", typ, bitWidth)
}
   401  
// errDecodeInvalidBitWidth returns the error reported when values of type typ
// cannot be decoded with the given bit width.
func errDecodeInvalidBitWidth(typ string, bitWidth uint) error {
	return errInvalidBitWidth("decode", typ, bitWidth)
}
   405  
   406  func errInvalidBitWidth(op, typ string, bitWidth uint) error {
   407  	return fmt.Errorf("cannot %s %s with invalid bit-width=%d", op, typ, bitWidth)
   408  }
   409  
   410  func appendRepeat(dst, pattern []byte, count uint) []byte {
   411  	offset := len(dst)
   412  	length := int(count) * len(pattern)
   413  	dst = resize(dst, offset+length)
   414  	i := offset + copy(dst[offset:], pattern)
   415  	for i < len(dst) {
   416  		i += copy(dst[i:], dst[offset:i])
   417  	}
   418  	return dst
   419  }
   420  
   421  func appendUvarint(dst []byte, u uint64) []byte {
   422  	var b [binary.MaxVarintLen64]byte
   423  	var n = binary.PutUvarint(b[:], u)
   424  	return append(dst, b[:n]...)
   425  }
   426  
// appendRunLengthBits appends a run-length block of count values represented
// by value. It shares its layout with appendRunLengthBytes, which it simply
// delegates to.
func appendRunLengthBits(dst []byte, count int, value byte) []byte {
	return appendRunLengthBytes(dst, count, value)
}
   430  
   431  func appendBitPackedBits(dst []byte, words []byte) []byte {
   432  	n := len(dst)
   433  	dst = resize(dst, n+binary.MaxVarintLen64+len(words))
   434  	n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1)
   435  	n += copy(dst[n:], words)
   436  	return dst[:n]
   437  }
   438  
   439  func appendRunLengthBytes(dst []byte, count int, value byte) []byte {
   440  	n := len(dst)
   441  	dst = resize(dst, n+binary.MaxVarintLen64+1)
   442  	n += binary.PutUvarint(dst[n:], uint64(count)<<1)
   443  	dst[n] = value
   444  	return dst[:n+1]
   445  }
   446  
   447  func appendBitPackedBytes(dst []byte, words []uint64, bitWidth uint) []byte {
   448  	n := len(dst)
   449  	dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+8)
   450  	n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1)
   451  	n += encodeBytesBitpack(dst[n:], words, bitWidth)
   452  	return dst[:n]
   453  }
   454  
   455  func appendRunLengthInt32(dst []byte, count int, value int32, bitWidth uint) []byte {
   456  	n := len(dst)
   457  	dst = resize(dst, n+binary.MaxVarintLen64+4)
   458  	n += binary.PutUvarint(dst[n:], uint64(count)<<1)
   459  	binary.LittleEndian.PutUint32(dst[n:], uint32(value))
   460  	return dst[:n+bitpack.ByteCount(bitWidth)]
   461  }
   462  
   463  func appendBitPackedInt32(dst []byte, words [][8]int32, bitWidth uint) []byte {
   464  	n := len(dst)
   465  	dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+32)
   466  	n += binary.PutUvarint(dst[n:], uint64(len(words))<<1|1)
   467  	n += encodeInt32Bitpack(dst[n:], words, bitWidth)
   468  	return dst[:n]
   469  }
   470  
   471  func broadcast8x1(v uint64) uint64 {
   472  	return (v & 0xFF) * 0x0101010101010101
   473  }
   474  
   475  func broadcast8x4(v int32) [8]int32 {
   476  	return [8]int32{v, v, v, v, v, v, v, v}
   477  }
   478  
// isZero reports whether every byte of data is 0x00. An empty slice is
// reported as all zeroes.
func isZero(data []byte) bool {
	return bytealg.Count(data, 0x00) == len(data)
}
   482  
// isOnes reports whether every byte of data is 0xFF. An empty slice is
// reported as all ones.
func isOnes(data []byte) bool {
	return bytealg.Count(data, 0xFF) == len(data)
}
   486  
   487  func resize(buf []byte, size int) []byte {
   488  	if cap(buf) < size {
   489  		return grow(buf, size)
   490  	}
   491  	return buf[:size]
   492  }
   493  
   494  func grow(buf []byte, size int) []byte {
   495  	newCap := 2 * cap(buf)
   496  	if newCap < size {
   497  		newCap = size
   498  	}
   499  	newBuf := make([]byte, size, newCap)
   500  	copy(newBuf, buf)
   501  	return newBuf
   502  }
   503  
   504  func encodeInt32BitpackDefault(dst []byte, src [][8]int32, bitWidth uint) int {
   505  	bits := unsafe.Slice((*int32)(unsafe.Pointer(&src[0])), len(src)*8)
   506  	bitpack.PackInt32(dst, bits, bitWidth)
   507  	return bitpack.ByteCount(uint(len(src)*8) * bitWidth)
   508  }
   509  
   510  func encodeBytesBitpackDefault(dst []byte, src []uint64, bitWidth uint) int {
   511  	bitMask := uint64(1<<bitWidth) - 1
   512  	n := 0
   513  
   514  	for _, word := range src {
   515  		word = (word & bitMask) |
   516  			(((word >> 8) & bitMask) << (1 * bitWidth)) |
   517  			(((word >> 16) & bitMask) << (2 * bitWidth)) |
   518  			(((word >> 24) & bitMask) << (3 * bitWidth)) |
   519  			(((word >> 32) & bitMask) << (4 * bitWidth)) |
   520  			(((word >> 40) & bitMask) << (5 * bitWidth)) |
   521  			(((word >> 48) & bitMask) << (6 * bitWidth)) |
   522  			(((word >> 56) & bitMask) << (7 * bitWidth))
   523  		binary.LittleEndian.PutUint64(dst[n:], word)
   524  		n += int(bitWidth)
   525  	}
   526  
   527  	return n
   528  }
   529  
   530  func decodeBytesBitpackDefault(dst, src []byte, count, bitWidth uint) {
   531  	dst = dst[:0]
   532  
   533  	bitMask := uint64(1<<bitWidth) - 1
   534  	byteCount := bitpack.ByteCount(8 * bitWidth)
   535  
   536  	for i := 0; count > 0; count -= 8 {
   537  		j := i + byteCount
   538  
   539  		bits := [8]byte{}
   540  		copy(bits[:], src[i:j])
   541  		word := binary.LittleEndian.Uint64(bits[:])
   542  
   543  		dst = append(dst,
   544  			byte((word>>(0*bitWidth))&bitMask),
   545  			byte((word>>(1*bitWidth))&bitMask),
   546  			byte((word>>(2*bitWidth))&bitMask),
   547  			byte((word>>(3*bitWidth))&bitMask),
   548  			byte((word>>(4*bitWidth))&bitMask),
   549  			byte((word>>(5*bitWidth))&bitMask),
   550  			byte((word>>(6*bitWidth))&bitMask),
   551  			byte((word>>(7*bitWidth))&bitMask),
   552  		)
   553  
   554  		i = j
   555  	}
   556  }