github.com/parquet-go/parquet-go@v0.20.0/encoding/rle/rle.go (about)

     1  // Package rle implements the hybrid RLE/Bit-Packed encoding employed in
     2  // repetition and definition levels, dictionary indexed data pages, and
     3  // boolean values in the PLAIN encoding.
     4  //
     5  // https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3
     6  package rle
     7  
     8  import (
     9  	"encoding/binary"
    10  	"fmt"
    11  	"io"
    12  	"unsafe"
    13  
    14  	"github.com/parquet-go/parquet-go/encoding"
    15  	"github.com/parquet-go/parquet-go/format"
    16  	"github.com/parquet-go/parquet-go/internal/bitpack"
    17  	"github.com/parquet-go/parquet-go/internal/bytealg"
    18  	"github.com/parquet-go/parquet-go/internal/unsafecast"
    19  )
    20  
const (
	// maxSupportedValueCount is the largest count that a single run header
	// may declare; the decoders in this file reject headers whose count
	// exceeds it.
	//
	// This limit is intended to prevent unbounded memory allocations when
	// decoding runs.
	//
	// We use a generous limit which allows for over 16 million values per page
	// if there is only one run to encode the repetition or definition levels
	// (this should be uncommon).
	maxSupportedValueCount = 16 * 1024 * 1024
)
    30  
// Encoding implements the hybrid RLE/bit-packed encoding for levels, boolean
// values, and 32-bit integers. Operations it does not define are provided by
// the embedded encoding.NotSupported.
type Encoding struct {
	encoding.NotSupported
	// BitWidth is the number of bits used to represent each value. It is
	// passed to the level and int32 codecs; the boolean codecs do not read it.
	BitWidth int
}
    35  
// String returns the name of the encoding, "RLE".
func (e *Encoding) String() string {
	return "RLE"
}
    39  
// Encoding returns the parquet format identifier of this encoding,
// format.RLE.
func (e *Encoding) Encoding() format.Encoding {
	return format.RLE
}
    43  
    44  func (e *Encoding) EncodeLevels(dst []byte, src []uint8) ([]byte, error) {
    45  	dst, err := encodeBytes(dst[:0], src, uint(e.BitWidth))
    46  	return dst, e.wrap(err)
    47  }
    48  
    49  func (e *Encoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) {
    50  	// In the case of encoding a boolean values, the 4 bytes length of the
    51  	// output is expected by the parquet format. We add the bytes as placeholder
    52  	// before appending the encoded data.
    53  	dst = append(dst[:0], 0, 0, 0, 0)
    54  	dst, err := encodeBits(dst, src)
    55  	binary.LittleEndian.PutUint32(dst, uint32(len(dst))-4)
    56  	return dst, e.wrap(err)
    57  }
    58  
    59  func (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {
    60  	dst, err := encodeInt32(dst[:0], src, uint(e.BitWidth))
    61  	return dst, e.wrap(err)
    62  }
    63  
    64  func (e *Encoding) DecodeLevels(dst []uint8, src []byte) ([]uint8, error) {
    65  	dst, err := decodeBytes(dst[:0], src, uint(e.BitWidth))
    66  	return dst, e.wrap(err)
    67  }
    68  
    69  func (e *Encoding) DecodeBoolean(dst []byte, src []byte) ([]byte, error) {
    70  	if len(src) == 4 {
    71  		return dst[:0], nil
    72  	}
    73  	if len(src) < 4 {
    74  		return dst[:0], fmt.Errorf("input shorter than 4 bytes: %w", io.ErrUnexpectedEOF)
    75  	}
    76  	n := int(binary.LittleEndian.Uint32(src))
    77  	src = src[4:]
    78  	if n > len(src) {
    79  		return dst[:0], fmt.Errorf("input shorter than length prefix: %d < %d: %w", len(src), n, io.ErrUnexpectedEOF)
    80  	}
    81  	dst, err := decodeBits(dst[:0], src[:n])
    82  	return dst, e.wrap(err)
    83  }
    84  
    85  func (e *Encoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) {
    86  	buf := unsafecast.Int32ToBytes(dst)
    87  	buf, err := decodeInt32(buf[:0], src, uint(e.BitWidth))
    88  	return unsafecast.BytesToInt32(buf), e.wrap(err)
    89  }
    90  
    91  func (e *Encoding) wrap(err error) error {
    92  	if err != nil {
    93  		err = encoding.Error(e, err)
    94  	}
    95  	return err
    96  }
    97  
    98  func encodeBits(dst, src []byte) ([]byte, error) {
    99  	if len(src) == 0 || isZero(src) || isOnes(src) {
   100  		dst = appendUvarint(dst, uint64(8*len(src))<<1)
   101  		if len(src) > 0 {
   102  			dst = append(dst, src[0])
   103  		}
   104  		return dst, nil
   105  	}
   106  
   107  	for i := 0; i < len(src); {
   108  		j := i + 1
   109  
   110  		// Look for contiguous sections of 8 bits, all zeros or ones; these
   111  		// are run-length encoded as it only takes 2 or 3 bytes to store these
   112  		// sequences.
   113  		if src[i] == 0 || src[i] == 0xFF {
   114  			for j < len(src) && src[i] == src[j] {
   115  				j++
   116  			}
   117  
   118  			if n := j - i; n > 1 {
   119  				dst = appendRunLengthBits(dst, 8*n, src[i])
   120  				i = j
   121  				continue
   122  			}
   123  		}
   124  
   125  		// Sequences of bits that are neither all zeroes or ones are bit-packed,
   126  		// which is a simple copy of the input to the output preceded with the
   127  		// bit-pack header.
   128  		for j < len(src) && (src[j-1] != src[j] || (src[j] != 0 && src[j] == 0xFF)) {
   129  			j++
   130  		}
   131  
   132  		if (j-i) > 1 && j < len(src) {
   133  			j--
   134  		}
   135  
   136  		dst = appendBitPackedBits(dst, src[i:j])
   137  		i = j
   138  	}
   139  	return dst, nil
   140  }
   141  
   142  func encodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) {
   143  	if bitWidth > 8 {
   144  		return dst, errEncodeInvalidBitWidth("INT8", bitWidth)
   145  	}
   146  	if bitWidth == 0 {
   147  		if !isZero(src) {
   148  			return dst, errEncodeInvalidBitWidth("INT8", bitWidth)
   149  		}
   150  		return appendUvarint(dst, uint64(len(src))<<1), nil
   151  	}
   152  
   153  	if len(src) >= 8 {
   154  		words := unsafe.Slice((*uint64)(unsafe.Pointer(&src[0])), len(src)/8)
   155  
   156  		for i := 0; i < len(words); {
   157  			j := i
   158  			pattern := broadcast8x1(words[i])
   159  
   160  			for j < len(words) && words[j] == pattern {
   161  				j++
   162  			}
   163  
   164  			if i < j {
   165  				dst = appendRunLengthBytes(dst, 8*(j-i), byte(pattern))
   166  			} else {
   167  				j++
   168  
   169  				for j < len(words) && words[j] != broadcast8x1(words[j-1]) {
   170  					j++
   171  				}
   172  
   173  				dst = appendBitPackedBytes(dst, words[i:j], bitWidth)
   174  			}
   175  
   176  			i = j
   177  		}
   178  	}
   179  
   180  	for i := (len(src) / 8) * 8; i < len(src); {
   181  		j := i + 1
   182  
   183  		for j < len(src) && src[i] == src[j] {
   184  			j++
   185  		}
   186  
   187  		dst = appendRunLengthBytes(dst, j-i, src[i])
   188  		i = j
   189  	}
   190  
   191  	return dst, nil
   192  }
   193  
   194  func encodeInt32(dst []byte, src []int32, bitWidth uint) ([]byte, error) {
   195  	if bitWidth > 32 {
   196  		return dst, errEncodeInvalidBitWidth("INT32", bitWidth)
   197  	}
   198  	if bitWidth == 0 {
   199  		if !isZero(unsafecast.Int32ToBytes(src)) {
   200  			return dst, errEncodeInvalidBitWidth("INT32", bitWidth)
   201  		}
   202  		return appendUvarint(dst, uint64(len(src))<<1), nil
   203  	}
   204  
   205  	if len(src) >= 8 {
   206  		words := unsafe.Slice((*[8]int32)(unsafe.Pointer(&src[0])), len(src)/8)
   207  
   208  		for i := 0; i < len(words); {
   209  			j := i
   210  			pattern := broadcast8x4(words[i][0])
   211  
   212  			for j < len(words) && words[j] == pattern {
   213  				j++
   214  			}
   215  
   216  			if i < j {
   217  				dst = appendRunLengthInt32(dst, 8*(j-i), pattern[0], bitWidth)
   218  			} else {
   219  				j += 1
   220  				j += encodeInt32IndexEqual8Contiguous(words[j:])
   221  				dst = appendBitPackedInt32(dst, words[i:j], bitWidth)
   222  			}
   223  
   224  			i = j
   225  		}
   226  	}
   227  
   228  	for i := (len(src) / 8) * 8; i < len(src); {
   229  		j := i + 1
   230  
   231  		for j < len(src) && src[i] == src[j] {
   232  			j++
   233  		}
   234  
   235  		dst = appendRunLengthInt32(dst, j-i, src[i], bitWidth)
   236  		i = j
   237  	}
   238  
   239  	return dst, nil
   240  }
   241  
   242  func decodeBits(dst, src []byte) ([]byte, error) {
   243  	for i := 0; i < len(src); {
   244  		u, n := binary.Uvarint(src[i:])
   245  		if n == 0 {
   246  			return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF)
   247  		}
   248  		if n < 0 {
   249  			return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src))
   250  		}
   251  		i += n
   252  
   253  		count, bitpacked := uint(u>>1), (u&1) != 0
   254  		if count > maxSupportedValueCount {
   255  			return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount)
   256  		}
   257  		if bitpacked {
   258  			n := int(count)
   259  			j := i + n
   260  
   261  			if j > len(src) {
   262  				return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", n, io.ErrUnexpectedEOF)
   263  			}
   264  
   265  			dst = append(dst, src[i:j]...)
   266  			i = j
   267  		} else {
   268  			word := byte(0)
   269  			if i < len(src) {
   270  				word = src[i]
   271  				i++
   272  			}
   273  
   274  			offset := len(dst)
   275  			length := bitpack.ByteCount(count)
   276  			dst = resize(dst, offset+length)
   277  			bytealg.Broadcast(dst[offset:], word)
   278  		}
   279  	}
   280  	return dst, nil
   281  }
   282  
   283  func decodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) {
   284  	if bitWidth > 8 {
   285  		return dst, errDecodeInvalidBitWidth("INT8", bitWidth)
   286  	}
   287  
   288  	for i := 0; i < len(src); {
   289  		u, n := binary.Uvarint(src[i:])
   290  		if n == 0 {
   291  			return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF)
   292  		}
   293  		if n < 0 {
   294  			return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src))
   295  		}
   296  		i += n
   297  
   298  		count, bitpacked := uint(u>>1), (u&1) != 0
   299  		if count > maxSupportedValueCount {
   300  			return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount)
   301  		}
   302  		if bitpacked {
   303  			count *= 8
   304  			j := i + bitpack.ByteCount(count*bitWidth)
   305  
   306  			if j > len(src) {
   307  				return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", 8*count, io.ErrUnexpectedEOF)
   308  			}
   309  
   310  			offset := len(dst)
   311  			length := int(count)
   312  			dst = resize(dst, offset+length)
   313  			decodeBytesBitpack(dst[offset:], src[i:j], count, bitWidth)
   314  
   315  			i = j
   316  		} else {
   317  			if bitWidth != 0 && (i+1) > len(src) {
   318  				return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF)
   319  			}
   320  
   321  			word := byte(0)
   322  			if bitWidth != 0 {
   323  				word = src[i]
   324  				i++
   325  			}
   326  
   327  			offset := len(dst)
   328  			length := int(count)
   329  			dst = resize(dst, offset+length)
   330  			bytealg.Broadcast(dst[offset:], word)
   331  		}
   332  	}
   333  
   334  	return dst, nil
   335  }
   336  
   337  func decodeInt32(dst, src []byte, bitWidth uint) ([]byte, error) {
   338  	if bitWidth > 32 {
   339  		return dst, errDecodeInvalidBitWidth("INT32", bitWidth)
   340  	}
   341  
   342  	buf := make([]byte, 2*bitpack.PaddingInt32)
   343  
   344  	for i := 0; i < len(src); {
   345  		u, n := binary.Uvarint(src[i:])
   346  		if n == 0 {
   347  			return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF)
   348  		}
   349  		if n < 0 {
   350  			return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src))
   351  		}
   352  		i += n
   353  
   354  		count, bitpacked := uint(u>>1), (u&1) != 0
   355  		if count > maxSupportedValueCount {
   356  			return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount)
   357  		}
   358  		if bitpacked {
   359  			offset := len(dst)
   360  			length := int(count * bitWidth)
   361  			dst = resize(dst, offset+4*8*int(count))
   362  
   363  			// The bitpack.UnpackInt32 function requires the input to be padded
   364  			// or the function panics. If there is enough room in the input
   365  			// buffer we can use it, otherwise we have to copy it to a larger
   366  			// location (which should rarely happen).
   367  			in := src[i : i+length]
   368  			if (cap(in) - len(in)) >= bitpack.PaddingInt32 {
   369  				in = in[:cap(in)]
   370  			} else {
   371  				buf = resize(buf, len(in)+bitpack.PaddingInt32)
   372  				copy(buf, in)
   373  				in = buf
   374  			}
   375  
   376  			out := unsafecast.BytesToInt32(dst[offset:])
   377  			bitpack.UnpackInt32(out, in, bitWidth)
   378  			i += length
   379  		} else {
   380  			j := i + bitpack.ByteCount(bitWidth)
   381  
   382  			if j > len(src) {
   383  				return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF)
   384  			}
   385  
   386  			bits := [4]byte{}
   387  			copy(bits[:], src[i:j])
   388  			dst = appendRepeat(dst, bits[:], count)
   389  			i = j
   390  		}
   391  	}
   392  
   393  	return dst, nil
   394  }
   395  
   396  func errEncodeInvalidBitWidth(typ string, bitWidth uint) error {
   397  	return errInvalidBitWidth("encode", typ, bitWidth)
   398  }
   399  
   400  func errDecodeInvalidBitWidth(typ string, bitWidth uint) error {
   401  	return errInvalidBitWidth("decode", typ, bitWidth)
   402  }
   403  
   404  func errInvalidBitWidth(op, typ string, bitWidth uint) error {
   405  	return fmt.Errorf("cannot %s %s with invalid bit-width=%d", op, typ, bitWidth)
   406  }
   407  
   408  func appendRepeat(dst, pattern []byte, count uint) []byte {
   409  	offset := len(dst)
   410  	length := int(count) * len(pattern)
   411  	dst = resize(dst, offset+length)
   412  	i := offset + copy(dst[offset:], pattern)
   413  	for i < len(dst) {
   414  		i += copy(dst[i:], dst[offset:i])
   415  	}
   416  	return dst
   417  }
   418  
   419  func appendUvarint(dst []byte, u uint64) []byte {
   420  	var b [binary.MaxVarintLen64]byte
   421  	var n = binary.PutUvarint(b[:], u)
   422  	return append(dst, b[:n]...)
   423  }
   424  
// appendRunLengthBits appends a run-length block of count bits to dst, using
// value as the byte stored after the header. It delegates to
// appendRunLengthBytes, which writes the same layout.
func appendRunLengthBits(dst []byte, count int, value byte) []byte {
	return appendRunLengthBytes(dst, count, value)
}
   428  
   429  func appendBitPackedBits(dst []byte, words []byte) []byte {
   430  	n := len(dst)
   431  	dst = resize(dst, n+binary.MaxVarintLen64+len(words))
   432  	n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1)
   433  	n += copy(dst[n:], words)
   434  	return dst[:n]
   435  }
   436  
   437  func appendRunLengthBytes(dst []byte, count int, value byte) []byte {
   438  	n := len(dst)
   439  	dst = resize(dst, n+binary.MaxVarintLen64+1)
   440  	n += binary.PutUvarint(dst[n:], uint64(count)<<1)
   441  	dst[n] = value
   442  	return dst[:n+1]
   443  }
   444  
   445  func appendBitPackedBytes(dst []byte, words []uint64, bitWidth uint) []byte {
   446  	n := len(dst)
   447  	dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+8)
   448  	n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1)
   449  	n += encodeBytesBitpack(dst[n:], words, bitWidth)
   450  	return dst[:n]
   451  }
   452  
   453  func appendRunLengthInt32(dst []byte, count int, value int32, bitWidth uint) []byte {
   454  	n := len(dst)
   455  	dst = resize(dst, n+binary.MaxVarintLen64+4)
   456  	n += binary.PutUvarint(dst[n:], uint64(count)<<1)
   457  	binary.LittleEndian.PutUint32(dst[n:], uint32(value))
   458  	return dst[:n+bitpack.ByteCount(bitWidth)]
   459  }
   460  
   461  func appendBitPackedInt32(dst []byte, words [][8]int32, bitWidth uint) []byte {
   462  	n := len(dst)
   463  	dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+32)
   464  	n += binary.PutUvarint(dst[n:], uint64(len(words))<<1|1)
   465  	n += encodeInt32Bitpack(dst[n:], words, bitWidth)
   466  	return dst[:n]
   467  }
   468  
   469  func broadcast8x1(v uint64) uint64 {
   470  	return (v & 0xFF) * 0x0101010101010101
   471  }
   472  
   473  func broadcast8x4(v int32) [8]int32 {
   474  	return [8]int32{v, v, v, v, v, v, v, v}
   475  }
   476  
   477  func isZero(data []byte) bool {
   478  	return bytealg.Count(data, 0x00) == len(data)
   479  }
   480  
   481  func isOnes(data []byte) bool {
   482  	return bytealg.Count(data, 0xFF) == len(data)
   483  }
   484  
   485  func resize(buf []byte, size int) []byte {
   486  	if cap(buf) < size {
   487  		return grow(buf, size)
   488  	}
   489  	return buf[:size]
   490  }
   491  
   492  func grow(buf []byte, size int) []byte {
   493  	newCap := 2 * cap(buf)
   494  	if newCap < size {
   495  		newCap = size
   496  	}
   497  	newBuf := make([]byte, size, newCap)
   498  	copy(newBuf, buf)
   499  	return newBuf
   500  }
   501  
   502  func encodeInt32BitpackDefault(dst []byte, src [][8]int32, bitWidth uint) int {
   503  	bits := unsafe.Slice((*int32)(unsafe.Pointer(&src[0])), len(src)*8)
   504  	bitpack.PackInt32(dst, bits, bitWidth)
   505  	return bitpack.ByteCount(uint(len(src)*8) * bitWidth)
   506  }
   507  
   508  func encodeBytesBitpackDefault(dst []byte, src []uint64, bitWidth uint) int {
   509  	bitMask := uint64(1<<bitWidth) - 1
   510  	n := 0
   511  
   512  	for _, word := range src {
   513  		word = (word & bitMask) |
   514  			(((word >> 8) & bitMask) << (1 * bitWidth)) |
   515  			(((word >> 16) & bitMask) << (2 * bitWidth)) |
   516  			(((word >> 24) & bitMask) << (3 * bitWidth)) |
   517  			(((word >> 32) & bitMask) << (4 * bitWidth)) |
   518  			(((word >> 40) & bitMask) << (5 * bitWidth)) |
   519  			(((word >> 48) & bitMask) << (6 * bitWidth)) |
   520  			(((word >> 56) & bitMask) << (7 * bitWidth))
   521  		binary.LittleEndian.PutUint64(dst[n:], word)
   522  		n += int(bitWidth)
   523  	}
   524  
   525  	return n
   526  }
   527  
   528  func decodeBytesBitpackDefault(dst, src []byte, count, bitWidth uint) {
   529  	dst = dst[:0]
   530  
   531  	bitMask := uint64(1<<bitWidth) - 1
   532  	byteCount := bitpack.ByteCount(8 * bitWidth)
   533  
   534  	for i := 0; count > 0; count -= 8 {
   535  		j := i + byteCount
   536  
   537  		bits := [8]byte{}
   538  		copy(bits[:], src[i:j])
   539  		word := binary.LittleEndian.Uint64(bits[:])
   540  
   541  		dst = append(dst,
   542  			byte((word>>(0*bitWidth))&bitMask),
   543  			byte((word>>(1*bitWidth))&bitMask),
   544  			byte((word>>(2*bitWidth))&bitMask),
   545  			byte((word>>(3*bitWidth))&bitMask),
   546  			byte((word>>(4*bitWidth))&bitMask),
   547  			byte((word>>(5*bitWidth))&bitMask),
   548  			byte((word>>(6*bitWidth))&bitMask),
   549  			byte((word>>(7*bitWidth))&bitMask),
   550  		)
   551  
   552  		i = j
   553  	}
   554  }