github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding/delta/byte_array.go (about)

     1  package delta
     2  
     3  import (
     4  	"bytes"
     5  	"sort"
     6  
     7  	"github.com/segmentio/parquet-go/encoding"
     8  	"github.com/segmentio/parquet-go/format"
     9  )
    10  
    11  const (
    12  	maxLinearSearchPrefixLength = 64 // arbitrary
    13  )
    14  
// ByteArrayEncoding implements the encoding that reports itself as
// DELTA_BYTE_ARRAY: each value is stored as the length of the prefix it shares
// with the previous value followed by its remaining suffix.
//
// NOTE(review): the embedded encoding.NotSupported presumably provides
// fallback implementations for the encoding methods this type does not
// define — confirm against the encoding package.
type ByteArrayEncoding struct {
	encoding.NotSupported
}
    18  
// String returns the name of the encoding, "DELTA_BYTE_ARRAY".
func (e *ByteArrayEncoding) String() string {
	return "DELTA_BYTE_ARRAY"
}
    22  
// Encoding returns the format identifier of this encoding,
// format.DeltaByteArray.
func (e *ByteArrayEncoding) Encoding() format.Encoding {
	return format.DeltaByteArray
}
    26  
    27  func (e *ByteArrayEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {
    28  	prefix := getInt32Buffer()
    29  	defer putInt32Buffer(prefix)
    30  
    31  	length := getInt32Buffer()
    32  	defer putInt32Buffer(length)
    33  
    34  	totalSize := 0
    35  	if len(offsets) > 0 {
    36  		lastValue := ([]byte)(nil)
    37  		baseOffset := offsets[0]
    38  
    39  		for _, endOffset := range offsets[1:] {
    40  			v := src[baseOffset:endOffset:endOffset]
    41  			n := int(endOffset - baseOffset)
    42  			p := 0
    43  			baseOffset = endOffset
    44  
    45  			if len(v) <= maxLinearSearchPrefixLength {
    46  				p = linearSearchPrefixLength(lastValue, v)
    47  			} else {
    48  				p = binarySearchPrefixLength(lastValue, v)
    49  			}
    50  
    51  			prefix.values = append(prefix.values, int32(p))
    52  			length.values = append(length.values, int32(n-p))
    53  			lastValue = v
    54  			totalSize += n - p
    55  		}
    56  	}
    57  
    58  	dst = dst[:0]
    59  	dst = encodeInt32(dst, prefix.values)
    60  	dst = encodeInt32(dst, length.values)
    61  	dst = resize(dst, len(dst)+totalSize)
    62  
    63  	if len(offsets) > 0 {
    64  		b := dst[len(dst)-totalSize:]
    65  		i := int(offsets[0])
    66  		j := 0
    67  
    68  		_ = length.values[:len(prefix.values)]
    69  
    70  		for k, p := range prefix.values {
    71  			n := p + length.values[k]
    72  			j += copy(b[j:], src[i+int(p):i+int(n)])
    73  			i += int(n)
    74  		}
    75  	}
    76  
    77  	return dst, nil
    78  }
    79  
    80  func (e *ByteArrayEncoding) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {
    81  	// The parquet specs say that this encoding is only supported for BYTE_ARRAY
    82  	// values, but the reference Java implementation appears to support
    83  	// FIXED_LEN_BYTE_ARRAY as well:
    84  	// https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java#L211
    85  	if size < 0 || size > encoding.MaxFixedLenByteArraySize {
    86  		return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument)
    87  	}
    88  	if (len(src) % size) != 0 {
    89  		return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "FIXED_LEN_BYTE_ARRAY", len(src))
    90  	}
    91  
    92  	prefix := getInt32Buffer()
    93  	defer putInt32Buffer(prefix)
    94  
    95  	length := getInt32Buffer()
    96  	defer putInt32Buffer(length)
    97  
    98  	totalSize := 0
    99  	lastValue := ([]byte)(nil)
   100  
   101  	for i := size; i <= len(src); i += size {
   102  		v := src[i-size : i : i]
   103  		p := linearSearchPrefixLength(lastValue, v)
   104  		n := size - p
   105  		prefix.values = append(prefix.values, int32(p))
   106  		length.values = append(length.values, int32(n))
   107  		lastValue = v
   108  		totalSize += n
   109  	}
   110  
   111  	dst = dst[:0]
   112  	dst = encodeInt32(dst, prefix.values)
   113  	dst = encodeInt32(dst, length.values)
   114  	dst = resize(dst, len(dst)+totalSize)
   115  
   116  	b := dst[len(dst)-totalSize:]
   117  	i := 0
   118  	j := 0
   119  
   120  	for _, p := range prefix.values {
   121  		j += copy(b[j:], src[i+int(p):i+size])
   122  		i += size
   123  	}
   124  
   125  	return dst, nil
   126  }
   127  
   128  func (e *ByteArrayEncoding) DecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error) {
   129  	dst, offsets = dst[:0], offsets[:0]
   130  
   131  	prefix := getInt32Buffer()
   132  	defer putInt32Buffer(prefix)
   133  
   134  	suffix := getInt32Buffer()
   135  	defer putInt32Buffer(suffix)
   136  
   137  	var err error
   138  	src, err = prefix.decode(src)
   139  	if err != nil {
   140  		return dst, offsets, e.wrapf("decoding prefix lengths: %w", err)
   141  	}
   142  	src, err = suffix.decode(src)
   143  	if err != nil {
   144  		return dst, offsets, e.wrapf("decoding suffix lengths: %w", err)
   145  	}
   146  	if len(prefix.values) != len(suffix.values) {
   147  		return dst, offsets, e.wrap(errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values)))
   148  	}
   149  	return decodeByteArray(dst, src, prefix.values, suffix.values, offsets)
   150  }
   151  
   152  func (e *ByteArrayEncoding) DecodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {
   153  	dst = dst[:0]
   154  
   155  	if size < 0 || size > encoding.MaxFixedLenByteArraySize {
   156  		return dst, e.wrap(encoding.ErrInvalidArgument)
   157  	}
   158  
   159  	prefix := getInt32Buffer()
   160  	defer putInt32Buffer(prefix)
   161  
   162  	suffix := getInt32Buffer()
   163  	defer putInt32Buffer(suffix)
   164  
   165  	var err error
   166  	src, err = prefix.decode(src)
   167  	if err != nil {
   168  		return dst, e.wrapf("decoding prefix lengths: %w", err)
   169  	}
   170  	src, err = suffix.decode(src)
   171  	if err != nil {
   172  		return dst, e.wrapf("decoding suffix lengths: %w", err)
   173  	}
   174  	if len(prefix.values) != len(suffix.values) {
   175  		return dst, e.wrap(errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values)))
   176  	}
   177  	return decodeFixedLenByteArray(dst[:0], src, size, prefix.values, suffix.values)
   178  }
   179  
   180  func (e *ByteArrayEncoding) EstimateDecodeByteArraySize(src []byte) int {
   181  	length := getInt32Buffer()
   182  	defer putInt32Buffer(length)
   183  	src, _ = length.decode(src)
   184  	sum := int(length.sum())
   185  	length.decode(src)
   186  	return sum + int(length.sum())
   187  }
   188  
   189  func (e *ByteArrayEncoding) wrap(err error) error {
   190  	if err != nil {
   191  		err = encoding.Error(e, err)
   192  	}
   193  	return err
   194  }
   195  
// wrapf builds an error by passing this encoding, msg and args to
// encoding.Errorf.
//
// NOTE(review): callers in this file pass "%w" verbs in msg, so
// encoding.Errorf presumably formats like fmt.Errorf — confirm.
func (e *ByteArrayEncoding) wrapf(msg string, args ...interface{}) error {
	return encoding.Errorf(e, msg, args...)
}
   199  
   200  func linearSearchPrefixLength(base, data []byte) (n int) {
   201  	for n < len(base) && n < len(data) && base[n] == data[n] {
   202  		n++
   203  	}
   204  	return n
   205  }
   206  
   207  func binarySearchPrefixLength(base, data []byte) int {
   208  	n := len(base)
   209  	if n > len(data) {
   210  		n = len(data)
   211  	}
   212  	return sort.Search(n, func(i int) bool {
   213  		return !bytes.Equal(base[:i+1], data[:i+1])
   214  	})
   215  }