github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/byte_array.go (about)

     1  package delta
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"sort"
     7  
     8  	"github.com/vc42/parquet-go/encoding"
     9  	"github.com/vc42/parquet-go/encoding/plain"
    10  	"github.com/vc42/parquet-go/format"
    11  )
    12  
const (
	// maxLinearSearchPrefixLength is the largest value length, in bytes, for
	// which EncodeByteArray measures the shared prefix with
	// linearSearchPrefixLength; longer values go through
	// binarySearchPrefixLength instead. The cutoff was chosen arbitrarily.
	maxLinearSearchPrefixLength = 64
)
    16  
// ByteArrayEncoding implements the encoding reported as format.DeltaByteArray
// ("DELTA_BYTE_ARRAY"): each value is stored as the length of the prefix it
// shares with the previous value plus its remaining suffix bytes.
//
// The type carries no state of its own.
type ByteArrayEncoding struct {
	// NOTE(review): presumably provides fallback implementations for the
	// encoding methods this type does not define itself — confirm against the
	// encoding package.
	encoding.NotSupported
}
    20  
// String returns the name of the encoding, "DELTA_BYTE_ARRAY".
func (e *ByteArrayEncoding) String() string {
	return "DELTA_BYTE_ARRAY"
}
    24  
// Encoding returns the format identifier of this encoding,
// format.DeltaByteArray.
func (e *ByteArrayEncoding) Encoding() format.Encoding {
	return format.DeltaByteArray
}
    28  
    29  func (e *ByteArrayEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) {
    30  	prefix := getInt32Buffer()
    31  	defer putInt32Buffer(prefix)
    32  
    33  	length := getInt32Buffer()
    34  	defer putInt32Buffer(length)
    35  
    36  	totalSize := 0
    37  	lastValue := ([]byte)(nil)
    38  
    39  	for i := 0; i < len(src); {
    40  		r := len(src) - i
    41  		if r < plain.ByteArrayLengthSize {
    42  			return dst[:0], plain.ErrTooShort(r)
    43  		}
    44  		n := plain.ByteArrayLength(src[i:])
    45  		i += plain.ByteArrayLengthSize
    46  		r -= plain.ByteArrayLengthSize
    47  		if n > r {
    48  			return dst[:0], plain.ErrTooShort(n)
    49  		}
    50  		if n > plain.MaxByteArrayLength {
    51  			return dst[:0], plain.ErrTooLarge(n)
    52  		}
    53  		v := src[i : i+n : i+n]
    54  		p := 0
    55  
    56  		if len(v) <= maxLinearSearchPrefixLength {
    57  			p = linearSearchPrefixLength(lastValue, v)
    58  		} else {
    59  			p = binarySearchPrefixLength(lastValue, v)
    60  		}
    61  
    62  		prefix.values = append(prefix.values, int32(p))
    63  		length.values = append(length.values, int32(n-p))
    64  		lastValue = v
    65  		totalSize += n - p
    66  		i += n
    67  	}
    68  
    69  	dst = dst[:0]
    70  	dst = encodeInt32(dst, prefix.values)
    71  	dst = encodeInt32(dst, length.values)
    72  	dst = resize(dst, len(dst)+totalSize)
    73  
    74  	b := dst[len(dst)-totalSize:]
    75  	i := plain.ByteArrayLengthSize
    76  	j := 0
    77  
    78  	_ = length.values[:len(prefix.values)]
    79  
    80  	for k, p := range prefix.values {
    81  		n := p + length.values[k]
    82  		j += copy(b[j:], src[i+int(p):i+int(n)])
    83  		i += plain.ByteArrayLengthSize
    84  		i += int(n)
    85  	}
    86  
    87  	return dst, nil
    88  }
    89  
    90  func (e *ByteArrayEncoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) {
    91  	// The parquet specs say that this encoding is only supported for BYTE_ARRAY
    92  	// values, but the reference Java implementation appears to support
    93  	// FIXED_LEN_BYTE_ARRAY as well:
    94  	// https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java#L211
    95  	if size < 0 || size > encoding.MaxFixedLenByteArraySize {
    96  		return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument)
    97  	}
    98  	if (len(src) % size) != 0 {
    99  		return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "FIXED_LEN_BYTE_ARRAY", len(src))
   100  	}
   101  
   102  	prefix := getInt32Buffer()
   103  	defer putInt32Buffer(prefix)
   104  
   105  	length := getInt32Buffer()
   106  	defer putInt32Buffer(length)
   107  
   108  	totalSize := 0
   109  	lastValue := ([]byte)(nil)
   110  
   111  	for i := size; i <= len(src); i += size {
   112  		v := src[i-size : i : i]
   113  		p := linearSearchPrefixLength(lastValue, v)
   114  		n := size - p
   115  		prefix.values = append(prefix.values, int32(p))
   116  		length.values = append(length.values, int32(n))
   117  		lastValue = v
   118  		totalSize += n
   119  	}
   120  
   121  	dst = dst[:0]
   122  	dst = encodeInt32(dst, prefix.values)
   123  	dst = encodeInt32(dst, length.values)
   124  	dst = resize(dst, len(dst)+totalSize)
   125  
   126  	b := dst[len(dst)-totalSize:]
   127  	i := 0
   128  	j := 0
   129  
   130  	for _, p := range prefix.values {
   131  		j += copy(b[j:], src[i+int(p):i+size])
   132  		i += size
   133  	}
   134  
   135  	return dst, nil
   136  }
   137  
   138  func (e *ByteArrayEncoding) DecodeByteArray(dst, src []byte) ([]byte, error) {
   139  	dst = dst[:0]
   140  
   141  	prefix := getInt32Buffer()
   142  	defer putInt32Buffer(prefix)
   143  
   144  	suffix := getInt32Buffer()
   145  	defer putInt32Buffer(suffix)
   146  
   147  	var err error
   148  	src, err = prefix.decode(src)
   149  	if err != nil {
   150  		return dst, encoding.Errorf(e, "decoding prefix lengths: %w", err)
   151  	}
   152  	src, err = suffix.decode(src)
   153  	if err != nil {
   154  		return dst, encoding.Errorf(e, "decoding suffix lengths: %w", err)
   155  	}
   156  	if len(prefix.values) != len(suffix.values) {
   157  		return dst, encoding.Error(e, errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values)))
   158  	}
   159  	return decodeByteArray(dst, src, prefix.values, suffix.values)
   160  }
   161  
   162  func (e *ByteArrayEncoding) DecodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) {
   163  	dst = dst[:0]
   164  
   165  	if size < 0 || size > encoding.MaxFixedLenByteArraySize {
   166  		return dst, encoding.Error(e, encoding.ErrInvalidArgument)
   167  	}
   168  
   169  	prefix := getInt32Buffer()
   170  	defer putInt32Buffer(prefix)
   171  
   172  	suffix := getInt32Buffer()
   173  	defer putInt32Buffer(suffix)
   174  
   175  	var err error
   176  	src, err = prefix.decode(src)
   177  	if err != nil {
   178  		return dst, fmt.Errorf("decoding prefix lengths: %w", err)
   179  	}
   180  	src, err = suffix.decode(src)
   181  	if err != nil {
   182  		return dst, fmt.Errorf("decoding suffix lengths: %w", err)
   183  	}
   184  	if len(prefix.values) != len(suffix.values) {
   185  		return dst, errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values))
   186  	}
   187  	return decodeFixedLenByteArray(dst, src, size, prefix.values, suffix.values)
   188  }
   189  
   190  func linearSearchPrefixLength(base, data []byte) (n int) {
   191  	for n < len(base) && n < len(data) && base[n] == data[n] {
   192  		n++
   193  	}
   194  	return n
   195  }
   196  
   197  func binarySearchPrefixLength(base, data []byte) int {
   198  	n := len(base)
   199  	if n > len(data) {
   200  		n = len(data)
   201  	}
   202  	return sort.Search(n, func(i int) bool {
   203  		return !bytes.Equal(base[:i+1], data[:i+1])
   204  	})
   205  }