github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/encoding/delta/byte_array.go (about) 1 package delta 2 3 import ( 4 "bytes" 5 "sort" 6 7 "github.com/parquet-go/parquet-go/encoding" 8 "github.com/parquet-go/parquet-go/format" 9 ) 10 11 const ( 12 maxLinearSearchPrefixLength = 64 // arbitrary 13 ) 14 15 type ByteArrayEncoding struct { 16 encoding.NotSupported 17 } 18 19 func (e *ByteArrayEncoding) String() string { 20 return "DELTA_BYTE_ARRAY" 21 } 22 23 func (e *ByteArrayEncoding) Encoding() format.Encoding { 24 return format.DeltaByteArray 25 } 26 27 func (e *ByteArrayEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) { 28 prefix := getInt32Buffer() 29 defer putInt32Buffer(prefix) 30 31 length := getInt32Buffer() 32 defer putInt32Buffer(length) 33 34 totalSize := 0 35 if len(offsets) > 0 { 36 lastValue := ([]byte)(nil) 37 baseOffset := offsets[0] 38 39 for _, endOffset := range offsets[1:] { 40 v := src[baseOffset:endOffset:endOffset] 41 n := int(endOffset - baseOffset) 42 p := 0 43 baseOffset = endOffset 44 45 if len(v) <= maxLinearSearchPrefixLength { 46 p = linearSearchPrefixLength(lastValue, v) 47 } else { 48 p = binarySearchPrefixLength(lastValue, v) 49 } 50 51 prefix.values = append(prefix.values, int32(p)) 52 length.values = append(length.values, int32(n-p)) 53 lastValue = v 54 totalSize += n - p 55 } 56 } 57 58 dst = dst[:0] 59 dst = encodeInt32(dst, prefix.values) 60 dst = encodeInt32(dst, length.values) 61 dst = resize(dst, len(dst)+totalSize) 62 63 if len(offsets) > 0 { 64 b := dst[len(dst)-totalSize:] 65 i := int(offsets[0]) 66 j := 0 67 68 _ = length.values[:len(prefix.values)] 69 70 for k, p := range prefix.values { 71 n := p + length.values[k] 72 j += copy(b[j:], src[i+int(p):i+int(n)]) 73 i += int(n) 74 } 75 } 76 77 return dst, nil 78 } 79 80 func (e *ByteArrayEncoding) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) { 81 // The parquet specs say that this encoding is only supported for BYTE_ARRAY 82 // values, but the reference Java implementation appears to support 83 // FIXED_LEN_BYTE_ARRAY as well: 84 // https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java#L211 85 if size < 0 || size > encoding.MaxFixedLenByteArraySize { 86 return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument) 87 } 88 if (len(src) % size) != 0 { 89 return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "FIXED_LEN_BYTE_ARRAY", len(src)) 90 } 91 92 prefix := getInt32Buffer() 93 defer putInt32Buffer(prefix) 94 95 length := getInt32Buffer() 96 defer putInt32Buffer(length) 97 98 totalSize := 0 99 lastValue := ([]byte)(nil) 100 101 for i := size; i <= len(src); i += size { 102 v := src[i-size : i : i] 103 p := linearSearchPrefixLength(lastValue, v) 104 n := size - p 105 prefix.values = append(prefix.values, int32(p)) 106 length.values = append(length.values, int32(n)) 107 lastValue = v 108 totalSize += n 109 } 110 111 dst = dst[:0] 112 dst = encodeInt32(dst, prefix.values) 113 dst = encodeInt32(dst, length.values) 114 dst = resize(dst, len(dst)+totalSize) 115 116 b := dst[len(dst)-totalSize:] 117 i := 0 118 j := 0 119 120 for _, p := range prefix.values { 121 j += copy(b[j:], src[i+int(p):i+size]) 122 i += size 123 } 124 125 return dst, nil 126 } 127 128 func (e *ByteArrayEncoding) DecodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, []uint32, error) { 129 dst, offsets = dst[:0], offsets[:0] 130 131 prefix := getInt32Buffer() 132 defer putInt32Buffer(prefix) 133 134 suffix := getInt32Buffer() 135 defer putInt32Buffer(suffix) 136 137 var err error 138 src, err = prefix.decode(src) 139 if err != nil { 140 return dst, offsets, e.wrapf("decoding prefix lengths: %w", err) 141 } 142 src, err = suffix.decode(src) 143 if err != nil { 144 return dst, offsets, e.wrapf("decoding suffix lengths: %w", err) 145 } 146 if len(prefix.values) != len(suffix.values) { 147 return dst, offsets, e.wrap(errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values))) 148 } 149 return decodeByteArray(dst, src, prefix.values, suffix.values, offsets) 150 } 151 152 func (e *ByteArrayEncoding) DecodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) { 153 dst = dst[:0] 154 155 if size < 0 || size > encoding.MaxFixedLenByteArraySize { 156 return dst, e.wrap(encoding.ErrInvalidArgument) 157 } 158 159 prefix := getInt32Buffer() 160 defer putInt32Buffer(prefix) 161 162 suffix := getInt32Buffer() 163 defer putInt32Buffer(suffix) 164 165 var err error 166 src, err = prefix.decode(src) 167 if err != nil { 168 return dst, e.wrapf("decoding prefix lengths: %w", err) 169 } 170 src, err = suffix.decode(src) 171 if err != nil { 172 return dst, e.wrapf("decoding suffix lengths: %w", err) 173 } 174 if len(prefix.values) != len(suffix.values) { 175 return dst, e.wrap(errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values))) 176 } 177 return decodeFixedLenByteArray(dst[:0], src, size, prefix.values, suffix.values) 178 } 179 180 func (e *ByteArrayEncoding) EstimateDecodeByteArraySize(src []byte) int { 181 length := getInt32Buffer() 182 defer putInt32Buffer(length) 183 src, _ = length.decode(src) 184 sum := int(length.sum()) 185 length.decode(src) 186 return sum + int(length.sum()) 187 } 188 189 func (e *ByteArrayEncoding) wrap(err error) error { 190 if err != nil { 191 err = encoding.Error(e, err) 192 } 193 return err 194 } 195 196 func (e *ByteArrayEncoding) wrapf(msg string, args ...interface{}) error { 197 return encoding.Errorf(e, msg, args...) 198 } 199 200 func linearSearchPrefixLength(base, data []byte) (n int) { 201 for n < len(base) && n < len(data) && base[n] == data[n] { 202 n++ 203 } 204 return n 205 } 206 207 func binarySearchPrefixLength(base, data []byte) int { 208 n := len(base) 209 if n > len(data) { 210 n = len(data) 211 } 212 return sort.Search(n, func(i int) bool { 213 return !bytes.Equal(base[:i+1], data[:i+1]) 214 }) 215 }