github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/byte_array.go (about) 1 package delta 2 3 import ( 4 "bytes" 5 "fmt" 6 "sort" 7 8 "github.com/vc42/parquet-go/encoding" 9 "github.com/vc42/parquet-go/encoding/plain" 10 "github.com/vc42/parquet-go/format" 11 ) 12 13 const ( 14 maxLinearSearchPrefixLength = 64 // arbitrary 15 ) 16 17 type ByteArrayEncoding struct { 18 encoding.NotSupported 19 } 20 21 func (e *ByteArrayEncoding) String() string { 22 return "DELTA_BYTE_ARRAY" 23 } 24 25 func (e *ByteArrayEncoding) Encoding() format.Encoding { 26 return format.DeltaByteArray 27 } 28 29 func (e *ByteArrayEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) { 30 prefix := getInt32Buffer() 31 defer putInt32Buffer(prefix) 32 33 length := getInt32Buffer() 34 defer putInt32Buffer(length) 35 36 totalSize := 0 37 lastValue := ([]byte)(nil) 38 39 for i := 0; i < len(src); { 40 r := len(src) - i 41 if r < plain.ByteArrayLengthSize { 42 return dst[:0], plain.ErrTooShort(r) 43 } 44 n := plain.ByteArrayLength(src[i:]) 45 i += plain.ByteArrayLengthSize 46 r -= plain.ByteArrayLengthSize 47 if n > r { 48 return dst[:0], plain.ErrTooShort(n) 49 } 50 if n > plain.MaxByteArrayLength { 51 return dst[:0], plain.ErrTooLarge(n) 52 } 53 v := src[i : i+n : i+n] 54 p := 0 55 56 if len(v) <= maxLinearSearchPrefixLength { 57 p = linearSearchPrefixLength(lastValue, v) 58 } else { 59 p = binarySearchPrefixLength(lastValue, v) 60 } 61 62 prefix.values = append(prefix.values, int32(p)) 63 length.values = append(length.values, int32(n-p)) 64 lastValue = v 65 totalSize += n - p 66 i += n 67 } 68 69 dst = dst[:0] 70 dst = encodeInt32(dst, prefix.values) 71 dst = encodeInt32(dst, length.values) 72 dst = resize(dst, len(dst)+totalSize) 73 74 b := dst[len(dst)-totalSize:] 75 i := plain.ByteArrayLengthSize 76 j := 0 77 78 _ = length.values[:len(prefix.values)] 79 80 for k, p := range prefix.values { 81 n := p + length.values[k] 82 j += copy(b[j:], src[i+int(p):i+int(n)]) 83 i += plain.ByteArrayLengthSize 84 i += int(n) 85 } 86 87 return dst, nil 88 } 89 90 func (e *ByteArrayEncoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { 91 // The parquet specs say that this encoding is only supported for BYTE_ARRAY 92 // values, but the reference Java implementation appears to support 93 // FIXED_LEN_BYTE_ARRAY as well: 94 // https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java#L211 95 if size < 0 || size > encoding.MaxFixedLenByteArraySize { 96 return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument) 97 } 98 if (len(src) % size) != 0 { 99 return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "FIXED_LEN_BYTE_ARRAY", len(src)) 100 } 101 102 prefix := getInt32Buffer() 103 defer putInt32Buffer(prefix) 104 105 length := getInt32Buffer() 106 defer putInt32Buffer(length) 107 108 totalSize := 0 109 lastValue := ([]byte)(nil) 110 111 for i := size; i <= len(src); i += size { 112 v := src[i-size : i : i] 113 p := linearSearchPrefixLength(lastValue, v) 114 n := size - p 115 prefix.values = append(prefix.values, int32(p)) 116 length.values = append(length.values, int32(n)) 117 lastValue = v 118 totalSize += n 119 } 120 121 dst = dst[:0] 122 dst = encodeInt32(dst, prefix.values) 123 dst = encodeInt32(dst, length.values) 124 dst = resize(dst, len(dst)+totalSize) 125 126 b := dst[len(dst)-totalSize:] 127 i := 0 128 j := 0 129 130 for _, p := range prefix.values { 131 j += copy(b[j:], src[i+int(p):i+size]) 132 i += size 133 } 134 135 return dst, nil 136 } 137 138 func (e *ByteArrayEncoding) DecodeByteArray(dst, src []byte) ([]byte, error) { 139 dst = dst[:0] 140 141 prefix := getInt32Buffer() 142 defer putInt32Buffer(prefix) 143 144 suffix := getInt32Buffer() 145 defer putInt32Buffer(suffix) 146 147 var err error 148 src, err = prefix.decode(src) 149 if err != nil { 150 return dst, encoding.Errorf(e, "decoding prefix lengths: %w", err) 151 } 152 src, err = suffix.decode(src) 153 if err != nil { 154 return dst, encoding.Errorf(e, "decoding suffix lengths: %w", err) 155 } 156 if len(prefix.values) != len(suffix.values) { 157 return dst, encoding.Error(e, errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values))) 158 } 159 return decodeByteArray(dst, src, prefix.values, suffix.values) 160 } 161 162 func (e *ByteArrayEncoding) DecodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { 163 dst = dst[:0] 164 165 if size < 0 || size > encoding.MaxFixedLenByteArraySize { 166 return dst, encoding.Error(e, encoding.ErrInvalidArgument) 167 } 168 169 prefix := getInt32Buffer() 170 defer putInt32Buffer(prefix) 171 172 suffix := getInt32Buffer() 173 defer putInt32Buffer(suffix) 174 175 var err error 176 src, err = prefix.decode(src) 177 if err != nil { 178 return dst, fmt.Errorf("decoding prefix lengths: %w", err) 179 } 180 src, err = suffix.decode(src) 181 if err != nil { 182 return dst, fmt.Errorf("decoding suffix lengths: %w", err) 183 } 184 if len(prefix.values) != len(suffix.values) { 185 return dst, errPrefixAndSuffixLengthMismatch(len(prefix.values), len(suffix.values)) 186 } 187 return decodeFixedLenByteArray(dst, src, size, prefix.values, suffix.values) 188 } 189 190 func linearSearchPrefixLength(base, data []byte) (n int) { 191 for n < len(base) && n < len(data) && base[n] == data[n] { 192 n++ 193 } 194 return n 195 } 196 197 func binarySearchPrefixLength(base, data []byte) int { 198 n := len(base) 199 if n > len(data) { 200 n = len(data) 201 } 202 return sort.Search(n, func(i int) bool { 203 return !bytes.Equal(base[:i+1], data[:i+1]) 204 }) 205 }