storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/encoding/delta-encode.go (about) 1 /* 2 * Minio Cloud Storage, (C) 2019 Minio, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package encoding 18 19 import ( 20 "fmt" 21 22 "storj.io/minio/pkg/s3select/internal/parquet-go/common" 23 "storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" 24 ) 25 26 const ( 27 blockSize = 128 28 miniBlockSize = 32 29 miniBlockCount = blockSize / miniBlockSize 30 ) 31 32 var deltaEncodeHeaderBytes []byte 33 34 func init() { 35 deltaEncodeHeaderBytes = varIntEncode(blockSize) 36 deltaEncodeHeaderBytes = append(deltaEncodeHeaderBytes, varIntEncode(miniBlockCount)...) 37 } 38 39 // Supported Types: BOOLEAN, INT32, INT64 40 func bitPackedEncode(values interface{}, bitWidth uint64, withHeader bool, parquetType parquet.Type) []byte { 41 var i64s []int64 42 switch parquetType { 43 case parquet.Type_BOOLEAN: 44 bs, ok := values.([]bool) 45 if !ok { 46 panic(fmt.Errorf("expected slice of bool")) 47 } 48 49 i64s = make([]int64, len(bs)) 50 for i := range bs { 51 if bs[i] { 52 i64s[i] = 1 53 } 54 } 55 case parquet.Type_INT32: 56 i32s, ok := values.([]int32) 57 if !ok { 58 panic(fmt.Errorf("expected slice of int32")) 59 } 60 61 for i := range i32s { 62 i64s[i] = int64(i32s[i]) 63 } 64 case parquet.Type_INT64: 65 var ok bool 66 i64s, ok = values.([]int64) 67 if !ok { 68 panic(fmt.Errorf("expected slice of int64")) 69 } 70 default: 71 panic(fmt.Errorf("%v parquet type unsupported", parquetType)) 72 } 73 74 if len(i64s) == 0 { 75 return nil 76 } 77 78 var valueByte byte 79 bitsSet := uint64(0) 80 bitsNeeded := uint64(8) 81 bitsToSet := bitWidth 82 value := i64s[0] 83 84 valueBytes := []byte{} 85 for i := 0; i < len(i64s); { 86 if bitsToSet >= bitsNeeded { 87 valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded)) 88 valueBytes = append(valueBytes, valueByte) 89 bitsToSet -= bitsNeeded 90 bitsSet += bitsNeeded 91 92 bitsNeeded = 8 93 valueByte = 0 94 95 if bitsToSet <= 0 && (i+1) < len(i64s) { 96 i++ 97 value = i64s[i] 98 bitsToSet = bitWidth 99 bitsSet = 0 100 } 101 } else { 102 valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded)) 103 i++ 104 105 if i < len(i64s) { 106 value = i64s[i] 107 } 108 109 bitsNeeded -= bitsToSet 110 bitsToSet = bitWidth 111 bitsSet = 0 112 } 113 } 114 115 if withHeader { 116 header := uint64(((len(i64s) / 8) << 1) | 1) 117 headerBytes := varIntEncode(header) 118 return append(headerBytes, valueBytes...) 119 } 120 121 return valueBytes 122 } 123 124 func deltaEncodeInt32s(i32s []int32) (data []byte) { 125 getValue := func(i32 int32) uint64 { 126 return uint64((i32 >> 31) ^ (i32 << 1)) 127 } 128 129 data = append(data, deltaEncodeHeaderBytes...) 130 data = append(data, varIntEncode(uint64(len(i32s)))...) 131 data = append(data, varIntEncode(getValue(i32s[0]))...) 132 133 for i := 1; i < len(i32s); { 134 block := []int32{} 135 minDelta := int32(0x7FFFFFFF) 136 137 for ; i < len(i32s) && len(block) < blockSize; i++ { 138 delta := i32s[i] - i32s[i-1] 139 block = append(block, delta) 140 if delta < minDelta { 141 minDelta = delta 142 } 143 } 144 145 for len(block) < blockSize { 146 block = append(block, minDelta) 147 } 148 149 bitWidths := make([]byte, miniBlockCount) 150 for j := 0; j < miniBlockCount; j++ { 151 maxValue := int32(0) 152 for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ { 153 block[k] -= minDelta 154 if block[k] > maxValue { 155 maxValue = block[k] 156 } 157 } 158 159 bitWidths[j] = byte(common.BitWidth(uint64(maxValue))) 160 } 161 162 minDeltaZigZag := getValue(minDelta) 163 data = append(data, varIntEncode(minDeltaZigZag)...) 164 data = append(data, bitWidths...) 165 166 for j := 0; j < miniBlockCount; j++ { 167 bitPacked := bitPackedEncode( 168 block[j*miniBlockSize:(j+1)*miniBlockSize], 169 uint64(bitWidths[j]), 170 false, 171 parquet.Type_INT32, 172 ) 173 data = append(data, bitPacked...) 174 } 175 } 176 177 return data 178 } 179 180 func deltaEncodeInt64s(i64s []int64) (data []byte) { 181 getValue := func(i64 int64) uint64 { 182 return uint64((i64 >> 63) ^ (i64 << 1)) 183 } 184 185 data = append(data, deltaEncodeHeaderBytes...) 186 data = append(data, varIntEncode(uint64(len(i64s)))...) 187 data = append(data, varIntEncode(getValue(i64s[0]))...) 188 189 for i := 1; i < len(i64s); { 190 block := []int64{} 191 minDelta := int64(0x7FFFFFFFFFFFFFFF) 192 193 for ; i < len(i64s) && len(block) < blockSize; i++ { 194 delta := i64s[i] - i64s[i-1] 195 block = append(block, delta) 196 if delta < minDelta { 197 minDelta = delta 198 } 199 } 200 201 for len(block) < blockSize { 202 block = append(block, minDelta) 203 } 204 205 bitWidths := make([]byte, miniBlockCount) 206 for j := 0; j < miniBlockCount; j++ { 207 maxValue := int64(0) 208 for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ { 209 block[k] -= minDelta 210 if block[k] > maxValue { 211 maxValue = block[k] 212 } 213 } 214 215 bitWidths[j] = byte(common.BitWidth(uint64(maxValue))) 216 } 217 218 minDeltaZigZag := getValue(minDelta) 219 data = append(data, varIntEncode(minDeltaZigZag)...) 220 data = append(data, bitWidths...) 221 222 for j := 0; j < miniBlockCount; j++ { 223 bitPacked := bitPackedEncode( 224 block[j*miniBlockSize:(j+1)*miniBlockSize], 225 uint64(bitWidths[j]), 226 false, 227 parquet.Type_INT64, 228 ) 229 data = append(data, bitPacked...) 230 } 231 } 232 233 return data 234 } 235 236 // DeltaEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5 237 // 238 // Supported Types: INT32, INT64. 239 func DeltaEncode(values interface{}, parquetType parquet.Type) []byte { 240 switch parquetType { 241 case parquet.Type_INT32: 242 i32s, ok := values.([]int32) 243 if !ok { 244 panic(fmt.Errorf("expected slice of int32")) 245 } 246 return deltaEncodeInt32s(i32s) 247 case parquet.Type_INT64: 248 i64s, ok := values.([]int64) 249 if !ok { 250 panic(fmt.Errorf("expected slice of int64")) 251 } 252 return deltaEncodeInt64s(i64s) 253 } 254 255 panic(fmt.Errorf("%v parquet type unsupported", parquetType)) 256 } 257 258 // DeltaLengthByteArrayEncode encodes bytes slices specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6 259 // 260 // Supported Types: BYTE_ARRAY 261 func DeltaLengthByteArrayEncode(bytesSlices [][]byte) (data []byte) { 262 lengths := make([]int32, len(bytesSlices)) 263 for i, bytes := range bytesSlices { 264 lengths[i] = int32(len(bytes)) 265 } 266 267 data = deltaEncodeInt32s(lengths) 268 for _, bytes := range bytesSlices { 269 data = append(data, []byte(bytes)...) 270 } 271 272 return data 273 } 274 275 // DeltaByteArrayEncode encodes sequence of strings values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7 276 // 277 // Supported Types: BYTE_ARRAY 278 func DeltaByteArrayEncode(bytesSlices [][]byte) (data []byte) { 279 prefixLengths := make([]int32, len(bytesSlices)) 280 suffixes := make([][]byte, len(bytesSlices)) 281 282 var i, j int 283 for i = 1; i < len(bytesSlices); i++ { 284 for j = 0; j < len(bytesSlices[i-1]) && j < len(bytesSlices[i]); j++ { 285 if bytesSlices[i-1][j] != bytesSlices[i][j] { 286 break 287 } 288 } 289 290 prefixLengths[i] = int32(j) 291 suffixes[i] = bytesSlices[i][j:] 292 } 293 294 data = deltaEncodeInt32s(prefixLengths) 295 return append(data, DeltaLengthByteArrayEncode(suffixes)...) 296 }