storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/encode.go (about) 1 /* 2 * Minio Cloud Storage, (C) 2019 Minio, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package parquet 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "errors" 23 "fmt" 24 "math" 25 26 "storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" 27 ) 28 29 func boolsToBytes(bs []bool) []byte { 30 size := (len(bs) + 7) / 8 31 result := make([]byte, size) 32 for i := range bs { 33 if bs[i] { 34 result[i/8] |= 1 << uint32(i%8) 35 } 36 } 37 38 return result 39 } 40 41 func int32sToBytes(i32s []int32) []byte { 42 buf := make([]byte, 4*len(i32s)) 43 for i, i32 := range i32s { 44 binary.LittleEndian.PutUint32(buf[i*4:], uint32(i32)) 45 } 46 return buf 47 } 48 49 func int64sToBytes(i64s []int64) []byte { 50 buf := make([]byte, 8*len(i64s)) 51 for i, i64 := range i64s { 52 binary.LittleEndian.PutUint64(buf[i*8:], uint64(i64)) 53 } 54 return buf 55 } 56 57 func float32sToBytes(f32s []float32) []byte { 58 buf := make([]byte, 4*len(f32s)) 59 for i, f32 := range f32s { 60 binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(f32)) 61 } 62 return buf 63 } 64 65 func float64sToBytes(f64s []float64) []byte { 66 buf := make([]byte, 8*len(f64s)) 67 for i, f64 := range f64s { 68 binary.LittleEndian.PutUint64(buf[i*8:], math.Float64bits(f64)) 69 } 70 return buf 71 } 72 73 func byteSlicesToBytes(byteSlices [][]byte) []byte { 74 buf := new(bytes.Buffer) 75 for _, s := range byteSlices { 76 if err := binary.Write(buf, binary.LittleEndian, uint32(len(s))); err != nil { 77 panic(err) 78 } 79 80 if _, err := buf.Write(s); err != nil { 81 panic(err) 82 } 83 } 84 85 return buf.Bytes() 86 } 87 88 func byteArraysToBytes(arrayList [][]byte) []byte { 89 buf := new(bytes.Buffer) 90 arrayLen := -1 91 for _, array := range arrayList { 92 if arrayLen != -1 && len(array) != arrayLen { 93 panic(errors.New("array list does not have same length")) 94 } 95 96 arrayLen = len(array) 97 if _, err := buf.Write(array); err != nil { 98 panic(err) 99 } 100 } 101 102 return buf.Bytes() 103 } 104 105 func int96sToBytes(i96s [][]byte) []byte { 106 return byteArraysToBytes(i96s) 107 } 108 109 func valuesToBytes(values interface{}, dataType parquet.Type) []byte { 110 switch dataType { 111 case parquet.Type_BOOLEAN: 112 return boolsToBytes(values.([]bool)) 113 case parquet.Type_INT32: 114 return int32sToBytes(values.([]int32)) 115 case parquet.Type_INT64: 116 return int64sToBytes(values.([]int64)) 117 case parquet.Type_INT96: 118 return int96sToBytes(values.([][]byte)) 119 case parquet.Type_FLOAT: 120 return float32sToBytes(values.([]float32)) 121 case parquet.Type_DOUBLE: 122 return float64sToBytes(values.([]float64)) 123 case parquet.Type_BYTE_ARRAY: 124 return byteSlicesToBytes(values.([][]byte)) 125 case parquet.Type_FIXED_LEN_BYTE_ARRAY: 126 return byteArraysToBytes(values.([][]byte)) 127 } 128 129 return []byte{} 130 } 131 132 func valueToBytes(value interface{}, dataType parquet.Type) []byte { 133 var values interface{} 134 switch dataType { 135 case parquet.Type_BOOLEAN: 136 values = []bool{value.(bool)} 137 case parquet.Type_INT32: 138 values = []int32{value.(int32)} 139 case parquet.Type_INT64: 140 values = []int64{value.(int64)} 141 case parquet.Type_INT96: 142 values = [][]byte{value.([]byte)} 143 case parquet.Type_FLOAT: 144 values = []float32{value.(float32)} 145 case parquet.Type_DOUBLE: 146 values = []float64{value.(float64)} 147 case parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY: 148 values = [][]byte{value.([]byte)} 149 } 150 151 return valuesToBytes(values, dataType) 152 } 153 154 func unsignedVarIntToBytes(ui64 uint64) []byte { 155 size := (getBitWidth(ui64) + 6) / 7 156 if size == 0 { 157 return []byte{0} 158 } 159 160 buf := make([]byte, size) 161 for i := uint64(0); i < size; i++ { 162 buf[i] = byte(ui64&0x7F) | 0x80 163 ui64 >>= 7 164 } 165 buf[size-1] &= 0x7F 166 167 return buf 168 } 169 170 func valuesToRLEBytes(values interface{}, bitWidth int32, valueType parquet.Type) []byte { 171 vals := valuesToInterfaces(values, valueType) 172 result := []byte{} 173 j := 0 174 for i := 0; i < len(vals); i = j { 175 for j = i + 1; j < len(vals) && vals[i] == vals[j]; j++ { 176 } 177 headerBytes := unsignedVarIntToBytes(uint64((j - i) << 1)) 178 result = append(result, headerBytes...) 179 180 valBytes := valueToBytes(vals[i], valueType) 181 byteCount := (bitWidth + 7) / 8 182 result = append(result, valBytes[:byteCount]...) 183 } 184 185 return result 186 } 187 188 func valuesToRLEBitPackedHybridBytes(values interface{}, bitWidth int32, dataType parquet.Type) []byte { 189 rleBytes := valuesToRLEBytes(values, bitWidth, dataType) 190 lenBytes := valueToBytes(int32(len(rleBytes)), parquet.Type_INT32) 191 return append(lenBytes, rleBytes...) 192 } 193 194 func valuesToBitPackedBytes(values interface{}, bitWidth int64, withHeader bool, dataType parquet.Type) []byte { 195 var i64s []int64 196 switch dataType { 197 case parquet.Type_BOOLEAN: 198 bs := values.([]bool) 199 i64s = make([]int64, len(bs)) 200 for i := range bs { 201 if bs[i] { 202 i64s[i] = 1 203 } 204 } 205 case parquet.Type_INT32: 206 i32s := values.([]int32) 207 i64s = make([]int64, len(i32s)) 208 for i := range i32s { 209 i64s[i] = int64(i32s[i]) 210 } 211 case parquet.Type_INT64: 212 i64s = values.([]int64) 213 default: 214 panic(fmt.Errorf("data type %v is not supported for bit packing", dataType)) 215 } 216 217 if len(i64s) == 0 { 218 return nil 219 } 220 221 var valueByte byte 222 bitsSet := uint64(0) 223 bitsNeeded := uint64(8) 224 bitsToSet := uint64(bitWidth) 225 value := i64s[0] 226 227 valueBytes := []byte{} 228 for i := 0; i < len(i64s); { 229 if bitsToSet >= bitsNeeded { 230 valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded)) 231 valueBytes = append(valueBytes, valueByte) 232 bitsToSet -= bitsNeeded 233 bitsSet += bitsNeeded 234 235 bitsNeeded = 8 236 valueByte = 0 237 238 if bitsToSet <= 0 && (i+1) < len(i64s) { 239 i++ 240 value = i64s[i] 241 bitsToSet = uint64(bitWidth) 242 bitsSet = 0 243 } 244 } else { 245 valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded)) 246 i++ 247 248 if i < len(i64s) { 249 value = i64s[i] 250 } 251 252 bitsNeeded -= bitsToSet 253 bitsToSet = uint64(bitWidth) 254 bitsSet = 0 255 } 256 } 257 258 if withHeader { 259 header := uint64(((len(i64s) / 8) << 1) | 1) 260 headerBytes := unsignedVarIntToBytes(header) 261 return append(headerBytes, valueBytes...) 262 } 263 264 return valueBytes 265 } 266 267 const ( 268 blockSize = 128 269 subBlockSize = 32 270 subBlockCount = blockSize / subBlockSize 271 ) 272 273 var ( 274 blockSizeBytes = unsignedVarIntToBytes(blockSize) 275 subBlockCountBytes = unsignedVarIntToBytes(subBlockCount) 276 ) 277 278 func int32ToDeltaBytes(i32s []int32) []byte { 279 getValue := func(i32 int32) uint64 { 280 return uint64((i32 >> 31) ^ (i32 << 1)) 281 } 282 283 result := append([]byte{}, blockSizeBytes...) 284 result = append(result, subBlockCountBytes...) 285 result = append(result, unsignedVarIntToBytes(uint64(len(i32s)))...) 286 result = append(result, unsignedVarIntToBytes(getValue(i32s[0]))...) 287 288 for i := 1; i < len(i32s); { 289 block := []int32{} 290 minDelta := int32(0x7FFFFFFF) 291 292 for ; i < len(i32s) && len(block) < blockSize; i++ { 293 delta := i32s[i] - i32s[i-1] 294 block = append(block, delta) 295 if delta < minDelta { 296 minDelta = delta 297 } 298 } 299 300 for len(block) < blockSize { 301 block = append(block, minDelta) 302 } 303 304 bitWidths := make([]byte, subBlockCount) 305 for j := 0; j < subBlockCount; j++ { 306 maxValue := int32(0) 307 for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ { 308 block[k] -= minDelta 309 if block[k] > maxValue { 310 maxValue = block[k] 311 } 312 } 313 314 bitWidths[j] = byte(getBitWidth(uint64(maxValue))) 315 } 316 317 minDeltaZigZag := getValue(minDelta) 318 result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...) 319 result = append(result, bitWidths...) 320 321 for j := 0; j < subBlockCount; j++ { 322 bitPacked := valuesToBitPackedBytes( 323 block[j*subBlockSize:(j+1)*subBlockSize], 324 int64(bitWidths[j]), 325 false, 326 parquet.Type_INT32, 327 ) 328 result = append(result, bitPacked...) 329 } 330 } 331 332 return result 333 } 334 335 func int64ToDeltaBytes(i64s []int64) []byte { 336 getValue := func(i64 int64) uint64 { 337 return uint64((i64 >> 63) ^ (i64 << 1)) 338 } 339 340 result := append([]byte{}, blockSizeBytes...) 341 result = append(result, subBlockCountBytes...) 342 result = append(result, unsignedVarIntToBytes(uint64(len(i64s)))...) 343 result = append(result, unsignedVarIntToBytes(getValue(i64s[0]))...) 344 345 for i := 1; i < len(i64s); { 346 block := []int64{} 347 minDelta := int64(0x7FFFFFFFFFFFFFFF) 348 349 for ; i < len(i64s) && len(block) < blockSize; i++ { 350 delta := i64s[i] - i64s[i-1] 351 block = append(block, delta) 352 if delta < minDelta { 353 minDelta = delta 354 } 355 } 356 357 for len(block) < blockSize { 358 block = append(block, minDelta) 359 } 360 361 bitWidths := make([]byte, subBlockCount) 362 for j := 0; j < subBlockCount; j++ { 363 maxValue := int64(0) 364 for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ { 365 block[k] -= minDelta 366 if block[k] > maxValue { 367 maxValue = block[k] 368 } 369 } 370 371 bitWidths[j] = byte(getBitWidth(uint64(maxValue))) 372 } 373 374 minDeltaZigZag := getValue(minDelta) 375 result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...) 376 result = append(result, bitWidths...) 377 378 for j := 0; j < subBlockCount; j++ { 379 bitPacked := valuesToBitPackedBytes( 380 block[j*subBlockSize:(j+1)*subBlockSize], 381 int64(bitWidths[j]), 382 false, 383 parquet.Type_INT64, 384 ) 385 result = append(result, bitPacked...) 386 } 387 } 388 389 return result 390 } 391 392 func valuesToDeltaBytes(values interface{}, dataType parquet.Type) []byte { 393 switch dataType { 394 case parquet.Type_INT32: 395 return int32ToDeltaBytes(values.([]int32)) 396 case parquet.Type_INT64: 397 return int64ToDeltaBytes(values.([]int64)) 398 } 399 400 return nil 401 } 402 403 func stringsToDeltaLengthByteArrayBytes(strs []string) []byte { 404 lengths := make([]int32, len(strs)) 405 for i, s := range strs { 406 lengths[i] = int32(len(s)) 407 } 408 409 result := int32ToDeltaBytes(lengths) 410 for _, s := range strs { 411 result = append(result, []byte(s)...) 412 } 413 414 return result 415 } 416 417 func stringsToDeltaByteArrayBytes(strs []string) []byte { 418 prefixLengths := make([]int32, len(strs)) 419 suffixes := make([]string, len(strs)) 420 421 var i, j int 422 for i = 1; i < len(strs); i++ { 423 for j = 0; j < len(strs[i-1]) && j < len(strs[i]); j++ { 424 if strs[i-1][j] != strs[i][j] { 425 break 426 } 427 } 428 429 prefixLengths[i] = int32(j) 430 suffixes[i] = strs[i][j:] 431 } 432 433 result := int32ToDeltaBytes(prefixLengths) 434 return append(result, stringsToDeltaLengthByteArrayBytes(suffixes)...) 435 } 436 437 func encodeValues(values interface{}, dataType parquet.Type, encoding parquet.Encoding, bitWidth int32) []byte { 438 switch encoding { 439 case parquet.Encoding_RLE: 440 return valuesToRLEBitPackedHybridBytes(values, bitWidth, dataType) 441 case parquet.Encoding_DELTA_BINARY_PACKED: 442 return valuesToDeltaBytes(values, dataType) 443 case parquet.Encoding_DELTA_BYTE_ARRAY: 444 return stringsToDeltaByteArrayBytes(values.([]string)) 445 case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: 446 return stringsToDeltaLengthByteArrayBytes(values.([]string)) 447 } 448 449 return valuesToBytes(values, dataType) 450 }