storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/decode.go (about) 1 /* 2 * Minio Cloud Storage, (C) 2018 Minio, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package parquet 18 19 import ( 20 "bytes" 21 "errors" 22 "fmt" 23 "math" 24 25 "storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" 26 ) 27 28 func i64sToi32s(i64s []int64) (i32s []int32) { 29 i32s = make([]int32, len(i64s)) 30 for i := range i64s { 31 i32s[i] = int32(i64s[i]) 32 } 33 34 return i32s 35 } 36 37 func readBitPacked(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) { 38 count := header * 8 39 40 if count == 0 { 41 return result, nil 42 } 43 44 if bitWidth == 0 { 45 return make([]int64, count), nil 46 } 47 48 data := make([]byte, header*bitWidth) 49 if _, err = reader.Read(data); err != nil { 50 return nil, err 51 } 52 53 var val, used, left, b uint64 54 55 valNeedBits := bitWidth 56 i := -1 57 for { 58 if left <= 0 { 59 i++ 60 if i >= len(data) { 61 break 62 } 63 64 b = uint64(data[i]) 65 left = 8 66 used = 0 67 } 68 69 if left >= valNeedBits { 70 val |= ((b >> used) & ((1 << valNeedBits) - 1)) << (bitWidth - valNeedBits) 71 result = append(result, int64(val)) 72 val = 0 73 left -= valNeedBits 74 used += valNeedBits 75 valNeedBits = bitWidth 76 } else { 77 val |= (b >> used) << (bitWidth - valNeedBits) 78 valNeedBits -= left 79 left = 0 80 } 81 } 82 83 return result, nil 84 } 85 86 func readBools(reader *bytes.Reader, count uint64) (result []bool, err error) { 87 i64s, err := readBitPacked(reader, count, 1) 88 if err != nil { 89 return nil, err 90 } 91 92 var i uint64 93 for i = 0; i < count; i++ { 94 result = append(result, i64s[i] > 0) 95 } 96 97 return result, nil 98 } 99 100 func readInt32s(reader *bytes.Reader, count uint64) (result []int32, err error) { 101 buf := make([]byte, 4) 102 103 var i uint64 104 for i = 0; i < count; i++ { 105 if _, err = reader.Read(buf); err != nil { 106 return nil, err 107 } 108 109 result = append(result, int32(bytesToUint32(buf))) 110 } 111 112 return result, nil 113 } 114 115 func readInt64s(reader *bytes.Reader, count uint64) (result []int64, err error) { 116 buf := make([]byte, 8) 117 118 var i uint64 119 for i = 0; i < count; i++ { 120 if _, err = reader.Read(buf); err != nil { 121 return nil, err 122 } 123 124 result = append(result, int64(bytesToUint64(buf))) 125 } 126 127 return result, nil 128 } 129 130 func readInt96s(reader *bytes.Reader, count uint64) (result [][]byte, err error) { 131 var i uint64 132 for i = 0; i < count; i++ { 133 buf := make([]byte, 12) 134 135 if _, err = reader.Read(buf); err != nil { 136 return nil, err 137 } 138 139 result = append(result, buf) 140 } 141 142 return result, nil 143 } 144 145 func readFloats(reader *bytes.Reader, count uint64) (result []float32, err error) { 146 buf := make([]byte, 4) 147 148 var i uint64 149 for i = 0; i < count; i++ { 150 if _, err = reader.Read(buf); err != nil { 151 return nil, err 152 } 153 154 result = append(result, math.Float32frombits(bytesToUint32(buf))) 155 } 156 157 return result, nil 158 } 159 160 func readDoubles(reader *bytes.Reader, count uint64) (result []float64, err error) { 161 buf := make([]byte, 8) 162 163 var i uint64 164 for i = 0; i < count; i++ { 165 if _, err = reader.Read(buf); err != nil { 166 return nil, err 167 } 168 169 result = append(result, math.Float64frombits(bytesToUint64(buf))) 170 } 171 172 return result, nil 173 } 174 175 func readByteArrays(reader *bytes.Reader, count uint64) (result [][]byte, err error) { 176 buf := make([]byte, 4) 177 var length uint32 178 var data []byte 179 180 var i uint64 181 for i = 0; i < count; i++ { 182 if _, err = reader.Read(buf); err != nil { 183 return nil, err 184 } 185 186 length = bytesToUint32(buf) 187 data = make([]byte, length) 188 if length > 0 { 189 if _, err = reader.Read(data); err != nil { 190 return nil, err 191 } 192 } 193 194 result = append(result, data) 195 } 196 197 return result, nil 198 } 199 200 func readFixedLenByteArrays(reader *bytes.Reader, count, length uint64) (result [][]byte, err error) { 201 var i uint64 202 for i = 0; i < count; i++ { 203 data := make([]byte, length) 204 if _, err = reader.Read(data); err != nil { 205 return nil, err 206 } 207 208 result = append(result, data) 209 } 210 211 return result, nil 212 } 213 214 func readValues(reader *bytes.Reader, dataType parquet.Type, count, length uint64) (interface{}, error) { 215 switch dataType { 216 case parquet.Type_BOOLEAN: 217 return readBools(reader, count) 218 case parquet.Type_INT32: 219 return readInt32s(reader, count) 220 case parquet.Type_INT64: 221 return readInt64s(reader, count) 222 case parquet.Type_INT96: 223 return readInt96s(reader, count) 224 case parquet.Type_FLOAT: 225 return readFloats(reader, count) 226 case parquet.Type_DOUBLE: 227 return readDoubles(reader, count) 228 case parquet.Type_BYTE_ARRAY: 229 return readByteArrays(reader, count) 230 case parquet.Type_FIXED_LEN_BYTE_ARRAY: 231 return readFixedLenByteArrays(reader, count, length) 232 } 233 234 return nil, fmt.Errorf("unknown parquet type %v", dataType) 235 } 236 237 func readUnsignedVarInt(reader *bytes.Reader) (v uint64, err error) { 238 var b byte 239 var shift uint64 240 241 for { 242 if b, err = reader.ReadByte(); err != nil { 243 return 0, err 244 } 245 246 if v |= ((uint64(b) & 0x7F) << shift); b&0x80 == 0 { 247 break 248 } 249 250 shift += 7 251 } 252 253 return v, nil 254 } 255 256 func readRLE(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) { 257 width := (bitWidth + 7) / 8 258 data := make([]byte, width) 259 if width > 0 { 260 if _, err = reader.Read(data); err != nil { 261 return nil, err 262 } 263 } 264 265 if width < 4 { 266 data = append(data, make([]byte, 4-width)...) 267 } 268 269 val := int64(bytesToUint32(data)) 270 count := header >> 1 271 if count > math.MaxInt64/8 { 272 // 8 bytes/element. 273 return nil, errors.New("parquet: size too large") 274 } 275 result = make([]int64, count) 276 for i := range result { 277 result[i] = val 278 } 279 280 return result, nil 281 } 282 283 func readRLEBitPackedHybrid(reader *bytes.Reader, length, bitWidth uint64) (result []int64, err error) { 284 if length <= 0 { 285 var i32s []int32 286 i32s, err = readInt32s(reader, 1) 287 if err != nil { 288 return nil, err 289 } 290 if i32s[0] < 0 { 291 return nil, errors.New("parquet: negative RLEBitPackedHybrid length") 292 } 293 length = uint64(i32s[0]) 294 } 295 296 buf := make([]byte, length) 297 if _, err = reader.Read(buf); err != nil { 298 return nil, err 299 } 300 301 reader = bytes.NewReader(buf) 302 for reader.Len() > 0 { 303 header, err := readUnsignedVarInt(reader) 304 if err != nil { 305 return nil, err 306 } 307 308 var i64s []int64 309 if header&1 == 0 { 310 i64s, err = readRLE(reader, header, bitWidth) 311 } else { 312 i64s, err = readBitPacked(reader, header>>1, bitWidth) 313 } 314 315 if err != nil { 316 return nil, err 317 } 318 319 result = append(result, i64s...) 320 } 321 322 return result, nil 323 } 324 325 func readDeltaBinaryPackedInt(reader *bytes.Reader) (result []int64, err error) { 326 blockSize, err := readUnsignedVarInt(reader) 327 if err != nil { 328 return nil, err 329 } 330 331 numMiniblocksInBlock, err := readUnsignedVarInt(reader) 332 if err != nil { 333 return nil, err 334 } 335 336 numValues, err := readUnsignedVarInt(reader) 337 if err != nil { 338 return nil, err 339 } 340 341 firstValueZigZag, err := readUnsignedVarInt(reader) 342 if err != nil { 343 return nil, err 344 } 345 346 v := int64(firstValueZigZag>>1) ^ (-int64(firstValueZigZag & 1)) 347 result = append(result, v) 348 if numMiniblocksInBlock == 0 { 349 return nil, errors.New("parquet: zero mini blocks in block") 350 } 351 numValuesInMiniBlock := blockSize / numMiniblocksInBlock 352 353 bitWidths := make([]uint64, numMiniblocksInBlock) 354 for uint64(len(result)) < numValues { 355 minDeltaZigZag, err := readUnsignedVarInt(reader) 356 if err != nil { 357 return nil, err 358 } 359 360 for i := 0; uint64(i) < numMiniblocksInBlock; i++ { 361 b, err := reader.ReadByte() 362 if err != nil { 363 return nil, err 364 } 365 bitWidths[i] = uint64(b) 366 } 367 368 minDelta := int64(minDeltaZigZag>>1) ^ (-int64(minDeltaZigZag & 1)) 369 for i := 0; uint64(i) < numMiniblocksInBlock; i++ { 370 i64s, err := readBitPacked(reader, numValuesInMiniBlock/8, bitWidths[i]) 371 if err != nil { 372 return nil, err 373 } 374 375 for j := range i64s { 376 v += i64s[j] + minDelta 377 result = append(result, v) 378 } 379 } 380 } 381 382 return result[:numValues], nil 383 } 384 385 func readDeltaLengthByteArrays(reader *bytes.Reader) (result [][]byte, err error) { 386 i64s, err := readDeltaBinaryPackedInt(reader) 387 if err != nil { 388 return nil, err 389 } 390 391 for i := 0; i < len(i64s); i++ { 392 arrays, err := readFixedLenByteArrays(reader, 1, uint64(i64s[i])) 393 if err != nil { 394 return nil, err 395 } 396 397 result = append(result, arrays[0]) 398 } 399 400 return result, nil 401 } 402 403 func readDeltaByteArrays(reader *bytes.Reader) (result [][]byte, err error) { 404 i64s, err := readDeltaBinaryPackedInt(reader) 405 if err != nil { 406 return nil, err 407 } 408 409 suffixes, err := readDeltaLengthByteArrays(reader) 410 if err != nil { 411 return nil, err 412 } 413 414 result = append(result, suffixes[0]) 415 for i := 1; i < len(i64s); i++ { 416 prefixLength := i64s[i] 417 val := append([]byte{}, result[i-1][:prefixLength]...) 418 val = append(val, suffixes[i]...) 419 result = append(result, val) 420 } 421 422 return result, nil 423 } 424 425 func readDataPageValues( 426 bytesReader *bytes.Reader, 427 encoding parquet.Encoding, 428 dataType parquet.Type, 429 convertedType parquet.ConvertedType, 430 count, bitWidth uint64, 431 ) (result interface{}, resultDataType parquet.Type, err error) { 432 switch encoding { 433 case parquet.Encoding_PLAIN: 434 result, err = readValues(bytesReader, dataType, count, bitWidth) 435 return result, dataType, err 436 437 case parquet.Encoding_PLAIN_DICTIONARY: 438 b, err := bytesReader.ReadByte() 439 if err != nil { 440 return nil, -1, err 441 } 442 443 i64s, err := readRLEBitPackedHybrid(bytesReader, uint64(bytesReader.Len()), uint64(b)) 444 if err != nil { 445 return nil, -1, err 446 } 447 if len(i64s) < int(count) || count > math.MaxInt64/8 { 448 return nil, -1, errors.New("parquet: value out of range") 449 } 450 return i64s[:count], parquet.Type_INT64, nil 451 452 case parquet.Encoding_RLE: 453 i64s, err := readRLEBitPackedHybrid(bytesReader, 0, bitWidth) 454 if err != nil { 455 return nil, -1, err 456 } 457 458 if len(i64s) < int(count) || count > math.MaxInt64/8 { 459 return nil, -1, errors.New("parquet: value out of range") 460 } 461 i64s = i64s[:count] 462 463 if dataType == parquet.Type_INT32 { 464 return i64sToi32s(i64s), parquet.Type_INT32, nil 465 } 466 467 return i64s, parquet.Type_INT64, nil 468 469 case parquet.Encoding_BIT_PACKED: 470 return nil, -1, fmt.Errorf("deprecated parquet encoding %v", parquet.Encoding_BIT_PACKED) 471 472 case parquet.Encoding_DELTA_BINARY_PACKED: 473 i64s, err := readDeltaBinaryPackedInt(bytesReader) 474 if err != nil { 475 return nil, -1, err 476 } 477 478 if len(i64s) < int(count) || count > math.MaxInt64/8 { 479 return nil, -1, errors.New("parquet: value out of range") 480 } 481 i64s = i64s[:count] 482 483 if dataType == parquet.Type_INT32 { 484 return i64sToi32s(i64s), parquet.Type_INT32, nil 485 } 486 487 return i64s, parquet.Type_INT64, nil 488 489 case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: 490 byteSlices, err := readDeltaLengthByteArrays(bytesReader) 491 if err != nil { 492 return nil, -1, err 493 } 494 if len(byteSlices) < int(count) || count > math.MaxInt64/24 { 495 return nil, -1, errors.New("parquet: value out of range") 496 } 497 498 return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil 499 500 case parquet.Encoding_DELTA_BYTE_ARRAY: 501 byteSlices, err := readDeltaByteArrays(bytesReader) 502 if err != nil { 503 return nil, -1, err 504 } 505 if len(byteSlices) < int(count) || count > math.MaxInt64/24 { 506 return nil, -1, errors.New("parquet: value out of range") 507 } 508 509 return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil 510 } 511 512 return nil, -1, fmt.Errorf("unsupported parquet encoding %v", encoding) 513 }