github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/delta/binary_packed.go (about) 1 package delta 2 3 import ( 4 "encoding/binary" 5 "fmt" 6 "io" 7 "math" 8 "math/bits" 9 10 "github.com/vc42/parquet-go/encoding" 11 "github.com/vc42/parquet-go/format" 12 "github.com/vc42/parquet-go/internal/bitpack" 13 "github.com/vc42/parquet-go/internal/unsafecast" 14 ) 15 16 type BinaryPackedEncoding struct { 17 encoding.NotSupported 18 } 19 20 func (e *BinaryPackedEncoding) String() string { 21 return "DELTA_BINARY_PACKED" 22 } 23 24 func (e *BinaryPackedEncoding) Encoding() format.Encoding { 25 return format.DeltaBinaryPacked 26 } 27 28 func (e *BinaryPackedEncoding) EncodeInt32(dst, src []byte) ([]byte, error) { 29 if (len(src) % 4) != 0 { 30 return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "INT64", len(src)) 31 } 32 return encodeInt32(dst[:0], bytesToInt32(src)), nil 33 } 34 35 func (e *BinaryPackedEncoding) EncodeInt64(dst, src []byte) ([]byte, error) { 36 if (len(src) % 8) != 0 { 37 return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "INT64", len(src)) 38 } 39 return encodeInt64(dst[:0], bytesToInt64(src)), nil 40 } 41 42 func (e *BinaryPackedEncoding) DecodeInt32(dst, src []byte) ([]byte, error) { 43 dst, _, err := decodeInt32(dst[:0], src) 44 return dst, e.wrap(err) 45 } 46 47 func (e *BinaryPackedEncoding) DecodeInt64(dst, src []byte) ([]byte, error) { 48 dst, _, err := decodeInt64(dst[:0], src) 49 return dst, e.wrap(err) 50 } 51 52 func (e *BinaryPackedEncoding) wrap(err error) error { 53 if err != nil { 54 err = encoding.Error(e, err) 55 } 56 return err 57 } 58 59 const ( 60 blockSize = 128 61 numMiniBlocks = 4 62 miniBlockSize = blockSize / numMiniBlocks 63 // The parquet spec does not enforce a limit to the block size, but we need 64 // one otherwise invalid inputs may result in unbounded memory allocations. 65 // 66 // 65K+ values should be enough for any valid use case. 67 maxSupportedBlockSize = 65536 68 69 maxHeaderLength32 = 4 * binary.MaxVarintLen64 70 maxMiniBlockLength32 = binary.MaxVarintLen64 + numMiniBlocks + (4 * blockSize) 71 72 maxHeaderLength64 = 8 * binary.MaxVarintLen64 73 maxMiniBlockLength64 = binary.MaxVarintLen64 + numMiniBlocks + (8 * blockSize) 74 ) 75 76 var ( 77 encodeInt32 = encodeInt32Default 78 encodeInt64 = encodeInt64Default 79 ) 80 81 func encodeInt32Default(dst []byte, src []int32) []byte { 82 totalValues := len(src) 83 firstValue := int32(0) 84 if totalValues > 0 { 85 firstValue = src[0] 86 } 87 88 n := len(dst) 89 dst = resize(dst, n+maxHeaderLength32) 90 dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, int64(firstValue))] 91 92 if totalValues < 2 { 93 return dst 94 } 95 96 lastValue := firstValue 97 for i := 1; i < len(src); i += blockSize { 98 block := [blockSize]int32{} 99 blockLength := copy(block[:], src[i:]) 100 101 lastValue = blockDeltaInt32(&block, lastValue) 102 minDelta := blockMinInt32(&block) 103 blockSubInt32(&block, minDelta) 104 blockClearInt32(&block, blockLength) 105 106 bitWidths := [numMiniBlocks]byte{} 107 blockBitWidthsInt32(&bitWidths, &block) 108 109 n := len(dst) 110 dst = resize(dst, n+maxMiniBlockLength32+4) 111 n += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths) 112 113 for i, bitWidth := range bitWidths { 114 if bitWidth != 0 { 115 miniBlock := (*[miniBlockSize]int32)(block[i*miniBlockSize:]) 116 encodeMiniBlockInt32(dst[n:], miniBlock, uint(bitWidth)) 117 n += (miniBlockSize * int(bitWidth)) / 8 118 } 119 } 120 121 dst = dst[:n] 122 } 123 124 return dst 125 } 126 127 func encodeInt64Default(dst []byte, src []int64) []byte { 128 totalValues := len(src) 129 firstValue := int64(0) 130 if totalValues > 0 { 131 firstValue = src[0] 132 } 133 134 n := len(dst) 135 dst = resize(dst, n+maxHeaderLength64) 136 dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, firstValue)] 137 138 if totalValues < 2 { 139 return dst 140 } 141 142 lastValue := firstValue 143 for i := 1; i < len(src); i += blockSize { 144 block := [blockSize]int64{} 145 blockLength := copy(block[:], src[i:]) 146 147 lastValue = blockDeltaInt64(&block, lastValue) 148 minDelta := blockMinInt64(&block) 149 blockSubInt64(&block, minDelta) 150 blockClearInt64(&block, blockLength) 151 152 bitWidths := [numMiniBlocks]byte{} 153 blockBitWidthsInt64(&bitWidths, &block) 154 155 n := len(dst) 156 dst = resize(dst, n+maxMiniBlockLength64+8) 157 n += encodeBlockHeader(dst[n:], minDelta, bitWidths) 158 159 for i, bitWidth := range bitWidths { 160 if bitWidth != 0 { 161 miniBlock := (*[miniBlockSize]int64)(block[i*miniBlockSize:]) 162 encodeMiniBlockInt64(dst[n:], miniBlock, uint(bitWidth)) 163 n += (miniBlockSize * int(bitWidth)) / 8 164 } 165 } 166 167 dst = dst[:n] 168 } 169 170 return dst 171 } 172 173 func encodeBinaryPackedHeader(dst []byte, blockSize, numMiniBlocks, totalValues int, firstValue int64) (n int) { 174 n += binary.PutUvarint(dst[n:], uint64(blockSize)) 175 n += binary.PutUvarint(dst[n:], uint64(numMiniBlocks)) 176 n += binary.PutUvarint(dst[n:], uint64(totalValues)) 177 n += binary.PutVarint(dst[n:], firstValue) 178 return n 179 } 180 181 func encodeBlockHeader(dst []byte, minDelta int64, bitWidths [numMiniBlocks]byte) (n int) { 182 n += binary.PutVarint(dst, int64(minDelta)) 183 n += copy(dst[n:], bitWidths[:]) 184 return n 185 } 186 187 func blockClearInt32(block *[blockSize]int32, blockLength int) { 188 if blockLength < blockSize { 189 clear := block[blockLength:] 190 for i := range clear { 191 clear[i] = 0 192 } 193 } 194 } 195 196 func blockDeltaInt32(block *[blockSize]int32, lastValue int32) int32 { 197 for i, v := range block { 198 block[i], lastValue = v-lastValue, v 199 } 200 return lastValue 201 } 202 203 func blockMinInt32(block *[blockSize]int32) int32 { 204 min := block[0] 205 for _, v := range block[1:] { 206 if v < min { 207 min = v 208 } 209 } 210 return min 211 } 212 213 func blockSubInt32(block *[blockSize]int32, value int32) { 214 for i := range block { 215 block[i] -= value 216 } 217 } 218 219 func blockBitWidthsInt32(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32) { 220 for i := range bitWidths { 221 j := (i + 0) * miniBlockSize 222 k := (i + 1) * miniBlockSize 223 bitWidth := 0 224 225 for _, v := range block[j:k] { 226 if n := bits.Len32(uint32(v)); n > bitWidth { 227 bitWidth = n 228 } 229 } 230 231 bitWidths[i] = byte(bitWidth) 232 } 233 } 234 235 func blockClearInt64(block *[blockSize]int64, blockLength int) { 236 if blockLength < blockSize { 237 clear := block[blockLength:] 238 for i := range clear { 239 clear[i] = 0 240 } 241 } 242 } 243 244 func blockDeltaInt64(block *[blockSize]int64, lastValue int64) int64 { 245 for i, v := range block { 246 block[i], lastValue = v-lastValue, v 247 } 248 return lastValue 249 } 250 251 func blockMinInt64(block *[blockSize]int64) int64 { 252 min := block[0] 253 for _, v := range block[1:] { 254 if v < min { 255 min = v 256 } 257 } 258 return min 259 } 260 261 func blockSubInt64(block *[blockSize]int64, value int64) { 262 for i := range block { 263 block[i] -= value 264 } 265 } 266 267 func blockBitWidthsInt64(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64) { 268 for i := range bitWidths { 269 j := (i + 0) * miniBlockSize 270 k := (i + 1) * miniBlockSize 271 bitWidth := 0 272 273 for _, v := range block[j:k] { 274 if n := bits.Len64(uint64(v)); n > bitWidth { 275 bitWidth = n 276 } 277 } 278 279 bitWidths[i] = byte(bitWidth) 280 } 281 } 282 283 func decodeInt32(dst, src []byte) ([]byte, []byte, error) { 284 blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src) 285 if err != nil { 286 return dst, src, err 287 } 288 if totalValues == 0 { 289 return dst, src, nil 290 } 291 if firstValue < math.MinInt32 || firstValue > math.MaxInt32 { 292 return dst, src, fmt.Errorf("first value out of range: %d", firstValue) 293 } 294 295 writeOffset := len(dst) 296 dst = resize(dst, len(dst)+4*totalValues) 297 out := unsafecast.BytesToInt32(dst) 298 out[writeOffset] = int32(firstValue) 299 writeOffset++ 300 totalValues-- 301 lastValue := int32(firstValue) 302 numValuesInMiniBlock := blockSize / numMiniBlocks 303 304 const padding = 16 305 miniBlockTemp := make([]byte, 256+padding) 306 307 for totalValues > 0 && len(src) > 0 { 308 var minDelta int64 309 var bitWidths []byte 310 minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks) 311 if err != nil { 312 return dst, src, err 313 } 314 315 blockOffset := writeOffset 316 317 for _, bitWidth := range bitWidths { 318 n := min(numValuesInMiniBlock, totalValues) 319 if bitWidth != 0 { 320 miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8 321 miniBlockData := src 322 if miniBlockSize <= len(src) { 323 miniBlockData = miniBlockData[:miniBlockSize] 324 } 325 src = src[len(miniBlockData):] 326 if cap(miniBlockData) < miniBlockSize+bitpack.PaddingInt32 { 327 miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt32) 328 miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)] 329 } 330 miniBlockData = miniBlockData[:miniBlockSize] 331 bitpack.UnpackInt32(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) 332 } 333 writeOffset += n 334 totalValues -= n 335 if totalValues == 0 { 336 break 337 } 338 } 339 340 lastValue = decodeBlockInt32(out[blockOffset:writeOffset], int32(minDelta), lastValue) 341 } 342 343 if totalValues > 0 { 344 return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF) 345 } 346 347 return dst, src, nil 348 } 349 350 func decodeInt64(dst, src []byte) ([]byte, []byte, error) { 351 blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src) 352 if err != nil { 353 return dst, src, err 354 } 355 if totalValues == 0 { 356 return dst, src, nil 357 } 358 359 writeOffset := len(dst) 360 dst = resize(dst, len(dst)+8*totalValues) 361 out := unsafecast.BytesToInt64(dst) 362 out[writeOffset] = firstValue 363 writeOffset++ 364 totalValues-- 365 lastValue := firstValue 366 numValuesInMiniBlock := blockSize / numMiniBlocks 367 368 const padding = 16 369 miniBlockTemp := make([]byte, 512+padding) 370 371 for totalValues > 0 && len(src) > 0 { 372 var minDelta int64 373 var bitWidths []byte 374 minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks) 375 if err != nil { 376 return dst, src, err 377 } 378 blockOffset := writeOffset 379 380 for _, bitWidth := range bitWidths { 381 n := min(numValuesInMiniBlock, totalValues) 382 if bitWidth != 0 { 383 miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8 384 miniBlockData := src 385 if miniBlockSize <= len(src) { 386 miniBlockData = src[:miniBlockSize] 387 } 388 src = src[len(miniBlockData):] 389 if len(miniBlockData) < miniBlockSize+bitpack.PaddingInt64 { 390 miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt64) 391 miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)] 392 } 393 miniBlockData = miniBlockData[:miniBlockSize] 394 bitpack.UnpackInt64(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) 395 } 396 writeOffset += n 397 totalValues -= n 398 if totalValues == 0 { 399 break 400 } 401 } 402 403 lastValue = decodeBlockInt64(out[blockOffset:writeOffset], minDelta, lastValue) 404 } 405 406 if totalValues > 0 { 407 return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF) 408 } 409 410 return dst, src, nil 411 } 412 413 func decodeBinaryPackedHeader(src []byte) (blockSize, numMiniBlocks, totalValues int, firstValue int64, next []byte, err error) { 414 u := uint64(0) 415 n := 0 416 i := 0 417 418 if u, n, err = decodeUvarint(src[i:], "block size"); err != nil { 419 return 420 } 421 i += n 422 blockSize = int(u) 423 424 if u, n, err = decodeUvarint(src[i:], "number of mini-blocks"); err != nil { 425 return 426 } 427 i += n 428 numMiniBlocks = int(u) 429 430 if u, n, err = decodeUvarint(src[i:], "total values"); err != nil { 431 return 432 } 433 i += n 434 totalValues = int(u) 435 436 if firstValue, n, err = decodeVarint(src[i:], "first value"); err != nil { 437 return 438 } 439 i += n 440 441 if numMiniBlocks == 0 { 442 err = fmt.Errorf("invalid number of mini block (%d)", numMiniBlocks) 443 } else if (blockSize <= 0) || (blockSize%128) != 0 { 444 err = fmt.Errorf("invalid block size is not a multiple of 128 (%d)", blockSize) 445 } else if blockSize > maxSupportedBlockSize { 446 err = fmt.Errorf("invalid block size is too large (%d)", blockSize) 447 } else if miniBlockSize := blockSize / numMiniBlocks; (numMiniBlocks <= 0) || (miniBlockSize%32) != 0 { 448 err = fmt.Errorf("invalid mini block size is not a multiple of 32 (%d)", miniBlockSize) 449 } else if totalValues < 0 { 450 err = fmt.Errorf("invalid total number of values is negative (%d)", totalValues) 451 } else if totalValues > math.MaxInt32 { 452 err = fmt.Errorf("too many values: %d", totalValues) 453 } 454 455 return blockSize, numMiniBlocks, totalValues, firstValue, src[i:], err 456 } 457 458 func decodeBinaryPackedBlock(src []byte, numMiniBlocks int) (minDelta int64, bitWidths, next []byte, err error) { 459 minDelta, n, err := decodeVarint(src, "min delta") 460 if err != nil { 461 return 0, nil, src, err 462 } 463 src = src[n:] 464 if len(src) < numMiniBlocks { 465 bitWidths, next = src, nil 466 } else { 467 bitWidths, next = src[:numMiniBlocks], src[numMiniBlocks:] 468 } 469 return minDelta, bitWidths, next, nil 470 } 471 472 func decodeUvarint(buf []byte, what string) (u uint64, n int, err error) { 473 u, n = binary.Uvarint(buf) 474 if n == 0 { 475 return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF) 476 } 477 if n < 0 { 478 return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf)) 479 } 480 return u, n, nil 481 } 482 483 func decodeVarint(buf []byte, what string) (v int64, n int, err error) { 484 v, n = binary.Varint(buf) 485 if n == 0 { 486 return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF) 487 } 488 if n < 0 { 489 return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf)) 490 } 491 return v, n, nil 492 }