github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding/delta/binary_packed.go (about) 1 package delta 2 3 import ( 4 "encoding/binary" 5 "fmt" 6 "io" 7 "math" 8 "math/bits" 9 10 "github.com/segmentio/parquet-go/encoding" 11 "github.com/segmentio/parquet-go/format" 12 "github.com/segmentio/parquet-go/internal/bitpack" 13 "github.com/segmentio/parquet-go/internal/unsafecast" 14 ) 15 16 type BinaryPackedEncoding struct { 17 encoding.NotSupported 18 } 19 20 func (e *BinaryPackedEncoding) String() string { 21 return "DELTA_BINARY_PACKED" 22 } 23 24 func (e *BinaryPackedEncoding) Encoding() format.Encoding { 25 return format.DeltaBinaryPacked 26 } 27 28 func (e *BinaryPackedEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { 29 return encodeInt32(dst[:0], src), nil 30 } 31 32 func (e *BinaryPackedEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) { 33 return encodeInt64(dst[:0], src), nil 34 } 35 36 func (e *BinaryPackedEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { 37 buf := unsafecast.Int32ToBytes(dst) 38 buf, _, err := decodeInt32(buf[:0], src) 39 return unsafecast.BytesToInt32(buf), e.wrap(err) 40 } 41 42 func (e *BinaryPackedEncoding) DecodeInt64(dst []int64, src []byte) ([]int64, error) { 43 buf := unsafecast.Int64ToBytes(dst) 44 buf, _, err := decodeInt64(buf[:0], src) 45 return unsafecast.BytesToInt64(buf), e.wrap(err) 46 } 47 48 func (e *BinaryPackedEncoding) wrap(err error) error { 49 if err != nil { 50 err = encoding.Error(e, err) 51 } 52 return err 53 } 54 55 const ( 56 blockSize = 128 57 numMiniBlocks = 4 58 miniBlockSize = blockSize / numMiniBlocks 59 // The parquet spec does not enforce a limit to the block size, but we need 60 // one otherwise invalid inputs may result in unbounded memory allocations. 61 // 62 // 65K+ values should be enough for any valid use case. 63 maxSupportedBlockSize = 65536 64 65 maxHeaderLength32 = 4 * binary.MaxVarintLen64 66 maxMiniBlockLength32 = binary.MaxVarintLen64 + numMiniBlocks + (4 * blockSize) 67 68 maxHeaderLength64 = 8 * binary.MaxVarintLen64 69 maxMiniBlockLength64 = binary.MaxVarintLen64 + numMiniBlocks + (8 * blockSize) 70 ) 71 72 var ( 73 encodeInt32 = encodeInt32Default 74 encodeInt64 = encodeInt64Default 75 ) 76 77 func encodeInt32Default(dst []byte, src []int32) []byte { 78 totalValues := len(src) 79 firstValue := int32(0) 80 if totalValues > 0 { 81 firstValue = src[0] 82 } 83 84 n := len(dst) 85 dst = resize(dst, n+maxHeaderLength32) 86 dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, int64(firstValue))] 87 88 if totalValues < 2 { 89 return dst 90 } 91 92 lastValue := firstValue 93 for i := 1; i < len(src); i += blockSize { 94 block := [blockSize]int32{} 95 blockLength := copy(block[:], src[i:]) 96 97 lastValue = blockDeltaInt32(&block, lastValue) 98 minDelta := blockMinInt32(&block) 99 blockSubInt32(&block, minDelta) 100 blockClearInt32(&block, blockLength) 101 102 bitWidths := [numMiniBlocks]byte{} 103 blockBitWidthsInt32(&bitWidths, &block) 104 105 n := len(dst) 106 dst = resize(dst, n+maxMiniBlockLength32+4) 107 n += encodeBlockHeader(dst[n:], int64(minDelta), bitWidths) 108 109 for i, bitWidth := range bitWidths { 110 if bitWidth != 0 { 111 miniBlock := (*[miniBlockSize]int32)(block[i*miniBlockSize:]) 112 encodeMiniBlockInt32(dst[n:], miniBlock, uint(bitWidth)) 113 n += (miniBlockSize * int(bitWidth)) / 8 114 } 115 } 116 117 dst = dst[:n] 118 } 119 120 return dst 121 } 122 123 func encodeInt64Default(dst []byte, src []int64) []byte { 124 totalValues := len(src) 125 firstValue := int64(0) 126 if totalValues > 0 { 127 firstValue = src[0] 128 } 129 130 n := len(dst) 131 dst = resize(dst, n+maxHeaderLength64) 132 dst = dst[:n+encodeBinaryPackedHeader(dst[n:], blockSize, numMiniBlocks, totalValues, firstValue)] 133 134 if totalValues < 2 { 135 return dst 136 } 137 138 lastValue := firstValue 139 for i := 1; i < len(src); i += blockSize { 140 block := [blockSize]int64{} 141 blockLength := copy(block[:], src[i:]) 142 143 lastValue = blockDeltaInt64(&block, lastValue) 144 minDelta := blockMinInt64(&block) 145 blockSubInt64(&block, minDelta) 146 blockClearInt64(&block, blockLength) 147 148 bitWidths := [numMiniBlocks]byte{} 149 blockBitWidthsInt64(&bitWidths, &block) 150 151 n := len(dst) 152 dst = resize(dst, n+maxMiniBlockLength64+8) 153 n += encodeBlockHeader(dst[n:], minDelta, bitWidths) 154 155 for i, bitWidth := range bitWidths { 156 if bitWidth != 0 { 157 miniBlock := (*[miniBlockSize]int64)(block[i*miniBlockSize:]) 158 encodeMiniBlockInt64(dst[n:], miniBlock, uint(bitWidth)) 159 n += (miniBlockSize * int(bitWidth)) / 8 160 } 161 } 162 163 dst = dst[:n] 164 } 165 166 return dst 167 } 168 169 func encodeBinaryPackedHeader(dst []byte, blockSize, numMiniBlocks, totalValues int, firstValue int64) (n int) { 170 n += binary.PutUvarint(dst[n:], uint64(blockSize)) 171 n += binary.PutUvarint(dst[n:], uint64(numMiniBlocks)) 172 n += binary.PutUvarint(dst[n:], uint64(totalValues)) 173 n += binary.PutVarint(dst[n:], firstValue) 174 return n 175 } 176 177 func encodeBlockHeader(dst []byte, minDelta int64, bitWidths [numMiniBlocks]byte) (n int) { 178 n += binary.PutVarint(dst, int64(minDelta)) 179 n += copy(dst[n:], bitWidths[:]) 180 return n 181 } 182 183 func blockClearInt32(block *[blockSize]int32, blockLength int) { 184 if blockLength < blockSize { 185 clear := block[blockLength:] 186 for i := range clear { 187 clear[i] = 0 188 } 189 } 190 } 191 192 func blockDeltaInt32(block *[blockSize]int32, lastValue int32) int32 { 193 for i, v := range block { 194 block[i], lastValue = v-lastValue, v 195 } 196 return lastValue 197 } 198 199 func blockMinInt32(block *[blockSize]int32) int32 { 200 min := block[0] 201 for _, v := range block[1:] { 202 if v < min { 203 min = v 204 } 205 } 206 return min 207 } 208 209 func blockSubInt32(block *[blockSize]int32, value int32) { 210 for i := range block { 211 block[i] -= value 212 } 213 } 214 215 func blockBitWidthsInt32(bitWidths *[numMiniBlocks]byte, block *[blockSize]int32) { 216 for i := range bitWidths { 217 j := (i + 0) * miniBlockSize 218 k := (i + 1) * miniBlockSize 219 bitWidth := 0 220 221 for _, v := range block[j:k] { 222 if n := bits.Len32(uint32(v)); n > bitWidth { 223 bitWidth = n 224 } 225 } 226 227 bitWidths[i] = byte(bitWidth) 228 } 229 } 230 231 func blockClearInt64(block *[blockSize]int64, blockLength int) { 232 if blockLength < blockSize { 233 clear := block[blockLength:] 234 for i := range clear { 235 clear[i] = 0 236 } 237 } 238 } 239 240 func blockDeltaInt64(block *[blockSize]int64, lastValue int64) int64 { 241 for i, v := range block { 242 block[i], lastValue = v-lastValue, v 243 } 244 return lastValue 245 } 246 247 func blockMinInt64(block *[blockSize]int64) int64 { 248 min := block[0] 249 for _, v := range block[1:] { 250 if v < min { 251 min = v 252 } 253 } 254 return min 255 } 256 257 func blockSubInt64(block *[blockSize]int64, value int64) { 258 for i := range block { 259 block[i] -= value 260 } 261 } 262 263 func blockBitWidthsInt64(bitWidths *[numMiniBlocks]byte, block *[blockSize]int64) { 264 for i := range bitWidths { 265 j := (i + 0) * miniBlockSize 266 k := (i + 1) * miniBlockSize 267 bitWidth := 0 268 269 for _, v := range block[j:k] { 270 if n := bits.Len64(uint64(v)); n > bitWidth { 271 bitWidth = n 272 } 273 } 274 275 bitWidths[i] = byte(bitWidth) 276 } 277 } 278 279 func decodeInt32(dst, src []byte) ([]byte, []byte, error) { 280 blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src) 281 if err != nil { 282 return dst, src, err 283 } 284 if totalValues == 0 { 285 return dst, src, nil 286 } 287 if firstValue < math.MinInt32 || firstValue > math.MaxInt32 { 288 return dst, src, fmt.Errorf("first value out of range: %d", firstValue) 289 } 290 291 writeOffset := len(dst) 292 dst = resize(dst, len(dst)+4*totalValues) 293 out := unsafecast.BytesToInt32(dst) 294 out[writeOffset] = int32(firstValue) 295 writeOffset++ 296 totalValues-- 297 lastValue := int32(firstValue) 298 numValuesInMiniBlock := blockSize / numMiniBlocks 299 300 const padding = 16 301 miniBlockTemp := make([]byte, 256+padding) 302 303 for totalValues > 0 && len(src) > 0 { 304 var minDelta int64 305 var bitWidths []byte 306 minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks) 307 if err != nil { 308 return dst, src, err 309 } 310 311 blockOffset := writeOffset 312 313 for _, bitWidth := range bitWidths { 314 n := min(numValuesInMiniBlock, totalValues) 315 if bitWidth != 0 { 316 miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8 317 miniBlockData := src 318 if miniBlockSize <= len(src) { 319 miniBlockData = miniBlockData[:miniBlockSize] 320 } 321 src = src[len(miniBlockData):] 322 if cap(miniBlockData) < miniBlockSize+bitpack.PaddingInt32 { 323 miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt32) 324 miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)] 325 } 326 miniBlockData = miniBlockData[:miniBlockSize] 327 bitpack.UnpackInt32(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) 328 } 329 writeOffset += n 330 totalValues -= n 331 if totalValues == 0 { 332 break 333 } 334 } 335 336 lastValue = decodeBlockInt32(out[blockOffset:writeOffset], int32(minDelta), lastValue) 337 } 338 339 if totalValues > 0 { 340 return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF) 341 } 342 343 return dst, src, nil 344 } 345 346 func decodeInt64(dst, src []byte) ([]byte, []byte, error) { 347 blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src) 348 if err != nil { 349 return dst, src, err 350 } 351 if totalValues == 0 { 352 return dst, src, nil 353 } 354 355 writeOffset := len(dst) 356 dst = resize(dst, len(dst)+8*totalValues) 357 out := unsafecast.BytesToInt64(dst) 358 out[writeOffset] = firstValue 359 writeOffset++ 360 totalValues-- 361 lastValue := firstValue 362 numValuesInMiniBlock := blockSize / numMiniBlocks 363 364 const padding = 16 365 miniBlockTemp := make([]byte, 512+padding) 366 367 for totalValues > 0 && len(src) > 0 { 368 var minDelta int64 369 var bitWidths []byte 370 minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks) 371 if err != nil { 372 return dst, src, err 373 } 374 blockOffset := writeOffset 375 376 for _, bitWidth := range bitWidths { 377 n := min(numValuesInMiniBlock, totalValues) 378 if bitWidth != 0 { 379 miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8 380 miniBlockData := src 381 if miniBlockSize <= len(src) { 382 miniBlockData = src[:miniBlockSize] 383 } 384 src = src[len(miniBlockData):] 385 if len(miniBlockData) < miniBlockSize+bitpack.PaddingInt64 { 386 miniBlockTemp = resize(miniBlockTemp[:0], miniBlockSize+bitpack.PaddingInt64) 387 miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)] 388 } 389 miniBlockData = miniBlockData[:miniBlockSize] 390 bitpack.UnpackInt64(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) 391 } 392 writeOffset += n 393 totalValues -= n 394 if totalValues == 0 { 395 break 396 } 397 } 398 399 lastValue = decodeBlockInt64(out[blockOffset:writeOffset], minDelta, lastValue) 400 } 401 402 if totalValues > 0 { 403 return dst, src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF) 404 } 405 406 return dst, src, nil 407 } 408 409 func decodeBinaryPackedHeader(src []byte) (blockSize, numMiniBlocks, totalValues int, firstValue int64, next []byte, err error) { 410 u := uint64(0) 411 n := 0 412 i := 0 413 414 if u, n, err = decodeUvarint(src[i:], "block size"); err != nil { 415 return 416 } 417 i += n 418 blockSize = int(u) 419 420 if u, n, err = decodeUvarint(src[i:], "number of mini-blocks"); err != nil { 421 return 422 } 423 i += n 424 numMiniBlocks = int(u) 425 426 if u, n, err = decodeUvarint(src[i:], "total values"); err != nil { 427 return 428 } 429 i += n 430 totalValues = int(u) 431 432 if firstValue, n, err = decodeVarint(src[i:], "first value"); err != nil { 433 return 434 } 435 i += n 436 437 if numMiniBlocks == 0 { 438 err = fmt.Errorf("invalid number of mini block (%d)", numMiniBlocks) 439 } else if (blockSize <= 0) || (blockSize%128) != 0 { 440 err = fmt.Errorf("invalid block size is not a multiple of 128 (%d)", blockSize) 441 } else if blockSize > maxSupportedBlockSize { 442 err = fmt.Errorf("invalid block size is too large (%d)", blockSize) 443 } else if miniBlockSize := blockSize / numMiniBlocks; (numMiniBlocks <= 0) || (miniBlockSize%32) != 0 { 444 err = fmt.Errorf("invalid mini block size is not a multiple of 32 (%d)", miniBlockSize) 445 } else if totalValues < 0 { 446 err = fmt.Errorf("invalid total number of values is negative (%d)", totalValues) 447 } else if totalValues > math.MaxInt32 { 448 err = fmt.Errorf("too many values: %d", totalValues) 449 } 450 451 return blockSize, numMiniBlocks, totalValues, firstValue, src[i:], err 452 } 453 454 func decodeBinaryPackedBlock(src []byte, numMiniBlocks int) (minDelta int64, bitWidths, next []byte, err error) { 455 minDelta, n, err := decodeVarint(src, "min delta") 456 if err != nil { 457 return 0, nil, src, err 458 } 459 src = src[n:] 460 if len(src) < numMiniBlocks { 461 bitWidths, next = src, nil 462 } else { 463 bitWidths, next = src[:numMiniBlocks], src[numMiniBlocks:] 464 } 465 return minDelta, bitWidths, next, nil 466 } 467 468 func decodeUvarint(buf []byte, what string) (u uint64, n int, err error) { 469 u, n = binary.Uvarint(buf) 470 if n == 0 { 471 return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF) 472 } 473 if n < 0 { 474 return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf)) 475 } 476 return u, n, nil 477 } 478 479 func decodeVarint(buf []byte, what string) (v int64, n int, err error) { 480 v, n = binary.Varint(buf) 481 if n == 0 { 482 return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF) 483 } 484 if n < 0 { 485 return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf)) 486 } 487 return v, n, nil 488 }