github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/encoding/rle/rle.go (about) 1 // Package rle implements the hybrid RLE/Bit-Packed encoding employed in 2 // repetition and definition levels, dictionary indexed data pages, and 3 // boolean values in the PLAIN encoding. 4 // 5 // https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 6 package rle 7 8 import ( 9 "encoding/binary" 10 "fmt" 11 "io" 12 "unsafe" 13 14 "github.com/vc42/parquet-go/encoding" 15 "github.com/vc42/parquet-go/format" 16 "github.com/vc42/parquet-go/internal/bitpack" 17 "github.com/vc42/parquet-go/internal/bytealg" 18 "github.com/vc42/parquet-go/internal/unsafecast" 19 ) 20 21 const ( 22 // This limit is intended to prevent unbounded memory allocations when 23 // decoding runs. 24 // 25 // We use a generous limit which allows for over 16 million values per page 26 // if there is only one run to encode the repetition or definition levels 27 // (this should be uncommon). 28 maxSupportedValueCount = 16 * 1024 * 1024 29 ) 30 31 type Encoding struct { 32 encoding.NotSupported 33 BitWidth int 34 } 35 36 func (e *Encoding) String() string { 37 return "RLE" 38 } 39 40 func (e *Encoding) Encoding() format.Encoding { 41 return format.RLE 42 } 43 44 func (e *Encoding) EncodeLevels(dst, src []byte) ([]byte, error) { 45 dst, err := encodeBytes(dst[:0], src, uint(e.BitWidth)) 46 return dst, e.wrap(err) 47 } 48 49 func (e *Encoding) EncodeBoolean(dst, src []byte) ([]byte, error) { 50 // In the case of encoding a boolean values, the 4 bytes length of the 51 // output is expected by the parquet format. We add the bytes as placeholder 52 // before appending the encoded data. 53 dst = append(dst[:0], 0, 0, 0, 0) 54 dst, err := encodeBits(dst, src) 55 binary.LittleEndian.PutUint32(dst, uint32(len(dst))-4) 56 return dst, e.wrap(err) 57 } 58 59 func (e *Encoding) EncodeInt32(dst, src []byte) ([]byte, error) { 60 if (len(src) % 4) != 0 { 61 return dst[:0], encoding.ErrEncodeInvalidInputSize(e, "INT32", len(src)) 62 } 63 dst, err := encodeInt32(dst[:0], unsafecast.BytesToInt32(src), uint(e.BitWidth)) 64 return dst, e.wrap(err) 65 } 66 67 func (e *Encoding) DecodeLevels(dst, src []byte) ([]byte, error) { 68 dst, err := decodeBytes(dst[:0], src, uint(e.BitWidth)) 69 return dst, e.wrap(err) 70 } 71 72 func (e *Encoding) DecodeBoolean(dst, src []byte) ([]byte, error) { 73 if len(src) == 4 { 74 return dst[:0], nil 75 } 76 if len(src) < 4 { 77 return dst[:0], fmt.Errorf("input shorter than 4 bytes: %w", io.ErrUnexpectedEOF) 78 } 79 n := int(binary.LittleEndian.Uint32(src)) 80 src = src[4:] 81 if n > len(src) { 82 return dst[:0], fmt.Errorf("input shorter than length prefix: %d < %d: %w", len(src), n, io.ErrUnexpectedEOF) 83 } 84 dst, err := decodeBits(dst[:0], src[:n]) 85 return dst, e.wrap(err) 86 } 87 88 func (e *Encoding) DecodeInt32(dst, src []byte) ([]byte, error) { 89 dst, err := decodeInt32(dst[:0], src, uint(e.BitWidth)) 90 return dst, e.wrap(err) 91 } 92 93 func (e *Encoding) wrap(err error) error { 94 if err != nil { 95 err = encoding.Error(e, err) 96 } 97 return err 98 } 99 100 func encodeBits(dst, src []byte) ([]byte, error) { 101 if len(src) == 0 || isZero(src) || isOnes(src) { 102 dst = appendUvarint(dst, uint64(8*len(src))<<1) 103 if len(src) > 0 { 104 dst = append(dst, src[0]) 105 } 106 return dst, nil 107 } 108 109 for i := 0; i < len(src); { 110 j := i + 1 111 112 // Look for contiguous sections of 8 bits, all zeros or ones; these 113 // are run-length encoded as it only takes 2 or 3 bytes to store these 114 // sequences. 115 if src[i] == 0 || src[i] == 0xFF { 116 for j < len(src) && src[i] == src[j] { 117 j++ 118 } 119 120 if n := j - i; n > 1 { 121 dst = appendRunLengthBits(dst, 8*n, src[i]) 122 i = j 123 continue 124 } 125 } 126 127 // Sequences of bits that are neither all zeroes or ones are bit-packed, 128 // which is a simple copy of the input to the output preceded with the 129 // bit-pack header. 130 for j < len(src) && (src[j-1] != src[j] || (src[j] != 0 && src[j] == 0xFF)) { 131 j++ 132 } 133 134 if (j-i) > 1 && j < len(src) { 135 j-- 136 } 137 138 dst = appendBitPackedBits(dst, src[i:j]) 139 i = j 140 } 141 return dst, nil 142 } 143 144 func encodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) { 145 if bitWidth > 8 { 146 return dst, errEncodeInvalidBitWidth("INT8", bitWidth) 147 } 148 if bitWidth == 0 { 149 if !isZero(src) { 150 return dst, errEncodeInvalidBitWidth("INT8", bitWidth) 151 } 152 return appendUvarint(dst, uint64(len(src))<<1), nil 153 } 154 155 if len(src) >= 8 { 156 words := unsafe.Slice((*uint64)(unsafe.Pointer(&src[0])), len(src)/8) 157 158 for i := 0; i < len(words); { 159 j := i 160 pattern := broadcast8x1(words[i]) 161 162 for j < len(words) && words[j] == pattern { 163 j++ 164 } 165 166 if i < j { 167 dst = appendRunLengthBytes(dst, 8*(j-i), byte(pattern)) 168 } else { 169 j++ 170 171 for j < len(words) && words[j] != broadcast8x1(words[j-1]) { 172 j++ 173 } 174 175 dst = appendBitPackedBytes(dst, words[i:j], bitWidth) 176 } 177 178 i = j 179 } 180 } 181 182 for i := (len(src) / 8) * 8; i < len(src); { 183 j := i + 1 184 185 for j < len(src) && src[i] == src[j] { 186 j++ 187 } 188 189 dst = appendRunLengthBytes(dst, j-i, src[i]) 190 i = j 191 } 192 193 return dst, nil 194 } 195 196 func encodeInt32(dst []byte, src []int32, bitWidth uint) ([]byte, error) { 197 if bitWidth > 32 { 198 return dst, errEncodeInvalidBitWidth("INT32", bitWidth) 199 } 200 if bitWidth == 0 { 201 if !isZero(unsafecast.Int32ToBytes(src)) { 202 return dst, errEncodeInvalidBitWidth("INT32", bitWidth) 203 } 204 return appendUvarint(dst, uint64(len(src))<<1), nil 205 } 206 207 if len(src) >= 8 { 208 words := unsafe.Slice((*[8]int32)(unsafe.Pointer(&src[0])), len(src)/8) 209 210 for i := 0; i < len(words); { 211 j := i 212 pattern := broadcast8x4(words[i][0]) 213 214 for j < len(words) && words[j] == pattern { 215 j++ 216 } 217 218 if i < j { 219 dst = appendRunLengthInt32(dst, 8*(j-i), pattern[0], bitWidth) 220 } else { 221 j += 1 222 j += encodeInt32IndexEqual8Contiguous(words[j:]) 223 dst = appendBitPackedInt32(dst, words[i:j], bitWidth) 224 } 225 226 i = j 227 } 228 } 229 230 for i := (len(src) / 8) * 8; i < len(src); { 231 j := i + 1 232 233 for j < len(src) && src[i] == src[j] { 234 j++ 235 } 236 237 dst = appendRunLengthInt32(dst, j-i, src[i], bitWidth) 238 i = j 239 } 240 241 return dst, nil 242 } 243 244 func decodeBits(dst, src []byte) ([]byte, error) { 245 for i := 0; i < len(src); { 246 u, n := binary.Uvarint(src[i:]) 247 if n == 0 { 248 return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) 249 } 250 if n < 0 { 251 return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src)) 252 } 253 i += n 254 255 count, bitpacked := uint(u>>1), (u&1) != 0 256 if count > maxSupportedValueCount { 257 return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) 258 } 259 if bitpacked { 260 n := int(count) 261 j := i + n 262 263 if j > len(src) { 264 return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", n, io.ErrUnexpectedEOF) 265 } 266 267 dst = append(dst, src[i:j]...) 268 i = j 269 } else { 270 word := byte(0) 271 if i < len(src) { 272 word = src[i] 273 i++ 274 } 275 276 offset := len(dst) 277 length := bitpack.ByteCount(count) 278 dst = resize(dst, offset+length) 279 bytealg.Broadcast(dst[offset:], word) 280 } 281 } 282 return dst, nil 283 } 284 285 func decodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) { 286 if bitWidth > 8 { 287 return dst, errDecodeInvalidBitWidth("INT8", bitWidth) 288 } 289 290 for i := 0; i < len(src); { 291 u, n := binary.Uvarint(src[i:]) 292 if n == 0 { 293 return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) 294 } 295 if n < 0 { 296 return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src)) 297 } 298 i += n 299 300 count, bitpacked := uint(u>>1), (u&1) != 0 301 if count > maxSupportedValueCount { 302 return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) 303 } 304 if bitpacked { 305 count *= 8 306 j := i + bitpack.ByteCount(count*bitWidth) 307 308 if j > len(src) { 309 return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", 8*count, io.ErrUnexpectedEOF) 310 } 311 312 offset := len(dst) 313 length := int(count) 314 dst = resize(dst, offset+length) 315 decodeBytesBitpack(dst[offset:], src[i:j], count, bitWidth) 316 317 i = j 318 } else { 319 if bitWidth != 0 && (i+1) > len(src) { 320 return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF) 321 } 322 323 word := byte(0) 324 if bitWidth != 0 { 325 word = src[i] 326 i++ 327 } 328 329 offset := len(dst) 330 length := int(count) 331 dst = resize(dst, offset+length) 332 bytealg.Broadcast(dst[offset:], word) 333 } 334 } 335 336 return dst, nil 337 } 338 339 func decodeInt32(dst, src []byte, bitWidth uint) ([]byte, error) { 340 if bitWidth > 32 { 341 return dst, errDecodeInvalidBitWidth("INT32", bitWidth) 342 } 343 344 buf := make([]byte, 2*bitpack.PaddingInt32) 345 346 for i := 0; i < len(src); { 347 u, n := binary.Uvarint(src[i:]) 348 if n == 0 { 349 return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) 350 } 351 if n < 0 { 352 return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src)) 353 } 354 i += n 355 356 count, bitpacked := uint(u>>1), (u&1) != 0 357 if count > maxSupportedValueCount { 358 return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) 359 } 360 if bitpacked { 361 offset := len(dst) 362 length := int(count * bitWidth) 363 dst = resize(dst, offset+4*8*int(count)) 364 365 // The bitpack.UnpackInt32 function requires the input to be padded 366 // or the function panics. If there is enough room in the input 367 // buffer we can use it, otherwise we have to copy it to a larger 368 // location (which should rarely happen). 369 in := src[i : i+length] 370 if (cap(in) - len(in)) >= bitpack.PaddingInt32 { 371 in = in[:cap(in)] 372 } else { 373 buf = resize(buf, len(in)+bitpack.PaddingInt32) 374 copy(buf, in) 375 in = buf 376 } 377 378 out := unsafecast.BytesToInt32(dst[offset:]) 379 bitpack.UnpackInt32(out, in, bitWidth) 380 i += length 381 } else { 382 j := i + bitpack.ByteCount(bitWidth) 383 384 if j > len(src) { 385 return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF) 386 } 387 388 bits := [4]byte{} 389 copy(bits[:], src[i:j]) 390 dst = appendRepeat(dst, bits[:], count) 391 i = j 392 } 393 } 394 395 return dst, nil 396 } 397 398 func errEncodeInvalidBitWidth(typ string, bitWidth uint) error { 399 return errInvalidBitWidth("encode", typ, bitWidth) 400 } 401 402 func errDecodeInvalidBitWidth(typ string, bitWidth uint) error { 403 return errInvalidBitWidth("decode", typ, bitWidth) 404 } 405 406 func errInvalidBitWidth(op, typ string, bitWidth uint) error { 407 return fmt.Errorf("cannot %s %s with invalid bit-width=%d", op, typ, bitWidth) 408 } 409 410 func appendRepeat(dst, pattern []byte, count uint) []byte { 411 offset := len(dst) 412 length := int(count) * len(pattern) 413 dst = resize(dst, offset+length) 414 i := offset + copy(dst[offset:], pattern) 415 for i < len(dst) { 416 i += copy(dst[i:], dst[offset:i]) 417 } 418 return dst 419 } 420 421 func appendUvarint(dst []byte, u uint64) []byte { 422 var b [binary.MaxVarintLen64]byte 423 var n = binary.PutUvarint(b[:], u) 424 return append(dst, b[:n]...) 425 } 426 427 func appendRunLengthBits(dst []byte, count int, value byte) []byte { 428 return appendRunLengthBytes(dst, count, value) 429 } 430 431 func appendBitPackedBits(dst []byte, words []byte) []byte { 432 n := len(dst) 433 dst = resize(dst, n+binary.MaxVarintLen64+len(words)) 434 n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1) 435 n += copy(dst[n:], words) 436 return dst[:n] 437 } 438 439 func appendRunLengthBytes(dst []byte, count int, value byte) []byte { 440 n := len(dst) 441 dst = resize(dst, n+binary.MaxVarintLen64+1) 442 n += binary.PutUvarint(dst[n:], uint64(count)<<1) 443 dst[n] = value 444 return dst[:n+1] 445 } 446 447 func appendBitPackedBytes(dst []byte, words []uint64, bitWidth uint) []byte { 448 n := len(dst) 449 dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+8) 450 n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1) 451 n += encodeBytesBitpack(dst[n:], words, bitWidth) 452 return dst[:n] 453 } 454 455 func appendRunLengthInt32(dst []byte, count int, value int32, bitWidth uint) []byte { 456 n := len(dst) 457 dst = resize(dst, n+binary.MaxVarintLen64+4) 458 n += binary.PutUvarint(dst[n:], uint64(count)<<1) 459 binary.LittleEndian.PutUint32(dst[n:], uint32(value)) 460 return dst[:n+bitpack.ByteCount(bitWidth)] 461 } 462 463 func appendBitPackedInt32(dst []byte, words [][8]int32, bitWidth uint) []byte { 464 n := len(dst) 465 dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+32) 466 n += binary.PutUvarint(dst[n:], uint64(len(words))<<1|1) 467 n += encodeInt32Bitpack(dst[n:], words, bitWidth) 468 return dst[:n] 469 } 470 471 func broadcast8x1(v uint64) uint64 { 472 return (v & 0xFF) * 0x0101010101010101 473 } 474 475 func broadcast8x4(v int32) [8]int32 { 476 return [8]int32{v, v, v, v, v, v, v, v} 477 } 478 479 func isZero(data []byte) bool { 480 return bytealg.Count(data, 0x00) == len(data) 481 } 482 483 func isOnes(data []byte) bool { 484 return bytealg.Count(data, 0xFF) == len(data) 485 } 486 487 func resize(buf []byte, size int) []byte { 488 if cap(buf) < size { 489 return grow(buf, size) 490 } 491 return buf[:size] 492 } 493 494 func grow(buf []byte, size int) []byte { 495 newCap := 2 * cap(buf) 496 if newCap < size { 497 newCap = size 498 } 499 newBuf := make([]byte, size, newCap) 500 copy(newBuf, buf) 501 return newBuf 502 } 503 504 func encodeInt32BitpackDefault(dst []byte, src [][8]int32, bitWidth uint) int { 505 bits := unsafe.Slice((*int32)(unsafe.Pointer(&src[0])), len(src)*8) 506 bitpack.PackInt32(dst, bits, bitWidth) 507 return bitpack.ByteCount(uint(len(src)*8) * bitWidth) 508 } 509 510 func encodeBytesBitpackDefault(dst []byte, src []uint64, bitWidth uint) int { 511 bitMask := uint64(1<<bitWidth) - 1 512 n := 0 513 514 for _, word := range src { 515 word = (word & bitMask) | 516 (((word >> 8) & bitMask) << (1 * bitWidth)) | 517 (((word >> 16) & bitMask) << (2 * bitWidth)) | 518 (((word >> 24) & bitMask) << (3 * bitWidth)) | 519 (((word >> 32) & bitMask) << (4 * bitWidth)) | 520 (((word >> 40) & bitMask) << (5 * bitWidth)) | 521 (((word >> 48) & bitMask) << (6 * bitWidth)) | 522 (((word >> 56) & bitMask) << (7 * bitWidth)) 523 binary.LittleEndian.PutUint64(dst[n:], word) 524 n += int(bitWidth) 525 } 526 527 return n 528 } 529 530 func decodeBytesBitpackDefault(dst, src []byte, count, bitWidth uint) { 531 dst = dst[:0] 532 533 bitMask := uint64(1<<bitWidth) - 1 534 byteCount := bitpack.ByteCount(8 * bitWidth) 535 536 for i := 0; count > 0; count -= 8 { 537 j := i + byteCount 538 539 bits := [8]byte{} 540 copy(bits[:], src[i:j]) 541 word := binary.LittleEndian.Uint64(bits[:]) 542 543 dst = append(dst, 544 byte((word>>(0*bitWidth))&bitMask), 545 byte((word>>(1*bitWidth))&bitMask), 546 byte((word>>(2*bitWidth))&bitMask), 547 byte((word>>(3*bitWidth))&bitMask), 548 byte((word>>(4*bitWidth))&bitMask), 549 byte((word>>(5*bitWidth))&bitMask), 550 byte((word>>(6*bitWidth))&bitMask), 551 byte((word>>(7*bitWidth))&bitMask), 552 ) 553 554 i = j 555 } 556 }