github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding/rle/rle.go (about) 1 // Package rle implements the hybrid RLE/Bit-Packed encoding employed in 2 // repetition and definition levels, dictionary indexed data pages, and 3 // boolean values in the PLAIN encoding. 4 // 5 // https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 6 package rle 7 8 import ( 9 "encoding/binary" 10 "fmt" 11 "io" 12 "unsafe" 13 14 "github.com/segmentio/parquet-go/encoding" 15 "github.com/segmentio/parquet-go/format" 16 "github.com/segmentio/parquet-go/internal/bitpack" 17 "github.com/segmentio/parquet-go/internal/bytealg" 18 "github.com/segmentio/parquet-go/internal/unsafecast" 19 ) 20 21 const ( 22 // This limit is intended to prevent unbounded memory allocations when 23 // decoding runs. 24 // 25 // We use a generous limit which allows for over 16 million values per page 26 // if there is only one run to encode the repetition or definition levels 27 // (this should be uncommon). 28 maxSupportedValueCount = 16 * 1024 * 1024 29 ) 30 31 type Encoding struct { 32 encoding.NotSupported 33 BitWidth int 34 } 35 36 func (e *Encoding) String() string { 37 return "RLE" 38 } 39 40 func (e *Encoding) Encoding() format.Encoding { 41 return format.RLE 42 } 43 44 func (e *Encoding) EncodeLevels(dst []byte, src []uint8) ([]byte, error) { 45 dst, err := encodeBytes(dst[:0], src, uint(e.BitWidth)) 46 return dst, e.wrap(err) 47 } 48 49 func (e *Encoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) { 50 // In the case of encoding a boolean values, the 4 bytes length of the 51 // output is expected by the parquet format. We add the bytes as placeholder 52 // before appending the encoded data. 53 dst = append(dst[:0], 0, 0, 0, 0) 54 dst, err := encodeBits(dst, src) 55 binary.LittleEndian.PutUint32(dst, uint32(len(dst))-4) 56 return dst, e.wrap(err) 57 } 58 59 func (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { 60 dst, err := encodeInt32(dst[:0], src, uint(e.BitWidth)) 61 return dst, e.wrap(err) 62 } 63 64 func (e *Encoding) DecodeLevels(dst []uint8, src []byte) ([]uint8, error) { 65 dst, err := decodeBytes(dst[:0], src, uint(e.BitWidth)) 66 return dst, e.wrap(err) 67 } 68 69 func (e *Encoding) DecodeBoolean(dst []byte, src []byte) ([]byte, error) { 70 if len(src) == 4 { 71 return dst[:0], nil 72 } 73 if len(src) < 4 { 74 return dst[:0], fmt.Errorf("input shorter than 4 bytes: %w", io.ErrUnexpectedEOF) 75 } 76 n := int(binary.LittleEndian.Uint32(src)) 77 src = src[4:] 78 if n > len(src) { 79 return dst[:0], fmt.Errorf("input shorter than length prefix: %d < %d: %w", len(src), n, io.ErrUnexpectedEOF) 80 } 81 dst, err := decodeBits(dst[:0], src[:n]) 82 return dst, e.wrap(err) 83 } 84 85 func (e *Encoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { 86 buf := unsafecast.Int32ToBytes(dst) 87 buf, err := decodeInt32(buf[:0], src, uint(e.BitWidth)) 88 return unsafecast.BytesToInt32(buf), e.wrap(err) 89 } 90 91 func (e *Encoding) wrap(err error) error { 92 if err != nil { 93 err = encoding.Error(e, err) 94 } 95 return err 96 } 97 98 func encodeBits(dst, src []byte) ([]byte, error) { 99 if len(src) == 0 || isZero(src) || isOnes(src) { 100 dst = appendUvarint(dst, uint64(8*len(src))<<1) 101 if len(src) > 0 { 102 dst = append(dst, src[0]) 103 } 104 return dst, nil 105 } 106 107 for i := 0; i < len(src); { 108 j := i + 1 109 110 // Look for contiguous sections of 8 bits, all zeros or ones; these 111 // are run-length encoded as it only takes 2 or 3 bytes to store these 112 // sequences. 113 if src[i] == 0 || src[i] == 0xFF { 114 for j < len(src) && src[i] == src[j] { 115 j++ 116 } 117 118 if n := j - i; n > 1 { 119 dst = appendRunLengthBits(dst, 8*n, src[i]) 120 i = j 121 continue 122 } 123 } 124 125 // Sequences of bits that are neither all zeroes or ones are bit-packed, 126 // which is a simple copy of the input to the output preceded with the 127 // bit-pack header. 128 for j < len(src) && (src[j-1] != src[j] || (src[j] != 0 && src[j] == 0xFF)) { 129 j++ 130 } 131 132 if (j-i) > 1 && j < len(src) { 133 j-- 134 } 135 136 dst = appendBitPackedBits(dst, src[i:j]) 137 i = j 138 } 139 return dst, nil 140 } 141 142 func encodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) { 143 if bitWidth > 8 { 144 return dst, errEncodeInvalidBitWidth("INT8", bitWidth) 145 } 146 if bitWidth == 0 { 147 if !isZero(src) { 148 return dst, errEncodeInvalidBitWidth("INT8", bitWidth) 149 } 150 return appendUvarint(dst, uint64(len(src))<<1), nil 151 } 152 153 if len(src) >= 8 { 154 words := unsafe.Slice((*uint64)(unsafe.Pointer(&src[0])), len(src)/8) 155 156 for i := 0; i < len(words); { 157 j := i 158 pattern := broadcast8x1(words[i]) 159 160 for j < len(words) && words[j] == pattern { 161 j++ 162 } 163 164 if i < j { 165 dst = appendRunLengthBytes(dst, 8*(j-i), byte(pattern)) 166 } else { 167 j++ 168 169 for j < len(words) && words[j] != broadcast8x1(words[j-1]) { 170 j++ 171 } 172 173 dst = appendBitPackedBytes(dst, words[i:j], bitWidth) 174 } 175 176 i = j 177 } 178 } 179 180 for i := (len(src) / 8) * 8; i < len(src); { 181 j := i + 1 182 183 for j < len(src) && src[i] == src[j] { 184 j++ 185 } 186 187 dst = appendRunLengthBytes(dst, j-i, src[i]) 188 i = j 189 } 190 191 return dst, nil 192 } 193 194 func encodeInt32(dst []byte, src []int32, bitWidth uint) ([]byte, error) { 195 if bitWidth > 32 { 196 return dst, errEncodeInvalidBitWidth("INT32", bitWidth) 197 } 198 if bitWidth == 0 { 199 if !isZero(unsafecast.Int32ToBytes(src)) { 200 return dst, errEncodeInvalidBitWidth("INT32", bitWidth) 201 } 202 return appendUvarint(dst, uint64(len(src))<<1), nil 203 } 204 205 if len(src) >= 8 { 206 words := unsafe.Slice((*[8]int32)(unsafe.Pointer(&src[0])), len(src)/8) 207 208 for i := 0; i < len(words); { 209 j := i 210 pattern := broadcast8x4(words[i][0]) 211 212 for j < len(words) && words[j] == pattern { 213 j++ 214 } 215 216 if i < j { 217 dst = appendRunLengthInt32(dst, 8*(j-i), pattern[0], bitWidth) 218 } else { 219 j += 1 220 j += encodeInt32IndexEqual8Contiguous(words[j:]) 221 dst = appendBitPackedInt32(dst, words[i:j], bitWidth) 222 } 223 224 i = j 225 } 226 } 227 228 for i := (len(src) / 8) * 8; i < len(src); { 229 j := i + 1 230 231 for j < len(src) && src[i] == src[j] { 232 j++ 233 } 234 235 dst = appendRunLengthInt32(dst, j-i, src[i], bitWidth) 236 i = j 237 } 238 239 return dst, nil 240 } 241 242 func decodeBits(dst, src []byte) ([]byte, error) { 243 for i := 0; i < len(src); { 244 u, n := binary.Uvarint(src[i:]) 245 if n == 0 { 246 return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) 247 } 248 if n < 0 { 249 return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src)) 250 } 251 i += n 252 253 count, bitpacked := uint(u>>1), (u&1) != 0 254 if count > maxSupportedValueCount { 255 return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) 256 } 257 if bitpacked { 258 n := int(count) 259 j := i + n 260 261 if j > len(src) { 262 return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", n, io.ErrUnexpectedEOF) 263 } 264 265 dst = append(dst, src[i:j]...) 266 i = j 267 } else { 268 word := byte(0) 269 if i < len(src) { 270 word = src[i] 271 i++ 272 } 273 274 offset := len(dst) 275 length := bitpack.ByteCount(count) 276 dst = resize(dst, offset+length) 277 bytealg.Broadcast(dst[offset:], word) 278 } 279 } 280 return dst, nil 281 } 282 283 func decodeBytes(dst, src []byte, bitWidth uint) ([]byte, error) { 284 if bitWidth > 8 { 285 return dst, errDecodeInvalidBitWidth("INT8", bitWidth) 286 } 287 288 for i := 0; i < len(src); { 289 u, n := binary.Uvarint(src[i:]) 290 if n == 0 { 291 return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) 292 } 293 if n < 0 { 294 return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src)) 295 } 296 i += n 297 298 count, bitpacked := uint(u>>1), (u&1) != 0 299 if count > maxSupportedValueCount { 300 return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) 301 } 302 if bitpacked { 303 count *= 8 304 j := i + bitpack.ByteCount(count*bitWidth) 305 306 if j > len(src) { 307 return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", 8*count, io.ErrUnexpectedEOF) 308 } 309 310 offset := len(dst) 311 length := int(count) 312 dst = resize(dst, offset+length) 313 decodeBytesBitpack(dst[offset:], src[i:j], count, bitWidth) 314 315 i = j 316 } else { 317 if bitWidth != 0 && (i+1) > len(src) { 318 return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF) 319 } 320 321 word := byte(0) 322 if bitWidth != 0 { 323 word = src[i] 324 i++ 325 } 326 327 offset := len(dst) 328 length := int(count) 329 dst = resize(dst, offset+length) 330 bytealg.Broadcast(dst[offset:], word) 331 } 332 } 333 334 return dst, nil 335 } 336 337 func decodeInt32(dst, src []byte, bitWidth uint) ([]byte, error) { 338 if bitWidth > 32 { 339 return dst, errDecodeInvalidBitWidth("INT32", bitWidth) 340 } 341 342 buf := make([]byte, 2*bitpack.PaddingInt32) 343 344 for i := 0; i < len(src); { 345 u, n := binary.Uvarint(src[i:]) 346 if n == 0 { 347 return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) 348 } 349 if n < 0 { 350 return dst, fmt.Errorf("overflow after decoding %d/%d bytes of run-length block header", -n+i, len(src)) 351 } 352 i += n 353 354 count, bitpacked := uint(u>>1), (u&1) != 0 355 if count > maxSupportedValueCount { 356 return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) 357 } 358 if bitpacked { 359 offset := len(dst) 360 length := int(count * bitWidth) 361 dst = resize(dst, offset+4*8*int(count)) 362 363 // The bitpack.UnpackInt32 function requires the input to be padded 364 // or the function panics. If there is enough room in the input 365 // buffer we can use it, otherwise we have to copy it to a larger 366 // location (which should rarely happen). 367 in := src[i : i+length] 368 if (cap(in) - len(in)) >= bitpack.PaddingInt32 { 369 in = in[:cap(in)] 370 } else { 371 buf = resize(buf, len(in)+bitpack.PaddingInt32) 372 copy(buf, in) 373 in = buf 374 } 375 376 out := unsafecast.BytesToInt32(dst[offset:]) 377 bitpack.UnpackInt32(out, in, bitWidth) 378 i += length 379 } else { 380 j := i + bitpack.ByteCount(bitWidth) 381 382 if j > len(src) { 383 return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF) 384 } 385 386 bits := [4]byte{} 387 copy(bits[:], src[i:j]) 388 dst = appendRepeat(dst, bits[:], count) 389 i = j 390 } 391 } 392 393 return dst, nil 394 } 395 396 func errEncodeInvalidBitWidth(typ string, bitWidth uint) error { 397 return errInvalidBitWidth("encode", typ, bitWidth) 398 } 399 400 func errDecodeInvalidBitWidth(typ string, bitWidth uint) error { 401 return errInvalidBitWidth("decode", typ, bitWidth) 402 } 403 404 func errInvalidBitWidth(op, typ string, bitWidth uint) error { 405 return fmt.Errorf("cannot %s %s with invalid bit-width=%d", op, typ, bitWidth) 406 } 407 408 func appendRepeat(dst, pattern []byte, count uint) []byte { 409 offset := len(dst) 410 length := int(count) * len(pattern) 411 dst = resize(dst, offset+length) 412 i := offset + copy(dst[offset:], pattern) 413 for i < len(dst) { 414 i += copy(dst[i:], dst[offset:i]) 415 } 416 return dst 417 } 418 419 func appendUvarint(dst []byte, u uint64) []byte { 420 var b [binary.MaxVarintLen64]byte 421 var n = binary.PutUvarint(b[:], u) 422 return append(dst, b[:n]...) 423 } 424 425 func appendRunLengthBits(dst []byte, count int, value byte) []byte { 426 return appendRunLengthBytes(dst, count, value) 427 } 428 429 func appendBitPackedBits(dst []byte, words []byte) []byte { 430 n := len(dst) 431 dst = resize(dst, n+binary.MaxVarintLen64+len(words)) 432 n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1) 433 n += copy(dst[n:], words) 434 return dst[:n] 435 } 436 437 func appendRunLengthBytes(dst []byte, count int, value byte) []byte { 438 n := len(dst) 439 dst = resize(dst, n+binary.MaxVarintLen64+1) 440 n += binary.PutUvarint(dst[n:], uint64(count)<<1) 441 dst[n] = value 442 return dst[:n+1] 443 } 444 445 func appendBitPackedBytes(dst []byte, words []uint64, bitWidth uint) []byte { 446 n := len(dst) 447 dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+8) 448 n += binary.PutUvarint(dst[n:], uint64(len(words)<<1)|1) 449 n += encodeBytesBitpack(dst[n:], words, bitWidth) 450 return dst[:n] 451 } 452 453 func appendRunLengthInt32(dst []byte, count int, value int32, bitWidth uint) []byte { 454 n := len(dst) 455 dst = resize(dst, n+binary.MaxVarintLen64+4) 456 n += binary.PutUvarint(dst[n:], uint64(count)<<1) 457 binary.LittleEndian.PutUint32(dst[n:], uint32(value)) 458 return dst[:n+bitpack.ByteCount(bitWidth)] 459 } 460 461 func appendBitPackedInt32(dst []byte, words [][8]int32, bitWidth uint) []byte { 462 n := len(dst) 463 dst = resize(dst, n+binary.MaxVarintLen64+(len(words)*int(bitWidth))+32) 464 n += binary.PutUvarint(dst[n:], uint64(len(words))<<1|1) 465 n += encodeInt32Bitpack(dst[n:], words, bitWidth) 466 return dst[:n] 467 } 468 469 func broadcast8x1(v uint64) uint64 { 470 return (v & 0xFF) * 0x0101010101010101 471 } 472 473 func broadcast8x4(v int32) [8]int32 { 474 return [8]int32{v, v, v, v, v, v, v, v} 475 } 476 477 func isZero(data []byte) bool { 478 return bytealg.Count(data, 0x00) == len(data) 479 } 480 481 func isOnes(data []byte) bool { 482 return bytealg.Count(data, 0xFF) == len(data) 483 } 484 485 func resize(buf []byte, size int) []byte { 486 if cap(buf) < size { 487 return grow(buf, size) 488 } 489 return buf[:size] 490 } 491 492 func grow(buf []byte, size int) []byte { 493 newCap := 2 * cap(buf) 494 if newCap < size { 495 newCap = size 496 } 497 newBuf := make([]byte, size, newCap) 498 copy(newBuf, buf) 499 return newBuf 500 } 501 502 func encodeInt32BitpackDefault(dst []byte, src [][8]int32, bitWidth uint) int { 503 bits := unsafe.Slice((*int32)(unsafe.Pointer(&src[0])), len(src)*8) 504 bitpack.PackInt32(dst, bits, bitWidth) 505 return bitpack.ByteCount(uint(len(src)*8) * bitWidth) 506 } 507 508 func encodeBytesBitpackDefault(dst []byte, src []uint64, bitWidth uint) int { 509 bitMask := uint64(1<<bitWidth) - 1 510 n := 0 511 512 for _, word := range src { 513 word = (word & bitMask) | 514 (((word >> 8) & bitMask) << (1 * bitWidth)) | 515 (((word >> 16) & bitMask) << (2 * bitWidth)) | 516 (((word >> 24) & bitMask) << (3 * bitWidth)) | 517 (((word >> 32) & bitMask) << (4 * bitWidth)) | 518 (((word >> 40) & bitMask) << (5 * bitWidth)) | 519 (((word >> 48) & bitMask) << (6 * bitWidth)) | 520 (((word >> 56) & bitMask) << (7 * bitWidth)) 521 binary.LittleEndian.PutUint64(dst[n:], word) 522 n += int(bitWidth) 523 } 524 525 return n 526 } 527 528 func decodeBytesBitpackDefault(dst, src []byte, count, bitWidth uint) { 529 dst = dst[:0] 530 531 bitMask := uint64(1<<bitWidth) - 1 532 byteCount := bitpack.ByteCount(8 * bitWidth) 533 534 for i := 0; count > 0; count -= 8 { 535 j := i + byteCount 536 537 bits := [8]byte{} 538 copy(bits[:], src[i:j]) 539 word := binary.LittleEndian.Uint64(bits[:]) 540 541 dst = append(dst, 542 byte((word>>(0*bitWidth))&bitMask), 543 byte((word>>(1*bitWidth))&bitMask), 544 byte((word>>(2*bitWidth))&bitMask), 545 byte((word>>(3*bitWidth))&bitMask), 546 byte((word>>(4*bitWidth))&bitMask), 547 byte((word>>(5*bitWidth))&bitMask), 548 byte((word>>(6*bitWidth))&bitMask), 549 byte((word>>(7*bitWidth))&bitMask), 550 ) 551 552 i = j 553 } 554 }