github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/delta_bit_packing.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "bytes" 21 "math" 22 "math/bits" 23 "reflect" 24 25 "github.com/apache/arrow/go/v14/arrow" 26 "github.com/apache/arrow/go/v14/arrow/memory" 27 shared_utils "github.com/apache/arrow/go/v14/internal/utils" 28 "github.com/apache/arrow/go/v14/parquet" 29 "github.com/apache/arrow/go/v14/parquet/internal/utils" 30 "golang.org/x/xerrors" 31 ) 32 33 // see the deltaBitPack encoder for a description of the encoding format that is 34 // used for delta-bitpacking. 35 type deltaBitPackDecoder struct { 36 decoder 37 38 mem memory.Allocator 39 40 usedFirst bool 41 bitdecoder *utils.BitReader 42 blockSize uint64 43 currentBlockVals uint32 44 miniBlocks uint64 45 valsPerMini uint32 46 currentMiniBlockVals uint32 47 minDelta int64 48 miniBlockIdx uint64 49 50 deltaBitWidths *memory.Buffer 51 deltaBitWidth byte 52 53 totalValues uint64 54 lastVal int64 55 } 56 57 // returns the number of bytes read so far 58 func (d *deltaBitPackDecoder) bytesRead() int64 { 59 return d.bitdecoder.CurOffset() 60 } 61 62 func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem } 63 64 // SetData sets the bytes and the expected number of values to decode 65 // into the decoder, updating the decoder and allowing it to be reused. 66 func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) error { 67 // set our data into the underlying decoder for the type 68 if err := d.decoder.SetData(nvalues, data); err != nil { 69 return err 70 } 71 // create a bit reader for our decoder's values 72 d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data)) 73 d.currentBlockVals = 0 74 d.currentMiniBlockVals = 0 75 if d.deltaBitWidths == nil { 76 d.deltaBitWidths = memory.NewResizableBuffer(d.mem) 77 } 78 79 var ok bool 80 d.blockSize, ok = d.bitdecoder.GetVlqInt() 81 if !ok { 82 return xerrors.New("parquet: eof exception") 83 } 84 85 if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok { 86 return xerrors.New("parquet: eof exception") 87 } 88 89 if d.totalValues, ok = d.bitdecoder.GetVlqInt(); !ok { 90 return xerrors.New("parquet: eof exception") 91 } 92 93 if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { 94 return xerrors.New("parquet: eof exception") 95 } 96 97 if d.miniBlocks != 0 { 98 d.valsPerMini = uint32(d.blockSize / d.miniBlocks) 99 } 100 return nil 101 } 102 103 // initialize a block to decode 104 func (d *deltaBitPackDecoder) initBlock() error { 105 // first we grab the min delta value that we'll start from 106 var ok bool 107 if d.minDelta, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { 108 return xerrors.New("parquet: eof exception") 109 } 110 111 // ensure we have enough space for our miniblocks to decode the widths 112 d.deltaBitWidths.Resize(int(d.miniBlocks)) 113 114 var err error 115 for i := uint64(0); i < d.miniBlocks; i++ { 116 if d.deltaBitWidths.Bytes()[i], err = d.bitdecoder.ReadByte(); err != nil { 117 return err 118 } 119 } 120 121 d.miniBlockIdx = 0 122 d.deltaBitWidth = d.deltaBitWidths.Bytes()[0] 123 d.currentBlockVals = uint32(d.blockSize) 124 return nil 125 } 126 127 // DeltaBitPackInt32Decoder decodes Int32 values which are packed using the Delta BitPacking algorithm. 128 type DeltaBitPackInt32Decoder struct { 129 *deltaBitPackDecoder 130 131 miniBlockValues []int32 132 } 133 134 func (d *DeltaBitPackInt32Decoder) unpackNextMini() error { 135 if d.miniBlockValues == nil { 136 d.miniBlockValues = make([]int32, 0, int(d.valsPerMini)) 137 } else { 138 d.miniBlockValues = d.miniBlockValues[:0] 139 } 140 d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)] 141 d.currentMiniBlockVals = d.valsPerMini 142 143 for j := 0; j < int(d.valsPerMini); j++ { 144 delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth)) 145 if !ok { 146 return xerrors.New("parquet: eof exception") 147 } 148 149 d.lastVal += int64(delta) + int64(d.minDelta) 150 d.miniBlockValues = append(d.miniBlockValues, int32(d.lastVal)) 151 } 152 d.miniBlockIdx++ 153 return nil 154 } 155 156 // Decode retrieves min(remaining values, len(out)) values from the data and returns the number 157 // of values actually decoded and any errors encountered. 158 func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) { 159 max := shared_utils.MinInt(len(out), int(d.totalValues)) 160 if max == 0 { 161 return 0, nil 162 } 163 164 out = out[:max] 165 if !d.usedFirst { // starting value to calculate deltas against 166 out[0] = int32(d.lastVal) 167 out = out[1:] 168 d.usedFirst = true 169 } 170 171 var err error 172 for len(out) > 0 { // unpack mini blocks until we get all the values we need 173 if d.currentBlockVals == 0 { 174 err = d.initBlock() 175 } 176 if d.currentMiniBlockVals == 0 { 177 err = d.unpackNextMini() 178 } 179 if err != nil { 180 return 0, err 181 } 182 183 // copy as many values from our mini block as we can into out 184 start := int(d.valsPerMini - d.currentMiniBlockVals) 185 numCopied := copy(out, d.miniBlockValues[start:]) 186 187 out = out[numCopied:] 188 d.currentBlockVals -= uint32(numCopied) 189 d.currentMiniBlockVals -= uint32(numCopied) 190 } 191 d.nvals -= max 192 return max, nil 193 } 194 195 // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap 196 func (d *DeltaBitPackInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 197 toread := len(out) - nullCount 198 values, err := d.Decode(out[:toread]) 199 if err != nil { 200 return values, err 201 } 202 if values != toread { 203 return values, xerrors.New("parquet: number of values / definition levels read did not match") 204 } 205 206 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 207 } 208 209 // Type returns the physical parquet type that this decoder decodes, in this case Int32 210 func (DeltaBitPackInt32Decoder) Type() parquet.Type { 211 return parquet.Types.Int32 212 } 213 214 // DeltaBitPackInt64Decoder decodes a delta bit packed int64 column of data. 215 type DeltaBitPackInt64Decoder struct { 216 *deltaBitPackDecoder 217 218 miniBlockValues []int64 219 } 220 221 func (d *DeltaBitPackInt64Decoder) unpackNextMini() error { 222 if d.miniBlockValues == nil { 223 d.miniBlockValues = make([]int64, 0, int(d.valsPerMini)) 224 } else { 225 d.miniBlockValues = d.miniBlockValues[:0] 226 } 227 228 d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)] 229 d.currentMiniBlockVals = d.valsPerMini 230 231 for j := 0; j < int(d.valsPerMini); j++ { 232 delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth)) 233 if !ok { 234 return xerrors.New("parquet: eof exception") 235 } 236 237 d.lastVal += int64(delta) + int64(d.minDelta) 238 d.miniBlockValues = append(d.miniBlockValues, d.lastVal) 239 } 240 d.miniBlockIdx++ 241 return nil 242 } 243 244 // Decode retrieves min(remaining values, len(out)) values from the data and returns the number 245 // of values actually decoded and any errors encountered. 246 func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) { 247 max := shared_utils.MinInt(len(out), d.nvals) 248 if max == 0 { 249 return 0, nil 250 } 251 252 out = out[:max] 253 if !d.usedFirst { 254 out[0] = d.lastVal 255 out = out[1:] 256 d.usedFirst = true 257 } 258 259 var err error 260 for len(out) > 0 { 261 if d.currentBlockVals == 0 { 262 err = d.initBlock() 263 } 264 if d.currentMiniBlockVals == 0 { 265 err = d.unpackNextMini() 266 } 267 268 if err != nil { 269 return 0, err 270 } 271 272 start := int(d.valsPerMini - d.currentMiniBlockVals) 273 numCopied := copy(out, d.miniBlockValues[start:]) 274 275 out = out[numCopied:] 276 d.currentBlockVals -= uint32(numCopied) 277 d.currentMiniBlockVals -= uint32(numCopied) 278 } 279 d.nvals -= max 280 return max, nil 281 } 282 283 // Type returns the physical parquet type that this decoder decodes, in this case Int64 284 func (DeltaBitPackInt64Decoder) Type() parquet.Type { 285 return parquet.Types.Int64 286 } 287 288 // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap 289 func (d DeltaBitPackInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 290 toread := len(out) - nullCount 291 values, err := d.Decode(out[:toread]) 292 if err != nil { 293 return values, err 294 } 295 if values != toread { 296 return values, xerrors.New("parquet: number of values / definition levels read did not match") 297 } 298 299 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 300 } 301 302 const ( 303 // block size must be a multiple of 128 304 defaultBlockSize = 128 305 defaultNumMiniBlocks = 4 306 // block size / number of mini blocks must result in a multiple of 32 307 defaultNumValuesPerMini = 32 308 // max size of the header for the delta blocks 309 maxHeaderWriterSize = 32 310 ) 311 312 // deltaBitPackEncoder is an encoder for the DeltaBinary Packing format 313 // as per the parquet spec. 314 // 315 // Consists of a header followed by blocks of delta encoded values binary packed. 316 // 317 // Format 318 // [header] [block 1] [block 2] ... [block N] 319 // 320 // Header 321 // [block size] [number of mini blocks per block] [total value count] [first value] 322 // 323 // Block 324 // [min delta] [list of bitwidths of the miniblocks] [miniblocks...] 325 // 326 // Sets aside bytes at the start of the internal buffer where the header will be written, 327 // and only writes the header when FlushValues is called before returning it. 328 type deltaBitPackEncoder struct { 329 encoder 330 331 bitWriter *utils.BitWriter 332 totalVals uint64 333 firstVal int64 334 currentVal int64 335 336 blockSize uint64 337 miniBlockSize uint64 338 numMiniBlocks uint64 339 deltas []int64 340 } 341 342 // flushBlock flushes out a finished block for writing to the underlying encoder 343 func (enc *deltaBitPackEncoder) flushBlock() { 344 if len(enc.deltas) == 0 { 345 return 346 } 347 348 // determine the minimum delta value 349 minDelta := int64(math.MaxInt64) 350 for _, delta := range enc.deltas { 351 if delta < minDelta { 352 minDelta = delta 353 } 354 } 355 356 enc.bitWriter.WriteZigZagVlqInt(minDelta) 357 // reserve enough bytes to write out our miniblock deltas 358 offset, _ := enc.bitWriter.SkipBytes(int(enc.numMiniBlocks)) 359 360 valuesToWrite := int64(len(enc.deltas)) 361 for i := 0; i < int(enc.numMiniBlocks); i++ { 362 n := shared_utils.Min(int64(enc.miniBlockSize), valuesToWrite) 363 if n == 0 { 364 break 365 } 366 367 maxDelta := int64(math.MinInt64) 368 start := i * int(enc.miniBlockSize) 369 for _, val := range enc.deltas[start : start+int(n)] { 370 maxDelta = shared_utils.Max(maxDelta, val) 371 } 372 373 // compute bit width to store (max_delta - min_delta) 374 width := uint(bits.Len64(uint64(maxDelta - minDelta))) 375 // write out the bit width we used into the bytes we reserved earlier 376 enc.bitWriter.WriteAt([]byte{byte(width)}, int64(offset+i)) 377 378 // write out our deltas 379 for _, val := range enc.deltas[start : start+int(n)] { 380 enc.bitWriter.WriteValue(uint64(val-minDelta), width) 381 } 382 383 valuesToWrite -= n 384 385 // pad the last block if n < miniBlockSize 386 for ; n < int64(enc.miniBlockSize); n++ { 387 enc.bitWriter.WriteValue(0, width) 388 } 389 } 390 enc.deltas = enc.deltas[:0] 391 } 392 393 // putInternal is the implementation for actually writing data which must be 394 // integral data as int, int8, int32, or int64. 395 func (enc *deltaBitPackEncoder) putInternal(data interface{}) { 396 v := reflect.ValueOf(data) 397 if v.Len() == 0 { 398 return 399 } 400 401 idx := 0 402 if enc.totalVals == 0 { 403 enc.blockSize = defaultBlockSize 404 enc.numMiniBlocks = defaultNumMiniBlocks 405 enc.miniBlockSize = defaultNumValuesPerMini 406 407 enc.firstVal = v.Index(0).Int() 408 enc.currentVal = enc.firstVal 409 idx = 1 410 411 enc.bitWriter = utils.NewBitWriter(enc.sink) 412 } 413 414 enc.totalVals += uint64(v.Len()) 415 for ; idx < v.Len(); idx++ { 416 val := v.Index(idx).Int() 417 enc.deltas = append(enc.deltas, val-enc.currentVal) 418 enc.currentVal = val 419 if len(enc.deltas) == int(enc.blockSize) { 420 enc.flushBlock() 421 } 422 } 423 } 424 425 // FlushValues flushes any remaining data and returns the finished encoded buffer 426 // or returns nil and any error encountered during flushing. 427 func (enc *deltaBitPackEncoder) FlushValues() (Buffer, error) { 428 if enc.bitWriter != nil { 429 // write any remaining values 430 enc.flushBlock() 431 enc.bitWriter.Flush(true) 432 } else { 433 enc.blockSize = defaultBlockSize 434 enc.numMiniBlocks = defaultNumMiniBlocks 435 enc.miniBlockSize = defaultNumValuesPerMini 436 } 437 438 buffer := make([]byte, maxHeaderWriterSize) 439 headerWriter := utils.NewBitWriter(utils.NewWriterAtBuffer(buffer)) 440 441 headerWriter.WriteVlqInt(uint64(enc.blockSize)) 442 headerWriter.WriteVlqInt(uint64(enc.numMiniBlocks)) 443 headerWriter.WriteVlqInt(uint64(enc.totalVals)) 444 headerWriter.WriteZigZagVlqInt(int64(enc.firstVal)) 445 headerWriter.Flush(false) 446 447 buffer = buffer[:headerWriter.Written()] 448 enc.totalVals = 0 449 450 if enc.bitWriter != nil { 451 flushed := enc.sink.Finish() 452 defer flushed.Release() 453 454 buffer = append(buffer, flushed.Buf()[:enc.bitWriter.Written()]...) 455 } 456 return poolBuffer{memory.NewBufferBytes(buffer)}, nil 457 } 458 459 // EstimatedDataEncodedSize returns the current amount of data actually flushed out and written 460 func (enc *deltaBitPackEncoder) EstimatedDataEncodedSize() int64 { 461 return int64(enc.bitWriter.Written()) 462 } 463 464 // DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data. 465 type DeltaBitPackInt32Encoder struct { 466 *deltaBitPackEncoder 467 } 468 469 // Put writes the values from the provided slice of int32 to the encoder 470 func (enc DeltaBitPackInt32Encoder) Put(in []int32) { 471 enc.putInternal(in) 472 } 473 474 // PutSpaced takes a slice of int32 along with a bitmap that describes the nulls and an offset into the bitmap 475 // in order to write spaced data to the encoder. 476 func (enc DeltaBitPackInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) { 477 buffer := memory.NewResizableBuffer(enc.mem) 478 buffer.Reserve(arrow.Int32Traits.BytesRequired(len(in))) 479 defer buffer.Release() 480 481 data := arrow.Int32Traits.CastFromBytes(buffer.Buf()) 482 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 483 enc.Put(data[:nvalid]) 484 } 485 486 // Type returns the underlying physical type this encoder works with, in this case Int32 487 func (DeltaBitPackInt32Encoder) Type() parquet.Type { 488 return parquet.Types.Int32 489 } 490 491 // DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data. 492 type DeltaBitPackInt64Encoder struct { 493 *deltaBitPackEncoder 494 } 495 496 // Put writes the values from the provided slice of int64 to the encoder 497 func (enc DeltaBitPackInt64Encoder) Put(in []int64) { 498 enc.putInternal(in) 499 } 500 501 // PutSpaced takes a slice of int64 along with a bitmap that describes the nulls and an offset into the bitmap 502 // in order to write spaced data to the encoder. 503 func (enc DeltaBitPackInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) { 504 buffer := memory.NewResizableBuffer(enc.mem) 505 buffer.Reserve(arrow.Int64Traits.BytesRequired(len(in))) 506 defer buffer.Release() 507 508 data := arrow.Int64Traits.CastFromBytes(buffer.Buf()) 509 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 510 enc.Put(data[:nvalid]) 511 } 512 513 // Type returns the underlying physical type this encoder works with, in this case Int64 514 func (DeltaBitPackInt64Encoder) Type() parquet.Type { 515 return parquet.Types.Int64 516 }