github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/delta_bit_packing.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "bytes" 21 "math" 22 "math/bits" 23 "reflect" 24 25 "github.com/apache/arrow/go/v10/arrow" 26 "github.com/apache/arrow/go/v10/arrow/memory" 27 shared_utils "github.com/apache/arrow/go/v10/internal/utils" 28 "github.com/apache/arrow/go/v10/parquet" 29 "github.com/apache/arrow/go/v10/parquet/internal/utils" 30 "golang.org/x/xerrors" 31 ) 32 33 // see the deltaBitPack encoder for a description of the encoding format that is 34 // used for delta-bitpacking. 35 type deltaBitPackDecoder struct { 36 decoder 37 38 mem memory.Allocator 39 40 usedFirst bool 41 bitdecoder *utils.BitReader 42 blockSize uint64 43 currentBlockVals uint32 44 miniBlocks uint64 45 valsPerMini uint32 46 currentMiniBlockVals uint32 47 minDelta int64 48 miniBlockIdx uint64 49 50 deltaBitWidths *memory.Buffer 51 deltaBitWidth byte 52 53 totalValues uint64 54 lastVal int64 55 } 56 57 // returns the number of bytes read so far 58 func (d *deltaBitPackDecoder) bytesRead() int64 { 59 return d.bitdecoder.CurOffset() 60 } 61 62 func (d *deltaBitPackDecoder) Allocator() memory.Allocator { return d.mem } 63 64 // SetData sets the bytes and the expected number of values to decode 65 // into the decoder, updating the decoder and allowing it to be reused. 66 func (d *deltaBitPackDecoder) SetData(nvalues int, data []byte) error { 67 // set our data into the underlying decoder for the type 68 if err := d.decoder.SetData(nvalues, data); err != nil { 69 return err 70 } 71 // create a bit reader for our decoder's values 72 d.bitdecoder = utils.NewBitReader(bytes.NewReader(d.data)) 73 d.currentBlockVals = 0 74 d.currentMiniBlockVals = 0 75 if d.deltaBitWidths == nil { 76 d.deltaBitWidths = memory.NewResizableBuffer(d.mem) 77 } 78 79 var ok bool 80 d.blockSize, ok = d.bitdecoder.GetVlqInt() 81 if !ok { 82 return xerrors.New("parquet: eof exception") 83 } 84 85 if d.miniBlocks, ok = d.bitdecoder.GetVlqInt(); !ok { 86 return xerrors.New("parquet: eof exception") 87 } 88 89 if d.totalValues, ok = d.bitdecoder.GetVlqInt(); !ok { 90 return xerrors.New("parquet: eof exception") 91 } 92 93 if d.lastVal, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { 94 return xerrors.New("parquet: eof exception") 95 } 96 97 if d.miniBlocks != 0 { 98 d.valsPerMini = uint32(d.blockSize / d.miniBlocks) 99 } 100 return nil 101 } 102 103 // initialize a block to decode 104 func (d *deltaBitPackDecoder) initBlock() error { 105 // first we grab the min delta value that we'll start from 106 var ok bool 107 if d.minDelta, ok = d.bitdecoder.GetZigZagVlqInt(); !ok { 108 return xerrors.New("parquet: eof exception") 109 } 110 111 // ensure we have enough space for our miniblocks to decode the widths 112 d.deltaBitWidths.Resize(int(d.miniBlocks)) 113 114 var err error 115 for i := uint64(0); i < d.miniBlocks; i++ { 116 if d.deltaBitWidths.Bytes()[i], err = d.bitdecoder.ReadByte(); err != nil { 117 return err 118 } 119 } 120 121 d.miniBlockIdx = 0 122 d.deltaBitWidth = d.deltaBitWidths.Bytes()[0] 123 d.currentBlockVals = uint32(d.blockSize) 124 return nil 125 } 126 127 // DeltaBitPackInt32Decoder decodes Int32 values which are packed using the Delta BitPacking algorithm. 128 type DeltaBitPackInt32Decoder struct { 129 *deltaBitPackDecoder 130 131 miniBlockValues []int32 132 } 133 134 func (d *DeltaBitPackInt32Decoder) unpackNextMini() error { 135 if d.miniBlockValues == nil { 136 d.miniBlockValues = make([]int32, 0, int(d.valsPerMini)) 137 } else { 138 d.miniBlockValues = d.miniBlockValues[:0] 139 } 140 d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)] 141 d.currentMiniBlockVals = d.valsPerMini 142 143 for j := 0; j < int(d.valsPerMini); j++ { 144 delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth)) 145 if !ok { 146 return xerrors.New("parquet: eof exception") 147 } 148 149 d.lastVal += int64(delta) + int64(d.minDelta) 150 d.miniBlockValues = append(d.miniBlockValues, int32(d.lastVal)) 151 } 152 d.miniBlockIdx++ 153 return nil 154 } 155 156 // Decode retrieves min(remaining values, len(out)) values from the data and returns the number 157 // of values actually decoded and any errors encountered. 158 func (d *DeltaBitPackInt32Decoder) Decode(out []int32) (int, error) { 159 max := shared_utils.MinInt(len(out), d.nvals) 160 if max == 0 { 161 return 0, nil 162 } 163 164 out = out[:max] 165 if !d.usedFirst { // starting value to calculate deltas against 166 out[0] = int32(d.lastVal) 167 out = out[1:] 168 d.usedFirst = true 169 } 170 171 var err error 172 for len(out) > 0 { // unpack mini blocks until we get all the values we need 173 if d.currentBlockVals == 0 { 174 err = d.initBlock() 175 } 176 if d.currentMiniBlockVals == 0 { 177 err = d.unpackNextMini() 178 } 179 if err != nil { 180 return 0, err 181 } 182 183 // copy as many values from our mini block as we can into out 184 start := int(d.valsPerMini - d.currentMiniBlockVals) 185 end := shared_utils.MinInt(int(d.valsPerMini), len(out)) 186 copy(out, d.miniBlockValues[start:end]) 187 188 numCopied := end - start 189 out = out[numCopied:] 190 d.currentBlockVals -= uint32(numCopied) 191 d.currentMiniBlockVals -= uint32(numCopied) 192 } 193 return max, nil 194 } 195 196 // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap 197 func (d *DeltaBitPackInt32Decoder) DecodeSpaced(out []int32, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 198 toread := len(out) - nullCount 199 values, err := d.Decode(out[:toread]) 200 if err != nil { 201 return values, err 202 } 203 if values != toread { 204 return values, xerrors.New("parquet: number of values / definition levels read did not match") 205 } 206 207 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 208 } 209 210 // Type returns the physical parquet type that this decoder decodes, in this case Int32 211 func (DeltaBitPackInt32Decoder) Type() parquet.Type { 212 return parquet.Types.Int32 213 } 214 215 // DeltaBitPackInt64Decoder decodes a delta bit packed int64 column of data. 216 type DeltaBitPackInt64Decoder struct { 217 *deltaBitPackDecoder 218 219 miniBlockValues []int64 220 } 221 222 func (d *DeltaBitPackInt64Decoder) unpackNextMini() error { 223 if d.miniBlockValues == nil { 224 d.miniBlockValues = make([]int64, 0, int(d.valsPerMini)) 225 } else { 226 d.miniBlockValues = d.miniBlockValues[:0] 227 } 228 229 d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)] 230 d.currentMiniBlockVals = d.valsPerMini 231 232 for j := 0; j < int(d.valsPerMini); j++ { 233 delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth)) 234 if !ok { 235 return xerrors.New("parquet: eof exception") 236 } 237 238 d.lastVal += int64(delta) + int64(d.minDelta) 239 d.miniBlockValues = append(d.miniBlockValues, d.lastVal) 240 } 241 d.miniBlockIdx++ 242 return nil 243 } 244 245 // Decode retrieves min(remaining values, len(out)) values from the data and returns the number 246 // of values actually decoded and any errors encountered. 247 func (d *DeltaBitPackInt64Decoder) Decode(out []int64) (int, error) { 248 max := shared_utils.MinInt(len(out), d.nvals) 249 if max == 0 { 250 return 0, nil 251 } 252 253 out = out[:max] 254 if !d.usedFirst { 255 out[0] = d.lastVal 256 out = out[1:] 257 d.usedFirst = true 258 } 259 260 var err error 261 for len(out) > 0 { 262 if d.currentBlockVals == 0 { 263 err = d.initBlock() 264 } 265 if d.currentMiniBlockVals == 0 { 266 err = d.unpackNextMini() 267 } 268 269 if err != nil { 270 return 0, err 271 } 272 273 start := int(d.valsPerMini - d.currentMiniBlockVals) 274 end := shared_utils.MinInt(int(d.valsPerMini), len(out)) 275 copy(out, d.miniBlockValues[start:end]) 276 277 numCopied := end - start 278 out = out[numCopied:] 279 d.currentBlockVals -= uint32(numCopied) 280 d.currentMiniBlockVals -= uint32(numCopied) 281 } 282 return max, nil 283 } 284 285 // Type returns the physical parquet type that this decoder decodes, in this case Int64 286 func (DeltaBitPackInt64Decoder) Type() parquet.Type { 287 return parquet.Types.Int64 288 } 289 290 // DecodeSpaced is like Decode, but the result is spaced out appropriately based on the passed in bitmap 291 func (d DeltaBitPackInt64Decoder) DecodeSpaced(out []int64, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 292 toread := len(out) - nullCount 293 values, err := d.Decode(out[:toread]) 294 if err != nil { 295 return values, err 296 } 297 if values != toread { 298 return values, xerrors.New("parquet: number of values / definition levels read did not match") 299 } 300 301 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 302 } 303 304 const ( 305 // block size must be a multiple of 128 306 defaultBlockSize = 128 307 defaultNumMiniBlocks = 4 308 // block size / number of mini blocks must result in a multiple of 32 309 defaultNumValuesPerMini = 32 310 // max size of the header for the delta blocks 311 maxHeaderWriterSize = 32 312 ) 313 314 // deltaBitPackEncoder is an encoder for the DeltaBinary Packing format 315 // as per the parquet spec. 316 // 317 // Consists of a header followed by blocks of delta encoded values binary packed. 318 // 319 // Format 320 // [header] [block 1] [block 2] ... [block N] 321 // 322 // Header 323 // [block size] [number of mini blocks per block] [total value count] [first value] 324 // 325 // Block 326 // [min delta] [list of bitwidths of the miniblocks] [miniblocks...] 327 // 328 // Sets aside bytes at the start of the internal buffer where the header will be written, 329 // and only writes the header when FlushValues is called before returning it. 330 type deltaBitPackEncoder struct { 331 encoder 332 333 bitWriter *utils.BitWriter 334 totalVals uint64 335 firstVal int64 336 currentVal int64 337 338 blockSize uint64 339 miniBlockSize uint64 340 numMiniBlocks uint64 341 deltas []int64 342 } 343 344 // flushBlock flushes out a finished block for writing to the underlying encoder 345 func (enc *deltaBitPackEncoder) flushBlock() { 346 if len(enc.deltas) == 0 { 347 return 348 } 349 350 // determine the minimum delta value 351 minDelta := int64(math.MaxInt64) 352 for _, delta := range enc.deltas { 353 if delta < minDelta { 354 minDelta = delta 355 } 356 } 357 358 enc.bitWriter.WriteZigZagVlqInt(minDelta) 359 // reserve enough bytes to write out our miniblock deltas 360 offset := enc.bitWriter.ReserveBytes(int(enc.numMiniBlocks)) 361 362 valuesToWrite := int64(len(enc.deltas)) 363 for i := 0; i < int(enc.numMiniBlocks); i++ { 364 n := shared_utils.Min(int64(enc.miniBlockSize), valuesToWrite) 365 if n == 0 { 366 break 367 } 368 369 maxDelta := int64(math.MinInt64) 370 start := i * int(enc.miniBlockSize) 371 for _, val := range enc.deltas[start : start+int(n)] { 372 maxDelta = shared_utils.Max(maxDelta, val) 373 } 374 375 // compute bit width to store (max_delta - min_delta) 376 width := uint(bits.Len64(uint64(maxDelta - minDelta))) 377 // write out the bit width we used into the bytes we reserved earlier 378 enc.bitWriter.WriteAt([]byte{byte(width)}, int64(offset+i)) 379 380 // write out our deltas 381 for _, val := range enc.deltas[start : start+int(n)] { 382 enc.bitWriter.WriteValue(uint64(val-minDelta), width) 383 } 384 385 valuesToWrite -= n 386 387 // pad the last block if n < miniBlockSize 388 for ; n < int64(enc.miniBlockSize); n++ { 389 enc.bitWriter.WriteValue(0, width) 390 } 391 } 392 enc.deltas = enc.deltas[:0] 393 } 394 395 // putInternal is the implementation for actually writing data which must be 396 // integral data as int, int8, int32, or int64. 397 func (enc *deltaBitPackEncoder) putInternal(data interface{}) { 398 v := reflect.ValueOf(data) 399 if v.Len() == 0 { 400 return 401 } 402 403 idx := 0 404 if enc.totalVals == 0 { 405 enc.blockSize = defaultBlockSize 406 enc.numMiniBlocks = defaultNumMiniBlocks 407 enc.miniBlockSize = defaultNumValuesPerMini 408 409 enc.firstVal = v.Index(0).Int() 410 enc.currentVal = enc.firstVal 411 idx = 1 412 413 enc.bitWriter = utils.NewBitWriter(enc.sink) 414 } 415 416 enc.totalVals += uint64(v.Len()) 417 for ; idx < v.Len(); idx++ { 418 val := v.Index(idx).Int() 419 enc.deltas = append(enc.deltas, val-enc.currentVal) 420 enc.currentVal = val 421 if len(enc.deltas) == int(enc.blockSize) { 422 enc.flushBlock() 423 } 424 } 425 } 426 427 // FlushValues flushes any remaining data and returns the finished encoded buffer 428 // or returns nil and any error encountered during flushing. 429 func (enc *deltaBitPackEncoder) FlushValues() (Buffer, error) { 430 if enc.bitWriter != nil { 431 // write any remaining values 432 enc.flushBlock() 433 enc.bitWriter.Flush(true) 434 } else { 435 enc.blockSize = defaultBlockSize 436 enc.numMiniBlocks = defaultNumMiniBlocks 437 enc.miniBlockSize = defaultNumValuesPerMini 438 } 439 440 buffer := make([]byte, maxHeaderWriterSize) 441 headerWriter := utils.NewBitWriter(utils.NewWriterAtBuffer(buffer)) 442 443 headerWriter.WriteVlqInt(uint64(enc.blockSize)) 444 headerWriter.WriteVlqInt(uint64(enc.numMiniBlocks)) 445 headerWriter.WriteVlqInt(uint64(enc.totalVals)) 446 headerWriter.WriteZigZagVlqInt(int64(enc.firstVal)) 447 headerWriter.Flush(false) 448 449 buffer = buffer[:headerWriter.Written()] 450 enc.totalVals = 0 451 452 if enc.bitWriter != nil { 453 flushed := enc.sink.Finish() 454 defer flushed.Release() 455 456 buffer = append(buffer, flushed.Buf()[:enc.bitWriter.Written()]...) 457 } 458 return poolBuffer{memory.NewBufferBytes(buffer)}, nil 459 } 460 461 // EstimatedDataEncodedSize returns the current amount of data actually flushed out and written 462 func (enc *deltaBitPackEncoder) EstimatedDataEncodedSize() int64 { 463 return int64(enc.bitWriter.Written()) 464 } 465 466 // DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data. 467 type DeltaBitPackInt32Encoder struct { 468 *deltaBitPackEncoder 469 } 470 471 // Put writes the values from the provided slice of int32 to the encoder 472 func (enc DeltaBitPackInt32Encoder) Put(in []int32) { 473 enc.putInternal(in) 474 } 475 476 // PutSpaced takes a slice of int32 along with a bitmap that describes the nulls and an offset into the bitmap 477 // in order to write spaced data to the encoder. 478 func (enc DeltaBitPackInt32Encoder) PutSpaced(in []int32, validBits []byte, validBitsOffset int64) { 479 buffer := memory.NewResizableBuffer(enc.mem) 480 buffer.Reserve(arrow.Int32Traits.BytesRequired(len(in))) 481 defer buffer.Release() 482 483 data := arrow.Int32Traits.CastFromBytes(buffer.Buf()) 484 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 485 enc.Put(data[:nvalid]) 486 } 487 488 // Type returns the underlying physical type this encoder works with, in this case Int32 489 func (DeltaBitPackInt32Encoder) Type() parquet.Type { 490 return parquet.Types.Int32 491 } 492 493 // DeltaBitPackInt32Encoder is an encoder for the delta bitpacking encoding for int32 data. 494 type DeltaBitPackInt64Encoder struct { 495 *deltaBitPackEncoder 496 } 497 498 // Put writes the values from the provided slice of int64 to the encoder 499 func (enc DeltaBitPackInt64Encoder) Put(in []int64) { 500 enc.putInternal(in) 501 } 502 503 // PutSpaced takes a slice of int64 along with a bitmap that describes the nulls and an offset into the bitmap 504 // in order to write spaced data to the encoder. 505 func (enc DeltaBitPackInt64Encoder) PutSpaced(in []int64, validBits []byte, validBitsOffset int64) { 506 buffer := memory.NewResizableBuffer(enc.mem) 507 buffer.Reserve(arrow.Int64Traits.BytesRequired(len(in))) 508 defer buffer.Release() 509 510 data := arrow.Int64Traits.CastFromBytes(buffer.Buf()) 511 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 512 enc.Put(data[:nvalid]) 513 } 514 515 // Type returns the underlying physical type this encoder works with, in this case Int64 516 func (DeltaBitPackInt64Encoder) Type() parquet.Type { 517 return parquet.Types.Int64 518 }