github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/utils/rle.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package utils contains various internal utilities for the parquet library
    18  // that aren't intended to be exposed to external consumers such as interfaces
    19  // and bitmap readers/writers including the RLE encoder/decoder and so on.
    20  package utils
    21  
    22  import (
    23  	"bytes"
    24  	"encoding/binary"
    25  	"io"
    26  	"math"
    27  
    28  	"github.com/apache/arrow/go/v10/arrow/bitutil"
    29  	"github.com/apache/arrow/go/v10/internal/bitutils"
    30  	"github.com/apache/arrow/go/v10/internal/utils"
    31  	"github.com/apache/arrow/go/v10/parquet"
    32  	"golang.org/x/xerrors"
    33  )
    34  
    35  //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata typed_rle_dict.gen.go.tmpl
    36  
    37  const (
    38  	MaxValuesPerLiteralRun = (1 << 6) * 8
    39  )
    40  
    41  func MinBufferSize(bitWidth int) int {
    42  	maxLiteralRunSize := 1 + bitutil.BytesForBits(int64(MaxValuesPerLiteralRun*bitWidth))
    43  	maxRepeatedRunSize := binary.MaxVarintLen32 + bitutil.BytesForBits(int64(bitWidth))
    44  	return int(utils.Max(maxLiteralRunSize, maxRepeatedRunSize))
    45  }
    46  
    47  func MaxBufferSize(width, numValues int) int {
    48  	bytesPerRun := width
    49  	numRuns := int(bitutil.BytesForBits(int64(numValues)))
    50  	literalMaxSize := numRuns + (numRuns * bytesPerRun)
    51  
    52  	minRepeatedRunSize := 1 + int(bitutil.BytesForBits(int64(width)))
    53  	repeatedMaxSize := int(bitutil.BytesForBits(int64(numValues))) * minRepeatedRunSize
    54  
    55  	return utils.MaxInt(literalMaxSize, repeatedMaxSize)
    56  }
    57  
    58  // Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
    59  // are sufficiently long, RLE is used, otherwise, the values are just bit-packed
    60  // (literal encoding).
    61  // For both types of runs, there is a byte-aligned indicator which encodes the length
    62  // of the run and the type of the run.
    63  // This encoding has the benefit that when there aren't any long enough runs, values
    64  // are always decoded at fixed (can be precomputed) bit offsets OR both the value and
    65  // the run length are byte aligned. This allows for very efficient decoding
    66  // implementations.
    67  // The encoding is:
    68  //    encoded-block := run*
    69  //    run := literal-run | repeated-run
    70  //    literal-run := literal-indicator < literal bytes >
    71  //    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
    72  //    literal-indicator := varint_encode( number_of_groups << 1 | 1)
    73  //    repeated-indicator := varint_encode( number_of_repetitions << 1 )
    74  //
    75  // Each run is preceded by a varint. The varint's least significant bit is
    76  // used to indicate whether the run is a literal run or a repeated run. The rest
    77  // of the varint is used to determine the length of the run (eg how many times the
    78  // value repeats).
    79  //
    80  // In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
    81  // in groups of 8), so that no matter the bit-width of the value, the sequence will end
    82  // on a byte boundary without padding.
    83  // Given that we know it is a multiple of 8, we store the number of 8-groups rather than
    84  // the actual number of encoded ints. (This means that the total number of encoded values
    85  // can not be determined from the encoded data, since the number of values in the last
    86  // group may not be a multiple of 8). For the last group of literal runs, we pad
    87  // the group to 8 with zeros. This allows for 8 at a time decoding on the read side
    88  // without the need for additional checks.
    89  //
    90  // There is a break-even point when it is more storage efficient to do run length
    91  // encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
    92  // for both the repeated encoding or the literal encoding.  This value can always
    93  // be computed based on the bit-width.
    94  //
    95  // Examples with bit-width 1 (eg encoding booleans):
    96  // ----------------------------------------
    97  // 100 1s followed by 100 0s:
    98  // <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
    99  //  - (total 4 bytes)
   100  //
   101  // alternating 1s and 0s (200 total):
   102  // 200 ints = 25 groups of 8
   103  // <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
   104  // (total 26 bytes, 1 byte overhead)
   105  //
   106  
   107  type RleDecoder struct {
   108  	r *BitReader
   109  
   110  	bitWidth int
   111  	curVal   uint64
   112  	repCount int32
   113  	litCount int32
   114  }
   115  
   116  func NewRleDecoder(data *bytes.Reader, width int) *RleDecoder {
   117  	return &RleDecoder{r: NewBitReader(data), bitWidth: width}
   118  }
   119  
   120  func (r *RleDecoder) Reset(data *bytes.Reader, width int) {
   121  	r.bitWidth = width
   122  	r.curVal = 0
   123  	r.repCount = 0
   124  	r.litCount = 0
   125  	r.r.Reset(data)
   126  }
   127  
   128  func (r *RleDecoder) Next() bool {
   129  	indicator, ok := r.r.GetVlqInt()
   130  	if !ok {
   131  		return false
   132  	}
   133  
   134  	literal := (indicator & 1) != 0
   135  	count := uint32(indicator >> 1)
   136  	if literal {
   137  		if count == 0 || count > uint32(math.MaxInt32/8) {
   138  			return false
   139  		}
   140  		r.litCount = int32(count) * 8
   141  	} else {
   142  		if count == 0 || count > uint32(math.MaxInt32) {
   143  			return false
   144  		}
   145  		r.repCount = int32(count)
   146  
   147  		nbytes := int(bitutil.BytesForBits(int64(r.bitWidth)))
   148  		switch {
   149  		case nbytes > 4:
   150  			if !r.r.GetAligned(nbytes, &r.curVal) {
   151  				return false
   152  			}
   153  		case nbytes > 2:
   154  			var val uint32
   155  			if !r.r.GetAligned(nbytes, &val) {
   156  				return false
   157  			}
   158  			r.curVal = uint64(val)
   159  		case nbytes > 1:
   160  			var val uint16
   161  			if !r.r.GetAligned(nbytes, &val) {
   162  				return false
   163  			}
   164  			r.curVal = uint64(val)
   165  		default:
   166  			var val uint8
   167  			if !r.r.GetAligned(nbytes, &val) {
   168  				return false
   169  			}
   170  			r.curVal = uint64(val)
   171  		}
   172  	}
   173  	return true
   174  }
   175  
   176  func (r *RleDecoder) GetValue() (uint64, bool) {
   177  	vals := make([]uint64, 1)
   178  	n := r.GetBatch(vals)
   179  	return vals[0], n == 1
   180  }
   181  
   182  func (r *RleDecoder) GetBatch(values []uint64) int {
   183  	read := 0
   184  	size := len(values)
   185  
   186  	out := values
   187  	for read < size {
   188  		remain := size - read
   189  
   190  		if r.repCount > 0 {
   191  			repbatch := int(math.Min(float64(remain), float64(r.repCount)))
   192  			for i := 0; i < repbatch; i++ {
   193  				out[i] = r.curVal
   194  			}
   195  
   196  			r.repCount -= int32(repbatch)
   197  			read += repbatch
   198  			out = out[repbatch:]
   199  		} else if r.litCount > 0 {
   200  			litbatch := int(math.Min(float64(remain), float64(r.litCount)))
   201  			n, _ := r.r.GetBatch(uint(r.bitWidth), out[:litbatch])
   202  			if n != litbatch {
   203  				return read
   204  			}
   205  
   206  			r.litCount -= int32(litbatch)
   207  			read += litbatch
   208  			out = out[litbatch:]
   209  		} else {
   210  			if !r.Next() {
   211  				return read
   212  			}
   213  		}
   214  	}
   215  	return read
   216  }
   217  
   218  func (r *RleDecoder) GetBatchSpaced(vals []uint64, nullcount int, validBits []byte, validBitsOffset int64) (int, error) {
   219  	if nullcount == 0 {
   220  		return r.GetBatch(vals), nil
   221  	}
   222  
   223  	converter := plainConverter{}
   224  	blockCounter := bitutils.NewBitBlockCounter(validBits, validBitsOffset, int64(len(vals)))
   225  
   226  	var (
   227  		totalProcessed int
   228  		processed      int
   229  		block          bitutils.BitBlockCount
   230  		err            error
   231  	)
   232  
   233  	for {
   234  		block = blockCounter.NextFourWords()
   235  		if block.Len == 0 {
   236  			break
   237  		}
   238  
   239  		if block.AllSet() {
   240  			processed = r.GetBatch(vals[:block.Len])
   241  		} else if block.NoneSet() {
   242  			converter.FillZero(vals[:block.Len])
   243  			processed = int(block.Len)
   244  		} else {
   245  			processed, err = r.getspaced(converter, vals, int(block.Len), int(block.Len-block.Popcnt), validBits, validBitsOffset)
   246  			if err != nil {
   247  				return totalProcessed, err
   248  			}
   249  		}
   250  
   251  		totalProcessed += processed
   252  		vals = vals[int(block.Len):]
   253  		validBitsOffset += int64(block.Len)
   254  
   255  		if processed != int(block.Len) {
   256  			break
   257  		}
   258  	}
   259  	return totalProcessed, nil
   260  }
   261  
   262  func (r *RleDecoder) getspaced(dc DictionaryConverter, vals interface{}, batchSize, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   263  	switch vals := vals.(type) {
   264  	case []int32:
   265  		return r.getspacedInt32(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   266  	case []int64:
   267  		return r.getspacedInt64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   268  	case []float32:
   269  		return r.getspacedFloat32(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   270  	case []float64:
   271  		return r.getspacedFloat64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   272  	case []parquet.ByteArray:
   273  		return r.getspacedByteArray(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   274  	case []parquet.FixedLenByteArray:
   275  		return r.getspacedFixedLenByteArray(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   276  	case []parquet.Int96:
   277  		return r.getspacedInt96(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   278  	case []uint64:
   279  		return r.getspacedUint64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   280  	default:
   281  		return 0, xerrors.New("parquet/rle: getspaced invalid type")
   282  	}
   283  }
   284  
   285  func (r *RleDecoder) getspacedUint64(dc DictionaryConverter, vals []uint64, batchSize, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   286  	if nullCount == batchSize {
   287  		dc.FillZero(vals[:batchSize])
   288  		return batchSize, nil
   289  	}
   290  
   291  	read := 0
   292  	remain := batchSize - nullCount
   293  
   294  	const bufferSize = 1024
   295  	var indexbuffer [bufferSize]IndexType
   296  
   297  	// assume no bits to start
   298  	bitReader := bitutils.NewBitRunReader(validBits, validBitsOffset, int64(batchSize))
   299  	validRun := bitReader.NextRun()
   300  	for read < batchSize {
   301  		if validRun.Len == 0 {
   302  			validRun = bitReader.NextRun()
   303  		}
   304  
   305  		if !validRun.Set {
   306  			dc.FillZero(vals[:int(validRun.Len)])
   307  			vals = vals[int(validRun.Len):]
   308  			read += int(validRun.Len)
   309  			validRun.Len = 0
   310  			continue
   311  		}
   312  
   313  		if r.repCount == 0 && r.litCount == 0 {
   314  			if !r.Next() {
   315  				return read, nil
   316  			}
   317  		}
   318  
   319  		var batch int
   320  		switch {
   321  		case r.repCount > 0:
   322  			batch, remain, validRun = r.consumeRepeatCounts(read, batchSize, remain, validRun, bitReader)
   323  			current := IndexType(r.curVal)
   324  			if !dc.IsValid(current) {
   325  				return read, nil
   326  			}
   327  			dc.Fill(vals[:batch], current)
   328  		case r.litCount > 0:
   329  			var (
   330  				litread int
   331  				skipped int
   332  				err     error
   333  			)
   334  			litread, skipped, validRun, err = r.consumeLiteralsUint64(dc, vals, remain, indexbuffer[:], validRun, bitReader)
   335  			if err != nil {
   336  				return read, err
   337  			}
   338  			batch = litread + skipped
   339  			remain -= litread
   340  		}
   341  
   342  		vals = vals[batch:]
   343  		read += batch
   344  	}
   345  	return read, nil
   346  }
   347  
   348  func (r *RleDecoder) consumeRepeatCounts(read, batchSize, remain int, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun) {
   349  	// Consume the entire repeat counts incrementing repeat_batch to
   350  	// be the total of nulls + values consumed, we only need to
   351  	// get the total count because we can fill in the same value for
   352  	// nulls and non-nulls. This proves to be a big efficiency win.
   353  	repeatBatch := 0
   354  	for r.repCount > 0 && (read+repeatBatch) < batchSize {
   355  		if run.Set {
   356  			updateSize := int(utils.Min(run.Len, int64(r.repCount)))
   357  			r.repCount -= int32(updateSize)
   358  			repeatBatch += updateSize
   359  			run.Len -= int64(updateSize)
   360  			remain -= updateSize
   361  		} else {
   362  			repeatBatch += int(run.Len)
   363  			run.Len = 0
   364  		}
   365  
   366  		if run.Len == 0 {
   367  			run = bitRdr.NextRun()
   368  		}
   369  	}
   370  	return repeatBatch, remain, run
   371  }
   372  
   373  func (r *RleDecoder) consumeLiteralsUint64(dc DictionaryConverter, vals []uint64, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) {
   374  	batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf))
   375  	buf = buf[:batch]
   376  
   377  	n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf)
   378  	if n != batch {
   379  		return 0, 0, run, xerrors.New("was not able to retrieve correct number of indexes")
   380  	}
   381  
   382  	if !dc.IsValid(buf...) {
   383  		return 0, 0, run, xerrors.New("invalid index values found for dictionary converter")
   384  	}
   385  
   386  	var (
   387  		read    int
   388  		skipped int
   389  	)
   390  	for read < batch {
   391  		if run.Set {
   392  			updateSize := utils.MinInt(batch-read, int(run.Len))
   393  			if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil {
   394  				return 0, 0, run, err
   395  			}
   396  			read += updateSize
   397  			vals = vals[updateSize:]
   398  			run.Len -= int64(updateSize)
   399  		} else {
   400  			dc.FillZero(vals[:int(run.Len)])
   401  			vals = vals[int(run.Len):]
   402  			skipped += int(run.Len)
   403  			run.Len = 0
   404  		}
   405  		if run.Len == 0 {
   406  			run = bitRdr.NextRun()
   407  		}
   408  	}
   409  	r.litCount -= int32(batch)
   410  	return read, skipped, run, nil
   411  }
   412  
   413  func (r *RleDecoder) GetBatchWithDict(dc DictionaryConverter, vals interface{}) (int, error) {
   414  	switch vals := vals.(type) {
   415  	case []int32:
   416  		return r.GetBatchWithDictInt32(dc, vals)
   417  	case []int64:
   418  		return r.GetBatchWithDictInt64(dc, vals)
   419  	case []float32:
   420  		return r.GetBatchWithDictFloat32(dc, vals)
   421  	case []float64:
   422  		return r.GetBatchWithDictFloat64(dc, vals)
   423  	case []parquet.ByteArray:
   424  		return r.GetBatchWithDictByteArray(dc, vals)
   425  	case []parquet.FixedLenByteArray:
   426  		return r.GetBatchWithDictFixedLenByteArray(dc, vals)
   427  	case []parquet.Int96:
   428  		return r.GetBatchWithDictInt96(dc, vals)
   429  	default:
   430  		return 0, xerrors.New("parquet/rle: GetBatchWithDict invalid type")
   431  	}
   432  }
   433  
   434  func (r *RleDecoder) GetBatchWithDictSpaced(dc DictionaryConverter, vals interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   435  	switch vals := vals.(type) {
   436  	case []int32:
   437  		return r.GetBatchWithDictSpacedInt32(dc, vals, nullCount, validBits, validBitsOffset)
   438  	case []int64:
   439  		return r.GetBatchWithDictSpacedInt64(dc, vals, nullCount, validBits, validBitsOffset)
   440  	case []float32:
   441  		return r.GetBatchWithDictSpacedFloat32(dc, vals, nullCount, validBits, validBitsOffset)
   442  	case []float64:
   443  		return r.GetBatchWithDictSpacedFloat64(dc, vals, nullCount, validBits, validBitsOffset)
   444  	case []parquet.ByteArray:
   445  		return r.GetBatchWithDictSpacedByteArray(dc, vals, nullCount, validBits, validBitsOffset)
   446  	case []parquet.FixedLenByteArray:
   447  		return r.GetBatchWithDictSpacedFixedLenByteArray(dc, vals, nullCount, validBits, validBitsOffset)
   448  	case []parquet.Int96:
   449  		return r.GetBatchWithDictSpacedInt96(dc, vals, nullCount, validBits, validBitsOffset)
   450  	default:
   451  		return 0, xerrors.New("parquet/rle: GetBatchWithDictSpaced invalid type")
   452  	}
   453  }
   454  
   455  type RleEncoder struct {
   456  	w *BitWriter
   457  
   458  	buffer                 []uint64
   459  	BitWidth               int
   460  	curVal                 uint64
   461  	repCount               int32
   462  	litCount               int32
   463  	literalIndicatorOffset int
   464  
   465  	indicatorBuffer [1]byte
   466  }
   467  
   468  func NewRleEncoder(w io.WriterAt, width int) *RleEncoder {
   469  	return &RleEncoder{
   470  		w:                      NewBitWriter(w),
   471  		buffer:                 make([]uint64, 0, 8),
   472  		BitWidth:               width,
   473  		literalIndicatorOffset: -1,
   474  	}
   475  }
   476  
   477  func (r *RleEncoder) Flush() int {
   478  	if r.litCount > 0 || r.repCount > 0 || len(r.buffer) > 0 {
   479  		allRep := r.litCount == 0 && (r.repCount == int32(len(r.buffer)) || len(r.buffer) == 0)
   480  		if r.repCount > 0 && allRep {
   481  			r.flushRepeated()
   482  		} else {
   483  			// buffer the last grou pof literals to 8 by padding with 0s
   484  			for len(r.buffer) != 0 && len(r.buffer) < 8 {
   485  				r.buffer = append(r.buffer, 0)
   486  			}
   487  
   488  			r.litCount += int32(len(r.buffer))
   489  			r.flushLiteral(true)
   490  			r.repCount = 0
   491  		}
   492  	}
   493  	r.w.Flush(false)
   494  	return r.w.Written()
   495  }
   496  
   497  func (r *RleEncoder) flushBuffered(done bool) (err error) {
   498  	if r.repCount >= 8 {
   499  		// clear buffered values. they are part of the repeated run now and we
   500  		// don't want to flush them as literals
   501  		r.buffer = r.buffer[:0]
   502  		if r.litCount != 0 {
   503  			// there was  current literal run. all values flushed but need to update the indicator
   504  			err = r.flushLiteral(true)
   505  		}
   506  		return
   507  	}
   508  
   509  	r.litCount += int32(len(r.buffer))
   510  	ngroups := r.litCount / 8
   511  	if ngroups+1 >= (1 << 6) {
   512  		// we need to start a new literal run because the indicator byte we've reserved
   513  		// cannot store any more values
   514  		err = r.flushLiteral(true)
   515  	} else {
   516  		err = r.flushLiteral(done)
   517  	}
   518  	r.repCount = 0
   519  	return
   520  }
   521  
   522  func (r *RleEncoder) flushLiteral(updateIndicator bool) (err error) {
   523  	if r.literalIndicatorOffset == -1 {
   524  		r.literalIndicatorOffset = r.w.ReserveBytes(1)
   525  	}
   526  
   527  	for _, val := range r.buffer {
   528  		if err = r.w.WriteValue(val, uint(r.BitWidth)); err != nil {
   529  			return
   530  		}
   531  	}
   532  	r.buffer = r.buffer[:0]
   533  
   534  	if updateIndicator {
   535  		// at this point we need to write the indicator byte for the literal run.
   536  		// we only reserve one byte, to allow for streaming writes of literal values.
   537  		// the logic makes sure we flush literal runs often enough to not overrun the 1 byte.
   538  		ngroups := r.litCount / 8
   539  		r.indicatorBuffer[0] = byte((ngroups << 1) | 1)
   540  		_, err = r.w.WriteAt(r.indicatorBuffer[:], int64(r.literalIndicatorOffset))
   541  		r.literalIndicatorOffset = -1
   542  		r.litCount = 0
   543  	}
   544  	return
   545  }
   546  
   547  func (r *RleEncoder) flushRepeated() (ret bool) {
   548  	indicator := r.repCount << 1
   549  
   550  	ret = r.w.WriteVlqInt(uint64(indicator))
   551  	ret = ret && r.w.WriteAligned(r.curVal, int(bitutil.BytesForBits(int64(r.BitWidth))))
   552  
   553  	r.repCount = 0
   554  	r.buffer = r.buffer[:0]
   555  	return
   556  }
   557  
   558  // Put buffers input values 8 at a time. after seeing all 8 values,
   559  // it decides whether they should be encoded as a literal or repeated run.
   560  func (r *RleEncoder) Put(value uint64) error {
   561  	if r.curVal == value {
   562  		r.repCount++
   563  		if r.repCount > 8 {
   564  			// this is just a continuation of the current run, no need to buffer the values
   565  			// NOTE this is the fast path for long repeated runs
   566  			return nil
   567  		}
   568  	} else {
   569  		if r.repCount >= 8 {
   570  			if !r.flushRepeated() {
   571  				return xerrors.New("failed to flush repeated value")
   572  			}
   573  		}
   574  		r.repCount = 1
   575  		r.curVal = value
   576  	}
   577  
   578  	r.buffer = append(r.buffer, value)
   579  	if len(r.buffer) == 8 {
   580  		return r.flushBuffered(false)
   581  	}
   582  	return nil
   583  }
   584  
   585  func (r *RleEncoder) Clear() {
   586  	r.curVal = 0
   587  	r.repCount = 0
   588  	r.buffer = r.buffer[:0]
   589  	r.litCount = 0
   590  	r.literalIndicatorOffset = -1
   591  	r.w.Clear()
   592  }