github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/utils/rle.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package utils contains various internal utilities for the parquet library
    18  // that aren't intended to be exposed to external consumers such as interfaces
    19  // and bitmap readers/writers including the RLE encoder/decoder and so on.
    20  package utils
    21  
    22  import (
    23  	"bytes"
    24  	"encoding/binary"
    25  	"math"
    26  
    27  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    28  	"github.com/apache/arrow/go/v14/internal/bitutils"
    29  	"github.com/apache/arrow/go/v14/internal/utils"
    30  	"github.com/apache/arrow/go/v14/parquet"
    31  	"golang.org/x/xerrors"
    32  )
    33  
    34  //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata typed_rle_dict.gen.go.tmpl
    35  
    36  const (
    37  	MaxValuesPerLiteralRun = (1 << 6) * 8
    38  )
    39  
    40  func MinBufferSize(bitWidth int) int {
    41  	maxLiteralRunSize := 1 + bitutil.BytesForBits(int64(MaxValuesPerLiteralRun*bitWidth))
    42  	maxRepeatedRunSize := binary.MaxVarintLen32 + bitutil.BytesForBits(int64(bitWidth))
    43  	return int(utils.Max(maxLiteralRunSize, maxRepeatedRunSize))
    44  }
    45  
    46  func MaxBufferSize(width, numValues int) int {
    47  	bytesPerRun := width
    48  	numRuns := int(bitutil.BytesForBits(int64(numValues)))
    49  	literalMaxSize := numRuns + (numRuns * bytesPerRun)
    50  
    51  	minRepeatedRunSize := 1 + int(bitutil.BytesForBits(int64(width)))
    52  	repeatedMaxSize := int(bitutil.BytesForBits(int64(numValues))) * minRepeatedRunSize
    53  
    54  	return utils.MaxInt(literalMaxSize, repeatedMaxSize)
    55  }
    56  
    57  // Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
    58  // are sufficiently long, RLE is used, otherwise, the values are just bit-packed
    59  // (literal encoding).
    60  // For both types of runs, there is a byte-aligned indicator which encodes the length
    61  // of the run and the type of the run.
    62  // This encoding has the benefit that when there aren't any long enough runs, values
    63  // are always decoded at fixed (can be precomputed) bit offsets OR both the value and
    64  // the run length are byte aligned. This allows for very efficient decoding
    65  // implementations.
    66  // The encoding is:
    67  //    encoded-block := run*
    68  //    run := literal-run | repeated-run
    69  //    literal-run := literal-indicator < literal bytes >
    70  //    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
    71  //    literal-indicator := varint_encode( number_of_groups << 1 | 1)
    72  //    repeated-indicator := varint_encode( number_of_repetitions << 1 )
    73  //
    74  // Each run is preceded by a varint. The varint's least significant bit is
    75  // used to indicate whether the run is a literal run or a repeated run. The rest
    76  // of the varint is used to determine the length of the run (eg how many times the
    77  // value repeats).
    78  //
    79  // In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
    80  // in groups of 8), so that no matter the bit-width of the value, the sequence will end
    81  // on a byte boundary without padding.
    82  // Given that we know it is a multiple of 8, we store the number of 8-groups rather than
    83  // the actual number of encoded ints. (This means that the total number of encoded values
    84  // can not be determined from the encoded data, since the number of values in the last
    85  // group may not be a multiple of 8). For the last group of literal runs, we pad
    86  // the group to 8 with zeros. This allows for 8 at a time decoding on the read side
    87  // without the need for additional checks.
    88  //
    89  // There is a break-even point when it is more storage efficient to do run length
    90  // encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
    91  // for both the repeated encoding or the literal encoding.  This value can always
    92  // be computed based on the bit-width.
    93  //
    94  // Examples with bit-width 1 (eg encoding booleans):
    95  // ----------------------------------------
    96  // 100 1s followed by 100 0s:
    97  // <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
    98  //  - (total 4 bytes)
    99  //
   100  // alternating 1s and 0s (200 total):
   101  // 200 ints = 25 groups of 8
   102  // <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
   103  // (total 26 bytes, 1 byte overhead)
   104  //
   105  
   106  type RleDecoder struct {
   107  	r *BitReader
   108  
   109  	bitWidth int
   110  	curVal   uint64
   111  	repCount int32
   112  	litCount int32
   113  }
   114  
   115  func NewRleDecoder(data *bytes.Reader, width int) *RleDecoder {
   116  	return &RleDecoder{r: NewBitReader(data), bitWidth: width}
   117  }
   118  
   119  func (r *RleDecoder) Reset(data *bytes.Reader, width int) {
   120  	r.bitWidth = width
   121  	r.curVal = 0
   122  	r.repCount = 0
   123  	r.litCount = 0
   124  	r.r.Reset(data)
   125  }
   126  
   127  func (r *RleDecoder) Next() bool {
   128  	indicator, ok := r.r.GetVlqInt()
   129  	if !ok {
   130  		return false
   131  	}
   132  
   133  	literal := (indicator & 1) != 0
   134  	count := uint32(indicator >> 1)
   135  	if literal {
   136  		if count == 0 || count > uint32(math.MaxInt32/8) {
   137  			return false
   138  		}
   139  		r.litCount = int32(count) * 8
   140  	} else {
   141  		if count == 0 || count > uint32(math.MaxInt32) {
   142  			return false
   143  		}
   144  		r.repCount = int32(count)
   145  
   146  		nbytes := int(bitutil.BytesForBits(int64(r.bitWidth)))
   147  		switch {
   148  		case nbytes > 4:
   149  			if !r.r.GetAligned(nbytes, &r.curVal) {
   150  				return false
   151  			}
   152  		case nbytes > 2:
   153  			var val uint32
   154  			if !r.r.GetAligned(nbytes, &val) {
   155  				return false
   156  			}
   157  			r.curVal = uint64(val)
   158  		case nbytes > 1:
   159  			var val uint16
   160  			if !r.r.GetAligned(nbytes, &val) {
   161  				return false
   162  			}
   163  			r.curVal = uint64(val)
   164  		default:
   165  			var val uint8
   166  			if !r.r.GetAligned(nbytes, &val) {
   167  				return false
   168  			}
   169  			r.curVal = uint64(val)
   170  		}
   171  	}
   172  	return true
   173  }
   174  
   175  func (r *RleDecoder) GetValue() (uint64, bool) {
   176  	vals := make([]uint64, 1)
   177  	n := r.GetBatch(vals)
   178  	return vals[0], n == 1
   179  }
   180  
   181  func (r *RleDecoder) GetBatch(values []uint64) int {
   182  	read := 0
   183  	size := len(values)
   184  
   185  	out := values
   186  	for read < size {
   187  		remain := size - read
   188  
   189  		if r.repCount > 0 {
   190  			repbatch := int(math.Min(float64(remain), float64(r.repCount)))
   191  			for i := 0; i < repbatch; i++ {
   192  				out[i] = r.curVal
   193  			}
   194  
   195  			r.repCount -= int32(repbatch)
   196  			read += repbatch
   197  			out = out[repbatch:]
   198  		} else if r.litCount > 0 {
   199  			litbatch := int(math.Min(float64(remain), float64(r.litCount)))
   200  			n, _ := r.r.GetBatch(uint(r.bitWidth), out[:litbatch])
   201  			if n != litbatch {
   202  				return read
   203  			}
   204  
   205  			r.litCount -= int32(litbatch)
   206  			read += litbatch
   207  			out = out[litbatch:]
   208  		} else {
   209  			if !r.Next() {
   210  				return read
   211  			}
   212  		}
   213  	}
   214  	return read
   215  }
   216  
   217  func (r *RleDecoder) GetBatchSpaced(vals []uint64, nullcount int, validBits []byte, validBitsOffset int64) (int, error) {
   218  	if nullcount == 0 {
   219  		return r.GetBatch(vals), nil
   220  	}
   221  
   222  	converter := plainConverter{}
   223  	blockCounter := bitutils.NewBitBlockCounter(validBits, validBitsOffset, int64(len(vals)))
   224  
   225  	var (
   226  		totalProcessed int
   227  		processed      int
   228  		block          bitutils.BitBlockCount
   229  		err            error
   230  	)
   231  
   232  	for {
   233  		block = blockCounter.NextFourWords()
   234  		if block.Len == 0 {
   235  			break
   236  		}
   237  
   238  		if block.AllSet() {
   239  			processed = r.GetBatch(vals[:block.Len])
   240  		} else if block.NoneSet() {
   241  			converter.FillZero(vals[:block.Len])
   242  			processed = int(block.Len)
   243  		} else {
   244  			processed, err = r.getspaced(converter, vals, int(block.Len), int(block.Len-block.Popcnt), validBits, validBitsOffset)
   245  			if err != nil {
   246  				return totalProcessed, err
   247  			}
   248  		}
   249  
   250  		totalProcessed += processed
   251  		vals = vals[int(block.Len):]
   252  		validBitsOffset += int64(block.Len)
   253  
   254  		if processed != int(block.Len) {
   255  			break
   256  		}
   257  	}
   258  	return totalProcessed, nil
   259  }
   260  
   261  func (r *RleDecoder) getspaced(dc DictionaryConverter, vals interface{}, batchSize, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   262  	switch vals := vals.(type) {
   263  	case []int32:
   264  		return r.getspacedInt32(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   265  	case []int64:
   266  		return r.getspacedInt64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   267  	case []float32:
   268  		return r.getspacedFloat32(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   269  	case []float64:
   270  		return r.getspacedFloat64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   271  	case []parquet.ByteArray:
   272  		return r.getspacedByteArray(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   273  	case []parquet.FixedLenByteArray:
   274  		return r.getspacedFixedLenByteArray(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   275  	case []parquet.Int96:
   276  		return r.getspacedInt96(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   277  	case []uint64:
   278  		return r.getspacedUint64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   279  	default:
   280  		return 0, xerrors.New("parquet/rle: getspaced invalid type")
   281  	}
   282  }
   283  
   284  func (r *RleDecoder) getspacedUint64(dc DictionaryConverter, vals []uint64, batchSize, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   285  	if nullCount == batchSize {
   286  		dc.FillZero(vals[:batchSize])
   287  		return batchSize, nil
   288  	}
   289  
   290  	read := 0
   291  	remain := batchSize - nullCount
   292  
   293  	const bufferSize = 1024
   294  	var indexbuffer [bufferSize]IndexType
   295  
   296  	// assume no bits to start
   297  	bitReader := bitutils.NewBitRunReader(validBits, validBitsOffset, int64(batchSize))
   298  	validRun := bitReader.NextRun()
   299  	for read < batchSize {
   300  		if validRun.Len == 0 {
   301  			validRun = bitReader.NextRun()
   302  		}
   303  
   304  		if !validRun.Set {
   305  			dc.FillZero(vals[:int(validRun.Len)])
   306  			vals = vals[int(validRun.Len):]
   307  			read += int(validRun.Len)
   308  			validRun.Len = 0
   309  			continue
   310  		}
   311  
   312  		if r.repCount == 0 && r.litCount == 0 {
   313  			if !r.Next() {
   314  				return read, nil
   315  			}
   316  		}
   317  
   318  		var batch int
   319  		switch {
   320  		case r.repCount > 0:
   321  			batch, remain, validRun = r.consumeRepeatCounts(read, batchSize, remain, validRun, bitReader)
   322  			current := IndexType(r.curVal)
   323  			if !dc.IsValid(current) {
   324  				return read, nil
   325  			}
   326  			dc.Fill(vals[:batch], current)
   327  		case r.litCount > 0:
   328  			var (
   329  				litread int
   330  				skipped int
   331  				err     error
   332  			)
   333  			litread, skipped, validRun, err = r.consumeLiteralsUint64(dc, vals, remain, indexbuffer[:], validRun, bitReader)
   334  			if err != nil {
   335  				return read, err
   336  			}
   337  			batch = litread + skipped
   338  			remain -= litread
   339  		}
   340  
   341  		vals = vals[batch:]
   342  		read += batch
   343  	}
   344  	return read, nil
   345  }
   346  
   347  func (r *RleDecoder) consumeRepeatCounts(read, batchSize, remain int, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun) {
   348  	// Consume the entire repeat counts incrementing repeat_batch to
   349  	// be the total of nulls + values consumed, we only need to
   350  	// get the total count because we can fill in the same value for
   351  	// nulls and non-nulls. This proves to be a big efficiency win.
   352  	repeatBatch := 0
   353  	for r.repCount > 0 && (read+repeatBatch) < batchSize {
   354  		if run.Set {
   355  			updateSize := int(utils.Min(run.Len, int64(r.repCount)))
   356  			r.repCount -= int32(updateSize)
   357  			repeatBatch += updateSize
   358  			run.Len -= int64(updateSize)
   359  			remain -= updateSize
   360  		} else {
   361  			repeatBatch += int(run.Len)
   362  			run.Len = 0
   363  		}
   364  
   365  		if run.Len == 0 {
   366  			run = bitRdr.NextRun()
   367  		}
   368  	}
   369  	return repeatBatch, remain, run
   370  }
   371  
   372  func (r *RleDecoder) consumeLiteralsUint64(dc DictionaryConverter, vals []uint64, remain int, buf []IndexType, run bitutils.BitRun, bitRdr bitutils.BitRunReader) (int, int, bitutils.BitRun, error) {
   373  	batch := utils.MinInt(utils.MinInt(remain, int(r.litCount)), len(buf))
   374  	buf = buf[:batch]
   375  
   376  	n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf)
   377  	if n != batch {
   378  		return 0, 0, run, xerrors.New("was not able to retrieve correct number of indexes")
   379  	}
   380  
   381  	if !dc.IsValid(buf...) {
   382  		return 0, 0, run, xerrors.New("invalid index values found for dictionary converter")
   383  	}
   384  
   385  	var (
   386  		read    int
   387  		skipped int
   388  	)
   389  	for read < batch {
   390  		if run.Set {
   391  			updateSize := utils.MinInt(batch-read, int(run.Len))
   392  			if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil {
   393  				return 0, 0, run, err
   394  			}
   395  			read += updateSize
   396  			vals = vals[updateSize:]
   397  			run.Len -= int64(updateSize)
   398  		} else {
   399  			dc.FillZero(vals[:int(run.Len)])
   400  			vals = vals[int(run.Len):]
   401  			skipped += int(run.Len)
   402  			run.Len = 0
   403  		}
   404  		if run.Len == 0 {
   405  			run = bitRdr.NextRun()
   406  		}
   407  	}
   408  	r.litCount -= int32(batch)
   409  	return read, skipped, run, nil
   410  }
   411  
   412  func (r *RleDecoder) GetBatchWithDict(dc DictionaryConverter, vals interface{}) (int, error) {
   413  	switch vals := vals.(type) {
   414  	case []int32:
   415  		return r.GetBatchWithDictInt32(dc, vals)
   416  	case []int64:
   417  		return r.GetBatchWithDictInt64(dc, vals)
   418  	case []float32:
   419  		return r.GetBatchWithDictFloat32(dc, vals)
   420  	case []float64:
   421  		return r.GetBatchWithDictFloat64(dc, vals)
   422  	case []parquet.ByteArray:
   423  		return r.GetBatchWithDictByteArray(dc, vals)
   424  	case []parquet.FixedLenByteArray:
   425  		return r.GetBatchWithDictFixedLenByteArray(dc, vals)
   426  	case []parquet.Int96:
   427  		return r.GetBatchWithDictInt96(dc, vals)
   428  	default:
   429  		return 0, xerrors.New("parquet/rle: GetBatchWithDict invalid type")
   430  	}
   431  }
   432  
   433  func (r *RleDecoder) GetBatchWithDictSpaced(dc DictionaryConverter, vals interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   434  	switch vals := vals.(type) {
   435  	case []int32:
   436  		return r.GetBatchWithDictSpacedInt32(dc, vals, nullCount, validBits, validBitsOffset)
   437  	case []int64:
   438  		return r.GetBatchWithDictSpacedInt64(dc, vals, nullCount, validBits, validBitsOffset)
   439  	case []float32:
   440  		return r.GetBatchWithDictSpacedFloat32(dc, vals, nullCount, validBits, validBitsOffset)
   441  	case []float64:
   442  		return r.GetBatchWithDictSpacedFloat64(dc, vals, nullCount, validBits, validBitsOffset)
   443  	case []parquet.ByteArray:
   444  		return r.GetBatchWithDictSpacedByteArray(dc, vals, nullCount, validBits, validBitsOffset)
   445  	case []parquet.FixedLenByteArray:
   446  		return r.GetBatchWithDictSpacedFixedLenByteArray(dc, vals, nullCount, validBits, validBitsOffset)
   447  	case []parquet.Int96:
   448  		return r.GetBatchWithDictSpacedInt96(dc, vals, nullCount, validBits, validBitsOffset)
   449  	default:
   450  		return 0, xerrors.New("parquet/rle: GetBatchWithDictSpaced invalid type")
   451  	}
   452  }
   453  
   454  type RleEncoder struct {
   455  	w *BitWriter
   456  
   457  	buffer                 []uint64
   458  	BitWidth               int
   459  	curVal                 uint64
   460  	repCount               int32
   461  	litCount               int32
   462  	literalIndicatorOffset int
   463  
   464  	indicatorBuffer [1]byte
   465  }
   466  
   467  func NewRleEncoder(w WriterAtWithLen, width int) *RleEncoder {
   468  	return &RleEncoder{
   469  		w:                      NewBitWriter(w),
   470  		buffer:                 make([]uint64, 0, 8),
   471  		BitWidth:               width,
   472  		literalIndicatorOffset: -1,
   473  	}
   474  }
   475  
   476  func (r *RleEncoder) Flush() int {
   477  	if r.litCount > 0 || r.repCount > 0 || len(r.buffer) > 0 {
   478  		allRep := r.litCount == 0 && (r.repCount == int32(len(r.buffer)) || len(r.buffer) == 0)
   479  		if r.repCount > 0 && allRep {
   480  			r.flushRepeated()
   481  		} else {
   482  			// buffer the last grou pof literals to 8 by padding with 0s
   483  			for len(r.buffer) != 0 && len(r.buffer) < 8 {
   484  				r.buffer = append(r.buffer, 0)
   485  			}
   486  
   487  			r.litCount += int32(len(r.buffer))
   488  			r.flushLiteral(true)
   489  			r.repCount = 0
   490  		}
   491  	}
   492  	r.w.Flush(false)
   493  	return r.w.Written()
   494  }
   495  
   496  func (r *RleEncoder) flushBuffered(done bool) (err error) {
   497  	if r.repCount >= 8 {
   498  		// clear buffered values. they are part of the repeated run now and we
   499  		// don't want to flush them as literals
   500  		r.buffer = r.buffer[:0]
   501  		if r.litCount != 0 {
   502  			// there was  current literal run. all values flushed but need to update the indicator
   503  			err = r.flushLiteral(true)
   504  		}
   505  		return
   506  	}
   507  
   508  	r.litCount += int32(len(r.buffer))
   509  	ngroups := r.litCount / 8
   510  	if ngroups+1 >= (1 << 6) {
   511  		// we need to start a new literal run because the indicator byte we've reserved
   512  		// cannot store any more values
   513  		err = r.flushLiteral(true)
   514  	} else {
   515  		err = r.flushLiteral(done)
   516  	}
   517  	r.repCount = 0
   518  	return
   519  }
   520  
   521  func (r *RleEncoder) flushLiteral(updateIndicator bool) (err error) {
   522  	if r.literalIndicatorOffset == -1 {
   523  		r.literalIndicatorOffset, err = r.w.SkipBytes(1)
   524  		if err != nil {
   525  			return
   526  		}
   527  	}
   528  
   529  	for _, val := range r.buffer {
   530  		if err = r.w.WriteValue(val, uint(r.BitWidth)); err != nil {
   531  			return
   532  		}
   533  	}
   534  	r.buffer = r.buffer[:0]
   535  
   536  	if updateIndicator {
   537  		// at this point we need to write the indicator byte for the literal run.
   538  		// we only reserve one byte, to allow for streaming writes of literal values.
   539  		// the logic makes sure we flush literal runs often enough to not overrun the 1 byte.
   540  		ngroups := r.litCount / 8
   541  		r.indicatorBuffer[0] = byte((ngroups << 1) | 1)
   542  		_, err = r.w.WriteAt(r.indicatorBuffer[:], int64(r.literalIndicatorOffset))
   543  		r.literalIndicatorOffset = -1
   544  		r.litCount = 0
   545  	}
   546  	return
   547  }
   548  
   549  func (r *RleEncoder) flushRepeated() (ret bool) {
   550  	indicator := r.repCount << 1
   551  
   552  	ret = r.w.WriteVlqInt(uint64(indicator))
   553  	ret = ret && r.w.WriteAligned(r.curVal, int(bitutil.BytesForBits(int64(r.BitWidth))))
   554  
   555  	r.repCount = 0
   556  	r.buffer = r.buffer[:0]
   557  	return
   558  }
   559  
   560  // Put buffers input values 8 at a time. after seeing all 8 values,
   561  // it decides whether they should be encoded as a literal or repeated run.
   562  func (r *RleEncoder) Put(value uint64) error {
   563  	if r.curVal == value {
   564  		r.repCount++
   565  		if r.repCount > 8 {
   566  			// this is just a continuation of the current run, no need to buffer the values
   567  			// NOTE this is the fast path for long repeated runs
   568  			return nil
   569  		}
   570  	} else {
   571  		if r.repCount >= 8 {
   572  			if !r.flushRepeated() {
   573  				return xerrors.New("failed to flush repeated value")
   574  			}
   575  		}
   576  		r.repCount = 1
   577  		r.curVal = value
   578  	}
   579  
   580  	r.buffer = append(r.buffer, value)
   581  	if len(r.buffer) == 8 {
   582  		return r.flushBuffered(false)
   583  	}
   584  	return nil
   585  }
   586  
   587  func (r *RleEncoder) Clear() {
   588  	r.curVal = 0
   589  	r.repCount = 0
   590  	r.buffer = r.buffer[:0]
   591  	r.litCount = 0
   592  	r.literalIndicatorOffset = -1
   593  	r.w.Clear()
   594  }