github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/utils/rle.go

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
     17  // Package utils contains various internal utilities for the parquet library
     18  // that aren't intended to be exposed to external consumers, such as interfaces,
     19  // bitmap readers/writers, and the RLE encoder/decoder.
    20  package utils
    21  
    22  import (
    23  	"bytes"
    24  	"encoding/binary"
    25  	"io"
    26  	"math"
    27  
    28  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    29  	"github.com/apache/arrow/go/v7/parquet"
    30  	"golang.org/x/xerrors"
    31  )
    32  
    33  //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata typed_rle_dict.gen.go.tmpl
    34  
    35  const (
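	// MaxValuesPerLiteralRun is the maximum number of values in a single literal
	// run: the single reserved indicator byte can describe at most (1 << 6)
	// groups of 8 bit-packed values.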
    36  	MaxValuesPerLiteralRun = (1 << 6) * 8
    37  )
    38  
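// MinBufferSize returns the minimum buffer size needed to hold the largest
// possible single run at the given bit width: either a maximum-length literal
// run (one indicator byte plus MaxValuesPerLiteralRun bit-packed values) or a
// single repeated run (a maximum-length varint indicator plus one value padded
// to a byte boundary).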
    39  func MinBufferSize(bitWidth int) int {
    40  	maxLiteralRunSize := 1 + bitutil.BytesForBits(int64(MaxValuesPerLiteralRun*bitWidth))
    41  	maxRepeatedRunSize := binary.MaxVarintLen32 + bitutil.BytesForBits(int64(bitWidth))
    42  	return int(Max(maxLiteralRunSize, maxRepeatedRunSize))
    43  }
    44  
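// MaxBufferSize returns an upper bound on the number of bytes needed to encode
// numValues values of the given bit width, taking the larger of the all-literal
// and all-repeated worst cases. For example, MaxBufferSize(1, 1024) is 256:
// 128 groups of 8 values at 1 byte of packed values plus 1 byte of overhead
// each, which equals the repeated-run bound of 128 runs at 2 bytes each.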
    45  func MaxBufferSize(width, numValues int) int {
    46  	bytesPerRun := width
    47  	numRuns := int(bitutil.BytesForBits(int64(numValues)))
    48  	literalMaxSize := numRuns + (numRuns * bytesPerRun)
    49  
    50  	minRepeatedRunSize := 1 + int(bitutil.BytesForBits(int64(width)))
    51  	repeatedMaxSize := int(bitutil.BytesForBits(int64(numValues))) * minRepeatedRunSize
    52  
    53  	return MaxInt(literalMaxSize, repeatedMaxSize)
    54  }
    55  
     56  // Utility types to do run length encoding (RLE) for fixed bit width values.  If runs
    57  // are sufficiently long, RLE is used, otherwise, the values are just bit-packed
    58  // (literal encoding).
    59  // For both types of runs, there is a byte-aligned indicator which encodes the length
    60  // of the run and the type of the run.
    61  // This encoding has the benefit that when there aren't any long enough runs, values
    62  // are always decoded at fixed (can be precomputed) bit offsets OR both the value and
    63  // the run length are byte aligned. This allows for very efficient decoding
    64  // implementations.
    65  // The encoding is:
    66  //    encoded-block := run*
    67  //    run := literal-run | repeated-run
    68  //    literal-run := literal-indicator < literal bytes >
    69  //    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
    70  //    literal-indicator := varint_encode( number_of_groups << 1 | 1)
    71  //    repeated-indicator := varint_encode( number_of_repetitions << 1 )
    72  //
    73  // Each run is preceded by a varint. The varint's least significant bit is
    74  // used to indicate whether the run is a literal run or a repeated run. The rest
    75  // of the varint is used to determine the length of the run (eg how many times the
    76  // value repeats).
    77  //
    78  // In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
    79  // in groups of 8), so that no matter the bit-width of the value, the sequence will end
    80  // on a byte boundary without padding.
    81  // Given that we know it is a multiple of 8, we store the number of 8-groups rather than
    82  // the actual number of encoded ints. (This means that the total number of encoded values
     83  // can not be determined from the encoded data, since the total number of values is
     84  // not necessarily a multiple of 8). For the last group of literal runs, we pad
    85  // the group to 8 with zeros. This allows for 8 at a time decoding on the read side
    86  // without the need for additional checks.
    87  //
     88  // There is a break-even point at which run length encoding becomes more storage
     89  // efficient than literal encoding.  For 1 bit-width values, that point is 8 values:
     90  // at that length the repeated encoding and the literal encoding each require 2
     91  // bytes.  This break-even point can always be computed from the bit-width.
    92  //
    93  // Examples with bit-width 1 (eg encoding booleans):
    94  // ----------------------------------------
    95  // 100 1s followed by 100 0s:
    96  // <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
     97  //  - (total 6 bytes; each varint(200) takes 2 bytes)
    98  //
    99  // alternating 1s and 0s (200 total):
   100  // 200 ints = 25 groups of 8
   101  // <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
   102  // (total 26 bytes, 1 byte overhead)
   103  //
   104  
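// exampleDecodeRuns is a minimal illustrative sketch of the two run types
// described above (the function and the hand-built byte sequences are for
// illustration only and are not part of this package's API). At bit width 1,
// both inputs decode to eight 1s: the first is a repeated run (indicator
// varint(8<<1) = 0x10 followed by the value padded to one byte), the second a
// literal run of one 8-value group (indicator varint((1<<1)|1) = 0x03 followed
// by one byte of bit-packed values with every bit set).
func exampleDecodeRuns() (repeated, literal []uint64) {
	repeatedRun := []byte{0x10, 0x01} // eight 1s encoded as a repeated run
	literalRun := []byte{0x03, 0xFF}  // eight 1s encoded as a literal run

	repeated = make([]uint64, 8)
	dec := NewRleDecoder(bytes.NewReader(repeatedRun), 1)
	dec.GetBatch(repeated) // repeated == [1 1 1 1 1 1 1 1]

	literal = make([]uint64, 8)
	dec.Reset(bytes.NewReader(literalRun), 1)
	dec.GetBatch(literal) // literal == [1 1 1 1 1 1 1 1]
	return repeated, literal
}
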
   105  type RleDecoder struct {
   106  	r *BitReader
   107  
   108  	bitWidth int
   109  	curVal   uint64
   110  	repCount int32
   111  	litCount int32
   112  }
   113  
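// NewRleDecoder creates an RleDecoder that reads RLE/bit-packed runs of the
// given bit width from data.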
   114  func NewRleDecoder(data *bytes.Reader, width int) *RleDecoder {
   115  	return &RleDecoder{r: NewBitReader(data), bitWidth: width}
   116  }
   117  
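// Reset repoints the decoder at data with the given bit width, discarding any
// in-progress run state.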
   118  func (r *RleDecoder) Reset(data *bytes.Reader, width int) {
   119  	r.bitWidth = width
   120  	r.curVal = 0
   121  	r.repCount = 0
   122  	r.litCount = 0
   123  	r.r.Reset(data)
   124  }
   125  
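// Next reads the indicator for the next run and sets up either a repeated run
// (curVal/repCount) or a literal run (litCount). It returns false on EOF or an
// invalid run header.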
   126  func (r *RleDecoder) Next() bool {
   127  	indicator, ok := r.r.GetVlqInt()
   128  	if !ok {
   129  		return false
   130  	}
   131  
   132  	literal := (indicator & 1) != 0
   133  	count := uint32(indicator >> 1)
   134  	if literal {
   135  		if count == 0 || count > uint32(math.MaxInt32/8) {
   136  			return false
   137  		}
   138  		r.litCount = int32(count) * 8
   139  	} else {
   140  		if count == 0 || count > uint32(math.MaxInt32) {
   141  			return false
   142  		}
   143  		r.repCount = int32(count)
   144  
   145  		nbytes := int(bitutil.BytesForBits(int64(r.bitWidth)))
   146  		switch {
   147  		case nbytes > 4:
   148  			if !r.r.GetAligned(nbytes, &r.curVal) {
   149  				return false
   150  			}
   151  		case nbytes > 2:
   152  			var val uint32
   153  			if !r.r.GetAligned(nbytes, &val) {
   154  				return false
   155  			}
   156  			r.curVal = uint64(val)
   157  		case nbytes > 1:
   158  			var val uint16
   159  			if !r.r.GetAligned(nbytes, &val) {
   160  				return false
   161  			}
   162  			r.curVal = uint64(val)
   163  		default:
   164  			var val uint8
   165  			if !r.r.GetAligned(nbytes, &val) {
   166  				return false
   167  			}
   168  			r.curVal = uint64(val)
   169  		}
   170  	}
   171  	return true
   172  }
   173  
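// GetValue decodes and returns a single value, returning false if no value
// could be read.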
   174  func (r *RleDecoder) GetValue() (uint64, bool) {
   175  	vals := make([]uint64, 1)
   176  	n := r.GetBatch(vals)
   177  	return vals[0], n == 1
   178  }
   179  
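// GetBatch decodes up to len(values) values into values and returns the number
// of values actually decoded.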
   180  func (r *RleDecoder) GetBatch(values []uint64) int {
   181  	read := 0
   182  	size := len(values)
   183  
   184  	out := values
   185  	for read < size {
   186  		remain := size - read
   187  
   188  		if r.repCount > 0 {
   189  			repbatch := int(math.Min(float64(remain), float64(r.repCount)))
   190  			for i := 0; i < repbatch; i++ {
   191  				out[i] = r.curVal
   192  			}
   193  
   194  			r.repCount -= int32(repbatch)
   195  			read += repbatch
   196  			out = out[repbatch:]
   197  		} else if r.litCount > 0 {
   198  			litbatch := int(math.Min(float64(remain), float64(r.litCount)))
   199  			n, _ := r.r.GetBatch(uint(r.bitWidth), out[:litbatch])
   200  			if n != litbatch {
   201  				return read
   202  			}
   203  
   204  			r.litCount -= int32(litbatch)
   205  			read += litbatch
   206  			out = out[litbatch:]
   207  		} else {
   208  			if !r.Next() {
   209  				return read
   210  			}
   211  		}
   212  	}
   213  	return read
   214  }
   215  
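// GetBatchSpaced decodes values into vals, writing zeros into positions marked
// as null by validBits starting at validBitsOffset; nullcount is the number of
// nulls expected in the batch. It returns the total number of slots processed,
// nulls included.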
   216  func (r *RleDecoder) GetBatchSpaced(vals []uint64, nullcount int, validBits []byte, validBitsOffset int64) (int, error) {
   217  	if nullcount == 0 {
   218  		return r.GetBatch(vals), nil
   219  	}
   220  
   221  	converter := plainConverter{}
   222  	blockCounter := NewBitBlockCounter(validBits, validBitsOffset, int64(len(vals)))
   223  
   224  	var (
   225  		totalProcessed int
   226  		processed      int
   227  		block          BitBlockCount
   228  		err            error
   229  	)
   230  
   231  	for {
   232  		block = blockCounter.NextFourWords()
   233  		if block.Len == 0 {
   234  			break
   235  		}
   236  
   237  		if block.AllSet() {
   238  			processed = r.GetBatch(vals[:block.Len])
   239  		} else if block.NoneSet() {
   240  			converter.FillZero(vals[:block.Len])
   241  			processed = int(block.Len)
   242  		} else {
   243  			processed, err = r.getspaced(converter, vals, int(block.Len), int(block.Len-block.Popcnt), validBits, validBitsOffset)
   244  			if err != nil {
   245  				return totalProcessed, err
   246  			}
   247  		}
   248  
   249  		totalProcessed += processed
   250  		vals = vals[int(block.Len):]
   251  		validBitsOffset += int64(block.Len)
   252  
   253  		if processed != int(block.Len) {
   254  			break
   255  		}
   256  	}
   257  	return totalProcessed, nil
   258  }
   259  
   260  func (r *RleDecoder) getspaced(dc DictionaryConverter, vals interface{}, batchSize, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   261  	switch vals := vals.(type) {
   262  	case []int32:
   263  		return r.getspacedInt32(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   264  	case []int64:
   265  		return r.getspacedInt64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   266  	case []float32:
   267  		return r.getspacedFloat32(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   268  	case []float64:
   269  		return r.getspacedFloat64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   270  	case []parquet.ByteArray:
   271  		return r.getspacedByteArray(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   272  	case []parquet.FixedLenByteArray:
   273  		return r.getspacedFixedLenByteArray(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   274  	case []parquet.Int96:
   275  		return r.getspacedInt96(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   276  	case []uint64:
   277  		return r.getspacedUint64(dc, vals, batchSize, nullCount, validBits, validBitsOffset)
   278  	default:
   279  		return 0, xerrors.New("parquet/rle: getspaced invalid type")
   280  	}
   281  }
   282  
   283  func (r *RleDecoder) getspacedUint64(dc DictionaryConverter, vals []uint64, batchSize, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   284  	if nullCount == batchSize {
   285  		dc.FillZero(vals[:batchSize])
   286  		return batchSize, nil
   287  	}
   288  
   289  	read := 0
   290  	remain := batchSize - nullCount
   291  
   292  	const bufferSize = 1024
   293  	var indexbuffer [bufferSize]IndexType
   294  
   295  	// assume no bits to start
   296  	bitReader := NewBitRunReader(validBits, validBitsOffset, int64(batchSize))
   297  	validRun := bitReader.NextRun()
   298  	for read < batchSize {
   299  		if validRun.Len == 0 {
   300  			validRun = bitReader.NextRun()
   301  		}
   302  
   303  		if !validRun.Set {
   304  			dc.FillZero(vals[:int(validRun.Len)])
   305  			vals = vals[int(validRun.Len):]
   306  			read += int(validRun.Len)
   307  			validRun.Len = 0
   308  			continue
   309  		}
   310  
   311  		if r.repCount == 0 && r.litCount == 0 {
   312  			if !r.Next() {
   313  				return read, nil
   314  			}
   315  		}
   316  
   317  		var batch int
   318  		switch {
   319  		case r.repCount > 0:
   320  			batch, remain, validRun = r.consumeRepeatCounts(read, batchSize, remain, validRun, bitReader)
   321  			current := IndexType(r.curVal)
   322  			if !dc.IsValid(current) {
   323  				return read, nil
   324  			}
   325  			dc.Fill(vals[:batch], current)
   326  		case r.litCount > 0:
   327  			var (
   328  				litread int
   329  				skipped int
   330  				err     error
   331  			)
   332  			litread, skipped, validRun, err = r.consumeLiteralsUint64(dc, vals, remain, indexbuffer[:], validRun, bitReader)
   333  			if err != nil {
   334  				return read, err
   335  			}
   336  			batch = litread + skipped
   337  			remain -= litread
   338  		}
   339  
   340  		vals = vals[batch:]
   341  		read += batch
   342  	}
   343  	return read, nil
   344  }
   345  
   346  func (r *RleDecoder) consumeRepeatCounts(read, batchSize, remain int, run BitRun, bitRdr BitRunReader) (int, int, BitRun) {
    347  	// Consume as much of the repeat count as possible, incrementing repeatBatch
    348  	// to be the total of nulls + values consumed. We only need the total count
    349  	// because we can fill in the same value for nulls and non-nulls; this proves
    350  	// to be a big efficiency win.
   351  	repeatBatch := 0
   352  	for r.repCount > 0 && (read+repeatBatch) < batchSize {
   353  		if run.Set {
   354  			updateSize := int(Min(run.Len, int64(r.repCount)))
   355  			r.repCount -= int32(updateSize)
   356  			repeatBatch += updateSize
   357  			run.Len -= int64(updateSize)
   358  			remain -= updateSize
   359  		} else {
   360  			repeatBatch += int(run.Len)
   361  			run.Len = 0
   362  		}
   363  
   364  		if run.Len == 0 {
   365  			run = bitRdr.NextRun()
   366  		}
   367  	}
   368  	return repeatBatch, remain, run
   369  }
   370  
   371  func (r *RleDecoder) consumeLiteralsUint64(dc DictionaryConverter, vals []uint64, remain int, buf []IndexType, run BitRun, bitRdr BitRunReader) (int, int, BitRun, error) {
   372  	batch := MinInt(MinInt(remain, int(r.litCount)), len(buf))
   373  	buf = buf[:batch]
   374  
   375  	n, _ := r.r.GetBatchIndex(uint(r.bitWidth), buf)
   376  	if n != batch {
   377  		return 0, 0, run, xerrors.New("was not able to retrieve correct number of indexes")
   378  	}
   379  
   380  	if !dc.IsValid(buf...) {
   381  		return 0, 0, run, xerrors.New("invalid index values found for dictionary converter")
   382  	}
   383  
   384  	var (
   385  		read    int
   386  		skipped int
   387  	)
   388  	for read < batch {
   389  		if run.Set {
   390  			updateSize := MinInt(batch-read, int(run.Len))
   391  			if err := dc.Copy(vals, buf[read:read+updateSize]); err != nil {
   392  				return 0, 0, run, err
   393  			}
   394  			read += updateSize
   395  			vals = vals[updateSize:]
   396  			run.Len -= int64(updateSize)
   397  		} else {
   398  			dc.FillZero(vals[:int(run.Len)])
   399  			vals = vals[int(run.Len):]
   400  			skipped += int(run.Len)
   401  			run.Len = 0
   402  		}
   403  		if run.Len == 0 {
   404  			run = bitRdr.NextRun()
   405  		}
   406  	}
   407  	r.litCount -= int32(batch)
   408  	return read, skipped, run, nil
   409  }
   410  
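// GetBatchWithDict decodes dictionary indices from the stream and uses dc to
// expand them into vals, which must be a slice of one of the supported
// physical types. It returns the number of values decoded.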
   411  func (r *RleDecoder) GetBatchWithDict(dc DictionaryConverter, vals interface{}) (int, error) {
   412  	switch vals := vals.(type) {
   413  	case []int32:
   414  		return r.GetBatchWithDictInt32(dc, vals)
   415  	case []int64:
   416  		return r.GetBatchWithDictInt64(dc, vals)
   417  	case []float32:
   418  		return r.GetBatchWithDictFloat32(dc, vals)
   419  	case []float64:
   420  		return r.GetBatchWithDictFloat64(dc, vals)
   421  	case []parquet.ByteArray:
   422  		return r.GetBatchWithDictByteArray(dc, vals)
   423  	case []parquet.FixedLenByteArray:
   424  		return r.GetBatchWithDictFixedLenByteArray(dc, vals)
   425  	case []parquet.Int96:
   426  		return r.GetBatchWithDictInt96(dc, vals)
   427  	default:
   428  		return 0, xerrors.New("parquet/rle: GetBatchWithDict invalid type")
   429  	}
   430  }
   431  
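// GetBatchWithDictSpaced is like GetBatchWithDict but also takes a validity
// bitmap, filling the null positions indicated by validBits/validBitsOffset
// with zero values.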
   432  func (r *RleDecoder) GetBatchWithDictSpaced(dc DictionaryConverter, vals interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   433  	switch vals := vals.(type) {
   434  	case []int32:
   435  		return r.GetBatchWithDictSpacedInt32(dc, vals, nullCount, validBits, validBitsOffset)
   436  	case []int64:
   437  		return r.GetBatchWithDictSpacedInt64(dc, vals, nullCount, validBits, validBitsOffset)
   438  	case []float32:
   439  		return r.GetBatchWithDictSpacedFloat32(dc, vals, nullCount, validBits, validBitsOffset)
   440  	case []float64:
   441  		return r.GetBatchWithDictSpacedFloat64(dc, vals, nullCount, validBits, validBitsOffset)
   442  	case []parquet.ByteArray:
   443  		return r.GetBatchWithDictSpacedByteArray(dc, vals, nullCount, validBits, validBitsOffset)
   444  	case []parquet.FixedLenByteArray:
   445  		return r.GetBatchWithDictSpacedFixedLenByteArray(dc, vals, nullCount, validBits, validBitsOffset)
   446  	case []parquet.Int96:
   447  		return r.GetBatchWithDictSpacedInt96(dc, vals, nullCount, validBits, validBitsOffset)
   448  	default:
   449  		return 0, xerrors.New("parquet/rle: GetBatchWithDictSpaced invalid type")
   450  	}
   451  }
   452  
   453  type RleEncoder struct {
   454  	w *BitWriter
   455  
   456  	buffer                 []uint64
   457  	BitWidth               int
   458  	curVal                 uint64
   459  	repCount               int32
   460  	litCount               int32
   461  	literalIndicatorOffset int
   462  
   463  	indicatorBuffer [1]byte
   464  }
   465  
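// NewRleEncoder creates an RleEncoder that writes RLE/bit-packed runs of the
// given bit width to w.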
   466  func NewRleEncoder(w io.WriterAt, width int) *RleEncoder {
   467  	return &RleEncoder{
   468  		w:                      NewBitWriter(w),
   469  		buffer:                 make([]uint64, 0, 8),
   470  		BitWidth:               width,
   471  		literalIndicatorOffset: -1,
   472  	}
   473  }
   474  
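// Flush writes out any remaining buffered values or runs and returns the total
// number of bytes written to the underlying writer.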
   475  func (r *RleEncoder) Flush() int {
   476  	if r.litCount > 0 || r.repCount > 0 || len(r.buffer) > 0 {
   477  		allRep := r.litCount == 0 && (r.repCount == int32(len(r.buffer)) || len(r.buffer) == 0)
   478  		if r.repCount > 0 && allRep {
   479  			r.flushRepeated()
   480  		} else {
    481  			// pad the last group of literals out to 8 values by appending 0s
   482  			for len(r.buffer) != 0 && len(r.buffer) < 8 {
   483  				r.buffer = append(r.buffer, 0)
   484  			}
   485  
   486  			r.litCount += int32(len(r.buffer))
   487  			r.flushLiteral(true)
   488  			r.repCount = 0
   489  		}
   490  	}
   491  	r.w.Flush(false)
   492  	return r.w.Written()
   493  }
   494  
   495  func (r *RleEncoder) flushBuffered(done bool) (err error) {
   496  	if r.repCount >= 8 {
   497  		// clear buffered values. they are part of the repeated run now and we
   498  		// don't want to flush them as literals
   499  		r.buffer = r.buffer[:0]
   500  		if r.litCount != 0 {
    501  			// there was a current literal run; all values are flushed but we still need to update the indicator
   502  			err = r.flushLiteral(true)
   503  		}
   504  		return
   505  	}
   506  
   507  	r.litCount += int32(len(r.buffer))
   508  	ngroups := r.litCount / 8
   509  	if ngroups+1 >= (1 << 6) {
   510  		// we need to start a new literal run because the indicator byte we've reserved
   511  		// cannot store any more values
   512  		err = r.flushLiteral(true)
   513  	} else {
   514  		err = r.flushLiteral(done)
   515  	}
   516  	r.repCount = 0
   517  	return
   518  }
   519  
   520  func (r *RleEncoder) flushLiteral(updateIndicator bool) (err error) {
   521  	if r.literalIndicatorOffset == -1 {
   522  		r.literalIndicatorOffset = r.w.ReserveBytes(1)
   523  	}
   524  
   525  	for _, val := range r.buffer {
   526  		if err = r.w.WriteValue(val, uint(r.BitWidth)); err != nil {
   527  			return
   528  		}
   529  	}
   530  	r.buffer = r.buffer[:0]
   531  
   532  	if updateIndicator {
   533  		// at this point we need to write the indicator byte for the literal run.
   534  		// we only reserve one byte, to allow for streaming writes of literal values.
   535  		// the logic makes sure we flush literal runs often enough to not overrun the 1 byte.
   536  		ngroups := r.litCount / 8
   537  		r.indicatorBuffer[0] = byte((ngroups << 1) | 1)
   538  		_, err = r.w.WriteAt(r.indicatorBuffer[:], int64(r.literalIndicatorOffset))
   539  		r.literalIndicatorOffset = -1
   540  		r.litCount = 0
   541  	}
   542  	return
   543  }
   544  
   545  func (r *RleEncoder) flushRepeated() (ret bool) {
   546  	indicator := r.repCount << 1
   547  
   548  	ret = r.w.WriteVlqInt(uint64(indicator))
   549  	ret = ret && r.w.WriteAligned(r.curVal, int(bitutil.BytesForBits(int64(r.BitWidth))))
   550  
   551  	r.repCount = 0
   552  	r.buffer = r.buffer[:0]
   553  	return
   554  }
   555  
    556  // Put buffers input values 8 at a time. After seeing all 8 values,
    557  // it decides whether they should be encoded as a literal or a repeated run.
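//
// As a rough worked example: with BitWidth 1, calling Put(1) ten times leaves
// repCount == 10 and an empty buffer (flushBuffered drops the first eight
// identical values once the repeat threshold is reached), so a following Flush
// emits a single 2-byte repeated run: the indicator varint(10 << 1) = 0x14
// followed by the value padded to one byte, 0x01.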
   558  func (r *RleEncoder) Put(value uint64) error {
   559  	if r.curVal == value {
   560  		r.repCount++
   561  		if r.repCount > 8 {
   562  			// this is just a continuation of the current run, no need to buffer the values
   563  			// NOTE this is the fast path for long repeated runs
   564  			return nil
   565  		}
   566  	} else {
   567  		if r.repCount >= 8 {
   568  			if !r.flushRepeated() {
   569  				return xerrors.New("failed to flush repeated value")
   570  			}
   571  		}
   572  		r.repCount = 1
   573  		r.curVal = value
   574  	}
   575  
   576  	r.buffer = append(r.buffer, value)
   577  	if len(r.buffer) == 8 {
   578  		return r.flushBuffered(false)
   579  	}
   580  	return nil
   581  }
   582  
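// Clear resets the encoder state, including the underlying bit writer, so the
// encoder can be reused.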
   583  func (r *RleEncoder) Clear() {
   584  	r.curVal = 0
   585  	r.repCount = 0
   586  	r.buffer = r.buffer[:0]
   587  	r.litCount = 0
   588  	r.literalIndicatorOffset = -1
   589  	r.w.Clear()
   590  }
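
// roundTripSketch is a hypothetical illustration of pairing RleEncoder and
// RleDecoder (the function and the sliceWriterAt helper below are assumptions
// for this sketch, not part of the package API): it encodes count copies of
// value at the given bit width into a buffer sized via MaxBufferSize and
// MinBufferSize, then decodes them back.
func roundTripSketch(value uint64, count, width int) []uint64 {
	buf := make([]byte, MaxBufferSize(width, count)+MinBufferSize(width))
	enc := NewRleEncoder(&sliceWriterAt{buf: buf}, width)
	for i := 0; i < count; i++ {
		if err := enc.Put(value); err != nil {
			return nil
		}
	}
	written := enc.Flush()

	out := make([]uint64, count)
	dec := NewRleDecoder(bytes.NewReader(buf[:written]), width)
	dec.GetBatch(out)
	return out
}

// sliceWriterAt is a minimal io.WriterAt over a fixed byte slice, used only by
// roundTripSketch above since NewRleEncoder requires an io.WriterAt.
type sliceWriterAt struct{ buf []byte }

func (s *sliceWriterAt) WriteAt(p []byte, off int64) (int, error) {
	if off < 0 || off > int64(len(s.buf)) {
		return 0, io.ErrShortWrite
	}
	n := copy(s.buf[off:], p)
	if n < len(p) {
		return n, io.ErrShortWrite
	}
	return n, nil
}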