github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/decoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"bytes"
    21  	"reflect"
    22  
    23  	"github.com/apache/arrow/go/v14/arrow/array"
    24  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    25  	"github.com/apache/arrow/go/v14/arrow/memory"
    26  	"github.com/apache/arrow/go/v14/internal/bitutils"
    27  	shared_utils "github.com/apache/arrow/go/v14/internal/utils"
    28  	"github.com/apache/arrow/go/v14/parquet"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/debug"
    30  	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v14/parquet/internal/utils"
    32  	"github.com/apache/arrow/go/v14/parquet/schema"
    33  	"golang.org/x/xerrors"
    34  )
    35  
    36  // DecoderTraits provides an interface for more easily interacting with types
    37  // to generate decoders for specific types.
    38  type DecoderTraits interface {
    39  	Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder
    40  	BytesRequired(int) int
    41  }
    42  
    43  // NewDecoder constructs a decoder for a given type and encoding
    44  func NewDecoder(t parquet.Type, e parquet.Encoding, descr *schema.Column, mem memory.Allocator) TypedDecoder {
    45  	traits := getDecodingTraits(t)
    46  	if traits == nil {
    47  		return nil
    48  	}
    49  
    50  	return traits.Decoder(e, descr, false /* use dictionary */, mem)
    51  }
    52  
    53  // NewDictDecoder is like NewDecoder but for dictionary encodings, panics if type is bool.
    54  //
    55  // if mem is nil, memory.DefaultAllocator will be used
    56  func NewDictDecoder(t parquet.Type, descr *schema.Column, mem memory.Allocator) DictDecoder {
    57  	traits := getDecodingTraits(t)
    58  	if traits == nil {
    59  		return nil
    60  	}
    61  
    62  	if mem == nil {
    63  		mem = memory.DefaultAllocator
    64  	}
    65  
    66  	return traits.Decoder(parquet.Encodings.RLEDict, descr, true /* use dictionary */, mem).(DictDecoder)
    67  }
    68  
    69  type decoder struct {
    70  	descr    *schema.Column
    71  	encoding format.Encoding
    72  	nvals    int
    73  	data     []byte
    74  	typeLen  int
    75  }
    76  
    77  // newDecoderBase constructs the base decoding object that is embedded in the
    78  // type specific decoders.
    79  func newDecoderBase(e format.Encoding, descr *schema.Column) decoder {
    80  	typeLen := -1
    81  	if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray {
    82  		typeLen = int(descr.TypeLength())
    83  	}
    84  
    85  	return decoder{
    86  		descr:    descr,
    87  		encoding: e,
    88  		typeLen:  typeLen,
    89  	}
    90  }
    91  
    92  // SetData sets the data for decoding into the decoder to update the available
    93  // data bytes and number of values available.
    94  func (d *decoder) SetData(nvals int, data []byte) error {
    95  	d.data = data
    96  	d.nvals = nvals
    97  	return nil
    98  }
    99  
   100  // ValuesLeft returns the number of remaining values that can be decoded
   101  func (d *decoder) ValuesLeft() int { return d.nvals }
   102  
   103  // Encoding returns the encoding type used by this decoder to decode the bytes.
   104  func (d *decoder) Encoding() parquet.Encoding { return parquet.Encoding(d.encoding) }
   105  
   106  type dictDecoder struct {
   107  	decoder
   108  	mem              memory.Allocator
   109  	dictValueDecoder utils.DictionaryConverter
   110  	idxDecoder       *utils.RleDecoder
   111  
   112  	idxScratchSpace []uint64
   113  }
   114  
   115  // SetDict sets a decoder that can be used to decode the dictionary that is
   116  // used for this column in order to return the proper values.
   117  func (d *dictDecoder) SetDict(dict TypedDecoder) {
   118  	if dict.Type() != d.descr.PhysicalType() {
   119  		panic("parquet: mismatch dictionary and column data type")
   120  	}
   121  
   122  	d.dictValueDecoder = NewDictConverter(dict)
   123  }
   124  
   125  // SetData sets the index value data into the decoder.
   126  func (d *dictDecoder) SetData(nvals int, data []byte) error {
   127  	d.nvals = nvals
   128  	if len(data) == 0 {
   129  		// no data, bitwidth can safely be 0
   130  		d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data), 0 /* bitwidth */)
   131  		return nil
   132  	}
   133  
   134  	// grab the bit width from the first byte
   135  	width := uint8(data[0])
   136  	if width >= 64 {
   137  		return xerrors.New("parquet: invalid or corrupted bit width")
   138  	}
   139  
   140  	// pass the rest of the data, minus that first byte, to the decoder
   141  	d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data[1:]), int(width))
   142  	return nil
   143  }
   144  
   145  func (d *dictDecoder) decode(out interface{}) (int, error) {
   146  	n, err := d.idxDecoder.GetBatchWithDict(d.dictValueDecoder, out)
   147  	d.nvals -= n
   148  	return n, err
   149  }
   150  
   151  func (d *dictDecoder) decodeSpaced(out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   152  	n, err := d.idxDecoder.GetBatchWithDictSpaced(d.dictValueDecoder, out, nullCount, validBits, validBitsOffset)
   153  	d.nvals -= n
   154  	return n, err
   155  }
   156  
   157  func (d *dictDecoder) DecodeIndices(numValues int, bldr array.Builder) (int, error) {
   158  	n := shared_utils.MinInt(numValues, d.nvals)
   159  	if cap(d.idxScratchSpace) < n {
   160  		d.idxScratchSpace = make([]uint64, n, bitutil.NextPowerOf2(n))
   161  	} else {
   162  		d.idxScratchSpace = d.idxScratchSpace[:n]
   163  	}
   164  
   165  	n = d.idxDecoder.GetBatch(d.idxScratchSpace)
   166  
   167  	toAppend := make([]int, n)
   168  	for i, v := range d.idxScratchSpace {
   169  		toAppend[i] = int(v)
   170  	}
   171  	bldr.(*array.BinaryDictionaryBuilder).AppendIndices(toAppend, nil)
   172  	d.nvals -= n
   173  	return n, nil
   174  }
   175  
   176  func (d *dictDecoder) DecodeIndicesSpaced(numValues, nullCount int, validBits []byte, offset int64, bldr array.Builder) (int, error) {
   177  	if cap(d.idxScratchSpace) < numValues {
   178  		d.idxScratchSpace = make([]uint64, numValues, bitutil.NextPowerOf2(numValues))
   179  	} else {
   180  		d.idxScratchSpace = d.idxScratchSpace[:numValues]
   181  	}
   182  
   183  	n, err := d.idxDecoder.GetBatchSpaced(d.idxScratchSpace, nullCount, validBits, offset)
   184  	if err != nil {
   185  		return n, err
   186  	}
   187  
   188  	valid := make([]bool, n)
   189  	bitutils.VisitBitBlocks(validBits, offset, int64(n),
   190  		func(pos int64) { valid[pos] = true }, func() {})
   191  
   192  	toAppend := make([]int, n)
   193  	for i, v := range d.idxScratchSpace {
   194  		toAppend[i] = int(v)
   195  	}
   196  	bldr.(*array.BinaryDictionaryBuilder).AppendIndices(toAppend, valid)
   197  	d.nvals -= n - nullCount
   198  	return n, nil
   199  }
   200  
   201  // spacedExpand is used to take a slice of data and utilize the bitmap provided to fill in nulls into the
   202  // correct slots according to the bitmap in order to produce a fully expanded result slice with nulls
   203  // in the correct slots.
   204  func spacedExpand(buffer interface{}, nullCount int, validBits []byte, validBitsOffset int64) int {
   205  	bufferRef := reflect.ValueOf(buffer)
   206  	if bufferRef.Kind() != reflect.Slice {
   207  		panic("invalid spacedexpand type, not slice")
   208  	}
   209  
   210  	var (
   211  		numValues int = bufferRef.Len()
   212  	)
   213  
   214  	idxDecode := int64(numValues - nullCount)
   215  	if idxDecode == 0 { // if there's nothing to decode there's nothing to do.
   216  		return numValues
   217  	}
   218  
   219  	// read the bitmap in reverse grabbing runs of valid bits where possible.
   220  	rdr := bitutils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(numValues))
   221  	for {
   222  		run := rdr.NextRun()
   223  		if run.Length == 0 {
   224  			break
   225  		}
   226  
   227  		// copy data from the end of the slice to it's proper location in the slice after accounting for the nulls
   228  		// because we technically don't care what is in the null slots we don't actually have to clean
   229  		// up after ourselves because we're doing this in reverse to guarantee that we'll always simply
   230  		// overwrite any existing data with the correctly spaced data. Any data that happens to be left in the null
   231  		// slots is fine since it shouldn't matter and saves us work.
   232  		idxDecode -= run.Length
   233  		n := reflect.Copy(bufferRef.Slice(int(run.Pos), bufferRef.Len()), bufferRef.Slice(int(idxDecode), int(int64(idxDecode)+run.Length)))
   234  		debug.Assert(n == int(run.Length), "reflect.Copy copied incorrect number of elements in spacedExpand")
   235  	}
   236  
   237  	return numValues
   238  }