github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/decoder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "bytes" 21 "reflect" 22 23 "github.com/apache/arrow/go/v10/arrow/memory" 24 "github.com/apache/arrow/go/v10/internal/bitutils" 25 "github.com/apache/arrow/go/v10/parquet" 26 "github.com/apache/arrow/go/v10/parquet/internal/debug" 27 format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet" 28 "github.com/apache/arrow/go/v10/parquet/internal/utils" 29 "github.com/apache/arrow/go/v10/parquet/schema" 30 "golang.org/x/xerrors" 31 ) 32 33 // DecoderTraits provides an interface for more easily interacting with types 34 // to generate decoders for specific types. 35 type DecoderTraits interface { 36 Decoder(e parquet.Encoding, descr *schema.Column, useDict bool, mem memory.Allocator) TypedDecoder 37 BytesRequired(int) int 38 } 39 40 // NewDecoder constructs a decoder for a given type and encoding 41 func NewDecoder(t parquet.Type, e parquet.Encoding, descr *schema.Column, mem memory.Allocator) TypedDecoder { 42 traits := getDecodingTraits(t) 43 if traits == nil { 44 return nil 45 } 46 47 return traits.Decoder(e, descr, false /* use dictionary */, mem) 48 } 49 50 // NewDictDecoder is like NewDecoder but for dictionary encodings, panics if type is bool. 51 // 52 // if mem is nil, memory.DefaultAllocator will be used 53 func NewDictDecoder(t parquet.Type, descr *schema.Column, mem memory.Allocator) DictDecoder { 54 traits := getDecodingTraits(t) 55 if traits == nil { 56 return nil 57 } 58 59 if mem == nil { 60 mem = memory.DefaultAllocator 61 } 62 63 return traits.Decoder(parquet.Encodings.RLEDict, descr, true /* use dictionary */, mem).(DictDecoder) 64 } 65 66 type decoder struct { 67 descr *schema.Column 68 encoding format.Encoding 69 nvals int 70 data []byte 71 typeLen int 72 } 73 74 // newDecoderBase constructs the base decoding object that is embedded in the 75 // type specific decoders. 76 func newDecoderBase(e format.Encoding, descr *schema.Column) decoder { 77 typeLen := -1 78 if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray { 79 typeLen = int(descr.TypeLength()) 80 } 81 82 return decoder{ 83 descr: descr, 84 encoding: e, 85 typeLen: typeLen, 86 } 87 } 88 89 // SetData sets the data for decoding into the decoder to update the available 90 // data bytes and number of values available. 91 func (d *decoder) SetData(nvals int, data []byte) error { 92 d.data = data 93 d.nvals = nvals 94 return nil 95 } 96 97 // ValuesLeft returns the number of remaining values that can be decoded 98 func (d *decoder) ValuesLeft() int { return d.nvals } 99 100 // Encoding returns the encoding type used by this decoder to decode the bytes. 101 func (d *decoder) Encoding() parquet.Encoding { return parquet.Encoding(d.encoding) } 102 103 type dictDecoder struct { 104 decoder 105 mem memory.Allocator 106 dictValueDecoder utils.DictionaryConverter 107 idxDecoder *utils.RleDecoder 108 } 109 110 // SetDict sets a decoder that can be used to decode the dictionary that is 111 // used for this column in order to return the proper values. 112 func (d *dictDecoder) SetDict(dict TypedDecoder) { 113 if dict.Type() != d.descr.PhysicalType() { 114 panic("parquet: mismatch dictionary and column data type") 115 } 116 117 d.dictValueDecoder = NewDictConverter(dict) 118 } 119 120 // SetData sets the index value data into the decoder. 121 func (d *dictDecoder) SetData(nvals int, data []byte) error { 122 d.nvals = nvals 123 if len(data) == 0 { 124 // no data, bitwidth can safely be 0 125 d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data), 0 /* bitwidth */) 126 return nil 127 } 128 129 // grab the bit width from the first byte 130 width := uint8(data[0]) 131 if width >= 64 { 132 return xerrors.New("parquet: invalid or corrupted bit width") 133 } 134 135 // pass the rest of the data, minus that first byte, to the decoder 136 d.idxDecoder = utils.NewRleDecoder(bytes.NewReader(data[1:]), int(width)) 137 return nil 138 } 139 140 func (d *dictDecoder) decode(out interface{}) (int, error) { 141 return d.idxDecoder.GetBatchWithDict(d.dictValueDecoder, out) 142 } 143 144 func (d *dictDecoder) decodeSpaced(out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 145 return d.idxDecoder.GetBatchWithDictSpaced(d.dictValueDecoder, out, nullCount, validBits, validBitsOffset) 146 } 147 148 var empty = [1]byte{0} 149 150 // spacedExpand is used to take a slice of data and utilize the bitmap provided to fill in nulls into the 151 // correct slots according to the bitmap in order to produce a fully expanded result slice with nulls 152 // in the correct slots. 153 func spacedExpand(buffer interface{}, nullCount int, validBits []byte, validBitsOffset int64) int { 154 bufferRef := reflect.ValueOf(buffer) 155 if bufferRef.Kind() != reflect.Slice { 156 panic("invalid spacedexpand type, not slice") 157 } 158 159 var ( 160 numValues int = bufferRef.Len() 161 ) 162 163 idxDecode := int64(numValues - nullCount) 164 if idxDecode == 0 { // if there's nothing to decode there's nothing to do. 165 return numValues 166 } 167 168 // read the bitmap in reverse grabbing runs of valid bits where possible. 169 rdr := bitutils.NewReverseSetBitRunReader(validBits, validBitsOffset, int64(numValues)) 170 for { 171 run := rdr.NextRun() 172 if run.Length == 0 { 173 break 174 } 175 176 // copy data from the end of the slice to it's proper location in the slice after accounting for the nulls 177 // because we technically don't care what is in the null slots we don't actually have to clean 178 // up after ourselves because we're doing this in reverse to guarantee that we'll always simply 179 // overwrite any existing data with the correctly spaced data. Any data that happens to be left in the null 180 // slots is fine since it shouldn't matter and saves us work. 181 idxDecode -= run.Length 182 n := reflect.Copy(bufferRef.Slice(int(run.Pos), bufferRef.Len()), bufferRef.Slice(int(idxDecode), int(int64(idxDecode)+run.Length))) 183 debug.Assert(n == int(run.Length), "reflect.Copy copied incorrect number of elements in spacedExpand") 184 } 185 186 return numValues 187 }