github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/encoder.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"math/bits"
	"reflect"

	"github.com/apache/arrow/go/v10/arrow"
	"github.com/apache/arrow/go/v10/arrow/bitutil"
	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/internal/bitutils"
	"github.com/apache/arrow/go/v10/parquet"
	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v10/parquet/internal/utils"
	"github.com/apache/arrow/go/v10/parquet/schema"
)

//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata plain_encoder_types.gen.go.tmpl typed_encoder.gen.go.tmpl

// EncoderTraits is an interface for the different types to make it more
// convenient to construct encoders for specific types.
type EncoderTraits interface {
	Encoder(format.Encoding, bool, *schema.Column, memory.Allocator) TypedEncoder
}

// NewEncoder will return the appropriately typed encoder for the requested physical type
// and encoding.
//
// If mem is nil, memory.DefaultAllocator will be used.
func NewEncoder(t parquet.Type, e parquet.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder {
	traits := getEncodingTraits(t)
	if traits == nil {
		return nil
	}

	if mem == nil {
		mem = memory.DefaultAllocator
	}
	return traits.Encoder(format.Encoding(e), useDict, descr, mem)
}

type encoder struct {
	descr    *schema.Column
	encoding format.Encoding
	typeLen  int
	mem      memory.Allocator

	sink *PooledBufferWriter
}

// newEncoderBase constructs a new base encoder for embedding in the typed encoders,
// encapsulating the common functionality.
func newEncoderBase(e format.Encoding, descr *schema.Column, mem memory.Allocator) encoder {
	typelen := -1
	if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray {
		typelen = int(descr.TypeLength())
	}
	return encoder{
		descr:    descr,
		encoding: e,
		mem:      mem,
		typeLen:  typelen,
		sink:     NewPooledBufferWriter(1024),
	}
}
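// exampleNewEncoderUsage is an illustrative sketch, not part of the upstream file: it shows how a
// caller within this package might obtain a typed encoder from NewEncoder and drain it with
// FlushValues. The choice of the BOOLEAN physical type and plain encoding, and the assumption that
// a column descriptor is already available, are made purely for the example.
func exampleNewEncoderUsage(descr *schema.Column) (Buffer, error) {
	// a nil allocator falls back to memory.DefaultAllocator, as documented on NewEncoder
	enc := NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, false, descr, nil)
	if enc == nil {
		return nil, nil // no encoding traits exist for the requested physical type
	}
	// ownership of the returned Buffer passes to the caller, which should Release it when done
	return enc.FlushValues()
}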
// ReserveForWrite allocates n bytes so that the next n bytes written do not require new allocations.
func (e *encoder) ReserveForWrite(n int)           { e.sink.Reserve(n) }
func (e *encoder) EstimatedDataEncodedSize() int64 { return int64(e.sink.Len()) }
func (e *encoder) Encoding() parquet.Encoding      { return parquet.Encoding(e.encoding) }
func (e *encoder) Allocator() memory.Allocator     { return e.mem }
func (e *encoder) append(data []byte)              { e.sink.Write(data) }

// FlushValues flushes any unwritten data to the buffer and returns the finished encoded buffer of data.
// This also clears the encoder; ownership of the data belongs to whoever called FlushValues, and
// Release should be called on the resulting Buffer when done.
func (e *encoder) FlushValues() (Buffer, error) { return e.sink.Finish(), nil }

// Bytes returns the current bytes that have been written to the encoder's buffer but doesn't transfer ownership.
func (e *encoder) Bytes() []byte { return e.sink.Bytes() }

// Reset drops the data currently in the encoder and resets for new use.
func (e *encoder) Reset() { e.sink.Reset(0) }

type dictEncoder struct {
	encoder

	dictEncodedSize int
	idxBuffer       *memory.Buffer
	idxValues       []int32
	memo            MemoTable
}

// newDictEncoderBase constructs and returns a dictionary encoder for the appropriate type using the
// passed-in memo table for constructing the index.
func newDictEncoderBase(descr *schema.Column, memo MemoTable, mem memory.Allocator) dictEncoder {
	return dictEncoder{
		encoder:   newEncoderBase(format.Encoding_PLAIN_DICTIONARY, descr, mem),
		idxBuffer: memory.NewResizableBuffer(mem),
		memo:      memo,
	}
}

// Reset drops all the currently encoded index values and dictionary entries to allow
// restarting the encoding process.
func (d *dictEncoder) Reset() {
	d.encoder.Reset()
	d.dictEncodedSize = 0
	d.idxValues = d.idxValues[:0]
	d.idxBuffer.ResizeNoShrink(0)
	d.memo.Reset()
}

// addIndex appends the passed index to the index buffer, growing the underlying buffer
// to the next power of two when it is full.
func (d *dictEncoder) addIndex(idx int) {
	if len(d.idxValues) == cap(d.idxValues) {
		curLen := len(d.idxValues)
		d.idxBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(bitutil.NextPowerOf2(curLen + 1)))
		d.idxValues = arrow.Int32Traits.CastFromBytes(d.idxBuffer.Buf())[:curLen : d.idxBuffer.Len()/arrow.Int32SizeBytes]
	}
	d.idxValues = append(d.idxValues, int32(idx))
}

// FlushValues dumps all the currently buffered indexes that would become the data page to a buffer
// and returns it, or returns nil and any error encountered.
func (d *dictEncoder) FlushValues() (Buffer, error) {
	buf := bufferPool.Get().(*memory.Buffer)
	buf.Reserve(int(d.EstimatedDataEncodedSize()))
	size, err := d.WriteIndices(buf.Buf())
	if err != nil {
		poolBuffer{buf}.Release()
		return nil, err
	}
	buf.ResizeNoShrink(size)
	return poolBuffer{buf}, nil
}

// EstimatedDataEncodedSize returns the maximum number of bytes needed to store the RLE encoded indexes,
// not including the dictionary index in the computation.
func (d *dictEncoder) EstimatedDataEncodedSize() int64 {
	return 1 + int64(utils.MaxBufferSize(d.BitWidth(), len(d.idxValues))+utils.MinBufferSize(d.BitWidth()))
}
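// estimateRLESizeSketch is an illustrative helper, not part of the upstream file, restating the
// arithmetic used by dictEncoder.EstimatedDataEncodedSize above: one byte for the bit-width prefix
// that WriteIndices emits, plus the worst-case size of the RLE/bit-packed index data, plus the
// minimum buffer size the RLE encoder needs in order to flush.
func estimateRLESizeSketch(bitWidth, numValues int) int64 {
	return 1 + int64(utils.MaxBufferSize(bitWidth, numValues)+utils.MinBufferSize(bitWidth))
}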
// NumEntries returns the number of entries in the dictionary index for this encoder.
func (d *dictEncoder) NumEntries() int {
	return d.memo.Size()
}

// BitWidth returns the max bit width that would be necessary for encoding the index values currently
// in the dictionary based on the size of the dictionary index.
func (d *dictEncoder) BitWidth() int {
	switch d.NumEntries() {
	case 0:
		return 0
	case 1:
		return 1
	default:
		return bits.Len32(uint32(d.NumEntries() - 1))
	}
}

// WriteDict writes the dictionary index to the given byte slice.
func (d *dictEncoder) WriteDict(out []byte) {
	d.memo.WriteOut(out)
}

// WriteIndices performs run-length encoding on the indexes and then writes the encoded
// index value data to the provided byte slice, returning the number of bytes actually written.
// If any error is encountered, it will return -1 and the error.
func (d *dictEncoder) WriteIndices(out []byte) (int, error) {
	out[0] = byte(d.BitWidth())

	enc := utils.NewRleEncoder(utils.NewWriterAtBuffer(out[1:]), d.BitWidth())
	for _, idx := range d.idxValues {
		if err := enc.Put(uint64(idx)); err != nil {
			return -1, err
		}
	}
	nbytes := enc.Flush()

	d.idxValues = d.idxValues[:0]
	return nbytes + 1, nil
}

// Put adds a value to the dictionary data column, inserting the value if it
// didn't already exist in the dictionary.
func (d *dictEncoder) Put(v interface{}) {
	memoIdx, found, err := d.memo.GetOrInsert(v)
	if err != nil {
		panic(err)
	}
	if !found {
		d.dictEncodedSize += int(reflect.TypeOf(v).Size())
	}
	d.addIndex(memoIdx)
}

// DictEncodedSize returns the current size of the encoded dictionary.
func (d *dictEncoder) DictEncodedSize() int {
	return d.dictEncodedSize
}
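// exampleDictFlush is an illustrative sketch, not part of the upstream file, of the dictionary
// encoding flow implemented above: values are accumulated with Put, the dictionary page is written
// out with WriteDict into a buffer sized by DictEncodedSize, and the RLE encoded index data for the
// data page is obtained from FlushValues. It assumes d was built by newDictEncoderBase with a memo
// table that accepts int32 values; the literal values are arbitrary choices for the example.
func exampleDictFlush(d *dictEncoder) (dictPage []byte, indices Buffer, err error) {
	for _, v := range []int32{7, 7, 42, 7} {
		d.Put(v) // repeated values map to the same dictionary index
	}
	dictPage = make([]byte, d.DictEncodedSize())
	d.WriteDict(dictPage)          // raw dictionary values, one entry per unique value
	indices, err = d.FlushValues() // bit-width prefix + RLE encoded indices
	return
}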
// spacedCompress is a helper function for encoders that removes the null slots from the spaced
// input slice, according to the validity bitmap, copying only the valid values into an output
// slice that is no longer spaced out with slots for nulls. It returns the number of valid values copied.
func spacedCompress(src, out interface{}, validBits []byte, validBitsOffset int64) int {
	nvalid := 0

	// for efficiency we use a type switch because the copy runs significantly faster when typed
	// than calling reflect.Copy
	switch s := src.(type) {
	case []int32:
		o := out.([]int32)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []int64:
		o := out.([]int64)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []float32:
		o := out.([]float32)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []float64:
		o := out.([]float64)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []parquet.ByteArray:
		o := out.([]parquet.ByteArray)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []parquet.FixedLenByteArray:
		o := out.([]parquet.FixedLenByteArray)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	case []bool:
		o := out.([]bool)
		reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s)))
		for {
			run := reader.NextRun()
			if run.Length == 0 {
				break
			}
			copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)])
			nvalid += int(run.Length)
		}
	}

	return nvalid
}
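// exampleSpacedCompress is an illustrative sketch, not part of the upstream file, showing how
// spacedCompress is meant to be called: the spaced input has one slot per logical value (null
// slots included) and the validity bitmap marks which slots hold real values. The literal values
// and bitmap below are assumptions made for the example; the call packs 1, 3 and 5 into the front
// of out and returns 3.
func exampleSpacedCompress() int {
	spaced := []int32{1, 0, 3, 0, 5} // slots 1 and 3 are null placeholders
	validBits := []byte{0b00010101}  // bits 0, 2 and 4 set => slots 0, 2 and 4 are valid
	out := make([]int32, len(spaced))
	return spacedCompress(spaced, out, validBits, 0)
}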