github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/encoder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "math/bits" 21 "reflect" 22 23 "github.com/apache/arrow/go/v7/arrow" 24 "github.com/apache/arrow/go/v7/arrow/bitutil" 25 "github.com/apache/arrow/go/v7/arrow/memory" 26 "github.com/apache/arrow/go/v7/parquet" 27 format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet" 28 "github.com/apache/arrow/go/v7/parquet/internal/utils" 29 "github.com/apache/arrow/go/v7/parquet/schema" 30 ) 31 32 //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=physical_types.tmpldata plain_encoder_types.gen.go.tmpl typed_encoder.gen.go.tmpl 33 34 // EncoderTraits is an interface for the different types to make it more 35 // convenient to construct encoders for specific types. 36 type EncoderTraits interface { 37 Encoder(format.Encoding, bool, *schema.Column, memory.Allocator) TypedEncoder 38 } 39 40 // NewEncoder will return the appropriately typed encoder for the requested physical type 41 // and encoding. 42 // 43 // If mem is nil, memory.DefaultAllocator will be used. 44 func NewEncoder(t parquet.Type, e parquet.Encoding, useDict bool, descr *schema.Column, mem memory.Allocator) TypedEncoder { 45 traits := getEncodingTraits(t) 46 if traits == nil { 47 return nil 48 } 49 50 if mem == nil { 51 mem = memory.DefaultAllocator 52 } 53 return traits.Encoder(format.Encoding(e), useDict, descr, mem) 54 } 55 56 type encoder struct { 57 descr *schema.Column 58 encoding format.Encoding 59 typeLen int 60 mem memory.Allocator 61 62 sink *PooledBufferWriter 63 } 64 65 // newEncoderBase constructs a new base encoder for embedding on the typed encoders 66 // encapsulating the common functionality. 67 func newEncoderBase(e format.Encoding, descr *schema.Column, mem memory.Allocator) encoder { 68 typelen := -1 69 if descr != nil && descr.PhysicalType() == parquet.Types.FixedLenByteArray { 70 typelen = int(descr.TypeLength()) 71 } 72 return encoder{ 73 descr: descr, 74 encoding: e, 75 mem: mem, 76 typeLen: typelen, 77 sink: NewPooledBufferWriter(1024), 78 } 79 } 80 81 // ReserveForWrite allocates n bytes so that the next n bytes written do not require new allocations. 82 func (e *encoder) ReserveForWrite(n int) { e.sink.Reserve(n) } 83 func (e *encoder) EstimatedDataEncodedSize() int64 { return int64(e.sink.Len()) } 84 func (e *encoder) Encoding() parquet.Encoding { return parquet.Encoding(e.encoding) } 85 func (e *encoder) Allocator() memory.Allocator { return e.mem } 86 func (e *encoder) append(data []byte) { e.sink.Write(data) } 87 88 // FlushValues flushes any unwritten data to the buffer and returns the finished encoded buffer of data. 89 // This also clears the encoder, ownership of the data belongs to whomever called FlushValues, Release 90 // should be called on the resulting Buffer when done. 91 func (e *encoder) FlushValues() (Buffer, error) { return e.sink.Finish(), nil } 92 93 // Bytes returns the current bytes that have been written to the encoder's buffer but doesn't transfer ownership. 94 func (e *encoder) Bytes() []byte { return e.sink.Bytes() } 95 96 // Reset drops the data currently in the encoder and resets for new use. 97 func (e *encoder) Reset() { e.sink.Reset(0) } 98 99 type dictEncoder struct { 100 encoder 101 102 dictEncodedSize int 103 idxBuffer *memory.Buffer 104 idxValues []int32 105 memo MemoTable 106 } 107 108 // newDictEncoderBase constructs and returns a dictionary encoder for the appropriate type using the passed 109 // in memo table for constructing the index. 110 func newDictEncoderBase(descr *schema.Column, memo MemoTable, mem memory.Allocator) dictEncoder { 111 return dictEncoder{ 112 encoder: newEncoderBase(format.Encoding_PLAIN_DICTIONARY, descr, mem), 113 idxBuffer: memory.NewResizableBuffer(mem), 114 memo: memo, 115 } 116 } 117 118 // Reset drops all the currently encoded values from the index and indexes from the data to allow 119 // restarting the encoding process. 120 func (d *dictEncoder) Reset() { 121 d.encoder.Reset() 122 d.dictEncodedSize = 0 123 d.idxValues = d.idxValues[:0] 124 d.idxBuffer.ResizeNoShrink(0) 125 d.memo.Reset() 126 } 127 128 // append the passed index to the indexbuffer 129 func (d *dictEncoder) addIndex(idx int) { 130 if len(d.idxValues) == cap(d.idxValues) { 131 curLen := len(d.idxValues) 132 d.idxBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(bitutil.NextPowerOf2(curLen + 1))) 133 d.idxValues = arrow.Int32Traits.CastFromBytes(d.idxBuffer.Buf())[: curLen : d.idxBuffer.Len()/arrow.Int32SizeBytes] 134 } 135 d.idxValues = append(d.idxValues, int32(idx)) 136 } 137 138 // FlushValues dumps all the currently buffered indexes that would become the data page to a buffer and 139 // returns it or returns nil and any error encountered. 140 func (d *dictEncoder) FlushValues() (Buffer, error) { 141 buf := bufferPool.Get().(*memory.Buffer) 142 buf.Reserve(int(d.EstimatedDataEncodedSize())) 143 size, err := d.WriteIndices(buf.Buf()) 144 if err != nil { 145 poolBuffer{buf}.Release() 146 return nil, err 147 } 148 buf.ResizeNoShrink(size) 149 return poolBuffer{buf}, nil 150 } 151 152 // EstimatedDataEncodedSize returns the maximum number of bytes needed to store the RLE encoded indexes, not including the 153 // dictionary index in the computation. 154 func (d *dictEncoder) EstimatedDataEncodedSize() int64 { 155 return 1 + int64(utils.MaxBufferSize(d.BitWidth(), len(d.idxValues))+utils.MinBufferSize(d.BitWidth())) 156 } 157 158 // NumEntries returns the number of entires in the dictionary index for this encoder. 159 func (d *dictEncoder) NumEntries() int { 160 return d.memo.Size() 161 } 162 163 // BitWidth returns the max bitwidth that would be necessary for encoding the index values currently 164 // in the dictionary based on the size of the dictionary index. 165 func (d *dictEncoder) BitWidth() int { 166 switch d.NumEntries() { 167 case 0: 168 return 0 169 case 1: 170 return 1 171 default: 172 return bits.Len32(uint32(d.NumEntries() - 1)) 173 } 174 } 175 176 // WriteDict writes the dictionary index to the given byte slice. 177 func (d *dictEncoder) WriteDict(out []byte) { 178 d.memo.WriteOut(out) 179 } 180 181 // WriteIndices performs Run Length encoding on the indexes and the writes the encoded 182 // index value data to the provided byte slice, returning the number of bytes actually written. 183 // If any error is encountered, it will return -1 and the error. 184 func (d *dictEncoder) WriteIndices(out []byte) (int, error) { 185 out[0] = byte(d.BitWidth()) 186 187 enc := utils.NewRleEncoder(utils.NewWriterAtBuffer(out[1:]), d.BitWidth()) 188 for _, idx := range d.idxValues { 189 if err := enc.Put(uint64(idx)); err != nil { 190 return -1, err 191 } 192 } 193 nbytes := enc.Flush() 194 195 d.idxValues = d.idxValues[:0] 196 return nbytes + 1, nil 197 } 198 199 // Put adds a value to the dictionary data column, inserting the value if it 200 // didn't already exist in the dictionary. 201 func (d *dictEncoder) Put(v interface{}) { 202 memoIdx, found, err := d.memo.GetOrInsert(v) 203 if err != nil { 204 panic(err) 205 } 206 if !found { 207 d.dictEncodedSize += int(reflect.TypeOf(v).Size()) 208 } 209 d.addIndex(memoIdx) 210 } 211 212 // DictEncodedSize returns the current size of the encoded dictionary 213 func (d *dictEncoder) DictEncodedSize() int { 214 return d.dictEncodedSize 215 } 216 217 // spacedCompress is a helper function for encoders to remove the slots in the slices passed in according 218 // to the bitmap which are null into an output slice that is no longer spaced out with slots for nulls. 219 func spacedCompress(src, out interface{}, validBits []byte, validBitsOffset int64) int { 220 nvalid := 0 221 222 // for efficiency we use a type switch because the copy runs significantly faster when typed 223 // than calling reflect.Copy 224 switch s := src.(type) { 225 case []int32: 226 o := out.([]int32) 227 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 228 for { 229 run := reader.NextRun() 230 if run.Length == 0 { 231 break 232 } 233 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 234 nvalid += int(run.Length) 235 } 236 case []int64: 237 o := out.([]int64) 238 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 239 for { 240 run := reader.NextRun() 241 if run.Length == 0 { 242 break 243 } 244 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 245 nvalid += int(run.Length) 246 } 247 case []float32: 248 o := out.([]float32) 249 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 250 for { 251 run := reader.NextRun() 252 if run.Length == 0 { 253 break 254 } 255 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 256 nvalid += int(run.Length) 257 } 258 case []float64: 259 o := out.([]float64) 260 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 261 for { 262 run := reader.NextRun() 263 if run.Length == 0 { 264 break 265 } 266 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 267 nvalid += int(run.Length) 268 } 269 case []parquet.ByteArray: 270 o := out.([]parquet.ByteArray) 271 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 272 for { 273 run := reader.NextRun() 274 if run.Length == 0 { 275 break 276 } 277 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 278 nvalid += int(run.Length) 279 } 280 case []parquet.FixedLenByteArray: 281 o := out.([]parquet.FixedLenByteArray) 282 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 283 for { 284 run := reader.NextRun() 285 if run.Length == 0 { 286 break 287 } 288 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 289 nvalid += int(run.Length) 290 } 291 case []bool: 292 o := out.([]bool) 293 reader := utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(s))) 294 for { 295 run := reader.NextRun() 296 if run.Length == 0 { 297 break 298 } 299 copy(o[nvalid:], s[int(run.Pos):int(run.Pos+run.Length)]) 300 nvalid += int(run.Length) 301 } 302 } 303 304 return nvalid 305 }