github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/delta_length_byte_array.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "github.com/apache/arrow/go/v14/arrow/memory" 21 "github.com/apache/arrow/go/v14/internal/utils" 22 "github.com/apache/arrow/go/v14/parquet" 23 "golang.org/x/xerrors" 24 ) 25 26 // DeltaLengthByteArrayEncoder encodes data by taking all of the byte array lengths 27 // and encoding them in front using delta encoding, followed by all of the binary data 28 // concatenated back to back. The expected savings is from the cost of encoding the lengths 29 // and possibly better compression in the data which will no longer be interleaved with the lengths. 30 // 31 // This encoding is always preferred over PLAIN for byte array columns where possible. 
32 // 33 // For example, if the data was "Hello", "World", "Foobar", "ABCDEF" the encoded data would be: 34 // DeltaEncoding(5, 5, 6, 6) "HelloWorldFoobarABCDEF" 35 type DeltaLengthByteArrayEncoder struct { 36 encoder 37 38 lengthEncoder *DeltaBitPackInt32Encoder 39 } 40 41 // Put writes the provided slice of byte arrays to the encoder 42 func (enc *DeltaLengthByteArrayEncoder) Put(in []parquet.ByteArray) { 43 lengths := make([]int32, len(in)) 44 totalLen := int(0) 45 for idx, val := range in { 46 lengths[idx] = int32(val.Len()) 47 totalLen += val.Len() 48 } 49 50 enc.lengthEncoder.Put(lengths) 51 enc.sink.Reserve(totalLen) 52 for _, val := range in { 53 enc.sink.UnsafeWrite(val) 54 } 55 } 56 57 // PutSpaced is like Put, but the data is spaced out according to the bitmap provided and is compressed 58 // accordingly before it is written to drop the null data from the write. 59 func (enc *DeltaLengthByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 60 if validBits != nil { 61 data := make([]parquet.ByteArray, len(in)) 62 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 63 enc.Put(data[:nvalid]) 64 } else { 65 enc.Put(in) 66 } 67 } 68 69 // Type returns the underlying type which is handled by this encoder, ByteArrays only. 70 func (DeltaLengthByteArrayEncoder) Type() parquet.Type { 71 return parquet.Types.ByteArray 72 } 73 74 // FlushValues flushes any remaining data and returns the final encoded buffer of data 75 // or returns nil and any error encountered. 
76 func (enc *DeltaLengthByteArrayEncoder) FlushValues() (Buffer, error) { 77 ret, err := enc.lengthEncoder.FlushValues() 78 if err != nil { 79 return nil, err 80 } 81 defer ret.Release() 82 83 data := enc.sink.Finish() 84 defer data.Release() 85 86 output := bufferPool.Get().(*memory.Buffer) 87 output.ResizeNoShrink(ret.Len() + data.Len()) 88 copy(output.Bytes(), ret.Bytes()) 89 copy(output.Bytes()[ret.Len():], data.Bytes()) 90 return poolBuffer{output}, nil 91 } 92 93 // DeltaLengthByteArrayDecoder is a decoder for handling data produced by the corresponding 94 // encoder which expects delta packed lengths followed by the bytes of data. 95 type DeltaLengthByteArrayDecoder struct { 96 decoder 97 98 mem memory.Allocator 99 lengths []int32 100 } 101 102 // Type returns the underlying type which is handled by this encoder, ByteArrays only. 103 func (DeltaLengthByteArrayDecoder) Type() parquet.Type { 104 return parquet.Types.ByteArray 105 } 106 107 func (d *DeltaLengthByteArrayDecoder) Allocator() memory.Allocator { return d.mem } 108 109 // SetData sets in the expected data to the decoder which should be nvalues delta packed lengths 110 // followed by the rest of the byte array data immediately after. 111 func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) error { 112 dec := DeltaBitPackInt32Decoder{ 113 deltaBitPackDecoder: &deltaBitPackDecoder{ 114 decoder: newDecoderBase(d.encoding, d.descr), 115 mem: d.mem}} 116 117 if err := dec.SetData(nvalues, data); err != nil { 118 return err 119 } 120 d.lengths = make([]int32, dec.totalValues) 121 dec.Decode(d.lengths) 122 123 return d.decoder.SetData(nvalues, data[int(dec.bytesRead()):]) 124 } 125 126 // Decode populates the passed in slice with data decoded until it hits the length of out 127 // or runs out of values in the column to decode, then returns the number of values actually decoded. 
128 func (d *DeltaLengthByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { 129 max := utils.MinInt(len(out), d.nvals) 130 for i := 0; i < max; i++ { 131 out[i] = d.data[:d.lengths[i]:d.lengths[i]] 132 d.data = d.data[d.lengths[i]:] 133 } 134 d.nvals -= max 135 d.lengths = d.lengths[max:] 136 return max, nil 137 } 138 139 // DecodeSpaced is like Decode, but for spaced data using the provided bitmap to determine where the nulls should be inserted. 140 func (d *DeltaLengthByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 141 toread := len(out) - nullCount 142 values, _ := d.Decode(out[:toread]) 143 if values != toread { 144 return values, xerrors.New("parquet: number of values / definition levels read did not match") 145 } 146 147 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 148 }