github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/delta_byte_array.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "github.com/apache/arrow/go/v14/arrow/memory" 21 "github.com/apache/arrow/go/v14/internal/utils" 22 "github.com/apache/arrow/go/v14/parquet" 23 "golang.org/x/xerrors" 24 ) 25 26 // DeltaByteArrayEncoder is an encoder for writing bytearrays which are delta encoded 27 // this is also known as incremental encoding or front compression. For each element 28 // in a sequence of strings, we store the prefix length of the previous entry plus the suffix 29 // see https://en.wikipedia.org/wiki/Incremental_encoding for a longer description. 30 // 31 // This is stored as a sequence of delta-encoded prefix lengths followed by the suffixes 32 // encoded as delta length byte arrays. 33 type DeltaByteArrayEncoder struct { 34 encoder 35 36 prefixEncoder *DeltaBitPackInt32Encoder 37 suffixEncoder *DeltaLengthByteArrayEncoder 38 39 lastVal parquet.ByteArray 40 } 41 42 func (enc *DeltaByteArrayEncoder) EstimatedDataEncodedSize() int64 { 43 return enc.prefixEncoder.EstimatedDataEncodedSize() + enc.suffixEncoder.EstimatedDataEncodedSize() 44 } 45 46 func (enc *DeltaByteArrayEncoder) initEncoders() { 47 enc.prefixEncoder = &DeltaBitPackInt32Encoder{ 48 deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}} 49 enc.suffixEncoder = &DeltaLengthByteArrayEncoder{ 50 newEncoderBase(enc.encoding, nil, enc.mem), 51 &DeltaBitPackInt32Encoder{ 52 deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}} 53 } 54 55 // Type returns the underlying physical type this operates on, in this case ByteArrays only 56 func (DeltaByteArrayEncoder) Type() parquet.Type { return parquet.Types.ByteArray } 57 58 // Put writes a slice of ByteArrays to the encoder 59 func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { 60 if len(in) == 0 { 61 return 62 } 63 64 var suf parquet.ByteArray 65 if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet 66 enc.initEncoders() 67 enc.prefixEncoder.Put([]int32{0}) 68 suf = in[0] 69 enc.lastVal = in[0] 70 enc.suffixEncoder.Put([]parquet.ByteArray{suf}) 71 in = in[1:] 72 } 73 74 // for each value, figure out the common prefix with the previous value 75 // and then write the prefix length and the suffix. 76 for _, val := range in { 77 l1 := enc.lastVal.Len() 78 l2 := val.Len() 79 j := 0 80 for j < l1 && j < l2 { 81 if enc.lastVal[j] != val[j] { 82 break 83 } 84 j++ 85 } 86 enc.prefixEncoder.Put([]int32{int32(j)}) 87 suf = val[j:] 88 enc.suffixEncoder.Put([]parquet.ByteArray{suf}) 89 enc.lastVal = val 90 } 91 92 // do the memcpy after the loops to keep a copy of the lastVal 93 // we do a copy here so that we only copy and keep a reference 94 // to the suffix, and aren't forcing the *entire* value to stay 95 // in memory while we have this reference to just the suffix. 96 enc.lastVal = append([]byte{}, enc.lastVal...) 97 } 98 99 // PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset 100 // to compress the data before writing it without the null slots. 101 func (enc *DeltaByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 102 if validBits != nil { 103 data := make([]parquet.ByteArray, len(in)) 104 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 105 enc.Put(data[:nvalid]) 106 } else { 107 enc.Put(in) 108 } 109 } 110 111 // Flush flushes any remaining data out and returns the finished encoded buffer. 112 // or returns nil and any error encountered during flushing. 113 func (enc *DeltaByteArrayEncoder) FlushValues() (Buffer, error) { 114 if enc.prefixEncoder == nil { 115 enc.initEncoders() 116 } 117 prefixBuf, err := enc.prefixEncoder.FlushValues() 118 if err != nil { 119 return nil, err 120 } 121 defer prefixBuf.Release() 122 123 suffixBuf, err := enc.suffixEncoder.FlushValues() 124 if err != nil { 125 return nil, err 126 } 127 defer suffixBuf.Release() 128 129 ret := bufferPool.Get().(*memory.Buffer) 130 ret.ResizeNoShrink(prefixBuf.Len() + suffixBuf.Len()) 131 copy(ret.Bytes(), prefixBuf.Bytes()) 132 copy(ret.Bytes()[prefixBuf.Len():], suffixBuf.Bytes()) 133 return poolBuffer{ret}, nil 134 } 135 136 // DeltaByteArrayDecoder is a decoder for a column of data encoded using incremental or prefix encoding. 137 type DeltaByteArrayDecoder struct { 138 *DeltaLengthByteArrayDecoder 139 140 prefixLengths []int32 141 lastVal parquet.ByteArray 142 } 143 144 // Type returns the underlying physical type this decoder operates on, in this case ByteArrays only 145 func (DeltaByteArrayDecoder) Type() parquet.Type { 146 return parquet.Types.ByteArray 147 } 148 149 func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem } 150 151 // SetData expects the data passed in to be the prefix lengths, followed by the 152 // blocks of suffix data in order to initialize the decoder. 153 func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error { 154 prefixLenDec := DeltaBitPackInt32Decoder{ 155 deltaBitPackDecoder: &deltaBitPackDecoder{ 156 decoder: newDecoderBase(d.encoding, d.descr), 157 mem: d.mem}} 158 159 if err := prefixLenDec.SetData(nvalues, data); err != nil { 160 return err 161 } 162 163 d.prefixLengths = make([]int32, nvalues) 164 // decode all the prefix lengths first so we know how many bytes it took to get the 165 // prefix lengths for nvalues 166 prefixLenDec.Decode(d.prefixLengths) 167 168 // now that we know how many bytes we needed for the prefix lengths, the rest are the 169 // delta length byte array encoding. 170 return d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):]) 171 } 172 173 // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded 174 func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { 175 max := utils.MinInt(len(out), d.nvals) 176 if max == 0 { 177 return 0, nil 178 } 179 out = out[:max] 180 181 var err error 182 if d.lastVal == nil { 183 _, err = d.DeltaLengthByteArrayDecoder.Decode(out[:1]) 184 if err != nil { 185 return 0, err 186 } 187 d.lastVal = out[0] 188 out = out[1:] 189 d.prefixLengths = d.prefixLengths[1:] 190 } 191 192 var prefixLen int32 193 suffixHolder := make([]parquet.ByteArray, 1) 194 for len(out) > 0 { 195 prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:] 196 197 prefix := d.lastVal[:prefixLen:prefixLen] 198 _, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder) 199 if err != nil { 200 return 0, err 201 } 202 203 if len(suffixHolder[0]) == 0 { 204 d.lastVal = prefix 205 } else { 206 d.lastVal = make([]byte, int(prefixLen)+len(suffixHolder[0])) 207 copy(d.lastVal, prefix) 208 copy(d.lastVal[prefixLen:], suffixHolder[0]) 209 } 210 out[0], out = d.lastVal, out[1:] 211 } 212 return max, nil 213 } 214 215 // DecodeSpaced is like decode, but the result is spaced out based on the bitmap provided. 216 func (d *DeltaByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 217 toread := len(out) - nullCount 218 values, err := d.Decode(out[:toread]) 219 if err != nil { 220 return values, err 221 } 222 if values != toread { 223 return values, xerrors.New("parquet: number of values / definition levels read did not match") 224 } 225 226 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 227 }