github.com/apache/arrow/go/v16@v16.1.0/parquet/internal/encoding/delta_byte_array.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "github.com/apache/arrow/go/v16/arrow/memory" 21 "github.com/apache/arrow/go/v16/internal/utils" 22 "github.com/apache/arrow/go/v16/parquet" 23 "golang.org/x/xerrors" 24 ) 25 26 // DeltaByteArrayEncoder is an encoder for writing bytearrays which are delta encoded 27 // this is also known as incremental encoding or front compression. For each element 28 // in a sequence of strings, we store the prefix length of the previous entry plus the suffix 29 // see https://en.wikipedia.org/wiki/Incremental_encoding for a longer description. 30 // 31 // This is stored as a sequence of delta-encoded prefix lengths followed by the suffixes 32 // encoded as delta length byte arrays. 33 type DeltaByteArrayEncoder struct { 34 encoder 35 36 prefixEncoder *DeltaBitPackInt32Encoder 37 suffixEncoder *DeltaLengthByteArrayEncoder 38 39 lastVal parquet.ByteArray 40 } 41 42 func (enc *DeltaByteArrayEncoder) EstimatedDataEncodedSize() int64 { 43 prefixEstimatedSize := int64(0) 44 if enc.prefixEncoder != nil { 45 prefixEstimatedSize = enc.prefixEncoder.EstimatedDataEncodedSize() 46 } 47 suffixEstimatedSize := int64(0) 48 if enc.suffixEncoder != nil { 49 suffixEstimatedSize = enc.suffixEncoder.EstimatedDataEncodedSize() 50 } 51 return prefixEstimatedSize + suffixEstimatedSize 52 } 53 54 func (enc *DeltaByteArrayEncoder) initEncoders() { 55 enc.prefixEncoder = &DeltaBitPackInt32Encoder{ 56 deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}} 57 enc.suffixEncoder = &DeltaLengthByteArrayEncoder{ 58 newEncoderBase(enc.encoding, nil, enc.mem), 59 &DeltaBitPackInt32Encoder{ 60 deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}} 61 } 62 63 // Type returns the underlying physical type this operates on, in this case ByteArrays only 64 func (DeltaByteArrayEncoder) Type() parquet.Type { return parquet.Types.ByteArray } 65 66 // Put writes a slice of ByteArrays to the encoder 67 func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) { 68 if len(in) == 0 { 69 return 70 } 71 72 var suf parquet.ByteArray 73 if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet 74 enc.initEncoders() 75 enc.prefixEncoder.Put([]int32{0}) 76 suf = in[0] 77 enc.lastVal = in[0] 78 enc.suffixEncoder.Put([]parquet.ByteArray{suf}) 79 in = in[1:] 80 } 81 82 // for each value, figure out the common prefix with the previous value 83 // and then write the prefix length and the suffix. 84 for _, val := range in { 85 l1 := enc.lastVal.Len() 86 l2 := val.Len() 87 j := 0 88 for j < l1 && j < l2 { 89 if enc.lastVal[j] != val[j] { 90 break 91 } 92 j++ 93 } 94 enc.prefixEncoder.Put([]int32{int32(j)}) 95 suf = val[j:] 96 enc.suffixEncoder.Put([]parquet.ByteArray{suf}) 97 enc.lastVal = val 98 } 99 100 // do the memcpy after the loops to keep a copy of the lastVal 101 // we do a copy here so that we only copy and keep a reference 102 // to the suffix, and aren't forcing the *entire* value to stay 103 // in memory while we have this reference to just the suffix. 104 enc.lastVal = append([]byte{}, enc.lastVal...) 105 } 106 107 // PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset 108 // to compress the data before writing it without the null slots. 109 func (enc *DeltaByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 110 if validBits != nil { 111 data := make([]parquet.ByteArray, len(in)) 112 nvalid := spacedCompress(in, data, validBits, validBitsOffset) 113 enc.Put(data[:nvalid]) 114 } else { 115 enc.Put(in) 116 } 117 } 118 119 // Flush flushes any remaining data out and returns the finished encoded buffer. 120 // or returns nil and any error encountered during flushing. 121 func (enc *DeltaByteArrayEncoder) FlushValues() (Buffer, error) { 122 if enc.prefixEncoder == nil { 123 enc.initEncoders() 124 } 125 prefixBuf, err := enc.prefixEncoder.FlushValues() 126 if err != nil { 127 return nil, err 128 } 129 defer prefixBuf.Release() 130 131 suffixBuf, err := enc.suffixEncoder.FlushValues() 132 if err != nil { 133 return nil, err 134 } 135 defer suffixBuf.Release() 136 137 ret := bufferPool.Get().(*memory.Buffer) 138 ret.ResizeNoShrink(prefixBuf.Len() + suffixBuf.Len()) 139 copy(ret.Bytes(), prefixBuf.Bytes()) 140 copy(ret.Bytes()[prefixBuf.Len():], suffixBuf.Bytes()) 141 return poolBuffer{ret}, nil 142 } 143 144 // DeltaByteArrayDecoder is a decoder for a column of data encoded using incremental or prefix encoding. 145 type DeltaByteArrayDecoder struct { 146 *DeltaLengthByteArrayDecoder 147 148 prefixLengths []int32 149 lastVal parquet.ByteArray 150 } 151 152 // Type returns the underlying physical type this decoder operates on, in this case ByteArrays only 153 func (DeltaByteArrayDecoder) Type() parquet.Type { 154 return parquet.Types.ByteArray 155 } 156 157 func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem } 158 159 // SetData expects the passed in data to be the prefix lengths, followed by the 160 // blocks of suffix data in order to initialize the decoder. 161 func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error { 162 prefixLenDec := DeltaBitPackInt32Decoder{ 163 deltaBitPackDecoder: &deltaBitPackDecoder{ 164 decoder: newDecoderBase(d.encoding, d.descr), 165 mem: d.mem}} 166 167 if err := prefixLenDec.SetData(nvalues, data); err != nil { 168 return err 169 } 170 171 d.prefixLengths = make([]int32, nvalues) 172 // decode all the prefix lengths first so we know how many bytes it took to get the 173 // prefix lengths for nvalues 174 prefixLenDec.Decode(d.prefixLengths) 175 176 // now that we know how many bytes we needed for the prefix lengths, the rest are the 177 // delta length byte array encoding. 178 return d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):]) 179 } 180 181 // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded 182 func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { 183 max := utils.Min(len(out), d.nvals) 184 if max == 0 { 185 return 0, nil 186 } 187 out = out[:max] 188 189 var err error 190 if d.lastVal == nil { 191 _, err = d.DeltaLengthByteArrayDecoder.Decode(out[:1]) 192 if err != nil { 193 return 0, err 194 } 195 d.lastVal = out[0] 196 out = out[1:] 197 d.prefixLengths = d.prefixLengths[1:] 198 } 199 200 var prefixLen int32 201 suffixHolder := make([]parquet.ByteArray, 1) 202 for len(out) > 0 { 203 prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:] 204 205 prefix := d.lastVal[:prefixLen:prefixLen] 206 _, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder) 207 if err != nil { 208 return 0, err 209 } 210 211 if len(suffixHolder[0]) == 0 { 212 d.lastVal = prefix 213 } else { 214 d.lastVal = make([]byte, int(prefixLen)+len(suffixHolder[0])) 215 copy(d.lastVal, prefix) 216 copy(d.lastVal[prefixLen:], suffixHolder[0]) 217 } 218 out[0], out = d.lastVal, out[1:] 219 } 220 return max, nil 221 } 222 223 // DecodeSpaced is like decode, but the result is spaced out based on the bitmap provided. 224 func (d *DeltaByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 225 toread := len(out) - nullCount 226 values, err := d.Decode(out[:toread]) 227 if err != nil { 228 return values, err 229 } 230 if values != toread { 231 return values, xerrors.New("parquet: number of values / definition levels read did not match") 232 } 233 234 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 235 }