github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/levels.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "fmt" 23 "math/bits" 24 25 "github.com/JohnCGriffin/overflow" 26 "github.com/apache/arrow/go/v14/arrow/bitutil" 27 shared_utils "github.com/apache/arrow/go/v14/internal/utils" 28 "github.com/apache/arrow/go/v14/parquet" 29 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 30 "github.com/apache/arrow/go/v14/parquet/internal/utils" 31 "golang.org/x/xerrors" 32 ) 33 34 // LevelEncoder is for handling the encoding of Definition and Repetition levels 35 // to parquet files. 36 type LevelEncoder struct { 37 bitWidth int 38 rleLen int 39 encoding format.Encoding 40 rle *utils.RleEncoder 41 bit *utils.BitWriter 42 } 43 44 // LevelEncodingMaxBufferSize estimates the max number of bytes needed to encode data with the 45 // specified encoding given the max level and number of buffered values provided. 46 func LevelEncodingMaxBufferSize(encoding parquet.Encoding, maxLvl int16, nbuffered int) int { 47 bitWidth := bits.Len64(uint64(maxLvl)) 48 nbytes := 0 49 switch encoding { 50 case parquet.Encodings.RLE: 51 nbytes = utils.MaxBufferSize(bitWidth, nbuffered) + utils.MinBufferSize(bitWidth) 52 case parquet.Encodings.BitPacked: 53 nbytes = int(bitutil.BytesForBits(int64(nbuffered * bitWidth))) 54 default: 55 panic("parquet: unknown encoding type for levels") 56 } 57 return nbytes 58 } 59 60 // Reset resets the encoder allowing it to be reused and updating the maxlevel to the new 61 // specified value. 62 func (l *LevelEncoder) Reset(maxLvl int16) { 63 l.bitWidth = bits.Len64(uint64(maxLvl)) 64 switch l.encoding { 65 case format.Encoding_RLE: 66 l.rle.Clear() 67 l.rle.BitWidth = l.bitWidth 68 case format.Encoding_BIT_PACKED: 69 l.bit.Clear() 70 default: 71 panic("parquet: unknown encoding type") 72 } 73 } 74 75 // Init is called to set up the desired encoding type, max level and underlying writer for a 76 // level encoder to control where the resulting encoded buffer will end up. 77 func (l *LevelEncoder) Init(encoding parquet.Encoding, maxLvl int16, w utils.WriterAtWithLen) { 78 l.bitWidth = bits.Len64(uint64(maxLvl)) 79 l.encoding = format.Encoding(encoding) 80 switch l.encoding { 81 case format.Encoding_RLE: 82 l.rle = utils.NewRleEncoder(w, l.bitWidth) 83 case format.Encoding_BIT_PACKED: 84 l.bit = utils.NewBitWriter(w) 85 default: 86 panic("parquet: unknown encoding type for levels") 87 } 88 } 89 90 // EncodeNoFlush encodes the provided levels in the encoder, but doesn't flush 91 // the buffer and return it yet, appending these encoded values. Returns the number 92 // of values encoded and any error encountered or nil. If err is not nil, nencoded 93 // will be the number of values encoded before the error was encountered 94 func (l *LevelEncoder) EncodeNoFlush(lvls []int16) (nencoded int, err error) { 95 if l.rle == nil && l.bit == nil { 96 panic("parquet: level encoders are not initialized") 97 } 98 99 switch l.encoding { 100 case format.Encoding_RLE: 101 for _, level := range lvls { 102 if err = l.rle.Put(uint64(level)); err != nil { 103 return 104 } 105 nencoded++ 106 } 107 default: 108 for _, level := range lvls { 109 if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil { 110 return 111 } 112 nencoded++ 113 } 114 } 115 return 116 } 117 118 // Flush flushes out any encoded data to the underlying writer. 119 func (l *LevelEncoder) Flush() { 120 if l.rle == nil && l.bit == nil { 121 panic("parquet: level encoders are not initialized") 122 } 123 124 switch l.encoding { 125 case format.Encoding_RLE: 126 l.rleLen = l.rle.Flush() 127 default: 128 l.bit.Flush(false) 129 } 130 } 131 132 // Encode encodes the slice of definition or repetition levels based on 133 // the currently configured encoding type and returns the number of 134 // values that were encoded. 135 func (l *LevelEncoder) Encode(lvls []int16) (nencoded int, err error) { 136 if l.rle == nil && l.bit == nil { 137 panic("parquet: level encoders are not initialized") 138 } 139 140 switch l.encoding { 141 case format.Encoding_RLE: 142 defer func() { l.rleLen = l.rle.Flush() }() 143 for _, level := range lvls { 144 if err = l.rle.Put(uint64(level)); err != nil { 145 return 146 } 147 nencoded++ 148 } 149 150 default: 151 defer l.bit.Flush(false) 152 for _, level := range lvls { 153 if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil { 154 return 155 } 156 nencoded++ 157 } 158 } 159 return 160 } 161 162 // Len returns the number of bytes that were written as Run Length encoded 163 // levels, this is only valid for run length encoding and will panic if using 164 // deprecated bit packed encoding. 165 func (l *LevelEncoder) Len() int { 166 if l.encoding != format.Encoding_RLE { 167 panic("parquet: level encoder, only implemented for RLE") 168 } 169 return l.rleLen 170 } 171 172 // LevelDecoder handles the decoding of repetition and definition levels from a 173 // parquet file supporting bit packed and run length encoded values. 174 type LevelDecoder struct { 175 bitWidth int 176 remaining int // the number of values left to be decoded in the input data 177 maxLvl int16 178 encoding format.Encoding 179 // only one of the following should ever be set at a time based on the 180 // encoding format. 181 rle *utils.RleDecoder 182 bit *utils.BitReader 183 } 184 185 // SetData sets in the data to be decoded by subsequent calls by specifying the encoding type 186 // the maximum level (which is what determines the bit width), the number of values expected 187 // and the raw bytes to decode. Returns the number of bytes expected to be decoded. 188 func (l *LevelDecoder) SetData(encoding parquet.Encoding, maxLvl int16, nbuffered int, data []byte) (int, error) { 189 l.maxLvl = maxLvl 190 l.encoding = format.Encoding(encoding) 191 l.remaining = nbuffered 192 l.bitWidth = bits.Len64(uint64(maxLvl)) 193 194 switch encoding { 195 case parquet.Encodings.RLE: 196 if len(data) < 4 { 197 return 0, xerrors.New("parquet: received invalid levels (corrupt data page?)") 198 } 199 200 nbytes := int32(binary.LittleEndian.Uint32(data[:4])) 201 if nbytes < 0 || nbytes > int32(len(data)-4) { 202 return 0, xerrors.New("parquet: received invalid number of bytes (corrupt data page?)") 203 } 204 205 buf := data[4:] 206 if l.rle == nil { 207 l.rle = utils.NewRleDecoder(bytes.NewReader(buf), l.bitWidth) 208 } else { 209 l.rle.Reset(bytes.NewReader(buf), l.bitWidth) 210 } 211 return int(nbytes) + 4, nil 212 case parquet.Encodings.BitPacked: 213 nbits, ok := overflow.Mul(nbuffered, l.bitWidth) 214 if !ok { 215 return 0, xerrors.New("parquet: number of buffered values too large (corrupt data page?)") 216 } 217 218 nbytes := bitutil.BytesForBits(int64(nbits)) 219 if nbytes < 0 || nbytes > int64(len(data)) { 220 return 0, xerrors.New("parquet: recieved invalid number of bytes (corrupt data page?)") 221 } 222 if l.bit == nil { 223 l.bit = utils.NewBitReader(bytes.NewReader(data)) 224 } else { 225 l.bit.Reset(bytes.NewReader(data)) 226 } 227 return int(nbytes), nil 228 default: 229 return 0, fmt.Errorf("parquet: unknown encoding type for levels '%s'", encoding) 230 } 231 } 232 233 // SetDataV2 is the same as SetData but only for DataPageV2 pages and only supports 234 // run length encoding. 235 func (l *LevelDecoder) SetDataV2(nbytes int32, maxLvl int16, nbuffered int, data []byte) error { 236 if nbytes < 0 { 237 return xerrors.New("parquet: invalid page header (corrupt data page?)") 238 } 239 240 l.maxLvl = maxLvl 241 l.encoding = format.Encoding_RLE 242 l.remaining = nbuffered 243 l.bitWidth = bits.Len64(uint64(maxLvl)) 244 245 if l.rle == nil { 246 l.rle = utils.NewRleDecoder(bytes.NewReader(data), l.bitWidth) 247 } else { 248 l.rle.Reset(bytes.NewReader(data), l.bitWidth) 249 } 250 return nil 251 } 252 253 // Decode decodes the bytes that were set with SetData into the slice of levels 254 // returning the total number of levels that were decoded and the number of 255 // values which had a level equal to the max level, indicating how many physical 256 // values exist to be read. 257 func (l *LevelDecoder) Decode(levels []int16) (int, int64) { 258 var ( 259 buf [1024]uint64 260 totaldecoded int 261 decoded int 262 valsToRead int64 263 ) 264 265 n := shared_utils.Min(int64(l.remaining), int64(len(levels))) 266 for n > 0 { 267 batch := shared_utils.Min(1024, n) 268 switch l.encoding { 269 case format.Encoding_RLE: 270 decoded = l.rle.GetBatch(buf[:batch]) 271 case format.Encoding_BIT_PACKED: 272 decoded, _ = l.bit.GetBatch(uint(l.bitWidth), buf[:batch]) 273 } 274 l.remaining -= decoded 275 totaldecoded += decoded 276 n -= batch 277 278 for idx, val := range buf[:decoded] { 279 lvl := int16(val) 280 levels[idx] = lvl 281 if lvl == l.maxLvl { 282 valsToRead++ 283 } 284 } 285 levels = levels[decoded:] 286 } 287 288 return totaldecoded, valsToRead 289 }