github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/levels.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "fmt" 23 "io" 24 "math/bits" 25 26 "github.com/JohnCGriffin/overflow" 27 "github.com/apache/arrow/go/v10/arrow/bitutil" 28 shared_utils "github.com/apache/arrow/go/v10/internal/utils" 29 "github.com/apache/arrow/go/v10/parquet" 30 format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet" 31 "github.com/apache/arrow/go/v10/parquet/internal/utils" 32 "golang.org/x/xerrors" 33 ) 34 35 // LevelEncoder is for handling the encoding of Definition and Repetition levels 36 // to parquet files. 37 type LevelEncoder struct { 38 bitWidth int 39 rleLen int 40 encoding format.Encoding 41 rle *utils.RleEncoder 42 bit *utils.BitWriter 43 } 44 45 // LevelEncodingMaxBufferSize estimates the max number of bytes needed to encode data with the 46 // specified encoding given the max level and number of buffered values provided. 47 func LevelEncodingMaxBufferSize(encoding parquet.Encoding, maxLvl int16, nbuffered int) int { 48 bitWidth := bits.Len64(uint64(maxLvl)) 49 nbytes := 0 50 switch encoding { 51 case parquet.Encodings.RLE: 52 nbytes = utils.MaxBufferSize(bitWidth, nbuffered) + utils.MinBufferSize(bitWidth) 53 case parquet.Encodings.BitPacked: 54 nbytes = int(bitutil.BytesForBits(int64(nbuffered * bitWidth))) 55 default: 56 panic("parquet: unknown encoding type for levels") 57 } 58 return nbytes 59 } 60 61 // Reset resets the encoder allowing it to be reused and updating the maxlevel to the new 62 // specified value. 63 func (l *LevelEncoder) Reset(maxLvl int16) { 64 l.bitWidth = bits.Len64(uint64(maxLvl)) 65 switch l.encoding { 66 case format.Encoding_RLE: 67 l.rle.Clear() 68 l.rle.BitWidth = l.bitWidth 69 case format.Encoding_BIT_PACKED: 70 l.bit.Clear() 71 default: 72 panic("parquet: unknown encoding type") 73 } 74 } 75 76 // Init is called to set up the desired encoding type, max level and underlying writer for a 77 // level encoder to control where the resulting encoded buffer will end up. 78 func (l *LevelEncoder) Init(encoding parquet.Encoding, maxLvl int16, w io.WriterAt) { 79 l.bitWidth = bits.Len64(uint64(maxLvl)) 80 l.encoding = format.Encoding(encoding) 81 switch l.encoding { 82 case format.Encoding_RLE: 83 l.rle = utils.NewRleEncoder(w, l.bitWidth) 84 case format.Encoding_BIT_PACKED: 85 l.bit = utils.NewBitWriter(w) 86 default: 87 panic("parquet: unknown encoding type for levels") 88 } 89 } 90 91 // EncodeNoFlush encodes the provided levels in the encoder, but doesn't flush 92 // the buffer and return it yet, appending these encoded values. Returns the number 93 // of values encoded and any error encountered or nil. If err is not nil, nencoded 94 // will be the number of values encoded before the error was encountered 95 func (l *LevelEncoder) EncodeNoFlush(lvls []int16) (nencoded int, err error) { 96 if l.rle == nil && l.bit == nil { 97 panic("parquet: level encoders are not initialized") 98 } 99 100 switch l.encoding { 101 case format.Encoding_RLE: 102 for _, level := range lvls { 103 if err = l.rle.Put(uint64(level)); err != nil { 104 return 105 } 106 nencoded++ 107 } 108 default: 109 for _, level := range lvls { 110 if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil { 111 return 112 } 113 nencoded++ 114 } 115 } 116 return 117 } 118 119 // Flush flushes out any encoded data to the underlying writer. 120 func (l *LevelEncoder) Flush() { 121 if l.rle == nil && l.bit == nil { 122 panic("parquet: level encoders are not initialized") 123 } 124 125 switch l.encoding { 126 case format.Encoding_RLE: 127 l.rleLen = l.rle.Flush() 128 default: 129 l.bit.Flush(false) 130 } 131 } 132 133 // Encode encodes the slice of definition or repetition levels based on 134 // the currently configured encoding type and returns the number of 135 // values that were encoded. 136 func (l *LevelEncoder) Encode(lvls []int16) (nencoded int, err error) { 137 if l.rle == nil && l.bit == nil { 138 panic("parquet: level encoders are not initialized") 139 } 140 141 switch l.encoding { 142 case format.Encoding_RLE: 143 defer func() { l.rleLen = l.rle.Flush() }() 144 for _, level := range lvls { 145 if err = l.rle.Put(uint64(level)); err != nil { 146 return 147 } 148 nencoded++ 149 } 150 151 default: 152 defer l.bit.Flush(false) 153 for _, level := range lvls { 154 if err = l.bit.WriteValue(uint64(level), uint(l.bitWidth)); err != nil { 155 return 156 } 157 nencoded++ 158 } 159 } 160 return 161 } 162 163 // Len returns the number of bytes that were written as Run Length encoded 164 // levels, this is only valid for run length encoding and will panic if using 165 // deprecated bit packed encoding. 166 func (l *LevelEncoder) Len() int { 167 if l.encoding != format.Encoding_RLE { 168 panic("parquet: level encoder, only implemented for RLE") 169 } 170 return l.rleLen 171 } 172 173 // LevelDecoder handles the decoding of repetition and definition levels from a 174 // parquet file supporting bit packed and run length encoded values. 175 type LevelDecoder struct { 176 bitWidth int 177 remaining int // the number of values left to be decoded in the input data 178 maxLvl int16 179 encoding format.Encoding 180 // only one of the following should ever be set at a time based on the 181 // encoding format. 182 rle *utils.RleDecoder 183 bit *utils.BitReader 184 } 185 186 // SetData sets in the data to be decoded by subsequent calls by specifying the encoding type 187 // the maximum level (which is what determines the bit width), the number of values expected 188 // and the raw bytes to decode. Returns the number of bytes expected to be decoded. 189 func (l *LevelDecoder) SetData(encoding parquet.Encoding, maxLvl int16, nbuffered int, data []byte) (int, error) { 190 l.maxLvl = maxLvl 191 l.encoding = format.Encoding(encoding) 192 l.remaining = nbuffered 193 l.bitWidth = bits.Len64(uint64(maxLvl)) 194 195 switch encoding { 196 case parquet.Encodings.RLE: 197 if len(data) < 4 { 198 return 0, xerrors.New("parquet: received invalid levels (corrupt data page?)") 199 } 200 201 nbytes := int32(binary.LittleEndian.Uint32(data[:4])) 202 if nbytes < 0 || nbytes > int32(len(data)-4) { 203 return 0, xerrors.New("parquet: received invalid number of bytes (corrupt data page?)") 204 } 205 206 buf := data[4:] 207 if l.rle == nil { 208 l.rle = utils.NewRleDecoder(bytes.NewReader(buf), l.bitWidth) 209 } else { 210 l.rle.Reset(bytes.NewReader(buf), l.bitWidth) 211 } 212 return int(nbytes) + 4, nil 213 case parquet.Encodings.BitPacked: 214 nbits, ok := overflow.Mul(nbuffered, l.bitWidth) 215 if !ok { 216 return 0, xerrors.New("parquet: number of buffered values too large (corrupt data page?)") 217 } 218 219 nbytes := bitutil.BytesForBits(int64(nbits)) 220 if nbytes < 0 || nbytes > int64(len(data)) { 221 return 0, xerrors.New("parquet: recieved invalid number of bytes (corrupt data page?)") 222 } 223 if l.bit == nil { 224 l.bit = utils.NewBitReader(bytes.NewReader(data)) 225 } else { 226 l.bit.Reset(bytes.NewReader(data)) 227 } 228 return int(nbytes), nil 229 default: 230 return 0, fmt.Errorf("parquet: unknown encoding type for levels '%s'", encoding) 231 } 232 } 233 234 // SetDataV2 is the same as SetData but only for DataPageV2 pages and only supports 235 // run length encoding. 236 func (l *LevelDecoder) SetDataV2(nbytes int32, maxLvl int16, nbuffered int, data []byte) error { 237 if nbytes < 0 { 238 return xerrors.New("parquet: invalid page header (corrupt data page?)") 239 } 240 241 l.maxLvl = maxLvl 242 l.encoding = format.Encoding_RLE 243 l.remaining = nbuffered 244 l.bitWidth = bits.Len64(uint64(maxLvl)) 245 246 if l.rle == nil { 247 l.rle = utils.NewRleDecoder(bytes.NewReader(data), l.bitWidth) 248 } else { 249 l.rle.Reset(bytes.NewReader(data), l.bitWidth) 250 } 251 return nil 252 } 253 254 // Decode decodes the bytes that were set with SetData into the slice of levels 255 // returning the total number of levels that were decoded and the number of 256 // values which had a level equal to the max level, indicating how many physical 257 // values exist to be read. 258 func (l *LevelDecoder) Decode(levels []int16) (int, int64) { 259 var ( 260 buf [1024]uint64 261 totaldecoded int 262 decoded int 263 valsToRead int64 264 ) 265 266 n := shared_utils.Min(int64(l.remaining), int64(len(levels))) 267 for n > 0 { 268 batch := shared_utils.Min(1024, n) 269 switch l.encoding { 270 case format.Encoding_RLE: 271 decoded = l.rle.GetBatch(buf[:batch]) 272 case format.Encoding_BIT_PACKED: 273 decoded, _ = l.bit.GetBatch(uint(l.bitWidth), buf[:batch]) 274 } 275 l.remaining -= decoded 276 totaldecoded += decoded 277 n -= batch 278 279 for idx, val := range buf[:decoded] { 280 lvl := int16(val) 281 levels[idx] = lvl 282 if lvl == l.maxLvl { 283 valsToRead++ 284 } 285 } 286 levels = levels[decoded:] 287 } 288 289 return totaldecoded, valsToRead 290 }