github.com/fraugster/parquet-go@v0.12.0/deltabp_decoder.go (about) 1 package goparquet 2 3 import ( 4 "errors" 5 "fmt" 6 "io" 7 ) 8 9 // The two following decoder are identical, since there is no generic, I had two option, one use the interfaces 10 // which was my first choice but its branchy and full of if and else. so I decided to go for second solution and 11 // almost copy/paste this two types 12 13 type deltaBitPackDecoder32 struct { 14 r io.Reader 15 16 blockSize int32 17 miniBlockCount int32 18 valuesCount int32 19 miniBlockValueCount int32 20 21 previousValue int32 22 minDelta int32 23 24 miniBlockBitWidth []uint8 25 currentMiniBlock int32 26 currentMiniBlockBitWidth uint8 27 miniBlockPosition int32 // position inside the current mini block 28 position int32 // position in the value. since delta may have padding we need to track this 29 currentUnpacker unpack8int32Func 30 miniBlockInt32 [8]int32 31 } 32 33 func (d *deltaBitPackDecoder32) initSize(r io.Reader) error { 34 return d.init(r) 35 } 36 37 func (d *deltaBitPackDecoder32) init(r io.Reader) error { 38 d.r = r 39 40 if err := d.readBlockHeader(); err != nil { 41 return err 42 } 43 44 if err := d.readMiniBlockHeader(); err != nil { 45 return err 46 } 47 48 return nil 49 } 50 51 func (d *deltaBitPackDecoder32) readBlockHeader() error { 52 var err error 53 if d.blockSize, err = readUVariant32(d.r); err != nil { 54 return fmt.Errorf("failed to read block size: %w", err) 55 } 56 if d.blockSize <= 0 && d.blockSize%128 != 0 { 57 return errors.New("invalid block size") 58 } 59 60 if d.miniBlockCount, err = readUVariant32(d.r); err != nil { 61 return fmt.Errorf("failed to read number of mini blocks: %w", err) 62 } 63 64 if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 { 65 return errors.New("int/delta: invalid number of mini blocks") 66 } 67 68 d.miniBlockValueCount = d.blockSize / d.miniBlockCount 69 if d.miniBlockValueCount == 0 { 70 return fmt.Errorf("invalid mini block value count, it can't be zero") 71 } 72 73 if d.valuesCount, err = readUVariant32(d.r); err != nil { 74 return fmt.Errorf("failed to read total value count: %w", err) 75 } 76 77 if d.valuesCount < 0 { 78 return errors.New("invalid total value count") 79 } 80 81 if d.previousValue, err = readVariant32(d.r); err != nil { 82 return fmt.Errorf("failed to read first value: %w", err) 83 } 84 85 return nil 86 } 87 88 func (d *deltaBitPackDecoder32) readMiniBlockHeader() error { 89 var err error 90 91 if d.minDelta, err = readVariant32(d.r); err != nil { 92 return fmt.Errorf("failed to read min delta: %w", err) 93 } 94 95 // the mini block bitwidth is always there, even if the value is zero 96 d.miniBlockBitWidth = make([]uint8, d.miniBlockCount) 97 if _, err = io.ReadFull(d.r, d.miniBlockBitWidth); err != nil { 98 return fmt.Errorf("not enough data to read all miniblock bit widths: %w", err) 99 } 100 101 for i := range d.miniBlockBitWidth { 102 if d.miniBlockBitWidth[i] > 32 { 103 return fmt.Errorf("invalid miniblock bit width: %d", d.miniBlockBitWidth[i]) 104 } 105 } 106 107 // start from the first min block in a big block 108 d.currentMiniBlock = 0 109 110 return nil 111 } 112 113 func (d *deltaBitPackDecoder32) next() (int32, error) { 114 if d.position >= d.valuesCount { 115 // No value left in the buffer 116 return 0, io.EOF 117 } 118 119 // need new byte? 120 if d.position%8 == 0 { 121 // do we need to advance a mini block? 122 if d.position%d.miniBlockValueCount == 0 { 123 // do we need to advance a big block? 124 if d.currentMiniBlock >= d.miniBlockCount { 125 if err := d.readMiniBlockHeader(); err != nil { 126 return 0, err 127 } 128 } 129 130 d.currentMiniBlockBitWidth = d.miniBlockBitWidth[d.currentMiniBlock] 131 d.currentUnpacker = unpack8Int32FuncByWidth[int(d.currentMiniBlockBitWidth)] 132 133 d.miniBlockPosition = 0 134 d.currentMiniBlock++ 135 } 136 137 // read next 8 values 138 w := int32(d.currentMiniBlockBitWidth) 139 buf := make([]byte, w) 140 if _, err := io.ReadFull(d.r, buf); err != nil { 141 return 0, err 142 } 143 144 d.miniBlockInt32 = d.currentUnpacker(buf) 145 d.miniBlockPosition += w 146 // there is padding here, read them all from the reader, first deal with the remaining of the current block, 147 // then the next blocks. if the blocks bit width is zero then simply ignore them, but the docs said reader 148 // should accept any arbitrary bit width here. 149 if d.position+8 >= d.valuesCount { 150 // current block 151 l := (d.miniBlockValueCount/8)*w - d.miniBlockPosition 152 if l < 0 { 153 return 0, errors.New("invalid stream") 154 } 155 remaining := make([]byte, l) 156 _, _ = io.ReadFull(d.r, remaining) 157 for i := d.currentMiniBlock; i < d.miniBlockCount; i++ { 158 w := int32(d.miniBlockBitWidth[d.currentMiniBlock]) 159 if w != 0 { 160 remaining := make([]byte, (d.miniBlockValueCount/8)*w) 161 _, _ = io.ReadFull(d.r, remaining) 162 } 163 } 164 } 165 } 166 167 // value is the previous value + delta stored in the reader and the min delta for the block, also we always read one 168 // value ahead 169 ret := d.previousValue 170 d.previousValue += d.miniBlockInt32[d.position%8] + d.minDelta 171 d.position++ 172 173 return ret, nil 174 } 175 176 type deltaBitPackDecoder64 struct { 177 r io.Reader 178 179 blockSize int32 180 miniBlockCount int32 181 valuesCount int32 182 miniBlockValueCount int32 183 184 previousValue int64 185 minDelta int64 186 187 miniBlockBitWidth []uint8 188 currentMiniBlock int32 189 currentMiniBlockBitWidth uint8 190 miniBlockPosition int32 // position inside the current mini block 191 position int32 // position in the value. since delta may have padding we need to track this 192 currentUnpacker unpack8int64Func 193 miniBlockInt64 [8]int64 194 } 195 196 func (d *deltaBitPackDecoder64) init(r io.Reader) error { 197 d.r = r 198 199 if err := d.readBlockHeader(); err != nil { 200 return err 201 } 202 203 if err := d.readMiniBlockHeader(); err != nil { 204 return err 205 } 206 207 return nil 208 } 209 210 func (d *deltaBitPackDecoder64) readBlockHeader() error { 211 var err error 212 if d.blockSize, err = readUVariant32(d.r); err != nil { 213 return fmt.Errorf("failed to read block size: %w", err) 214 } 215 if d.blockSize <= 0 && d.blockSize%128 != 0 { 216 return errors.New("invalid block size") 217 } 218 219 if d.miniBlockCount, err = readUVariant32(d.r); err != nil { 220 return fmt.Errorf("failed to read number of mini blocks: %w", err) 221 } 222 223 if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 { 224 return errors.New("int/delta: invalid number of mini blocks") 225 } 226 227 d.miniBlockValueCount = d.blockSize / d.miniBlockCount 228 if d.miniBlockValueCount == 0 { 229 return errors.New("invalid mini block value count, it can't be zero") 230 } 231 232 if d.valuesCount, err = readUVariant32(d.r); err != nil { 233 return fmt.Errorf("failed to read total value count: %w", err) 234 } 235 236 if d.valuesCount < 0 { 237 return fmt.Errorf("invalid total value count %d", d.valuesCount) 238 } 239 240 if d.previousValue, err = readVariant64(d.r); err != nil { 241 return fmt.Errorf("failed to read first value: %w", err) 242 } 243 244 return nil 245 } 246 247 func (d *deltaBitPackDecoder64) readMiniBlockHeader() error { 248 var err error 249 250 if d.minDelta, err = readVariant64(d.r); err != nil { 251 return fmt.Errorf("failed to read min delta: %w", err) 252 } 253 254 // the mini block bitwidth is always there, even if the value is zero 255 d.miniBlockBitWidth = make([]uint8, d.miniBlockCount) 256 if _, err = io.ReadFull(d.r, d.miniBlockBitWidth); err != nil { 257 return fmt.Errorf("not enough data to read all miniblock bit widths: %w", err) 258 } 259 260 for i := range d.miniBlockBitWidth { 261 if d.miniBlockBitWidth[i] > 64 { 262 return fmt.Errorf("invalid miniblock bit width: %d", d.miniBlockBitWidth[i]) 263 } 264 } 265 266 // start from the first min block in a big block 267 d.currentMiniBlock = 0 268 269 return nil 270 } 271 272 func (d *deltaBitPackDecoder64) next() (int64, error) { 273 if d.position >= d.valuesCount { 274 // No value left in the buffer 275 return 0, io.EOF 276 } 277 278 // need new byte? 279 if d.position%8 == 0 { 280 // do we need to advance a mini block? 281 if d.position%d.miniBlockValueCount == 0 { 282 // do we need to advance a big block? 283 if d.currentMiniBlock >= d.miniBlockCount { 284 if err := d.readMiniBlockHeader(); err != nil { 285 return 0, err 286 } 287 } 288 289 d.currentMiniBlockBitWidth = d.miniBlockBitWidth[d.currentMiniBlock] 290 d.currentUnpacker = unpack8Int64FuncByWidth[int(d.currentMiniBlockBitWidth)] 291 292 d.miniBlockPosition = 0 293 d.currentMiniBlock++ 294 } 295 296 // read next 8 values 297 w := int32(d.currentMiniBlockBitWidth) 298 buf := make([]byte, w) 299 if _, err := io.ReadFull(d.r, buf); err != nil { 300 return 0, err 301 } 302 303 d.miniBlockInt64 = d.currentUnpacker(buf) 304 d.miniBlockPosition += w 305 // there is padding here, read them all from the reader, first deal with the remaining of the current block, 306 // then the next blocks. if the blocks bit width is zero then simply ignore them, but the docs said reader 307 // should accept any arbitrary bit width here. 308 if d.position+8 >= d.valuesCount { 309 // current block 310 sliceLen := (d.miniBlockValueCount/8)*w - d.miniBlockPosition 311 if sliceLen < 0 { 312 return 0, fmt.Errorf("invalid remaining values, mini block value count = %d, width = %d, mini block position = %d", d.miniBlockValueCount, w, d.miniBlockPosition) 313 } 314 remaining := make([]byte, sliceLen) 315 _, _ = io.ReadFull(d.r, remaining) 316 for i := d.currentMiniBlock; i < d.miniBlockCount; i++ { 317 w := int32(d.miniBlockBitWidth[d.currentMiniBlock]) 318 if w != 0 { 319 remaining := make([]byte, (d.miniBlockValueCount/8)*w) 320 _, _ = io.ReadFull(d.r, remaining) 321 } 322 } 323 } 324 } 325 326 // value is the previous value + delta stored in the reader and the min delta for the block, also we always read one 327 // value ahead 328 ret := d.previousValue 329 d.previousValue += d.miniBlockInt64[d.position%8] + d.minDelta 330 d.position++ 331 332 return ret, nil 333 }