github.com/fraugster/parquet-go@v0.12.0/deltabp_encoder.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "fmt" 7 "io" 8 "math" 9 "math/bits" 10 ) 11 12 type deltaBitPackEncoder32 struct { 13 deltas []int32 14 bitWidth []uint8 15 packed [][]byte 16 w io.Writer 17 18 // this value should be there before the init 19 blockSize int // Must be multiple of 128 20 miniBlockCount int // blockSize % miniBlockCount should be 0 21 22 miniBlockValueCount int 23 24 valuesCount int 25 buffer *bytes.Buffer 26 27 firstValue int32 // the first value to write 28 minDelta int32 29 previousValue int32 30 } 31 32 func (d *deltaBitPackEncoder32) init(w io.Writer) error { 33 d.w = w 34 35 if d.blockSize%128 != 0 || d.blockSize <= 0 { 36 return fmt.Errorf("invalid block size, it should be multiple of 128, it is %d", d.blockSize) 37 } 38 39 if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 { 40 return fmt.Errorf("invalid mini block count, it is %d", d.miniBlockCount) 41 } 42 43 d.miniBlockValueCount = d.blockSize / d.miniBlockCount 44 if d.miniBlockValueCount%8 != 0 { 45 return fmt.Errorf("invalid mini block count, the mini block value count should be multiple of 8, it is %d", d.miniBlockCount) 46 } 47 48 d.firstValue = 0 49 d.valuesCount = 0 50 d.minDelta = math.MaxInt32 51 d.deltas = make([]int32, 0, d.blockSize) 52 d.previousValue = 0 53 d.buffer = &bytes.Buffer{} 54 d.bitWidth = make([]uint8, 0, d.miniBlockCount) 55 return nil 56 } 57 58 func (d *deltaBitPackEncoder32) flush() error { 59 // Technically, based on the spec after this step all values are positive, but NO, it's not. the problem is when 60 // the min delta is small enough (lets say MinInt) and one of deltas are MaxInt, the the result of MaxInt-MinInt is 61 // -1, get the idea, there is a lot of numbers here because of overflow can produce negative value 62 for i := range d.deltas { 63 d.deltas[i] -= d.minDelta 64 } 65 66 if err := writeVariant(d.buffer, int64(d.minDelta)); err != nil { 67 return err 68 } 69 70 d.bitWidth = d.bitWidth[:0] //reset the bitWidth buffer 71 d.packed = d.packed[:0] 72 for i := 0; i < len(d.deltas); i += d.miniBlockValueCount { 73 end := i + d.miniBlockValueCount 74 if end >= len(d.deltas) { 75 end = len(d.deltas) 76 } 77 // The cast to uint32 here, is the key. or the max not works at all 78 max := uint32(d.deltas[i]) 79 buf := make([][8]int32, d.miniBlockValueCount/8) 80 for j := i; j < end; j++ { 81 if max < uint32(d.deltas[j]) { 82 max = uint32(d.deltas[j]) 83 } 84 t := j - i 85 buf[t/8][t%8] = d.deltas[j] 86 } 87 bw := bits.Len32(max) 88 d.bitWidth = append(d.bitWidth, uint8(bw)) 89 90 data := make([]byte, 0, bw*len(buf)) 91 packer := pack8Int32FuncByWidth[bw] 92 for j := range buf { 93 data = append(data, packer(buf[j])...) 94 } 95 d.packed = append(d.packed, data) 96 } 97 98 for len(d.bitWidth) < d.miniBlockCount { 99 d.bitWidth = append(d.bitWidth, 0) 100 } 101 102 if err := binary.Write(d.buffer, binary.LittleEndian, d.bitWidth); err != nil { 103 return err 104 } 105 106 for i := range d.packed { 107 if err := writeFull(d.buffer, d.packed[i]); err != nil { 108 return err 109 } 110 } 111 d.minDelta = math.MaxInt32 112 d.deltas = d.deltas[:0] 113 114 return nil 115 } 116 117 func (d *deltaBitPackEncoder32) addInt32(i int32) error { 118 d.valuesCount++ 119 if d.valuesCount == 1 { 120 d.firstValue = i 121 d.previousValue = i 122 return nil 123 } 124 125 delta := i - d.previousValue 126 d.previousValue = i 127 d.deltas = append(d.deltas, delta) 128 if delta < d.minDelta { 129 d.minDelta = delta 130 } 131 132 if len(d.deltas) == d.blockSize { 133 // flush 134 return d.flush() 135 } 136 137 return nil 138 } 139 140 func (d *deltaBitPackEncoder32) write() error { 141 if d.valuesCount == 1 || len(d.deltas) > 0 { 142 if err := d.flush(); err != nil { 143 return err 144 } 145 } 146 147 if err := writeUVariant(d.w, uint64(d.blockSize)); err != nil { 148 return err 149 } 150 151 if err := writeUVariant(d.w, uint64(d.miniBlockCount)); err != nil { 152 return err 153 } 154 155 if err := writeUVariant(d.w, uint64(d.valuesCount)); err != nil { 156 return err 157 } 158 159 if err := writeVariant(d.w, int64(d.firstValue)); err != nil { 160 return err 161 } 162 163 return writeFull(d.w, d.buffer.Bytes()) 164 } 165 166 func (d *deltaBitPackEncoder32) Close() error { 167 return d.write() 168 } 169 170 type deltaBitPackEncoder64 struct { 171 // this value should be there before the init 172 blockSize int // Must be multiple of 128 173 miniBlockCount int // blockSize % miniBlockCount should be 0 174 175 // 176 miniBlockValueCount int 177 178 w io.Writer 179 180 firstValue int64 // the first value to write 181 valuesCount int 182 minDelta int64 183 deltas []int64 184 previousValue int64 185 186 buffer *bytes.Buffer 187 bitWidth []uint8 188 packed [][]byte 189 } 190 191 func (d *deltaBitPackEncoder64) init(w io.Writer) error { 192 d.w = w 193 194 if d.blockSize%128 != 0 || d.blockSize <= 0 { 195 return fmt.Errorf("invalid block size, it should be multiple of 128, it is %d", d.blockSize) 196 } 197 198 if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 { 199 return fmt.Errorf("invalid mini block count, it is %d", d.miniBlockCount) 200 } 201 202 d.miniBlockValueCount = d.blockSize / d.miniBlockCount 203 if d.miniBlockValueCount%8 != 0 { 204 return fmt.Errorf("invalid mini block count, the mini block value count should be multiple of 8, it is %d", d.miniBlockCount) 205 } 206 207 d.firstValue = 0 208 d.valuesCount = 0 209 d.minDelta = math.MaxInt32 210 d.deltas = make([]int64, 0, d.blockSize) 211 d.previousValue = 0 212 d.buffer = &bytes.Buffer{} 213 d.bitWidth = make([]uint8, 0, d.miniBlockCount) 214 return nil 215 } 216 217 func (d *deltaBitPackEncoder64) flush() error { 218 // Technically, based on the spec after this step all values are positive, but NO, it's not. the problem is when 219 // the min delta is small enough (lets say MinInt) and one of deltas are MaxInt, the the result of MaxInt-MinInt is 220 // -1, get the idea, there is a lot of numbers here because of overflow can produce negative value 221 for i := range d.deltas { 222 d.deltas[i] -= d.minDelta 223 } 224 225 if err := writeVariant(d.buffer, d.minDelta); err != nil { 226 return err 227 } 228 229 d.bitWidth = d.bitWidth[:0] //reset the bitWidth buffer 230 d.packed = d.packed[:0] 231 for i := 0; i < len(d.deltas); i += d.miniBlockValueCount { 232 end := i + d.miniBlockValueCount 233 if end >= len(d.deltas) { 234 end = len(d.deltas) 235 } 236 // The cast to uint64 here, is the key. or the max not works at all 237 max := uint64(d.deltas[i]) 238 buf := make([][8]int64, d.miniBlockValueCount/8) 239 for j := i; j < end; j++ { 240 if max < uint64(d.deltas[j]) { 241 max = uint64(d.deltas[j]) 242 } 243 t := j - i 244 buf[t/8][t%8] = d.deltas[j] 245 } 246 bw := bits.Len64(max) 247 d.bitWidth = append(d.bitWidth, uint8(bw)) 248 249 data := make([]byte, 0, bw*len(buf)) 250 packer := pack8Int64FuncByWidth[bw] 251 for j := range buf { 252 data = append(data, packer(buf[j])...) 253 } 254 d.packed = append(d.packed, data) 255 } 256 257 for len(d.bitWidth) < d.miniBlockCount { 258 d.bitWidth = append(d.bitWidth, 0) 259 } 260 261 if err := binary.Write(d.buffer, binary.LittleEndian, d.bitWidth); err != nil { 262 return err 263 } 264 265 for i := range d.packed { 266 if err := writeFull(d.buffer, d.packed[i]); err != nil { 267 return err 268 } 269 } 270 d.minDelta = math.MaxInt32 271 d.deltas = d.deltas[:0] 272 273 return nil 274 } 275 276 func (d *deltaBitPackEncoder64) addInt64(i int64) error { 277 d.valuesCount++ 278 if d.valuesCount == 1 { 279 d.firstValue = i 280 d.previousValue = i 281 return nil 282 } 283 284 delta := i - d.previousValue 285 d.previousValue = i 286 d.deltas = append(d.deltas, delta) 287 if delta < d.minDelta { 288 d.minDelta = delta 289 } 290 291 if len(d.deltas) == d.blockSize { 292 // flush 293 return d.flush() 294 } 295 296 return nil 297 } 298 299 func (d *deltaBitPackEncoder64) write() error { 300 if d.valuesCount == 1 || len(d.deltas) > 0 { 301 if err := d.flush(); err != nil { 302 return err 303 } 304 } 305 306 if err := writeUVariant(d.w, uint64(d.blockSize)); err != nil { 307 return err 308 } 309 310 if err := writeUVariant(d.w, uint64(d.miniBlockCount)); err != nil { 311 return err 312 } 313 314 if err := writeUVariant(d.w, uint64(d.valuesCount)); err != nil { 315 return err 316 } 317 318 if err := writeVariant(d.w, d.firstValue); err != nil { 319 return err 320 } 321 322 return writeFull(d.w, d.buffer.Bytes()) 323 } 324 325 func (d *deltaBitPackEncoder64) Close() error { 326 return d.write() 327 }