github.com/fraugster/parquet-go@v0.12.0/deltabp_encoder.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"fmt"
     7  	"io"
     8  	"math"
     9  	"math/bits"
    10  )
    11  
    12  type deltaBitPackEncoder32 struct {
    13  	deltas   []int32
    14  	bitWidth []uint8
    15  	packed   [][]byte
    16  	w        io.Writer
    17  
    18  	// this value should be there before the init
    19  	blockSize      int // Must be multiple of 128
    20  	miniBlockCount int // blockSize % miniBlockCount should be 0
    21  
    22  	miniBlockValueCount int
    23  
    24  	valuesCount int
    25  	buffer      *bytes.Buffer
    26  
    27  	firstValue    int32 // the first value to write
    28  	minDelta      int32
    29  	previousValue int32
    30  }
    31  
    32  func (d *deltaBitPackEncoder32) init(w io.Writer) error {
    33  	d.w = w
    34  
    35  	if d.blockSize%128 != 0 || d.blockSize <= 0 {
    36  		return fmt.Errorf("invalid block size, it should be multiple of 128, it is %d", d.blockSize)
    37  	}
    38  
    39  	if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 {
    40  		return fmt.Errorf("invalid mini block count, it is %d", d.miniBlockCount)
    41  	}
    42  
    43  	d.miniBlockValueCount = d.blockSize / d.miniBlockCount
    44  	if d.miniBlockValueCount%8 != 0 {
    45  		return fmt.Errorf("invalid mini block count, the mini block value count should be multiple of 8, it is %d", d.miniBlockCount)
    46  	}
    47  
    48  	d.firstValue = 0
    49  	d.valuesCount = 0
    50  	d.minDelta = math.MaxInt32
    51  	d.deltas = make([]int32, 0, d.blockSize)
    52  	d.previousValue = 0
    53  	d.buffer = &bytes.Buffer{}
    54  	d.bitWidth = make([]uint8, 0, d.miniBlockCount)
    55  	return nil
    56  }
    57  
    58  func (d *deltaBitPackEncoder32) flush() error {
    59  	// Technically, based on the spec after this step all values are positive, but NO, it's not. the problem is when
    60  	// the min delta is small enough (lets say MinInt) and one of deltas are MaxInt, the the result of MaxInt-MinInt is
    61  	// -1, get the idea, there is a lot of numbers here because of overflow can produce negative value
    62  	for i := range d.deltas {
    63  		d.deltas[i] -= d.minDelta
    64  	}
    65  
    66  	if err := writeVariant(d.buffer, int64(d.minDelta)); err != nil {
    67  		return err
    68  	}
    69  
    70  	d.bitWidth = d.bitWidth[:0] //reset the bitWidth buffer
    71  	d.packed = d.packed[:0]
    72  	for i := 0; i < len(d.deltas); i += d.miniBlockValueCount {
    73  		end := i + d.miniBlockValueCount
    74  		if end >= len(d.deltas) {
    75  			end = len(d.deltas)
    76  		}
    77  		// The cast to uint32 here, is the key. or the max not works at all
    78  		max := uint32(d.deltas[i])
    79  		buf := make([][8]int32, d.miniBlockValueCount/8)
    80  		for j := i; j < end; j++ {
    81  			if max < uint32(d.deltas[j]) {
    82  				max = uint32(d.deltas[j])
    83  			}
    84  			t := j - i
    85  			buf[t/8][t%8] = d.deltas[j]
    86  		}
    87  		bw := bits.Len32(max)
    88  		d.bitWidth = append(d.bitWidth, uint8(bw))
    89  
    90  		data := make([]byte, 0, bw*len(buf))
    91  		packer := pack8Int32FuncByWidth[bw]
    92  		for j := range buf {
    93  			data = append(data, packer(buf[j])...)
    94  		}
    95  		d.packed = append(d.packed, data)
    96  	}
    97  
    98  	for len(d.bitWidth) < d.miniBlockCount {
    99  		d.bitWidth = append(d.bitWidth, 0)
   100  	}
   101  
   102  	if err := binary.Write(d.buffer, binary.LittleEndian, d.bitWidth); err != nil {
   103  		return err
   104  	}
   105  
   106  	for i := range d.packed {
   107  		if err := writeFull(d.buffer, d.packed[i]); err != nil {
   108  			return err
   109  		}
   110  	}
   111  	d.minDelta = math.MaxInt32
   112  	d.deltas = d.deltas[:0]
   113  
   114  	return nil
   115  }
   116  
   117  func (d *deltaBitPackEncoder32) addInt32(i int32) error {
   118  	d.valuesCount++
   119  	if d.valuesCount == 1 {
   120  		d.firstValue = i
   121  		d.previousValue = i
   122  		return nil
   123  	}
   124  
   125  	delta := i - d.previousValue
   126  	d.previousValue = i
   127  	d.deltas = append(d.deltas, delta)
   128  	if delta < d.minDelta {
   129  		d.minDelta = delta
   130  	}
   131  
   132  	if len(d.deltas) == d.blockSize {
   133  		// flush
   134  		return d.flush()
   135  	}
   136  
   137  	return nil
   138  }
   139  
   140  func (d *deltaBitPackEncoder32) write() error {
   141  	if d.valuesCount == 1 || len(d.deltas) > 0 {
   142  		if err := d.flush(); err != nil {
   143  			return err
   144  		}
   145  	}
   146  
   147  	if err := writeUVariant(d.w, uint64(d.blockSize)); err != nil {
   148  		return err
   149  	}
   150  
   151  	if err := writeUVariant(d.w, uint64(d.miniBlockCount)); err != nil {
   152  		return err
   153  	}
   154  
   155  	if err := writeUVariant(d.w, uint64(d.valuesCount)); err != nil {
   156  		return err
   157  	}
   158  
   159  	if err := writeVariant(d.w, int64(d.firstValue)); err != nil {
   160  		return err
   161  	}
   162  
   163  	return writeFull(d.w, d.buffer.Bytes())
   164  }
   165  
   166  func (d *deltaBitPackEncoder32) Close() error {
   167  	return d.write()
   168  }
   169  
   170  type deltaBitPackEncoder64 struct {
   171  	// this value should be there before the init
   172  	blockSize      int // Must be multiple of 128
   173  	miniBlockCount int // blockSize % miniBlockCount should be 0
   174  
   175  	//
   176  	miniBlockValueCount int
   177  
   178  	w io.Writer
   179  
   180  	firstValue    int64 // the first value to write
   181  	valuesCount   int
   182  	minDelta      int64
   183  	deltas        []int64
   184  	previousValue int64
   185  
   186  	buffer   *bytes.Buffer
   187  	bitWidth []uint8
   188  	packed   [][]byte
   189  }
   190  
   191  func (d *deltaBitPackEncoder64) init(w io.Writer) error {
   192  	d.w = w
   193  
   194  	if d.blockSize%128 != 0 || d.blockSize <= 0 {
   195  		return fmt.Errorf("invalid block size, it should be multiple of 128, it is %d", d.blockSize)
   196  	}
   197  
   198  	if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 {
   199  		return fmt.Errorf("invalid mini block count, it is %d", d.miniBlockCount)
   200  	}
   201  
   202  	d.miniBlockValueCount = d.blockSize / d.miniBlockCount
   203  	if d.miniBlockValueCount%8 != 0 {
   204  		return fmt.Errorf("invalid mini block count, the mini block value count should be multiple of 8, it is %d", d.miniBlockCount)
   205  	}
   206  
   207  	d.firstValue = 0
   208  	d.valuesCount = 0
   209  	d.minDelta = math.MaxInt32
   210  	d.deltas = make([]int64, 0, d.blockSize)
   211  	d.previousValue = 0
   212  	d.buffer = &bytes.Buffer{}
   213  	d.bitWidth = make([]uint8, 0, d.miniBlockCount)
   214  	return nil
   215  }
   216  
   217  func (d *deltaBitPackEncoder64) flush() error {
   218  	// Technically, based on the spec after this step all values are positive, but NO, it's not. the problem is when
   219  	// the min delta is small enough (lets say MinInt) and one of deltas are MaxInt, the the result of MaxInt-MinInt is
   220  	// -1, get the idea, there is a lot of numbers here because of overflow can produce negative value
   221  	for i := range d.deltas {
   222  		d.deltas[i] -= d.minDelta
   223  	}
   224  
   225  	if err := writeVariant(d.buffer, d.minDelta); err != nil {
   226  		return err
   227  	}
   228  
   229  	d.bitWidth = d.bitWidth[:0] //reset the bitWidth buffer
   230  	d.packed = d.packed[:0]
   231  	for i := 0; i < len(d.deltas); i += d.miniBlockValueCount {
   232  		end := i + d.miniBlockValueCount
   233  		if end >= len(d.deltas) {
   234  			end = len(d.deltas)
   235  		}
   236  		// The cast to uint64 here, is the key. or the max not works at all
   237  		max := uint64(d.deltas[i])
   238  		buf := make([][8]int64, d.miniBlockValueCount/8)
   239  		for j := i; j < end; j++ {
   240  			if max < uint64(d.deltas[j]) {
   241  				max = uint64(d.deltas[j])
   242  			}
   243  			t := j - i
   244  			buf[t/8][t%8] = d.deltas[j]
   245  		}
   246  		bw := bits.Len64(max)
   247  		d.bitWidth = append(d.bitWidth, uint8(bw))
   248  
   249  		data := make([]byte, 0, bw*len(buf))
   250  		packer := pack8Int64FuncByWidth[bw]
   251  		for j := range buf {
   252  			data = append(data, packer(buf[j])...)
   253  		}
   254  		d.packed = append(d.packed, data)
   255  	}
   256  
   257  	for len(d.bitWidth) < d.miniBlockCount {
   258  		d.bitWidth = append(d.bitWidth, 0)
   259  	}
   260  
   261  	if err := binary.Write(d.buffer, binary.LittleEndian, d.bitWidth); err != nil {
   262  		return err
   263  	}
   264  
   265  	for i := range d.packed {
   266  		if err := writeFull(d.buffer, d.packed[i]); err != nil {
   267  			return err
   268  		}
   269  	}
   270  	d.minDelta = math.MaxInt32
   271  	d.deltas = d.deltas[:0]
   272  
   273  	return nil
   274  }
   275  
   276  func (d *deltaBitPackEncoder64) addInt64(i int64) error {
   277  	d.valuesCount++
   278  	if d.valuesCount == 1 {
   279  		d.firstValue = i
   280  		d.previousValue = i
   281  		return nil
   282  	}
   283  
   284  	delta := i - d.previousValue
   285  	d.previousValue = i
   286  	d.deltas = append(d.deltas, delta)
   287  	if delta < d.minDelta {
   288  		d.minDelta = delta
   289  	}
   290  
   291  	if len(d.deltas) == d.blockSize {
   292  		// flush
   293  		return d.flush()
   294  	}
   295  
   296  	return nil
   297  }
   298  
   299  func (d *deltaBitPackEncoder64) write() error {
   300  	if d.valuesCount == 1 || len(d.deltas) > 0 {
   301  		if err := d.flush(); err != nil {
   302  			return err
   303  		}
   304  	}
   305  
   306  	if err := writeUVariant(d.w, uint64(d.blockSize)); err != nil {
   307  		return err
   308  	}
   309  
   310  	if err := writeUVariant(d.w, uint64(d.miniBlockCount)); err != nil {
   311  		return err
   312  	}
   313  
   314  	if err := writeUVariant(d.w, uint64(d.valuesCount)); err != nil {
   315  		return err
   316  	}
   317  
   318  	if err := writeVariant(d.w, d.firstValue); err != nil {
   319  		return err
   320  	}
   321  
   322  	return writeFull(d.w, d.buffer.Bytes())
   323  }
   324  
   325  func (d *deltaBitPackEncoder64) Close() error {
   326  	return d.write()
   327  }