github.com/fraugster/parquet-go@v0.12.0/deltabp_decoder.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  )
     8  
     9  // The two decoders below are nearly identical. Without generics there were two options: use interfaces,
    10  // which was my first choice but turned out branchy and full of if/else, or duplicate the code. I went with
    11  // the second option, so these two types are almost copy/pasted.
    12  
    13  type deltaBitPackDecoder32 struct {
    14  	r io.Reader
    15  
    16  	blockSize           int32
    17  	miniBlockCount      int32
    18  	valuesCount         int32
    19  	miniBlockValueCount int32
    20  
    21  	previousValue int32
    22  	minDelta      int32
    23  
    24  	miniBlockBitWidth        []uint8
    25  	currentMiniBlock         int32
    26  	currentMiniBlockBitWidth uint8
    27  	miniBlockPosition        int32 // position inside the current mini block
    28  	position                 int32 // position in the value. since delta may have padding we need to track this
    29  	currentUnpacker          unpack8int32Func
    30  	miniBlockInt32           [8]int32
    31  }
    32  
    33  func (d *deltaBitPackDecoder32) initSize(r io.Reader) error {
    34  	return d.init(r)
    35  }
    36  
    37  func (d *deltaBitPackDecoder32) init(r io.Reader) error {
    38  	d.r = r
    39  
    40  	if err := d.readBlockHeader(); err != nil {
    41  		return err
    42  	}
    43  
    44  	if err := d.readMiniBlockHeader(); err != nil {
    45  		return err
    46  	}
    47  
    48  	return nil
    49  }
    50  
    51  func (d *deltaBitPackDecoder32) readBlockHeader() error {
    52  	var err error
    53  	if d.blockSize, err = readUVariant32(d.r); err != nil {
    54  		return fmt.Errorf("failed to read block size: %w", err)
    55  	}
    56  	if d.blockSize <= 0 && d.blockSize%128 != 0 {
    57  		return errors.New("invalid block size")
    58  	}
    59  
    60  	if d.miniBlockCount, err = readUVariant32(d.r); err != nil {
    61  		return fmt.Errorf("failed to read number of mini blocks: %w", err)
    62  	}
    63  
    64  	if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 {
    65  		return errors.New("int/delta: invalid number of mini blocks")
    66  	}
    67  
    68  	d.miniBlockValueCount = d.blockSize / d.miniBlockCount
    69  	if d.miniBlockValueCount == 0 {
    70  		return fmt.Errorf("invalid mini block value count, it can't be zero")
    71  	}
    72  
    73  	if d.valuesCount, err = readUVariant32(d.r); err != nil {
    74  		return fmt.Errorf("failed to read total value count: %w", err)
    75  	}
    76  
    77  	if d.valuesCount < 0 {
    78  		return errors.New("invalid total value count")
    79  	}
    80  
    81  	if d.previousValue, err = readVariant32(d.r); err != nil {
    82  		return fmt.Errorf("failed to read first value: %w", err)
    83  	}
    84  
    85  	return nil
    86  }
    87  
    88  func (d *deltaBitPackDecoder32) readMiniBlockHeader() error {
    89  	var err error
    90  
    91  	if d.minDelta, err = readVariant32(d.r); err != nil {
    92  		return fmt.Errorf("failed to read min delta: %w", err)
    93  	}
    94  
    95  	// the mini block bitwidth is always there, even if the value is zero
    96  	d.miniBlockBitWidth = make([]uint8, d.miniBlockCount)
    97  	if _, err = io.ReadFull(d.r, d.miniBlockBitWidth); err != nil {
    98  		return fmt.Errorf("not enough data to read all miniblock bit widths: %w", err)
    99  	}
   100  
   101  	for i := range d.miniBlockBitWidth {
   102  		if d.miniBlockBitWidth[i] > 32 {
   103  			return fmt.Errorf("invalid miniblock bit width: %d", d.miniBlockBitWidth[i])
   104  		}
   105  	}
   106  
   107  	// start from the first min block in a big block
   108  	d.currentMiniBlock = 0
   109  
   110  	return nil
   111  }
   112  
   113  func (d *deltaBitPackDecoder32) next() (int32, error) {
   114  	if d.position >= d.valuesCount {
   115  		// No value left in the buffer
   116  		return 0, io.EOF
   117  	}
   118  
   119  	// need new byte?
   120  	if d.position%8 == 0 {
   121  		// do we need to advance a mini block?
   122  		if d.position%d.miniBlockValueCount == 0 {
   123  			// do we need to advance a big block?
   124  			if d.currentMiniBlock >= d.miniBlockCount {
   125  				if err := d.readMiniBlockHeader(); err != nil {
   126  					return 0, err
   127  				}
   128  			}
   129  
   130  			d.currentMiniBlockBitWidth = d.miniBlockBitWidth[d.currentMiniBlock]
   131  			d.currentUnpacker = unpack8Int32FuncByWidth[int(d.currentMiniBlockBitWidth)]
   132  
   133  			d.miniBlockPosition = 0
   134  			d.currentMiniBlock++
   135  		}
   136  
   137  		// read next 8 values
   138  		w := int32(d.currentMiniBlockBitWidth)
   139  		buf := make([]byte, w)
   140  		if _, err := io.ReadFull(d.r, buf); err != nil {
   141  			return 0, err
   142  		}
   143  
   144  		d.miniBlockInt32 = d.currentUnpacker(buf)
   145  		d.miniBlockPosition += w
   146  		// there is padding here, read them all from the reader, first deal with the remaining of the current block,
   147  		// then the next blocks. if the blocks bit width is zero then simply ignore them, but the docs said reader
   148  		// should accept any arbitrary bit width here.
   149  		if d.position+8 >= d.valuesCount {
   150  			//  current block
   151  			l := (d.miniBlockValueCount/8)*w - d.miniBlockPosition
   152  			if l < 0 {
   153  				return 0, errors.New("invalid stream")
   154  			}
   155  			remaining := make([]byte, l)
   156  			_, _ = io.ReadFull(d.r, remaining)
   157  			for i := d.currentMiniBlock; i < d.miniBlockCount; i++ {
   158  				w := int32(d.miniBlockBitWidth[d.currentMiniBlock])
   159  				if w != 0 {
   160  					remaining := make([]byte, (d.miniBlockValueCount/8)*w)
   161  					_, _ = io.ReadFull(d.r, remaining)
   162  				}
   163  			}
   164  		}
   165  	}
   166  
   167  	// value is the previous value + delta stored in the reader and the min delta for the block, also we always read one
   168  	// value ahead
   169  	ret := d.previousValue
   170  	d.previousValue += d.miniBlockInt32[d.position%8] + d.minDelta
   171  	d.position++
   172  
   173  	return ret, nil
   174  }
   175  
   176  type deltaBitPackDecoder64 struct {
   177  	r io.Reader
   178  
   179  	blockSize           int32
   180  	miniBlockCount      int32
   181  	valuesCount         int32
   182  	miniBlockValueCount int32
   183  
   184  	previousValue int64
   185  	minDelta      int64
   186  
   187  	miniBlockBitWidth        []uint8
   188  	currentMiniBlock         int32
   189  	currentMiniBlockBitWidth uint8
   190  	miniBlockPosition        int32 // position inside the current mini block
   191  	position                 int32 // position in the value. since delta may have padding we need to track this
   192  	currentUnpacker          unpack8int64Func
   193  	miniBlockInt64           [8]int64
   194  }
   195  
   196  func (d *deltaBitPackDecoder64) init(r io.Reader) error {
   197  	d.r = r
   198  
   199  	if err := d.readBlockHeader(); err != nil {
   200  		return err
   201  	}
   202  
   203  	if err := d.readMiniBlockHeader(); err != nil {
   204  		return err
   205  	}
   206  
   207  	return nil
   208  }
   209  
   210  func (d *deltaBitPackDecoder64) readBlockHeader() error {
   211  	var err error
   212  	if d.blockSize, err = readUVariant32(d.r); err != nil {
   213  		return fmt.Errorf("failed to read block size: %w", err)
   214  	}
   215  	if d.blockSize <= 0 && d.blockSize%128 != 0 {
   216  		return errors.New("invalid block size")
   217  	}
   218  
   219  	if d.miniBlockCount, err = readUVariant32(d.r); err != nil {
   220  		return fmt.Errorf("failed to read number of mini blocks: %w", err)
   221  	}
   222  
   223  	if d.miniBlockCount <= 0 || d.blockSize%d.miniBlockCount != 0 {
   224  		return errors.New("int/delta: invalid number of mini blocks")
   225  	}
   226  
   227  	d.miniBlockValueCount = d.blockSize / d.miniBlockCount
   228  	if d.miniBlockValueCount == 0 {
   229  		return errors.New("invalid mini block value count, it can't be zero")
   230  	}
   231  
   232  	if d.valuesCount, err = readUVariant32(d.r); err != nil {
   233  		return fmt.Errorf("failed to read total value count: %w", err)
   234  	}
   235  
   236  	if d.valuesCount < 0 {
   237  		return fmt.Errorf("invalid total value count %d", d.valuesCount)
   238  	}
   239  
   240  	if d.previousValue, err = readVariant64(d.r); err != nil {
   241  		return fmt.Errorf("failed to read first value: %w", err)
   242  	}
   243  
   244  	return nil
   245  }
   246  
   247  func (d *deltaBitPackDecoder64) readMiniBlockHeader() error {
   248  	var err error
   249  
   250  	if d.minDelta, err = readVariant64(d.r); err != nil {
   251  		return fmt.Errorf("failed to read min delta: %w", err)
   252  	}
   253  
   254  	// the mini block bitwidth is always there, even if the value is zero
   255  	d.miniBlockBitWidth = make([]uint8, d.miniBlockCount)
   256  	if _, err = io.ReadFull(d.r, d.miniBlockBitWidth); err != nil {
   257  		return fmt.Errorf("not enough data to read all miniblock bit widths: %w", err)
   258  	}
   259  
   260  	for i := range d.miniBlockBitWidth {
   261  		if d.miniBlockBitWidth[i] > 64 {
   262  			return fmt.Errorf("invalid miniblock bit width: %d", d.miniBlockBitWidth[i])
   263  		}
   264  	}
   265  
   266  	// start from the first min block in a big block
   267  	d.currentMiniBlock = 0
   268  
   269  	return nil
   270  }
   271  
   272  func (d *deltaBitPackDecoder64) next() (int64, error) {
   273  	if d.position >= d.valuesCount {
   274  		// No value left in the buffer
   275  		return 0, io.EOF
   276  	}
   277  
   278  	// need new byte?
   279  	if d.position%8 == 0 {
   280  		// do we need to advance a mini block?
   281  		if d.position%d.miniBlockValueCount == 0 {
   282  			// do we need to advance a big block?
   283  			if d.currentMiniBlock >= d.miniBlockCount {
   284  				if err := d.readMiniBlockHeader(); err != nil {
   285  					return 0, err
   286  				}
   287  			}
   288  
   289  			d.currentMiniBlockBitWidth = d.miniBlockBitWidth[d.currentMiniBlock]
   290  			d.currentUnpacker = unpack8Int64FuncByWidth[int(d.currentMiniBlockBitWidth)]
   291  
   292  			d.miniBlockPosition = 0
   293  			d.currentMiniBlock++
   294  		}
   295  
   296  		// read next 8 values
   297  		w := int32(d.currentMiniBlockBitWidth)
   298  		buf := make([]byte, w)
   299  		if _, err := io.ReadFull(d.r, buf); err != nil {
   300  			return 0, err
   301  		}
   302  
   303  		d.miniBlockInt64 = d.currentUnpacker(buf)
   304  		d.miniBlockPosition += w
   305  		// there is padding here, read them all from the reader, first deal with the remaining of the current block,
   306  		// then the next blocks. if the blocks bit width is zero then simply ignore them, but the docs said reader
   307  		// should accept any arbitrary bit width here.
   308  		if d.position+8 >= d.valuesCount {
   309  			//  current block
   310  			sliceLen := (d.miniBlockValueCount/8)*w - d.miniBlockPosition
   311  			if sliceLen < 0 {
   312  				return 0, fmt.Errorf("invalid remaining values, mini block value count = %d, width = %d, mini block position = %d", d.miniBlockValueCount, w, d.miniBlockPosition)
   313  			}
   314  			remaining := make([]byte, sliceLen)
   315  			_, _ = io.ReadFull(d.r, remaining)
   316  			for i := d.currentMiniBlock; i < d.miniBlockCount; i++ {
   317  				w := int32(d.miniBlockBitWidth[d.currentMiniBlock])
   318  				if w != 0 {
   319  					remaining := make([]byte, (d.miniBlockValueCount/8)*w)
   320  					_, _ = io.ReadFull(d.r, remaining)
   321  				}
   322  			}
   323  		}
   324  	}
   325  
   326  	// value is the previous value + delta stored in the reader and the min delta for the block, also we always read one
   327  	// value ahead
   328  	ret := d.previousValue
   329  	d.previousValue += d.miniBlockInt64[d.position%8] + d.minDelta
   330  	d.position++
   331  
   332  	return ret, nil
   333  }