github.com/fraugster/parquet-go@v0.12.0/hybrid_decoder.go (about)

     1  package goparquet
     2  
     3  // This file is based on the code from https://github.com/kostya-sh/parquet-go
     4  // Copyright (c) 2015 Konstantin Shaposhnikov
     5  
     6  import (
     7  	"bytes"
     8  	"encoding/binary"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"io/ioutil"
    13  	"math/bits"
    14  )
    15  
    16  type decoder interface {
    17  	next() (int32, error)
    18  
    19  	init(io.Reader) error
    20  	initSize(io.Reader) error
    21  }
    22  
    23  type levelDecoder interface {
    24  	decoder
    25  
    26  	maxLevel() uint16
    27  }
    28  
    29  type hybridDecoder struct {
    30  	r io.Reader
    31  
    32  	bitWidth     int
    33  	unpackerFn   unpack8int32Func
    34  	rleValueSize int
    35  
    36  	bpRun [8]int32
    37  
    38  	rleCount uint32
    39  	rleValue int32
    40  
    41  	bpCount  uint32
    42  	bpRunPos uint8
    43  
    44  	buffered bool
    45  }
    46  
    47  func newHybridDecoder(bitWidth int) *hybridDecoder {
    48  	return &hybridDecoder{
    49  		bitWidth:   bitWidth,
    50  		unpackerFn: unpack8Int32FuncByWidth[bitWidth],
    51  
    52  		rleValueSize: (bitWidth + 7) / 8,
    53  	}
    54  }
    55  
    56  func (hd *hybridDecoder) initSize(r io.Reader) error {
    57  	if hd.bitWidth == 0 {
    58  		return nil
    59  	}
    60  	var size uint32
    61  	if err := binary.Read(r, binary.LittleEndian, &size); err != nil {
    62  		return err
    63  	}
    64  	reader := io.LimitReader(r, int64(size))
    65  	return hd.init(reader)
    66  }
    67  
    68  func (hd *hybridDecoder) init(r io.Reader) error {
    69  	if hd.buffered {
    70  		buf, err := ioutil.ReadAll(r)
    71  		if err != nil {
    72  			return err
    73  		}
    74  		hd.r = bytes.NewReader(buf)
    75  	} else {
    76  		hd.r = r
    77  	}
    78  	return nil
    79  }
    80  
    81  func (hd *hybridDecoder) next() (next int32, err error) {
    82  	// when the bit width is zero, it means we can only have infinite zero.
    83  	if hd.bitWidth == 0 {
    84  		return 0, nil
    85  	}
    86  	if hd.r == nil {
    87  		return 0, errors.New("reader is not initialized")
    88  	}
    89  	if hd.rleCount == 0 && hd.bpCount == 0 && hd.bpRunPos == 0 {
    90  		if err = hd.readRunHeader(); err != nil {
    91  			return 0, err
    92  		}
    93  	}
    94  
    95  	switch {
    96  	case hd.rleCount > 0:
    97  		next = hd.rleValue
    98  		hd.rleCount--
    99  	case hd.bpCount > 0 || hd.bpRunPos > 0:
   100  		if hd.bpRunPos == 0 {
   101  			if err = hd.readBitPackedRun(); err != nil {
   102  				return 0, err
   103  			}
   104  			hd.bpCount--
   105  		}
   106  		next = hd.bpRun[hd.bpRunPos]
   107  		hd.bpRunPos = (hd.bpRunPos + 1) % 8
   108  	default:
   109  		return 0, io.EOF
   110  	}
   111  
   112  	return next, err
   113  }
   114  
   115  func (hd *hybridDecoder) readRLERunValue() error {
   116  	v := make([]byte, hd.rleValueSize)
   117  	n, err := hd.r.Read(v)
   118  	if err != nil {
   119  		return err
   120  	}
   121  	if n != hd.rleValueSize {
   122  		return io.ErrUnexpectedEOF
   123  	}
   124  
   125  	hd.rleValue = decodeRLEValue(v)
   126  	if bits.LeadingZeros32(uint32(hd.rleValue)) < 32-hd.bitWidth {
   127  		return errors.New("rle: RLE run value is too large")
   128  	}
   129  	return nil
   130  }
   131  
   132  func (hd *hybridDecoder) readBitPackedRun() error {
   133  	data := make([]byte, hd.bitWidth)
   134  	_, err := hd.r.Read(data)
   135  	if err != nil {
   136  		return err
   137  	}
   138  	hd.bpRun = hd.unpackerFn(data)
   139  	return nil
   140  }
   141  
   142  func (hd *hybridDecoder) readRunHeader() error {
   143  	h, err := readUVariant32(hd.r)
   144  	if err != nil {
   145  		// this error could be EOF which is ok by this implementation the only issue is the binary.ReadUVariant can not
   146  		// return UnexpectedEOF is there is some bit read from the stream with no luck, it always return EOF
   147  		return err
   148  	}
   149  
   150  	// The lower bit indicate if this is bitpack or rle
   151  	if h&1 == 1 {
   152  		hd.bpCount = uint32(h >> 1)
   153  		if hd.bpCount == 0 {
   154  			return fmt.Errorf("rle: empty bit-packed run")
   155  		}
   156  		hd.bpRunPos = 0
   157  	} else {
   158  		hd.rleCount = uint32(h >> 1)
   159  		if hd.rleCount == 0 {
   160  			return fmt.Errorf("rle: empty RLE run")
   161  		}
   162  		return hd.readRLERunValue()
   163  	}
   164  	return nil
   165  }