github.com/fraugster/parquet-go@v0.12.0/hybrid_decoder.go (about) 1 package goparquet 2 3 // This file is based on the code from https://github.com/kostya-sh/parquet-go 4 // Copyright (c) 2015 Konstantin Shaposhnikov 5 6 import ( 7 "bytes" 8 "encoding/binary" 9 "errors" 10 "fmt" 11 "io" 12 "io/ioutil" 13 "math/bits" 14 ) 15 16 type decoder interface { 17 next() (int32, error) 18 19 init(io.Reader) error 20 initSize(io.Reader) error 21 } 22 23 type levelDecoder interface { 24 decoder 25 26 maxLevel() uint16 27 } 28 29 type hybridDecoder struct { 30 r io.Reader 31 32 bitWidth int 33 unpackerFn unpack8int32Func 34 rleValueSize int 35 36 bpRun [8]int32 37 38 rleCount uint32 39 rleValue int32 40 41 bpCount uint32 42 bpRunPos uint8 43 44 buffered bool 45 } 46 47 func newHybridDecoder(bitWidth int) *hybridDecoder { 48 return &hybridDecoder{ 49 bitWidth: bitWidth, 50 unpackerFn: unpack8Int32FuncByWidth[bitWidth], 51 52 rleValueSize: (bitWidth + 7) / 8, 53 } 54 } 55 56 func (hd *hybridDecoder) initSize(r io.Reader) error { 57 if hd.bitWidth == 0 { 58 return nil 59 } 60 var size uint32 61 if err := binary.Read(r, binary.LittleEndian, &size); err != nil { 62 return err 63 } 64 reader := io.LimitReader(r, int64(size)) 65 return hd.init(reader) 66 } 67 68 func (hd *hybridDecoder) init(r io.Reader) error { 69 if hd.buffered { 70 buf, err := ioutil.ReadAll(r) 71 if err != nil { 72 return err 73 } 74 hd.r = bytes.NewReader(buf) 75 } else { 76 hd.r = r 77 } 78 return nil 79 } 80 81 func (hd *hybridDecoder) next() (next int32, err error) { 82 // when the bit width is zero, it means we can only have infinite zero. 83 if hd.bitWidth == 0 { 84 return 0, nil 85 } 86 if hd.r == nil { 87 return 0, errors.New("reader is not initialized") 88 } 89 if hd.rleCount == 0 && hd.bpCount == 0 && hd.bpRunPos == 0 { 90 if err = hd.readRunHeader(); err != nil { 91 return 0, err 92 } 93 } 94 95 switch { 96 case hd.rleCount > 0: 97 next = hd.rleValue 98 hd.rleCount-- 99 case hd.bpCount > 0 || hd.bpRunPos > 0: 100 if hd.bpRunPos == 0 { 101 if err = hd.readBitPackedRun(); err != nil { 102 return 0, err 103 } 104 hd.bpCount-- 105 } 106 next = hd.bpRun[hd.bpRunPos] 107 hd.bpRunPos = (hd.bpRunPos + 1) % 8 108 default: 109 return 0, io.EOF 110 } 111 112 return next, err 113 } 114 115 func (hd *hybridDecoder) readRLERunValue() error { 116 v := make([]byte, hd.rleValueSize) 117 n, err := hd.r.Read(v) 118 if err != nil { 119 return err 120 } 121 if n != hd.rleValueSize { 122 return io.ErrUnexpectedEOF 123 } 124 125 hd.rleValue = decodeRLEValue(v) 126 if bits.LeadingZeros32(uint32(hd.rleValue)) < 32-hd.bitWidth { 127 return errors.New("rle: RLE run value is too large") 128 } 129 return nil 130 } 131 132 func (hd *hybridDecoder) readBitPackedRun() error { 133 data := make([]byte, hd.bitWidth) 134 _, err := hd.r.Read(data) 135 if err != nil { 136 return err 137 } 138 hd.bpRun = hd.unpackerFn(data) 139 return nil 140 } 141 142 func (hd *hybridDecoder) readRunHeader() error { 143 h, err := readUVariant32(hd.r) 144 if err != nil { 145 // this error could be EOF which is ok by this implementation the only issue is the binary.ReadUVariant can not 146 // return UnexpectedEOF is there is some bit read from the stream with no luck, it always return EOF 147 return err 148 } 149 150 // The lower bit indicate if this is bitpack or rle 151 if h&1 == 1 { 152 hd.bpCount = uint32(h >> 1) 153 if hd.bpCount == 0 { 154 return fmt.Errorf("rle: empty bit-packed run") 155 } 156 hd.bpRunPos = 0 157 } else { 158 hd.rleCount = uint32(h >> 1) 159 if hd.rleCount == 0 { 160 return fmt.Errorf("rle: empty RLE run") 161 } 162 return hd.readRLERunValue() 163 } 164 return nil 165 }