github.com/fraugster/parquet-go@v0.12.0/helpers.go

package goparquet

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"hash/fnv"
	"io"
	"math"
	"math/bits"

	"github.com/apache/thrift/lib/go/thrift"
)

// DefaultHashFunc is used to generate a hash value to detect and handle duplicate values.
// The function has to return any type that can be used as a map key. In particular, the
// result cannot be a slice. The default implementation uses the fnv hash function as
// implemented in Go's standard library.
var DefaultHashFunc func([]byte) interface{}

func init() {
	DefaultHashFunc = fnvHashFunc
}
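// The example below is an illustrative sketch, not part of the upstream package:
// it shows how a consumer of this library could swap in its own hash function,
// assuming only that the returned value is comparable (and therefore usable as a
// map key). crc64 from the standard library stands in for fnv here:
//
//	import (
//		"hash/crc64"
//
//		goparquet "github.com/fraugster/parquet-go"
//	)
//
//	func init() {
//		table := crc64.MakeTable(crc64.ISO)
//		goparquet.DefaultHashFunc = func(in []byte) interface{} {
//			return crc64.Checksum(in, table)
//		}
//	}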
type byteReader struct {
	io.Reader
}

func (br *byteReader) ReadByte() (byte, error) {
	buf := make([]byte, 1)
	if _, err := io.ReadFull(br.Reader, buf); err != nil {
		return 0, err
	}

	return buf[0], nil
}

type offsetReader struct {
	inner  io.ReadSeeker
	offset int64
	count  int64
}

func (o *offsetReader) Read(p []byte) (int, error) {
	n, err := o.inner.Read(p)
	o.offset += int64(n)
	o.count += int64(n)
	return n, err
}

func (o *offsetReader) Seek(offset int64, whence int) (int64, error) {
	i, err := o.inner.Seek(offset, whence)
	if err == nil {
		o.count += i - o.offset
		o.offset = i
	}

	return i, err
}

func (o *offsetReader) Count() int64 {
	return o.count
}

// decodeRLEValue decodes a little-endian int32 from up to 4 bytes.
func decodeRLEValue(bytes []byte) int32 {
	switch len(bytes) {
	case 0:
		return 0
	case 1:
		return int32(bytes[0])
	case 2:
		return int32(bytes[0]) + int32(bytes[1])<<8
	case 3:
		return int32(bytes[0]) + int32(bytes[1])<<8 + int32(bytes[2])<<16
	case 4:
		return int32(bytes[0]) + int32(bytes[1])<<8 + int32(bytes[2])<<16 + int32(bytes[3])<<24
	default:
		panic("invalid argument")
	}
}

func writeFull(w io.Writer, buf []byte) error {
	if len(buf) == 0 {
		return nil
	}
	cnt, err := w.Write(buf)
	if err != nil {
		return err
	}

	if cnt != len(buf) {
		return fmt.Errorf("need to write %d bytes, wrote %d", len(buf), cnt)
	}

	return nil
}

type thriftReader interface {
	Read(context.Context, thrift.TProtocol) error
}

func readThrift(ctx context.Context, tr thriftReader, r io.Reader) error {
	// Make sure we are not using any kind of buffered reader here: a bufio.Reader
	// may read more data ahead of time than requested, which is a problem for this library.
	transport := &thrift.StreamTransport{Reader: r}
	proto := thrift.NewTCompactProtocolConf(transport, &thrift.TConfiguration{})
	return tr.Read(ctx, proto)
}

type thriftWriter interface {
	Write(context.Context, thrift.TProtocol) error
}

func writeThrift(ctx context.Context, tr thriftWriter, w io.Writer) error {
	transport := &thrift.StreamTransport{Writer: w}
	proto := thrift.NewTCompactProtocolConf(transport, &thrift.TConfiguration{})
	return tr.Write(ctx, proto)
}

func decodeInt32(d decoder, data []int32) error {
	for i := range data {
		u, err := d.next()
		if err != nil {
			return err
		}
		data[i] = u
	}

	return nil
}

func decodePackedArray(d levelDecoder, count int) (*packedArray, int, error) {
	ret := &packedArray{}
	ret.reset(bits.Len16(d.maxLevel()))
	nn := 0 // count of non-null values; only meaningful for definition levels
	for i := 0; i < count; i++ {
		u, err := d.next()
		if err != nil {
			return nil, 0, err
		}
		ret.appendSingle(u)
		if u == int32(d.maxLevel()) {
			nn++
		}
	}

	return ret, nn, nil
}

func readUVariant32(r io.Reader) (int32, error) {
	b, ok := r.(io.ByteReader)
	if !ok {
		b = &byteReader{Reader: r}
	}

	i, err := binary.ReadUvarint(b)
	if err != nil {
		return 0, err
	}

	if i > math.MaxInt32 {
		return 0, errors.New("int32 out of range")
	}

	return int32(i), nil
}

func readVariant32(r io.Reader) (int32, error) {
	b, ok := r.(io.ByteReader)
	if !ok {
		b = &byteReader{Reader: r}
	}

	i, err := binary.ReadVarint(b)
	if err != nil {
		return 0, err
	}

	if i > math.MaxInt32 || i < math.MinInt32 {
		return 0, errors.New("int32 out of range")
	}

	return int32(i), nil
}

func writeVariant(w io.Writer, in int64) error {
	buf := make([]byte, 12)
	n := binary.PutVarint(buf, in)

	return writeFull(w, buf[:n])
}

func writeUVariant(w io.Writer, in uint64) error {
	buf := make([]byte, 12)
	n := binary.PutUvarint(buf, in)

	return writeFull(w, buf[:n])
}

func readVariant64(r io.Reader) (int64, error) {
	b, ok := r.(io.ByteReader)
	if !ok {
		b = &byteReader{Reader: r}
	}

	return binary.ReadVarint(b)
}
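// Illustrative sketch, not part of the upstream code: the varint helpers above
// round-trip through any io.Writer/io.Reader pair, for example a bytes.Buffer
// (which also satisfies io.ByteReader, so no extra wrapping is needed):
//
//	var buf bytes.Buffer
//	if err := writeUVariant(&buf, 300); err != nil {
//		// handle error
//	}
//	v, err := readUVariant32(&buf) // v == 300, err == nil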
type constDecoder int32

func (cd constDecoder) initSize(io.Reader) error {
	return nil
}

func (cd constDecoder) init(io.Reader) error {
	return nil
}

func (cd constDecoder) next() (int32, error) {
	return int32(cd), nil
}

type levelDecoderWrapper struct {
	decoder
	max uint16
}

func (l *levelDecoderWrapper) maxLevel() uint16 {
	return l.max
}

// prefix returns the length of the common prefix of b1 and b2.
func prefix(b1, b2 []byte) int {
	l := len(b1)
	if l2 := len(b2); l > l2 {
		l = l2
	}
	for i := 0; i < l; i++ {
		if b1[i] != b2[i] {
			return i
		}
	}

	return l
}

func encodeValue(w io.Writer, enc valuesEncoder, all []interface{}) error {
	if err := enc.init(w); err != nil {
		return err
	}

	if err := enc.encodeValues(all); err != nil {
		return err
	}

	return enc.Close()
}

// In data page V1 the RLE stream for repetition/definition levels carries its size
// inside the stream, whereas in V2 the size is stored in the page header, not in the stream.
func encodeLevelsV1(w io.Writer, max uint16, values *packedArray) error {
	rle := newHybridEncoder(bits.Len16(max))
	if err := rle.initSize(w); err != nil {
		return fmt.Errorf("level writer initialize with size failed: %w", err)
	}
	if err := rle.encodePacked(values); err != nil {
		return fmt.Errorf("level writer encode values failed: %w", err)
	}

	if err := rle.Close(); err != nil {
		return fmt.Errorf("level writer flush failed: %w", err)
	}

	return nil
}

func encodeLevelsV2(w io.Writer, max uint16, values *packedArray) error {
	rle := newHybridEncoder(bits.Len16(max))
	if err := rle.init(w); err != nil {
		return fmt.Errorf("level writer initialize failed: %w", err)
	}
	if err := rle.encodePacked(values); err != nil {
		return fmt.Errorf("level writer encode values failed: %w", err)
	}

	if err := rle.Close(); err != nil {
		return fmt.Errorf("level writer flush failed: %w", err)
	}

	return nil
}

func mapKey(a interface{}) interface{} {
	switch v := a.(type) {
	case int, int32, int64, string, bool:
		return a
	case float64:
		return math.Float64bits(v)
	case float32:
		return math.Float32bits(v)
	case []byte:
		return DefaultHashFunc(v)
	case [12]byte:
		return DefaultHashFunc(v[:])
	default:
		panic("not supported type")
	}
}

func fnvHashFunc(in []byte) interface{} {
	hash := fnv.New64()
	if err := writeFull(hash, in); err != nil {
		panic(err)
	}
	return hash.Sum64()
}

type writePos interface {
	io.Writer
	Pos() int64
}

type writePosStruct struct {
	w   io.Writer
	pos int64
}

func (w *writePosStruct) Write(p []byte) (n int, err error) {
	n, err = w.w.Write(p)
	w.pos += int64(n)
	return n, err
}

func (w *writePosStruct) Pos() int64 {
	return w.pos
}