github.com/fraugster/parquet-go@v0.12.0/helpers.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"context"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  	"hash/fnv"
     9  	"io"
    10  	"math"
    11  	"math/bits"
    12  
    13  	"github.com/apache/thrift/lib/go/thrift"
    14  )
    15  
// DefaultHashFunc is used to generate a hash value to detect and handle duplicate values.
// The function has to return a type that can be used as a map key; in particular, the
// result cannot be a slice. The default implementation uses the FNV hash function as
// implemented in Go's standard library.
var DefaultHashFunc func([]byte) interface{}
    21  
// init installs fnvHashFunc as the initial value of DefaultHashFunc.
func init() {
	DefaultHashFunc = fnvHashFunc
}
    25  
    26  type byteReader struct {
    27  	io.Reader
    28  }
    29  
    30  func (br *byteReader) ReadByte() (byte, error) {
    31  	buf := make([]byte, 1)
    32  	if _, err := io.ReadFull(br.Reader, buf); err != nil {
    33  		return 0, err
    34  	}
    35  
    36  	return buf[0], nil
    37  }
    38  
    39  type offsetReader struct {
    40  	inner  io.ReadSeeker
    41  	offset int64
    42  	count  int64
    43  }
    44  
    45  func (o *offsetReader) Read(p []byte) (int, error) {
    46  	n, err := o.inner.Read(p)
    47  	o.offset += int64(n)
    48  	o.count += int64(n)
    49  	return n, err
    50  }
    51  
    52  func (o *offsetReader) Seek(offset int64, whence int) (int64, error) {
    53  	i, err := o.inner.Seek(offset, whence)
    54  	if err == nil {
    55  		o.count += i - o.offset
    56  		o.offset = i
    57  	}
    58  
    59  	return i, err
    60  }
    61  
    62  func (o *offsetReader) Count() int64 {
    63  	return o.count
    64  }
    65  
    66  func decodeRLEValue(bytes []byte) int32 {
    67  	switch len(bytes) {
    68  	case 0:
    69  		return 0
    70  	case 1:
    71  		return int32(bytes[0])
    72  	case 2:
    73  		return int32(bytes[0]) + int32(bytes[1])<<8
    74  	case 3:
    75  		return int32(bytes[0]) + int32(bytes[1])<<8 + int32(bytes[2])<<16
    76  	case 4:
    77  		return int32(bytes[0]) + int32(bytes[1])<<8 + int32(bytes[2])<<16 + int32(bytes[3])<<24
    78  	default:
    79  		panic("invalid argument")
    80  	}
    81  }
    82  
    83  func writeFull(w io.Writer, buf []byte) error {
    84  	if len(buf) == 0 {
    85  		return nil
    86  	}
    87  	cnt, err := w.Write(buf)
    88  	if err != nil {
    89  		return err
    90  	}
    91  
    92  	if cnt != len(buf) {
    93  		return fmt.Errorf("need to write %d byte wrote %d", cnt, len(buf))
    94  	}
    95  
    96  	return nil
    97  }
    98  
// thriftReader is the interface readThrift requires of its target: a Read
// method that takes a context and a thrift.TProtocol and reports an error.
type thriftReader interface {
	Read(context.Context, thrift.TProtocol) error
}
   102  
   103  func readThrift(ctx context.Context, tr thriftReader, r io.Reader) error {
   104  	// Make sure we are not using any kind of buffered reader here. bufio.Reader "can" reads more data ahead of time,
   105  	// which is a problem on this library
   106  	transport := &thrift.StreamTransport{Reader: r}
   107  	proto := thrift.NewTCompactProtocolConf(transport, &thrift.TConfiguration{})
   108  	return tr.Read(ctx, proto)
   109  }
   110  
// thriftWriter is the interface writeThrift requires of its source: a Write
// method that takes a context and a thrift.TProtocol and reports an error.
type thriftWriter interface {
	Write(context.Context, thrift.TProtocol) error
}
   114  
   115  func writeThrift(ctx context.Context, tr thriftWriter, w io.Writer) error {
   116  	transport := &thrift.StreamTransport{Writer: w}
   117  	proto := thrift.NewTCompactProtocolConf(transport, &thrift.TConfiguration{})
   118  	return tr.Write(ctx, proto)
   119  }
   120  
   121  func decodeInt32(d decoder, data []int32) error {
   122  	for i := range data {
   123  		u, err := d.next()
   124  		if err != nil {
   125  			return err
   126  		}
   127  		data[i] = u
   128  	}
   129  
   130  	return nil
   131  }
   132  
   133  func decodePackedArray(d levelDecoder, count int) (*packedArray, int, error) {
   134  	ret := &packedArray{}
   135  	ret.reset(bits.Len16(d.maxLevel()))
   136  	nn := 0 // Counting not nulls only good for dLevels
   137  	for i := 0; i < count; i++ {
   138  		u, err := d.next()
   139  		if err != nil {
   140  			return nil, 0, err
   141  		}
   142  		ret.appendSingle(u)
   143  		if u == int32(d.maxLevel()) {
   144  			nn++
   145  		}
   146  	}
   147  
   148  	return ret, nn, nil
   149  }
   150  
   151  func readUVariant32(r io.Reader) (int32, error) {
   152  	b, ok := r.(io.ByteReader)
   153  	if !ok {
   154  		b = &byteReader{Reader: r}
   155  	}
   156  
   157  	i, err := binary.ReadUvarint(b)
   158  	if err != nil {
   159  		return 0, err
   160  	}
   161  
   162  	if i > math.MaxInt32 {
   163  		return 0, errors.New("int32 out of range")
   164  	}
   165  
   166  	return int32(i), nil
   167  }
   168  
   169  func readVariant32(r io.Reader) (int32, error) {
   170  	b, ok := r.(io.ByteReader)
   171  	if !ok {
   172  		b = &byteReader{Reader: r}
   173  	}
   174  
   175  	i, err := binary.ReadVarint(b)
   176  	if err != nil {
   177  		return 0, err
   178  	}
   179  
   180  	if i > math.MaxInt32 || i < math.MinInt32 {
   181  		return 0, errors.New("int32 out of range")
   182  	}
   183  
   184  	return int32(i), nil
   185  }
   186  
   187  func writeVariant(w io.Writer, in int64) error {
   188  	buf := make([]byte, 12)
   189  	n := binary.PutVarint(buf, in)
   190  
   191  	return writeFull(w, buf[:n])
   192  }
   193  
   194  func writeUVariant(w io.Writer, in uint64) error {
   195  	buf := make([]byte, 12)
   196  	n := binary.PutUvarint(buf, in)
   197  
   198  	return writeFull(w, buf[:n])
   199  }
   200  
   201  func readVariant64(r io.Reader) (int64, error) {
   202  	b, ok := r.(io.ByteReader)
   203  	if !ok {
   204  		b = &byteReader{Reader: r}
   205  	}
   206  
   207  	return binary.ReadVarint(b)
   208  }
   209  
   210  type constDecoder int32
   211  
   212  func (cd constDecoder) initSize(io.Reader) error {
   213  	return nil
   214  }
   215  
   216  func (cd constDecoder) init(io.Reader) error {
   217  	return nil
   218  }
   219  
   220  func (cd constDecoder) next() (int32, error) {
   221  	return int32(cd), nil
   222  }
   223  
   224  type levelDecoderWrapper struct {
   225  	decoder
   226  	max uint16
   227  }
   228  
   229  func (l *levelDecoderWrapper) maxLevel() uint16 {
   230  	return l.max
   231  }
   232  
   233  // check the b2 into b1 to find the max prefix len
   234  func prefix(b1, b2 []byte) int {
   235  	l := len(b1)
   236  	if l2 := len(b2); l > l2 {
   237  		l = l2
   238  	}
   239  	for i := 0; i < l; i++ {
   240  		if b1[i] != b2[i] {
   241  			return i
   242  		}
   243  	}
   244  
   245  	return l
   246  }
   247  
   248  func encodeValue(w io.Writer, enc valuesEncoder, all []interface{}) error {
   249  	if err := enc.init(w); err != nil {
   250  		return err
   251  	}
   252  
   253  	if err := enc.encodeValues(all); err != nil {
   254  		return err
   255  	}
   256  
   257  	return enc.Close()
   258  }
   259  
   260  // In PageV1 the rle stream for rep/def level has the size in stream , but in V2 the size is inside the header not the
   261  // stream
   262  func encodeLevelsV1(w io.Writer, max uint16, values *packedArray) error {
   263  	rle := newHybridEncoder(bits.Len16(max))
   264  	if err := rle.initSize(w); err != nil {
   265  		return fmt.Errorf("level writer initialize with size failed: %w", err)
   266  	}
   267  	if err := rle.encodePacked(values); err != nil {
   268  		return fmt.Errorf("level writer encode values failed: %w", err)
   269  	}
   270  
   271  	if err := rle.Close(); err != nil {
   272  		return fmt.Errorf("level writer flush failed: %w", err)
   273  	}
   274  
   275  	return nil
   276  }
   277  
   278  func encodeLevelsV2(w io.Writer, max uint16, values *packedArray) error {
   279  	rle := newHybridEncoder(bits.Len16(max))
   280  	if err := rle.init(w); err != nil {
   281  		return fmt.Errorf("level writer initialize with size failed: %w", err)
   282  	}
   283  	if err := rle.encodePacked(values); err != nil {
   284  		return fmt.Errorf("level writer encode values failed: %w", err)
   285  	}
   286  
   287  	if err := rle.Close(); err != nil {
   288  		return fmt.Errorf("level writer flush failed: %w", err)
   289  	}
   290  
   291  	return nil
   292  }
   293  
   294  func mapKey(a interface{}) interface{} {
   295  	switch v := a.(type) {
   296  	case int, int32, int64, string, bool:
   297  		return a
   298  	case float64:
   299  		return math.Float64bits(v)
   300  	case float32:
   301  		return math.Float32bits(v)
   302  	case []byte:
   303  		return DefaultHashFunc(v)
   304  	case [12]byte:
   305  		return DefaultHashFunc(v[:])
   306  	default:
   307  		panic("not supported type")
   308  	}
   309  }
   310  
   311  func fnvHashFunc(in []byte) interface{} {
   312  	hash := fnv.New64()
   313  	if err := writeFull(hash, in); err != nil {
   314  		panic(err)
   315  	}
   316  	return hash.Sum64()
   317  }
   318  
// writePos is an io.Writer that can also report a position through Pos.
type writePos interface {
	io.Writer
	Pos() int64
}
   323  
   324  type writePosStruct struct {
   325  	w   io.Writer
   326  	pos int64
   327  }
   328  
   329  func (w *writePosStruct) Write(p []byte) (n int, err error) {
   330  	n, err = w.w.Write(p)
   331  	w.pos += int64(n)
   332  	return n, err
   333  }
   334  
   335  func (w *writePosStruct) Pos() int64 {
   336  	return w.pos
   337  }