github.com/fraugster/parquet-go@v0.12.0/type_dict.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  	"math"
     8  )
     9  
    10  type dictDecoder struct {
    11  	uniqueValues []interface{}
    12  
    13  	keys decoder
    14  }
    15  
    16  // just for tests
    17  func (d *dictDecoder) setValues(v []interface{}) {
    18  	d.uniqueValues = v
    19  }
    20  
    21  // the value should be there before the init
    22  func (d *dictDecoder) init(r io.Reader) error {
    23  	buf := make([]byte, 1)
    24  	if _, err := io.ReadFull(r, buf); err != nil {
    25  		return err
    26  	}
    27  	w := int(buf[0])
    28  	if w < 0 || w > 32 {
    29  		return fmt.Errorf("invalid bitwidth %d", w)
    30  	}
    31  	if w >= 0 {
    32  		d.keys = newHybridDecoder(w)
    33  		err := d.keys.init(r)
    34  		return err
    35  	}
    36  
    37  	return errors.New("bit width zero with non-empty dictionary")
    38  }
    39  
    40  func (d *dictDecoder) decodeValues(dst []interface{}) (int, error) {
    41  	if d.keys == nil {
    42  		return 0, errors.New("no value is inside dictionary")
    43  	}
    44  	size := int32(len(d.uniqueValues))
    45  
    46  	for i := range dst {
    47  		key, err := d.keys.next()
    48  		if err != nil {
    49  			return i, err
    50  		}
    51  
    52  		if key < 0 || key >= size {
    53  			return 0, fmt.Errorf("dict: invalid index %d, values count are %d", key, size)
    54  		}
    55  
    56  		dst[i] = d.uniqueValues[key]
    57  	}
    58  
    59  	return len(dst), nil
    60  }
    61  
    62  type dictStore struct {
    63  	valueList        []interface{}
    64  	uniqueValues     map[interface{}]struct{}
    65  	uniqueValuesSize int64
    66  	allValuesSize    int64
    67  	readPos          int
    68  	nullCount        int32
    69  	useDict          bool
    70  
    71  	alloc *allocTracker
    72  }
    73  
    74  func (d *dictStore) getValues() []interface{} {
    75  	return d.valueList
    76  }
    77  
    78  func (d *dictStore) init() {
    79  	d.uniqueValues = make(map[interface{}]struct{})
    80  	d.valueList = nil
    81  	d.reset()
    82  }
    83  
    84  func (d *dictStore) reset() {
    85  	d.nullCount = 0
    86  	d.readPos = 0
    87  	d.uniqueValuesSize = 0
    88  	d.allValuesSize = 0
    89  }
    90  
    91  func (d *dictStore) addValue(v interface{}, size int) {
    92  	if v == nil {
    93  		d.nullCount++
    94  		return
    95  	}
    96  	if d.useDict {
    97  		k := mapKey(v)
    98  		if _, found := d.uniqueValues[k]; !found {
    99  			d.uniqueValues[k] = struct{}{}
   100  			d.uniqueValuesSize += int64(size)
   101  			if len(d.uniqueValues) > math.MaxInt16 {
   102  				d.useDict = false
   103  			}
   104  		}
   105  	}
   106  	d.allValuesSize += int64(size)
   107  	d.valueList = append(d.valueList, v)
   108  	d.alloc.register(v, uint64(size))
   109  }
   110  
   111  func (d *dictStore) getNextValue() (interface{}, error) {
   112  	if d.readPos >= len(d.valueList) {
   113  		return nil, errors.New("out of range")
   114  	}
   115  	d.readPos++
   116  	return d.valueList[d.readPos-1], nil
   117  }
   118  
   119  func (d *dictStore) numValues() int32 {
   120  	return int32(len(d.valueList))
   121  }
   122  
   123  func (d *dictStore) nullValueCount() int32 {
   124  	return d.nullCount
   125  }
   126  
   127  func (d *dictStore) distinctValueCount() int64 {
   128  	return int64(len(d.uniqueValues))
   129  }
   130  
   131  func (d *dictStore) sizes() (dictLen int64, noDictLen int64) {
   132  	return d.uniqueValuesSize + int64(4*len(d.valueList)), d.allValuesSize
   133  }
   134  
   135  type dictEncoder struct {
   136  	w        io.Writer
   137  	bitWidth int
   138  	indices  []int32
   139  }
   140  
   141  func newDictEncoder(w io.Writer, bitWidth int) *dictEncoder {
   142  	return &dictEncoder{w: w, bitWidth: bitWidth}
   143  }
   144  
   145  func (d *dictEncoder) Close() error {
   146  	// first write the bitLength in a byte
   147  	if err := writeFull(d.w, []byte{byte(d.bitWidth)}); err != nil {
   148  		return err
   149  	}
   150  	enc := newHybridEncoder(d.bitWidth)
   151  	if err := enc.init(d.w); err != nil {
   152  		return err
   153  	}
   154  	if err := enc.encode(d.indices); err != nil {
   155  		return err
   156  	}
   157  
   158  	return enc.Close()
   159  }
   160  
   161  func (d *dictEncoder) encodeIndices(indices []int32) error {
   162  	d.indices = append(d.indices, indices...)
   163  	return nil
   164  }