github.com/fraugster/parquet-go@v0.12.0/type_dict.go (about) 1 package goparquet 2 3 import ( 4 "errors" 5 "fmt" 6 "io" 7 "math" 8 ) 9 10 type dictDecoder struct { 11 uniqueValues []interface{} 12 13 keys decoder 14 } 15 16 // just for tests 17 func (d *dictDecoder) setValues(v []interface{}) { 18 d.uniqueValues = v 19 } 20 21 // the value should be there before the init 22 func (d *dictDecoder) init(r io.Reader) error { 23 buf := make([]byte, 1) 24 if _, err := io.ReadFull(r, buf); err != nil { 25 return err 26 } 27 w := int(buf[0]) 28 if w < 0 || w > 32 { 29 return fmt.Errorf("invalid bitwidth %d", w) 30 } 31 if w >= 0 { 32 d.keys = newHybridDecoder(w) 33 err := d.keys.init(r) 34 return err 35 } 36 37 return errors.New("bit width zero with non-empty dictionary") 38 } 39 40 func (d *dictDecoder) decodeValues(dst []interface{}) (int, error) { 41 if d.keys == nil { 42 return 0, errors.New("no value is inside dictionary") 43 } 44 size := int32(len(d.uniqueValues)) 45 46 for i := range dst { 47 key, err := d.keys.next() 48 if err != nil { 49 return i, err 50 } 51 52 if key < 0 || key >= size { 53 return 0, fmt.Errorf("dict: invalid index %d, values count are %d", key, size) 54 } 55 56 dst[i] = d.uniqueValues[key] 57 } 58 59 return len(dst), nil 60 } 61 62 type dictStore struct { 63 valueList []interface{} 64 uniqueValues map[interface{}]struct{} 65 uniqueValuesSize int64 66 allValuesSize int64 67 readPos int 68 nullCount int32 69 useDict bool 70 71 alloc *allocTracker 72 } 73 74 func (d *dictStore) getValues() []interface{} { 75 return d.valueList 76 } 77 78 func (d *dictStore) init() { 79 d.uniqueValues = make(map[interface{}]struct{}) 80 d.valueList = nil 81 d.reset() 82 } 83 84 func (d *dictStore) reset() { 85 d.nullCount = 0 86 d.readPos = 0 87 d.uniqueValuesSize = 0 88 d.allValuesSize = 0 89 } 90 91 func (d *dictStore) addValue(v interface{}, size int) { 92 if v == nil { 93 d.nullCount++ 94 return 95 } 96 if d.useDict { 97 k := mapKey(v) 98 if _, found := d.uniqueValues[k]; !found { 99 d.uniqueValues[k] = struct{}{} 100 d.uniqueValuesSize += int64(size) 101 if len(d.uniqueValues) > math.MaxInt16 { 102 d.useDict = false 103 } 104 } 105 } 106 d.allValuesSize += int64(size) 107 d.valueList = append(d.valueList, v) 108 d.alloc.register(v, uint64(size)) 109 } 110 111 func (d *dictStore) getNextValue() (interface{}, error) { 112 if d.readPos >= len(d.valueList) { 113 return nil, errors.New("out of range") 114 } 115 d.readPos++ 116 return d.valueList[d.readPos-1], nil 117 } 118 119 func (d *dictStore) numValues() int32 { 120 return int32(len(d.valueList)) 121 } 122 123 func (d *dictStore) nullValueCount() int32 { 124 return d.nullCount 125 } 126 127 func (d *dictStore) distinctValueCount() int64 { 128 return int64(len(d.uniqueValues)) 129 } 130 131 func (d *dictStore) sizes() (dictLen int64, noDictLen int64) { 132 return d.uniqueValuesSize + int64(4*len(d.valueList)), d.allValuesSize 133 } 134 135 type dictEncoder struct { 136 w io.Writer 137 bitWidth int 138 indices []int32 139 } 140 141 func newDictEncoder(w io.Writer, bitWidth int) *dictEncoder { 142 return &dictEncoder{w: w, bitWidth: bitWidth} 143 } 144 145 func (d *dictEncoder) Close() error { 146 // first write the bitLength in a byte 147 if err := writeFull(d.w, []byte{byte(d.bitWidth)}); err != nil { 148 return err 149 } 150 enc := newHybridEncoder(d.bitWidth) 151 if err := enc.init(d.w); err != nil { 152 return err 153 } 154 if err := enc.encode(d.indices); err != nil { 155 return err 156 } 157 158 return enc.Close() 159 } 160 161 func (d *dictEncoder) encodeIndices(indices []int32) error { 162 d.indices = append(d.indices, indices...) 163 return nil 164 }