github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package hashing 18 19 import ( 20 "github.com/apache/arrow/go/v7/arrow/bitutil" 21 "github.com/apache/arrow/go/v7/parquet/internal/utils" 22 ) 23 24 {{range .In}} 25 type payload{{.Name}} struct { 26 val {{.name}} 27 memoIdx int32 28 } 29 30 type entry{{.Name}} struct { 31 h uint64 32 payload payload{{.Name}} 33 } 34 35 func (e entry{{.Name}}) Valid() bool { return e.h != sentinel } 36 37 // {{.Name}}HashTable is a hashtable specifically for {{.name}} that 38 // is utilized with the MemoTable to generalize interactions for easier 39 // implementation of dictionaries without losing performance. 40 type {{.Name}}HashTable struct { 41 cap uint64 42 capMask uint64 43 size uint64 44 45 entries []entry{{.Name}} 46 } 47 48 // New{{.Name}}HashTable returns a new hash table for {{.name}} values 49 // initialized with the passed in capacity or 32 whichever is larger. 50 func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable { 51 initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 52 ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0} 53 ret.entries = make([]entry{{.Name}}, initCap) 54 return ret 55 } 56 57 // Reset drops all of the values in this hash table and re-initializes it 58 // with the specified initial capacity as if by calling New, but without having 59 // to reallocate the object. 60 func (h *{{.Name}}HashTable) Reset(cap uint64) { 61 h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 62 h.capMask = h.cap - 1 63 h.size = 0 64 h.entries = make([]entry{{.Name}}, h.cap) 65 } 66 67 // CopyValues is used for copying the values out of the hash table into the 68 // passed in slice, in the order that they were first inserted 69 func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) { 70 h.CopyValuesSubset(0, out) 71 } 72 73 // CopyValuesSubset copies a subset of the values in the hashtable out, starting 74 // with the value at start, in the order that they were inserted. 75 func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) { 76 h.VisitEntries(func(e *entry{{.Name}}) { 77 idx := e.payload.memoIdx - int32(start) 78 if idx >= 0 { 79 out[idx] = e.payload.val 80 } 81 }) 82 } 83 84 func (h *{{.Name}}HashTable) WriteOut(out []byte) { 85 h.WriteOutSubset(0, out) 86 } 87 88 func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) { 89 data := arrow.{{.Name}}Traits.CastFromBytes(out) 90 h.VisitEntries(func(e *entry{{.Name}}) { 91 idx := e.payload.memoIdx - int32(start) 92 if idx >= 0 { 93 data[idx] = utils.ToLE{{.Name}}(e.payload.val) 94 } 95 }) 96 } 97 98 func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } 99 100 func ({{.Name}}HashTable) fixHash(v uint64) uint64 { 101 if v == sentinel { 102 return 42 103 } 104 return v 105 } 106 107 // Lookup retrieves the entry for a given hash value assuming it's payload value returns 108 // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, 109 // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 110 func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) { 111 idx, ok := h.lookup(v, h.capMask, cmp) 112 return &h.entries[idx], ok 113 } 114 115 func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) { 116 const perturbShift uint8 = 5 117 118 var ( 119 idx uint64 120 perturb uint64 121 e *entry{{.Name}} 122 ) 123 124 v = h.fixHash(v) 125 idx = v & szMask 126 perturb = (v >> uint64(perturbShift)) + 1 127 128 for { 129 e = &h.entries[idx] 130 if e.h == v && cmp(e.payload.val) { 131 return idx, true 132 } 133 134 if e.h == sentinel { 135 return idx, false 136 } 137 138 // perturbation logic inspired from CPython's set/dict object 139 // the goal is that all 64 bits of unmasked hash value eventually 140 // participate int he probing sequence, to minimize clustering 141 idx = (idx + perturb) & szMask 142 perturb = (perturb >> uint64(perturbShift)) + 1 143 } 144 } 145 146 func (h *{{.Name}}HashTable) upsize(newcap uint64) error { 147 newMask := newcap - 1 148 149 oldEntries := h.entries 150 h.entries = make([]entry{{.Name}}, newcap) 151 for _, e := range oldEntries { 152 if e.Valid() { 153 idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false }) 154 h.entries[idx] = e 155 } 156 } 157 h.cap = newcap 158 h.capMask = newMask 159 return nil 160 } 161 162 // Insert updates the given entry with the provided hash value, payload value and memo index. 163 // The entry pointer must have been retrieved via lookup in order to actually insert properly. 164 func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error { 165 e.h = h.fixHash(v) 166 e.payload.val = val 167 e.payload.memoIdx = memoIdx 168 h.size++ 169 170 if h.needUpsize() { 171 h.upsize(h.cap * uint64(loadFactor) * 2) 172 } 173 return nil 174 } 175 176 // VisitEntries will call the passed in function on each *valid* entry in the hash table, 177 // a valid entry being one which has had a value inserted into it. 178 func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) { 179 for _, e := range h.entries { 180 if e.Valid() { 181 visit(&e) 182 } 183 } 184 } 185 186 // {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface 187 // conforming to the MemoTable interface defined in the encoding package for general interactions 188 // regarding dictionaries. 189 type {{.Name}}MemoTable struct { 190 tbl *{{.Name}}HashTable 191 nullIdx int32 192 } 193 194 // New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further 195 // allocations when inserting. 196 func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable { 197 return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound} 198 } 199 200 // Reset allows this table to be re-used by dumping all the data currently in the table. 201 func (s *{{.Name}}MemoTable) Reset() { 202 s.tbl.Reset(32) 203 s.nullIdx = KeyNotFound 204 } 205 206 // Size returns the current number of inserted elements into the table including if a null 207 // has been inserted. 208 func (s *{{.Name}}MemoTable) Size() int { 209 sz := int(s.tbl.size) 210 if _, ok := s.GetNull(); ok { 211 sz++ 212 } 213 return sz 214 } 215 216 // GetNull returns the index of an inserted null or KeyNotFound along with a bool 217 // that will be true if found and false if not. 218 func (s *{{.Name}}MemoTable) GetNull() (int, bool) { 219 return int(s.nullIdx), s.nullIdx != KeyNotFound 220 } 221 222 // GetOrInsertNull will return the index of the null entry or insert a null entry 223 // if one currently doesn't exist. The found value will be true if there was already 224 // a null in the table, and false if it inserted one. 225 func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) { 226 idx, found = s.GetNull() 227 if !found { 228 idx = s.Size() 229 s.nullIdx = int32(idx) 230 } 231 return 232 } 233 234 // CopyValues will copy the values from the memo table out into the passed in slice 235 // which must be of the appropriate type. 236 func (s *{{.Name}}MemoTable) CopyValues(out interface{}) { 237 s.CopyValuesSubset(0, out) 238 } 239 240 // CopyValuesSubset is like CopyValues but only copies a subset of values starting 241 // at the provided start index 242 func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) { 243 s.tbl.CopyValuesSubset(start, out.([]{{.name}})) 244 } 245 246 func (s *{{.Name}}MemoTable) WriteOut(out []byte) { 247 s.tbl.WriteOut(out) 248 } 249 250 func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) { 251 s.tbl.WriteOutSubset(start, out) 252 } 253 254 // Get returns the index of the requested value in the hash table or KeyNotFound 255 // along with a boolean indicating if it was found or not. 256 func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) { 257 {{if or (eq .Name "Int32") (eq .Name "Int64") }} 258 h := hashInt(uint64(val.({{.name}})), 0) 259 if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok { 260 {{ else -}} 261 var cmp func({{.name}}) bool 262 {{if eq .Name "Float32"}} 263 if math.IsNaN(float64(val.(float32))) { 264 cmp = isNan32Cmp 265 // use consistent internal bit pattern for NaN regardless of the pattern 266 // that is passed to us. NaN is NaN is NaN 267 val = float32(math.NaN()) 268 {{ else -}} 269 if math.IsNaN(val.(float64)) { 270 cmp = math.IsNaN 271 // use consistent internal bit pattern for NaN regardless of the pattern 272 // that is passed to us. NaN is NaN is NaN 273 val = math.NaN() 274 {{end -}} 275 } else { 276 cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } 277 } 278 279 h := hash{{.Name}}(val.({{.name}}), 0) 280 if e, ok := s.tbl.Lookup(h, cmp); ok { 281 {{ end -}} 282 return int(e.payload.memoIdx), ok 283 } 284 return KeyNotFound, false 285 } 286 287 // GetOrInsert will return the index of the specified value in the table, or insert the 288 // value into the table and return the new index. found indicates whether or not it already 289 // existed in the table (true) or was inserted by this call (false). 290 func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 291 {{if or (eq .Name "Int32") (eq .Name "Int64") }} 292 h := hashInt(uint64(val.({{.name}})), 0) 293 e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { 294 return val.({{.name}}) == v 295 }) 296 {{ else }} 297 var cmp func({{.name}}) bool 298 {{if eq .Name "Float32"}} 299 if math.IsNaN(float64(val.(float32))) { 300 cmp = isNan32Cmp 301 // use consistent internal bit pattern for NaN regardless of the pattern 302 // that is passed to us. NaN is NaN is NaN 303 val = float32(math.NaN()) 304 {{ else -}} 305 if math.IsNaN(val.(float64)) { 306 cmp = math.IsNaN 307 // use consistent internal bit pattern for NaN regardless of the pattern 308 // that is passed to us. NaN is NaN is NaN 309 val = math.NaN() 310 {{end -}} 311 } else { 312 cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } 313 } 314 315 h := hash{{.Name}}(val.({{.name}}), 0) 316 e, ok := s.tbl.Lookup(h, cmp) 317 {{ end }} 318 if ok { 319 idx = int(e.payload.memoIdx) 320 found = true 321 } else { 322 idx = s.Size() 323 s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) 324 } 325 return 326 } 327 {{end}}