github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/hashing/xxh3_memo_table.gen.go (about) 1 // Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. 2 3 // Licensed to the Apache Software Foundation (ASF) under one 4 // or more contributor license agreements. See the NOTICE file 5 // distributed with this work for additional information 6 // regarding copyright ownership. The ASF licenses this file 7 // to you under the Apache License, Version 2.0 (the 8 // "License"); you may not use this file except in compliance 9 // with the License. You may obtain a copy of the License at 10 // 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // 13 // Unless required by applicable law or agreed to in writing, software 14 // distributed under the License is distributed on an "AS IS" BASIS, 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 // See the License for the specific language governing permissions and 17 // limitations under the License. 18 19 package hashing 20 21 import ( 22 "math" 23 24 "github.com/apache/arrow/go/v7/arrow" 25 "github.com/apache/arrow/go/v7/arrow/bitutil" 26 "github.com/apache/arrow/go/v7/parquet/internal/utils" 27 ) 28 29 type payloadInt32 struct { 30 val int32 31 memoIdx int32 32 } 33 34 type entryInt32 struct { 35 h uint64 36 payload payloadInt32 37 } 38 39 func (e entryInt32) Valid() bool { return e.h != sentinel } 40 41 // Int32HashTable is a hashtable specifically for int32 that 42 // is utilized with the MemoTable to generalize interactions for easier 43 // implementation of dictionaries without losing performance. 44 type Int32HashTable struct { 45 cap uint64 46 capMask uint64 47 size uint64 48 49 entries []entryInt32 50 } 51 52 // NewInt32HashTable returns a new hash table for int32 values 53 // initialized with the passed in capacity or 32 whichever is larger. 54 func NewInt32HashTable(cap uint64) *Int32HashTable { 55 initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 56 ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0} 57 ret.entries = make([]entryInt32, initCap) 58 return ret 59 } 60 61 // Reset drops all of the values in this hash table and re-initializes it 62 // with the specified initial capacity as if by calling New, but without having 63 // to reallocate the object. 64 func (h *Int32HashTable) Reset(cap uint64) { 65 h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 66 h.capMask = h.cap - 1 67 h.size = 0 68 h.entries = make([]entryInt32, h.cap) 69 } 70 71 // CopyValues is used for copying the values out of the hash table into the 72 // passed in slice, in the order that they were first inserted 73 func (h *Int32HashTable) CopyValues(out []int32) { 74 h.CopyValuesSubset(0, out) 75 } 76 77 // CopyValuesSubset copies a subset of the values in the hashtable out, starting 78 // with the value at start, in the order that they were inserted. 79 func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) { 80 h.VisitEntries(func(e *entryInt32) { 81 idx := e.payload.memoIdx - int32(start) 82 if idx >= 0 { 83 out[idx] = e.payload.val 84 } 85 }) 86 } 87 88 func (h *Int32HashTable) WriteOut(out []byte) { 89 h.WriteOutSubset(0, out) 90 } 91 92 func (h *Int32HashTable) WriteOutSubset(start int, out []byte) { 93 data := arrow.Int32Traits.CastFromBytes(out) 94 h.VisitEntries(func(e *entryInt32) { 95 idx := e.payload.memoIdx - int32(start) 96 if idx >= 0 { 97 data[idx] = utils.ToLEInt32(e.payload.val) 98 } 99 }) 100 } 101 102 func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } 103 104 func (Int32HashTable) fixHash(v uint64) uint64 { 105 if v == sentinel { 106 return 42 107 } 108 return v 109 } 110 111 // Lookup retrieves the entry for a given hash value assuming it's payload value returns 112 // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, 113 // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 114 func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) { 115 idx, ok := h.lookup(v, h.capMask, cmp) 116 return &h.entries[idx], ok 117 } 118 119 func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) { 120 const perturbShift uint8 = 5 121 122 var ( 123 idx uint64 124 perturb uint64 125 e *entryInt32 126 ) 127 128 v = h.fixHash(v) 129 idx = v & szMask 130 perturb = (v >> uint64(perturbShift)) + 1 131 132 for { 133 e = &h.entries[idx] 134 if e.h == v && cmp(e.payload.val) { 135 return idx, true 136 } 137 138 if e.h == sentinel { 139 return idx, false 140 } 141 142 // perturbation logic inspired from CPython's set/dict object 143 // the goal is that all 64 bits of unmasked hash value eventually 144 // participate int he probing sequence, to minimize clustering 145 idx = (idx + perturb) & szMask 146 perturb = (perturb >> uint64(perturbShift)) + 1 147 } 148 } 149 150 func (h *Int32HashTable) upsize(newcap uint64) error { 151 newMask := newcap - 1 152 153 oldEntries := h.entries 154 h.entries = make([]entryInt32, newcap) 155 for _, e := range oldEntries { 156 if e.Valid() { 157 idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false }) 158 h.entries[idx] = e 159 } 160 } 161 h.cap = newcap 162 h.capMask = newMask 163 return nil 164 } 165 166 // Insert updates the given entry with the provided hash value, payload value and memo index. 167 // The entry pointer must have been retrieved via lookup in order to actually insert properly. 168 func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error { 169 e.h = h.fixHash(v) 170 e.payload.val = val 171 e.payload.memoIdx = memoIdx 172 h.size++ 173 174 if h.needUpsize() { 175 h.upsize(h.cap * uint64(loadFactor) * 2) 176 } 177 return nil 178 } 179 180 // VisitEntries will call the passed in function on each *valid* entry in the hash table, 181 // a valid entry being one which has had a value inserted into it. 182 func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) { 183 for _, e := range h.entries { 184 if e.Valid() { 185 visit(&e) 186 } 187 } 188 } 189 190 // Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface 191 // conforming to the MemoTable interface defined in the encoding package for general interactions 192 // regarding dictionaries. 193 type Int32MemoTable struct { 194 tbl *Int32HashTable 195 nullIdx int32 196 } 197 198 // NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further 199 // allocations when inserting. 200 func NewInt32MemoTable(num int64) *Int32MemoTable { 201 return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound} 202 } 203 204 // Reset allows this table to be re-used by dumping all the data currently in the table. 205 func (s *Int32MemoTable) Reset() { 206 s.tbl.Reset(32) 207 s.nullIdx = KeyNotFound 208 } 209 210 // Size returns the current number of inserted elements into the table including if a null 211 // has been inserted. 212 func (s *Int32MemoTable) Size() int { 213 sz := int(s.tbl.size) 214 if _, ok := s.GetNull(); ok { 215 sz++ 216 } 217 return sz 218 } 219 220 // GetNull returns the index of an inserted null or KeyNotFound along with a bool 221 // that will be true if found and false if not. 222 func (s *Int32MemoTable) GetNull() (int, bool) { 223 return int(s.nullIdx), s.nullIdx != KeyNotFound 224 } 225 226 // GetOrInsertNull will return the index of the null entry or insert a null entry 227 // if one currently doesn't exist. The found value will be true if there was already 228 // a null in the table, and false if it inserted one. 229 func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) { 230 idx, found = s.GetNull() 231 if !found { 232 idx = s.Size() 233 s.nullIdx = int32(idx) 234 } 235 return 236 } 237 238 // CopyValues will copy the values from the memo table out into the passed in slice 239 // which must be of the appropriate type. 240 func (s *Int32MemoTable) CopyValues(out interface{}) { 241 s.CopyValuesSubset(0, out) 242 } 243 244 // CopyValuesSubset is like CopyValues but only copies a subset of values starting 245 // at the provided start index 246 func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) { 247 s.tbl.CopyValuesSubset(start, out.([]int32)) 248 } 249 250 func (s *Int32MemoTable) WriteOut(out []byte) { 251 s.tbl.WriteOut(out) 252 } 253 254 func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) { 255 s.tbl.WriteOutSubset(start, out) 256 } 257 258 // Get returns the index of the requested value in the hash table or KeyNotFound 259 // along with a boolean indicating if it was found or not. 260 func (s *Int32MemoTable) Get(val interface{}) (int, bool) { 261 262 h := hashInt(uint64(val.(int32)), 0) 263 if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok { 264 return int(e.payload.memoIdx), ok 265 } 266 return KeyNotFound, false 267 } 268 269 // GetOrInsert will return the index of the specified value in the table, or insert the 270 // value into the table and return the new index. found indicates whether or not it already 271 // existed in the table (true) or was inserted by this call (false). 272 func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 273 274 h := hashInt(uint64(val.(int32)), 0) 275 e, ok := s.tbl.Lookup(h, func(v int32) bool { 276 return val.(int32) == v 277 }) 278 279 if ok { 280 idx = int(e.payload.memoIdx) 281 found = true 282 } else { 283 idx = s.Size() 284 s.tbl.Insert(e, h, val.(int32), int32(idx)) 285 } 286 return 287 } 288 289 type payloadInt64 struct { 290 val int64 291 memoIdx int32 292 } 293 294 type entryInt64 struct { 295 h uint64 296 payload payloadInt64 297 } 298 299 func (e entryInt64) Valid() bool { return e.h != sentinel } 300 301 // Int64HashTable is a hashtable specifically for int64 that 302 // is utilized with the MemoTable to generalize interactions for easier 303 // implementation of dictionaries without losing performance. 304 type Int64HashTable struct { 305 cap uint64 306 capMask uint64 307 size uint64 308 309 entries []entryInt64 310 } 311 312 // NewInt64HashTable returns a new hash table for int64 values 313 // initialized with the passed in capacity or 32 whichever is larger. 314 func NewInt64HashTable(cap uint64) *Int64HashTable { 315 initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 316 ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0} 317 ret.entries = make([]entryInt64, initCap) 318 return ret 319 } 320 321 // Reset drops all of the values in this hash table and re-initializes it 322 // with the specified initial capacity as if by calling New, but without having 323 // to reallocate the object. 324 func (h *Int64HashTable) Reset(cap uint64) { 325 h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 326 h.capMask = h.cap - 1 327 h.size = 0 328 h.entries = make([]entryInt64, h.cap) 329 } 330 331 // CopyValues is used for copying the values out of the hash table into the 332 // passed in slice, in the order that they were first inserted 333 func (h *Int64HashTable) CopyValues(out []int64) { 334 h.CopyValuesSubset(0, out) 335 } 336 337 // CopyValuesSubset copies a subset of the values in the hashtable out, starting 338 // with the value at start, in the order that they were inserted. 339 func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) { 340 h.VisitEntries(func(e *entryInt64) { 341 idx := e.payload.memoIdx - int32(start) 342 if idx >= 0 { 343 out[idx] = e.payload.val 344 } 345 }) 346 } 347 348 func (h *Int64HashTable) WriteOut(out []byte) { 349 h.WriteOutSubset(0, out) 350 } 351 352 func (h *Int64HashTable) WriteOutSubset(start int, out []byte) { 353 data := arrow.Int64Traits.CastFromBytes(out) 354 h.VisitEntries(func(e *entryInt64) { 355 idx := e.payload.memoIdx - int32(start) 356 if idx >= 0 { 357 data[idx] = utils.ToLEInt64(e.payload.val) 358 } 359 }) 360 } 361 362 func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } 363 364 func (Int64HashTable) fixHash(v uint64) uint64 { 365 if v == sentinel { 366 return 42 367 } 368 return v 369 } 370 371 // Lookup retrieves the entry for a given hash value assuming it's payload value returns 372 // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, 373 // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 374 func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) { 375 idx, ok := h.lookup(v, h.capMask, cmp) 376 return &h.entries[idx], ok 377 } 378 379 func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) { 380 const perturbShift uint8 = 5 381 382 var ( 383 idx uint64 384 perturb uint64 385 e *entryInt64 386 ) 387 388 v = h.fixHash(v) 389 idx = v & szMask 390 perturb = (v >> uint64(perturbShift)) + 1 391 392 for { 393 e = &h.entries[idx] 394 if e.h == v && cmp(e.payload.val) { 395 return idx, true 396 } 397 398 if e.h == sentinel { 399 return idx, false 400 } 401 402 // perturbation logic inspired from CPython's set/dict object 403 // the goal is that all 64 bits of unmasked hash value eventually 404 // participate int he probing sequence, to minimize clustering 405 idx = (idx + perturb) & szMask 406 perturb = (perturb >> uint64(perturbShift)) + 1 407 } 408 } 409 410 func (h *Int64HashTable) upsize(newcap uint64) error { 411 newMask := newcap - 1 412 413 oldEntries := h.entries 414 h.entries = make([]entryInt64, newcap) 415 for _, e := range oldEntries { 416 if e.Valid() { 417 idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false }) 418 h.entries[idx] = e 419 } 420 } 421 h.cap = newcap 422 h.capMask = newMask 423 return nil 424 } 425 426 // Insert updates the given entry with the provided hash value, payload value and memo index. 427 // The entry pointer must have been retrieved via lookup in order to actually insert properly. 428 func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error { 429 e.h = h.fixHash(v) 430 e.payload.val = val 431 e.payload.memoIdx = memoIdx 432 h.size++ 433 434 if h.needUpsize() { 435 h.upsize(h.cap * uint64(loadFactor) * 2) 436 } 437 return nil 438 } 439 440 // VisitEntries will call the passed in function on each *valid* entry in the hash table, 441 // a valid entry being one which has had a value inserted into it. 442 func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) { 443 for _, e := range h.entries { 444 if e.Valid() { 445 visit(&e) 446 } 447 } 448 } 449 450 // Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface 451 // conforming to the MemoTable interface defined in the encoding package for general interactions 452 // regarding dictionaries. 453 type Int64MemoTable struct { 454 tbl *Int64HashTable 455 nullIdx int32 456 } 457 458 // NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further 459 // allocations when inserting. 460 func NewInt64MemoTable(num int64) *Int64MemoTable { 461 return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound} 462 } 463 464 // Reset allows this table to be re-used by dumping all the data currently in the table. 465 func (s *Int64MemoTable) Reset() { 466 s.tbl.Reset(32) 467 s.nullIdx = KeyNotFound 468 } 469 470 // Size returns the current number of inserted elements into the table including if a null 471 // has been inserted. 472 func (s *Int64MemoTable) Size() int { 473 sz := int(s.tbl.size) 474 if _, ok := s.GetNull(); ok { 475 sz++ 476 } 477 return sz 478 } 479 480 // GetNull returns the index of an inserted null or KeyNotFound along with a bool 481 // that will be true if found and false if not. 482 func (s *Int64MemoTable) GetNull() (int, bool) { 483 return int(s.nullIdx), s.nullIdx != KeyNotFound 484 } 485 486 // GetOrInsertNull will return the index of the null entry or insert a null entry 487 // if one currently doesn't exist. The found value will be true if there was already 488 // a null in the table, and false if it inserted one. 489 func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) { 490 idx, found = s.GetNull() 491 if !found { 492 idx = s.Size() 493 s.nullIdx = int32(idx) 494 } 495 return 496 } 497 498 // CopyValues will copy the values from the memo table out into the passed in slice 499 // which must be of the appropriate type. 500 func (s *Int64MemoTable) CopyValues(out interface{}) { 501 s.CopyValuesSubset(0, out) 502 } 503 504 // CopyValuesSubset is like CopyValues but only copies a subset of values starting 505 // at the provided start index 506 func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) { 507 s.tbl.CopyValuesSubset(start, out.([]int64)) 508 } 509 510 func (s *Int64MemoTable) WriteOut(out []byte) { 511 s.tbl.WriteOut(out) 512 } 513 514 func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) { 515 s.tbl.WriteOutSubset(start, out) 516 } 517 518 // Get returns the index of the requested value in the hash table or KeyNotFound 519 // along with a boolean indicating if it was found or not. 520 func (s *Int64MemoTable) Get(val interface{}) (int, bool) { 521 522 h := hashInt(uint64(val.(int64)), 0) 523 if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok { 524 return int(e.payload.memoIdx), ok 525 } 526 return KeyNotFound, false 527 } 528 529 // GetOrInsert will return the index of the specified value in the table, or insert the 530 // value into the table and return the new index. found indicates whether or not it already 531 // existed in the table (true) or was inserted by this call (false). 532 func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 533 534 h := hashInt(uint64(val.(int64)), 0) 535 e, ok := s.tbl.Lookup(h, func(v int64) bool { 536 return val.(int64) == v 537 }) 538 539 if ok { 540 idx = int(e.payload.memoIdx) 541 found = true 542 } else { 543 idx = s.Size() 544 s.tbl.Insert(e, h, val.(int64), int32(idx)) 545 } 546 return 547 } 548 549 type payloadFloat32 struct { 550 val float32 551 memoIdx int32 552 } 553 554 type entryFloat32 struct { 555 h uint64 556 payload payloadFloat32 557 } 558 559 func (e entryFloat32) Valid() bool { return e.h != sentinel } 560 561 // Float32HashTable is a hashtable specifically for float32 that 562 // is utilized with the MemoTable to generalize interactions for easier 563 // implementation of dictionaries without losing performance. 564 type Float32HashTable struct { 565 cap uint64 566 capMask uint64 567 size uint64 568 569 entries []entryFloat32 570 } 571 572 // NewFloat32HashTable returns a new hash table for float32 values 573 // initialized with the passed in capacity or 32 whichever is larger. 574 func NewFloat32HashTable(cap uint64) *Float32HashTable { 575 initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 576 ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} 577 ret.entries = make([]entryFloat32, initCap) 578 return ret 579 } 580 581 // Reset drops all of the values in this hash table and re-initializes it 582 // with the specified initial capacity as if by calling New, but without having 583 // to reallocate the object. 584 func (h *Float32HashTable) Reset(cap uint64) { 585 h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 586 h.capMask = h.cap - 1 587 h.size = 0 588 h.entries = make([]entryFloat32, h.cap) 589 } 590 591 // CopyValues is used for copying the values out of the hash table into the 592 // passed in slice, in the order that they were first inserted 593 func (h *Float32HashTable) CopyValues(out []float32) { 594 h.CopyValuesSubset(0, out) 595 } 596 597 // CopyValuesSubset copies a subset of the values in the hashtable out, starting 598 // with the value at start, in the order that they were inserted. 599 func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { 600 h.VisitEntries(func(e *entryFloat32) { 601 idx := e.payload.memoIdx - int32(start) 602 if idx >= 0 { 603 out[idx] = e.payload.val 604 } 605 }) 606 } 607 608 func (h *Float32HashTable) WriteOut(out []byte) { 609 h.WriteOutSubset(0, out) 610 } 611 612 func (h *Float32HashTable) WriteOutSubset(start int, out []byte) { 613 data := arrow.Float32Traits.CastFromBytes(out) 614 h.VisitEntries(func(e *entryFloat32) { 615 idx := e.payload.memoIdx - int32(start) 616 if idx >= 0 { 617 data[idx] = utils.ToLEFloat32(e.payload.val) 618 } 619 }) 620 } 621 622 func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } 623 624 func (Float32HashTable) fixHash(v uint64) uint64 { 625 if v == sentinel { 626 return 42 627 } 628 return v 629 } 630 631 // Lookup retrieves the entry for a given hash value assuming it's payload value returns 632 // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, 633 // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 634 func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { 635 idx, ok := h.lookup(v, h.capMask, cmp) 636 return &h.entries[idx], ok 637 } 638 639 func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { 640 const perturbShift uint8 = 5 641 642 var ( 643 idx uint64 644 perturb uint64 645 e *entryFloat32 646 ) 647 648 v = h.fixHash(v) 649 idx = v & szMask 650 perturb = (v >> uint64(perturbShift)) + 1 651 652 for { 653 e = &h.entries[idx] 654 if e.h == v && cmp(e.payload.val) { 655 return idx, true 656 } 657 658 if e.h == sentinel { 659 return idx, false 660 } 661 662 // perturbation logic inspired from CPython's set/dict object 663 // the goal is that all 64 bits of unmasked hash value eventually 664 // participate int he probing sequence, to minimize clustering 665 idx = (idx + perturb) & szMask 666 perturb = (perturb >> uint64(perturbShift)) + 1 667 } 668 } 669 670 func (h *Float32HashTable) upsize(newcap uint64) error { 671 newMask := newcap - 1 672 673 oldEntries := h.entries 674 h.entries = make([]entryFloat32, newcap) 675 for _, e := range oldEntries { 676 if e.Valid() { 677 idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) 678 h.entries[idx] = e 679 } 680 } 681 h.cap = newcap 682 h.capMask = newMask 683 return nil 684 } 685 686 // Insert updates the given entry with the provided hash value, payload value and memo index. 687 // The entry pointer must have been retrieved via lookup in order to actually insert properly. 688 func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { 689 e.h = h.fixHash(v) 690 e.payload.val = val 691 e.payload.memoIdx = memoIdx 692 h.size++ 693 694 if h.needUpsize() { 695 h.upsize(h.cap * uint64(loadFactor) * 2) 696 } 697 return nil 698 } 699 700 // VisitEntries will call the passed in function on each *valid* entry in the hash table, 701 // a valid entry being one which has had a value inserted into it. 702 func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { 703 for _, e := range h.entries { 704 if e.Valid() { 705 visit(&e) 706 } 707 } 708 } 709 710 // Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface 711 // conforming to the MemoTable interface defined in the encoding package for general interactions 712 // regarding dictionaries. 713 type Float32MemoTable struct { 714 tbl *Float32HashTable 715 nullIdx int32 716 } 717 718 // NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further 719 // allocations when inserting. 720 func NewFloat32MemoTable(num int64) *Float32MemoTable { 721 return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} 722 } 723 724 // Reset allows this table to be re-used by dumping all the data currently in the table. 725 func (s *Float32MemoTable) Reset() { 726 s.tbl.Reset(32) 727 s.nullIdx = KeyNotFound 728 } 729 730 // Size returns the current number of inserted elements into the table including if a null 731 // has been inserted. 732 func (s *Float32MemoTable) Size() int { 733 sz := int(s.tbl.size) 734 if _, ok := s.GetNull(); ok { 735 sz++ 736 } 737 return sz 738 } 739 740 // GetNull returns the index of an inserted null or KeyNotFound along with a bool 741 // that will be true if found and false if not. 742 func (s *Float32MemoTable) GetNull() (int, bool) { 743 return int(s.nullIdx), s.nullIdx != KeyNotFound 744 } 745 746 // GetOrInsertNull will return the index of the null entry or insert a null entry 747 // if one currently doesn't exist. The found value will be true if there was already 748 // a null in the table, and false if it inserted one. 749 func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { 750 idx, found = s.GetNull() 751 if !found { 752 idx = s.Size() 753 s.nullIdx = int32(idx) 754 } 755 return 756 } 757 758 // CopyValues will copy the values from the memo table out into the passed in slice 759 // which must be of the appropriate type. 760 func (s *Float32MemoTable) CopyValues(out interface{}) { 761 s.CopyValuesSubset(0, out) 762 } 763 764 // CopyValuesSubset is like CopyValues but only copies a subset of values starting 765 // at the provided start index 766 func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { 767 s.tbl.CopyValuesSubset(start, out.([]float32)) 768 } 769 770 func (s *Float32MemoTable) WriteOut(out []byte) { 771 s.tbl.WriteOut(out) 772 } 773 774 func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) { 775 s.tbl.WriteOutSubset(start, out) 776 } 777 778 // Get returns the index of the requested value in the hash table or KeyNotFound 779 // along with a boolean indicating if it was found or not. 780 func (s *Float32MemoTable) Get(val interface{}) (int, bool) { 781 var cmp func(float32) bool 782 783 if math.IsNaN(float64(val.(float32))) { 784 cmp = isNan32Cmp 785 // use consistent internal bit pattern for NaN regardless of the pattern 786 // that is passed to us. NaN is NaN is NaN 787 val = float32(math.NaN()) 788 } else { 789 cmp = func(v float32) bool { return val.(float32) == v } 790 } 791 792 h := hashFloat32(val.(float32), 0) 793 if e, ok := s.tbl.Lookup(h, cmp); ok { 794 return int(e.payload.memoIdx), ok 795 } 796 return KeyNotFound, false 797 } 798 799 // GetOrInsert will return the index of the specified value in the table, or insert the 800 // value into the table and return the new index. found indicates whether or not it already 801 // existed in the table (true) or was inserted by this call (false). 802 func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 803 804 var cmp func(float32) bool 805 806 if math.IsNaN(float64(val.(float32))) { 807 cmp = isNan32Cmp 808 // use consistent internal bit pattern for NaN regardless of the pattern 809 // that is passed to us. NaN is NaN is NaN 810 val = float32(math.NaN()) 811 } else { 812 cmp = func(v float32) bool { return val.(float32) == v } 813 } 814 815 h := hashFloat32(val.(float32), 0) 816 e, ok := s.tbl.Lookup(h, cmp) 817 818 if ok { 819 idx = int(e.payload.memoIdx) 820 found = true 821 } else { 822 idx = s.Size() 823 s.tbl.Insert(e, h, val.(float32), int32(idx)) 824 } 825 return 826 } 827 828 type payloadFloat64 struct { 829 val float64 830 memoIdx int32 831 } 832 833 type entryFloat64 struct { 834 h uint64 835 payload payloadFloat64 836 } 837 838 func (e entryFloat64) Valid() bool { return e.h != sentinel } 839 840 // Float64HashTable is a hashtable specifically for float64 that 841 // is utilized with the MemoTable to generalize interactions for easier 842 // implementation of dictionaries without losing performance. 843 type Float64HashTable struct { 844 cap uint64 845 capMask uint64 846 size uint64 847 848 entries []entryFloat64 849 } 850 851 // NewFloat64HashTable returns a new hash table for float64 values 852 // initialized with the passed in capacity or 32 whichever is larger. 853 func NewFloat64HashTable(cap uint64) *Float64HashTable { 854 initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 855 ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} 856 ret.entries = make([]entryFloat64, initCap) 857 return ret 858 } 859 860 // Reset drops all of the values in this hash table and re-initializes it 861 // with the specified initial capacity as if by calling New, but without having 862 // to reallocate the object. 863 func (h *Float64HashTable) Reset(cap uint64) { 864 h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) 865 h.capMask = h.cap - 1 866 h.size = 0 867 h.entries = make([]entryFloat64, h.cap) 868 } 869 870 // CopyValues is used for copying the values out of the hash table into the 871 // passed in slice, in the order that they were first inserted 872 func (h *Float64HashTable) CopyValues(out []float64) { 873 h.CopyValuesSubset(0, out) 874 } 875 876 // CopyValuesSubset copies a subset of the values in the hashtable out, starting 877 // with the value at start, in the order that they were inserted. 878 func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { 879 h.VisitEntries(func(e *entryFloat64) { 880 idx := e.payload.memoIdx - int32(start) 881 if idx >= 0 { 882 out[idx] = e.payload.val 883 } 884 }) 885 } 886 887 func (h *Float64HashTable) WriteOut(out []byte) { 888 h.WriteOutSubset(0, out) 889 } 890 891 func (h *Float64HashTable) WriteOutSubset(start int, out []byte) { 892 data := arrow.Float64Traits.CastFromBytes(out) 893 h.VisitEntries(func(e *entryFloat64) { 894 idx := e.payload.memoIdx - int32(start) 895 if idx >= 0 { 896 data[idx] = utils.ToLEFloat64(e.payload.val) 897 } 898 }) 899 } 900 901 func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } 902 903 func (Float64HashTable) fixHash(v uint64) uint64 { 904 if v == sentinel { 905 return 42 906 } 907 return v 908 } 909 910 // Lookup retrieves the entry for a given hash value assuming it's payload value returns 911 // true when passed to the cmp func. Returns a pointer to the entry for the given hash value, 912 // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 913 func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { 914 idx, ok := h.lookup(v, h.capMask, cmp) 915 return &h.entries[idx], ok 916 } 917 918 func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { 919 const perturbShift uint8 = 5 920 921 var ( 922 idx uint64 923 perturb uint64 924 e *entryFloat64 925 ) 926 927 v = h.fixHash(v) 928 idx = v & szMask 929 perturb = (v >> uint64(perturbShift)) + 1 930 931 for { 932 e = &h.entries[idx] 933 if e.h == v && cmp(e.payload.val) { 934 return idx, true 935 } 936 937 if e.h == sentinel { 938 return idx, false 939 } 940 941 // perturbation logic inspired from CPython's set/dict object 942 // the goal is that all 64 bits of unmasked hash value eventually 943 // participate int he probing sequence, to minimize clustering 944 idx = (idx + perturb) & szMask 945 perturb = (perturb >> uint64(perturbShift)) + 1 946 } 947 } 948 949 func (h *Float64HashTable) upsize(newcap uint64) error { 950 newMask := newcap - 1 951 952 oldEntries := h.entries 953 h.entries = make([]entryFloat64, newcap) 954 for _, e := range oldEntries { 955 if e.Valid() { 956 idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) 957 h.entries[idx] = e 958 } 959 } 960 h.cap = newcap 961 h.capMask = newMask 962 return nil 963 } 964 965 // Insert updates the given entry with the provided hash value, payload value and memo index. 966 // The entry pointer must have been retrieved via lookup in order to actually insert properly. 967 func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { 968 e.h = h.fixHash(v) 969 e.payload.val = val 970 e.payload.memoIdx = memoIdx 971 h.size++ 972 973 if h.needUpsize() { 974 h.upsize(h.cap * uint64(loadFactor) * 2) 975 } 976 return nil 977 } 978 979 // VisitEntries will call the passed in function on each *valid* entry in the hash table, 980 // a valid entry being one which has had a value inserted into it. 981 func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { 982 for _, e := range h.entries { 983 if e.Valid() { 984 visit(&e) 985 } 986 } 987 } 988 989 // Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface 990 // conforming to the MemoTable interface defined in the encoding package for general interactions 991 // regarding dictionaries. 992 type Float64MemoTable struct { 993 tbl *Float64HashTable 994 nullIdx int32 995 } 996 997 // NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further 998 // allocations when inserting. 999 func NewFloat64MemoTable(num int64) *Float64MemoTable { 1000 return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} 1001 } 1002 1003 // Reset allows this table to be re-used by dumping all the data currently in the table. 1004 func (s *Float64MemoTable) Reset() { 1005 s.tbl.Reset(32) 1006 s.nullIdx = KeyNotFound 1007 } 1008 1009 // Size returns the current number of inserted elements into the table including if a null 1010 // has been inserted. 1011 func (s *Float64MemoTable) Size() int { 1012 sz := int(s.tbl.size) 1013 if _, ok := s.GetNull(); ok { 1014 sz++ 1015 } 1016 return sz 1017 } 1018 1019 // GetNull returns the index of an inserted null or KeyNotFound along with a bool 1020 // that will be true if found and false if not. 1021 func (s *Float64MemoTable) GetNull() (int, bool) { 1022 return int(s.nullIdx), s.nullIdx != KeyNotFound 1023 } 1024 1025 // GetOrInsertNull will return the index of the null entry or insert a null entry 1026 // if one currently doesn't exist. The found value will be true if there was already 1027 // a null in the table, and false if it inserted one. 1028 func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { 1029 idx, found = s.GetNull() 1030 if !found { 1031 idx = s.Size() 1032 s.nullIdx = int32(idx) 1033 } 1034 return 1035 } 1036 1037 // CopyValues will copy the values from the memo table out into the passed in slice 1038 // which must be of the appropriate type. 1039 func (s *Float64MemoTable) CopyValues(out interface{}) { 1040 s.CopyValuesSubset(0, out) 1041 } 1042 1043 // CopyValuesSubset is like CopyValues but only copies a subset of values starting 1044 // at the provided start index 1045 func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { 1046 s.tbl.CopyValuesSubset(start, out.([]float64)) 1047 } 1048 1049 func (s *Float64MemoTable) WriteOut(out []byte) { 1050 s.tbl.WriteOut(out) 1051 } 1052 1053 func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) { 1054 s.tbl.WriteOutSubset(start, out) 1055 } 1056 1057 // Get returns the index of the requested value in the hash table or KeyNotFound 1058 // along with a boolean indicating if it was found or not. 1059 func (s *Float64MemoTable) Get(val interface{}) (int, bool) { 1060 var cmp func(float64) bool 1061 if math.IsNaN(val.(float64)) { 1062 cmp = math.IsNaN 1063 // use consistent internal bit pattern for NaN regardless of the pattern 1064 // that is passed to us. NaN is NaN is NaN 1065 val = math.NaN() 1066 } else { 1067 cmp = func(v float64) bool { return val.(float64) == v } 1068 } 1069 1070 h := hashFloat64(val.(float64), 0) 1071 if e, ok := s.tbl.Lookup(h, cmp); ok { 1072 return int(e.payload.memoIdx), ok 1073 } 1074 return KeyNotFound, false 1075 } 1076 1077 // GetOrInsert will return the index of the specified value in the table, or insert the 1078 // value into the table and return the new index. found indicates whether or not it already 1079 // existed in the table (true) or was inserted by this call (false). 1080 func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 1081 1082 var cmp func(float64) bool 1083 if math.IsNaN(val.(float64)) { 1084 cmp = math.IsNaN 1085 // use consistent internal bit pattern for NaN regardless of the pattern 1086 // that is passed to us. NaN is NaN is NaN 1087 val = math.NaN() 1088 } else { 1089 cmp = func(v float64) bool { return val.(float64) == v } 1090 } 1091 1092 h := hashFloat64(val.(float64), 0) 1093 e, ok := s.tbl.Lookup(h, cmp) 1094 1095 if ok { 1096 idx = int(e.payload.memoIdx) 1097 found = true 1098 } else { 1099 idx = s.Size() 1100 s.tbl.Insert(e, h, val.(float64), int32(idx)) 1101 } 1102 return 1103 }