github.com/benz9527/xboot@v0.0.0-20240504061247-c23f15593274/lib/kv/swiss_map.go

package kv

import (
	"errors"
	"math/bits"
	randv2 "math/rand/v2"
	"sync/atomic"

	"go.uber.org/multierr"

	ibits "github.com/benz9527/xboot/lib/bits"
	"github.com/benz9527/xboot/lib/infra"
)

// References:
// https://github.com/CppCon/CppCon2017
// https://www.dolthub.com/blog/2023-03-28-swiss-map/
// https://github.com/dolthub/swiss/blob/main/map.go
// https://github.com/thepudds/swisstable/blob/main/map.go
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
// https://faultlore.com/blah/hashbrown-tldr/
// https://rcoh.me/posts/hash-map-analysis/
// https://github.com/abseil/abseil-cpp/blob/master/absl/container/flat_hash_map.h
// https://github.com/rust-lang/hashbrown
// https://blog.waffles.space/2018/12/07/deep-dive-into-hashbrown/#fn:4
// https://methane.hatenablog.jp/entry/2022/02/22/Swisstable_Hash_%E3%81%AB%E4%BD%BF%E3%82%8F%E3%82%8C%E3%81%A6%E3%81%84%E3%82%8B%E3%83%93%E3%83%83%E3%83%88%E6%BC%94%E7%AE%97%E3%81%AE%E9%AD%94%E8%A1%93
// https://www.youtube.com/watch?v=JZE3_0qvrMg
// https://github.com/abseil/abseil-cpp/blob/master/absl/container/internal/raw_hash_set.h

// Swiss Table, also called Flat Hash Map.
// Each hash (control) slot maps to a key-value pair slot.
// The short hash taken from the low bits (1 byte) is an optimization
// to accelerate the hash lookup.
// With SSE2 instructions, the best group width for linear probing is
// 16. (https://www.youtube.com/watch?v=ncHmEUmJZf4&t=1449s)

// SSE2:
// Streaming SIMD Extensions 2 is one of the Intel SIMD (single instruction, multiple data)
// processor supplementary instruction sets, introduced by Intel with the initial version
// of the Pentium 4 in 2000.
//
// SSSE3:
// Supplemental Streaming SIMD Extensions 3.
//
// AVX:
// Advanced Vector Extensions.

/*
 index |   0    |   1    |   2    |   3    |   4    | ... |   15   |
-------|--------|--------|--------|--------|--------|     |--------|
 value | (5,7)  |        | (39,8) |        |        | ... |        |
-------|--------|--------|--------|--------|--------|     |--------|
 ctrl  |01010111|10000000|00110110|10000000|10000000| ... |10000000|

1. hash map
A hash map uses an array as its backing store; in this context the
array elements are called buckets or slots. Each slot keeps the key
and the value together, so that colliding keys can be told apart
while probing.

2. load factor
It is the ratio of the number of elements in the hash map to the number
of buckets. Once a certain load factor is reached (like 0.5, 0.7 or 0.9),
the hash map should resize and rehash all of the key-value pairs.

3. optimization
Whenever the CPU needs to read/write a memory location, it checks its
caches; if the data is present, it is a cache hit, otherwise it is a
cache miss. On each cache miss we pay the cost of fetching the data from
main memory (thereby losing a few hundred CPU cycles of waiting). One
way to reduce misses is to get rid of external data structures
completely, and use the same array for storing values alongside buckets.

4. hash collision solution
Open addressing: on collision, traverse the array linearly. It is
cache-friendly and saves CPU instruction cycles.

5. key-value deletion
5.1 In addition to removing the pair, we also move the next pair into
that slot (backward shift).
5.2 Or we mark removed slots with a special flag (tombstone), and skip
flagged slots while probing. But tombstones degrade the effective load
factor and easily trigger resize and rehash.
5.3 robin hood hashing
In robin hood hashing, you follow one rule: if the probe distance of
the element sitting in the current slot is less than the probe distance
of the element being inserted, swap the two elements and proceed (see
the sketch after this comment block).
*/
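
// A minimal Go sketch of the robin-hood rule in 5.3 above. Illustrative
// only: this map uses plain linear probing instead, and entry, home and
// probeDist are hypothetical helpers, not part of this package.
//
//	func robinHoodInsert(slots []entry, n int, cur entry) {
//		i := home(cur.key, n) // the slot cur's key hashes to
//		dist := 0             // cur's probe distance so far
//		for {
//			if slots[i].isEmpty() {
//				slots[i] = cur
//				return
//			}
//			if d := probeDist(slots[i], i, n); d < dist {
//				// The resident is closer to its home slot than cur:
//				// swap them and keep inserting the displaced entry.
//				slots[i], cur = cur, slots[i]
//				dist = d
//			}
//			i = (i + 1) % n
//			dist++
//		}
//	}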

//go:generate go run ./simd/asm.go -out fast_hash_match.s -stubs fast_hash_match_amd64.go

const (
	slotSize              = 16 // In order to find the results in 4 CPU instructions
	maxAvgSlotLoad        = 14
	h1Mask         uint64 = 0xffff_ffff_ffff_ff80
	h2Mask         uint64 = 0x0000_0000_0000_007f
	empty          int8   = -128 // 0b1000_0000, 0x80; https://github.com/abseil/abseil-cpp/blob/61e47a454c81eb07147b0315485f476513cc1230/absl/container/internal/raw_hash_set.h#L505
	deleted        int8   = -2   // 0b1111_1110, 0xFE; https://github.com/abseil/abseil-cpp/blob/61e47a454c81eb07147b0315485f476513cc1230/absl/container/internal/raw_hash_set.h#L506
)
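
// Control byte states, following the abseil references above:
//
//	empty:   0b1000_0000
//	deleted: 0b1111_1110 (tombstone)
//	full:    0b0hhh_hhhh, where hhhhhhh is the 7-bit h2 of the stored key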

type kvError string

func (e kvError) Error() string { return string(e) }

const (
	errSwissMapConcurrentRehash = kvError("[swiss-map] concurrent rehash")
	errSwissMapNextSlotsCapOvf  = kvError("[swiss-map] slots overflow")
)

// amd64 && !nosimd: 256 * 1024 * 1024; !amd64 || nosimd: 512 * 1024 * 1024.
var maxSlotCap = 1 << (32 - ibits.CeilPowOf2(slotSize))

// A 57-bit hash prefix: the high 57 bits of the 64-bit hash,
// kept in an unsigned 64-bit integer.
// Used as an index into the slot-group array.
type h1 uint64

// A 7-bit hash suffix: the low 7 bits of the hash,
// stored in a FULL control byte.
type h2 int8

type bitset uint16

type swissMapMetadata [slotSize]int8

func (md *swissMapMetadata) matchH2(hash h2) bitset {
	b := Fast16WayHashMatch((*[slotSize]int8)(md), int8(hash))
	return bitset(b)
}

func (md *swissMapMetadata) matchEmpty() bitset {
	b := Fast16WayHashMatch((*[slotSize]int8)(md), empty)
	return bitset(b)
}

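// A portable scalar sketch of what Fast16WayHashMatch computes. This is a
// reference only; the real implementation is generated by the go:generate
// directive above and may use SIMD instructions:
//
//	func scalarMatch(md *[slotSize]int8, h int8) uint16 {
//		var b uint16
//		for i := 0; i < slotSize; i++ {
//			if md[i] == h {
//				b |= 1 << i // bit i set => control byte i matches
//			}
//		}
//		return b
//	}
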
// Arrays are cache friendly.
type swissMapSlot[K comparable, V any] struct {
	keys [slotSize]K
	vals [slotSize]V
}

type swissMap[K comparable, V any] struct {
	ctrlMetadataSet []swissMapMetadata
	slots           []swissMapSlot[K, V]
	hasher          Hasher[K]
	resident        uint64 // count of FULL control bytes, tombstones included
	dead            uint64 // count of tombstone (deleted) elements
	limit           uint64 // max resident elements
	slotCap         uint32
}

func (m *swissMap[K, V]) Put(key K, val V) error {
	if m.resident >= m.limit {
		n, err := m.nextCap()
		if err != nil {
			return infra.WrapErrorStack(err)
		}
		if err = m.rehash(n); err != nil {
			return infra.WrapErrorStack(err)
		}
	}
	m.put(key, val)
	return nil
}

func (m *swissMap[K, V]) put(key K, val V) {
	h1, h2 := splitHash(m.hasher.Hash(key))
	i := findSlotIndex(h1, atomic.LoadUint32(&m.slotCap))
	for {
		for result := m.ctrlMetadataSet[i].matchH2(h2); /* exists */ result != 0; {
			if /* hash collision */ j := nextIndexInSlot(&result); /* key equal, update */ key == m.slots[i].keys[j] {
				m.slots[i].keys[j] = key
				m.slots[i].vals[j] = val
				return
			}
		}

		if /* not found */ result := m.ctrlMetadataSet[i].matchEmpty(); /* insert */ result != 0 {
			n := nextIndexInSlot(&result)
			m.slots[i].keys[n] = key
			m.slots[i].vals[n] = val
			m.ctrlMetadataSet[i][n] = int8(h2)
			m.resident++
			return
		}
		if /* open-addressing (linear-probing) */ i += 1; /* close loop */ i >= atomic.LoadUint32(&m.slotCap) {
			i = 0
		}
	}
}
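
// Note: the probe loops in put, Get and Delete always terminate. Put keeps
// resident (the number of FULL control bytes, tombstones included) below
// limit, which is 14/16 of the total slot capacity, so at least one slot
// group always contains an empty control byte.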

func (m *swissMap[K, V]) Get(key K) (val V, exists bool) {
	h1, h2 := splitHash(m.hasher.Hash(key))
	i := findSlotIndex(h1, atomic.LoadUint32(&m.slotCap))
	for {
		for result := m.ctrlMetadataSet[i].matchH2(h2); /* exists */ result != 0; {
			if /* hash collision */ j := nextIndexInSlot(&result); /* found */ key == m.slots[i].keys[j] {
				return m.slots[i].vals[j], true
			}
		}
		if /* not found */ m.ctrlMetadataSet[i].matchEmpty() != 0 {
			return val, false
		}
		if /* open-addressing (linear-probing) */ i += 1; /* close loop */ i >= atomic.LoadUint32(&m.slotCap) {
			i = 0
		}
	}
}

func (m *swissMap[K, V]) Foreach(action func(i uint64, key K, val V) bool) {
	oldCtrlMetadataSet, oldSlots, oldSlotCap := m.ctrlMetadataSet, m.slots, atomic.LoadUint32(&m.slotCap)
	rngIdx := randv2.Uint32N(oldSlotCap) // start iterating from a random slot group
	idx := uint64(0)
	var _continue bool
	for i := uint32(0); i < oldSlotCap; i++ {
		for j, md := range oldCtrlMetadataSet[rngIdx] {
			if md == empty || md == deleted {
				continue
			}
			k, v := oldSlots[rngIdx].keys[j], oldSlots[rngIdx].vals[j]
			if _continue = action(idx, k, v); !_continue {
				return
			}
			idx++
		}
		if /* open-addressing (linear-probing) */ rngIdx += 1; /* close loop */ rngIdx >= oldSlotCap {
			rngIdx = 0
		}
	}
}

func (m *swissMap[K, V]) Delete(key K) (val V, err error) {
	h1, h2 := splitHash(m.hasher.Hash(key))
	i := findSlotIndex(h1, atomic.LoadUint32(&m.slotCap))
	for {
		for result := m.ctrlMetadataSet[i].matchH2(h2); /* exists */ result != 0; {
			if /* hash collision */ j := nextIndexInSlot(&result); /* found */ key == m.slots[i].keys[j] {
				val = m.slots[i].vals[j]

				if m.ctrlMetadataSet[i].matchEmpty() > 0 {
					// This group still contains an empty control byte, so
					// no probe sequence for another key continues past it.
					// The slot can be marked empty instead of deleted,
					// which lets later linear-probing terminate quickly.
					m.ctrlMetadataSet[i][j] = empty
					m.resident--
				} else {
					m.ctrlMetadataSet[i][j] = deleted
					m.dead++
				}

				var (
					k K
					v V
				)
				m.slots[i].keys[j] = k
				m.slots[i].vals[j] = v
				return
			}
		}
		if /* not found */ m.ctrlMetadataSet[i].matchEmpty() != 0 {
			// The probe started at the key's home slot group. If the key
			// is not in this group and the group contains an empty control
			// byte, the key was never stored past this group, so it does
			// not exist.
			return val, errors.New("[swiss-map] not found to delete")
		}

		if /* open-addressing (linear-probing) */ i += 1; /* close loop */ i >= atomic.LoadUint32(&m.slotCap) {
			i = 0
		}
	}
}

func (m *swissMap[K, V]) Clear() {
	var (
		k K
		v V
	)
	for i := uint32(0); i < atomic.LoadUint32(&m.slotCap); i++ {
		slot := &m.slots[i]
		for j := 0; j < slotSize; j++ {
			m.ctrlMetadataSet[i][j] = empty
			slot.keys[j] = k
			slot.vals[j] = v
		}
	}
	m.resident, m.dead = 0, 0
}

func (m *swissMap[K, V]) MigrateFrom(_m map[K]V) error {
	var merr error
	for k, v := range _m {
		if err := m.Put(k, v); err != nil {
			merr = multierr.Append(merr, err)
		}
	}
	return infra.WrapErrorStack(merr)
}

func (m *swissMap[K, V]) Len() int64 {
	return int64(m.resident - m.dead)
}

func (m *swissMap[K, V]) Cap() int64 {
	return int64(m.limit - m.resident)
}

func (m *swissMap[K, V]) nextCap() (uint32, error) {
	if m.dead >= (m.resident >> 1) {
		// Plenty of tombstones: rehash in place at the current capacity
		// to reclaim the dead slots instead of growing.
		return atomic.LoadUint32(&m.slotCap), nil
	}
	newCap := int64(atomic.LoadUint32(&m.slotCap)) * 2
	if newCap > int64(maxSlotCap) {
		return 0, infra.WrapErrorStack(errSwissMapNextSlotsCapOvf)
	}
	return uint32(newCap), nil
}

func (m *swissMap[K, V]) rehash(newCapacity uint32) error {
	oldCtrlMetadataSet, oldSlots, oldSlotCap := m.ctrlMetadataSet, m.slots, atomic.LoadUint32(&m.slotCap)
	if !atomic.CompareAndSwapUint32(&m.slotCap, oldSlotCap, newCapacity) {
		return infra.WrapErrorStack(errSwissMapConcurrentRehash)
	}

	m.slots = make([]swissMapSlot[K, V], newCapacity)
	m.ctrlMetadataSet = make([]swissMapMetadata, newCapacity)
	for i := uint32(0); i < atomic.LoadUint32(&m.slotCap); i++ {
		m.ctrlMetadataSet[i] = newEmptyMetadata()
	}

	m.hasher = newSeedHasher[K](m.hasher)
	m.limit = uint64(newCapacity) * maxAvgSlotLoad
	m.resident, m.dead = 0, 0
	for i := uint32(0); i < oldSlotCap; i++ {
		for j := 0; j < slotSize; j++ {
			if md := oldCtrlMetadataSet[i][j]; md == empty || md == deleted {
				continue
			}
			m.put(oldSlots[i].keys[j], oldSlots[i].vals[j])
		}
	}
	return nil
}

func (m *swissMap[K, V]) loadFactor() float64 {
	// Convert before multiplying so slotCap*slotSize cannot overflow uint32.
	total := float64(atomic.LoadUint32(&m.slotCap)) * slotSize
	return float64(m.resident-m.dead) / total
}

// @param capacity, how many elements will be stored in the map
func newSwissMap[K comparable, V any](capacity uint32) *swissMap[K, V] {
	slotCap := calcSlotCapacity(capacity)
	m := &swissMap[K, V]{
		ctrlMetadataSet: make([]swissMapMetadata, slotCap),
		slots:           make([]swissMapSlot[K, V], slotCap),
		slotCap:         slotCap,
		hasher:          newHasher[K](),
		resident:        0,
		dead:            0,
		limit:           uint64(slotCap) * maxAvgSlotLoad,
	}
	for i := uint32(0); i < slotCap; i++ {
		m.ctrlMetadataSet[i] = newEmptyMetadata()
	}
	return m
}

func NewSwissMap[K comparable, V any](capacity uint32) Map[K, V] {
	return newSwissMap[K, V](capacity)
}
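
// A minimal usage sketch, assuming the package-level Map interface exposes
// the methods implemented in this file (Put, Get, Delete):
//
//	m := NewSwissMap[string, int](64)
//	if err := m.Put("a", 1); err != nil {
//		panic(err)
//	}
//	v, ok := m.Get("a") // v == 1, ok == true
//	_, _ = v, ok
//	if _, err := m.Delete("a"); err != nil {
//		panic(err)
//	}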

func calcSlotCapacity(size uint32) uint32 {
	groupCap := (size + maxAvgSlotLoad - 1) / maxAvgSlotLoad
	if groupCap == 0 {
		groupCap = 1
	}
	return groupCap
}
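
// For example, calcSlotCapacity(100) = ceil(100 / 14) = 8 slot groups,
// i.e. 8 * 16 = 128 slots and a resident limit of 8 * 14 = 112.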

func newEmptyMetadata() swissMapMetadata {
	var m swissMapMetadata
	for i := 0; i < slotSize; i++ {
		m[i] = empty
	}
	return m
}

func splitHash(hash uint64) (hi h1, lo h2) {
	return h1((hash & h1Mask) >> 7), h2(hash & h2Mask)
}
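
// For example, splitHash(0xABCD) yields
// h1 = (0xABCD & h1Mask) >> 7 = 0x157 and h2 = 0xABCD & h2Mask = 0x4D.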

// Check which slot group the key will be placed in.
// Fast mod N here should not be computed as uint32(X) & uint32(N - 1):
// that would require the slot-group count to be a power of two, and it
// performs badly for swiss-map puts.
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
func findSlotIndex(hi h1, slots uint32) uint32 {
	// This is not equal to uint32(X) & uint32(N - 1), but it
	// distributes X uniformly over [0, N).
	return uint32((uint64(uint32(hi)) * uint64(slots)) >> 32)
}
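
// For example, with slots = 8 and uint32(hi) = 0xC000_0000 (3/4 of the
// uint32 range), findSlotIndex returns uint32((0xC000_0000 * 8) >> 32) = 6.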

// On hash collision, take the trailing set bit of the match bitset as the
// next in-group index, then clear that bit.
func nextIndexInSlot(bs *bitset) uint32 {
	trail := uint32(bits.TrailingZeros16(uint16(*bs)))
	*bs &= ^(1 << trail)
	return trail
}
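
// For example, with *bs = 0b0100_0100 the first call returns 2 and leaves
// *bs = 0b0100_0000; the second call returns 6 and leaves *bs = 0.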