package kv

import (
	"errors"
	"math/bits"
	randv2 "math/rand/v2"
	"sync/atomic"

	"go.uber.org/multierr"

	ibits "github.com/benz9527/xboot/lib/bits"
	"github.com/benz9527/xboot/lib/infra"
)

// References:
// https://github.com/CppCon/CppCon2017
// https://www.dolthub.com/blog/2023-03-28-swiss-map/
// https://github.com/dolthub/swiss/blob/main/map.go
// https://github.com/thepudds/swisstable/blob/main/map.go
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
// https://faultlore.com/blah/hashbrown-tldr/
// https://rcoh.me/posts/hash-map-analysis/
// https://github.com/abseil/abseil-cpp/blob/master/absl/container/flat_hash_map.h
// https://github.com/rust-lang/hashbrown
// https://blog.waffles.space/2018/12/07/deep-dive-into-hashbrown/#fn:4
// https://methane.hatenablog.jp/entry/2022/02/22/Swisstable_Hash_%E3%81%AB%E4%BD%BF%E3%82%8F%E3%82%8C%E3%81%A6%E3%81%84%E3%82%8B%E3%83%93%E3%83%83%E3%83%88%E6%BC%94%E7%AE%97%E3%81%AE%E9%AD%94%E8%A1%93
// https://www.youtube.com/watch?v=JZE3_0qvrMg
// https://github.com/abseil/abseil-cpp/blob/master/absl/container/internal/raw_hash_set.h

// Swiss Table, also known as a flat hash map.
// Each control (hash) slot maps to a key-value pair slot.
// The short hash taken from the low bits (1 byte) is an optimization
// that accelerates lookups.
// With SSE2 instructions, the best group width for linear probing is
// 16 (https://www.youtube.com/watch?v=ncHmEUmJZf4&t=1449s).

// SSE2:
// Streaming SIMD Extensions 2 is one of the Intel SIMD (single instruction,
// multiple data) supplementary instruction sets, introduced with the initial
// version of the Pentium 4 in 2000.
//
// SSSE3:
// Supplemental Streaming SIMD Extensions 3 (SSSE3).
//
// AVX:
// Advanced Vector Extensions.

/*
index  |   0    |   1    |   2    |   3    |   4    |  ...   |   15   |
-------|--------|--------|--------|--------|--------|        |--------|
value  | (5,7)  |        | (39,8) |        |        |  ...   |        |
-------|--------|--------|--------|--------|--------|        |--------|
ctrl   |01010111|10000000|00110110|10000000|10000000|  ...   |10000000|

1. hash map
It uses a flat array as its backend. In the context of a hash map, the
array elements are called buckets or slots. Each slot stores the key and
the value together, so a probe can verify the full key whenever the short
hashes collide.

2. load factor
It is the ratio of the number of elements in the hash map to the number
of buckets. Once a certain load factor is reached (like 0.5, 0.7 or 0.9),
the hash map must resize and rehash all the key-value pairs.

3. optimization
Whenever the CPU needs to read/write a memory location, it checks its
caches: if the data is present, it is a cache hit, otherwise a cache
miss. Every cache miss pays the cost of fetching the data from main
memory (losing a few hundred CPU cycles of waiting). One optimization is
to get rid of external chaining structures completely and store the
values in the same flat array as the buckets, which keeps probes inside
cached memory.

4. hash collision solution
Open addressing: traverse the array linearly. It is cache friendly and
saves CPU instruction cycles.

5. key-value deletion
5.1 Besides removing the pair, move the following pairs of the probe
chain back into the vacated slot (backward shift deletion).
5.2 Or add a special flag (tombstone) to removed slots, and skip any
slot carrying that flag while probing. However, tombstones degrade the
effective load factor and easily trigger a resize and rehash.
5.3 robin hood hashing
Robin hood hashing follows one rule: if the distance of the resident
element to its home slot is less than the distance of the element being
inserted to its home slot, swap the two elements and proceed (see the
sketch right after this comment block).
*/
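// The sketch below illustrates rule 5.3; it is a hedged, illustrative
// example only and is not used by this map, which adopts the tombstone
// approach from 5.2 instead. The rhEntry layout and the probe-distance
// bookkeeping are assumptions made purely for the illustration.
type rhEntry struct {
	key  uint64
	val  uint64
	dist int8 // probe distance from the home slot; -1 marks an empty entry
}

// robinHoodPut inserts with the robin hood rule: whenever the resident
// entry sits closer to its home slot than the entry being inserted, the
// two are swapped and probing continues with the evicted entry. For
// brevity it assumes the table never fills up and ignores updates of an
// already-present key.
func robinHoodPut(entries []rhEntry, key, val uint64) {
	e := rhEntry{key: key, val: val, dist: 0}
	i := int(key % uint64(len(entries))) // home slot (illustrative hash)
	for {
		if entries[i].dist < 0 { // empty slot: claim it
			entries[i] = e
			return
		}
		if entries[i].dist < e.dist { // resident is "richer": swap and carry on
			entries[i], e = e, entries[i]
		}
		e.dist++
		i = (i + 1) % len(entries)
	}
}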
//go:generate go run ./simd/asm.go -out fast_hash_match.s -stubs fast_hash_match_amd64.go

const (
	slotSize       = 16 // So that a whole group can be matched in ~4 CPU instructions
	maxAvgSlotLoad = 14
	h1Mask  uint64 = 0xffff_ffff_ffff_ff80
	h2Mask  uint64 = 0x0000_0000_0000_007f
	empty   int8   = -128 // 0b1000_0000, 0x80; https://github.com/abseil/abseil-cpp/blob/61e47a454c81eb07147b0315485f476513cc1230/absl/container/internal/raw_hash_set.h#L505
	deleted int8   = -2   // 0b1111_1110, 0xFE; https://github.com/abseil/abseil-cpp/blob/61e47a454c81eb07147b0315485f476513cc1230/absl/container/internal/raw_hash_set.h#L506
)

type kvError string

func (e kvError) Error() string { return string(e) }

const (
	errSwissMapConcurrentRehash = kvError("[swiss-map] concurrent rehash")
	errSwissMapNextSlotsCapOvf  = kvError("[swiss-map] slots overflow")
)

// amd64 && !nosimd: 256 * 1024 * 1024; !amd64 || nosimd: 512 * 1024 * 1024
var maxSlotCap = 1 << (32 - ibits.CeilPowOf2(slotSize))

// A 57-bit hash prefix (the high 57 bits of the 64-bit hash),
// truncated to an unsigned 64-bit integer.
// Used as an index into the groups (slots) array.
type h1 uint64

// A 7-bit hash suffix (the low 7 bits of the 64-bit hash).
// Stored in FULL control byte format.
type h2 int8

type bitset uint16

type swissMapMetadata [slotSize]int8

func (md *swissMapMetadata) matchH2(hash h2) bitset {
	b := Fast16WayHashMatch((*[slotSize]int8)(md), int8(hash))
	return bitset(b)
}

func (md *swissMapMetadata) matchEmpty() bitset {
	b := Fast16WayHashMatch((*[slotSize]int8)(md), empty)
	return bitset(b)
}
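// portable16WayHashMatch is a hedged, non-SIMD sketch of the semantics of
// the generated Fast16WayHashMatch stub used above (as inferred from its
// call sites): bit i of the result is set iff md[i] equals hash, so the
// lowest candidate index is recovered via trailing zeros. The real stub
// computes this in a few SSE2 instructions; this loop is illustrative only
// and is not called by the map.
func portable16WayHashMatch(md *[slotSize]int8, hash int8) uint16 {
	var b uint16
	for i := 0; i < slotSize; i++ {
		if md[i] == hash {
			b |= 1 << i
		}
	}
	return b
}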
// Flat arrays are cache friendly.
type swissMapSlot[K comparable, V any] struct {
	keys [slotSize]K
	vals [slotSize]V
}

type swissMap[K comparable, V any] struct {
	ctrlMetadataSet []swissMapMetadata
	slots           []swissMapSlot[K, V]
	hasher          Hasher[K]
	resident        uint64 // current alive elements
	dead            uint64 // current tombstone elements
	limit           uint64 // max resident elements
	slotCap         uint32
}

func (m *swissMap[K, V]) Put(key K, val V) error {
	if m.resident >= m.limit {
		n, err := m.nextCap()
		if err != nil {
			return infra.WrapErrorStack(err)
		}
		if err = m.rehash(n); err != nil {
			return infra.WrapErrorStack(err)
		}
	}
	m.put(key, val)
	return nil
}

func (m *swissMap[K, V]) put(key K, val V) {
	h1, h2 := splitHash(m.hasher.Hash(key))
	i := findSlotIndex(h1, atomic.LoadUint32(&m.slotCap))
	for {
		for result := m.ctrlMetadataSet[i].matchH2(h2); /* h2 exists */ result != 0; {
			if /* hash collision */ j := nextIndexInSlot(&result);
			/* keys equal, update */ key == m.slots[i].keys[j] {
				m.slots[i].keys[j] = key
				m.slots[i].vals[j] = val
				return
			}
		}

		if /* not found */ result := m.ctrlMetadataSet[i].matchEmpty(); /* insert */ result != 0 {
			n := nextIndexInSlot(&result)
			m.slots[i].keys[n] = key
			m.slots[i].vals[n] = val
			m.ctrlMetadataSet[i][n] = int8(h2)
			m.resident++
			return
		}
		if /* open addressing (linear probing) */ i += 1; /* wrap around */ i >= atomic.LoadUint32(&m.slotCap) {
			i = 0
		}
	}
}

func (m *swissMap[K, V]) Get(key K) (val V, exists bool) {
	h1, h2 := splitHash(m.hasher.Hash(key))
	i := findSlotIndex(h1, atomic.LoadUint32(&m.slotCap))
	for {
		for result := m.ctrlMetadataSet[i].matchH2(h2); /* h2 exists */ result != 0; {
			if /* hash collision */ j := nextIndexInSlot(&result); /* found */ key == m.slots[i].keys[j] {
				return m.slots[i].vals[j], true
			}
		}
		if /* not found */ m.ctrlMetadataSet[i].matchEmpty() != 0 {
			return val, false
		}
		if /* open addressing (linear probing) */ i += 1; /* wrap around */ i >= atomic.LoadUint32(&m.slotCap) {
			i = 0
		}
	}
}
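// probeStartExample is a hedged trace of the probe math shared by Put, Get,
// and Delete above; it is illustrative only and not called by the map. The
// 57-bit h1 prefix selects the starting slot, the 7-bit h2 suffix is matched
// 16-way against that slot's control bytes, and on a miss probing advances
// linearly, wrapping back to slot 0 at slotCap.
func probeStartExample[K comparable, V any](m *swissMap[K, V], key K) (slot uint32, candidates bitset) {
	hi, lo := splitHash(m.hasher.Hash(key))
	slot = findSlotIndex(hi, atomic.LoadUint32(&m.slotCap))
	candidates = m.ctrlMetadataSet[slot].matchH2(lo)
	return slot, candidates
}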
func (m *swissMap[K, V]) Foreach(action func(i uint64, key K, val V) bool) {
	oldCtrlMetadataSet, oldSlots, oldSlotCap := m.ctrlMetadataSet, m.slots, atomic.LoadUint32(&m.slotCap)
	rngIdx := randv2.Uint32N(oldSlotCap) // start the traversal at a random slot
	idx := uint64(0)
	var _continue bool
	for i := uint32(0); i < oldSlotCap; i++ {
		for j, md := range oldCtrlMetadataSet[rngIdx] {
			if md == empty || md == deleted {
				continue
			}
			k, v := oldSlots[rngIdx].keys[j], oldSlots[rngIdx].vals[j]
			if _continue = action(idx, k, v); !_continue {
				return
			}
			idx++
		}
		if /* open addressing (linear probing) */ rngIdx += 1; /* wrap around */ rngIdx >= oldSlotCap {
			rngIdx = 0
		}
	}
}

func (m *swissMap[K, V]) Delete(key K) (val V, err error) {
	h1, h2 := splitHash(m.hasher.Hash(key))
	i := findSlotIndex(h1, atomic.LoadUint32(&m.slotCap))
	for {
		for result := m.ctrlMetadataSet[i].matchH2(h2); /* h2 exists */ result != 0; {
			if /* hash collision */ j := nextIndexInSlot(&result); /* found */ key == m.slots[i].keys[j] {
				val = m.slots[i].vals[j]

				if m.ctrlMetadataSet[i].matchEmpty() > 0 {
					// The SIMD 16-way hash match result starts from the trailing
					// bit. This slot still contains an empty control byte, so no
					// probe sequence has ever continued past it; the control byte
					// can be restored to empty instead of a tombstone, which lets
					// later linear probes terminate quickly.
					m.ctrlMetadataSet[i][j] = empty
					m.resident--
				} else {
					m.ctrlMetadataSet[i][j] = deleted
					m.dead++
				}

				var (
					k K
					v V
				)
				m.slots[i].keys[j] = k
				m.slots[i].vals[j] = v
				return
			}
		}
		if /* not found */ m.ctrlMetadataSet[i].matchEmpty() != 0 {
			// Probing started at the most likely slot. If the key is not
			// there, it must have been stored in a following slot; meeting
			// an empty control byte while probing for h2 means that the key
			// does not exist.
			return val, errors.New("[swiss-map] not found to delete")
		}

		if /* open addressing (linear probing) */ i += 1; /* wrap around */ i >= atomic.LoadUint32(&m.slotCap) {
			i = 0
		}
	}
}

func (m *swissMap[K, V]) Clear() {
	var (
		k K
		v V
	)
	for i := uint32(0); i < atomic.LoadUint32(&m.slotCap); i++ {
		slot := &m.slots[i]
		for j := 0; j < slotSize; j++ {
			m.ctrlMetadataSet[i][j] = empty
			slot.keys[j] = k
			slot.vals[j] = v
		}
	}
	m.resident, m.dead = 0, 0
}

func (m *swissMap[K, V]) MigrateFrom(_m map[K]V) error {
	var merr error
	for k, v := range _m {
		if err := m.Put(k, v); err != nil {
			merr = multierr.Append(merr, err)
		}
	}
	return infra.WrapErrorStack(merr)
}

func (m *swissMap[K, V]) Len() int64 {
	return int64(m.resident - m.dead)
}

func (m *swissMap[K, V]) Cap() int64 {
	return int64(m.limit - m.resident)
}

func (m *swissMap[K, V]) nextCap() (uint32, error) {
	if m.dead >= (m.resident >> 1) {
		// At least half of the residents are tombstones: rehash at the
		// same capacity to reclaim them instead of growing.
		return atomic.LoadUint32(&m.slotCap), nil
	}
	newCap := int64(atomic.LoadUint32(&m.slotCap)) * 2
	if newCap > int64(maxSlotCap) {
		return 0, infra.WrapErrorStack(errSwissMapNextSlotsCapOvf)
	}
	return uint32(newCap), nil
}

func (m *swissMap[K, V]) rehash(newCapacity uint32) error {
	oldCtrlMetadataSet, oldSlots, oldSlotCap := m.ctrlMetadataSet, m.slots, atomic.LoadUint32(&m.slotCap)
	if !atomic.CompareAndSwapUint32(&m.slotCap, oldSlotCap, newCapacity) {
		return infra.WrapErrorStack(errSwissMapConcurrentRehash)
	}

	m.slots = make([]swissMapSlot[K, V], newCapacity)
	m.ctrlMetadataSet = make([]swissMapMetadata, newCapacity)
	for i := uint32(0); i < atomic.LoadUint32(&m.slotCap); i++ {
		m.ctrlMetadataSet[i] = newEmptyMetadata()
	}

	m.hasher = newSeedHasher[K](m.hasher)
	m.limit = uint64(newCapacity) * maxAvgSlotLoad
	m.resident, m.dead = 0, 0
	for i := uint32(0); i < oldSlotCap; i++ {
		for j := 0; j < slotSize; j++ {
			if md := oldCtrlMetadataSet[i][j]; md == empty || md == deleted {
				continue
			}
			m.put(oldSlots[i].keys[j], oldSlots[i].vals[j])
		}
	}
	return nil
}

func (m *swissMap[K, V]) loadFactor() float64 {
	// Widen before multiplying: slotCap * slotSize can overflow uint32.
	total := float64(uint64(atomic.LoadUint32(&m.slotCap)) * slotSize)
	return float64(m.resident-m.dead) / total
}

// capacity is the expected number of elements to be stored in the map.
func newSwissMap[K comparable, V any](capacity uint32) *swissMap[K, V] {
	slotCap := calcSlotCapacity(capacity)
	m := &swissMap[K, V]{
		ctrlMetadataSet: make([]swissMapMetadata, slotCap),
		slots:           make([]swissMapSlot[K, V], slotCap),
		slotCap:         slotCap,
		hasher:          newHasher[K](),
		resident:        0,
		dead:            0,
		limit:           uint64(slotCap) * maxAvgSlotLoad,
	}
	for i := uint32(0); i < slotCap; i++ {
		m.ctrlMetadataSet[i] = newEmptyMetadata()
	}
	return m
}
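// A hedged usage sketch for the constructor below (names, sizes and values
// are illustrative only):
//
//	m := NewSwissMap[string, int](64) // sized for about 64 elements
//	_ = m.Put("answer", 42)
//	if v, ok := m.Get("answer"); ok {
//		_ = v // 42
//	}
//	if old, err := m.Delete("answer"); err == nil {
//		_ = old // 42
//	}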
func NewSwissMap[K comparable, V any](capacity uint32) Map[K, V] {
	return newSwissMap[K, V](capacity)
}

func calcSlotCapacity(size uint32) uint32 {
	groupCap := (size + maxAvgSlotLoad - 1) / maxAvgSlotLoad
	if groupCap == 0 {
		groupCap = 1
	}
	return groupCap
}

func newEmptyMetadata() swissMapMetadata {
	var m swissMapMetadata
	for i := 0; i < slotSize; i++ {
		m[i] = empty
	}
	return m
}

// Splits a 64-bit hash into the high 57 bits (h1) and the low 7 bits (h2).
func splitHash(hash uint64) (hi h1, lo h2) {
	return h1((hash & h1Mask) >> 7), h2(hash & h2Mask)
}

// Checks in which slot the key will be placed.
// A power-of-two mask, uint32(X) & uint32(N - 1), should not be chosen as
// the fast mod N here; it performs badly for swiss-map puts.
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
func findSlotIndex(hi h1, slots uint32) uint32 {
	// This is not equal to uint32(X) & uint32(N - 1), but it still maps
	// X uniformly onto [0, N).
	return uint32((uint64(uint32(hi)) * uint64(slots)) >> 32)
}

// On a hash collision (multiple match bits), take the trailing set bit as
// the next candidate in-slot index, then unset it.
func nextIndexInSlot(bs *bitset) uint32 {
	trail := uint32(bits.TrailingZeros16(uint16(*bs)))
	*bs &= ^(1 << trail)
	return trail
}
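// splitAndProbeMathExample is a hedged, worked example of the three helpers
// above; it is illustrative only and not called by the map.
func splitAndProbeMathExample() {
	// splitHash: hash 0xFF splits into h1 = 1 (high 57 bits) and
	// h2 = 0x7F (low 7 bits, a valid FULL control byte).
	hi, lo := splitHash(0xFF)
	_, _ = hi, lo // hi == 1, lo == 0x7F

	// findSlotIndex: with 8 slots, a 32-bit prefix of 0xC000_0000 (3/4 of
	// the 32-bit range) lands proportionally at (0xC0000000 * 8) >> 32 == 6.
	_ = findSlotIndex(h1(0xC000_0000), 8)

	// nextIndexInSlot: a match bitset 0b0010_0100 yields index 2 first and
	// clears that bit, so the next call yields index 5.
	bs := bitset(0b0010_0100)
	_ = nextIndexInSlot(&bs) // 2; bs is now 0b0010_0000
	_ = nextIndexInSlot(&bs) // 5; bs is now 0
}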