github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/cache/robin_hood.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package cache 6 7 import ( 8 "fmt" 9 "math/bits" 10 "os" 11 "runtime/debug" 12 "strings" 13 "time" 14 "unsafe" 15 16 "github.com/zuoyebang/bitalostable/internal/invariants" 17 "github.com/zuoyebang/bitalostable/internal/manual" 18 ) 19 20 var hashSeed = uint64(time.Now().UnixNano()) 21 22 // Fibonacci hash: https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ 23 func robinHoodHash(k key, shift uint32) uint32 { 24 const m = 11400714819323198485 25 h := hashSeed 26 h ^= k.id * m 27 h ^= uint64(k.fileNum) * m 28 h ^= k.offset * m 29 return uint32(h >> shift) 30 } 31 32 type robinHoodEntry struct { 33 key key 34 // Note that value may point to a Go allocated object (if the "invariants" 35 // build tag was specified), even though the memory for the entry itself is 36 // manually managed. This is technically a volation of the Cgo pointer rules: 37 // 38 // https://golang.org/cmd/cgo/#hdr-Passing_pointers 39 // 40 // Specifically, Go pointers should not be stored in C allocated memory. The 41 // reason for this rule is that the Go GC will not look at C allocated memory 42 // to find pointers to Go objects. If the only reference to a Go object is 43 // stored in C allocated memory, the object will be reclaimed. What makes 44 // this "safe" is that the Cache guarantees that there are other pointers to 45 // the entry and shard which will keep them alive. In particular, every Go 46 // allocated entry in the cache is referenced by the shard.entries map. And 47 // every shard is referenced by the Cache.shards map. 48 value *entry 49 // The distance the entry is from its desired position. 50 dist uint32 51 } 52 53 type robinHoodEntries struct { 54 ptr unsafe.Pointer 55 len uint32 56 } 57 58 func newRobinHoodEntries(n uint32) robinHoodEntries { 59 size := uintptr(n) * unsafe.Sizeof(robinHoodEntry{}) 60 return robinHoodEntries{ 61 ptr: unsafe.Pointer(&(manual.New(int(size)))[0]), 62 len: n, 63 } 64 } 65 66 func (e robinHoodEntries) at(i uint32) *robinHoodEntry { 67 return (*robinHoodEntry)(unsafe.Pointer(uintptr(e.ptr) + 68 uintptr(i)*unsafe.Sizeof(robinHoodEntry{}))) 69 } 70 71 func (e robinHoodEntries) free() { 72 size := uintptr(e.len) * unsafe.Sizeof(robinHoodEntry{}) 73 buf := (*[manual.MaxArrayLen]byte)(e.ptr)[:size:size] 74 manual.Free(buf) 75 } 76 77 // robinHoodMap is an implementation of Robin Hood hashing. Robin Hood hashing 78 // is an open-address hash table using linear probing. The twist is that the 79 // linear probe distance is reduced by moving existing entries when inserting 80 // and deleting. This is accomplished by keeping track of how far an entry is 81 // from its "desired" slot (hash of key modulo number of slots). During 82 // insertion, if the new entry being inserted is farther from its desired slot 83 // than the target entry, we swap the target and new entry. This effectively 84 // steals from the "rich" target entry and gives to the "poor" new entry (thus 85 // the origin of the name). 86 // 87 // An extension over the base Robin Hood hashing idea comes from 88 // https://probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/. A cap 89 // is placed on the max distance an entry can be from its desired slot. When 90 // this threshold is reached during insertion, the size of the table is doubled 91 // and insertion is restarted. Additionally, the entries slice is given "max 92 // dist" extra entries on the end. The very last entry in the entries slice is 93 // never used and acts as a sentinel which terminates loops. The previous 94 // maxDist-1 entries act as the extra entries. For example, if the size of the 95 // table is 2, maxDist is computed as 4 and the actual size of the entry slice 96 // is 6. 97 // 98 // +---+---+---+---+---+---+ 99 // | 0 | 1 | 2 | 3 | 4 | 5 | 100 // +---+---+---+---+---+---+ 101 // ^ 102 // size 103 // 104 // In this scenario, the target entry for a key will always be in the range 105 // [0,1]. Valid entries may reside in the range [0,4] due to the linear probing 106 // of up to maxDist entries. The entry at index 5 will never contain a value, 107 // and instead acts as a sentinel (its distance is always 0). The max distance 108 // threshold is set to log2(num-entries). This ensures that retrieval is O(log 109 // N), though note that N is the number of total entries, not the count of 110 // valid entries. 111 // 112 // Deletion is implemented via the backward shift delete mechanism instead of 113 // tombstones. This preserves the performance of the table in the presence of 114 // deletions. See 115 // http://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion 116 // for details. 117 type robinHoodMap struct { 118 entries robinHoodEntries 119 size uint32 120 shift uint32 121 count uint32 122 maxDist uint32 123 } 124 125 func maxDistForSize(size uint32) uint32 { 126 desired := uint32(bits.Len32(size)) 127 if desired < 4 { 128 desired = 4 129 } 130 return desired 131 } 132 133 func newRobinHoodMap(initialCapacity int) *robinHoodMap { 134 m := &robinHoodMap{} 135 m.init(initialCapacity) 136 137 // Note: this is a no-op if invariants are disabled or race is enabled. 138 invariants.SetFinalizer(m, func(obj interface{}) { 139 m := obj.(*robinHoodMap) 140 if m.entries.ptr != nil { 141 fmt.Fprintf(os.Stderr, "%p: robin-hood map not freed\n", m) 142 os.Exit(1) 143 } 144 }) 145 return m 146 } 147 148 func (m *robinHoodMap) init(initialCapacity int) { 149 if initialCapacity < 1 { 150 initialCapacity = 1 151 } 152 targetSize := 1 << (uint(bits.Len(uint(2*initialCapacity-1))) - 1) 153 m.rehash(uint32(targetSize)) 154 } 155 156 func (m *robinHoodMap) free() { 157 if m.entries.ptr != nil { 158 m.entries.free() 159 m.entries.ptr = nil 160 } 161 } 162 163 func (m *robinHoodMap) rehash(size uint32) { 164 oldEntries := m.entries 165 166 m.size = size 167 m.shift = uint32(64 - bits.Len32(m.size-1)) 168 m.maxDist = maxDistForSize(size) 169 m.entries = newRobinHoodEntries(size + m.maxDist) 170 m.count = 0 171 172 for i := uint32(0); i < oldEntries.len; i++ { 173 e := oldEntries.at(i) 174 if e.value != nil { 175 m.Put(e.key, e.value) 176 } 177 } 178 179 if oldEntries.ptr != nil { 180 oldEntries.free() 181 } 182 } 183 184 // Find an entry containing the specified value. This is intended to be used 185 // from debug and test code. 186 func (m *robinHoodMap) findByValue(v *entry) *robinHoodEntry { 187 for i := uint32(0); i < m.entries.len; i++ { 188 e := m.entries.at(i) 189 if e.value == v { 190 return e 191 } 192 } 193 return nil 194 } 195 196 func (m *robinHoodMap) Count() int { 197 return int(m.count) 198 } 199 200 func (m *robinHoodMap) Put(k key, v *entry) { 201 maybeExists := true 202 n := robinHoodEntry{key: k, value: v, dist: 0} 203 for i := robinHoodHash(k, m.shift); ; i++ { 204 e := m.entries.at(i) 205 if maybeExists && k == e.key { 206 // Entry already exists: overwrite. 207 e.value = n.value 208 m.checkEntry(i) 209 return 210 } 211 212 if e.value == nil { 213 // Found an empty entry: insert here. 214 *e = n 215 m.count++ 216 m.checkEntry(i) 217 return 218 } 219 220 if e.dist < n.dist { 221 // Swap the new entry with the current entry because the current is 222 // rich. We then continue to loop, looking for a new location for the 223 // current entry. Note that this is also the not-found condition for 224 // retrieval, which means that "k" is not present in the map. See Get(). 225 n, *e = *e, n 226 m.checkEntry(i) 227 maybeExists = false 228 } 229 230 // The new entry gradually moves away from its ideal position. 231 n.dist++ 232 233 // If we've reached the max distance threshold, grow the table and restart 234 // the insertion. 235 if n.dist == m.maxDist { 236 m.rehash(2 * m.size) 237 i = robinHoodHash(n.key, m.shift) - 1 238 n.dist = 0 239 maybeExists = false 240 } 241 } 242 } 243 244 func (m *robinHoodMap) Get(k key) *entry { 245 var dist uint32 246 for i := robinHoodHash(k, m.shift); ; i++ { 247 e := m.entries.at(i) 248 if k == e.key { 249 // Found. 250 return e.value 251 } 252 if e.dist < dist { 253 // Not found. 254 return nil 255 } 256 dist++ 257 } 258 } 259 260 func (m *robinHoodMap) Delete(k key) { 261 var dist uint32 262 for i := robinHoodHash(k, m.shift); ; i++ { 263 e := m.entries.at(i) 264 if k == e.key { 265 m.checkEntry(i) 266 // We found the entry to delete. Shift the following entries backwards 267 // until the next empty value or entry with a zero distance. Note that 268 // empty values are guaranteed to have "dist == 0". 269 m.count-- 270 for j := i + 1; ; j++ { 271 t := m.entries.at(j) 272 if t.dist == 0 { 273 *e = robinHoodEntry{} 274 return 275 } 276 e.key = t.key 277 e.value = t.value 278 e.dist = t.dist - 1 279 e = t 280 m.checkEntry(j) 281 } 282 } 283 if dist > e.dist { 284 // Not found. 285 return 286 } 287 dist++ 288 } 289 } 290 291 func (m *robinHoodMap) checkEntry(i uint32) { 292 if invariants.Enabled { 293 e := m.entries.at(i) 294 if e.value != nil { 295 pos := robinHoodHash(e.key, m.shift) 296 if (uint32(i) - pos) != e.dist { 297 fmt.Fprintf(os.Stderr, "%d: invalid dist=%d, expected %d: %s\n%s", 298 i, e.dist, uint32(i)-pos, e.key, debug.Stack()) 299 os.Exit(1) 300 } 301 if e.dist > m.maxDist { 302 fmt.Fprintf(os.Stderr, "%d: invalid dist=%d > maxDist=%d: %s\n%s", 303 i, e.dist, m.maxDist, e.key, debug.Stack()) 304 os.Exit(1) 305 } 306 } 307 } 308 } 309 310 func (m *robinHoodMap) String() string { 311 var buf strings.Builder 312 fmt.Fprintf(&buf, "count: %d\n", m.count) 313 for i := uint32(0); i < m.entries.len; i++ { 314 e := m.entries.at(i) 315 if e.value != nil { 316 fmt.Fprintf(&buf, "%d: [%s,%p,%d]\n", i, e.key, e.value, e.dist) 317 } 318 } 319 return buf.String() 320 }