github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/kernel/futex/futex.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package futex provides an implementation of the futex interface as found in
// the Linux kernel. It allows one to easily transform Wait() calls into waits
// on a channel, which is useful in a Go-based kernel, for example.
package futex

import (
	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
	"github.com/ttpreport/gvisor-ligolo/pkg/context"
	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/memmap"
)

// KeyKind indicates the type of a Key.
type KeyKind int

const (
	// KindPrivate indicates a private futex (a futex syscall with the
	// FUTEX_PRIVATE_FLAG set).
	KindPrivate KeyKind = iota

	// KindSharedPrivate indicates a shared futex on a private memory mapping.
	// Although KindPrivate and KindSharedPrivate futexes both use memory
	// addresses to identify futexes, they do not interoperate (in Linux, the
	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
	// comparison).
	KindSharedPrivate

	// KindSharedMappable indicates a shared futex on a memory mapping other
	// than a private anonymous memory mapping.
	KindSharedMappable
)

// Key represents something that a futex waiter may wait on.
type Key struct {
	// Kind is the type of the Key.
	Kind KeyKind

	// Mappable is the memory-mapped object that is represented by the Key.
	// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
	// even if it is.
	Mappable memmap.Mappable

	// MappingIdentity is the MappingIdentity associated with Mappable.
	// MappingIdentity is always nil if Mappable is nil, and may be nil even if
	// it isn't.
	MappingIdentity memmap.MappingIdentity

	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
	// memory address. Otherwise, Offset is the represented offset into
	// Mappable.
	Offset uint64
}

func (k *Key) release(t Target) {
	if k.MappingIdentity != nil {
		k.MappingIdentity.DecRef(t)
	}
	k.Mappable = nil
	k.MappingIdentity = nil
}

func (k *Key) clone() Key {
	if k.MappingIdentity != nil {
		k.MappingIdentity.IncRef()
	}
	return *k
}

// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
func (k *Key) addr() hostarch.Addr {
	return hostarch.Addr(k.Offset)
}

// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
func (k *Key) matches(k2 *Key) bool {
	// k.MappingIdentity is ignored; it's only used for reference counting.
	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
}
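// For illustration (a hand-built sketch; real keys come from getKey or
// Target.GetSharedKey below): two private futexes at the same address match,
// while a private and a shared-private futex at the same address do not,
// mirroring Linux's FUT_OFF_MMSHARED distinction.
//
//	k1 := Key{Kind: KindPrivate, Offset: 0x7f0000001000}
//	k2 := Key{Kind: KindPrivate, Offset: 0x7f0000001000}
//	k3 := Key{Kind: KindSharedPrivate, Offset: 0x7f0000001000}
//	k1.matches(&k2) // true: same Kind, Mappable (nil), and Offset.
//	k1.matches(&k3) // false: Kinds differ.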
// Target abstracts memory accesses and keys.
type Target interface {
	context.Context

	// SwapUint32 gives access to hostarch.IO.SwapUint32.
	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)

	// CompareAndSwapUint32 gives access to hostarch.IO.CompareAndSwapUint32.
	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)

	// LoadUint32 gives access to hostarch.IO.LoadUint32.
	LoadUint32(addr hostarch.Addr) (uint32, error)

	// GetSharedKey returns a Key with kind KindSharedPrivate or
	// KindSharedMappable corresponding to the memory mapped at address addr.
	//
	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
	// reference is held on the MappingIdentity, which must be dropped by the
	// caller when the Key is no longer in use.
	GetSharedKey(addr hostarch.Addr) (Key, error)
}

// check performs a basic equality check on the given address.
func check(t Target, addr hostarch.Addr, val uint32) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}
	if cur != val {
		return linuxerr.EAGAIN
	}
	return nil
}

// atomicOp performs the FUTEX_WAKE_OP operation encoded in opIn on the given
// address and returns the result of the encoded comparison.
func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) {
	opType := (opIn >> 28) & 0xf
	cmp := (opIn >> 24) & 0xf
	opArg := (opIn >> 12) & 0xfff
	cmpArg := opIn & 0xfff

	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
		opArg = 1 << opArg
		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
	}

	var (
		oldVal uint32
		err    error
	)
	if opType == linux.FUTEX_OP_SET {
		oldVal, err = t.SwapUint32(addr, opArg)
		if err != nil {
			return false, err
		}
	} else {
		for {
			oldVal, err = t.LoadUint32(addr)
			if err != nil {
				return false, err
			}
			var newVal uint32
			switch opType {
			case linux.FUTEX_OP_ADD:
				newVal = oldVal + opArg
			case linux.FUTEX_OP_OR:
				newVal = oldVal | opArg
			case linux.FUTEX_OP_ANDN:
				newVal = oldVal &^ opArg
			case linux.FUTEX_OP_XOR:
				newVal = oldVal ^ opArg
			default:
				return false, linuxerr.ENOSYS
			}
			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
			if err != nil {
				return false, err
			}
			if prev == oldVal {
				break // Success.
			}
		}
	}

	switch cmp {
	case linux.FUTEX_OP_CMP_EQ:
		return oldVal == cmpArg, nil
	case linux.FUTEX_OP_CMP_NE:
		return oldVal != cmpArg, nil
	case linux.FUTEX_OP_CMP_LT:
		return oldVal < cmpArg, nil
	case linux.FUTEX_OP_CMP_LE:
		return oldVal <= cmpArg, nil
	case linux.FUTEX_OP_CMP_GT:
		return oldVal > cmpArg, nil
	case linux.FUTEX_OP_CMP_GE:
		return oldVal >= cmpArg, nil
	default:
		return false, linuxerr.ENOSYS
	}
}
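// For example, the operation word that glibc would build with
// FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0) decodes as follows
// (a worked sketch using the decoding above):
//
//	op := uint32(linux.FUTEX_OP_ADD)<<28 | uint32(linux.FUTEX_OP_CMP_GT)<<24 |
//		1<<12 | 0 // 0x14001000
//
// atomicOp then atomically adds 1 to *addr and reports whether the old value
// was greater than 0.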
// Waiter is the struct which gets enqueued into buckets for wake up routines
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
// WaitPrepare(), callers may listen on C for wake up events.
type Waiter struct {
	// Synchronization:
	//
	// - A Waiter that is not enqueued in a bucket is exclusively owned (no
	// synchronization applies).
	//
	// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
	// waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
	// lock") of the containing bucket, and bitmask is immutable. Note that
	// since bucket is mutated using atomic memory operations, bucket.Load()
	// may be called without holding the bucket lock, although it may change
	// racily. See WaitComplete().
	//
	// - A Waiter is only guaranteed to be no longer queued after calling
	// WaitComplete().

	// waiterEntry links Waiter into bucket.waiters.
	waiterEntry

	// bucket is the bucket this waiter is queued in. If bucket is nil, the
	// waiter is not waiting and is not in any bucket.
	bucket AtomicPtrBucket

	// C is sent to when the Waiter is woken.
	C chan struct{}

	// key is what this waiter is waiting on.
	key Key

	// The bitmask we're waiting on.
	// This is used in the case of a FUTEX_WAKE_BITSET.
	bitmask uint32

	// tid is the thread ID for the waiter in case this is a PI mutex.
	tid uint32
}

// NewWaiter returns a new unqueued Waiter.
func NewWaiter() *Waiter {
	return &Waiter{
		C: make(chan struct{}, 1),
	}
}

// woken returns true if w has been woken since the last call to WaitPrepare.
func (w *Waiter) woken() bool {
	return len(w.C) != 0
}

// bucket holds a list of waiters for a given address hash.
//
// +stateify savable
type bucket struct {
	// mu protects waiters and contained Waiter state. See comment in Waiter.
	mu futexBucketMutex `state:"nosave"`

	waiters waiterList `state:"zerovalue"`
}

// wakeLocked wakes up to n waiters matching the bitmask at the addr for this
// bucket and returns the number of waiters woken.
//
// Preconditions: b.mu must be locked.
func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
	done := 0
	for w := b.waiters.Front(); done < n && w != nil; {
		if !w.key.matches(key) || w.bitmask&bitmask == 0 {
			// Not matching.
			w = w.Next()
			continue
		}

		// Remove from the bucket and wake the waiter.
		woke := w
		w = w.Next() // Next iteration.
		b.wakeWaiterLocked(woke)
		done++
	}
	return done
}

func (b *bucket) wakeWaiterLocked(w *Waiter) {
	// Remove from the bucket and wake the waiter.
	b.waiters.Remove(w)
	w.C <- struct{}{}

	// NOTE: The above channel write establishes a write barrier according
	// to the memory model, so nothing may be ordered around it. Since
	// we've dequeued w and will never touch it again, we can safely
	// store nil to w.bucket here and allow WaitComplete() to
	// short-circuit grabbing the bucket lock. If WaitComplete() somehow
	// misses the store, we are still holding the lock, so we know it
	// cannot dequeue w, assume w is free, and act on it before the store
	// below completes.
	w.bucket.Store(nil)
}

// requeueLocked takes up to n waiters matching key from this bucket and moves
// them to nkey on the bucket "to".
//
// Preconditions: b and to must be locked.
func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
	done := 0
	for w := b.waiters.Front(); done < n && w != nil; {
		if !w.key.matches(key) {
			// Not matching.
			w = w.Next()
			continue
		}

		requeued := w
		w = w.Next() // Next iteration.
		b.waiters.Remove(requeued)
		requeued.key.release(t)
		requeued.key = nkey.clone()
		to.waiters.PushBack(requeued)
		requeued.bucket.Store(to)
		done++
	}
	return done
}
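// For illustration: plain FUTEX_WAKE passes bitmask ^uint32(0) and therefore
// matches any waiter on the key, while FUTEX_WAKE_BITSET wakes only waiters
// whose registered bitmask overlaps. A minimal sketch against a locked
// bucket b holding one waiter with bitmask 0x1 on key k:
//
//	b.wakeLocked(&k, 0x2, 1) // Wakes nothing: 0x1&0x2 == 0.
//	b.wakeLocked(&k, 0x3, 1) // Wakes the waiter: 0x1&0x3 != 0.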
const (
	// bucketCount is the number of buckets per Manager. By having many of
	// these we reduce contention when concurrent yet unrelated calls are made.
	bucketCount     = 1 << bucketCountBits
	bucketCountBits = 10
)

// getKey returns a Key representing address addr in t.
func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) {
	// Ensure the address is aligned: it must be on a 4-byte boundary.
	if addr&0x3 != 0 {
		return Key{}, linuxerr.EINVAL
	}
	if private {
		return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
	}
	return t.GetSharedKey(addr)
}

// bucketIndexForAddr returns the index into Manager.buckets for addr.
func bucketIndexForAddr(addr hostarch.Addr) uintptr {
	// - The bottom 2 bits of addr must be 0, per getKey.
	//
	// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
	// for a canonical address, and (on all existing platforms) bit 47 must be
	// 0 for an application address.
	//
	// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
	// bits. We choose one of the simplest possible hash functions that at
	// least uses all 45 useful bits in the output, given that bucketCountBits
	// == 10. This hash function also has the property that it will usually map
	// adjacent addresses to adjacent buckets, slightly improving memory
	// locality when an application synchronization structure uses multiple
	// nearby futexes.
	//
	// Note that despite the large number of arithmetic operations in the
	// function, many components can be computed in parallel, such that the
	// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
	// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
	// (addr >> 42)" without any additional grouping, the compiler puts all 4
	// additions in the critical path.
	h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
	h2 := uintptr(addr>>32) + uintptr(addr>>42)
	return (h1 + h2) % bucketCount
}

// Manager holds futex state for a single virtual address space.
//
// +stateify savable
type Manager struct {
	// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
	// futexes.
	privateBuckets [bucketCount]bucket `state:"zerovalue"`

	// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
	// may be shared by multiple Managers. The sharedBucket pointer is
	// immutable.
	sharedBucket *bucket
}

// NewManager returns an initialized futex manager.
func NewManager() *Manager {
	return &Manager{
		sharedBucket: &bucket{},
	}
}

// Fork returns a new Manager. Shared futex clients using the returned Manager
// may interoperate with those using m.
func (m *Manager) Fork() *Manager {
	return &Manager{
		sharedBucket: m.sharedBucket,
	}
}

// lockBucket returns a locked bucket for the given key.
// +checklocksacquire:b.mu
func (m *Manager) lockBucket(k *Key) (b *bucket) {
	if k.Kind == KindSharedMappable {
		b = m.sharedBucket
	} else {
		b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
	}
	b.mu.Lock()
	return b
}
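// As a worked example of the hash above: a private futex at address 0x1000
// hashes to (0x1000>>2 + 0x1000>>12 + 0) + (0 + 0) = 0x401, i.e. bucket 1
// (mod 1024), and the adjacent futex at 0x1004 lands in bucket 2, so
// lockBucket takes neighboring locks for neighboring futexes. Any
// KindSharedMappable key instead locks the single sharedBucket, regardless
// of its offset.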
// lockBuckets returns locked buckets for the given keys, along with which
// bucket was locked first and which second. Either may be nil if the buckets
// are identical or did not need locking.
//
// +checklocksacquire:lockedFirst.mu
// +checklocksacquire:lockedSecond.mu
func (m *Manager) lockBuckets(k1, k2 *Key) (b1, b2, lockedFirst, lockedSecond *bucket) {
	// Buckets must be consistently ordered to avoid circular lock
	// dependencies. We order buckets in m.privateBuckets by index (lowest
	// index first), and all buckets in m.privateBuckets precede
	// m.sharedBucket.

	// Handle the common case first:
	if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
		i1 := bucketIndexForAddr(k1.addr())
		i2 := bucketIndexForAddr(k2.addr())
		b1 = &m.privateBuckets[i1]
		b2 = &m.privateBuckets[i2]
		switch {
		case i1 < i2:
			b1.mu.Lock()
			b2.mu.NestedLock(futexBucketLockB)
			return b1, b2, b1, b2
		case i2 < i1:
			b2.mu.Lock()
			b1.mu.NestedLock(futexBucketLockB)
			return b1, b2, b2, b1
		default:
			b1.mu.Lock()
			return b1, b2, b1, nil // +checklocksforce
		}
	}

	// At least one of b1 or b2 should be m.sharedBucket.
	b1 = m.sharedBucket
	b2 = m.sharedBucket
	if k1.Kind != KindSharedMappable {
		b1 = m.lockBucket(k1)
		b2.mu.NestedLock(futexBucketLockB)
		return b1, b2, b1, b2
	}
	if k2.Kind != KindSharedMappable {
		b2 = m.lockBucket(k2)
		b1.mu.NestedLock(futexBucketLockB)
		return b1, b2, b2, b1
	}
	return b1, b2, nil, nil // +checklocksforce
}

// unlockBuckets unlocks two buckets.
// +checklocksrelease:lockedFirst.mu
// +checklocksrelease:lockedSecond.mu
func (m *Manager) unlockBuckets(lockedFirst, lockedSecond *bucket) {
	if lockedSecond != nil {
		lockedSecond.mu.NestedUnlock(futexBucketLockB)
	}
	if lockedFirst != nil && lockedFirst != lockedSecond {
		lockedFirst.mu.Unlock()
	}
}

// Wake wakes up to n waiters matching the bitmask on the given addr.
// The number of waiters woken is returned.
func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) {
	// This function is very hot; avoid defer.
	k, err := getKey(t, addr, private)
	if err != nil {
		return 0, err
	}

	b := m.lockBucket(&k)
	r := b.wakeLocked(&k, bitmask, n)

	b.mu.Unlock()
	k.release(t)
	return r, nil
}

func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
	k1, err := getKey(t, addr, private)
	if err != nil {
		return 0, err
	}
	defer k1.release(t)
	k2, err := getKey(t, naddr, private)
	if err != nil {
		return 0, err
	}
	defer k2.release(t)

	b1, b2, lockedFirst, lockedSecond := m.lockBuckets(&k1, &k2)
	defer m.unlockBuckets(lockedFirst, lockedSecond)

	if checkval {
		if err := check(t, addr, val); err != nil {
			return 0, err
		}
	}

	// Wake the number required.
	done := b1.wakeLocked(&k1, ^uint32(0), nwake)

	// Requeue the number required.
	b1.requeueLocked(t, b2, &k1, &k2, nreq)

	return done, nil
}
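// doRequeue backs both Requeue and RequeueCmp below. A minimal sketch of the
// condition-variable broadcast pattern this enables (assumed names; cvAddr
// and mutexAddr are futex words in application memory, curVal a value the
// caller read from cvAddr):
//
//	// FUTEX_CMP_REQUEUE: wake one waiter on the condvar, move the rest to
//	// the mutex so they are woken one at a time as it is unlocked.
//	n, err := m.RequeueCmp(t, cvAddr, mutexAddr, true, curVal, 1, math.MaxInt32)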
// Requeue wakes up to nwake waiters on the given addr, and unconditionally
// requeues up to nreq waiters on naddr.
func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
}

// RequeueCmp atomically checks that the addr contains val (via the Target),
// wakes up to nwake waiters on addr, and then unconditionally requeues up to
// nreq waiters on naddr.
func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
}

// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
// waiters unconditionally from addr1, and, based on the original value at addr2
// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
// It returns the total number of waiters woken.
func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
	k1, err := getKey(t, addr1, private)
	if err != nil {
		return 0, err
	}
	defer k1.release(t)
	k2, err := getKey(t, addr2, private)
	if err != nil {
		return 0, err
	}
	defer k2.release(t)

	b1, b2, lockedFirst, lockedSecond := m.lockBuckets(&k1, &k2)
	defer m.unlockBuckets(lockedFirst, lockedSecond)

	done := 0
	cond, err := atomicOp(t, addr2, op)
	if err != nil {
		return 0, err
	}

	// Wake up to nwake1 entries from the first bucket.
	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)

	// Wake up to nwake2 entries from the second bucket if the operation
	// yielded true.
	if cond {
		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
	}

	return done, nil
}

// WaitPrepare atomically checks that addr contains val (via the Target), then
// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
// Waiter must be subsequently removed by calling WaitComplete, whether or not
// a wakeup is received on w.C.
func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.bitmask = bitmask

	b := m.lockBucket(&k)
	// This function is very hot; avoid defer.

	// Perform our atomic check.
	if err := check(t, addr, val); err != nil {
		b.mu.Unlock()
		w.key.release(t)
		return err
	}

	// Add the waiter to the bucket.
	b.waiters.PushBack(w)
	w.bucket.Store(b)

	b.mu.Unlock()
	return nil
}
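// A minimal sketch of the FUTEX_WAIT flow built on WaitPrepare/WaitComplete
// (assumed names; timeout and interruption handling omitted):
//
//	w := NewWaiter()
//	if err := m.WaitPrepare(w, t, addr, true /* private */, val, ^uint32(0)); err != nil {
//		return err // EAGAIN if *addr != val.
//	}
//	<-w.C                // Block until woken by Wake, WakeOp, etc.
//	m.WaitComplete(w, t) // Always required after a successful WaitPrepare.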
// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
func (m *Manager) WaitComplete(w *Waiter, t Target) {
	// Remove w from the bucket it's in.
	for {
		b := w.bucket.Load()

		// If b is nil, the waiter isn't in any bucket anymore. This can't be
		// racy because the waiter can't be concurrently re-queued in another
		// bucket.
		if b == nil {
			break
		}

		// Take the bucket lock. Note that without holding the bucket lock,
		// the waiter is not guaranteed to stay in that bucket, so after we
		// take the bucket lock, we must ensure that the bucket hasn't
		// changed: if it happens to have changed, we release the old bucket
		// lock and try again with the new bucket; if it hasn't changed, we
		// know it won't change now because we hold the lock.
		b.mu.Lock()
		if b != w.bucket.Load() {
			b.mu.Unlock()
			continue
		}

		// Remove waiter from bucket.
		b.waiters.Remove(w)
		w.bucket.Store(nil)
		b.mu.Unlock()
		break
	}

	// Release references held by the waiter.
	w.key.release(t)
}

// LockPI attempts to lock the futex following the priority-inheritance futex
// rules. The lock is acquired only when addr points to 0. The TID of the
// calling task is written to addr to indicate that the futex is owned. It
// returns true if the futex was successfully acquired.
//
// FUTEX_OWNER_DIED is only set by Linux when robust lists are in use (see
// exit_robust_list()). Since we don't support robust lists, it is never set,
// although it is handled below.
func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) {
	k, err := getKey(t, addr, private)
	if err != nil {
		return false, err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.tid = tid

	b := m.lockBucket(&k)
	// Hot function: avoid defers.

	success, err := m.lockPILocked(w, t, addr, tid, b, try)
	if err != nil {
		w.key.release(t)
		b.mu.Unlock()
		return false, err
	}
	if success || try {
		// Release waiter if it's not going to be a wait.
		w.key.release(t)
	}
	b.mu.Unlock()
	return success, nil
}

func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) {
	for {
		cur, err := t.LoadUint32(addr)
		if err != nil {
			return false, err
		}
		if (cur & linux.FUTEX_TID_MASK) == tid {
			return false, linuxerr.EDEADLK
		}

		if (cur & linux.FUTEX_TID_MASK) == 0 {
			// No owner and no waiters, try to acquire the futex.

			// Set TID and preserve owner died status.
			val := tid
			val |= cur & linux.FUTEX_OWNER_DIED
			prev, err := t.CompareAndSwapUint32(addr, cur, val)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				// Linux reacquires the bucket lock on retries, which will
				// re-lookup the mapping at the futex address. However,
				// retrying while holding the lock is more efficient and
				// reduces the chance of another conflict.
				continue
			}
			// Futex acquired.
			return true, nil
		}

		// Futex is already owned, prepare to wait.

		if try {
			// Caller doesn't want to wait.
			return false, nil
		}

		// Set waiters bit if not set yet.
		if cur&linux.FUTEX_WAITERS == 0 {
			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				continue
			}
		}

		// Add the waiter to the bucket.
		b.waiters.PushBack(w)
		w.bucket.Store(b)
		return false, nil
	}
}
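// A minimal sketch of the FUTEX_LOCK_PI slow path built on LockPI (assumed
// names; timeout handling omitted):
//
//	w := NewWaiter()
//	locked, err := m.LockPI(w, t, addr, tid, true /* private */, false /* try */)
//	if err != nil {
//		return err
//	}
//	if !locked {
//		<-w.C                // Woken by UnlockPI once we become the owner.
//		m.WaitComplete(w, t) // Dequeue (if needed) and drop key references.
//	}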
// UnlockPI unlocks the futex following the priority-inheritance futex rules.
// The address provided must contain the caller's TID. If there are waiters,
// the TID of the next waiter (FIFO order) is written to the address and that
// waiter is woken. If there are no waiters, 0 is written to the address.
func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	b := m.lockBucket(&k)

	err = m.unlockPILocked(t, addr, tid, b, &k)

	k.release(t)
	b.mu.Unlock()
	return err
}

func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}

	if (cur & linux.FUTEX_TID_MASK) != tid {
		return linuxerr.EPERM
	}

	var next *Waiter  // Who's the next owner?
	var next2 *Waiter // Who's the one after that?
	for w := b.waiters.Front(); w != nil; w = w.Next() {
		if !w.key.matches(key) {
			continue
		}

		if next == nil {
			next = w
		} else {
			next2 = w
			break
		}
	}

	if next == nil {
		// It's safe to set 0 because there are no waiters, no new owner, and
		// the executing task is the current owner (no owner died bit).
		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
		if err != nil {
			return err
		}
		if prev != cur {
			// Let user mode handle CAS races. This is different from lock,
			// which retries when CAS fails.
			return linuxerr.EAGAIN
		}
		return nil
	}

	// Set the next owner's TID, and the waiters bit if any more waiters
	// remain. This resets the owner died bit, if set, because the executing
	// task takes over as the owner.
	val := next.tid
	if next2 != nil {
		val |= linux.FUTEX_WAITERS
	}

	prev, err := t.CompareAndSwapUint32(addr, cur, val)
	if err != nil {
		return err
	}
	if prev != cur {
		return linuxerr.EINVAL
	}

	b.wakeWaiterLocked(next)
	return nil
}
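// For illustration, with two waiters queued on a PI futex owned by TID 42,
// UnlockPI transitions the futex word from (FUTEX_WAITERS | 42) to
// (FUTEX_WAITERS | nextTID), and with a single waiter to just nextTID; the
// new owner is then woken via its channel by wakeWaiterLocked.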