github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/futex/futex.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package futex provides an implementation of the futex interface as found in
// the Linux kernel. It allows one to easily transform Wait() calls into waits
// on a channel, which is useful in a Go-based kernel, for example.
package futex

import (
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// KeyKind indicates the type of a Key.
type KeyKind int

const (
	// KindPrivate indicates a private futex (a futex syscall with the
	// FUTEX_PRIVATE_FLAG set).
	KindPrivate KeyKind = iota

	// KindSharedPrivate indicates a shared futex on a private memory mapping.
	// Although KindPrivate and KindSharedPrivate futexes both use memory
	// addresses to identify futexes, they do not interoperate (in Linux, the
	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
	// comparison).
	KindSharedPrivate

	// KindSharedMappable indicates a shared futex on a memory mapping other
	// than a private anonymous memory mapping.
	KindSharedMappable
)

// Key represents something that a futex waiter may wait on.
type Key struct {
	// Kind is the type of the Key.
	Kind KeyKind

	// Mappable is the memory-mapped object that is represented by the Key.
	// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
	// even if it is.
	Mappable memmap.Mappable

	// MappingIdentity is the MappingIdentity associated with Mappable.
	// MappingIdentity is always nil if Mappable is nil, and may be nil even if
	// it isn't.
	MappingIdentity memmap.MappingIdentity

	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
	// memory address. Otherwise, Offset is the represented offset into
	// Mappable.
	Offset uint64
}

func (k *Key) release(t Target) {
	if k.MappingIdentity != nil {
		k.MappingIdentity.DecRef(t)
	}
	k.Mappable = nil
	k.MappingIdentity = nil
}

func (k *Key) clone() Key {
	if k.MappingIdentity != nil {
		k.MappingIdentity.IncRef()
	}
	return *k
}

// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
func (k *Key) addr() hostarch.Addr {
	return hostarch.Addr(k.Offset)
}

// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
func (k *Key) matches(k2 *Key) bool {
	// k.MappingIdentity is ignored; it's only used for reference counting.
	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
}

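// privateKeysMatchExample is an illustrative sketch, not part of the upstream
// file: it shows the matches() rule for two private futexes. Private keys
// carry the address in Offset, so equal addresses match; a KindSharedPrivate
// key at the same address would not match, since Kind participates in the
// comparison. The address below is hypothetical.
func privateKeysMatchExample() bool {
	k1 := Key{Kind: KindPrivate, Offset: 0x7f0000001000}
	k2 := Key{Kind: KindPrivate, Offset: 0x7f0000001000}
	return k1.matches(&k2) // true
}
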
// Target abstracts memory accesses and keys.
type Target interface {
	context.Context

	// SwapUint32 gives access to hostarch.IO.SwapUint32.
	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)

	// CompareAndSwapUint32 gives access to hostarch.IO.CompareAndSwapUint32.
	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)

	// LoadUint32 gives access to hostarch.IO.LoadUint32.
	LoadUint32(addr hostarch.Addr) (uint32, error)

	// GetSharedKey returns a Key with kind KindSharedPrivate or
	// KindSharedMappable corresponding to the memory mapped at address addr.
	//
	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
	// reference is held on the MappingIdentity, which must be dropped by the
	// caller when the Key is no longer in use.
	GetSharedKey(addr hostarch.Addr) (Key, error)
}

// check performs a basic equality check on the given address.
func check(t Target, addr hostarch.Addr, val uint32) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}
	if cur != val {
		return linuxerr.EAGAIN
	}
	return nil
}

// atomicOp performs the FUTEX_WAKE_OP operation encoded in opIn on the given
// address, and returns the result of the encoded comparison against the old
// value.
func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) {
	opType := (opIn >> 28) & 0xf
	cmp := (opIn >> 24) & 0xf
	opArg := (opIn >> 12) & 0xfff
	cmpArg := opIn & 0xfff

	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
		opArg = 1 << opArg
		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
	}

	var (
		oldVal uint32
		err    error
	)
	if opType == linux.FUTEX_OP_SET {
		oldVal, err = t.SwapUint32(addr, opArg)
		if err != nil {
			return false, err
		}
	} else {
		for {
			oldVal, err = t.LoadUint32(addr)
			if err != nil {
				return false, err
			}
			var newVal uint32
			switch opType {
			case linux.FUTEX_OP_ADD:
				newVal = oldVal + opArg
			case linux.FUTEX_OP_OR:
				newVal = oldVal | opArg
			case linux.FUTEX_OP_ANDN:
				newVal = oldVal &^ opArg
			case linux.FUTEX_OP_XOR:
				newVal = oldVal ^ opArg
			default:
				return false, syserror.ENOSYS
			}
			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
			if err != nil {
				return false, err
			}
			if prev == oldVal {
				break // Success.
			}
		}
	}

	switch cmp {
	case linux.FUTEX_OP_CMP_EQ:
		return oldVal == cmpArg, nil
	case linux.FUTEX_OP_CMP_NE:
		return oldVal != cmpArg, nil
	case linux.FUTEX_OP_CMP_LT:
		return oldVal < cmpArg, nil
	case linux.FUTEX_OP_CMP_LE:
		return oldVal <= cmpArg, nil
	case linux.FUTEX_OP_CMP_GT:
		return oldVal > cmpArg, nil
	case linux.FUTEX_OP_CMP_GE:
		return oldVal >= cmpArg, nil
	default:
		return false, syserror.ENOSYS
	}
}

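// encodeWakeOpWord is an illustrative sketch, not part of the upstream file:
// it assembles the op word that atomicOp above decodes. The field layout
// mirrors atomicOp's shifts exactly: 4 bits of op type, 4 bits of comparison,
// then two 12-bit arguments. A word built this way is what userspace passes
// to FUTEX_WAKE_OP and what eventually reaches Manager.WakeOp below, e.g.
// encodeWakeOpWord(linux.FUTEX_OP_SET, linux.FUTEX_OP_CMP_EQ, 1, 0).
func encodeWakeOpWord(opType, cmp, opArg, cmpArg uint32) uint32 {
	return (opType&0xf)<<28 | (cmp&0xf)<<24 | (opArg&0xfff)<<12 | cmpArg&0xfff
}
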
// Waiter is the struct which gets enqueued into buckets for wake up routines
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
// WaitPrepare(), callers may listen on C for wake up events.
type Waiter struct {
	// Synchronization:
	//
	// - A Waiter that is not enqueued in a bucket is exclusively owned (no
	// synchronization applies).
	//
	// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
	// waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
	// lock") of the containing bucket, and bitmask is immutable. Note that
	// since bucket is mutated using atomic memory operations, bucket.Load()
	// may be called without holding the bucket lock, although it may change
	// racily. See WaitComplete().
	//
	// - A Waiter is only guaranteed to be no longer queued after calling
	// WaitComplete().

	// waiterEntry links Waiter into bucket.waiters.
	waiterEntry

	// bucket is the bucket this waiter is queued in. If bucket is nil, the
	// waiter is not waiting and is not in any bucket.
	bucket AtomicPtrBucket

	// C is sent to when the Waiter is woken.
	C chan struct{}

	// key is what this waiter is waiting on.
	key Key

	// bitmask is the bitmask we're waiting on.
	// This is used in the case of a FUTEX_WAKE_BITSET.
	bitmask uint32

	// tid is the thread ID for the waiter in case this is a PI mutex.
	tid uint32
}

// NewWaiter returns a new unqueued Waiter.
func NewWaiter() *Waiter {
	return &Waiter{
		C: make(chan struct{}, 1),
	}
}

// woken returns true if w has been woken since the last call to WaitPrepare.
func (w *Waiter) woken() bool {
	return len(w.C) != 0
}

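// wakerNeverBlocksExample is an illustrative sketch, not part of the upstream
// file: C is buffered with capacity 1, so a waker's send can never block
// while holding the bucket lock, and woken() simply checks for a pending
// event without consuming it.
func wakerNeverBlocksExample() bool {
	w := NewWaiter()
	w.C <- struct{}{} // What wakeWaiterLocked does; cannot block.
	return w.woken()  // true: one event is buffered.
}
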
// bucket holds a list of waiters for a given address hash.
//
// +stateify savable
type bucket struct {
	// mu protects waiters and contained Waiter state. See comment in Waiter.
	mu sync.Mutex `state:"nosave"`

	waiters waiterList `state:"zerovalue"`
}

// wakeLocked wakes up to n waiters matching the bitmask at the addr for this
// bucket and returns the number of waiters woken.
//
// Preconditions: b.mu must be locked.
func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
	done := 0
	for w := b.waiters.Front(); done < n && w != nil; {
		if !w.key.matches(key) || w.bitmask&bitmask == 0 {
			// Not matching.
			w = w.Next()
			continue
		}

		// Remove from the bucket and wake the waiter.
		woke := w
		w = w.Next() // Next iteration.
		b.wakeWaiterLocked(woke)
		done++
	}
	return done
}

func (b *bucket) wakeWaiterLocked(w *Waiter) {
	// Remove from the bucket and wake the waiter.
	b.waiters.Remove(w)
	w.C <- struct{}{}

	// NOTE: The above channel write establishes a write barrier according
	// to the memory model, so nothing may be ordered around it. Since
	// we've dequeued w and will never touch it again, we can safely
	// store nil to w.bucket here and allow WaitComplete() to
	// short-circuit grabbing the bucket lock. If WaitComplete() somehow
	// misses the store, we are still holding the lock, so it cannot
	// dequeue w or assume w is free before this store becomes visible.
	w.bucket.Store(nil)
}

// requeueLocked takes up to n waiters matching key from this bucket, re-keys
// them to nkey, and moves them to the bucket "to".
//
// Preconditions: b and to must be locked.
func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
	done := 0
	for w := b.waiters.Front(); done < n && w != nil; {
		if !w.key.matches(key) {
			// Not matching.
			w = w.Next()
			continue
		}

		requeued := w
		w = w.Next() // Next iteration.
		b.waiters.Remove(requeued)
		requeued.key.release(t)
		requeued.key = nkey.clone()
		to.waiters.PushBack(requeued)
		requeued.bucket.Store(to)
		done++
	}
	return done
}

const (
	// bucketCount is the number of buckets per Manager. By having many of
	// these we reduce contention when concurrent yet unrelated calls are made.
	bucketCount     = 1 << bucketCountBits
	bucketCountBits = 10
)

// getKey returns a Key representing address addr in t.
func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) {
	// Ensure the address is aligned.
	// It must be a DWORD boundary.
	if addr&0x3 != 0 {
		return Key{}, linuxerr.EINVAL
	}
	if private {
		return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
	}
	return t.GetSharedKey(addr)
}

// bucketIndexForAddr returns the index into Manager.buckets for addr.
func bucketIndexForAddr(addr hostarch.Addr) uintptr {
	// - The bottom 2 bits of addr must be 0, per getKey.
	//
	// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
	// for a canonical address, and (on all existing platforms) bit 47 must be
	// 0 for an application address.
	//
	// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
	// bits. We choose one of the simplest possible hash functions that at
	// least uses all 45 useful bits in the output, given that bucketCountBits
	// == 10. This hash function also has the property that it will usually map
	// adjacent addresses to adjacent buckets, slightly improving memory
	// locality when an application synchronization structure uses multiple
	// nearby futexes.
	//
	// Note that despite the large number of arithmetic operations in the
	// function, many components can be computed in parallel, such that the
	// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
	// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
	// (addr >> 42)" without any additional grouping, the compiler puts all 4
	// additions in the critical path.
	h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
	h2 := uintptr(addr>>32) + uintptr(addr>>42)
	return (h1 + h2) % bucketCount
}

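// adjacentBucketExample is an illustrative sketch, not part of the upstream
// file: it demonstrates the locality property described above. Two futex
// words 4 bytes apart land in adjacent buckets; for these hypothetical
// addresses the indices work out to 1 and 2.
func adjacentBucketExample() (uintptr, uintptr) {
	return bucketIndexForAddr(0x1000), bucketIndexForAddr(0x1004)
}
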
// Manager holds futex state for a single virtual address space.
//
// +stateify savable
type Manager struct {
	// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
	// futexes.
	privateBuckets [bucketCount]bucket `state:"zerovalue"`

	// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
	// may be shared by multiple Managers. The sharedBucket pointer is
	// immutable.
	sharedBucket *bucket
}

// NewManager returns an initialized futex manager.
func NewManager() *Manager {
	return &Manager{
		sharedBucket: &bucket{},
	}
}

// Fork returns a new Manager. Shared futex clients using the returned Manager
// may interoperate with those using m.
func (m *Manager) Fork() *Manager {
	return &Manager{
		sharedBucket: m.sharedBucket,
	}
}

// lockBucket returns a locked bucket for the given key.
// +checklocksacquire:b.mu
func (m *Manager) lockBucket(k *Key) (b *bucket) {
	if k.Kind == KindSharedMappable {
		b = m.sharedBucket
	} else {
		b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
	}
	b.mu.Lock()
	return b
}

// lockBuckets returns locked buckets for the given keys.
// +checklocksacquire:b1.mu
// +checklocksacquire:b2.mu
func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) {
	// Buckets must be consistently ordered to avoid circular lock
	// dependencies. We order buckets in m.privateBuckets by index (lowest
	// index first), and all buckets in m.privateBuckets precede
	// m.sharedBucket.

	// Handle the common case first:
	if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
		i1 := bucketIndexForAddr(k1.addr())
		i2 := bucketIndexForAddr(k2.addr())
		b1 = &m.privateBuckets[i1]
		b2 = &m.privateBuckets[i2]
		switch {
		case i1 < i2:
			b1.mu.Lock()
			b2.mu.Lock()
		case i2 < i1:
			b2.mu.Lock()
			b1.mu.Lock()
		default:
			b1.mu.Lock()
		}
		return b1, b2 // +checklocksforce
	}

	// At least one of b1 or b2 will be m.sharedBucket.
	b1 = m.sharedBucket
	b2 = m.sharedBucket
	if k1.Kind != KindSharedMappable {
		b1 = m.lockBucket(k1)
	} else if k2.Kind != KindSharedMappable {
		b2 = m.lockBucket(k2)
	}
	m.sharedBucket.mu.Lock()
	return b1, b2 // +checklocksforce
}

// unlockBuckets unlocks two buckets.
// +checklocksrelease:b1.mu
// +checklocksrelease:b2.mu
func (m *Manager) unlockBuckets(b1, b2 *bucket) {
	b1.mu.Unlock()
	if b1 != b2 {
		b2.mu.Unlock()
	}
	return // +checklocksforce
}

// Wake wakes up to n waiters matching the bitmask on the given addr.
// The number of waiters woken is returned.
func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) {
	// This function is very hot; avoid defer.
	k, err := getKey(t, addr, private)
	if err != nil {
		return 0, err
	}

	b := m.lockBucket(&k)
	r := b.wakeLocked(&k, bitmask, n)

	b.mu.Unlock()
	k.release(t)
	return r, nil
}

func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
	k1, err := getKey(t, addr, private)
	if err != nil {
		return 0, err
	}
	defer k1.release(t)
	k2, err := getKey(t, naddr, private)
	if err != nil {
		return 0, err
	}
	defer k2.release(t)

	b1, b2 := m.lockBuckets(&k1, &k2)
	defer m.unlockBuckets(b1, b2)

	if checkval {
		if err := check(t, addr, val); err != nil {
			return 0, err
		}
	}

	// Wake the number required.
	done := b1.wakeLocked(&k1, ^uint32(0), nwake)

	// Requeue the number required.
	b1.requeueLocked(t, b2, &k1, &k2, nreq)

	return done, nil
}

// Requeue wakes up to nwake waiters on the given addr, and unconditionally
// requeues up to nreq waiters on naddr.
func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
}

// RequeueCmp atomically checks that the addr contains val (via the Target),
// wakes up to nwake waiters on addr, and then requeues up to nreq waiters on
// naddr.
func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
}

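// broadcastExample is an illustrative sketch, not part of the upstream file:
// it shows the classic condition-variable broadcast pattern that RequeueCmp
// enables. Wake one waiter on the condition word and shift the rest onto the
// mutex word, so they queue behind the lock instead of all waking at once
// (the "thundering herd" FUTEX_CMP_REQUEUE avoids). Addresses are
// hypothetical.
func broadcastExample(m *Manager, t Target, condAddr, mutexAddr hostarch.Addr, condVal uint32) (int, error) {
	// Wake 1 waiter on condAddr if *condAddr still contains condVal;
	// requeue as many of the rest as possible onto mutexAddr.
	const nwake, nreq = 1, 1<<31 - 1
	return m.RequeueCmp(t, condAddr, mutexAddr, true /* private */, condVal, nwake, nreq)
}
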
// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
// waiters unconditionally from addr1, and, based on the original value at addr2
// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
// It returns the total number of waiters woken.
func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
	k1, err := getKey(t, addr1, private)
	if err != nil {
		return 0, err
	}
	defer k1.release(t)
	k2, err := getKey(t, addr2, private)
	if err != nil {
		return 0, err
	}
	defer k2.release(t)

	b1, b2 := m.lockBuckets(&k1, &k2)
	defer m.unlockBuckets(b1, b2)

	done := 0
	cond, err := atomicOp(t, addr2, op)
	if err != nil {
		return 0, err
	}

	// Wake up to nwake1 entries from the first bucket.
	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)

	// Wake up to nwake2 entries from the second bucket if the
	// operation yielded true.
	if cond {
		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
	}

	return done, nil
}

// WaitPrepare atomically checks that addr contains val (via the Target), then
// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
// Waiter must be subsequently removed by calling WaitComplete, whether or not
// a wakeup is received on w.C.
func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.bitmask = bitmask

	b := m.lockBucket(&k)
	// This function is very hot; avoid defer.

	// Perform our atomic check.
	if err := check(t, addr, val); err != nil {
		b.mu.Unlock()
		w.key.release(t)
		return err
	}

	// Add the waiter to the bucket.
	b.waiters.PushBack(w)
	w.bucket.Store(b)

	b.mu.Unlock()
	return nil
}

// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
func (m *Manager) WaitComplete(w *Waiter, t Target) {
	// Remove w from the bucket it's in.
	for {
		b := w.bucket.Load()

		// If b is nil, the waiter isn't in any bucket anymore. This can't be
		// racy because the waiter can't be concurrently re-queued in another
		// bucket.
		if b == nil {
			break
		}

		// Take the bucket lock. Note that without holding the bucket lock, the
		// waiter is not guaranteed to stay in that bucket, so after we take
		// the bucket lock, we must ensure that the bucket hasn't changed: if
		// it happens to have changed, we release the old bucket lock and try
		// again with the new bucket; if it hasn't changed, we know it won't
		// change now because we hold the lock.
		b.mu.Lock()
		if b != w.bucket.Load() {
			b.mu.Unlock()
			continue
		}

		// Remove waiter from bucket.
		b.waiters.Remove(w)
		w.bucket.Store(nil)
		b.mu.Unlock()
		break
	}

	// Release references held by the waiter.
	w.key.release(t)
}

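// waitExample is an illustrative sketch, not part of the upstream file: it
// walks the canonical FUTEX_WAIT lifecycle documented above. A real caller
// would select on w.C together with timeout and interrupt channels instead
// of blocking unconditionally.
func waitExample(m *Manager, t Target, addr hostarch.Addr, val uint32) error {
	w := NewWaiter()
	if err := m.WaitPrepare(w, t, addr, true /* private */, val, ^uint32(0)); err != nil {
		return err // Typically EAGAIN if *addr no longer contains val.
	}
	<-w.C                // Block until a Wake/WakeOp/requeue-target wake arrives.
	m.WaitComplete(w, t) // Required whether or not a wakeup was received.
	return nil
}
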
// LockPI attempts to lock the futex following the priority-inheritance futex
// rules. The lock is acquired only when 'addr' contains 0; on success, the
// TID of the calling task is written to 'addr' to indicate that the futex is
// owned. It returns true if the futex was successfully acquired.
//
// FUTEX_OWNER_DIED is only set by Linux when robust lists are in use (see
// exit_robust_list()). Since we don't support robust lists, the bit is
// handled below but never actually set.
func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) {
	k, err := getKey(t, addr, private)
	if err != nil {
		return false, err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.tid = tid

	b := m.lockBucket(&k)
	// Hot function: avoid defers.

	success, err := m.lockPILocked(w, t, addr, tid, b, try)
	if err != nil {
		w.key.release(t)
		b.mu.Unlock()
		return false, err
	}
	if success || try {
		// Release the waiter if it's not going to wait.
		w.key.release(t)
	}
	b.mu.Unlock()
	return success, nil
}

func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) {
	for {
		cur, err := t.LoadUint32(addr)
		if err != nil {
			return false, err
		}
		if (cur & linux.FUTEX_TID_MASK) == tid {
			return false, linuxerr.EDEADLK
		}

		if (cur & linux.FUTEX_TID_MASK) == 0 {
			// No owner and no waiters; try to acquire the futex.

			// Set TID and preserve the owner-died status.
			val := tid
			val |= cur & linux.FUTEX_OWNER_DIED
			prev, err := t.CompareAndSwapUint32(addr, cur, val)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed; retry...
				// Linux reacquires the bucket lock on retries, which will
				// re-lookup the mapping at the futex address. However,
				// retrying while holding the lock is more efficient and
				// reduces the chance of another conflict.
				continue
			}
			// Futex acquired.
			return true, nil
		}

		// The futex is already owned; prepare to wait.

		if try {
			// Caller doesn't want to wait.
			return false, nil
		}

		// Set the waiters bit if it isn't set yet.
		if cur&linux.FUTEX_WAITERS == 0 {
			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed; retry...
				continue
			}
		}

		// Add the waiter to the bucket.
		b.waiters.PushBack(w)
		w.bucket.Store(b)
		return false, nil
	}
}

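// piFastPathExample is an illustrative sketch, not part of the upstream file:
// it shows the userspace fast path the PI protocol above assumes. An
// uncontended lock is a single CAS of 0 -> TID on the futex word; only when
// that fails does userspace invoke FUTEX_LOCK_PI, which reaches LockPI.
func piFastPathExample(t Target, addr hostarch.Addr, tid uint32) (bool, error) {
	prev, err := t.CompareAndSwapUint32(addr, 0, tid)
	if err != nil {
		return false, err
	}
	return prev == 0, nil // true: acquired without the slow path.
}
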
// UnlockPI unlocks the futex following the priority-inheritance futex rules.
// The address provided must contain the caller's TID. If there are waiters,
// the TID of the next waiter (FIFO) is written to the address and that waiter
// is woken. If there are no waiters, 0 is written to the address.
func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	b := m.lockBucket(&k)

	err = m.unlockPILocked(t, addr, tid, b, &k)

	k.release(t)
	b.mu.Unlock()
	return err
}

func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}

	if (cur & linux.FUTEX_TID_MASK) != tid {
		return linuxerr.EPERM
	}

	var next *Waiter  // Who's the next owner?
	var next2 *Waiter // Who's the one after that?
	for w := b.waiters.Front(); w != nil; w = w.Next() {
		if !w.key.matches(key) {
			continue
		}

		if next == nil {
			next = w
		} else {
			next2 = w
			break
		}
	}

	if next == nil {
		// It's safe to set 0 because there are no waiters, no new owner, and
		// the executing task is the current owner (no owner-died bit).
		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
		if err != nil {
			return err
		}
		if prev != cur {
			// Let user mode handle CAS races. This is different from locking,
			// which retries when the CAS fails.
			return linuxerr.EAGAIN
		}
		return nil
	}

	// Set the next owner's TID, plus the waiters bit if there are more
	// waiters. This also resets the owner-died bit, if set, because the new
	// owner takes over the futex.
	val := next.tid
	if next2 != nil {
		val |= linux.FUTEX_WAITERS
	}

	prev, err := t.CompareAndSwapUint32(addr, cur, val)
	if err != nil {
		return err
	}
	if prev != cur {
		return linuxerr.EINVAL
	}

	b.wakeWaiterLocked(next)
	return nil
}

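// piRoundTripExample is an illustrative sketch, not part of the upstream
// file: it pairs LockPI with UnlockPI. Real callers block with deadlines and
// interruption rather than reading w.C directly.
func piRoundTripExample(m *Manager, t Target, addr hostarch.Addr, tid uint32) error {
	w := NewWaiter()
	locked, err := m.LockPI(w, t, addr, tid, true /* private */, false /* try */)
	if err != nil {
		return err
	}
	if !locked {
		// The futex is owned and w is queued; the owner's UnlockPI will
		// write our TID to the futex word and signal w.C.
		<-w.C
		m.WaitComplete(w, t)
	}
	// We now own the futex; release it.
	return m.UnlockPI(t, addr, tid, true /* private */)
}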