gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/futex/futex.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package futex provides an implementation of the futex interface as found in
    16  // the Linux kernel. It allows one to easily transform Wait() calls into waits
    17  // on a channel, which is useful in a Go-based kernel, for example.
    18  package futex
    19  
    20  import (
    21  	"gvisor.dev/gvisor/pkg/abi/linux"
    22  	"gvisor.dev/gvisor/pkg/context"
    23  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    24  	"gvisor.dev/gvisor/pkg/hostarch"
    25  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    26  )
    27  
// KeyKind indicates the type of a Key.
type KeyKind int

// The possible KeyKind values. KindPrivate is the zero value of KeyKind.
const (
	// KindPrivate indicates a private futex (a futex syscall with the
	// FUTEX_PRIVATE_FLAG set).
	KindPrivate KeyKind = iota

	// KindSharedPrivate indicates a shared futex on a private memory mapping.
	// Although KindPrivate and KindSharedPrivate futexes both use memory
	// addresses to identify futexes, they do not interoperate (in Linux, the
	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
	// comparison).
	KindSharedPrivate

	// KindSharedMappable indicates a shared futex on a memory mapping other
	// than a private anonymous memory mapping.
	KindSharedMappable
)
    47  
// Key represents something that a futex waiter may wait on.
//
// Two Keys refer to the same futex when their Kind, Mappable, and Offset are
// all equal; MappingIdentity is only used for reference counting.
type Key struct {
	// Kind is the type of the Key.
	Kind KeyKind

	// Mappable is the memory-mapped object that is represented by the Key.
	// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
	// even if it is.
	Mappable memmap.Mappable

	// MappingIdentity is the MappingIdentity associated with Mappable.
	// MappingIdentity is always nil if Mappable is nil, and may be nil even if
	// Mappable isn't.
	MappingIdentity memmap.MappingIdentity

	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
	// memory address. Otherwise, Offset is the represented offset into
	// Mappable.
	Offset uint64
}
    68  
    69  func (k *Key) release(t Target) {
    70  	if k.MappingIdentity != nil {
    71  		k.MappingIdentity.DecRef(t)
    72  	}
    73  	k.Mappable = nil
    74  	k.MappingIdentity = nil
    75  }
    76  
    77  func (k *Key) clone() Key {
    78  	if k.MappingIdentity != nil {
    79  		k.MappingIdentity.IncRef()
    80  	}
    81  	return *k
    82  }
    83  
    84  // Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
    85  func (k *Key) addr() hostarch.Addr {
    86  	return hostarch.Addr(k.Offset)
    87  }
    88  
    89  // matches returns true if a wakeup on k2 should wake a waiter waiting on k.
    90  func (k *Key) matches(k2 *Key) bool {
    91  	// k.MappingIdentity is ignored; it's only used for reference counting.
    92  	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
    93  }
    94  
// Target abstracts memory accesses and keys.
//
// All memory accessors operate on the 32-bit futex word located at addr.
type Target interface {
	context.Context

	// SwapUint32 gives access to hostarch.IO.SwapUint32.
	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)

	// CompareAndSwapUint32 gives access to hostarch.IO.CompareAndSwapUint32.
	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)

	// LoadUint32 gives access to hostarch.IO.LoadUint32.
	LoadUint32(addr hostarch.Addr) (uint32, error)

	// GetSharedKey returns a Key with kind KindSharedPrivate or
	// KindSharedMappable corresponding to the memory mapped at address addr.
	//
	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
	// reference is held on the MappingIdentity, which must be dropped by the
	// caller when the Key is no longer in use.
	GetSharedKey(addr hostarch.Addr) (Key, error)
}
   116  
   117  // check performs a basic equality check on the given address.
   118  func check(t Target, addr hostarch.Addr, val uint32) error {
   119  	cur, err := t.LoadUint32(addr)
   120  	if err != nil {
   121  		return err
   122  	}
   123  	if cur != val {
   124  		return linuxerr.EAGAIN
   125  	}
   126  	return nil
   127  }
   128  
   129  // atomicOp performs a complex operation on the given address.
   130  func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) {
   131  	opType := (opIn >> 28) & 0xf
   132  	cmp := (opIn >> 24) & 0xf
   133  	opArg := (opIn >> 12) & 0xfff
   134  	cmpArg := opIn & 0xfff
   135  
   136  	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
   137  		opArg = 1 << opArg
   138  		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
   139  	}
   140  
   141  	var (
   142  		oldVal uint32
   143  		err    error
   144  	)
   145  	if opType == linux.FUTEX_OP_SET {
   146  		oldVal, err = t.SwapUint32(addr, opArg)
   147  		if err != nil {
   148  			return false, err
   149  		}
   150  	} else {
   151  		for {
   152  			oldVal, err = t.LoadUint32(addr)
   153  			if err != nil {
   154  				return false, err
   155  			}
   156  			var newVal uint32
   157  			switch opType {
   158  			case linux.FUTEX_OP_ADD:
   159  				newVal = oldVal + opArg
   160  			case linux.FUTEX_OP_OR:
   161  				newVal = oldVal | opArg
   162  			case linux.FUTEX_OP_ANDN:
   163  				newVal = oldVal &^ opArg
   164  			case linux.FUTEX_OP_XOR:
   165  				newVal = oldVal ^ opArg
   166  			default:
   167  				return false, linuxerr.ENOSYS
   168  			}
   169  			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
   170  			if err != nil {
   171  				return false, err
   172  			}
   173  			if prev == oldVal {
   174  				break // Success.
   175  			}
   176  		}
   177  	}
   178  
   179  	switch cmp {
   180  	case linux.FUTEX_OP_CMP_EQ:
   181  		return oldVal == cmpArg, nil
   182  	case linux.FUTEX_OP_CMP_NE:
   183  		return oldVal != cmpArg, nil
   184  	case linux.FUTEX_OP_CMP_LT:
   185  		return oldVal < cmpArg, nil
   186  	case linux.FUTEX_OP_CMP_LE:
   187  		return oldVal <= cmpArg, nil
   188  	case linux.FUTEX_OP_CMP_GT:
   189  		return oldVal > cmpArg, nil
   190  	case linux.FUTEX_OP_CMP_GE:
   191  		return oldVal >= cmpArg, nil
   192  	default:
   193  		return false, linuxerr.ENOSYS
   194  	}
   195  }
   196  
// Waiter is the struct which gets enqueued into buckets for wake up routines
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
// WaitPrepare(), callers may listen on C for wake up events.
type Waiter struct {
	// Synchronization:
	//
	//	- A Waiter that is not enqueued in a bucket is exclusively owned (no
	//		synchronization applies).
	//
	//	- A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
	//		waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
	//		lock") of the containing bucket, and bitmask is immutable. Note that
	//		since bucket is mutated using atomic memory operations, bucket.Load()
	//		may be called without holding the bucket lock, although it may change
	//		racily. See WaitComplete().
	//
	//	- A Waiter is only guaranteed to be no longer queued after calling
	//		WaitComplete().

	// waiterEntry links Waiter into bucket.waiters.
	waiterEntry

	// bucket is the bucket this waiter is queued in. If bucket is nil, the
	// waiter is not waiting and is not in any bucket.
	bucket AtomicPtrBucket

	// C is sent to when the Waiter is woken.
	C chan struct{}

	// key is what this waiter is waiting on.
	key Key

	// bitmask is the bitmask we're waiting on.
	// This is used in the case of a FUTEX_WAKE_BITSET: a wakeup only applies
	// to this waiter if its bitmask shares at least one bit with bitmask.
	bitmask uint32

	// tid is the thread ID for the waiter in case this is a PI mutex.
	tid uint32
}
   236  
   237  // NewWaiter returns a new unqueued Waiter.
   238  func NewWaiter() *Waiter {
   239  	return &Waiter{
   240  		C: make(chan struct{}, 1),
   241  	}
   242  }
   243  
   244  // woken returns true if w has been woken since the last call to WaitPrepare.
   245  func (w *Waiter) woken() bool {
   246  	return len(w.C) != 0
   247  }
   248  
// bucket holds a list of waiters for a given address hash.
//
// +stateify savable
type bucket struct {
	// mu protects waiters and contained Waiter state. See comment in Waiter.
	mu futexBucketMutex `state:"nosave"`

	// waiters is the list of Waiters currently enqueued in this bucket.
	waiters waiterList `state:"zerovalue"`
}
   258  
   259  // wakeLocked wakes up to n waiters matching the bitmask at the addr for this
   260  // bucket and returns the number of waiters woken.
   261  //
   262  // Preconditions: b.mu must be locked.
   263  func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
   264  	done := 0
   265  	for w := b.waiters.Front(); done < n && w != nil; {
   266  		if !w.key.matches(key) || w.bitmask&bitmask == 0 {
   267  			// Not matching.
   268  			w = w.Next()
   269  			continue
   270  		}
   271  
   272  		// Remove from the bucket and wake the waiter.
   273  		woke := w
   274  		w = w.Next() // Next iteration.
   275  		b.wakeWaiterLocked(woke)
   276  		done++
   277  	}
   278  	return done
   279  }
   280  
// wakeWaiterLocked removes w from b's waiter list, signals it by sending on
// w.C, and finally clears w.bucket.
//
// Preconditions: b.mu must be locked.
func (b *bucket) wakeWaiterLocked(w *Waiter) {
	// Remove from the bucket and wake the waiter.
	b.waiters.Remove(w)
	w.C <- struct{}{}

	// NOTE: The above channel write establishes a write barrier according
	// to the memory model, so nothing may be ordered around it. Since
	// we've dequeued w and will never touch it again, we can safely
	// store nil to w.bucket here and allow the WaitComplete() to
	// short-circuit grabbing the bucket lock. If WaitComplete() misses
	// the store and still observes the old bucket, it has to acquire the
	// bucket lock, which we are still holding, so it cannot dequeue w and
	// treat it as free until after the store below has happened.
	w.bucket.Store(nil)
}
   296  
   297  // requeueLocked takes n waiters from the bucket and moves them to naddr on the
   298  // bucket "to".
   299  //
   300  // Preconditions: b and to must be locked.
   301  func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
   302  	done := 0
   303  	for w := b.waiters.Front(); done < n && w != nil; {
   304  		if !w.key.matches(key) {
   305  			// Not matching.
   306  			w = w.Next()
   307  			continue
   308  		}
   309  
   310  		requeued := w
   311  		w = w.Next() // Next iteration.
   312  		b.waiters.Remove(requeued)
   313  		requeued.key.release(t)
   314  		requeued.key = nkey.clone()
   315  		to.waiters.PushBack(requeued)
   316  		requeued.bucket.Store(to)
   317  		done++
   318  	}
   319  	return done
   320  }
   321  
const (
	// bucketCount is the number of buckets per Manager. By having many of
	// these we reduce contention when concurrent yet unrelated calls are made.
	bucketCount     = 1 << bucketCountBits
	// bucketCountBits is log2(bucketCount); with the value 10 there are 1024
	// buckets.
	bucketCountBits = 10
)
   328  
   329  // getKey returns a Key representing address addr in c.
   330  func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) {
   331  	// Ensure the address is aligned.
   332  	// It must be a DWORD boundary.
   333  	if addr&0x3 != 0 {
   334  		return Key{}, linuxerr.EINVAL
   335  	}
   336  	if private {
   337  		return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
   338  	}
   339  	return t.GetSharedKey(addr)
   340  }
   341  
// bucketIndexForAddr returns the index into Manager.privateBuckets for addr.
// The result is always in the range [0, bucketCount).
func bucketIndexForAddr(addr hostarch.Addr) uintptr {
	//	- The bottom 2 bits of addr must be 0, per getKey.
	//
	//	- On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
	//		for a canonical address, and (on all existing platforms) bit 47 must be
	//		0 for an application address.
	//
	// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
	// bits. We choose one of the simplest possible hash functions that at
	// least uses all 45 useful bits in the output, given that bucketCountBits
	// == 10. This hash function also has the property that it will usually map
	// adjacent addresses to adjacent buckets, slightly improving memory
	// locality when an application synchronization structure uses multiple
	// nearby futexes.
	//
	// Note that despite the large number of arithmetic operations in the
	// function, many components can be computed in parallel, such that the
	// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
	// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
	// (addr >> 42)" without any additional grouping, the compiler puts all 4
	// additions in the critical path.
	h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
	h2 := uintptr(addr>>32) + uintptr(addr>>42)
	return (h1 + h2) % bucketCount
}
   368  
// Manager holds futex state for a single virtual address space.
//
// +stateify savable
type Manager struct {
	// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
	// futexes. A futex's bucket is selected by hashing its address with
	// bucketIndexForAddr.
	privateBuckets [bucketCount]bucket `state:"zerovalue"`

	// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
	// may be shared by multiple Managers. The sharedBucket pointer is
	// immutable.
	sharedBucket *bucket
}
   382  
   383  // NewManager returns an initialized futex manager.
   384  func NewManager() *Manager {
   385  	return &Manager{
   386  		sharedBucket: &bucket{},
   387  	}
   388  }
   389  
   390  // Fork returns a new Manager. Shared futex clients using the returned Manager
   391  // may interoperate with those using m.
   392  func (m *Manager) Fork() *Manager {
   393  	return &Manager{
   394  		sharedBucket: m.sharedBucket,
   395  	}
   396  }
   397  
   398  // lockBucket returns a locked bucket for the given key.
   399  // +checklocksacquire:b.mu
   400  func (m *Manager) lockBucket(k *Key) (b *bucket) {
   401  	if k.Kind == KindSharedMappable {
   402  		b = m.sharedBucket
   403  	} else {
   404  		b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
   405  	}
   406  	b.mu.Lock()
   407  	return b
   408  }
   409  
   410  // lockBuckets returns locked buckets for the given keys.
   411  // It returns which bucket was locked first and second. They may be nil in case the buckets are
   412  // identical or they did not need locking.
   413  //
   414  // +checklocksacquire:lockedFirst.mu
   415  // +checklocksacquire:lockedSecond.mu
   416  func (m *Manager) lockBuckets(k1, k2 *Key) (b1, b2, lockedFirst, lockedSecond *bucket) {
   417  	// Buckets must be consistently ordered to avoid circular lock
   418  	// dependencies. We order buckets in m.privateBuckets by index (lowest
   419  	// index first), and all buckets in m.privateBuckets precede
   420  	// m.sharedBucket.
   421  
   422  	// Handle the common case first:
   423  	if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
   424  		i1 := bucketIndexForAddr(k1.addr())
   425  		i2 := bucketIndexForAddr(k2.addr())
   426  		b1 = &m.privateBuckets[i1]
   427  		b2 = &m.privateBuckets[i2]
   428  		switch {
   429  		case i1 < i2:
   430  			b1.mu.Lock()
   431  			b2.mu.NestedLock(futexBucketLockB)
   432  			return b1, b2, b1, b2
   433  		case i2 < i1:
   434  			b2.mu.Lock()
   435  			b1.mu.NestedLock(futexBucketLockB)
   436  			return b1, b2, b2, b1
   437  		default:
   438  			b1.mu.Lock()
   439  			return b1, b2, b1, nil // +checklocksforce
   440  		}
   441  	}
   442  
   443  	// At least one of b1 or b2 should be m.sharedBucket.
   444  	b1 = m.sharedBucket
   445  	b2 = m.sharedBucket
   446  	if k1.Kind != KindSharedMappable {
   447  		b1 = m.lockBucket(k1)
   448  		b2.mu.NestedLock(futexBucketLockB)
   449  		return b1, b2, b1, b2
   450  	}
   451  	if k2.Kind != KindSharedMappable {
   452  		b2 = m.lockBucket(k2)
   453  		b1.mu.NestedLock(futexBucketLockB)
   454  		return b1, b2, b2, b1
   455  	}
   456  	return b1, b2, nil, nil // +checklocksforce
   457  }
   458  
   459  // unlockBuckets unlocks two buckets.
   460  // +checklocksrelease:lockedFirst.mu
   461  // +checklocksrelease:lockedSecond.mu
   462  func (m *Manager) unlockBuckets(lockedFirst, lockedSecond *bucket) {
   463  	if lockedSecond != nil {
   464  		lockedSecond.mu.NestedUnlock(futexBucketLockB)
   465  	}
   466  	if lockedFirst != nil && lockedFirst != lockedSecond {
   467  		lockedFirst.mu.Unlock()
   468  	}
   469  	return
   470  }
   471  
   472  // Wake wakes up to n waiters matching the bitmask on the given addr.
   473  // The number of waiters woken is returned.
   474  func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) {
   475  	// This function is very hot; avoid defer.
   476  	k, err := getKey(t, addr, private)
   477  	if err != nil {
   478  		return 0, err
   479  	}
   480  
   481  	b := m.lockBucket(&k)
   482  	r := b.wakeLocked(&k, bitmask, n)
   483  
   484  	b.mu.Unlock()
   485  	k.release(t)
   486  	return r, nil
   487  }
   488  
   489  func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
   490  	k1, err := getKey(t, addr, private)
   491  	if err != nil {
   492  		return 0, err
   493  	}
   494  	defer k1.release(t)
   495  	k2, err := getKey(t, naddr, private)
   496  	if err != nil {
   497  		return 0, err
   498  	}
   499  	defer k2.release(t)
   500  
   501  	b1, b2, lockedFirst, lockedSecond := m.lockBuckets(&k1, &k2)
   502  	defer m.unlockBuckets(lockedFirst, lockedSecond)
   503  
   504  	if checkval {
   505  		if err := check(t, addr, val); err != nil {
   506  			return 0, err
   507  		}
   508  	}
   509  
   510  	// Wake the number required.
   511  	done := b1.wakeLocked(&k1, ^uint32(0), nwake)
   512  
   513  	// Requeue the number required.
   514  	b1.requeueLocked(t, b2, &k1, &k2, nreq)
   515  
   516  	return done, nil
   517  }
   518  
   519  // Requeue wakes up to nwake waiters on the given addr, and unconditionally
   520  // requeues up to nreq waiters on naddr.
   521  func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) {
   522  	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
   523  }
   524  
   525  // RequeueCmp atomically checks that the addr contains val (via the Target),
   526  // wakes up to nwake waiters on addr and then unconditionally requeues nreq
   527  // waiters on naddr.
   528  func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
   529  	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
   530  }
   531  
   532  // WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
   533  // waiters unconditionally from addr1, and, based on the original value at addr2
   534  // and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
   535  // It returns the total number of waiters woken.
   536  func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
   537  	k1, err := getKey(t, addr1, private)
   538  	if err != nil {
   539  		return 0, err
   540  	}
   541  	defer k1.release(t)
   542  	k2, err := getKey(t, addr2, private)
   543  	if err != nil {
   544  		return 0, err
   545  	}
   546  	defer k2.release(t)
   547  
   548  	b1, b2, lockedFirst, lockedSecond := m.lockBuckets(&k1, &k2)
   549  	defer m.unlockBuckets(lockedFirst, lockedSecond)
   550  
   551  	done := 0
   552  	cond, err := atomicOp(t, addr2, op)
   553  	if err != nil {
   554  		return 0, err
   555  	}
   556  
   557  	// Wake up up to nwake1 entries from the first bucket.
   558  	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)
   559  
   560  	// Wake up up to nwake2 entries from the second bucket if the
   561  	// operation yielded true.
   562  	if cond {
   563  		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
   564  	}
   565  
   566  	return done, nil
   567  }
   568  
   569  // WaitPrepare atomically checks that addr contains val (via the Checker), then
   570  // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
   571  // Waiter must be subsequently removed by calling WaitComplete, whether or not
   572  // a wakeup is received on w.C.
   573  func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error {
   574  	k, err := getKey(t, addr, private)
   575  	if err != nil {
   576  		return err
   577  	}
   578  	// Ownership of k is transferred to w below.
   579  
   580  	// Prepare the Waiter before taking the bucket lock.
   581  	select {
   582  	case <-w.C:
   583  	default:
   584  	}
   585  	w.key = k
   586  	w.bitmask = bitmask
   587  
   588  	b := m.lockBucket(&k)
   589  	// This function is very hot; avoid defer.
   590  
   591  	// Perform our atomic check.
   592  	if err := check(t, addr, val); err != nil {
   593  		b.mu.Unlock()
   594  		w.key.release(t)
   595  		return err
   596  	}
   597  
   598  	// Add the waiter to the bucket.
   599  	b.waiters.PushBack(w)
   600  	w.bucket.Store(b)
   601  
   602  	b.mu.Unlock()
   603  	return nil
   604  }
   605  
// WaitComplete must be called when a Waiter previously added by WaitPrepare is
// no longer eligible to be woken.
//
// It removes w from the bucket it is queued in, if any, and releases the
// references held by w's key.
func (m *Manager) WaitComplete(w *Waiter, t Target) {
	// Remove w from the bucket it's in.
	for {
		b := w.bucket.Load()

		// If b is nil, the waiter isn't in any bucket anymore. This can't be
		// racy because the waiter can't be concurrently re-queued in another
		// bucket.
		if b == nil {
			break
		}

		// Take the bucket lock. Note that without holding the bucket lock, the
		// waiter is not guaranteed to stay in that bucket, so after we take
		// the bucket lock, we must ensure that the bucket hasn't changed: if
		// it happens to have changed, we release the old bucket lock and try
		// again with the new bucket; if it hasn't changed, we know it won't
		// change now because we hold the lock.
		b.mu.Lock()
		if b != w.bucket.Load() {
			b.mu.Unlock()
			continue
		}

		// Remove waiter from bucket.
		b.waiters.Remove(w)
		w.bucket.Store(nil)
		b.mu.Unlock()
		break
	}

	// Release references held by the waiter.
	w.key.release(t)
}
   642  
   643  // LockPI attempts to lock the futex following the Priority-inheritance futex
   644  // rules. The lock is acquired only when 'addr' points to 0. The TID of the
   645  // calling task is set to 'addr' to indicate the futex is owned. It returns true
   646  // if the futex was successfully acquired.
   647  //
   648  // FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
   649  // exit_robust_list()). Given we don't support robust lists, although handled
   650  // below, it's never set.
   651  func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) {
   652  	k, err := getKey(t, addr, private)
   653  	if err != nil {
   654  		return false, err
   655  	}
   656  	// Ownership of k is transferred to w below.
   657  
   658  	// Prepare the Waiter before taking the bucket lock.
   659  	select {
   660  	case <-w.C:
   661  	default:
   662  	}
   663  	w.key = k
   664  	w.tid = tid
   665  
   666  	b := m.lockBucket(&k)
   667  	// Hot function: avoid defers.
   668  
   669  	success, err := m.lockPILocked(w, t, addr, tid, b, try)
   670  	if err != nil {
   671  		w.key.release(t)
   672  		b.mu.Unlock()
   673  		return false, err
   674  	}
   675  	if success || try {
   676  		// Release waiter if it's not going to be a wait.
   677  		w.key.release(t)
   678  	}
   679  	b.mu.Unlock()
   680  	return success, nil
   681  }
   682  
// lockPILocked performs the locked portion of LockPI on the futex word at
// addr, on behalf of the task identified by tid.
//
// It returns (true, nil) if the futex was acquired, and (false, EDEADLK) if
// the TID stored in the futex word already equals tid. If the futex is owned
// by another TID, it returns (false, nil): when try is set nothing else
// happens, otherwise the FUTEX_WAITERS bit is set in the futex word (if not
// already set) and w is enqueued in b. Errors from t's memory accessors are
// returned as is.
//
// Preconditions: b.mu must be locked.
func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) {
	// Each iteration works on a fresh snapshot of the futex word; a failed
	// compare-and-swap means the word changed underneath us, so start over.
	for {
		cur, err := t.LoadUint32(addr)
		if err != nil {
			return false, err
		}
		if (cur & linux.FUTEX_TID_MASK) == tid {
			// The caller already owns the futex.
			return false, linuxerr.EDEADLK
		}

		if (cur & linux.FUTEX_TID_MASK) == 0 {
			// No owner and no waiters, try to acquire the futex.

			// Set TID and preserve owner died status.
			val := tid
			val |= cur & linux.FUTEX_OWNER_DIED
			prev, err := t.CompareAndSwapUint32(addr, cur, val)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				// Linux reacquires the bucket lock on retries, which will re-lookup the
				// mapping at the futex address. However, retrying while holding the
				// lock is more efficient and reduces the chance of another conflict.
				continue
			}
			// Futex acquired.
			return true, nil
		}

		// Futex is already owned, prepare to wait.

		if try {
			// Caller doesn't want to wait.
			return false, nil
		}

		// Set waiters bit if not set yet.
		if cur&linux.FUTEX_WAITERS == 0 {
			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				continue
			}
		}

		// Add the waiter to the bucket.
		b.waiters.PushBack(w)
		w.bucket.Store(b)
		return false, nil
	}
}
   739  
   740  // UnlockPI unlocks the futex following the Priority-inheritance futex rules.
   741  // The address provided must contain the caller's TID. If there are waiters,
   742  // TID of the next waiter (FIFO) is set to the given address, and the waiter
   743  // woken up. If there are no waiters, 0 is set to the address.
   744  func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error {
   745  	k, err := getKey(t, addr, private)
   746  	if err != nil {
   747  		return err
   748  	}
   749  	b := m.lockBucket(&k)
   750  
   751  	err = m.unlockPILocked(t, addr, tid, b, &k)
   752  
   753  	k.release(t)
   754  	b.mu.Unlock()
   755  	return err
   756  }
   757  
// unlockPILocked performs the locked portion of UnlockPI on the futex word at
// addr, which must currently contain tid as its owner.
//
// It returns EPERM if the TID stored in the futex word is not tid. If no
// waiter in b is waiting on key, the futex word is reset to 0, and EAGAIN is
// returned if the word changed concurrently. Otherwise ownership is handed to
// the first matching waiter in b, which is then woken; EINVAL is returned if
// the word changed concurrently. Errors from t's memory accessors are
// returned as is.
//
// Preconditions: b.mu must be locked.
func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}

	if (cur & linux.FUTEX_TID_MASK) != tid {
		return linuxerr.EPERM
	}

	// Find the first two waiters on key, in queue order. The second one is
	// only needed to know whether waiters remain after the handover.
	var next *Waiter  // Who's the next owner?
	var next2 *Waiter // Who's the one after that?
	for w := b.waiters.Front(); w != nil; w = w.Next() {
		if !w.key.matches(key) {
			continue
		}

		if next == nil {
			next = w
		} else {
			next2 = w
			break
		}
	}

	if next == nil {
		// It's safe to set 0 because there are no waiters, no new owner, and the
		// executing task is the current owner (no owner died bit).
		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
		if err != nil {
			return err
		}
		if prev != cur {
			// Let user mode handle CAS races. This is different than lock, which
			// retries when CAS fails.
			return linuxerr.EAGAIN
		}
		return nil
	}

	// Set next owner's TID, and the waiters bit if there are further waiters.
	// The new value is built from scratch, so an owner died bit that may be
	// set in cur is not carried over to the new owner.
	val := next.tid
	if next2 != nil {
		val |= linux.FUTEX_WAITERS
	}

	prev, err := t.CompareAndSwapUint32(addr, cur, val)
	if err != nil {
		return err
	}
	if prev != cur {
		return linuxerr.EINVAL
	}

	// The futex word now names next as the owner; dequeue and wake it.
	b.wakeWaiterLocked(next)
	return nil
}