github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/futex/futex.go

github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/futex/futex.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package futex provides an implementation of the futex interface as found in
    16  // the Linux kernel. It allows one to easily transform Wait() calls into waits
    17  // on a channel, which is useful in a Go-based kernel, for example.
    18  package futex
    19  
    20  import (
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/hostarch"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    26  	"github.com/SagerNet/gvisor/pkg/sync"
    27  	"github.com/SagerNet/gvisor/pkg/syserror"
    28  )
    29  
// KeyKind indicates the type of a Key.
type KeyKind int

// The possible kinds of a Key. Two keys only match if their kinds are equal
// (see Key.matches).
const (
	// KindPrivate indicates a private futex (a futex syscall with the
	// FUTEX_PRIVATE_FLAG set).
	KindPrivate KeyKind = iota

	// KindSharedPrivate indicates a shared futex on a private memory mapping.
	// Although KindPrivate and KindSharedPrivate futexes both use memory
	// addresses to identify futexes, they do not interoperate (in Linux, the
	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
	// comparison).
	KindSharedPrivate

	// KindSharedMappable indicates a shared futex on a memory mapping other
	// than a private anonymous memory mapping.
	KindSharedMappable
)
    49  
// Key represents something that a futex waiter may wait on.
type Key struct {
	// Kind is the type of the Key.
	Kind KeyKind

	// Mappable is the memory-mapped object that is represented by the Key.
	// Mappable is always nil if Kind is not KindSharedMappable, and may be nil
	// even if it is.
	Mappable memmap.Mappable

	// MappingIdentity is the MappingIdentity associated with Mappable.
	// MappingIdentity is always nil if Mappable is nil, and may be nil even if
	// it isn't. It is used only for reference counting and is ignored when
	// comparing keys.
	MappingIdentity memmap.MappingIdentity

	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
	// memory address. Otherwise, Offset is the represented offset into
	// Mappable.
	Offset uint64
}
    70  
    71  func (k *Key) release(t Target) {
    72  	if k.MappingIdentity != nil {
    73  		k.MappingIdentity.DecRef(t)
    74  	}
    75  	k.Mappable = nil
    76  	k.MappingIdentity = nil
    77  }
    78  
    79  func (k *Key) clone() Key {
    80  	if k.MappingIdentity != nil {
    81  		k.MappingIdentity.IncRef()
    82  	}
    83  	return *k
    84  }
    85  
    86  // Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
    87  func (k *Key) addr() hostarch.Addr {
    88  	return hostarch.Addr(k.Offset)
    89  }
    90  
    91  // matches returns true if a wakeup on k2 should wake a waiter waiting on k.
    92  func (k *Key) matches(k2 *Key) bool {
    93  	// k.MappingIdentity is ignored; it's only used for reference counting.
    94  	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
    95  }
    96  
// Target abstracts memory accesses and keys. It is the view of the calling
// task's address space that the futex operations in this package use.
type Target interface {
	context.Context

	// SwapUint32 gives access to hostarch.IO.SwapUint32.
	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)

	// CompareAndSwapUint32 gives access to hostarch.IO.CompareAndSwapUint32.
	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)

	// LoadUint32 gives access to hostarch.IO.LoadUint32.
	LoadUint32(addr hostarch.Addr) (uint32, error)

	// GetSharedKey returns a Key with kind KindSharedPrivate or
	// KindSharedMappable corresponding to the memory mapped at address addr.
	//
	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
	// reference is held on the MappingIdentity, which must be dropped by the
	// caller when the Key is no longer in use.
	GetSharedKey(addr hostarch.Addr) (Key, error)
}
   118  
   119  // check performs a basic equality check on the given address.
   120  func check(t Target, addr hostarch.Addr, val uint32) error {
   121  	cur, err := t.LoadUint32(addr)
   122  	if err != nil {
   123  		return err
   124  	}
   125  	if cur != val {
   126  		return linuxerr.EAGAIN
   127  	}
   128  	return nil
   129  }
   130  
   131  // atomicOp performs a complex operation on the given address.
   132  func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) {
   133  	opType := (opIn >> 28) & 0xf
   134  	cmp := (opIn >> 24) & 0xf
   135  	opArg := (opIn >> 12) & 0xfff
   136  	cmpArg := opIn & 0xfff
   137  
   138  	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
   139  		opArg = 1 << opArg
   140  		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
   141  	}
   142  
   143  	var (
   144  		oldVal uint32
   145  		err    error
   146  	)
   147  	if opType == linux.FUTEX_OP_SET {
   148  		oldVal, err = t.SwapUint32(addr, opArg)
   149  		if err != nil {
   150  			return false, err
   151  		}
   152  	} else {
   153  		for {
   154  			oldVal, err = t.LoadUint32(addr)
   155  			if err != nil {
   156  				return false, err
   157  			}
   158  			var newVal uint32
   159  			switch opType {
   160  			case linux.FUTEX_OP_ADD:
   161  				newVal = oldVal + opArg
   162  			case linux.FUTEX_OP_OR:
   163  				newVal = oldVal | opArg
   164  			case linux.FUTEX_OP_ANDN:
   165  				newVal = oldVal &^ opArg
   166  			case linux.FUTEX_OP_XOR:
   167  				newVal = oldVal ^ opArg
   168  			default:
   169  				return false, syserror.ENOSYS
   170  			}
   171  			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
   172  			if err != nil {
   173  				return false, err
   174  			}
   175  			if prev == oldVal {
   176  				break // Success.
   177  			}
   178  		}
   179  	}
   180  
   181  	switch cmp {
   182  	case linux.FUTEX_OP_CMP_EQ:
   183  		return oldVal == cmpArg, nil
   184  	case linux.FUTEX_OP_CMP_NE:
   185  		return oldVal != cmpArg, nil
   186  	case linux.FUTEX_OP_CMP_LT:
   187  		return oldVal < cmpArg, nil
   188  	case linux.FUTEX_OP_CMP_LE:
   189  		return oldVal <= cmpArg, nil
   190  	case linux.FUTEX_OP_CMP_GT:
   191  		return oldVal > cmpArg, nil
   192  	case linux.FUTEX_OP_CMP_GE:
   193  		return oldVal >= cmpArg, nil
   194  	default:
   195  		return false, syserror.ENOSYS
   196  	}
   197  }
   198  
// Waiter is the struct which gets enqueued into buckets for wake up routines
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
// WaitPrepare(), callers may listen on C for wake up events.
type Waiter struct {
	// Synchronization:
	//
	// - A Waiter that is not enqueued in a bucket is exclusively owned (no
	// synchronization applies).
	//
	// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
	// waiterEntry, bucket, and key are protected by the bucket.mu ("bucket
	// lock") of the containing bucket, and bitmask is immutable. Note that
	// since bucket is mutated using atomic memory operations, bucket.Load()
	// may be called without holding the bucket lock, although it may change
	// racily. See WaitComplete().
	//
	// - A Waiter is only guaranteed to be no longer queued after calling
	// WaitComplete().

	// waiterEntry links Waiter into bucket.waiters.
	waiterEntry

	// bucket is the bucket this waiter is queued in. If bucket is nil, the
	// waiter is not waiting and is not in any bucket.
	bucket AtomicPtrBucket

	// C is sent to when the Waiter is woken.
	C chan struct{}

	// key is what this waiter is waiting on.
	key Key

	// The bitmask we're waiting on.
	// This is used in the case of a FUTEX_WAKE_BITSET: a wake only applies to
	// this waiter if the wake's bitmask and this bitmask have a bit in common.
	bitmask uint32

	// tid is the thread ID for the waiter in case this is a PI mutex.
	tid uint32
}
   238  
   239  // NewWaiter returns a new unqueued Waiter.
   240  func NewWaiter() *Waiter {
   241  	return &Waiter{
   242  		C: make(chan struct{}, 1),
   243  	}
   244  }
   245  
   246  // woken returns true if w has been woken since the last call to WaitPrepare.
   247  func (w *Waiter) woken() bool {
   248  	return len(w.C) != 0
   249  }
   250  
// bucket holds a list of waiters for a given address hash. Waiters on
// different keys may share a bucket, so operations on a bucket filter the
// list by Key.
//
// +stateify savable
type bucket struct {
	// mu protects waiters and contained Waiter state. See comment in Waiter.
	mu sync.Mutex `state:"nosave"`

	// waiters is the list of Waiters currently enqueued in this bucket, in
	// the order in which they were enqueued.
	waiters waiterList `state:"zerovalue"`
}
   260  
   261  // wakeLocked wakes up to n waiters matching the bitmask at the addr for this
   262  // bucket and returns the number of waiters woken.
   263  //
   264  // Preconditions: b.mu must be locked.
   265  func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int {
   266  	done := 0
   267  	for w := b.waiters.Front(); done < n && w != nil; {
   268  		if !w.key.matches(key) || w.bitmask&bitmask == 0 {
   269  			// Not matching.
   270  			w = w.Next()
   271  			continue
   272  		}
   273  
   274  		// Remove from the bucket and wake the waiter.
   275  		woke := w
   276  		w = w.Next() // Next iteration.
   277  		b.wakeWaiterLocked(woke)
   278  		done++
   279  	}
   280  	return done
   281  }
   282  
// wakeWaiterLocked dequeues w from b, notifies it through w.C, and marks it
// as no longer belonging to any bucket.
//
// w.C is created with capacity 1 by NewWaiter and is drained by WaitPrepare
// and LockPI before a Waiter is enqueued, so the send below is not expected
// to block.
//
// Preconditions: b.mu must be locked, and w must be enqueued in b.
func (b *bucket) wakeWaiterLocked(w *Waiter) {
	// Remove from the bucket and wake the waiter.
	b.waiters.Remove(w)
	w.C <- struct{}{}

	// NOTE: The above channel write establishes a write barrier according
	// to the memory model, so nothing may be ordered around it. Since
	// we've dequeued w and will never touch it again, we can safely
	// store nil to w.bucket here and allow the WaitComplete() to
	// short-circuit grabbing the bucket lock. If they somehow miss the
	// store, we are still holding the lock, so we can know that they won't
	// dequeue w, assume it's free and have the below operation
	// afterwards.
	w.bucket.Store(nil)
}
   298  
   299  // requeueLocked takes n waiters from the bucket and moves them to naddr on the
   300  // bucket "to".
   301  //
   302  // Preconditions: b and to must be locked.
   303  func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int {
   304  	done := 0
   305  	for w := b.waiters.Front(); done < n && w != nil; {
   306  		if !w.key.matches(key) {
   307  			// Not matching.
   308  			w = w.Next()
   309  			continue
   310  		}
   311  
   312  		requeued := w
   313  		w = w.Next() // Next iteration.
   314  		b.waiters.Remove(requeued)
   315  		requeued.key.release(t)
   316  		requeued.key = nkey.clone()
   317  		to.waiters.PushBack(requeued)
   318  		requeued.bucket.Store(to)
   319  		done++
   320  	}
   321  	return done
   322  }
   323  
const (
	// bucketCount is the number of buckets per Manager. By having many of
	// these we reduce contention when concurrent yet unrelated calls are made.
	// It is defined as a power of two, 1 << bucketCountBits; the hash in
	// bucketIndexForAddr is designed around bucketCountBits == 10.
	bucketCount     = 1 << bucketCountBits
	bucketCountBits = 10
)
   330  
   331  // getKey returns a Key representing address addr in c.
   332  func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) {
   333  	// Ensure the address is aligned.
   334  	// It must be a DWORD boundary.
   335  	if addr&0x3 != 0 {
   336  		return Key{}, linuxerr.EINVAL
   337  	}
   338  	if private {
   339  		return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil
   340  	}
   341  	return t.GetSharedKey(addr)
   342  }
   343  
   344  // bucketIndexForAddr returns the index into Manager.buckets for addr.
   345  func bucketIndexForAddr(addr hostarch.Addr) uintptr {
   346  	// - The bottom 2 bits of addr must be 0, per getKey.
   347  	//
   348  	// - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
   349  	// for a canonical address, and (on all existing platforms) bit 47 must be
   350  	// 0 for an application address.
   351  	//
   352  	// Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
   353  	// bits. We choose one of the simplest possible hash functions that at
   354  	// least uses all 45 useful bits in the output, given that bucketCountBits
   355  	// == 10. This hash function also has the property that it will usually map
   356  	// adjacent addresses to adjacent buckets, slightly improving memory
   357  	// locality when an application synchronization structure uses multiple
   358  	// nearby futexes.
   359  	//
   360  	// Note that despite the large number of arithmetic operations in the
   361  	// function, many components can be computed in parallel, such that the
   362  	// critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
   363  	// is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
   364  	// (addr >> 42)" without any additional grouping, the compiler puts all 4
   365  	// additions in the critical path.
   366  	h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22)
   367  	h2 := uintptr(addr>>32) + uintptr(addr>>42)
   368  	return (h1 + h2) % bucketCount
   369  }
   370  
// Manager holds futex state for a single virtual address space.
//
// +stateify savable
type Manager struct {
	// privateBuckets holds buckets for KindPrivate and KindSharedPrivate
	// futexes. The bucket for a given key is selected by hashing the key's
	// address with bucketIndexForAddr.
	privateBuckets [bucketCount]bucket `state:"zerovalue"`

	// sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket
	// may be shared by multiple Managers (see Fork). The sharedBucket pointer
	// is immutable.
	sharedBucket *bucket
}
   384  
   385  // NewManager returns an initialized futex manager.
   386  func NewManager() *Manager {
   387  	return &Manager{
   388  		sharedBucket: &bucket{},
   389  	}
   390  }
   391  
   392  // Fork returns a new Manager. Shared futex clients using the returned Manager
   393  // may interoperate with those using m.
   394  func (m *Manager) Fork() *Manager {
   395  	return &Manager{
   396  		sharedBucket: m.sharedBucket,
   397  	}
   398  }
   399  
   400  // lockBucket returns a locked bucket for the given key.
   401  // +checklocksacquire:b.mu
   402  func (m *Manager) lockBucket(k *Key) (b *bucket) {
   403  	if k.Kind == KindSharedMappable {
   404  		b = m.sharedBucket
   405  	} else {
   406  		b = &m.privateBuckets[bucketIndexForAddr(k.addr())]
   407  	}
   408  	b.mu.Lock()
   409  	return b
   410  }
   411  
   412  // lockBuckets returns locked buckets for the given keys.
   413  // +checklocksacquire:b1.mu
   414  // +checklocksacquire:b2.mu
   415  func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) {
   416  	// Buckets must be consistently ordered to avoid circular lock
   417  	// dependencies. We order buckets in m.privateBuckets by index (lowest
   418  	// index first), and all buckets in m.privateBuckets precede
   419  	// m.sharedBucket.
   420  
   421  	// Handle the common case first:
   422  	if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable {
   423  		i1 := bucketIndexForAddr(k1.addr())
   424  		i2 := bucketIndexForAddr(k2.addr())
   425  		b1 = &m.privateBuckets[i1]
   426  		b2 = &m.privateBuckets[i2]
   427  		switch {
   428  		case i1 < i2:
   429  			b1.mu.Lock()
   430  			b2.mu.Lock()
   431  		case i2 < i1:
   432  			b2.mu.Lock()
   433  			b1.mu.Lock()
   434  		default:
   435  			b1.mu.Lock()
   436  		}
   437  		return b1, b2 // +checklocksforce
   438  	}
   439  
   440  	// At least one of b1 or b2 should be m.sharedBucket.
   441  	b1 = m.sharedBucket
   442  	b2 = m.sharedBucket
   443  	if k1.Kind != KindSharedMappable {
   444  		b1 = m.lockBucket(k1)
   445  	} else if k2.Kind != KindSharedMappable {
   446  		b2 = m.lockBucket(k2)
   447  	}
   448  	m.sharedBucket.mu.Lock()
   449  	return b1, b2 // +checklocksforce
   450  }
   451  
   452  // unlockBuckets unlocks two buckets.
   453  // +checklocksrelease:b1.mu
   454  // +checklocksrelease:b2.mu
   455  func (m *Manager) unlockBuckets(b1, b2 *bucket) {
   456  	b1.mu.Unlock()
   457  	if b1 != b2 {
   458  		b2.mu.Unlock()
   459  	}
   460  	return // +checklocksforce
   461  }
   462  
   463  // Wake wakes up to n waiters matching the bitmask on the given addr.
   464  // The number of waiters woken is returned.
   465  func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) {
   466  	// This function is very hot; avoid defer.
   467  	k, err := getKey(t, addr, private)
   468  	if err != nil {
   469  		return 0, err
   470  	}
   471  
   472  	b := m.lockBucket(&k)
   473  	r := b.wakeLocked(&k, bitmask, n)
   474  
   475  	b.mu.Unlock()
   476  	k.release(t)
   477  	return r, nil
   478  }
   479  
   480  func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) {
   481  	k1, err := getKey(t, addr, private)
   482  	if err != nil {
   483  		return 0, err
   484  	}
   485  	defer k1.release(t)
   486  	k2, err := getKey(t, naddr, private)
   487  	if err != nil {
   488  		return 0, err
   489  	}
   490  	defer k2.release(t)
   491  
   492  	b1, b2 := m.lockBuckets(&k1, &k2)
   493  	defer m.unlockBuckets(b1, b2)
   494  
   495  	if checkval {
   496  		if err := check(t, addr, val); err != nil {
   497  			return 0, err
   498  		}
   499  	}
   500  
   501  	// Wake the number required.
   502  	done := b1.wakeLocked(&k1, ^uint32(0), nwake)
   503  
   504  	// Requeue the number required.
   505  	b1.requeueLocked(t, b2, &k1, &k2, nreq)
   506  
   507  	return done, nil
   508  }
   509  
   510  // Requeue wakes up to nwake waiters on the given addr, and unconditionally
   511  // requeues up to nreq waiters on naddr.
   512  func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) {
   513  	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
   514  }
   515  
   516  // RequeueCmp atomically checks that the addr contains val (via the Target),
   517  // wakes up to nwake waiters on addr and then unconditionally requeues nreq
   518  // waiters on naddr.
   519  func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
   520  	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
   521  }
   522  
   523  // WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
   524  // waiters unconditionally from addr1, and, based on the original value at addr2
   525  // and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
   526  // It returns the total number of waiters woken.
   527  func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
   528  	k1, err := getKey(t, addr1, private)
   529  	if err != nil {
   530  		return 0, err
   531  	}
   532  	defer k1.release(t)
   533  	k2, err := getKey(t, addr2, private)
   534  	if err != nil {
   535  		return 0, err
   536  	}
   537  	defer k2.release(t)
   538  
   539  	b1, b2 := m.lockBuckets(&k1, &k2)
   540  	defer m.unlockBuckets(b1, b2)
   541  
   542  	done := 0
   543  	cond, err := atomicOp(t, addr2, op)
   544  	if err != nil {
   545  		return 0, err
   546  	}
   547  
   548  	// Wake up up to nwake1 entries from the first bucket.
   549  	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)
   550  
   551  	// Wake up up to nwake2 entries from the second bucket if the
   552  	// operation yielded true.
   553  	if cond {
   554  		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
   555  	}
   556  
   557  	return done, nil
   558  }
   559  
   560  // WaitPrepare atomically checks that addr contains val (via the Checker), then
   561  // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
   562  // Waiter must be subsequently removed by calling WaitComplete, whether or not
   563  // a wakeup is received on w.C.
   564  func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error {
   565  	k, err := getKey(t, addr, private)
   566  	if err != nil {
   567  		return err
   568  	}
   569  	// Ownership of k is transferred to w below.
   570  
   571  	// Prepare the Waiter before taking the bucket lock.
   572  	select {
   573  	case <-w.C:
   574  	default:
   575  	}
   576  	w.key = k
   577  	w.bitmask = bitmask
   578  
   579  	b := m.lockBucket(&k)
   580  	// This function is very hot; avoid defer.
   581  
   582  	// Perform our atomic check.
   583  	if err := check(t, addr, val); err != nil {
   584  		b.mu.Unlock()
   585  		w.key.release(t)
   586  		return err
   587  	}
   588  
   589  	// Add the waiter to the bucket.
   590  	b.waiters.PushBack(w)
   591  	w.bucket.Store(b)
   592  
   593  	b.mu.Unlock()
   594  	return nil
   595  }
   596  
   597  // WaitComplete must be called when a Waiter previously added by WaitPrepare is
   598  // no longer eligible to be woken.
   599  func (m *Manager) WaitComplete(w *Waiter, t Target) {
   600  	// Remove w from the bucket it's in.
   601  	for {
   602  		b := w.bucket.Load()
   603  
   604  		// If b is nil, the waiter isn't in any bucket anymore. This can't be
   605  		// racy because the waiter can't be concurrently re-queued in another
   606  		// bucket.
   607  		if b == nil {
   608  			break
   609  		}
   610  
   611  		// Take the bucket lock. Note that without holding the bucket lock, the
   612  		// waiter is not guaranteed to stay in that bucket, so after we take
   613  		// the bucket lock, we must ensure that the bucket hasn't changed: if
   614  		// it happens to have changed, we release the old bucket lock and try
   615  		// again with the new bucket; if it hasn't changed, we know it won't
   616  		// change now because we hold the lock.
   617  		b.mu.Lock()
   618  		if b != w.bucket.Load() {
   619  			b.mu.Unlock()
   620  			continue
   621  		}
   622  
   623  		// Remove waiter from bucket.
   624  		b.waiters.Remove(w)
   625  		w.bucket.Store(nil)
   626  		b.mu.Unlock()
   627  		break
   628  	}
   629  
   630  	// Release references held by the waiter.
   631  	w.key.release(t)
   632  }
   633  
   634  // LockPI attempts to lock the futex following the Priority-inheritance futex
   635  // rules. The lock is acquired only when 'addr' points to 0. The TID of the
   636  // calling task is set to 'addr' to indicate the futex is owned. It returns true
   637  // if the futex was successfully acquired.
   638  //
   639  // FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see
   640  // exit_robust_list()). Given we don't support robust lists, although handled
   641  // below, it's never set.
   642  func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) {
   643  	k, err := getKey(t, addr, private)
   644  	if err != nil {
   645  		return false, err
   646  	}
   647  	// Ownership of k is transferred to w below.
   648  
   649  	// Prepare the Waiter before taking the bucket lock.
   650  	select {
   651  	case <-w.C:
   652  	default:
   653  	}
   654  	w.key = k
   655  	w.tid = tid
   656  
   657  	b := m.lockBucket(&k)
   658  	// Hot function: avoid defers.
   659  
   660  	success, err := m.lockPILocked(w, t, addr, tid, b, try)
   661  	if err != nil {
   662  		w.key.release(t)
   663  		b.mu.Unlock()
   664  		return false, err
   665  	}
   666  	if success || try {
   667  		// Release waiter if it's not going to be a wait.
   668  		w.key.release(t)
   669  	}
   670  	b.mu.Unlock()
   671  	return success, nil
   672  }
   673  
   674  func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) {
   675  	for {
   676  		cur, err := t.LoadUint32(addr)
   677  		if err != nil {
   678  			return false, err
   679  		}
   680  		if (cur & linux.FUTEX_TID_MASK) == tid {
   681  			return false, linuxerr.EDEADLK
   682  		}
   683  
   684  		if (cur & linux.FUTEX_TID_MASK) == 0 {
   685  			// No owner and no waiters, try to acquire the futex.
   686  
   687  			// Set TID and preserve owner died status.
   688  			val := tid
   689  			val |= cur & linux.FUTEX_OWNER_DIED
   690  			prev, err := t.CompareAndSwapUint32(addr, cur, val)
   691  			if err != nil {
   692  				return false, err
   693  			}
   694  			if prev != cur {
   695  				// CAS failed, retry...
   696  				// Linux reacquires the bucket lock on retries, which will re-lookup the
   697  				// mapping at the futex address. However, retrying while holding the
   698  				// lock is more efficient and reduces the chance of another conflict.
   699  				continue
   700  			}
   701  			// Futex acquired.
   702  			return true, nil
   703  		}
   704  
   705  		// Futex is already owned, prepare to wait.
   706  
   707  		if try {
   708  			// Caller doesn't want to wait.
   709  			return false, nil
   710  		}
   711  
   712  		// Set waiters bit if not set yet.
   713  		if cur&linux.FUTEX_WAITERS == 0 {
   714  			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
   715  			if err != nil {
   716  				return false, err
   717  			}
   718  			if prev != cur {
   719  				// CAS failed, retry...
   720  				continue
   721  			}
   722  		}
   723  
   724  		// Add the waiter to the bucket.
   725  		b.waiters.PushBack(w)
   726  		w.bucket.Store(b)
   727  		return false, nil
   728  	}
   729  }
   730  
   731  // UnlockPI unlocks the futex following the Priority-inheritance futex rules.
   732  // The address provided must contain the caller's TID. If there are waiters,
   733  // TID of the next waiter (FIFO) is set to the given address, and the waiter
   734  // woken up. If there are no waiters, 0 is set to the address.
   735  func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error {
   736  	k, err := getKey(t, addr, private)
   737  	if err != nil {
   738  		return err
   739  	}
   740  	b := m.lockBucket(&k)
   741  
   742  	err = m.unlockPILocked(t, addr, tid, b, &k)
   743  
   744  	k.release(t)
   745  	b.mu.Unlock()
   746  	return err
   747  }
   748  
   749  func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error {
   750  	cur, err := t.LoadUint32(addr)
   751  	if err != nil {
   752  		return err
   753  	}
   754  
   755  	if (cur & linux.FUTEX_TID_MASK) != tid {
   756  		return linuxerr.EPERM
   757  	}
   758  
   759  	var next *Waiter  // Who's the next owner?
   760  	var next2 *Waiter // Who's the one after that?
   761  	for w := b.waiters.Front(); w != nil; w = w.Next() {
   762  		if !w.key.matches(key) {
   763  			continue
   764  		}
   765  
   766  		if next == nil {
   767  			next = w
   768  		} else {
   769  			next2 = w
   770  			break
   771  		}
   772  	}
   773  
   774  	if next == nil {
   775  		// It's safe to set 0 because there are no waiters, no new owner, and the
   776  		// executing task is the current owner (no owner died bit).
   777  		prev, err := t.CompareAndSwapUint32(addr, cur, 0)
   778  		if err != nil {
   779  			return err
   780  		}
   781  		if prev != cur {
   782  			// Let user mode handle CAS races. This is different than lock, which
   783  			// retries when CAS fails.
   784  			return linuxerr.EAGAIN
   785  		}
   786  		return nil
   787  	}
   788  
   789  	// Set next owner's TID, waiters if there are any. Resets owner died bit, if
   790  	// set, because the executing task takes over as the owner.
   791  	val := next.tid
   792  	if next2 != nil {
   793  		val |= linux.FUTEX_WAITERS
   794  	}
   795  
   796  	prev, err := t.CompareAndSwapUint32(addr, cur, val)
   797  	if err != nil {
   798  		return err
   799  	}
   800  	if prev != cur {
   801  		return linuxerr.EINVAL
   802  	}
   803  
   804  	b.wakeWaiterLocked(next)
   805  	return nil
   806  }