gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/syscalls/linux/sys_futex.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"time"
    19  
    20  	"gvisor.dev/gvisor/pkg/abi/linux"
    21  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    22  	"gvisor.dev/gvisor/pkg/hostarch"
    23  	"gvisor.dev/gvisor/pkg/sentry/arch"
    24  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    25  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    26  )
    27  
    28  // futexWaitRestartBlock encapsulates the state required to restart futex(2)
    29  // via restart_syscall(2).
    30  //
    31  // +stateify savable
    32  type futexWaitRestartBlock struct {
    33  	duration time.Duration
    34  
    35  	// addr stored as uint64 since uintptr is not save-able.
    36  	addr    uint64
    37  	private bool
    38  	val     uint32
    39  	mask    uint32
    40  }
    41  
    42  // Restart implements kernel.SyscallRestartBlock.Restart.
    43  func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
    44  	return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask)
    45  }
    46  
    47  // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
    48  // complete.
    49  //
    50  // The wait blocks forever if forever is true, otherwise it blocks until ts.
    51  //
    52  // If blocking is interrupted, the syscall is restarted with the original
    53  // arguments.
    54  func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
    55  	w := t.FutexWaiter()
    56  	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
    57  	if err != nil {
    58  		return 0, err
    59  	}
    60  
    61  	if forever {
    62  		err = t.Block(w.C)
    63  	} else if clockRealtime {
    64  		err = t.BlockWithDeadlineFrom(w.C, t.Kernel().RealtimeClock(), true, ktime.FromTimespec(ts))
    65  	} else {
    66  		err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
    67  	}
    68  
    69  	t.Futex().WaitComplete(w, t)
    70  	return 0, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
    71  }
    72  
    73  // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
    74  // complete.
    75  //
    76  // The wait blocks forever if forever is true, otherwise is blocks for
    77  // duration.
    78  //
    79  // If blocking is interrupted, forever determines how to restart the
    80  // syscall. If forever is true, the syscall is restarted with the original
    81  // arguments. If forever is false, duration is a relative timeout and the
    82  // syscall is restarted with the remaining timeout.
    83  func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
    84  	w := t.FutexWaiter()
    85  	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
    86  	if err != nil {
    87  		return 0, err
    88  	}
    89  
    90  	remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
    91  	t.Futex().WaitComplete(w, t)
    92  	if err == nil {
    93  		return 0, nil
    94  	}
    95  
    96  	// The wait was unsuccessful for some reason other than interruption. Simply
    97  	// forward the error.
    98  	if err != linuxerr.ErrInterrupted {
    99  		return 0, err
   100  	}
   101  
   102  	// The wait was interrupted and we need to restart. Decide how.
   103  
   104  	// The wait duration was absolute, restart with the original arguments.
   105  	if forever {
   106  		return 0, linuxerr.ERESTARTSYS
   107  	}
   108  
   109  	// The wait duration was relative, restart with the remaining duration.
   110  	t.SetSyscallRestartBlock(&futexWaitRestartBlock{
   111  		duration: remaining,
   112  		addr:     uint64(addr),
   113  		private:  private,
   114  		val:      val,
   115  		mask:     mask,
   116  	})
   117  	return 0, linuxerr.ERESTART_RESTARTBLOCK
   118  }
   119  
   120  func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error {
   121  	w := t.FutexWaiter()
   122  	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false)
   123  	if err != nil {
   124  		return err
   125  	}
   126  	if locked {
   127  		// Futex acquired, we're done!
   128  		return nil
   129  	}
   130  
   131  	if forever {
   132  		err = t.Block(w.C)
   133  	} else {
   134  		err = t.BlockWithDeadlineFrom(w.C, t.Kernel().RealtimeClock(), true, ktime.FromTimespec(ts))
   135  	}
   136  
   137  	t.Futex().WaitComplete(w, t)
   138  	return linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
   139  }
   140  
   141  func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error {
   142  	w := t.FutexWaiter()
   143  	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true)
   144  	if err != nil {
   145  		return err
   146  	}
   147  	if !locked {
   148  		return linuxerr.EWOULDBLOCK
   149  	}
   150  	return nil
   151  }
   152  
   153  // Futex implements linux syscall futex(2).
   154  // It provides a method for a program to wait for a value at a given address to
   155  // change, and a method to wake up anyone waiting on a particular address.
   156  func Futex(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   157  	addr := args[0].Pointer()
   158  	futexOp := args[1].Int()
   159  	val := int(args[2].Int())
   160  	nreq := int(args[3].Int())
   161  	timeout := args[3].Pointer()
   162  	naddr := args[4].Pointer()
   163  	val3 := args[5].Int()
   164  
   165  	cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
   166  	private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0
   167  	clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
   168  	mask := uint32(val3)
   169  
   170  	switch cmd {
   171  	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
   172  		// WAIT{_BITSET} wait forever if the timeout isn't passed.
   173  		forever := (timeout == 0)
   174  
   175  		var timespec linux.Timespec
   176  		if !forever {
   177  			var err error
   178  			timespec, err = copyTimespecIn(t, timeout)
   179  			if err != nil {
   180  				return 0, nil, err
   181  			}
   182  		}
   183  
   184  		switch cmd {
   185  		case linux.FUTEX_WAIT:
   186  			// WAIT uses a relative timeout.
   187  			mask = linux.FUTEX_BITSET_MATCH_ANY
   188  			var timeoutDur time.Duration
   189  			if !forever {
   190  				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
   191  			}
   192  			n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask)
   193  			return n, nil, err
   194  
   195  		case linux.FUTEX_WAIT_BITSET:
   196  			// WAIT_BITSET uses an absolute timeout which is either
   197  			// CLOCK_MONOTONIC or CLOCK_REALTIME.
   198  			if mask == 0 {
   199  				return 0, nil, linuxerr.EINVAL
   200  			}
   201  			n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask)
   202  			return n, nil, err
   203  		default:
   204  			panic("unreachable")
   205  		}
   206  
   207  	case linux.FUTEX_WAKE:
   208  		mask = ^uint32(0)
   209  		fallthrough
   210  
   211  	case linux.FUTEX_WAKE_BITSET:
   212  		if mask == 0 {
   213  			return 0, nil, linuxerr.EINVAL
   214  		}
   215  		if val <= 0 {
   216  			// The Linux kernel wakes one waiter even if val is
   217  			// non-positive.
   218  			val = 1
   219  		}
   220  		n, err := t.Futex().Wake(t, addr, private, mask, val)
   221  		return uintptr(n), nil, err
   222  
   223  	case linux.FUTEX_REQUEUE:
   224  		n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq)
   225  		return uintptr(n), nil, err
   226  
   227  	case linux.FUTEX_CMP_REQUEUE:
   228  		// 'val3' contains the value to be checked at 'addr' and
   229  		// 'val' is the number of waiters that should be woken up.
   230  		nval := uint32(val3)
   231  		n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq)
   232  		return uintptr(n), nil, err
   233  
   234  	case linux.FUTEX_WAKE_OP:
   235  		op := uint32(val3)
   236  		if val <= 0 {
   237  			// The Linux kernel wakes one waiter even if val is
   238  			// non-positive.
   239  			val = 1
   240  		}
   241  		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
   242  		return uintptr(n), nil, err
   243  
   244  	case linux.FUTEX_LOCK_PI:
   245  		forever := (timeout == 0)
   246  
   247  		var timespec linux.Timespec
   248  		if !forever {
   249  			var err error
   250  			timespec, err = copyTimespecIn(t, timeout)
   251  			if err != nil {
   252  				return 0, nil, err
   253  			}
   254  		}
   255  		err := futexLockPI(t, timespec, forever, addr, private)
   256  		return 0, nil, err
   257  
   258  	case linux.FUTEX_TRYLOCK_PI:
   259  		err := tryLockPI(t, addr, private)
   260  		return 0, nil, err
   261  
   262  	case linux.FUTEX_UNLOCK_PI:
   263  		err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
   264  		return 0, nil, err
   265  
   266  	case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
   267  		t.Kernel().EmitUnimplementedEvent(t, sysno)
   268  		return 0, nil, linuxerr.ENOSYS
   269  
   270  	default:
   271  		// We don't even know about this command.
   272  		return 0, nil, linuxerr.ENOSYS
   273  	}
   274  }
   275  
   276  // SetRobustList implements linux syscall set_robust_list(2).
   277  func SetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   278  	// Despite the syscall using the name 'pid' for this variable, it is
   279  	// very much a tid.
   280  	head := args[0].Pointer()
   281  	length := args[1].SizeT()
   282  
   283  	if length != uint(linux.SizeOfRobustListHead) {
   284  		return 0, nil, linuxerr.EINVAL
   285  	}
   286  	t.SetRobustList(head)
   287  	return 0, nil, nil
   288  }
   289  
   290  // GetRobustList implements linux syscall get_robust_list(2).
   291  func GetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   292  	// Despite the syscall using the name 'pid' for this variable, it is
   293  	// very much a tid.
   294  	tid := args[0].Int()
   295  	headAddr := args[1].Pointer()
   296  	sizeAddr := args[2].Pointer()
   297  
   298  	if tid < 0 {
   299  		return 0, nil, linuxerr.EINVAL
   300  	}
   301  
   302  	ot := t
   303  	if tid != 0 {
   304  		if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil {
   305  			return 0, nil, linuxerr.ESRCH
   306  		}
   307  	}
   308  
   309  	// Copy out head pointer.
   310  	head := t.Arch().Native(uintptr(ot.GetRobustList()))
   311  	if _, err := head.CopyOut(t, headAddr); err != nil {
   312  		return 0, nil, err
   313  	}
   314  
   315  	// Copy out size, which is a constant. Note that while size isn't
   316  	// an address, it is defined as the arch-dependent size_t, so it
   317  	// needs to be converted to a native-sized int.
   318  	size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
   319  	if _, err := size.CopyOut(t, sizeAddr); err != nil {
   320  		return 0, nil, err
   321  	}
   322  
   323  	return 0, nil, nil
   324  }