github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/sys_futex.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"time"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/hostarch"
    23  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    25  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    26  	"github.com/SagerNet/gvisor/pkg/syserror"
    27  )
    28  
    29  // futexWaitRestartBlock encapsulates the state required to restart futex(2)
    30  // via restart_syscall(2).
    31  //
    32  // +stateify savable
    33  type futexWaitRestartBlock struct {
    34  	duration time.Duration
    35  
    36  	// addr stored as uint64 since uintptr is not save-able.
    37  	addr    uint64
    38  	private bool
    39  	val     uint32
    40  	mask    uint32
    41  }
    42  
    43  // Restart implements kernel.SyscallRestartBlock.Restart.
    44  func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
    45  	return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask)
    46  }
    47  
    48  // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
    49  // complete.
    50  //
    51  // The wait blocks forever if forever is true, otherwise it blocks until ts.
    52  //
    53  // If blocking is interrupted, the syscall is restarted with the original
    54  // arguments.
    55  func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
    56  	w := t.FutexWaiter()
    57  	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
    58  	if err != nil {
    59  		return 0, err
    60  	}
    61  
    62  	if forever {
    63  		err = t.Block(w.C)
    64  	} else if clockRealtime {
    65  		notifier, tchan := ktime.NewChannelNotifier()
    66  		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
    67  		timer.Swap(ktime.Setting{
    68  			Enabled: true,
    69  			Next:    ktime.FromTimespec(ts),
    70  		})
    71  		err = t.BlockWithTimer(w.C, tchan)
    72  		timer.Destroy()
    73  	} else {
    74  		err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
    75  	}
    76  
    77  	t.Futex().WaitComplete(w, t)
    78  	return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
    79  }
    80  
    81  // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
    82  // complete.
    83  //
    84  // The wait blocks forever if forever is true, otherwise is blocks for
    85  // duration.
    86  //
    87  // If blocking is interrupted, forever determines how to restart the
    88  // syscall. If forever is true, the syscall is restarted with the original
    89  // arguments. If forever is false, duration is a relative timeout and the
    90  // syscall is restarted with the remaining timeout.
    91  func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
    92  	w := t.FutexWaiter()
    93  	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
    94  	if err != nil {
    95  		return 0, err
    96  	}
    97  
    98  	remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
    99  	t.Futex().WaitComplete(w, t)
   100  	if err == nil {
   101  		return 0, nil
   102  	}
   103  
   104  	// The wait was unsuccessful for some reason other than interruption. Simply
   105  	// forward the error.
   106  	if err != syserror.ErrInterrupted {
   107  		return 0, err
   108  	}
   109  
   110  	// The wait was interrupted and we need to restart. Decide how.
   111  
   112  	// The wait duration was absolute, restart with the original arguments.
   113  	if forever {
   114  		return 0, syserror.ERESTARTSYS
   115  	}
   116  
   117  	// The wait duration was relative, restart with the remaining duration.
   118  	t.SetSyscallRestartBlock(&futexWaitRestartBlock{
   119  		duration: remaining,
   120  		addr:     uint64(addr),
   121  		private:  private,
   122  		val:      val,
   123  		mask:     mask,
   124  	})
   125  	return 0, syserror.ERESTART_RESTARTBLOCK
   126  }
   127  
   128  func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error {
   129  	w := t.FutexWaiter()
   130  	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false)
   131  	if err != nil {
   132  		return err
   133  	}
   134  	if locked {
   135  		// Futex acquired, we're done!
   136  		return nil
   137  	}
   138  
   139  	if forever {
   140  		err = t.Block(w.C)
   141  	} else {
   142  		notifier, tchan := ktime.NewChannelNotifier()
   143  		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
   144  		timer.Swap(ktime.Setting{
   145  			Enabled: true,
   146  			Next:    ktime.FromTimespec(ts),
   147  		})
   148  		err = t.BlockWithTimer(w.C, tchan)
   149  		timer.Destroy()
   150  	}
   151  
   152  	t.Futex().WaitComplete(w, t)
   153  	return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
   154  }
   155  
   156  func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error {
   157  	w := t.FutexWaiter()
   158  	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true)
   159  	if err != nil {
   160  		return err
   161  	}
   162  	if !locked {
   163  		return linuxerr.EWOULDBLOCK
   164  	}
   165  	return nil
   166  }
   167  
   168  // Futex implements linux syscall futex(2).
   169  // It provides a method for a program to wait for a value at a given address to
   170  // change, and a method to wake up anyone waiting on a particular address.
   171  func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   172  	addr := args[0].Pointer()
   173  	futexOp := args[1].Int()
   174  	val := int(args[2].Int())
   175  	nreq := int(args[3].Int())
   176  	timeout := args[3].Pointer()
   177  	naddr := args[4].Pointer()
   178  	val3 := args[5].Int()
   179  
   180  	cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
   181  	private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0
   182  	clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
   183  	mask := uint32(val3)
   184  
   185  	switch cmd {
   186  	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
   187  		// WAIT{_BITSET} wait forever if the timeout isn't passed.
   188  		forever := (timeout == 0)
   189  
   190  		var timespec linux.Timespec
   191  		if !forever {
   192  			var err error
   193  			timespec, err = copyTimespecIn(t, timeout)
   194  			if err != nil {
   195  				return 0, nil, err
   196  			}
   197  		}
   198  
   199  		switch cmd {
   200  		case linux.FUTEX_WAIT:
   201  			// WAIT uses a relative timeout.
   202  			mask = linux.FUTEX_BITSET_MATCH_ANY
   203  			var timeoutDur time.Duration
   204  			if !forever {
   205  				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
   206  			}
   207  			n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask)
   208  			return n, nil, err
   209  
   210  		case linux.FUTEX_WAIT_BITSET:
   211  			// WAIT_BITSET uses an absolute timeout which is either
   212  			// CLOCK_MONOTONIC or CLOCK_REALTIME.
   213  			if mask == 0 {
   214  				return 0, nil, linuxerr.EINVAL
   215  			}
   216  			n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask)
   217  			return n, nil, err
   218  		default:
   219  			panic("unreachable")
   220  		}
   221  
   222  	case linux.FUTEX_WAKE:
   223  		mask = ^uint32(0)
   224  		fallthrough
   225  
   226  	case linux.FUTEX_WAKE_BITSET:
   227  		if mask == 0 {
   228  			return 0, nil, linuxerr.EINVAL
   229  		}
   230  		if val <= 0 {
   231  			// The Linux kernel wakes one waiter even if val is
   232  			// non-positive.
   233  			val = 1
   234  		}
   235  		n, err := t.Futex().Wake(t, addr, private, mask, val)
   236  		return uintptr(n), nil, err
   237  
   238  	case linux.FUTEX_REQUEUE:
   239  		n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq)
   240  		return uintptr(n), nil, err
   241  
   242  	case linux.FUTEX_CMP_REQUEUE:
   243  		// 'val3' contains the value to be checked at 'addr' and
   244  		// 'val' is the number of waiters that should be woken up.
   245  		nval := uint32(val3)
   246  		n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq)
   247  		return uintptr(n), nil, err
   248  
   249  	case linux.FUTEX_WAKE_OP:
   250  		op := uint32(val3)
   251  		if val <= 0 {
   252  			// The Linux kernel wakes one waiter even if val is
   253  			// non-positive.
   254  			val = 1
   255  		}
   256  		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
   257  		return uintptr(n), nil, err
   258  
   259  	case linux.FUTEX_LOCK_PI:
   260  		forever := (timeout == 0)
   261  
   262  		var timespec linux.Timespec
   263  		if !forever {
   264  			var err error
   265  			timespec, err = copyTimespecIn(t, timeout)
   266  			if err != nil {
   267  				return 0, nil, err
   268  			}
   269  		}
   270  		err := futexLockPI(t, timespec, forever, addr, private)
   271  		return 0, nil, err
   272  
   273  	case linux.FUTEX_TRYLOCK_PI:
   274  		err := tryLockPI(t, addr, private)
   275  		return 0, nil, err
   276  
   277  	case linux.FUTEX_UNLOCK_PI:
   278  		err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
   279  		return 0, nil, err
   280  
   281  	case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
   282  		t.Kernel().EmitUnimplementedEvent(t)
   283  		return 0, nil, syserror.ENOSYS
   284  
   285  	default:
   286  		// We don't even know about this command.
   287  		return 0, nil, syserror.ENOSYS
   288  	}
   289  }
   290  
   291  // SetRobustList implements linux syscall set_robust_list(2).
   292  func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   293  	// Despite the syscall using the name 'pid' for this variable, it is
   294  	// very much a tid.
   295  	head := args[0].Pointer()
   296  	length := args[1].SizeT()
   297  
   298  	if length != uint(linux.SizeOfRobustListHead) {
   299  		return 0, nil, linuxerr.EINVAL
   300  	}
   301  	t.SetRobustList(head)
   302  	return 0, nil, nil
   303  }
   304  
   305  // GetRobustList implements linux syscall get_robust_list(2).
   306  func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   307  	// Despite the syscall using the name 'pid' for this variable, it is
   308  	// very much a tid.
   309  	tid := args[0].Int()
   310  	headAddr := args[1].Pointer()
   311  	sizeAddr := args[2].Pointer()
   312  
   313  	if tid < 0 {
   314  		return 0, nil, linuxerr.EINVAL
   315  	}
   316  
   317  	ot := t
   318  	if tid != 0 {
   319  		if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil {
   320  			return 0, nil, syserror.ESRCH
   321  		}
   322  	}
   323  
   324  	// Copy out head pointer.
   325  	head := t.Arch().Native(uintptr(ot.GetRobustList()))
   326  	if _, err := head.CopyOut(t, headAddr); err != nil {
   327  		return 0, nil, err
   328  	}
   329  
   330  	// Copy out size, which is a constant. Note that while size isn't
   331  	// an address, it is defined as the arch-dependent size_t, so it
   332  	// needs to be converted to a native-sized int.
   333  	size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
   334  	if _, err := size.CopyOut(t, sizeAddr); err != nil {
   335  		return 0, nil, err
   336  	}
   337  
   338  	return 0, nil, nil
   339  }