github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/syscalls/linux/sys_futex.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"time"
    19  
    20  	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
    22  	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
    23  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/arch"
    24  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel"
    25  	ktime "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/time"
    26  )
    27  
// futexWaitRestartBlock encapsulates the state required to restart futex(2)
// via restart_syscall(2).
//
// An instance is installed by futexWaitDuration when a timed wait is
// interrupted, and consumed by Restart, which calls futexWaitDuration again
// with the saved fields.
//
// +stateify savable
type futexWaitRestartBlock struct {
	// duration is the timeout passed to futexWaitDuration on restart. It is
	// populated with the time that was still remaining when the wait was
	// interrupted.
	duration time.Duration

	// addr stored as uint64 since uintptr is not save-able.
	//
	// addr, private, val and mask are the arguments of the interrupted wait;
	// they are forwarded unchanged to futexWaitDuration on restart.
	addr    uint64
	private bool
	val     uint32
	mask    uint32
}
    41  
    42  // Restart implements kernel.SyscallRestartBlock.Restart.
    43  func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
    44  	return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask)
    45  }
    46  
    47  // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
    48  // complete.
    49  //
    50  // The wait blocks forever if forever is true, otherwise it blocks until ts.
    51  //
    52  // If blocking is interrupted, the syscall is restarted with the original
    53  // arguments.
    54  func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
    55  	w := t.FutexWaiter()
    56  	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
    57  	if err != nil {
    58  		return 0, err
    59  	}
    60  
    61  	if forever {
    62  		err = t.Block(w.C)
    63  	} else if clockRealtime {
    64  		notifier, tchan := ktime.NewChannelNotifier()
    65  		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
    66  		timer.Swap(ktime.Setting{
    67  			Enabled: true,
    68  			Next:    ktime.FromTimespec(ts),
    69  		})
    70  		err = t.BlockWithTimer(w.C, tchan)
    71  		timer.Destroy()
    72  	} else {
    73  		err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
    74  	}
    75  
    76  	t.Futex().WaitComplete(w, t)
    77  	return 0, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
    78  }
    79  
    80  // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
    81  // complete.
    82  //
    83  // The wait blocks forever if forever is true, otherwise is blocks for
    84  // duration.
    85  //
    86  // If blocking is interrupted, forever determines how to restart the
    87  // syscall. If forever is true, the syscall is restarted with the original
    88  // arguments. If forever is false, duration is a relative timeout and the
    89  // syscall is restarted with the remaining timeout.
    90  func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
    91  	w := t.FutexWaiter()
    92  	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
    93  	if err != nil {
    94  		return 0, err
    95  	}
    96  
    97  	remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
    98  	t.Futex().WaitComplete(w, t)
    99  	if err == nil {
   100  		return 0, nil
   101  	}
   102  
   103  	// The wait was unsuccessful for some reason other than interruption. Simply
   104  	// forward the error.
   105  	if err != linuxerr.ErrInterrupted {
   106  		return 0, err
   107  	}
   108  
   109  	// The wait was interrupted and we need to restart. Decide how.
   110  
   111  	// The wait duration was absolute, restart with the original arguments.
   112  	if forever {
   113  		return 0, linuxerr.ERESTARTSYS
   114  	}
   115  
   116  	// The wait duration was relative, restart with the remaining duration.
   117  	t.SetSyscallRestartBlock(&futexWaitRestartBlock{
   118  		duration: remaining,
   119  		addr:     uint64(addr),
   120  		private:  private,
   121  		val:      val,
   122  		mask:     mask,
   123  	})
   124  	return 0, linuxerr.ERESTART_RESTARTBLOCK
   125  }
   126  
   127  func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error {
   128  	w := t.FutexWaiter()
   129  	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false)
   130  	if err != nil {
   131  		return err
   132  	}
   133  	if locked {
   134  		// Futex acquired, we're done!
   135  		return nil
   136  	}
   137  
   138  	if forever {
   139  		err = t.Block(w.C)
   140  	} else {
   141  		notifier, tchan := ktime.NewChannelNotifier()
   142  		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
   143  		timer.Swap(ktime.Setting{
   144  			Enabled: true,
   145  			Next:    ktime.FromTimespec(ts),
   146  		})
   147  		err = t.BlockWithTimer(w.C, tchan)
   148  		timer.Destroy()
   149  	}
   150  
   151  	t.Futex().WaitComplete(w, t)
   152  	return linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
   153  }
   154  
   155  func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error {
   156  	w := t.FutexWaiter()
   157  	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true)
   158  	if err != nil {
   159  		return err
   160  	}
   161  	if !locked {
   162  		return linuxerr.EWOULDBLOCK
   163  	}
   164  	return nil
   165  }
   166  
   167  // Futex implements linux syscall futex(2).
   168  // It provides a method for a program to wait for a value at a given address to
   169  // change, and a method to wake up anyone waiting on a particular address.
   170  func Futex(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   171  	addr := args[0].Pointer()
   172  	futexOp := args[1].Int()
   173  	val := int(args[2].Int())
   174  	nreq := int(args[3].Int())
   175  	timeout := args[3].Pointer()
   176  	naddr := args[4].Pointer()
   177  	val3 := args[5].Int()
   178  
   179  	cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
   180  	private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0
   181  	clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
   182  	mask := uint32(val3)
   183  
   184  	switch cmd {
   185  	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
   186  		// WAIT{_BITSET} wait forever if the timeout isn't passed.
   187  		forever := (timeout == 0)
   188  
   189  		var timespec linux.Timespec
   190  		if !forever {
   191  			var err error
   192  			timespec, err = copyTimespecIn(t, timeout)
   193  			if err != nil {
   194  				return 0, nil, err
   195  			}
   196  		}
   197  
   198  		switch cmd {
   199  		case linux.FUTEX_WAIT:
   200  			// WAIT uses a relative timeout.
   201  			mask = linux.FUTEX_BITSET_MATCH_ANY
   202  			var timeoutDur time.Duration
   203  			if !forever {
   204  				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
   205  			}
   206  			n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask)
   207  			return n, nil, err
   208  
   209  		case linux.FUTEX_WAIT_BITSET:
   210  			// WAIT_BITSET uses an absolute timeout which is either
   211  			// CLOCK_MONOTONIC or CLOCK_REALTIME.
   212  			if mask == 0 {
   213  				return 0, nil, linuxerr.EINVAL
   214  			}
   215  			n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask)
   216  			return n, nil, err
   217  		default:
   218  			panic("unreachable")
   219  		}
   220  
   221  	case linux.FUTEX_WAKE:
   222  		mask = ^uint32(0)
   223  		fallthrough
   224  
   225  	case linux.FUTEX_WAKE_BITSET:
   226  		if mask == 0 {
   227  			return 0, nil, linuxerr.EINVAL
   228  		}
   229  		if val <= 0 {
   230  			// The Linux kernel wakes one waiter even if val is
   231  			// non-positive.
   232  			val = 1
   233  		}
   234  		n, err := t.Futex().Wake(t, addr, private, mask, val)
   235  		return uintptr(n), nil, err
   236  
   237  	case linux.FUTEX_REQUEUE:
   238  		n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq)
   239  		return uintptr(n), nil, err
   240  
   241  	case linux.FUTEX_CMP_REQUEUE:
   242  		// 'val3' contains the value to be checked at 'addr' and
   243  		// 'val' is the number of waiters that should be woken up.
   244  		nval := uint32(val3)
   245  		n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq)
   246  		return uintptr(n), nil, err
   247  
   248  	case linux.FUTEX_WAKE_OP:
   249  		op := uint32(val3)
   250  		if val <= 0 {
   251  			// The Linux kernel wakes one waiter even if val is
   252  			// non-positive.
   253  			val = 1
   254  		}
   255  		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
   256  		return uintptr(n), nil, err
   257  
   258  	case linux.FUTEX_LOCK_PI:
   259  		forever := (timeout == 0)
   260  
   261  		var timespec linux.Timespec
   262  		if !forever {
   263  			var err error
   264  			timespec, err = copyTimespecIn(t, timeout)
   265  			if err != nil {
   266  				return 0, nil, err
   267  			}
   268  		}
   269  		err := futexLockPI(t, timespec, forever, addr, private)
   270  		return 0, nil, err
   271  
   272  	case linux.FUTEX_TRYLOCK_PI:
   273  		err := tryLockPI(t, addr, private)
   274  		return 0, nil, err
   275  
   276  	case linux.FUTEX_UNLOCK_PI:
   277  		err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
   278  		return 0, nil, err
   279  
   280  	case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
   281  		t.Kernel().EmitUnimplementedEvent(t, sysno)
   282  		return 0, nil, linuxerr.ENOSYS
   283  
   284  	default:
   285  		// We don't even know about this command.
   286  		return 0, nil, linuxerr.ENOSYS
   287  	}
   288  }
   289  
   290  // SetRobustList implements linux syscall set_robust_list(2).
   291  func SetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   292  	// Despite the syscall using the name 'pid' for this variable, it is
   293  	// very much a tid.
   294  	head := args[0].Pointer()
   295  	length := args[1].SizeT()
   296  
   297  	if length != uint(linux.SizeOfRobustListHead) {
   298  		return 0, nil, linuxerr.EINVAL
   299  	}
   300  	t.SetRobustList(head)
   301  	return 0, nil, nil
   302  }
   303  
   304  // GetRobustList implements linux syscall get_robust_list(2).
   305  func GetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   306  	// Despite the syscall using the name 'pid' for this variable, it is
   307  	// very much a tid.
   308  	tid := args[0].Int()
   309  	headAddr := args[1].Pointer()
   310  	sizeAddr := args[2].Pointer()
   311  
   312  	if tid < 0 {
   313  		return 0, nil, linuxerr.EINVAL
   314  	}
   315  
   316  	ot := t
   317  	if tid != 0 {
   318  		if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil {
   319  			return 0, nil, linuxerr.ESRCH
   320  		}
   321  	}
   322  
   323  	// Copy out head pointer.
   324  	head := t.Arch().Native(uintptr(ot.GetRobustList()))
   325  	if _, err := head.CopyOut(t, headAddr); err != nil {
   326  		return 0, nil, err
   327  	}
   328  
   329  	// Copy out size, which is a constant. Note that while size isn't
   330  	// an address, it is defined as the arch-dependent size_t, so it
   331  	// needs to be converted to a native-sized int.
   332  	size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
   333  	if _, err := size.CopyOut(t, sizeAddr); err != nil {
   334  		return 0, nil, err
   335  	}
   336  
   337  	return 0, nil, nil
   338  }