gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_syscall.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"runtime/trace"
    21  
    22  	"golang.org/x/sys/unix"
    23  	"gvisor.dev/gvisor/pkg/abi/linux"
    24  	"gvisor.dev/gvisor/pkg/bits"
    25  	"gvisor.dev/gvisor/pkg/errors"
    26  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    27  	"gvisor.dev/gvisor/pkg/hostarch"
    28  	"gvisor.dev/gvisor/pkg/marshal"
    29  	"gvisor.dev/gvisor/pkg/metric"
    30  	"gvisor.dev/gvisor/pkg/sentry/arch"
    31  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    32  	"gvisor.dev/gvisor/pkg/sentry/platform"
    33  	"gvisor.dev/gvisor/pkg/sentry/seccheck"
    34  	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    35  )
    36  
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R (save/restore).
type SyscallRestartBlock interface {
	// Restart is called with the task whose syscall is being restarted.
	//
	// NOTE(review): the (uintptr, error) results presumably carry the
	// restarted syscall's return value and error, like a regular syscall
	// implementation - confirm against the implementations.
	Restart(t *Task) (uintptr, error)
}
    43  
// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
//
// When a syscall returns a non-nil *SyscallControl, Task.doSyscallInvoke
// ignores the syscall's error for the purpose of setting the return register:
// the raw return value is stored unless ignoreReturn is set.
type SyscallControl struct {
	// next is the state that the task goroutine should switch to. If next is
	// nil, the task goroutine should continue to syscall exit as usual.
	next taskRunState

	// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
	// in the task's syscall return value register.
	ignoreReturn bool
}
    55  
    56  var (
    57  	// CtrlDoExit is returned by the implementations of the exit and exit_group
    58  	// syscalls to enter the task exit path directly, skipping syscall exit
    59  	// tracing.
    60  	CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}
    61  
    62  	// ctrlStopAndReinvokeSyscall is returned by syscalls using the external
    63  	// feature before syscall execution. This causes Task.doSyscallInvoke
    64  	// to return runSyscallReinvoke, allowing Task.run to check for stops
    65  	// before immediately re-invoking the syscall (skipping the re-checking
    66  	// of seccomp filters and ptrace which would confuse userspace
    67  	// tracing).
    68  	ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}
    69  
    70  	// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
    71  	// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
    72  	// than tail-calling it, allowing stops to be checked before syscall exit.
    73  	ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
    74  )
    75  
    76  func (t *Task) invokeExternal() {
    77  	t.BeginExternalStop()
    78  	go func() { // S/R-SAFE: External control flow.
    79  		defer t.EndExternalStop()
    80  		t.SyscallTable().External(t.Kernel())
    81  	}()
    82  }
    83  
    84  func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
    85  	s := t.SyscallTable()
    86  
    87  	fe := s.FeatureEnable.Word(sysno)
    88  
    89  	var straceContext any
    90  	if bits.IsAnyOn32(fe, StraceEnableBits) {
    91  		straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
    92  	}
    93  
    94  	if bits.IsAnyOn32(fe, SecCheckRawEnter) {
    95  		info := pb.Syscall{
    96  			Sysno: uint64(sysno),
    97  			Arg1:  args[0].Uint64(),
    98  			Arg2:  args[1].Uint64(),
    99  			Arg3:  args[2].Uint64(),
   100  			Arg4:  args[3].Uint64(),
   101  			Arg5:  args[4].Uint64(),
   102  			Arg6:  args[5].Uint64(),
   103  		}
   104  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
   105  		if !fields.Context.Empty() {
   106  			info.ContextData = &pb.ContextData{}
   107  			LoadSeccheckData(t, fields.Context, info.ContextData)
   108  		}
   109  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   110  			return c.RawSyscall(t, fields, &info)
   111  		})
   112  	}
   113  	if bits.IsAnyOn32(fe, SecCheckEnter) {
   114  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallEnter, sysno))
   115  		var ctxData *pb.ContextData
   116  		if !fields.Context.Empty() {
   117  			ctxData = &pb.ContextData{}
   118  			LoadSeccheckData(t, fields.Context, ctxData)
   119  		}
   120  		info := SyscallInfo{
   121  			Sysno: sysno,
   122  			Args:  args,
   123  		}
   124  		cb := s.LookupSyscallToProto(sysno)
   125  		msg, msgType := cb(t, fields, ctxData, info)
   126  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   127  			return c.Syscall(t, fields, ctxData, msgType, msg)
   128  		})
   129  	}
   130  
   131  	if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
   132  		t.invokeExternal()
   133  		// Ensure we check for stops, then invoke the syscall again.
   134  		ctrl = ctrlStopAndReinvokeSyscall
   135  	} else {
   136  		fn := s.Lookup(sysno)
   137  		var region *trace.Region // Only non-nil if tracing == true.
   138  		if trace.IsEnabled() {
   139  			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
   140  		}
   141  		if fn != nil {
   142  			// Call our syscall implementation.
   143  			rval, ctrl, err = fn(t, sysno, args)
   144  		} else {
   145  			// Use the missing function if not found.
   146  			rval, err = t.SyscallTable().Missing(t, sysno, args)
   147  		}
   148  		if region != nil {
   149  			region.End()
   150  		}
   151  	}
   152  
   153  	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
   154  		t.invokeExternal()
   155  		// Don't reinvoke the unix.
   156  	}
   157  
   158  	if bits.IsAnyOn32(fe, StraceEnableBits) {
   159  		s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
   160  	}
   161  
   162  	if bits.IsAnyOn32(fe, SecCheckRawExit) {
   163  		info := pb.Syscall{
   164  			Sysno: uint64(sysno),
   165  			Arg1:  args[0].Uint64(),
   166  			Arg2:  args[1].Uint64(),
   167  			Arg3:  args[2].Uint64(),
   168  			Arg4:  args[3].Uint64(),
   169  			Arg5:  args[4].Uint64(),
   170  			Arg6:  args[5].Uint64(),
   171  			Exit: &pb.Exit{
   172  				Result:  int64(rval),
   173  				Errorno: int64(ExtractErrno(err, int(sysno))),
   174  			},
   175  		}
   176  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
   177  		if !fields.Context.Empty() {
   178  			info.ContextData = &pb.ContextData{}
   179  			LoadSeccheckData(t, fields.Context, info.ContextData)
   180  		}
   181  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   182  			return c.RawSyscall(t, fields, &info)
   183  		})
   184  	}
   185  	if bits.IsAnyOn32(fe, SecCheckExit) {
   186  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallExit, sysno))
   187  		var ctxData *pb.ContextData
   188  		if !fields.Context.Empty() {
   189  			ctxData = &pb.ContextData{}
   190  			LoadSeccheckData(t, fields.Context, ctxData)
   191  		}
   192  		info := SyscallInfo{
   193  			Exit:  true,
   194  			Sysno: sysno,
   195  			Args:  args,
   196  			Rval:  rval,
   197  			Errno: ExtractErrno(err, int(sysno)),
   198  		}
   199  		cb := s.LookupSyscallToProto(sysno)
   200  		msg, msgType := cb(t, fields, ctxData, info)
   201  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   202  			return c.Syscall(t, fields, ctxData, msgType, msg)
   203  		})
   204  	}
   205  
   206  	return
   207  }
   208  
   209  // doSyscall is the entry point for an invocation of a system call specified by
   210  // the current state of t's registers.
   211  //
   212  // The syscall path is very hot; avoid defer.
   213  func (t *Task) doSyscall() taskRunState {
   214  	// Save value of the register which is clobbered in the following
   215  	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
   216  	//
   217  	// On x86, register rax was shared by syscall number and return
   218  	// value, and at the entry of the syscall handler, the rax was
   219  	// saved to regs.orig_rax which was exposed to userspace.
   220  	// But on arm64, syscall number was passed through X8, and the X0
   221  	// was shared by the first syscall argument and return value. The
   222  	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
   223  	// So we have to do the same operation here to save the X0 value
   224  	// into the task context.
   225  	t.Arch().SyscallSaveOrig()
   226  
   227  	sysno := t.Arch().SyscallNo()
   228  	args := t.Arch().SyscallArgs()
   229  
   230  	// Tracers expect to see this between when the task traps into the kernel
   231  	// to perform a syscall and when the syscall is actually invoked.
   232  	// This useless-looking temporary is needed because Go.
   233  	tmp := uintptr(unix.ENOSYS)
   234  	t.Arch().SetReturn(-tmp)
   235  
   236  	// Check seccomp filters. The nil check is for performance (as seccomp use
   237  	// is rare), not needed for correctness.
   238  	if t.seccomp.Load() != nil {
   239  		switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
   240  		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
   241  			t.Debugf("Syscall %d: denied by seccomp", sysno)
   242  			return (*runSyscallExit)(nil)
   243  		case linux.SECCOMP_RET_ALLOW:
   244  			// ok
   245  		case linux.SECCOMP_RET_KILL_THREAD:
   246  			t.Debugf("Syscall %d: killed by seccomp", sysno)
   247  			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
   248  			return (*runExit)(nil)
   249  		case linux.SECCOMP_RET_TRACE:
   250  			t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
   251  			return (*runSyscallAfterPtraceEventSeccomp)(nil)
   252  		default:
   253  			panic(fmt.Sprintf("Unknown seccomp result %d", r))
   254  		}
   255  	}
   256  
   257  	syscallCounter.Increment()
   258  	return t.doSyscallEnter(sysno, args)
   259  }
   260  
// runSyscallAfterPtraceEventSeccomp is the run state that Task.doSyscall
// returns when a seccomp filter yields SECCOMP_RET_TRACE. Its execute method
// resumes syscall entry once the task runs again.
type runSyscallAfterPtraceEventSeccomp struct{}
   262  
   263  func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
   264  	if t.killed() {
   265  		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
   266  		// ptrace(2)
   267  		return (*runInterrupt)(nil)
   268  	}
   269  	sysno := t.Arch().SyscallNo()
   270  	// "The tracer can skip the system call by changing the syscall number to
   271  	// -1." - Documentation/prctl/seccomp_filter.txt
   272  	if sysno == ^uintptr(0) {
   273  		return (*runSyscallExit)(nil).execute(t)
   274  	}
   275  	args := t.Arch().SyscallArgs()
   276  	return t.doSyscallEnter(sysno, args)
   277  }
   278  
   279  func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
   280  	if next, ok := t.ptraceSyscallEnter(); ok {
   281  		return next
   282  	}
   283  	return t.doSyscallInvoke(sysno, args)
   284  }
   285  
// runSyscallAfterSyscallEnterStop is a task run state whose execute method
// resumes a syscall: it optionally injects the signal held in t.ptraceCode and
// then invokes the syscall currently described by the task's registers.
//
// NOTE(review): presumably entered after a ptrace syscall-enter-stop; the code
// that returns this state is not in this file - confirm.
//
// +stateify savable
type runSyscallAfterSyscallEnterStop struct{}
   288  
   289  func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
   290  	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
   291  		t.tg.signalHandlers.mu.Lock()
   292  		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
   293  		t.tg.signalHandlers.mu.Unlock()
   294  	}
   295  	if t.killed() {
   296  		return (*runInterrupt)(nil)
   297  	}
   298  	sysno := t.Arch().SyscallNo()
   299  	if sysno == ^uintptr(0) {
   300  		return (*runSyscallExit)(nil)
   301  	}
   302  	args := t.Arch().SyscallArgs()
   303  
   304  	return t.doSyscallInvoke(sysno, args)
   305  }
   306  
// runSyscallAfterSysemuStop is a task run state whose execute method
// optionally injects the signal held in t.ptraceCode and then proceeds
// directly to syscall exit without invoking any syscall.
//
// NOTE(review): presumably entered after a PTRACE_SYSEMU stop; the code that
// returns this state is not in this file - confirm.
//
// +stateify savable
type runSyscallAfterSysemuStop struct{}
   309  
   310  func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
   311  	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
   312  		t.tg.signalHandlers.mu.Lock()
   313  		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
   314  		t.tg.signalHandlers.mu.Unlock()
   315  	}
   316  	if t.killed() {
   317  		return (*runInterrupt)(nil)
   318  	}
   319  	return (*runSyscallExit)(nil).execute(t)
   320  }
   321  
   322  func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
   323  	rval, ctrl, err := t.executeSyscall(sysno, args)
   324  
   325  	if ctrl != nil {
   326  		if !ctrl.ignoreReturn {
   327  			t.Arch().SetReturn(rval)
   328  		}
   329  		if ctrl.next != nil {
   330  			return ctrl.next
   331  		}
   332  	} else if err != nil {
   333  		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
   334  		t.haveSyscallReturn = true
   335  	} else {
   336  		t.Arch().SetReturn(rval)
   337  	}
   338  
   339  	return (*runSyscallExit)(nil).execute(t)
   340  }
   341  
// runSyscallReinvoke is the run state named by ctrlStopAndReinvokeSyscall. Its
// execute method invokes the syscall described by the task's registers again,
// going straight to doSyscallInvoke (no seccomp or ptrace syscall-enter
// processing).
//
// +stateify savable
type runSyscallReinvoke struct{}
   344  
   345  func (*runSyscallReinvoke) execute(t *Task) taskRunState {
   346  	if t.killed() {
   347  		// It's possible that since the last execution, the task has
   348  		// been forcible killed. Invoking the system call here could
   349  		// result in an infinite loop if it is again preempted by an
   350  		// external stop and reinvoked.
   351  		return (*runInterrupt)(nil)
   352  	}
   353  
   354  	sysno := t.Arch().SyscallNo()
   355  	args := t.Arch().SyscallArgs()
   356  	return t.doSyscallInvoke(sysno, args)
   357  }
   358  
// runSyscallExit is the run state for syscall exit. Its execute method
// performs ptrace syscall-exit processing and then returns the task to
// runApp.
//
// +stateify savable
type runSyscallExit struct{}
   361  
   362  func (*runSyscallExit) execute(t *Task) taskRunState {
   363  	t.ptraceSyscallExit()
   364  	return (*runApp)(nil)
   365  }
   366  
   367  // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
   368  // indicated by an execution fault at address addr. doVsyscall returns the
   369  // task's next run state.
   370  func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
   371  	metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeVsyscallCount)
   372  
   373  	// Grab the caller up front, to make sure there's a sensible stack.
   374  	caller := t.Arch().Native(uintptr(0))
   375  	if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
   376  		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
   377  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   378  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   379  		return (*runApp)(nil)
   380  	}
   381  
   382  	// For _vsyscalls_, there is no need to translate System V calling convention
   383  	// to syscall ABI because they both use RDI, RSI, and RDX for the first three
   384  	// arguments and none of the vsyscalls uses more than two arguments.
   385  	args := t.Arch().SyscallArgs()
   386  	if t.seccomp.Load() != nil {
   387  		switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
   388  		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
   389  			t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
   390  			return (*runApp)(nil)
   391  		case linux.SECCOMP_RET_ALLOW:
   392  			// ok
   393  		case linux.SECCOMP_RET_TRACE:
   394  			t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
   395  			return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
   396  		case linux.SECCOMP_RET_KILL_THREAD:
   397  			t.Debugf("vsyscall %d: killed by seccomp", sysno)
   398  			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
   399  			return (*runExit)(nil)
   400  		default:
   401  			panic(fmt.Sprintf("Unknown seccomp result %d", r))
   402  		}
   403  	}
   404  
   405  	return t.doVsyscallInvoke(sysno, args, caller)
   406  }
   407  
// runVsyscallAfterPtraceEventSeccomp is the run state that Task.doVsyscall
// returns when a seccomp filter yields SECCOMP_RET_TRACE for a vsyscall. It
// records the vsyscall as originally requested so that execute can detect
// tracer modifications.
//
// The field order must not change: doVsyscall builds this struct with a
// positional composite literal.
type runVsyscallAfterPtraceEventSeccomp struct {
	// addr is the faulting vsyscall address passed to doVsyscall.
	addr   hostarch.Addr
	// sysno is the syscall number originally requested.
	sysno  uintptr
	// caller holds the return address read from the stack by doVsyscall.
	caller marshal.Marshallable
}
   413  
   414  func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
   415  	if t.killed() {
   416  		return (*runInterrupt)(nil)
   417  	}
   418  	sysno := t.Arch().SyscallNo()
   419  	// "... the syscall may not be changed to another system call using the
   420  	// orig_rax register. It may only be changed to -1 order [sic] to skip the
   421  	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
   422  	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
   423  	// causes do_exit(SIGSYS), and changing sp is ignored.
   424  	if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
   425  		t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
   426  		return (*runExit)(nil)
   427  	}
   428  	if sysno == ^uintptr(0) {
   429  		return (*runApp)(nil)
   430  	}
   431  	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
   432  }
   433  
   434  func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
   435  	rval, ctrl, err := t.executeSyscall(sysno, args)
   436  	if ctrl != nil {
   437  		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
   438  		// Set the return value. The stack has already been adjusted.
   439  		t.Arch().SetReturn(0)
   440  	} else if err == nil {
   441  		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
   442  		// Set the return value. The stack has already been adjusted.
   443  		t.Arch().SetReturn(uintptr(rval))
   444  	} else {
   445  		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
   446  		if linuxerr.Equals(linuxerr.EFAULT, err) {
   447  			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   448  			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   449  			// A return is not emulated in this case.
   450  			return (*runApp)(nil)
   451  		}
   452  		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
   453  	}
   454  	t.Arch().SetIP(t.Arch().Value(caller))
   455  	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
   456  	return (*runApp)(nil)
   457  }
   458  
   459  // ExtractErrno extracts an integer error number from the error.
   460  // The syscall number is purely for context in the error case. Use -1 if
   461  // syscall number is unknown.
   462  func ExtractErrno(err error, sysno int) int {
   463  	switch err := err.(type) {
   464  	case nil:
   465  		return 0
   466  	case unix.Errno:
   467  		return int(err)
   468  	case *errors.Error:
   469  		return int(linuxerr.ToUnix(err))
   470  	case *memmap.BusError:
   471  		// Bus errors may generate SIGBUS, but for syscalls they still
   472  		// return EFAULT. See case in task_run.go where the fault is
   473  		// handled (and the SIGBUS is delivered).
   474  		return int(unix.EFAULT)
   475  	case *os.PathError:
   476  		return ExtractErrno(err.Err, sysno)
   477  	case *os.LinkError:
   478  		return ExtractErrno(err.Err, sysno)
   479  	case *os.SyscallError:
   480  		return ExtractErrno(err.Err, sysno)
   481  	case *platform.ContextError:
   482  		return int(err.Errno)
   483  	default:
   484  		if errno, ok := linuxerr.TranslateError(err); ok {
   485  			return int(linuxerr.ToUnix(errno))
   486  		}
   487  	}
   488  	panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
   489  }