github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_syscall.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"runtime/trace"
    21  
    22  	"golang.org/x/sys/unix"
    23  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    24  	"github.com/SagerNet/gvisor/pkg/bits"
    25  	"github.com/SagerNet/gvisor/pkg/errors"
    26  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    27  	"github.com/SagerNet/gvisor/pkg/hostarch"
    28  	"github.com/SagerNet/gvisor/pkg/marshal"
    29  	"github.com/SagerNet/gvisor/pkg/metric"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    32  	"github.com/SagerNet/gvisor/pkg/syserror"
    33  )
    34  
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a save/restore (S/R).
type SyscallRestartBlock interface {
	// Restart resumes the syscall on behalf of task t.
	//
	// NOTE(review): the results are presumably the restarted syscall's return
	// value and error, mirroring a regular syscall implementation - confirm
	// against the callers.
	Restart(t *Task) (uintptr, error)
}
    41  
// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
//
// A nil *SyscallControl requests the default handling: Task.doSyscallInvoke
// stores either the return value or the negated errno in the return register
// and proceeds to syscall exit.
type SyscallControl struct {
	// next is the state that the task goroutine should switch to. If next is
	// nil, the task goroutine should continue to syscall exit as usual.
	next taskRunState

	// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
	// in the task's syscall return value register.
	ignoreReturn bool
}
    53  
// Predefined SyscallControl values. Each next field holds a typed nil pointer
// to a run state type; stored in the taskRunState interface this is a non-nil
// value, so Task.doSyscallInvoke's "next != nil" check selects it.
var (
	// CtrlDoExit is returned by the implementations of the exit and exit_group
	// syscalls to enter the task exit path directly, skipping syscall exit
	// tracing. The syscall return register is left untouched.
	CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}

	// ctrlStopAndReinvokeSyscall is returned by Task.executeSyscall when the
	// external feature fires before syscall execution. This causes
	// Task.doSyscallInvoke to return runSyscallReinvoke, allowing Task.run to
	// check for stops before immediately re-invoking the syscall (skipping
	// the re-checking of seccomp filters and ptrace, which would confuse
	// userspace tracing).
	ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}

	// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
	// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
	// than tail-calling it, allowing stops to be checked before syscall exit.
	// Unlike the controls above, the syscall's return value is still stored.
	ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
)
    73  
    74  func (t *Task) invokeExternal() {
    75  	t.BeginExternalStop()
    76  	go func() { // S/R-SAFE: External control flow.
    77  		defer t.EndExternalStop()
    78  		t.SyscallTable().External(t.Kernel())
    79  	}()
    80  }
    81  
    82  func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
    83  	s := t.SyscallTable()
    84  
    85  	fe := s.FeatureEnable.Word(sysno)
    86  
    87  	var straceContext interface{}
    88  	if bits.IsAnyOn32(fe, StraceEnableBits) {
    89  		straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
    90  	}
    91  
    92  	if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
    93  		t.invokeExternal()
    94  		// Ensure we check for stops, then invoke the syscall again.
    95  		ctrl = ctrlStopAndReinvokeSyscall
    96  	} else {
    97  		fn := s.Lookup(sysno)
    98  		var region *trace.Region // Only non-nil if tracing == true.
    99  		if trace.IsEnabled() {
   100  			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
   101  		}
   102  		if fn != nil {
   103  			// Call our syscall implementation.
   104  			rval, ctrl, err = fn(t, args)
   105  		} else {
   106  			// Use the missing function if not found.
   107  			rval, err = t.SyscallTable().Missing(t, sysno, args)
   108  		}
   109  		if region != nil {
   110  			region.End()
   111  		}
   112  	}
   113  
   114  	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
   115  		t.invokeExternal()
   116  		// Don't reinvoke the unix.
   117  	}
   118  
   119  	if bits.IsAnyOn32(fe, StraceEnableBits) {
   120  		s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
   121  	}
   122  
   123  	return
   124  }
   125  
   126  // doSyscall is the entry point for an invocation of a system call specified by
   127  // the current state of t's registers.
   128  //
   129  // The syscall path is very hot; avoid defer.
   130  func (t *Task) doSyscall() taskRunState {
   131  	// Save value of the register which is clobbered in the following
   132  	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
   133  	//
   134  	// On x86, register rax was shared by syscall number and return
   135  	// value, and at the entry of the syscall handler, the rax was
   136  	// saved to regs.orig_rax which was exposed to userspace.
   137  	// But on arm64, syscall number was passed through X8, and the X0
   138  	// was shared by the first syscall argument and return value. The
   139  	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
   140  	// So we have to do the same operation here to save the X0 value
   141  	// into the task context.
   142  	t.Arch().SyscallSaveOrig()
   143  
   144  	sysno := t.Arch().SyscallNo()
   145  	args := t.Arch().SyscallArgs()
   146  
   147  	// Tracers expect to see this between when the task traps into the kernel
   148  	// to perform a syscall and when the syscall is actually invoked.
   149  	// This useless-looking temporary is needed because Go.
   150  	tmp := uintptr(unix.ENOSYS)
   151  	t.Arch().SetReturn(-tmp)
   152  
   153  	// Check seccomp filters. The nil check is for performance (as seccomp use
   154  	// is rare), not needed for correctness.
   155  	if t.syscallFilters.Load() != nil {
   156  		switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
   157  		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
   158  			t.Debugf("Syscall %d: denied by seccomp", sysno)
   159  			return (*runSyscallExit)(nil)
   160  		case linux.SECCOMP_RET_ALLOW:
   161  			// ok
   162  		case linux.SECCOMP_RET_KILL_THREAD:
   163  			t.Debugf("Syscall %d: killed by seccomp", sysno)
   164  			t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
   165  			return (*runExit)(nil)
   166  		case linux.SECCOMP_RET_TRACE:
   167  			t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
   168  			return (*runSyscallAfterPtraceEventSeccomp)(nil)
   169  		default:
   170  			panic(fmt.Sprintf("Unknown seccomp result %d", r))
   171  		}
   172  	}
   173  
   174  	return t.doSyscallEnter(sysno, args)
   175  }
   176  
   177  type runSyscallAfterPtraceEventSeccomp struct{}
   178  
   179  func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
   180  	if t.killed() {
   181  		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
   182  		// ptrace(2)
   183  		return (*runInterrupt)(nil)
   184  	}
   185  	sysno := t.Arch().SyscallNo()
   186  	// "The tracer can skip the system call by changing the syscall number to
   187  	// -1." - Documentation/prctl/seccomp_filter.txt
   188  	if sysno == ^uintptr(0) {
   189  		return (*runSyscallExit)(nil).execute(t)
   190  	}
   191  	args := t.Arch().SyscallArgs()
   192  	return t.doSyscallEnter(sysno, args)
   193  }
   194  
   195  func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
   196  	if next, ok := t.ptraceSyscallEnter(); ok {
   197  		return next
   198  	}
   199  	return t.doSyscallInvoke(sysno, args)
   200  }
   201  
   202  // +stateify savable
   203  type runSyscallAfterSyscallEnterStop struct{}
   204  
   205  func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
   206  	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
   207  		t.tg.signalHandlers.mu.Lock()
   208  		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
   209  		t.tg.signalHandlers.mu.Unlock()
   210  	}
   211  	if t.killed() {
   212  		return (*runInterrupt)(nil)
   213  	}
   214  	sysno := t.Arch().SyscallNo()
   215  	if sysno == ^uintptr(0) {
   216  		return (*runSyscallExit)(nil)
   217  	}
   218  	args := t.Arch().SyscallArgs()
   219  
   220  	return t.doSyscallInvoke(sysno, args)
   221  }
   222  
   223  // +stateify savable
   224  type runSyscallAfterSysemuStop struct{}
   225  
   226  func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
   227  	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
   228  		t.tg.signalHandlers.mu.Lock()
   229  		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
   230  		t.tg.signalHandlers.mu.Unlock()
   231  	}
   232  	if t.killed() {
   233  		return (*runInterrupt)(nil)
   234  	}
   235  	return (*runSyscallExit)(nil).execute(t)
   236  }
   237  
   238  func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
   239  	rval, ctrl, err := t.executeSyscall(sysno, args)
   240  
   241  	if ctrl != nil {
   242  		if !ctrl.ignoreReturn {
   243  			t.Arch().SetReturn(rval)
   244  		}
   245  		if ctrl.next != nil {
   246  			return ctrl.next
   247  		}
   248  	} else if err != nil {
   249  		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
   250  		t.haveSyscallReturn = true
   251  	} else {
   252  		t.Arch().SetReturn(rval)
   253  	}
   254  
   255  	return (*runSyscallExit)(nil).execute(t)
   256  }
   257  
   258  // +stateify savable
   259  type runSyscallReinvoke struct{}
   260  
   261  func (*runSyscallReinvoke) execute(t *Task) taskRunState {
   262  	if t.killed() {
   263  		// It's possible that since the last execution, the task has
   264  		// been forcible killed. Invoking the system call here could
   265  		// result in an infinite loop if it is again preempted by an
   266  		// external stop and reinvoked.
   267  		return (*runInterrupt)(nil)
   268  	}
   269  
   270  	sysno := t.Arch().SyscallNo()
   271  	args := t.Arch().SyscallArgs()
   272  	return t.doSyscallInvoke(sysno, args)
   273  }
   274  
// runSyscallExit is the run state for the syscall-exit path. It is either
// returned to the task run loop (so that stops can be checked first) or
// executed inline by the syscall invocation code.
//
// +stateify savable
type runSyscallExit struct{}

// execute performs the ptrace syscall-exit handling for t and then hands the
// task back to application execution.
func (*runSyscallExit) execute(t *Task) taskRunState {
	t.ptraceSyscallExit()
	return (*runApp)(nil)
}
   282  
   283  // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
   284  // indicated by an execution fault at address addr. doVsyscall returns the
   285  // task's next run state.
   286  func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
   287  	metric.WeirdnessMetric.Increment("vsyscall_count")
   288  
   289  	// Grab the caller up front, to make sure there's a sensible stack.
   290  	caller := t.Arch().Native(uintptr(0))
   291  	if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
   292  		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
   293  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   294  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   295  		return (*runApp)(nil)
   296  	}
   297  
   298  	// For _vsyscalls_, there is no need to translate System V calling convention
   299  	// to syscall ABI because they both use RDI, RSI, and RDX for the first three
   300  	// arguments and none of the vsyscalls uses more than two arguments.
   301  	args := t.Arch().SyscallArgs()
   302  	if t.syscallFilters.Load() != nil {
   303  		switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
   304  		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
   305  			t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
   306  			return (*runApp)(nil)
   307  		case linux.SECCOMP_RET_ALLOW:
   308  			// ok
   309  		case linux.SECCOMP_RET_TRACE:
   310  			t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
   311  			return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
   312  		case linux.SECCOMP_RET_KILL_THREAD:
   313  			t.Debugf("vsyscall %d: killed by seccomp", sysno)
   314  			t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
   315  			return (*runExit)(nil)
   316  		default:
   317  			panic(fmt.Sprintf("Unknown seccomp result %d", r))
   318  		}
   319  	}
   320  
   321  	return t.doVsyscallInvoke(sysno, args, caller)
   322  }
   323  
   324  type runVsyscallAfterPtraceEventSeccomp struct {
   325  	addr   hostarch.Addr
   326  	sysno  uintptr
   327  	caller marshal.Marshallable
   328  }
   329  
   330  func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
   331  	if t.killed() {
   332  		return (*runInterrupt)(nil)
   333  	}
   334  	sysno := t.Arch().SyscallNo()
   335  	// "... the syscall may not be changed to another system call using the
   336  	// orig_rax register. It may only be changed to -1 order [sic] to skip the
   337  	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
   338  	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
   339  	// causes do_exit(SIGSYS), and changing sp is ignored.
   340  	if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
   341  		t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
   342  		return (*runExit)(nil)
   343  	}
   344  	if sysno == ^uintptr(0) {
   345  		return (*runApp)(nil)
   346  	}
   347  	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
   348  }
   349  
   350  func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
   351  	rval, ctrl, err := t.executeSyscall(sysno, args)
   352  	if ctrl != nil {
   353  		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
   354  		// Set the return value. The stack has already been adjusted.
   355  		t.Arch().SetReturn(0)
   356  	} else if err == nil {
   357  		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
   358  		// Set the return value. The stack has already been adjusted.
   359  		t.Arch().SetReturn(uintptr(rval))
   360  	} else {
   361  		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
   362  		if linuxerr.Equals(linuxerr.EFAULT, err) {
   363  			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   364  			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   365  			// A return is not emulated in this case.
   366  			return (*runApp)(nil)
   367  		}
   368  		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
   369  	}
   370  	t.Arch().SetIP(t.Arch().Value(caller))
   371  	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
   372  	return (*runApp)(nil)
   373  }
   374  
   375  // ExtractErrno extracts an integer error number from the error.
   376  // The syscall number is purely for context in the error case. Use -1 if
   377  // syscall number is unknown.
   378  func ExtractErrno(err error, sysno int) int {
   379  	switch err := err.(type) {
   380  	case nil:
   381  		return 0
   382  	case unix.Errno:
   383  		return int(err)
   384  	case *errors.Error:
   385  		return int(err.Errno())
   386  	case syserror.SyscallRestartErrno:
   387  		return int(err)
   388  	case *memmap.BusError:
   389  		// Bus errors may generate SIGBUS, but for syscalls they still
   390  		// return EFAULT. See case in task_run.go where the fault is
   391  		// handled (and the SIGBUS is delivered).
   392  		return int(unix.EFAULT)
   393  	case *os.PathError:
   394  		return ExtractErrno(err.Err, sysno)
   395  	case *os.LinkError:
   396  		return ExtractErrno(err.Err, sysno)
   397  	case *os.SyscallError:
   398  		return ExtractErrno(err.Err, sysno)
   399  	default:
   400  		if errno, ok := syserror.TranslateError(err); ok {
   401  			return int(errno)
   402  		}
   403  	}
   404  	panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
   405  }