github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/kernel/task_syscall.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"runtime/trace"
    21  
    22  	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
    23  	"github.com/ttpreport/gvisor-ligolo/pkg/bits"
    24  	"github.com/ttpreport/gvisor-ligolo/pkg/errors"
    25  	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
    26  	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
    27  	"github.com/ttpreport/gvisor-ligolo/pkg/marshal"
    28  	"github.com/ttpreport/gvisor-ligolo/pkg/metric"
    29  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/arch"
    30  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/memmap"
    31  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/seccheck"
    32  	pb "github.com/ttpreport/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
    33  	"golang.org/x/sys/unix"
    34  )
    35  
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
type SyscallRestartBlock interface {
	// Restart re-runs the interrupted syscall on behalf of t.
	//
	// NOTE(review): the results presumably have the same meaning as a regular
	// syscall implementation's return value and error; confirm against the
	// implementers of this interface.
	Restart(t *Task) (uintptr, error)
}
    42  
// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
//
// A nil *SyscallControl selects the default handling in Task.doSyscallInvoke:
// the return value (or the negated errno on error) is stored in the return
// register and the task proceeds to syscall exit.
type SyscallControl struct {
	// next is the state that the task goroutine should switch to. If next is
	// nil, the task goroutine should continue to syscall exit as usual.
	next taskRunState

	// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
	// in the task's syscall return value register.
	ignoreReturn bool
}
    54  
    55  var (
    56  	// CtrlDoExit is returned by the implementations of the exit and exit_group
    57  	// syscalls to enter the task exit path directly, skipping syscall exit
    58  	// tracing.
    59  	CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}
    60  
    61  	// ctrlStopAndReinvokeSyscall is returned by syscalls using the external
    62  	// feature before syscall execution. This causes Task.doSyscallInvoke
    63  	// to return runSyscallReinvoke, allowing Task.run to check for stops
    64  	// before immediately re-invoking the syscall (skipping the re-checking
    65  	// of seccomp filters and ptrace which would confuse userspace
    66  	// tracing).
    67  	ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}
    68  
    69  	// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
    70  	// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
    71  	// than tail-calling it, allowing stops to be checked before syscall exit.
    72  	ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
    73  )
    74  
    75  func (t *Task) invokeExternal() {
    76  	t.BeginExternalStop()
    77  	go func() { // S/R-SAFE: External control flow.
    78  		defer t.EndExternalStop()
    79  		t.SyscallTable().External(t.Kernel())
    80  	}()
    81  }
    82  
    83  func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
    84  	s := t.SyscallTable()
    85  
    86  	fe := s.FeatureEnable.Word(sysno)
    87  
    88  	var straceContext any
    89  	if bits.IsAnyOn32(fe, StraceEnableBits) {
    90  		straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
    91  	}
    92  
    93  	if bits.IsAnyOn32(fe, SecCheckRawEnter) {
    94  		info := pb.Syscall{
    95  			Sysno: uint64(sysno),
    96  			Arg1:  args[0].Uint64(),
    97  			Arg2:  args[1].Uint64(),
    98  			Arg3:  args[2].Uint64(),
    99  			Arg4:  args[3].Uint64(),
   100  			Arg5:  args[4].Uint64(),
   101  			Arg6:  args[5].Uint64(),
   102  		}
   103  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
   104  		if !fields.Context.Empty() {
   105  			info.ContextData = &pb.ContextData{}
   106  			LoadSeccheckData(t, fields.Context, info.ContextData)
   107  		}
   108  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   109  			return c.RawSyscall(t, fields, &info)
   110  		})
   111  	}
   112  	if bits.IsAnyOn32(fe, SecCheckEnter) {
   113  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallEnter, sysno))
   114  		var ctxData *pb.ContextData
   115  		if !fields.Context.Empty() {
   116  			ctxData = &pb.ContextData{}
   117  			LoadSeccheckData(t, fields.Context, ctxData)
   118  		}
   119  		info := SyscallInfo{
   120  			Sysno: sysno,
   121  			Args:  args,
   122  		}
   123  		cb := s.LookupSyscallToProto(sysno)
   124  		msg, msgType := cb(t, fields, ctxData, info)
   125  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   126  			return c.Syscall(t, fields, ctxData, msgType, msg)
   127  		})
   128  	}
   129  
   130  	if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
   131  		t.invokeExternal()
   132  		// Ensure we check for stops, then invoke the syscall again.
   133  		ctrl = ctrlStopAndReinvokeSyscall
   134  	} else {
   135  		fn := s.Lookup(sysno)
   136  		var region *trace.Region // Only non-nil if tracing == true.
   137  		if trace.IsEnabled() {
   138  			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
   139  		}
   140  		if fn != nil {
   141  			// Call our syscall implementation.
   142  			rval, ctrl, err = fn(t, sysno, args)
   143  		} else {
   144  			// Use the missing function if not found.
   145  			rval, err = t.SyscallTable().Missing(t, sysno, args)
   146  		}
   147  		if region != nil {
   148  			region.End()
   149  		}
   150  	}
   151  
   152  	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
   153  		t.invokeExternal()
   154  		// Don't reinvoke the unix.
   155  	}
   156  
   157  	if bits.IsAnyOn32(fe, StraceEnableBits) {
   158  		s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
   159  	}
   160  
   161  	if bits.IsAnyOn32(fe, SecCheckRawExit) {
   162  		info := pb.Syscall{
   163  			Sysno: uint64(sysno),
   164  			Arg1:  args[0].Uint64(),
   165  			Arg2:  args[1].Uint64(),
   166  			Arg3:  args[2].Uint64(),
   167  			Arg4:  args[3].Uint64(),
   168  			Arg5:  args[4].Uint64(),
   169  			Arg6:  args[5].Uint64(),
   170  			Exit: &pb.Exit{
   171  				Result:  int64(rval),
   172  				Errorno: int64(ExtractErrno(err, int(sysno))),
   173  			},
   174  		}
   175  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
   176  		if !fields.Context.Empty() {
   177  			info.ContextData = &pb.ContextData{}
   178  			LoadSeccheckData(t, fields.Context, info.ContextData)
   179  		}
   180  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   181  			return c.RawSyscall(t, fields, &info)
   182  		})
   183  	}
   184  	if bits.IsAnyOn32(fe, SecCheckExit) {
   185  		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallExit, sysno))
   186  		var ctxData *pb.ContextData
   187  		if !fields.Context.Empty() {
   188  			ctxData = &pb.ContextData{}
   189  			LoadSeccheckData(t, fields.Context, ctxData)
   190  		}
   191  		info := SyscallInfo{
   192  			Exit:  true,
   193  			Sysno: sysno,
   194  			Args:  args,
   195  			Rval:  rval,
   196  			Errno: ExtractErrno(err, int(sysno)),
   197  		}
   198  		cb := s.LookupSyscallToProto(sysno)
   199  		msg, msgType := cb(t, fields, ctxData, info)
   200  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   201  			return c.Syscall(t, fields, ctxData, msgType, msg)
   202  		})
   203  	}
   204  
   205  	return
   206  }
   207  
   208  // doSyscall is the entry point for an invocation of a system call specified by
   209  // the current state of t's registers.
   210  //
   211  // The syscall path is very hot; avoid defer.
   212  func (t *Task) doSyscall() taskRunState {
   213  	// Save value of the register which is clobbered in the following
   214  	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
   215  	//
   216  	// On x86, register rax was shared by syscall number and return
   217  	// value, and at the entry of the syscall handler, the rax was
   218  	// saved to regs.orig_rax which was exposed to userspace.
   219  	// But on arm64, syscall number was passed through X8, and the X0
   220  	// was shared by the first syscall argument and return value. The
   221  	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
   222  	// So we have to do the same operation here to save the X0 value
   223  	// into the task context.
   224  	t.Arch().SyscallSaveOrig()
   225  
   226  	sysno := t.Arch().SyscallNo()
   227  	args := t.Arch().SyscallArgs()
   228  
   229  	// Tracers expect to see this between when the task traps into the kernel
   230  	// to perform a syscall and when the syscall is actually invoked.
   231  	// This useless-looking temporary is needed because Go.
   232  	tmp := uintptr(unix.ENOSYS)
   233  	t.Arch().SetReturn(-tmp)
   234  
   235  	// Check seccomp filters. The nil check is for performance (as seccomp use
   236  	// is rare), not needed for correctness.
   237  	if t.syscallFilters.Load() != nil {
   238  		switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
   239  		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
   240  			t.Debugf("Syscall %d: denied by seccomp", sysno)
   241  			return (*runSyscallExit)(nil)
   242  		case linux.SECCOMP_RET_ALLOW:
   243  			// ok
   244  		case linux.SECCOMP_RET_KILL_THREAD:
   245  			t.Debugf("Syscall %d: killed by seccomp", sysno)
   246  			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
   247  			return (*runExit)(nil)
   248  		case linux.SECCOMP_RET_TRACE:
   249  			t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
   250  			return (*runSyscallAfterPtraceEventSeccomp)(nil)
   251  		default:
   252  			panic(fmt.Sprintf("Unknown seccomp result %d", r))
   253  		}
   254  	}
   255  
   256  	syscallCounter.Increment()
   257  	return t.doSyscallEnter(sysno, args)
   258  }
   259  
   260  type runSyscallAfterPtraceEventSeccomp struct{}
   261  
   262  func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
   263  	if t.killed() {
   264  		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
   265  		// ptrace(2)
   266  		return (*runInterrupt)(nil)
   267  	}
   268  	sysno := t.Arch().SyscallNo()
   269  	// "The tracer can skip the system call by changing the syscall number to
   270  	// -1." - Documentation/prctl/seccomp_filter.txt
   271  	if sysno == ^uintptr(0) {
   272  		return (*runSyscallExit)(nil).execute(t)
   273  	}
   274  	args := t.Arch().SyscallArgs()
   275  	return t.doSyscallEnter(sysno, args)
   276  }
   277  
   278  func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
   279  	if next, ok := t.ptraceSyscallEnter(); ok {
   280  		return next
   281  	}
   282  	return t.doSyscallInvoke(sysno, args)
   283  }
   284  
   285  // +stateify savable
   286  type runSyscallAfterSyscallEnterStop struct{}
   287  
   288  func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
   289  	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
   290  		t.tg.signalHandlers.mu.Lock()
   291  		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
   292  		t.tg.signalHandlers.mu.Unlock()
   293  	}
   294  	if t.killed() {
   295  		return (*runInterrupt)(nil)
   296  	}
   297  	sysno := t.Arch().SyscallNo()
   298  	if sysno == ^uintptr(0) {
   299  		return (*runSyscallExit)(nil)
   300  	}
   301  	args := t.Arch().SyscallArgs()
   302  
   303  	return t.doSyscallInvoke(sysno, args)
   304  }
   305  
   306  // +stateify savable
   307  type runSyscallAfterSysemuStop struct{}
   308  
   309  func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
   310  	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
   311  		t.tg.signalHandlers.mu.Lock()
   312  		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
   313  		t.tg.signalHandlers.mu.Unlock()
   314  	}
   315  	if t.killed() {
   316  		return (*runInterrupt)(nil)
   317  	}
   318  	return (*runSyscallExit)(nil).execute(t)
   319  }
   320  
   321  func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
   322  	rval, ctrl, err := t.executeSyscall(sysno, args)
   323  
   324  	if ctrl != nil {
   325  		if !ctrl.ignoreReturn {
   326  			t.Arch().SetReturn(rval)
   327  		}
   328  		if ctrl.next != nil {
   329  			return ctrl.next
   330  		}
   331  	} else if err != nil {
   332  		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
   333  		t.haveSyscallReturn = true
   334  	} else {
   335  		t.Arch().SetReturn(rval)
   336  	}
   337  
   338  	return (*runSyscallExit)(nil).execute(t)
   339  }
   340  
   341  // +stateify savable
   342  type runSyscallReinvoke struct{}
   343  
   344  func (*runSyscallReinvoke) execute(t *Task) taskRunState {
   345  	if t.killed() {
   346  		// It's possible that since the last execution, the task has
   347  		// been forcible killed. Invoking the system call here could
   348  		// result in an infinite loop if it is again preempted by an
   349  		// external stop and reinvoked.
   350  		return (*runInterrupt)(nil)
   351  	}
   352  
   353  	sysno := t.Arch().SyscallNo()
   354  	args := t.Arch().SyscallArgs()
   355  	return t.doSyscallInvoke(sysno, args)
   356  }
   357  
// runSyscallExit is the run state that completes a syscall.
//
// +stateify savable
type runSyscallExit struct{}

// execute performs ptrace syscall-exit handling for t and then returns the
// runApp state.
func (*runSyscallExit) execute(t *Task) taskRunState {
	t.ptraceSyscallExit()
	return (*runApp)(nil)
}
   365  
   366  // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
   367  // indicated by an execution fault at address addr. doVsyscall returns the
   368  // task's next run state.
   369  func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
   370  	metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeVsyscallCount)
   371  
   372  	// Grab the caller up front, to make sure there's a sensible stack.
   373  	caller := t.Arch().Native(uintptr(0))
   374  	if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
   375  		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
   376  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   377  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   378  		return (*runApp)(nil)
   379  	}
   380  
   381  	// For _vsyscalls_, there is no need to translate System V calling convention
   382  	// to syscall ABI because they both use RDI, RSI, and RDX for the first three
   383  	// arguments and none of the vsyscalls uses more than two arguments.
   384  	args := t.Arch().SyscallArgs()
   385  	if t.syscallFilters.Load() != nil {
   386  		switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
   387  		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
   388  			t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
   389  			return (*runApp)(nil)
   390  		case linux.SECCOMP_RET_ALLOW:
   391  			// ok
   392  		case linux.SECCOMP_RET_TRACE:
   393  			t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
   394  			return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
   395  		case linux.SECCOMP_RET_KILL_THREAD:
   396  			t.Debugf("vsyscall %d: killed by seccomp", sysno)
   397  			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
   398  			return (*runExit)(nil)
   399  		default:
   400  			panic(fmt.Sprintf("Unknown seccomp result %d", r))
   401  		}
   402  	}
   403  
   404  	return t.doVsyscallInvoke(sysno, args, caller)
   405  }
   406  
   407  type runVsyscallAfterPtraceEventSeccomp struct {
   408  	addr   hostarch.Addr
   409  	sysno  uintptr
   410  	caller marshal.Marshallable
   411  }
   412  
   413  func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
   414  	if t.killed() {
   415  		return (*runInterrupt)(nil)
   416  	}
   417  	sysno := t.Arch().SyscallNo()
   418  	// "... the syscall may not be changed to another system call using the
   419  	// orig_rax register. It may only be changed to -1 order [sic] to skip the
   420  	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
   421  	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
   422  	// causes do_exit(SIGSYS), and changing sp is ignored.
   423  	if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
   424  		t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
   425  		return (*runExit)(nil)
   426  	}
   427  	if sysno == ^uintptr(0) {
   428  		return (*runApp)(nil)
   429  	}
   430  	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
   431  }
   432  
   433  func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
   434  	rval, ctrl, err := t.executeSyscall(sysno, args)
   435  	if ctrl != nil {
   436  		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
   437  		// Set the return value. The stack has already been adjusted.
   438  		t.Arch().SetReturn(0)
   439  	} else if err == nil {
   440  		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
   441  		// Set the return value. The stack has already been adjusted.
   442  		t.Arch().SetReturn(uintptr(rval))
   443  	} else {
   444  		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
   445  		if linuxerr.Equals(linuxerr.EFAULT, err) {
   446  			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   447  			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   448  			// A return is not emulated in this case.
   449  			return (*runApp)(nil)
   450  		}
   451  		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
   452  	}
   453  	t.Arch().SetIP(t.Arch().Value(caller))
   454  	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
   455  	return (*runApp)(nil)
   456  }
   457  
   458  // ExtractErrno extracts an integer error number from the error.
   459  // The syscall number is purely for context in the error case. Use -1 if
   460  // syscall number is unknown.
   461  func ExtractErrno(err error, sysno int) int {
   462  	switch err := err.(type) {
   463  	case nil:
   464  		return 0
   465  	case unix.Errno:
   466  		return int(err)
   467  	case *errors.Error:
   468  		return int(linuxerr.ToUnix(err))
   469  	case *memmap.BusError:
   470  		// Bus errors may generate SIGBUS, but for syscalls they still
   471  		// return EFAULT. See case in task_run.go where the fault is
   472  		// handled (and the SIGBUS is delivered).
   473  		return int(unix.EFAULT)
   474  	case *os.PathError:
   475  		return ExtractErrno(err.Err, sysno)
   476  	case *os.LinkError:
   477  		return ExtractErrno(err.Err, sysno)
   478  	case *os.SyscallError:
   479  		return ExtractErrno(err.Err, sysno)
   480  	default:
   481  		if errno, ok := linuxerr.TranslateError(err); ok {
   482  			return int(linuxerr.ToUnix(errno))
   483  		}
   484  	}
   485  	panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
   486  }