github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/seccomp.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"golang.org/x/sys/unix"
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/bpf"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    24  )
    25  
    26  const maxSyscallFilterInstructions = 1 << 15
    27  
    28  // dataAsBPFInput returns a serialized BPF program, only valid on the current task
    29  // goroutine.
    30  //
    31  // Note: this is called for every syscall, which is a very hot path.
    32  func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input {
    33  	buf := t.CopyScratchBuffer(d.SizeBytes())
    34  	d.MarshalUnsafe(buf)
    35  	return bpf.InputBytes{
    36  		Data: buf,
    37  		// Go-marshal always uses the native byte order.
    38  		Order: hostarch.ByteOrder,
    39  	}
    40  }
    41  
    42  func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo {
    43  	si := &linux.SignalInfo{
    44  		Signo: int32(linux.SIGSYS),
    45  		Errno: errno,
    46  		Code:  linux.SYS_SECCOMP,
    47  	}
    48  	si.SetCallAddr(uint64(ip))
    49  	si.SetSyscall(sysno)
    50  	si.SetArch(t.SyscallTable().AuditNumber)
    51  	return si
    52  }
    53  
    54  // checkSeccompSyscall applies the task's seccomp filters before the execution
    55  // of syscall sysno at instruction pointer ip. (These parameters must be passed
    56  // in because vsyscalls do not use the values in t.Arch().)
    57  //
    58  // Preconditions: The caller must be running on the task goroutine.
    59  func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction {
    60  	result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip))
    61  	action := result & linux.SECCOMP_RET_ACTION
    62  	switch action {
    63  	case linux.SECCOMP_RET_TRAP:
    64  		// "Results in the kernel sending a SIGSYS signal to the triggering
    65  		// task without executing the system call. ... The SECCOMP_RET_DATA
    66  		// portion of the return value will be passed as si_errno." -
    67  		// Documentation/prctl/seccomp_filter.txt
    68  		t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip))
    69  		// "The return value register will contain an arch-dependent value." In
    70  		// practice, it's ~always the syscall number.
    71  		t.Arch().SetReturn(uintptr(sysno))
    72  
    73  	case linux.SECCOMP_RET_ERRNO:
    74  		// "Results in the lower 16-bits of the return value being passed to
    75  		// userland as the errno without executing the system call."
    76  		t.Arch().SetReturn(-uintptr(result.Data()))
    77  
    78  	case linux.SECCOMP_RET_TRACE:
    79  		// "When returned, this value will cause the kernel to attempt to
    80  		// notify a ptrace()-based tracer prior to executing the system call.
    81  		// If there is no tracer present, -ENOSYS is returned to userland and
    82  		// the system call is not executed."
    83  		if !t.ptraceSeccomp(result.Data()) {
    84  			// This useless-looking temporary is needed because Go.
    85  			tmp := uintptr(unix.ENOSYS)
    86  			t.Arch().SetReturn(-tmp)
    87  			return linux.SECCOMP_RET_ERRNO
    88  		}
    89  
    90  	case linux.SECCOMP_RET_ALLOW:
    91  		// "Results in the system call being executed."
    92  
    93  	case linux.SECCOMP_RET_KILL_THREAD:
    94  		// "Results in the task exiting immediately without executing the
    95  		// system call. The exit status of the task will be SIGSYS, not
    96  		// SIGKILL."
    97  
    98  	default:
    99  		// consistent with Linux
   100  		return linux.SECCOMP_RET_KILL_THREAD
   101  	}
   102  	return action
   103  }
   104  
   105  func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 {
   106  	data := linux.SeccompData{
   107  		Nr:                 sysno,
   108  		Arch:               t.image.st.AuditNumber,
   109  		InstructionPointer: uint64(ip),
   110  	}
   111  	// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
   112  	// we can't do any slicing tricks or even use copy/append here.
   113  	for i, arg := range args {
   114  		if i >= len(data.Args) {
   115  			break
   116  		}
   117  		data.Args[i] = arg.Uint64()
   118  	}
   119  	input := dataAsBPFInput(t, &data)
   120  
   121  	ret := uint32(linux.SECCOMP_RET_ALLOW)
   122  	f := t.syscallFilters.Load()
   123  	if f == nil {
   124  		return ret
   125  	}
   126  
   127  	// "Every filter successfully installed will be evaluated (in reverse
   128  	// order) for each system call the task makes." - kernel/seccomp.c
   129  	for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- {
   130  		thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input)
   131  		if err != nil {
   132  			t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
   133  			thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD)
   134  		}
   135  		// "If multiple filters exist, the return value for the evaluation of a
   136  		// given system call will always use the highest precedent value." -
   137  		// Documentation/prctl/seccomp_filter.txt
   138  		//
   139  		// (Note that this contradicts prctl(2): "If the filters permit prctl()
   140  		// calls, then additional filters can be added; they are run in order
   141  		// until the first non-allow result is seen." prctl(2) is incorrect.)
   142  		//
   143  		// "The ordering ensures that a min_t() over composed return values
   144  		// always selects the least permissive choice." -
   145  		// include/uapi/linux/seccomp.h
   146  		if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
   147  			ret = thisRet
   148  		}
   149  	}
   150  
   151  	return ret
   152  }
   153  
   154  // AppendSyscallFilter adds BPF program p as a system call filter.
   155  //
   156  // Preconditions: The caller must be running on the task goroutine.
   157  func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error {
   158  	// While syscallFilters are an atomic.Value we must take the mutex to prevent
   159  	// our read-copy-update from happening while another task is syncing syscall
   160  	// filters to us, this keeps the filters in a consistent state.
   161  	t.tg.signalHandlers.mu.Lock()
   162  	defer t.tg.signalHandlers.mu.Unlock()
   163  
   164  	// Cap the combined length of all syscall filters (plus a penalty of 4
   165  	// instructions per filter beyond the first) to maxSyscallFilterInstructions.
   166  	// This restriction is inherited from Linux.
   167  	totalLength := p.Length()
   168  	var newFilters []bpf.Program
   169  
   170  	if sf := t.syscallFilters.Load(); sf != nil {
   171  		oldFilters := sf.([]bpf.Program)
   172  		for _, f := range oldFilters {
   173  			totalLength += f.Length() + 4
   174  		}
   175  		newFilters = append(newFilters, oldFilters...)
   176  	}
   177  
   178  	if totalLength > maxSyscallFilterInstructions {
   179  		return linuxerr.ENOMEM
   180  	}
   181  
   182  	newFilters = append(newFilters, p)
   183  	t.syscallFilters.Store(newFilters)
   184  
   185  	if syncAll {
   186  		// Note: No new privs is always assumed to be set.
   187  		for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() {
   188  			if ot != t {
   189  				var copiedFilters []bpf.Program
   190  				copiedFilters = append(copiedFilters, newFilters...)
   191  				ot.syscallFilters.Store(copiedFilters)
   192  			}
   193  		}
   194  	}
   195  
   196  	return nil
   197  }
   198  
   199  // SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current
   200  // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
   201  // and /proc/[pid]/status.
   202  func (t *Task) SeccompMode() int {
   203  	f := t.syscallFilters.Load()
   204  	if f != nil && len(f.([]bpf.Program)) > 0 {
   205  		return linux.SECCOMP_MODE_FILTER
   206  	}
   207  	return linux.SECCOMP_MODE_NONE
   208  }