github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/seccomp.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"reflect"
    20  
    21  	"golang.org/x/sys/unix"
    22  	"github.com/metacubex/gvisor/pkg/abi/linux"
    23  	"github.com/metacubex/gvisor/pkg/abi/sentry"
    24  	"github.com/metacubex/gvisor/pkg/bpf"
    25  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    26  	"github.com/metacubex/gvisor/pkg/hostarch"
    27  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    28  )
    29  
const (
	// maxSyscallFilterInstructions is the upper bound on the combined
	// instruction count of all seccomp filters installed on a task, as
	// enforced by AppendSyscallFilter.
	maxSyscallFilterInstructions = 1 << 15

	// uncacheableBPFAction is an invalid seccomp action code.
	// It is used as a sentinel value in `taskSeccomp.cache` to indicate
	// that the result for a specific syscall number is uncacheable.
	uncacheableBPFAction = linux.SECCOMP_RET_ACTION_FULL
)
    38  
// taskSeccomp holds seccomp-related data for a `Task`.
//
// Instances are built by `AppendSyscallFilter`, which populates `cache` via
// `populateCache` before publishing the struct with `t.seccomp.Store`.
//
// +stateify savable
type taskSeccomp struct {
	// filters is the list of seccomp programs that are applied to the task,
	// in the order in which they were installed. New programs are appended
	// at the end; evaluation walks the list from the last entry to the first.
	filters []bpf.Program

	// cache maps syscall numbers to the action to take for that syscall number.
	// It is only populated for syscalls where determining this action does not
	// involve any input data other than the architecture and the syscall
	// number in any of `filters`.
	// If any other input is necessary, the cache stores `uncacheableBPFAction`
	// to indicate that this syscall number's rules are not cacheable, and the
	// filters must be executed for every invocation of that syscall.
	cache [sentry.MaxSyscallNum + 1]linux.BPFAction

	// cacheAuditNumber is the AUDIT_ARCH_* constant of the task image used
	// at the time of computing `cache`. Cached entries are only consulted
	// when the task's current audit number equals this value.
	cacheAuditNumber uint32
}
    59  
    60  // copy returns a copy of this `taskSeccomp`.
    61  func (ts *taskSeccomp) copy() *taskSeccomp {
    62  	return &taskSeccomp{
    63  		filters:          append(([]bpf.Program)(nil), ts.filters...),
    64  		cacheAuditNumber: ts.cacheAuditNumber,
    65  		cache:            ts.cache,
    66  	}
    67  }
    68  
    69  // dataAsBPFInput returns a serialized BPF program, only valid on the current task
    70  // goroutine.
    71  //
    72  // Note: this is called for every syscall, which is a very hot path.
    73  func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input {
    74  	buf := t.CopyScratchBuffer(d.SizeBytes())
    75  	d.MarshalUnsafe(buf)
    76  	return buf[:d.SizeBytes()]
    77  }
    78  
    79  func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo {
    80  	si := &linux.SignalInfo{
    81  		Signo: int32(linux.SIGSYS),
    82  		Errno: errno,
    83  		Code:  linux.SYS_SECCOMP,
    84  	}
    85  	si.SetCallAddr(uint64(ip))
    86  	si.SetSyscall(sysno)
    87  	si.SetArch(t.SyscallTable().AuditNumber)
    88  	return si
    89  }
    90  
    91  // checkSeccompSyscall applies the task's seccomp filters before the execution
    92  // of syscall sysno at instruction pointer ip. (These parameters must be passed
    93  // in because vsyscalls do not use the values in t.Arch().)
    94  //
    95  // Preconditions: The caller must be running on the task goroutine.
    96  func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction {
    97  	result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip))
    98  	action := result & linux.SECCOMP_RET_ACTION
    99  	switch action {
   100  	case linux.SECCOMP_RET_TRAP:
   101  		// "Results in the kernel sending a SIGSYS signal to the triggering
   102  		// task without executing the system call. ... The SECCOMP_RET_DATA
   103  		// portion of the return value will be passed as si_errno." -
   104  		// Documentation/prctl/seccomp_filter.txt
   105  		t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip))
   106  		// "The return value register will contain an arch-dependent value." In
   107  		// practice, it's ~always the syscall number.
   108  		t.Arch().SetReturn(uintptr(sysno))
   109  
   110  	case linux.SECCOMP_RET_ERRNO:
   111  		// "Results in the lower 16-bits of the return value being passed to
   112  		// userland as the errno without executing the system call."
   113  		t.Arch().SetReturn(-uintptr(result.Data()))
   114  
   115  	case linux.SECCOMP_RET_TRACE:
   116  		// "When returned, this value will cause the kernel to attempt to
   117  		// notify a ptrace()-based tracer prior to executing the system call.
   118  		// If there is no tracer present, -ENOSYS is returned to userland and
   119  		// the system call is not executed."
   120  		if !t.ptraceSeccomp(result.Data()) {
   121  			// This useless-looking temporary is needed because Go.
   122  			tmp := uintptr(unix.ENOSYS)
   123  			t.Arch().SetReturn(-tmp)
   124  			return linux.SECCOMP_RET_ERRNO
   125  		}
   126  
   127  	case linux.SECCOMP_RET_ALLOW:
   128  		// "Results in the system call being executed."
   129  
   130  	case linux.SECCOMP_RET_KILL_THREAD:
   131  		// "Results in the task exiting immediately without executing the
   132  		// system call. The exit status of the task will be SIGSYS, not
   133  		// SIGKILL."
   134  
   135  	default:
   136  		// consistent with Linux
   137  		return linux.SECCOMP_RET_KILL_THREAD
   138  	}
   139  	return action
   140  }
   141  
   142  func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 {
   143  	ret := uint32(linux.SECCOMP_RET_ALLOW)
   144  	ts := t.seccomp.Load()
   145  	if ts == nil {
   146  		return ret
   147  	}
   148  	arch := t.image.st.AuditNumber
   149  	if arch == ts.cacheAuditNumber && sysno >= 0 && sysno <= sentry.MaxSyscallNum {
   150  		if cached := ts.cache[sysno]; cached != uncacheableBPFAction {
   151  			return uint32(cached)
   152  		}
   153  	}
   154  
   155  	data := linux.SeccompData{
   156  		Nr:                 sysno,
   157  		Arch:               arch,
   158  		InstructionPointer: uint64(ip),
   159  	}
   160  	// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
   161  	// we can't do any slicing tricks or even use copy/append here.
   162  	for i, arg := range args {
   163  		if i >= len(data.Args) {
   164  			break
   165  		}
   166  		data.Args[i] = arg.Uint64()
   167  	}
   168  	input := dataAsBPFInput(t, &data)
   169  
   170  	// "Every filter successfully installed will be evaluated (in reverse
   171  	// order) for each system call the task makes." - kernel/seccomp.c
   172  	for i := len(ts.filters) - 1; i >= 0; i-- {
   173  		thisRet, err := bpf.Exec[bpf.NativeEndian](ts.filters[i], input)
   174  		if err != nil {
   175  			t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
   176  			thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD)
   177  		}
   178  		// "If multiple filters exist, the return value for the evaluation of a
   179  		// given system call will always use the highest precedent value." -
   180  		// Documentation/prctl/seccomp_filter.txt
   181  		//
   182  		// (Note that this contradicts prctl(2): "If the filters permit prctl()
   183  		// calls, then additional filters can be added; they are run in order
   184  		// until the first non-allow result is seen." prctl(2) is incorrect.)
   185  		//
   186  		// "The ordering ensures that a min_t() over composed return values
   187  		// always selects the least permissive choice." -
   188  		// include/uapi/linux/seccomp.h
   189  		if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
   190  			ret = thisRet
   191  		}
   192  	}
   193  
   194  	return ret
   195  }
   196  
   197  // checkFilterCacheability executes `program` on the given `input`, and
   198  // checks if its result is cacheable. If it is, it returns that result.
   199  func checkFilterCacheability(program bpf.Program, input bpf.Input) (uint32, error) {
   200  	// Look up Nr and Arch fields, we'll use their offsets later
   201  	// to verify whether they were accessed.
   202  	sdType := reflect.TypeOf(linux.SeccompData{})
   203  	nrField, ok := sdType.FieldByName("Nr")
   204  	if !ok {
   205  		panic("linux.SeccompData.Nr field not found")
   206  	}
   207  	archField, ok := sdType.FieldByName("Arch")
   208  	if !ok {
   209  		panic("linux.SeccompData.Arch field not found")
   210  	}
   211  
   212  	exec, err := bpf.InstrumentedExec[bpf.NativeEndian](program, input)
   213  	if err != nil {
   214  		return 0, err
   215  	}
   216  	for offset, accessed := range exec.InputAccessed {
   217  		if !accessed {
   218  			continue // Input byte not accessed by the program.
   219  		}
   220  		if uintptr(offset) >= nrField.Offset && uintptr(offset) < nrField.Offset+nrField.Type.Size() {
   221  			continue // The program accessed the "Nr" field, this is OK.
   222  		}
   223  		if uintptr(offset) >= archField.Offset && uintptr(offset) < archField.Offset+archField.Type.Size() {
   224  			continue // The program accessed the "Arch" field, this is OK.
   225  		}
   226  		return 0, fmt.Errorf("program accessed byte at offset %d which is not the sysno or arch field", offset)
   227  	}
   228  	return exec.ReturnValue, nil
   229  }
   230  
   231  // populateCache recomputes `ts.cache` from `ts.filters`.
   232  func (ts *taskSeccomp) populateCache(t *Task) {
   233  	ts.cacheAuditNumber = t.image.st.AuditNumber
   234  	sd := linux.SeccompData{}
   235  	input := bpf.Input(make([]byte, sd.SizeBytes()))
   236  
   237  	for sysno := int32(0); sysno <= sentry.MaxSyscallNum; sysno++ {
   238  		sd.Nr = sysno
   239  		sd.Arch = ts.cacheAuditNumber
   240  		clear(input)
   241  		sd.MarshalBytes(input)
   242  		sysnoIsCacheable := true
   243  		ret := linux.BPFAction(linux.SECCOMP_RET_ALLOW)
   244  		// See notes in `evaluateSyscallFilters` for how to properly interpret
   245  		// seccomp filter and results. We use the same approach here: iterate
   246  		// through filters backwards, and take the smallest result.
   247  		// If any filter is not cacheable, then we cannot cache the result for
   248  		// this sysno.
   249  		for i := len(ts.filters) - 1; i >= 0; i-- {
   250  			result, cacheErr := checkFilterCacheability(ts.filters[i], input)
   251  			if cacheErr != nil {
   252  				sysnoIsCacheable = false
   253  				break
   254  			}
   255  			if (linux.BPFAction(result) & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
   256  				ret = linux.BPFAction(result)
   257  			}
   258  		}
   259  		if sysnoIsCacheable {
   260  			ts.cache[sysno] = ret
   261  		} else {
   262  			ts.cache[sysno] = uncacheableBPFAction
   263  		}
   264  	}
   265  }
   266  
   267  // AppendSyscallFilter adds BPF program p as a system call filter.
   268  //
   269  // Preconditions: The caller must be running on the task goroutine.
   270  func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error {
   271  	// While syscallFilters are an atomic.Value we must take the mutex to prevent
   272  	// our read-copy-update from happening while another task is syncing syscall
   273  	// filters to us, this keeps the filters in a consistent state.
   274  	t.tg.signalHandlers.mu.Lock()
   275  	defer t.tg.signalHandlers.mu.Unlock()
   276  
   277  	// Cap the combined length of all syscall filters (plus a penalty of 4
   278  	// instructions per filter beyond the first) to maxSyscallFilterInstructions.
   279  	// This restriction is inherited from Linux.
   280  	totalLength := p.Length()
   281  	newSeccomp := &taskSeccomp{}
   282  
   283  	if ts := t.seccomp.Load(); ts != nil {
   284  		for _, f := range ts.filters {
   285  			totalLength += f.Length() + 4
   286  		}
   287  		newSeccomp.filters = append(newSeccomp.filters, ts.filters...)
   288  	}
   289  
   290  	if totalLength > maxSyscallFilterInstructions {
   291  		return linuxerr.ENOMEM
   292  	}
   293  
   294  	newSeccomp.filters = append(newSeccomp.filters, p)
   295  	newSeccomp.populateCache(t)
   296  	t.seccomp.Store(newSeccomp)
   297  
   298  	if syncAll {
   299  		// Note: No new privs is always assumed to be set.
   300  		for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() {
   301  			if ot != t {
   302  				seccompCopy := newSeccomp.copy()
   303  				seccompCopy.populateCache(ot)
   304  				ot.seccomp.Store(seccompCopy)
   305  			}
   306  		}
   307  	}
   308  
   309  	return nil
   310  }
   311  
   312  // SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current
   313  // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
   314  // and /proc/[pid]/status.
   315  func (t *Task) SeccompMode() int {
   316  	if ts := t.seccomp.Load(); ts != nil && len(ts.filters) > 0 {
   317  		return linux.SECCOMP_MODE_FILTER
   318  	}
   319  	return linux.SECCOMP_MODE_NONE
   320  }