github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/seccomp.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 19 "github.com/MerlinKodo/gvisor/pkg/bpf" 20 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 21 "github.com/MerlinKodo/gvisor/pkg/hostarch" 22 "github.com/MerlinKodo/gvisor/pkg/sentry/arch" 23 "golang.org/x/sys/unix" 24 ) 25 26 const maxSyscallFilterInstructions = 1 << 15 27 28 // dataAsBPFInput returns a serialized BPF program, only valid on the current task 29 // goroutine. 30 // 31 // Note: this is called for every syscall, which is a very hot path. 32 func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { 33 buf := t.CopyScratchBuffer(d.SizeBytes()) 34 d.MarshalUnsafe(buf) 35 return bpf.InputBytes{ 36 Data: buf, 37 // Go-marshal always uses the native byte order. 38 Order: hostarch.ByteOrder, 39 } 40 } 41 42 func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo { 43 si := &linux.SignalInfo{ 44 Signo: int32(linux.SIGSYS), 45 Errno: errno, 46 Code: linux.SYS_SECCOMP, 47 } 48 si.SetCallAddr(uint64(ip)) 49 si.SetSyscall(sysno) 50 si.SetArch(t.SyscallTable().AuditNumber) 51 return si 52 } 53 54 // checkSeccompSyscall applies the task's seccomp filters before the execution 55 // of syscall sysno at instruction pointer ip. (These parameters must be passed 56 // in because vsyscalls do not use the values in t.Arch().) 57 // 58 // Preconditions: The caller must be running on the task goroutine. 59 func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction { 60 result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) 61 action := result & linux.SECCOMP_RET_ACTION 62 switch action { 63 case linux.SECCOMP_RET_TRAP: 64 // "Results in the kernel sending a SIGSYS signal to the triggering 65 // task without executing the system call. ... The SECCOMP_RET_DATA 66 // portion of the return value will be passed as si_errno." - 67 // Documentation/prctl/seccomp_filter.txt 68 t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) 69 // "The return value register will contain an arch-dependent value." In 70 // practice, it's ~always the syscall number. 71 t.Arch().SetReturn(uintptr(sysno)) 72 73 case linux.SECCOMP_RET_ERRNO: 74 // "Results in the lower 16-bits of the return value being passed to 75 // userland as the errno without executing the system call." 76 t.Arch().SetReturn(-uintptr(result.Data())) 77 78 case linux.SECCOMP_RET_TRACE: 79 // "When returned, this value will cause the kernel to attempt to 80 // notify a ptrace()-based tracer prior to executing the system call. 81 // If there is no tracer present, -ENOSYS is returned to userland and 82 // the system call is not executed." 83 if !t.ptraceSeccomp(result.Data()) { 84 // This useless-looking temporary is needed because Go. 85 tmp := uintptr(unix.ENOSYS) 86 t.Arch().SetReturn(-tmp) 87 return linux.SECCOMP_RET_ERRNO 88 } 89 90 case linux.SECCOMP_RET_ALLOW: 91 // "Results in the system call being executed." 92 93 case linux.SECCOMP_RET_KILL_THREAD: 94 // "Results in the task exiting immediately without executing the 95 // system call. The exit status of the task will be SIGSYS, not 96 // SIGKILL." 97 98 default: 99 // consistent with Linux 100 return linux.SECCOMP_RET_KILL_THREAD 101 } 102 return action 103 } 104 105 func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 { 106 data := linux.SeccompData{ 107 Nr: sysno, 108 Arch: t.image.st.AuditNumber, 109 InstructionPointer: uint64(ip), 110 } 111 // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so 112 // we can't do any slicing tricks or even use copy/append here. 113 for i, arg := range args { 114 if i >= len(data.Args) { 115 break 116 } 117 data.Args[i] = arg.Uint64() 118 } 119 input := dataAsBPFInput(t, &data) 120 121 ret := uint32(linux.SECCOMP_RET_ALLOW) 122 f := t.syscallFilters.Load() 123 if f == nil { 124 return ret 125 } 126 127 // "Every filter successfully installed will be evaluated (in reverse 128 // order) for each system call the task makes." - kernel/seccomp.c 129 for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { 130 thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) 131 if err != nil { 132 t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) 133 thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) 134 } 135 // "If multiple filters exist, the return value for the evaluation of a 136 // given system call will always use the highest precedent value." - 137 // Documentation/prctl/seccomp_filter.txt 138 // 139 // (Note that this contradicts prctl(2): "If the filters permit prctl() 140 // calls, then additional filters can be added; they are run in order 141 // until the first non-allow result is seen." prctl(2) is incorrect.) 142 // 143 // "The ordering ensures that a min_t() over composed return values 144 // always selects the least permissive choice." - 145 // include/uapi/linux/seccomp.h 146 if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { 147 ret = thisRet 148 } 149 } 150 151 return ret 152 } 153 154 // AppendSyscallFilter adds BPF program p as a system call filter. 155 // 156 // Preconditions: The caller must be running on the task goroutine. 157 func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { 158 // While syscallFilters are an atomic.Value we must take the mutex to prevent 159 // our read-copy-update from happening while another task is syncing syscall 160 // filters to us, this keeps the filters in a consistent state. 161 t.tg.signalHandlers.mu.Lock() 162 defer t.tg.signalHandlers.mu.Unlock() 163 164 // Cap the combined length of all syscall filters (plus a penalty of 4 165 // instructions per filter beyond the first) to maxSyscallFilterInstructions. 166 // This restriction is inherited from Linux. 167 totalLength := p.Length() 168 var newFilters []bpf.Program 169 170 if sf := t.syscallFilters.Load(); sf != nil { 171 oldFilters := sf.([]bpf.Program) 172 for _, f := range oldFilters { 173 totalLength += f.Length() + 4 174 } 175 newFilters = append(newFilters, oldFilters...) 176 } 177 178 if totalLength > maxSyscallFilterInstructions { 179 return linuxerr.ENOMEM 180 } 181 182 newFilters = append(newFilters, p) 183 t.syscallFilters.Store(newFilters) 184 185 if syncAll { 186 // Note: No new privs is always assumed to be set. 187 for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { 188 if ot != t { 189 var copiedFilters []bpf.Program 190 copiedFilters = append(copiedFilters, newFilters...) 191 ot.syscallFilters.Store(copiedFilters) 192 } 193 } 194 } 195 196 return nil 197 } 198 199 // SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current 200 // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) 201 // and /proc/[pid]/status. 202 func (t *Task) SeccompMode() int { 203 f := t.syscallFilters.Load() 204 if f != nil && len(f.([]bpf.Program)) > 0 { 205 return linux.SECCOMP_MODE_FILTER 206 } 207 return linux.SECCOMP_MODE_NONE 208 }