gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/seccomp.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 "reflect" 20 21 "golang.org/x/sys/unix" 22 "gvisor.dev/gvisor/pkg/abi/linux" 23 "gvisor.dev/gvisor/pkg/abi/sentry" 24 "gvisor.dev/gvisor/pkg/bpf" 25 "gvisor.dev/gvisor/pkg/errors/linuxerr" 26 "gvisor.dev/gvisor/pkg/hostarch" 27 "gvisor.dev/gvisor/pkg/sentry/arch" 28 ) 29 30 const ( 31 maxSyscallFilterInstructions = 1 << 15 32 33 // uncacheableBPFAction is an invalid seccomp action code. 34 // It is used as a sentinel value in `taskSeccompFilters.cache` to indicate 35 // that a specific syscall number is uncachable. 36 uncacheableBPFAction = linux.SECCOMP_RET_ACTION_FULL 37 ) 38 39 // taskSeccomp holds seccomp-related data for a `Task`. 40 // 41 // +stateify savable 42 type taskSeccomp struct { 43 // filters is the list of seccomp programs that are applied to the task, 44 // in the order in which they were installed. 45 filters []bpf.Program 46 47 // cache maps syscall numbers to the action to take for that syscall number. 48 // It is only populated for syscalls where determining this action does not 49 // involve any input data other than the architecture and the syscall 50 // number in any of `filters`. 51 // If any other input is necessary, the cache stores `uncacheableBPFAction` 52 // to indicate that this syscall number's rules are not cacheable. 53 cache [sentry.MaxSyscallNum + 1]linux.BPFAction 54 55 // cacheAuditNumber is the AUDIT_ARCH_* constant of the task image used 56 // at the time of computing `cache`. 57 cacheAuditNumber uint32 58 } 59 60 // copy returns a copy of this `taskSeccomp`. 61 func (ts *taskSeccomp) copy() *taskSeccomp { 62 return &taskSeccomp{ 63 filters: append(([]bpf.Program)(nil), ts.filters...), 64 cacheAuditNumber: ts.cacheAuditNumber, 65 cache: ts.cache, 66 } 67 } 68 69 // dataAsBPFInput returns a serialized BPF program, only valid on the current task 70 // goroutine. 71 // 72 // Note: this is called for every syscall, which is a very hot path. 73 func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { 74 buf := t.CopyScratchBuffer(d.SizeBytes()) 75 d.MarshalUnsafe(buf) 76 return buf[:d.SizeBytes()] 77 } 78 79 func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo { 80 si := &linux.SignalInfo{ 81 Signo: int32(linux.SIGSYS), 82 Errno: errno, 83 Code: linux.SYS_SECCOMP, 84 } 85 si.SetCallAddr(uint64(ip)) 86 si.SetSyscall(sysno) 87 si.SetArch(t.SyscallTable().AuditNumber) 88 return si 89 } 90 91 // checkSeccompSyscall applies the task's seccomp filters before the execution 92 // of syscall sysno at instruction pointer ip. (These parameters must be passed 93 // in because vsyscalls do not use the values in t.Arch().) 94 // 95 // Preconditions: The caller must be running on the task goroutine. 96 func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction { 97 result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) 98 action := result & linux.SECCOMP_RET_ACTION 99 switch action { 100 case linux.SECCOMP_RET_TRAP: 101 // "Results in the kernel sending a SIGSYS signal to the triggering 102 // task without executing the system call. ... The SECCOMP_RET_DATA 103 // portion of the return value will be passed as si_errno." - 104 // Documentation/prctl/seccomp_filter.txt 105 t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) 106 // "The return value register will contain an arch-dependent value." In 107 // practice, it's ~always the syscall number. 108 t.Arch().SetReturn(uintptr(sysno)) 109 110 case linux.SECCOMP_RET_ERRNO: 111 // "Results in the lower 16-bits of the return value being passed to 112 // userland as the errno without executing the system call." 113 t.Arch().SetReturn(-uintptr(result.Data())) 114 115 case linux.SECCOMP_RET_TRACE: 116 // "When returned, this value will cause the kernel to attempt to 117 // notify a ptrace()-based tracer prior to executing the system call. 118 // If there is no tracer present, -ENOSYS is returned to userland and 119 // the system call is not executed." 120 if !t.ptraceSeccomp(result.Data()) { 121 // This useless-looking temporary is needed because Go. 122 tmp := uintptr(unix.ENOSYS) 123 t.Arch().SetReturn(-tmp) 124 return linux.SECCOMP_RET_ERRNO 125 } 126 127 case linux.SECCOMP_RET_ALLOW: 128 // "Results in the system call being executed." 129 130 case linux.SECCOMP_RET_KILL_THREAD: 131 // "Results in the task exiting immediately without executing the 132 // system call. The exit status of the task will be SIGSYS, not 133 // SIGKILL." 134 135 default: 136 // consistent with Linux 137 return linux.SECCOMP_RET_KILL_THREAD 138 } 139 return action 140 } 141 142 func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 { 143 ret := uint32(linux.SECCOMP_RET_ALLOW) 144 ts := t.seccomp.Load() 145 if ts == nil { 146 return ret 147 } 148 arch := t.image.st.AuditNumber 149 if arch == ts.cacheAuditNumber && sysno >= 0 && sysno <= sentry.MaxSyscallNum { 150 if cached := ts.cache[sysno]; cached != uncacheableBPFAction { 151 return uint32(cached) 152 } 153 } 154 155 data := linux.SeccompData{ 156 Nr: sysno, 157 Arch: arch, 158 InstructionPointer: uint64(ip), 159 } 160 // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so 161 // we can't do any slicing tricks or even use copy/append here. 162 for i, arg := range args { 163 if i >= len(data.Args) { 164 break 165 } 166 data.Args[i] = arg.Uint64() 167 } 168 input := dataAsBPFInput(t, &data) 169 170 // "Every filter successfully installed will be evaluated (in reverse 171 // order) for each system call the task makes." - kernel/seccomp.c 172 for i := len(ts.filters) - 1; i >= 0; i-- { 173 thisRet, err := bpf.Exec[bpf.NativeEndian](ts.filters[i], input) 174 if err != nil { 175 t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) 176 thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) 177 } 178 // "If multiple filters exist, the return value for the evaluation of a 179 // given system call will always use the highest precedent value." - 180 // Documentation/prctl/seccomp_filter.txt 181 // 182 // (Note that this contradicts prctl(2): "If the filters permit prctl() 183 // calls, then additional filters can be added; they are run in order 184 // until the first non-allow result is seen." prctl(2) is incorrect.) 185 // 186 // "The ordering ensures that a min_t() over composed return values 187 // always selects the least permissive choice." - 188 // include/uapi/linux/seccomp.h 189 if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { 190 ret = thisRet 191 } 192 } 193 194 return ret 195 } 196 197 // checkFilterCacheability executes `program` on the given `input`, and 198 // checks if its result is cacheable. If it is, it returns that result. 199 func checkFilterCacheability(program bpf.Program, input bpf.Input) (uint32, error) { 200 // Look up Nr and Arch fields, we'll use their offsets later 201 // to verify whether they were accessed. 202 sdType := reflect.TypeOf(linux.SeccompData{}) 203 nrField, ok := sdType.FieldByName("Nr") 204 if !ok { 205 panic("linux.SeccompData.Nr field not found") 206 } 207 archField, ok := sdType.FieldByName("Arch") 208 if !ok { 209 panic("linux.SeccompData.Arch field not found") 210 } 211 212 exec, err := bpf.InstrumentedExec[bpf.NativeEndian](program, input) 213 if err != nil { 214 return 0, err 215 } 216 for offset, accessed := range exec.InputAccessed { 217 if !accessed { 218 continue // Input byte not accessed by the program. 219 } 220 if uintptr(offset) >= nrField.Offset && uintptr(offset) < nrField.Offset+nrField.Type.Size() { 221 continue // The program accessed the "Nr" field, this is OK. 222 } 223 if uintptr(offset) >= archField.Offset && uintptr(offset) < archField.Offset+archField.Type.Size() { 224 continue // The program accessed the "Arch" field, this is OK. 225 } 226 return 0, fmt.Errorf("program accessed byte at offset %d which is not the sysno or arch field", offset) 227 } 228 return exec.ReturnValue, nil 229 } 230 231 // populateCache recomputes `ts.cache` from `ts.filters`. 232 func (ts *taskSeccomp) populateCache(t *Task) { 233 ts.cacheAuditNumber = t.image.st.AuditNumber 234 sd := linux.SeccompData{} 235 input := bpf.Input(make([]byte, sd.SizeBytes())) 236 237 for sysno := int32(0); sysno <= sentry.MaxSyscallNum; sysno++ { 238 sd.Nr = sysno 239 sd.Arch = ts.cacheAuditNumber 240 clear(input) 241 sd.MarshalBytes(input) 242 sysnoIsCacheable := true 243 ret := linux.BPFAction(linux.SECCOMP_RET_ALLOW) 244 // See notes in `evaluateSyscallFilters` for how to properly interpret 245 // seccomp filter and results. We use the same approach here: iterate 246 // through filters backwards, and take the smallest result. 247 // If any filter is not cacheable, then we cannot cache the result for 248 // this sysno. 249 for i := len(ts.filters) - 1; i >= 0; i-- { 250 result, cacheErr := checkFilterCacheability(ts.filters[i], input) 251 if cacheErr != nil { 252 sysnoIsCacheable = false 253 break 254 } 255 if (linux.BPFAction(result) & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { 256 ret = linux.BPFAction(result) 257 } 258 } 259 if sysnoIsCacheable { 260 ts.cache[sysno] = ret 261 } else { 262 ts.cache[sysno] = uncacheableBPFAction 263 } 264 } 265 } 266 267 // AppendSyscallFilter adds BPF program p as a system call filter. 268 // 269 // Preconditions: The caller must be running on the task goroutine. 270 func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { 271 // While syscallFilters are an atomic.Value we must take the mutex to prevent 272 // our read-copy-update from happening while another task is syncing syscall 273 // filters to us, this keeps the filters in a consistent state. 274 t.tg.signalHandlers.mu.Lock() 275 defer t.tg.signalHandlers.mu.Unlock() 276 277 // Cap the combined length of all syscall filters (plus a penalty of 4 278 // instructions per filter beyond the first) to maxSyscallFilterInstructions. 279 // This restriction is inherited from Linux. 280 totalLength := p.Length() 281 newSeccomp := &taskSeccomp{} 282 283 if ts := t.seccomp.Load(); ts != nil { 284 for _, f := range ts.filters { 285 totalLength += f.Length() + 4 286 } 287 newSeccomp.filters = append(newSeccomp.filters, ts.filters...) 288 } 289 290 if totalLength > maxSyscallFilterInstructions { 291 return linuxerr.ENOMEM 292 } 293 294 newSeccomp.filters = append(newSeccomp.filters, p) 295 newSeccomp.populateCache(t) 296 t.seccomp.Store(newSeccomp) 297 298 if syncAll { 299 // Note: No new privs is always assumed to be set. 300 for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { 301 if ot != t { 302 seccompCopy := newSeccomp.copy() 303 seccompCopy.populateCache(ot) 304 ot.seccomp.Store(seccompCopy) 305 } 306 } 307 } 308 309 return nil 310 } 311 312 // SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current 313 // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) 314 // and /proc/[pid]/status. 315 func (t *Task) SeccompMode() int { 316 if ts := t.seccomp.Load(); ts != nil && len(ts.filters) > 0 { 317 return linux.SECCOMP_MODE_FILTER 318 } 319 return linux.SECCOMP_MODE_NONE 320 }