gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_syscall.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"os"
	"runtime/trace"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/errors"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/seccheck"
	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
)

// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
type SyscallRestartBlock interface {
	Restart(t *Task) (uintptr, error)
}

// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
type SyscallControl struct {
	// next is the state that the task goroutine should switch to. If next is
	// nil, the task goroutine should continue to syscall exit as usual.
	next taskRunState

	// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
	// in the task's syscall return value register.
	ignoreReturn bool
}

var (
	// CtrlDoExit is returned by the implementations of the exit and exit_group
	// syscalls to enter the task exit path directly, skipping syscall exit
	// tracing.
	CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}

	// ctrlStopAndReinvokeSyscall is returned by syscalls using the external
	// feature before syscall execution. This causes Task.doSyscallInvoke
	// to return runSyscallReinvoke, allowing Task.run to check for stops
	// before immediately re-invoking the syscall (skipping the re-checking
	// of seccomp filters and ptrace which would confuse userspace
	// tracing).
	ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}

	// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
	// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
	// than tail-calling it, allowing stops to be checked before syscall exit.
	ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
)
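
// How these control values are consumed: a syscall implementation has the
// SyscallFn signature used by SyscallTable.Lookup and may return a
// *SyscallControl alongside its result to divert the task goroutine. A sketch
// only (the function below is hypothetical, loosely modeled on the exit_group
// implementation in the syscalls package):
//
//	func exitGroup(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error) {
//		status := args[0].Int()
//		t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff))
//		return 0, CtrlDoExit, nil // enter the exit path directly, skipping syscall exit tracing
//	}
//
// Returning a nil *SyscallControl means "continue to syscall exit as usual".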

func (t *Task) invokeExternal() {
	t.BeginExternalStop()
	go func() { // S/R-SAFE: External control flow.
		defer t.EndExternalStop()
		t.SyscallTable().External(t.Kernel())
	}()
}

func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
	s := t.SyscallTable()

	fe := s.FeatureEnable.Word(sysno)

	var straceContext any
	if bits.IsAnyOn32(fe, StraceEnableBits) {
		straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
	}

	if bits.IsAnyOn32(fe, SecCheckRawEnter) {
		info := pb.Syscall{
			Sysno: uint64(sysno),
			Arg1:  args[0].Uint64(),
			Arg2:  args[1].Uint64(),
			Arg3:  args[2].Uint64(),
			Arg4:  args[3].Uint64(),
			Arg5:  args[4].Uint64(),
			Arg6:  args[5].Uint64(),
		}
		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
		if !fields.Context.Empty() {
			info.ContextData = &pb.ContextData{}
			LoadSeccheckData(t, fields.Context, info.ContextData)
		}
		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.RawSyscall(t, fields, &info)
		})
	}
	if bits.IsAnyOn32(fe, SecCheckEnter) {
		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallEnter, sysno))
		var ctxData *pb.ContextData
		if !fields.Context.Empty() {
			ctxData = &pb.ContextData{}
			LoadSeccheckData(t, fields.Context, ctxData)
		}
		info := SyscallInfo{
			Sysno: sysno,
			Args:  args,
		}
		cb := s.LookupSyscallToProto(sysno)
		msg, msgType := cb(t, fields, ctxData, info)
		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.Syscall(t, fields, ctxData, msgType, msg)
		})
	}

	if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
		t.invokeExternal()
		// Ensure we check for stops, then invoke the syscall again.
		ctrl = ctrlStopAndReinvokeSyscall
	} else {
		fn := s.Lookup(sysno)
		var region *trace.Region // Only non-nil if tracing == true.
		if trace.IsEnabled() {
			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
		}
		if fn != nil {
			// Call our syscall implementation.
			rval, ctrl, err = fn(t, sysno, args)
		} else {
			// Use the missing function if not found.
			rval, err = t.SyscallTable().Missing(t, sysno, args)
		}
		if region != nil {
			region.End()
		}
	}

	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
		t.invokeExternal()
		// Don't reinvoke the syscall.
	}

	if bits.IsAnyOn32(fe, StraceEnableBits) {
		s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
	}

	if bits.IsAnyOn32(fe, SecCheckRawExit) {
		info := pb.Syscall{
			Sysno: uint64(sysno),
			Arg1:  args[0].Uint64(),
			Arg2:  args[1].Uint64(),
			Arg3:  args[2].Uint64(),
			Arg4:  args[3].Uint64(),
			Arg5:  args[4].Uint64(),
			Arg6:  args[5].Uint64(),
			Exit: &pb.Exit{
				Result:  int64(rval),
				Errorno: int64(ExtractErrno(err, int(sysno))),
			},
		}
		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno))
		if !fields.Context.Empty() {
			info.ContextData = &pb.ContextData{}
			LoadSeccheckData(t, fields.Context, info.ContextData)
		}
		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.RawSyscall(t, fields, &info)
		})
	}
	if bits.IsAnyOn32(fe, SecCheckExit) {
		fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallExit, sysno))
		var ctxData *pb.ContextData
		if !fields.Context.Empty() {
			ctxData = &pb.ContextData{}
			LoadSeccheckData(t, fields.Context, ctxData)
		}
		info := SyscallInfo{
			Exit:  true,
			Sysno: sysno,
			Args:  args,
			Rval:  rval,
			Errno: ExtractErrno(err, int(sysno)),
		}
		cb := s.LookupSyscallToProto(sysno)
		msg, msgType := cb(t, fields, ctxData, info)
		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.Syscall(t, fields, ctxData, msgType, msg)
		})
	}

	return
}
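
// Ordering note for executeSyscall above: the strace and seccheck "enter"
// hooks fire before the syscall runs, and the matching "exit" hooks fire after
// it returns, regardless of which branch produced the result (a registered
// SyscallFn, the table's Missing fallback, or the external handler).
// Schematically:
//
//	strace enter -> seccheck raw enter -> seccheck enter
//		-> invoke syscall (or divert to the external-before handler)
//		-> optional external-after handler
//	strace exit -> seccheck raw exit -> seccheck exit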

// doSyscall is the entry point for an invocation of a system call specified by
// the current state of t's registers.
//
// The syscall path is very hot; avoid defer.
func (t *Task) doSyscall() taskRunState {
	// Save the value of the register that is clobbered by the
	// t.Arch().SetReturn(-ENOSYS) operation below. This is only needed on
	// arm64.
	//
	// On x86, register rax is shared by the syscall number and the return
	// value; at syscall entry, rax is saved to regs.orig_rax, which is
	// exposed to userspace. On arm64, the syscall number is passed in X8,
	// and X0 is shared by the first syscall argument and the return value;
	// X0 is saved to regs.orig_x0, which is not exposed to userspace. So we
	// have to perform the equivalent save of X0 into the task context here.
	t.Arch().SyscallSaveOrig()

	sysno := t.Arch().SyscallNo()
	args := t.Arch().SyscallArgs()

	// Tracers expect to see this between when the task traps into the kernel
	// to perform a syscall and when the syscall is actually invoked.
	// This useless-looking temporary is needed because Go.
	tmp := uintptr(unix.ENOSYS)
	t.Arch().SetReturn(-tmp)

	// Check seccomp filters. The nil check is for performance (as seccomp use
	// is rare), not needed for correctness.
	if t.seccomp.Load() != nil {
		switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
			t.Debugf("Syscall %d: denied by seccomp", sysno)
			return (*runSyscallExit)(nil)
		case linux.SECCOMP_RET_ALLOW:
			// ok
		case linux.SECCOMP_RET_KILL_THREAD:
			t.Debugf("Syscall %d: killed by seccomp", sysno)
			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
			return (*runExit)(nil)
		case linux.SECCOMP_RET_TRACE:
			t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
			return (*runSyscallAfterPtraceEventSeccomp)(nil)
		default:
			panic(fmt.Sprintf("Unknown seccomp result %d", r))
		}
	}

	syscallCounter.Increment()
	return t.doSyscallEnter(sysno, args)
}
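
// Why the temporary above is needed: unix.ENOSYS is a typed constant, so
// negating it (or its uintptr conversion) directly is a compile-time constant
// expression, and a negative constant does not fit in uintptr:
//
//	t.Arch().SetReturn(-uintptr(unix.ENOSYS)) // rejected: constant -38 overflows uintptr
//
// Copying the value into a variable first makes the negation a runtime
// operation, which wraps around and stores the -ENOSYS bit pattern that
// tracers expect to find in the return register.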

type runSyscallAfterPtraceEventSeccomp struct{}

func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
		// ptrace(2)
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	// "The tracer can skip the system call by changing the syscall number to
	// -1." - Documentation/prctl/seccomp_filter.txt
	if sysno == ^uintptr(0) {
		return (*runSyscallExit)(nil).execute(t)
	}
	args := t.Arch().SyscallArgs()
	return t.doSyscallEnter(sysno, args)
}

func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
	if next, ok := t.ptraceSyscallEnter(); ok {
		return next
	}
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallAfterSyscallEnterStop struct{}

func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
		t.tg.signalHandlers.mu.Lock()
		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
		t.tg.signalHandlers.mu.Unlock()
	}
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	if sysno == ^uintptr(0) {
		return (*runSyscallExit)(nil)
	}
	args := t.Arch().SyscallArgs()

	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallAfterSysemuStop struct{}

func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
		t.tg.signalHandlers.mu.Lock()
		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
		t.tg.signalHandlers.mu.Unlock()
	}
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	return (*runSyscallExit)(nil).execute(t)
}

func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)

	if ctrl != nil {
		if !ctrl.ignoreReturn {
			t.Arch().SetReturn(rval)
		}
		if ctrl.next != nil {
			return ctrl.next
		}
	} else if err != nil {
		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
		t.haveSyscallReturn = true
	} else {
		t.Arch().SetReturn(rval)
	}

	return (*runSyscallExit)(nil).execute(t)
}
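
// The error path in doSyscallInvoke above stores the negated errno in the
// return register, matching the Linux user/kernel ABI. For example
// (illustrative values, 64-bit registers):
//
//	err := linuxerr.EBADF                               // errno 9
//	t.Arch().SetReturn(uintptr(-ExtractErrno(err, -1))) // register holds 0xfffffffffffffff7
//
// Userspace libc treats return values in the [-4095, -1] range as errors,
// negates them back into errno, and returns -1 to the application.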

// +stateify savable
type runSyscallReinvoke struct{}

func (*runSyscallReinvoke) execute(t *Task) taskRunState {
	if t.killed() {
		// It's possible that since the last execution, the task has
		// been forcibly killed. Invoking the system call here could
		// result in an infinite loop if it is again preempted by an
		// external stop and reinvoked.
		return (*runInterrupt)(nil)
	}

	sysno := t.Arch().SyscallNo()
	args := t.Arch().SyscallArgs()
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallExit struct{}

func (*runSyscallExit) execute(t *Task) taskRunState {
	t.ptraceSyscallExit()
	return (*runApp)(nil)
}

// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
// indicated by an execution fault at address addr. doVsyscall returns the
// task's next run state.
func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
	metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeVsyscallCount)

	// Grab the caller up front, to make sure there's a sensible stack.
	caller := t.Arch().Native(uintptr(0))
	if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return (*runApp)(nil)
	}

	// For _vsyscalls_, there is no need to translate System V calling convention
	// to syscall ABI because they both use RDI, RSI, and RDX for the first three
	// arguments and none of the vsyscalls uses more than two arguments.
	args := t.Arch().SyscallArgs()
	if t.seccomp.Load() != nil {
		switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
			t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
			return (*runApp)(nil)
		case linux.SECCOMP_RET_ALLOW:
			// ok
		case linux.SECCOMP_RET_TRACE:
			t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
			return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
		case linux.SECCOMP_RET_KILL_THREAD:
			t.Debugf("vsyscall %d: killed by seccomp", sysno)
			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
			return (*runExit)(nil)
		default:
			panic(fmt.Sprintf("Unknown seccomp result %d", r))
		}
	}

	return t.doVsyscallInvoke(sysno, args, caller)
}
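
// The caller value captured in doVsyscall above is the return address that the
// application's CALL pushed when it jumped into the legacy vsyscall page;
// executing that page faults, which is how control reaches the sentry with
// addr and sysno. After emulating the syscall, doVsyscallInvoke below finishes
// emulating the CALL by restoring the saved return address and popping it off
// the stack:
//
//	t.Arch().SetIP(t.Arch().Value(caller))
//	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))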

type runVsyscallAfterPtraceEventSeccomp struct {
	addr   hostarch.Addr
	sysno  uintptr
	caller marshal.Marshallable
}

func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	// "... the syscall may not be changed to another system call using the
	// orig_rax register. It may only be changed to -1 order [sic] to skip the
	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
	// causes do_exit(SIGSYS), and changing sp is ignored.
	if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
		t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
		return (*runExit)(nil)
	}
	if sysno == ^uintptr(0) {
		return (*runApp)(nil)
	}
	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}

func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)
	if ctrl != nil {
		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
		// Set the return value. The stack has already been adjusted.
		t.Arch().SetReturn(0)
	} else if err == nil {
		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
		// Set the return value. The stack has already been adjusted.
		t.Arch().SetReturn(uintptr(rval))
	} else {
		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
		if linuxerr.Equals(linuxerr.EFAULT, err) {
			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
			// A return is not emulated in this case.
			return (*runApp)(nil)
		}
		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
	}
	t.Arch().SetIP(t.Arch().Value(caller))
	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
	return (*runApp)(nil)
}

// ExtractErrno extracts an integer error number from the error.
// The syscall number is purely for context in the error case. Use -1 if the
// syscall number is unknown.
func ExtractErrno(err error, sysno int) int {
	switch err := err.(type) {
	case nil:
		return 0
	case unix.Errno:
		return int(err)
	case *errors.Error:
		return int(linuxerr.ToUnix(err))
	case *memmap.BusError:
		// Bus errors may generate SIGBUS, but for syscalls they still
		// return EFAULT. See case in task_run.go where the fault is
		// handled (and the SIGBUS is delivered).
		return int(unix.EFAULT)
	case *os.PathError:
		return ExtractErrno(err.Err, sysno)
	case *os.LinkError:
		return ExtractErrno(err.Err, sysno)
	case *os.SyscallError:
		return ExtractErrno(err.Err, sysno)
	case *platform.ContextError:
		return int(err.Errno)
	default:
		if errno, ok := linuxerr.TranslateError(err); ok {
			return int(linuxerr.ToUnix(errno))
		}
	}
	panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
}
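
// For illustration, some representative ExtractErrno results (values assume
// the usual Linux errno numbers):
//
//	ExtractErrno(nil, -1)                                         // 0
//	ExtractErrno(linuxerr.ENOENT, -1)                             // 2 (ENOENT)
//	ExtractErrno(&os.PathError{Op: "open", Err: unix.EACCES}, -1) // 13 (EACCES), unwrapped recursively
//
// Any error that reaches the default case without a linuxerr translation is
// treated as a sentry bug, hence the panic.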