github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_syscall.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 "os" 20 "runtime/trace" 21 22 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 23 "github.com/MerlinKodo/gvisor/pkg/bits" 24 "github.com/MerlinKodo/gvisor/pkg/errors" 25 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 26 "github.com/MerlinKodo/gvisor/pkg/hostarch" 27 "github.com/MerlinKodo/gvisor/pkg/marshal" 28 "github.com/MerlinKodo/gvisor/pkg/metric" 29 "github.com/MerlinKodo/gvisor/pkg/sentry/arch" 30 "github.com/MerlinKodo/gvisor/pkg/sentry/memmap" 31 "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck" 32 pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto" 33 "golang.org/x/sys/unix" 34 ) 35 36 // SyscallRestartBlock represents the restart block for a syscall restartable 37 // with a custom function. It encapsulates the state required to restart a 38 // syscall across a S/R. 39 type SyscallRestartBlock interface { 40 Restart(t *Task) (uintptr, error) 41 } 42 43 // SyscallControl is returned by syscalls to control the behavior of 44 // Task.doSyscallInvoke. 45 type SyscallControl struct { 46 // next is the state that the task goroutine should switch to. If next is 47 // nil, the task goroutine should continue to syscall exit as usual. 48 next taskRunState 49 50 // If ignoreReturn is true, Task.doSyscallInvoke should not store any value 51 // in the task's syscall return value register. 52 ignoreReturn bool 53 } 54 55 var ( 56 // CtrlDoExit is returned by the implementations of the exit and exit_group 57 // syscalls to enter the task exit path directly, skipping syscall exit 58 // tracing. 59 CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} 60 61 // ctrlStopAndReinvokeSyscall is returned by syscalls using the external 62 // feature before syscall execution. This causes Task.doSyscallInvoke 63 // to return runSyscallReinvoke, allowing Task.run to check for stops 64 // before immediately re-invoking the syscall (skipping the re-checking 65 // of seccomp filters and ptrace which would confuse userspace 66 // tracing). 67 ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} 68 69 // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at 70 // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather 71 // than tail-calling it, allowing stops to be checked before syscall exit. 72 ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} 73 ) 74 75 func (t *Task) invokeExternal() { 76 t.BeginExternalStop() 77 go func() { // S/R-SAFE: External control flow. 78 defer t.EndExternalStop() 79 t.SyscallTable().External(t.Kernel()) 80 }() 81 } 82 83 func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { 84 s := t.SyscallTable() 85 86 fe := s.FeatureEnable.Word(sysno) 87 88 var straceContext any 89 if bits.IsAnyOn32(fe, StraceEnableBits) { 90 straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) 91 } 92 93 if bits.IsAnyOn32(fe, SecCheckRawEnter) { 94 info := pb.Syscall{ 95 Sysno: uint64(sysno), 96 Arg1: args[0].Uint64(), 97 Arg2: args[1].Uint64(), 98 Arg3: args[2].Uint64(), 99 Arg4: args[3].Uint64(), 100 Arg5: args[4].Uint64(), 101 Arg6: args[5].Uint64(), 102 } 103 fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno)) 104 if !fields.Context.Empty() { 105 info.ContextData = &pb.ContextData{} 106 LoadSeccheckData(t, fields.Context, info.ContextData) 107 } 108 seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 109 return c.RawSyscall(t, fields, &info) 110 }) 111 } 112 if bits.IsAnyOn32(fe, SecCheckEnter) { 113 fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallEnter, sysno)) 114 var ctxData *pb.ContextData 115 if !fields.Context.Empty() { 116 ctxData = &pb.ContextData{} 117 LoadSeccheckData(t, fields.Context, ctxData) 118 } 119 info := SyscallInfo{ 120 Sysno: sysno, 121 Args: args, 122 } 123 cb := s.LookupSyscallToProto(sysno) 124 msg, msgType := cb(t, fields, ctxData, info) 125 seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 126 return c.Syscall(t, fields, ctxData, msgType, msg) 127 }) 128 } 129 130 if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { 131 t.invokeExternal() 132 // Ensure we check for stops, then invoke the syscall again. 133 ctrl = ctrlStopAndReinvokeSyscall 134 } else { 135 fn := s.Lookup(sysno) 136 var region *trace.Region // Only non-nil if tracing == true. 137 if trace.IsEnabled() { 138 region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) 139 } 140 if fn != nil { 141 // Call our syscall implementation. 142 rval, ctrl, err = fn(t, sysno, args) 143 } else { 144 // Use the missing function if not found. 145 rval, err = t.SyscallTable().Missing(t, sysno, args) 146 } 147 if region != nil { 148 region.End() 149 } 150 } 151 152 if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { 153 t.invokeExternal() 154 // Don't reinvoke the unix. 155 } 156 157 if bits.IsAnyOn32(fe, StraceEnableBits) { 158 s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) 159 } 160 161 if bits.IsAnyOn32(fe, SecCheckRawExit) { 162 info := pb.Syscall{ 163 Sysno: uint64(sysno), 164 Arg1: args[0].Uint64(), 165 Arg2: args[1].Uint64(), 166 Arg3: args[2].Uint64(), 167 Arg4: args[3].Uint64(), 168 Arg5: args[4].Uint64(), 169 Arg6: args[5].Uint64(), 170 Exit: &pb.Exit{ 171 Result: int64(rval), 172 Errorno: int64(ExtractErrno(err, int(sysno))), 173 }, 174 } 175 fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno)) 176 if !fields.Context.Empty() { 177 info.ContextData = &pb.ContextData{} 178 LoadSeccheckData(t, fields.Context, info.ContextData) 179 } 180 seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 181 return c.RawSyscall(t, fields, &info) 182 }) 183 } 184 if bits.IsAnyOn32(fe, SecCheckExit) { 185 fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallExit, sysno)) 186 var ctxData *pb.ContextData 187 if !fields.Context.Empty() { 188 ctxData = &pb.ContextData{} 189 LoadSeccheckData(t, fields.Context, ctxData) 190 } 191 info := SyscallInfo{ 192 Exit: true, 193 Sysno: sysno, 194 Args: args, 195 Rval: rval, 196 Errno: ExtractErrno(err, int(sysno)), 197 } 198 cb := s.LookupSyscallToProto(sysno) 199 msg, msgType := cb(t, fields, ctxData, info) 200 seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 201 return c.Syscall(t, fields, ctxData, msgType, msg) 202 }) 203 } 204 205 return 206 } 207 208 // doSyscall is the entry point for an invocation of a system call specified by 209 // the current state of t's registers. 210 // 211 // The syscall path is very hot; avoid defer. 212 func (t *Task) doSyscall() taskRunState { 213 // Save value of the register which is clobbered in the following 214 // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64. 215 // 216 // On x86, register rax was shared by syscall number and return 217 // value, and at the entry of the syscall handler, the rax was 218 // saved to regs.orig_rax which was exposed to userspace. 219 // But on arm64, syscall number was passed through X8, and the X0 220 // was shared by the first syscall argument and return value. The 221 // X0 was saved to regs.orig_x0 which was not exposed to userspace. 222 // So we have to do the same operation here to save the X0 value 223 // into the task context. 224 t.Arch().SyscallSaveOrig() 225 226 sysno := t.Arch().SyscallNo() 227 args := t.Arch().SyscallArgs() 228 229 // Tracers expect to see this between when the task traps into the kernel 230 // to perform a syscall and when the syscall is actually invoked. 231 // This useless-looking temporary is needed because Go. 232 tmp := uintptr(unix.ENOSYS) 233 t.Arch().SetReturn(-tmp) 234 235 // Check seccomp filters. The nil check is for performance (as seccomp use 236 // is rare), not needed for correctness. 237 if t.syscallFilters.Load() != nil { 238 switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r { 239 case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: 240 t.Debugf("Syscall %d: denied by seccomp", sysno) 241 return (*runSyscallExit)(nil) 242 case linux.SECCOMP_RET_ALLOW: 243 // ok 244 case linux.SECCOMP_RET_KILL_THREAD: 245 t.Debugf("Syscall %d: killed by seccomp", sysno) 246 t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) 247 return (*runExit)(nil) 248 case linux.SECCOMP_RET_TRACE: 249 t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) 250 return (*runSyscallAfterPtraceEventSeccomp)(nil) 251 default: 252 panic(fmt.Sprintf("Unknown seccomp result %d", r)) 253 } 254 } 255 256 syscallCounter.Increment() 257 return t.doSyscallEnter(sysno, args) 258 } 259 260 type runSyscallAfterPtraceEventSeccomp struct{} 261 262 func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { 263 if t.killed() { 264 // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - 265 // ptrace(2) 266 return (*runInterrupt)(nil) 267 } 268 sysno := t.Arch().SyscallNo() 269 // "The tracer can skip the system call by changing the syscall number to 270 // -1." - Documentation/prctl/seccomp_filter.txt 271 if sysno == ^uintptr(0) { 272 return (*runSyscallExit)(nil).execute(t) 273 } 274 args := t.Arch().SyscallArgs() 275 return t.doSyscallEnter(sysno, args) 276 } 277 278 func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { 279 if next, ok := t.ptraceSyscallEnter(); ok { 280 return next 281 } 282 return t.doSyscallInvoke(sysno, args) 283 } 284 285 // +stateify savable 286 type runSyscallAfterSyscallEnterStop struct{} 287 288 func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { 289 if sig := linux.Signal(t.ptraceCode); sig.IsValid() { 290 t.tg.signalHandlers.mu.Lock() 291 t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) 292 t.tg.signalHandlers.mu.Unlock() 293 } 294 if t.killed() { 295 return (*runInterrupt)(nil) 296 } 297 sysno := t.Arch().SyscallNo() 298 if sysno == ^uintptr(0) { 299 return (*runSyscallExit)(nil) 300 } 301 args := t.Arch().SyscallArgs() 302 303 return t.doSyscallInvoke(sysno, args) 304 } 305 306 // +stateify savable 307 type runSyscallAfterSysemuStop struct{} 308 309 func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { 310 if sig := linux.Signal(t.ptraceCode); sig.IsValid() { 311 t.tg.signalHandlers.mu.Lock() 312 t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) 313 t.tg.signalHandlers.mu.Unlock() 314 } 315 if t.killed() { 316 return (*runInterrupt)(nil) 317 } 318 return (*runSyscallExit)(nil).execute(t) 319 } 320 321 func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { 322 rval, ctrl, err := t.executeSyscall(sysno, args) 323 324 if ctrl != nil { 325 if !ctrl.ignoreReturn { 326 t.Arch().SetReturn(rval) 327 } 328 if ctrl.next != nil { 329 return ctrl.next 330 } 331 } else if err != nil { 332 t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) 333 t.haveSyscallReturn = true 334 } else { 335 t.Arch().SetReturn(rval) 336 } 337 338 return (*runSyscallExit)(nil).execute(t) 339 } 340 341 // +stateify savable 342 type runSyscallReinvoke struct{} 343 344 func (*runSyscallReinvoke) execute(t *Task) taskRunState { 345 if t.killed() { 346 // It's possible that since the last execution, the task has 347 // been forcible killed. Invoking the system call here could 348 // result in an infinite loop if it is again preempted by an 349 // external stop and reinvoked. 350 return (*runInterrupt)(nil) 351 } 352 353 sysno := t.Arch().SyscallNo() 354 args := t.Arch().SyscallArgs() 355 return t.doSyscallInvoke(sysno, args) 356 } 357 358 // +stateify savable 359 type runSyscallExit struct{} 360 361 func (*runSyscallExit) execute(t *Task) taskRunState { 362 t.ptraceSyscallExit() 363 return (*runApp)(nil) 364 } 365 366 // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as 367 // indicated by an execution fault at address addr. doVsyscall returns the 368 // task's next run state. 369 func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState { 370 metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeVsyscallCount) 371 372 // Grab the caller up front, to make sure there's a sensible stack. 373 caller := t.Arch().Native(uintptr(0)) 374 if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil { 375 t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) 376 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 377 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 378 return (*runApp)(nil) 379 } 380 381 // For _vsyscalls_, there is no need to translate System V calling convention 382 // to syscall ABI because they both use RDI, RSI, and RDX for the first three 383 // arguments and none of the vsyscalls uses more than two arguments. 384 args := t.Arch().SyscallArgs() 385 if t.syscallFilters.Load() != nil { 386 switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { 387 case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: 388 t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) 389 return (*runApp)(nil) 390 case linux.SECCOMP_RET_ALLOW: 391 // ok 392 case linux.SECCOMP_RET_TRACE: 393 t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) 394 return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} 395 case linux.SECCOMP_RET_KILL_THREAD: 396 t.Debugf("vsyscall %d: killed by seccomp", sysno) 397 t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) 398 return (*runExit)(nil) 399 default: 400 panic(fmt.Sprintf("Unknown seccomp result %d", r)) 401 } 402 } 403 404 return t.doVsyscallInvoke(sysno, args, caller) 405 } 406 407 type runVsyscallAfterPtraceEventSeccomp struct { 408 addr hostarch.Addr 409 sysno uintptr 410 caller marshal.Marshallable 411 } 412 413 func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { 414 if t.killed() { 415 return (*runInterrupt)(nil) 416 } 417 sysno := t.Arch().SyscallNo() 418 // "... the syscall may not be changed to another system call using the 419 // orig_rax register. It may only be changed to -1 order [sic] to skip the 420 // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - 421 // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip 422 // causes do_exit(SIGSYS), and changing sp is ignored. 423 if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr { 424 t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) 425 return (*runExit)(nil) 426 } 427 if sysno == ^uintptr(0) { 428 return (*runApp)(nil) 429 } 430 return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) 431 } 432 433 func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState { 434 rval, ctrl, err := t.executeSyscall(sysno, args) 435 if ctrl != nil { 436 t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) 437 // Set the return value. The stack has already been adjusted. 438 t.Arch().SetReturn(0) 439 } else if err == nil { 440 t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) 441 // Set the return value. The stack has already been adjusted. 442 t.Arch().SetReturn(uintptr(rval)) 443 } else { 444 t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) 445 if linuxerr.Equals(linuxerr.EFAULT, err) { 446 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 447 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 448 // A return is not emulated in this case. 449 return (*runApp)(nil) 450 } 451 t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) 452 } 453 t.Arch().SetIP(t.Arch().Value(caller)) 454 t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) 455 return (*runApp)(nil) 456 } 457 458 // ExtractErrno extracts an integer error number from the error. 459 // The syscall number is purely for context in the error case. Use -1 if 460 // syscall number is unknown. 461 func ExtractErrno(err error, sysno int) int { 462 switch err := err.(type) { 463 case nil: 464 return 0 465 case unix.Errno: 466 return int(err) 467 case *errors.Error: 468 return int(linuxerr.ToUnix(err)) 469 case *memmap.BusError: 470 // Bus errors may generate SIGBUS, but for syscalls they still 471 // return EFAULT. See case in task_run.go where the fault is 472 // handled (and the SIGBUS is delivered). 473 return int(unix.EFAULT) 474 case *os.PathError: 475 return ExtractErrno(err.Err, sysno) 476 case *os.LinkError: 477 return ExtractErrno(err.Err, sysno) 478 case *os.SyscallError: 479 return ExtractErrno(err.Err, sysno) 480 default: 481 if errno, ok := linuxerr.TranslateError(err); ok { 482 return int(linuxerr.ToUnix(errno)) 483 } 484 } 485 panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) 486 }