// Source: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_syscall.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"os"
	"runtime/trace"

	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/bits"
	"github.com/SagerNet/gvisor/pkg/errors"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/marshal"
	"github.com/SagerNet/gvisor/pkg/metric"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
type SyscallRestartBlock interface {
	// Restart restarts the syscall on behalf of t.
	//
	// NOTE(review): the (uintptr, error) result presumably mirrors the
	// return value and error of a syscall implementation; confirm against
	// the callers of Restart.
	Restart(t *Task) (uintptr, error)
}

// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
type SyscallControl struct {
	// next is the state that the task goroutine should switch to. If next is
	// nil, the task goroutine should continue to syscall exit as usual.
	next taskRunState

	// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
	// in the task's syscall return value register.
	ignoreReturn bool
}

var (
	// CtrlDoExit is returned by the implementations of the exit and exit_group
	// syscalls to enter the task exit path directly, skipping syscall exit
	// tracing.
	CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}

	// ctrlStopAndReinvokeSyscall is returned by syscalls using the external
	// feature before syscall execution. This causes Task.doSyscallInvoke
	// to return runSyscallReinvoke, allowing Task.run to check for stops
	// before immediately re-invoking the syscall (skipping the re-checking
	// of seccomp filters and ptrace which would confuse userspace
	// tracing).
	ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}

	// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
	// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
	// than tail-calling it, allowing stops to be checked before syscall exit.
	ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
)

// invokeExternal begins an external stop on t and then runs the syscall
// table's External hook, passing it t's kernel, on a separate goroutine. The
// external stop is ended when that goroutine's function returns.
func (t *Task) invokeExternal() {
	t.BeginExternalStop()
	go func() { // S/R-SAFE: External control flow.
		defer t.EndExternalStop()
		t.SyscallTable().External(t.Kernel())
	}()
}

// executeSyscall performs one invocation of syscall sysno with arguments
// args, wrapped by whichever per-syscall features are enabled in the syscall
// table's FeatureEnable word for sysno:
//
//   - strace: Stracer.SyscallEnter is called first and Stracer.SyscallExit
//     last, when any of StraceEnableBits is set.
//   - external-before: when ExternalBeforeEnable is set and
//     ExternalFilterBefore is nil or returns true, the external hook is
//     started and the syscall implementation is NOT called on this pass.
//     ctrl is set to ctrlStopAndReinvokeSyscall, and rval and err keep their
//     zero values.
//   - external-after: when ExternalAfterEnable is set and ExternalFilterAfter
//     is nil or returns true, the external hook is started after the call.
//
// Otherwise the implementation found by Lookup is called; if there is none,
// the table's Missing handler is used instead (ctrl stays nil in that case).
// The results are returned through the named result parameters.
func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
	s := t.SyscallTable()

	// fe is the set of feature bits enabled for this syscall number.
	fe := s.FeatureEnable.Word(sysno)

	// straceContext carries whatever SyscallEnter returned through to the
	// matching SyscallExit call below; it stays nil when strace is disabled.
	var straceContext interface{}
	if bits.IsAnyOn32(fe, StraceEnableBits) {
		straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
	}

	if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
		t.invokeExternal()
		// Ensure we check for stops, then invoke the syscall again.
		ctrl = ctrlStopAndReinvokeSyscall
	} else {
		fn := s.Lookup(sysno)
		var region *trace.Region // Only non-nil if tracing == true.
		if trace.IsEnabled() {
			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
		}
		if fn != nil {
			// Call our syscall implementation.
			rval, ctrl, err = fn(t, args)
		} else {
			// Use the missing function if not found.
			rval, err = t.SyscallTable().Missing(t, sysno, args)
		}
		// Ended explicitly rather than via defer; doSyscall documents
		// that defer is avoided on this path.
		if region != nil {
			region.End()
		}
	}

	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
		t.invokeExternal()
		// Don't reinvoke the syscall.
	}

	if bits.IsAnyOn32(fe, StraceEnableBits) {
		s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
	}

	return
}

// doSyscall is the entry point for an invocation of a system call specified by
// the current state of t's registers.
//
// It returns the task's next run state: syscall exit when seccomp denies the
// call, task exit when seccomp kills the thread, a PTRACE_EVENT_SECCOMP
// continuation when seccomp requests tracing, and otherwise whatever
// doSyscallEnter returns.
//
// The syscall path is very hot; avoid defer.
func (t *Task) doSyscall() taskRunState {
	// Save value of the register which is clobbered in the following
	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
	//
	// On x86, register rax was shared by syscall number and return
	// value, and at the entry of the syscall handler, the rax was
	// saved to regs.orig_rax which was exposed to userspace.
	// But on arm64, syscall number was passed through X8, and the X0
	// was shared by the first syscall argument and return value. The
	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
	// So we have to do the same operation here to save the X0 value
	// into the task context.
	t.Arch().SyscallSaveOrig()

	// Read the syscall number and arguments before the return register is
	// overwritten below.
	sysno := t.Arch().SyscallNo()
	args := t.Arch().SyscallArgs()

	// Tracers expect to see this between when the task traps into the kernel
	// to perform a syscall and when the syscall is actually invoked.
	// This useless-looking temporary is needed because Go.
	tmp := uintptr(unix.ENOSYS)
	t.Arch().SetReturn(-tmp)

	// Check seccomp filters. The nil check is for performance (as seccomp use
	// is rare), not needed for correctness.
	if t.syscallFilters.Load() != nil {
		switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
			t.Debugf("Syscall %d: denied by seccomp", sysno)
			return (*runSyscallExit)(nil)
		case linux.SECCOMP_RET_ALLOW:
			// ok
		case linux.SECCOMP_RET_KILL_THREAD:
			t.Debugf("Syscall %d: killed by seccomp", sysno)
			t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
			return (*runExit)(nil)
		case linux.SECCOMP_RET_TRACE:
			t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
			return (*runSyscallAfterPtraceEventSeccomp)(nil)
		default:
			panic(fmt.Sprintf("Unknown seccomp result %d", r))
		}
	}

	return t.doSyscallEnter(sysno, args)
}

// runSyscallAfterPtraceEventSeccomp is the run state that doSyscall returns
// when seccomp yields SECCOMP_RET_TRACE.
type runSyscallAfterPtraceEventSeccomp struct{}

// execute resumes syscall entry after the PTRACE_EVENT_SECCOMP stop. The
// syscall number and arguments are re-read from t's registers rather than
// reused from doSyscall; a syscall number of -1 skips the call and goes
// straight to syscall exit.
func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
		// ptrace(2)
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	// "The tracer can skip the system call by changing the syscall number to
	// -1." - Documentation/prctl/seccomp_filter.txt
	if sysno == ^uintptr(0) {
		return (*runSyscallExit)(nil).execute(t)
	}
	args := t.Arch().SyscallArgs()
	return t.doSyscallEnter(sysno, args)
}

// doSyscallEnter gives ptrace the chance to intercept syscall entry: if
// t.ptraceSyscallEnter reports ok, the run state it supplies is returned and
// the syscall is not invoked here. Otherwise the syscall is invoked
// immediately via doSyscallInvoke.
func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
	if next, ok := t.ptraceSyscallEnter(); ok {
		return next
	}
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallAfterSyscallEnterStop struct{}

// execute continues a syscall after a syscall-enter stop. If t.ptraceCode is
// a valid signal number, that signal is first sent to t (not to its thread
// group). The syscall number and arguments are then re-read from t's
// registers, and the syscall is invoked unless the task has been killed or
// the syscall number is -1.
func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
		t.tg.signalHandlers.mu.Lock()
		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
		t.tg.signalHandlers.mu.Unlock()
	}
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	if sysno == ^uintptr(0) {
		// Syscall number -1: skip the call. Note that runSyscallExit is
		// returned as the next state here, not executed inline.
		return (*runSyscallExit)(nil)
	}
	args := t.Arch().SyscallArgs()

	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallAfterSysemuStop struct{}

// execute continues after a sysemu stop. If t.ptraceCode is a valid signal
// number, that signal is first sent to t (not to its thread group). Unless
// the task has been killed, execution then proceeds directly to syscall exit;
// the syscall itself is never invoked from this state.
func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
		t.tg.signalHandlers.mu.Lock()
		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
		t.tg.signalHandlers.mu.Unlock()
	}
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	return (*runSyscallExit)(nil).execute(t)
}

// doSyscallInvoke executes syscall sysno with args and stores the outcome in
// t's syscall return register:
//
//   - If the syscall returned a SyscallControl, rval is stored unless
//     ctrl.ignoreReturn is set, and ctrl.next (when non-nil) is returned as
//     the next run state. err is not consulted in this case.
//   - Otherwise, if err is non-nil, the negated errno extracted from err is
//     stored and t.haveSyscallReturn is set.
//   - Otherwise rval is stored.
//
// Unless a control's next state was returned, syscall exit is executed inline
// and its resulting state is returned.
func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)

	if ctrl != nil {
		if !ctrl.ignoreReturn {
			t.Arch().SetReturn(rval)
		}
		if ctrl.next != nil {
			return ctrl.next
		}
	} else if err != nil {
		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
		// NOTE(review): haveSyscallReturn is set only on this error path;
		// presumably it is consumed by syscall-restart handling elsewhere —
		// confirm against its readers.
		t.haveSyscallReturn = true
	} else {
		t.Arch().SetReturn(rval)
	}

	return (*runSyscallExit)(nil).execute(t)
}
// +stateify savable
type runSyscallReinvoke struct{}

// execute re-invokes the syscall currently described by t's registers,
// going straight to doSyscallInvoke (it does not go through doSyscall or
// doSyscallEnter). If the task has been killed in the meantime, it returns
// runInterrupt instead.
func (*runSyscallReinvoke) execute(t *Task) taskRunState {
	if t.killed() {
		// It's possible that since the last execution, the task has
		// been forcibly killed. Invoking the system call here could
		// result in an infinite loop if it is again preempted by an
		// external stop and reinvoked.
		return (*runInterrupt)(nil)
	}

	sysno := t.Arch().SyscallNo()
	args := t.Arch().SyscallArgs()
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallExit struct{}

// execute performs syscall exit: it calls t.ptraceSyscallExit and then
// returns runApp as the next run state.
func (*runSyscallExit) execute(t *Task) taskRunState {
	t.ptraceSyscallExit()
	return (*runApp)(nil)
}

// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
// indicated by an execution fault at address addr. doVsyscall returns the
// task's next run state.
//
// If the return address cannot be read from the top of the stack, SIGSEGV is
// sent to the task and the syscall is not emulated. When seccomp filters are
// installed they are evaluated with addr as the instruction pointer.
func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
	metric.WeirdnessMetric.Increment("vsyscall_count")

	// Grab the caller up front, to make sure there's a sensible stack.
	caller := t.Arch().Native(uintptr(0))
	if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return (*runApp)(nil)
	}

	// For _vsyscalls_, there is no need to translate System V calling convention
	// to syscall ABI because they both use RDI, RSI, and RDX for the first three
	// arguments and none of the vsyscalls uses more than two arguments.
	args := t.Arch().SyscallArgs()
	if t.syscallFilters.Load() != nil {
		switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
			t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
			return (*runApp)(nil)
		case linux.SECCOMP_RET_ALLOW:
			// ok
		case linux.SECCOMP_RET_TRACE:
			t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
			// Carry addr, sysno and caller across the stop so they can
			// be validated and reused when the task resumes.
			return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
		case linux.SECCOMP_RET_KILL_THREAD:
			t.Debugf("vsyscall %d: killed by seccomp", sysno)
			t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
			return (*runExit)(nil)
		default:
			panic(fmt.Sprintf("Unknown seccomp result %d", r))
		}
	}

	return t.doVsyscallInvoke(sysno, args, caller)
}

// runVsyscallAfterPtraceEventSeccomp is the run state that doVsyscall returns
// when seccomp yields SECCOMP_RET_TRACE for a vsyscall. It records the
// vsyscall address, the syscall number and the caller's return address as
// they were when the stop was initiated.
type runVsyscallAfterPtraceEventSeccomp struct {
	addr   hostarch.Addr
	sysno  uintptr
	caller marshal.Marshallable
}

// execute resumes a vsyscall after the PTRACE_EVENT_SECCOMP stop. The task
// exits with SIGSYS if the syscall number was changed to anything other than
// -1, or if the instruction pointer no longer equals the recorded vsyscall
// address. A syscall number of -1 skips the emulated call.
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	// "... the syscall may not be changed to another system call using the
	// orig_rax register. It may only be changed to -1 order [sic] to skip the
	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
	// causes do_exit(SIGSYS), and changing sp is ignored.
	if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
		t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
		return (*runExit)(nil)
	}
	if sysno == ^uintptr(0) {
		return (*runApp)(nil)
	}
	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}

// doVsyscallInvoke executes syscall sysno with args on behalf of a vsyscall
// and then emulates the function return to caller, the return address that
// doVsyscall read from the top of the stack.
//
// The return register is set to 0 if the syscall returned a SyscallControl
// (its next and ignoreReturn fields are not acted on here), to rval on
// success, and to the negated errno on failure. The exception is EFAULT: in
// that case SIGSEGV is sent and no return is emulated.
func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)
	if ctrl != nil {
		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
		// Set the return value. The stack has already been adjusted.
		// NOTE(review): in this function the stack pointer is actually
		// advanced below, after the return value is set — confirm wording.
		t.Arch().SetReturn(0)
	} else if err == nil {
		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
		// Set the return value. The stack has already been adjusted.
		t.Arch().SetReturn(uintptr(rval))
	} else {
		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
		if linuxerr.Equals(linuxerr.EFAULT, err) {
			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
			// A return is not emulated in this case.
			return (*runApp)(nil)
		}
		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
	}
	// Emulate the return: resume at the caller's return address and move the
	// stack pointer past the word it was read from.
	t.Arch().SetIP(t.Arch().Value(caller))
	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
	return (*runApp)(nil)
}

// ExtractErrno extracts an integer error number from the error.
// The syscall number is purely for context in the error case. Use -1 if
// syscall number is unknown.
378 func ExtractErrno(err error, sysno int) int { 379 switch err := err.(type) { 380 case nil: 381 return 0 382 case unix.Errno: 383 return int(err) 384 case *errors.Error: 385 return int(err.Errno()) 386 case syserror.SyscallRestartErrno: 387 return int(err) 388 case *memmap.BusError: 389 // Bus errors may generate SIGBUS, but for syscalls they still 390 // return EFAULT. See case in task_run.go where the fault is 391 // handled (and the SIGBUS is delivered). 392 return int(unix.EFAULT) 393 case *os.PathError: 394 return ExtractErrno(err.Err, sysno) 395 case *os.LinkError: 396 return ExtractErrno(err.Err, sysno) 397 case *os.SyscallError: 398 return ExtractErrno(err.Err, sysno) 399 default: 400 if errno, ok := syserror.TranslateError(err); ok { 401 return int(errno) 402 } 403 } 404 panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) 405 }