github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/subprocess.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systrap

import (
	"fmt"
	"os"
	"runtime"
	"sync"
	"sync/atomic"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/pool"
	"github.com/MerlinKodo/gvisor/pkg/seccomp"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/usertrap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
	"golang.org/x/sys/unix"
)

var (
	// globalPool tracks all subprocesses in various states: active or
	// available for reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the initial attempt link address for the stub.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because:
	//   * we don't want to reveal a stub address;
	//   * it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)

// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// These constants are only used in subprocess.go.
const (
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)

// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	tgid int32
	tid  int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers
}

// requestThread is used to request a new sysmsg thread. A thread identifier
// will be sent into the thread channel.
type requestThread struct {
	thread chan *thread
}
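// A typical exchange (sketch, mirroring newThread below): the caller builds a
// request, sends it on subprocess.requests, and receives the reply on the
// request's channel; the goroutine started in newSubprocess services it via
// handlePtraceSyscallRequest:
//
//	var r requestThread
//	r.thread = make(chan *thread)
//	s.requests <- r
//	t := <-r.thread // the new traced thread, already detached
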
// requestStub is used to request a new stub process.
type requestStub struct {
	done chan *thread
}

// maxSysmsgThreads specifies the maximum number of system threads that a
// subprocess can create in context decoupled mode.
// TODO(b/268366549): Replace maxSystemThreads below.
var maxSysmsgThreads = runtime.GOMAXPROCS(0)

const (
	// maxSystemThreads specifies the maximum number of system threads that a
	// subprocess may create in order to process the contexts.
	maxSystemThreads = 4096
	// maxGuestContexts specifies the maximum number of task contexts that a
	// subprocess can handle.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being worked
	// on by any sysmsg thread.
	invalidThreadID uint32 = 0xfefefefe
)

// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// context.lastFaultSP == this subprocess.
	faultedContexts map[*context]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the ThreadContext memory region start
	// within the sentry address space.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	syscallThreadMu sync.Mutex
	syscallThread   *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads.
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use a
	// counter instead of len(sysmsgThreads) because we need to synchronize
	// how many threads get created _before_ the creation happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch back
	// to user mode.
	contextQueue *contextQueue
}

func (s *subprocess) initSyscallThread(ptraceThread *thread) error {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	id, ok := s.sysmsgStackPool.Get()
	if !ok {
		panic("unable to allocate a sysmsg stub thread")
	}

	ptraceThread.sysmsgStackID = id
	t := syscallThread{
		subproc: s,
		thread:  ptraceThread,
	}

	if err := t.init(); err != nil {
		panic(fmt.Sprintf("failed to create a syscall thread: %v", err))
	}
	s.syscallThread = &t

	s.syscallThread.detach()

	return nil
}
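// Note that initSyscallThread leaves the syscall thread detached: once
// attached, ptrace only accepts requests from the tracer's own OS thread,
// and sentry goroutines migrate between OS threads. handlePtraceSyscallRequest
// therefore re-attaches from a locked OS thread for the duration of each
// request and detaches again afterwards.
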
// handlePtraceSyscallRequest executes system calls that can't be run via
// syscallThread without using ptrace. Look at the description of syscallThread
// to get more details about its limitations.
func (s *subprocess) handlePtraceSyscallRequest(req any) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	s.syscallThread.attach()
	defer s.syscallThread.detach()

	ptraceThread := s.syscallThread.thread

	switch r := req.(type) {
	case requestThread:
		t, err := ptraceThread.clone()
		if err != nil {
			// Should not happen: not recoverable.
			panic(fmt.Sprintf("error creating new thread: %v", err))
		}

		// Since the new thread was created with clone(CLONE_PTRACE),
		// it will begin execution with SIGSTOP pending and with this
		// thread as its tracer. (Hopefully nobody tgkilled it with a
		// signal < SIGSTOP before the SIGSTOP was delivered, in which
		// case that signal would be delivered before SIGSTOP.)
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
		}

		t.initRegs = ptraceThread.initRegs
		// Set the parent death signal to SIGKILL.
		_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
			arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
			arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
		)
		if err != nil {
			panic(fmt.Sprintf("prctl: %v", err))
		}

		id, ok := s.sysmsgStackPool.Get()
		if !ok {
			panic("unable to allocate a sysmsg stub thread")
		}
		t.sysmsgStackID = id

		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
			panic(fmt.Sprintf("tgkill failed: %v", e))
		}

		// Detach the thread.
		t.detach()

		// Return the thread.
		r.thread <- t
	case requestStub:
		t, err := ptraceThread.createStub()
		if err != nil {
			panic(fmt.Sprintf("unable to create a stub process: %s", err))
		}
		r.done <- t
	}
}
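// newSubprocess below is typically invoked with a create function that forks
// the initial stub process. A minimal sketch of the call, assuming a
// createStub helper that forks a stub and returns its first traced thread:
//
//	sp, err := newSubprocess(func() (*thread, error) {
//		return createStub()
//	}, memoryFile)
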
// newSubprocess returns a usable subprocess.
//
// This will either be a newly created subprocess, or one from the global
// pool. The create function will be called in the former case, which is
// guaranteed to happen with the runtime thread locked.
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile) (*subprocess, error) {
	if sp := globalPool.fetchAvailable(); sp != nil {
		sp.subprocessRefs.InitRefs()
		sp.usertrap = usertrap.New()
		return sp, nil
	}

	// The following goroutine is responsible for creating the first traced
	// thread, and responding to requests to make additional threads in the
	// traced process. Note that the requests channel is never closed:
	// Release below does not kill the subprocess, but returns it to the
	// globalPool instead.
	requests := make(chan any)

	// Ready.
	sp := &subprocess{
		requests:          requests,
		faultedContexts:   make(map[*context]struct{}),
		sysmsgStackPool:   pool.Pool{Start: 0, Limit: maxSystemThreads},
		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
		memoryFile:        memoryFile,
		sysmsgThreads:     make(map[uint32]*sysmsgThread),
	}
	sp.subprocessRefs.InitRefs()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Initialize the syscall thread.
	ptraceThread, err := create()
	if err != nil {
		return nil, err
	}
	sp.sysmsgInitRegs = ptraceThread.initRegs

	if err := sp.initSyscallThread(ptraceThread); err != nil {
		return nil, err
	}

	go func() { // S/R-SAFE: Platform-related.
		// Wait for requests to create threads.
		for req := range requests {
			sp.handlePtraceSyscallRequest(req)
		}

		// Requests should never be closed.
		panic("unreachable")
	}()

	sp.unmap()
	sp.usertrap = usertrap.New()
	sp.mapSharedRegions()
	sp.mapPrivateRegions()

	// Create the initial sysmsg thread.
	atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
	if err := sp.createSysmsgThread(); err != nil {
		return nil, err
	}
	sp.numSysmsgThreads++

	return sp, nil
}
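// Shared-region layout (sketch): both regions mapped below are backed by the
// sentry's pgalloc.MemoryFile and are mapped twice, once into the sentry at a
// kernel-chosen address and once into the stub at a fixed address:
//
//	sentry address space               stub address space
//	contextQueue (any address)     <-> stubContextQueueRegion (MAP_FIXED)
//	threadContextRegion (any)      <-> stubContextRegion      (MAP_FIXED)
//
// The fixed stub-side addresses are what let sysmsg code locate these regions
// without relocation.
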
// mapSharedRegions maps the shared regions that are used between the
// subprocess and ALL of the subsequently created sysmsg threads into both the
// sentry and the syscall thread.
//
// Should be called before any sysmsg threads are created.
// Initializes s.contextQueue and s.threadContextRegion.
func (s *subprocess) mapSharedRegions() {
	if s.contextQueue != nil || s.threadContextRegion != 0 {
		panic("contextQueue or threadContextRegion was already initialized")
	}

	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}

	// Map the context queue region into the sentry.
	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
	contextQueue.init()

	// Map the context queue region into the syscall thread.
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
	}

	s.contextQueue = contextQueue

	// Map the thread context region into the sentry.
	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
	if err != nil {
		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region: %v", err))
	}
	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		uintptr(threadContextFR.Length()),
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
	}

	// Map the thread context region into the syscall thread.
	if _, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
		panic(fmt.Sprintf("failed to mmap thread context region into syscall thread: %v", err))
	}

	s.threadContextRegion = sentryThreadContextRegionAddr
}

func (s *subprocess) mapPrivateRegions() {
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: 0},
		arch.SyscallArgument{Value: 0})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
	}
}

// unmap unmaps non-stub regions of the process.
//
// This will panic on failure (which should never happen).
func (s *subprocess) unmap() {
	s.Unmap(0, uint64(stubStart))
	if maximumUserAddress != stubEnd {
		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
	}
}

// Release kills the subprocess.
//
// Just kidding! We can't safely coordinate the detaching of all the tracees
// (since the tracers are random runtime threads, and the process won't exit
// until tracers have been notified).
//
// Therefore we simply unmap everything in the subprocess and return it to the
// globalPool. This has the added benefit of reducing creation time for new
// subprocesses.
func (s *subprocess) Release() {
	s.unmap()
	s.DecRef(s.release)
}

// release returns the subprocess to the global pool.
func (s *subprocess) release() {
	globalPool.markAvailable(s)
}

// newThread creates a new traced thread.
//
// Precondition: the OS thread must be locked.
func (s *subprocess) newThread() *thread {
	// Ask the first thread to create a new one.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	t := <-r.thread

	// Attach the new thread to the current OS thread.
	t.attach()

	// Return the new thread, which is now bound.
	return t
}

// attach attaches to the thread.
func (t *thread) attach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("unable to attach: %v", errno))
	}

	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
	// newSubprocess), so we always expect to see signal-delivery-stop with
	// SIGSTOP.
	if sig := t.wait(stopped); sig != unix.SIGSTOP {
		panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
	}

	// Initialize options.
	t.init()
}
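// Once attached, all subsequent ptrace requests and waits on a tracee must
// come from the same OS thread, which is why callers such as newThread above
// and createSysmsgThread below hold runtime.LockOSThread across the
// attach/detach window.
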
func (t *thread) grabInitRegs() {
	// Grab registers.
	//
	// Note that we adjust the current register RIP value to be just before
	// the current system call executed. This depends on the definition of
	// the stub itself.
	if err := t.getRegs(&t.initRegs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	t.adjustInitRegsRip()
	t.initRegs.SetStackPointer(0)
}

// detach detaches from the thread.
//
// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
func (t *thread) detach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}
}

// waitOutcome is used for wait below.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed.
	killed
)

func (t *thread) Debugf(format string, v ...any) {
	prefix := fmt.Sprintf("%8d:", t.tid)
	log.DebugfAtDepth(1, prefix+format, v...)
}

func (t *thread) dumpAndPanic(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
	panic(message)
}

func (t *thread) dumpRegs(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	log.Infof("%s", message)
}

func (t *thread) unexpectedStubExit() {
	msg, err := t.getEventMessage()
	status := unix.WaitStatus(msg)
	if status.Signaled() && status.Signal() == unix.SIGKILL {
		// SIGKILL can only be sent by a user or the OOM-killer. In
		// both cases, we don't need to panic; there is no reason to
		// think that something is wrong in gVisor.
		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
		pid := os.Getpid()
		unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL))
	}
	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
}
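// Note on the encoding used by wait below: for ptrace-event stops, the stop
// signal is returned with the trap cause folded into the high byte, i.e.
// SIGTRAP|(cause<<8), mirroring the raw wait status. A plain comparison such
// as sig == unix.SIGTRAP therefore matches only traps with no special cause.
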
// wait waits for a stop event.
//
// Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal {
	var status unix.WaitStatus

	for {
		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
		if err == unix.EINTR || err == unix.EAGAIN {
			// Wait was interrupted; wait again.
			continue
		} else if err != nil {
			panic(fmt.Sprintf("ptrace wait failed: %v", err))
		}
		if int(r) != int(t.tid) {
			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
		}
		switch outcome {
		case stopped:
			if !status.Stopped() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
			}
			stopSig := status.StopSignal()
			if stopSig == 0 {
				continue // Spurious stop.
			}
			if stopSig == unix.SIGTRAP {
				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
					t.unexpectedStubExit()
				}
				// Re-encode the trap cause the way it's expected.
				return stopSig | unix.Signal(status.TrapCause()<<8)
			}
			// Not a trap signal.
			return stopSig
		case killed:
			if !status.Exited() && !status.Signaled() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
			}
			return unix.Signal(status.ExitStatus())
		default:
			// Should not happen.
			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
		}
	}
}

// destroy kills the thread.
//
// Note that this should not be used in the general case; the death of threads
// will typically cause the death of the parent. This is a utility method for
// manually created threads.
func (t *thread) destroy() {
	t.detach()
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
	t.wait(killed)
}

// init initializes trace options.
func (t *thread) init() {
	// Set the TRACESYSGOOD option to differentiate real SIGTRAP, and
	// PTRACE_O_EXITKILL to ensure that an unexpected exit of the sentry
	// will immediately kill the associated stubs.
	_, _, errno := unix.RawSyscall6(
		unix.SYS_PTRACE,
		unix.PTRACE_SETOPTIONS,
		uintptr(t.tid),
		0,
		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
		0, 0)
	if errno != 0 {
		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
	}
}

// syscall executes a system call cycle in the traced context.
//
// This is _not_ for use by application system calls, rather it is for use when
// a system call must be injected into the remote context (e.g. mmap, munmap).
// Note that clones are handled separately.
func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
	// Set registers.
	if err := t.setRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}

	for {
		// Execute the syscall instruction. The task has to stop on the
		// trap instruction which is right after the syscall
		// instruction.
		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
		}

		sig := t.wait(stopped)
		if sig == unix.SIGTRAP {
			// Reached syscall-enter-stop.
			break
		}
		// Some other signal caused a thread stop; ignore.
		if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
			log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
		}
	}

	// Grab registers.
	if err := t.getRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	return syscallReturnValue(regs)
}

// syscallIgnoreInterrupt ignores interrupts on the system call thread and
// restarts the syscall if the kernel indicates that should happen.
func (t *thread) syscallIgnoreInterrupt(
	initRegs *arch.Registers,
	sysno uintptr,
	args ...arch.SyscallArgument) (uintptr, error) {
	for {
		regs := createSyscallRegs(initRegs, sysno, args...)
		rval, err := t.syscall(&regs)
		switch err {
		case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND:
			continue
		default:
			return rval, err
		}
	}
}
// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (t *thread) NotifyInterrupt() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
}

func (s *subprocess) incAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
	if nr > uint32(maxSysmsgThreads) {
		return
	}
	nr = nrMaxAwakeStubThreads.Add(1)
	if nr > fastPathContextLimit {
		dispatcher.disableStubFastPath()
	}
}

func (s *subprocess) decAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
	if nr >= uint32(maxSysmsgThreads) {
		return
	}
	nrMaxAwakeStubThreads.Add(^uint32(0))
}
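// The awake-context accounting above feeds the fast-path heuristic: spinning
// stub threads only pay off while the number of awake contexts stays small,
// so once it crosses fastPathContextLimit the stub fast path is disabled via
// the dispatcher.
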
// switchToApp is called from the main SwitchToApp entrypoint.
//
// This function returns true on a system call, false on a signal.
// The second return value is true if a syscall instruction can be replaced
// with a function call.
func (s *subprocess) switchToApp(c *context, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err error) {
	// Reset necessary registers.
	regs := &ac.StateData().Regs
	s.resetSysemuRegs(regs)
	ctx := c.sharedContext
	ctx.shared.Regs = regs.PtraceRegs
	restoreArchSpecificState(ctx.shared, ac)

	// Check for interrupts, and ensure that future interrupts signal the
	// context.
	if !c.interrupt.Enable(c.sharedContext) {
		// Pending interrupt; simulate.
		ctx.clearInterrupt()
		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
		return false, false, nil
	}
	defer func() {
		ctx.clearInterrupt()
		c.interrupt.Disable()
	}()

	restoreFPState(ctx, c, ac)

	// Place the context onto the context queue.
	if ctx.sleeping {
		ctx.sleeping = false
		s.incAwakeContexts()
	}
	stubFastPathEnabled := dispatcher.stubFastPathEnabled()
	ctx.setState(sysmsg.ContextStateNone)
	s.contextQueue.add(ctx, stubFastPathEnabled)
	s.waitOnState(ctx, stubFastPathEnabled)

	// Check if there's been an error.
	threadID := ctx.threadID()
	if threadID != invalidThreadID {
		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
			msg := sysThread.msg
			panic(fmt.Sprintf("stub thread %d failed: err 0x%x line %d: %s", sysThread.thread.tid, msg.Err, msg.Line, msg))
		}
		log.Warningf("systrap: found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID)
	}

	// Copy register state locally.
	regs.PtraceRegs = ctx.shared.Regs
	retrieveArchSpecificState(ctx.shared, ac)
	c.needToPullFullState = true
	// We have a signal. We verify, however, that the signal was delivered
	// either by the kernel or by this process; other signals are not
	// respected.
	c.signalInfo = ctx.shared.SignalInfo
	ctxState := ctx.state()
	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
		ctxState = sysmsg.ContextStateSyscall
		shouldPatchSyscall = true
	}

	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
		if maybePatchSignalInfo(regs, &c.signalInfo) {
			return false, false, nil
		}
		updateSyscallRegs(regs)
		return true, shouldPatchSyscall, nil
	} else if ctxState != sysmsg.ContextStateFault {
		panic(fmt.Sprintf("unknown context state: %v", ctxState))
	}

	return false, false, nil
}

func (s *subprocess) waitOnState(ctx *sharedContext, stubFastPathEnabled bool) {
	ctx.kicked = false
	slowPath := false
	start := cputicks()
	ctx.startWaitingTS = start
	if !stubFastPathEnabled || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake, then it
			// knows it's being worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			ctx.sleepOnState(curState)
		}
	}

	ctx.resetAcked()
	ctx.enableSentryFastPath()
}
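// Context handoff (sketch of the states observed above): the sentry enqueues
// a context in ContextStateNone; a sysmsg thread picks it up and eventually
// publishes ContextStateSyscall, ContextStateSyscallCanBePatched,
// ContextStateSyscallTrap or ContextStateFault, at which point waitOnState
// returns and switchToApp interprets the result.
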
// canKickSysmsgThread returns true if a new thread can be kicked.
// The second return value is the expected number of threads after kicking a
// new one.
func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
	// numActiveContexts and numActiveThreads can be changed from stub
	// threads that handle the contextQueue without any locks. The idea
	// here is that any stub thread that gets CPU time can make some
	// progress. In stub threads, we can use only spinlock-like
	// synchronizations, but they don't work well because a thread that
	// holds a lock can be preempted by another thread that is waiting for
	// the same lock.
	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)

	nrActiveThreads += nrThreadsToWakeup + 1
	if nrActiveThreads > nrActiveContexts {
		// This can happen when one or more stub threads are waiting
		// for CPU time. The host probably has more running tasks than
		// the number of CPUs.
		return false, nrActiveThreads
	}
	return true, nrActiveThreads
}

// kickSysmsgThread wakes up a sysmsg thread to handle queued contexts,
// creating a new one if the current set of threads is insufficient.
func (s *subprocess) kickSysmsgThread() bool {
	kick, _ := s.canKickSysmsgThread()
	if !kick {
		return false
	}

	s.sysmsgThreadsMu.Lock()
	kick, nrThreads := s.canKickSysmsgThread()
	if !kick {
		s.sysmsgThreadsMu.Unlock()
		return false
	}
	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
		s.numSysmsgThreads++
		s.sysmsgThreadsMu.Unlock()
		if err := s.createSysmsgThread(); err != nil {
			log.Warningf("Unable to create a new stub thread: %s", err)
			s.sysmsgThreadsMu.Lock()
			s.numSysmsgThreads--
			s.sysmsgThreadsMu.Unlock()
		}
	} else {
		s.sysmsgThreadsMu.Unlock()
	}
	s.contextQueue.wakeupSysmsgThread()

	return false
}

// syscall executes the given system call without handling interruptions.
func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	return s.syscallThread.syscall(sysno, args...)
}

// MapFile implements platform.AddressSpace.MapFile.
func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	var flags int
	if precommit {
		flags |= unix.MAP_POPULATE
	}
	_, err := s.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(fr.Length())},
		arch.SyscallArgument{Value: uintptr(at.Prot())},
		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(f.FD())},
		arch.SyscallArgument{Value: uintptr(fr.Start)})
	return err
}

// Unmap implements platform.AddressSpace.Unmap.
func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
	ar, ok := addr.ToRange(length)
	if !ok {
		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
	}
	s.mu.Lock()
	for c := range s.faultedContexts {
		c.mu.Lock()
		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
			// Forget the last fault so that if c faults again, the
			// fault isn't incorrectly reported as a write fault. If
			// this is being called due to munmap() of the
			// corresponding vma, handling of the second fault will
			// fail anyway.
			c.lastFaultSP = nil
			delete(s.faultedContexts, c)
		}
		c.mu.Unlock()
	}
	s.mu.Unlock()
	_, err := s.syscall(
		unix.SYS_MUNMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(length)})
	if err != nil {
		// We never expect this to happen.
		panic(fmt.Sprintf("munmap(%x, %x) failed: %v", addr, length, err))
	}
}

func (s *subprocess) PullFullState(c *context, ac *arch.Context64) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		panic("attempted to PullFullState for a context that is not active in this subprocess")
	}
	saveFPState(c.sharedContext, ac)
	return nil
}
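// Note that unix.Getpriority returns the raw kernel value 20-nice rather than
// the nice value itself, hence the arithmetic below: for a sentry running at
// nice 0, prio is 20 and sysmsgThreadPriority becomes 1, i.e. one priority
// level below the sentry.
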
var sysmsgThreadPriority int

func initSysmsgThreadPriority() {
	prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
	if err != nil {
		panic(fmt.Sprintf("unable to get current scheduling priority: %v", err))
	}
	// Sysmsg threads are executed with a priority one lower than the
	// Sentry.
	sysmsgThreadPriority = 20 - prio + 1
}
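// createSysmsgThread proceeds in stages: clone a stub thread via the request
// channel, allocate its shared stack and map it three ways (into the sentry,
// read-only into the stub for filter setup, then read-write), map a per-thread
// private stack, initialize the sysmsg message header, lower the thread's
// priority, install the seccomp filter, set the initial registers, and finally
// detach so the thread starts executing the sysmsg handler.
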
// createSysmsgThread creates a new sysmsg thread. The thread starts
// processing any available context in the context queue.
func (s *subprocess) createSysmsgThread() error {
	// Create a new seccomp process.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	p := <-r.thread

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	p.attach()

	// Skip SIGSTOP.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
	}
	sig := p.wait(stopped)
	if sig != unix.SIGSTOP {
		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
	}

	// Allocate a new stack for the BPF process.
	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}
	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
	if err != nil {
		// TODO(b/144063246): Need to fail the clone system call.
		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
	}
	sysThread := &sysmsgThread{
		thread:     p,
		subproc:    s,
		stackRange: fr,
	}
	// Use the sysmsgStackID as a handle on this thread instead of the host
	// tid in order to be able to reliably specify invalidThreadID.
	threadID := uint32(p.sysmsgStackID)

	// Map the stack into the sentry.
	sentryStackAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		sysmsg.PerThreadSharedStackSize,
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed: %v", errno))
	}

	// Before installing the stub syscall filters, we need to call a few
	// system calls (e.g. sigaltstack, sigaction) which have in-memory
	// arguments. We need to prevent changing these parameters by other
	// stub threads, so let's map the future BPF stack as read-only and
	// fill syscall arguments from the Sentry.
	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
	err = sysThread.mapStack(sysmsgStackAddr, true)
	if err != nil {
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.init(sentryStackAddr, sysmsgStackAddr)

	// Map the stack into the BPF process.
	err = sysThread.mapStack(sysmsgStackAddr, false)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	// Map the private stack into the BPF process.
	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
	sysThread.msg.Init(threadID)
	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))

	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)

	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
		log.Warningf("Unable to change priority of a stub thread: %s", err)
	}

	// Install the pre-compiled seccomp rules for the BPF process.
	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
		arch.SyscallArgument{Value: uintptr(1)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)})
	if err != nil {
		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
	}

	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: stubSysmsgRules})
	if err != nil {
		panic(fmt.Sprintf("seccomp failed: %v", err))
	}

	// Prepare to start the BPF process.
	tregs := &arch.Registers{}
	s.resetSysemuRegs(tregs)
	setArchSpecificRegs(sysThread, tregs)
	if err := p.setRegs(tregs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}
	archSpecificSysmsgThreadInit(sysThread)
	// Skip SIGSTOP.
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
		panic(fmt.Sprintf("tgkill failed: %v", e))
	}
	// Resume the BPF process.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}

	s.sysmsgThreadsMu.Lock()
	s.sysmsgThreads[threadID] = sysThread
	s.sysmsgThreadsMu.Unlock()

	return nil
}

// PreFork implements platform.AddressSpace.PreFork.
// We need to take the usertrap lock to be sure that fork() will not be in the
// middle of applying a binary patch.
func (s *subprocess) PreFork() {
	s.usertrap.PreFork()
}

// PostFork implements platform.AddressSpace.PostFork.
func (s *subprocess) PostFork() {
	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
}
// activateContext activates the context in this subprocess.
// No-op if the context is already active within the subprocess; if not,
// deactivates it from its last subprocess.
func (s *subprocess) activateContext(c *context) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		c.sharedContext.release()
		c.sharedContext = nil

		shared, err := s.getSharedContext()
		if err != nil {
			return err
		}
		c.sharedContext = shared
	}
	return nil
}