gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/subprocess.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systrap

import (
	"fmt"
	"os"
	"runtime"
	"sync"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/pool"
	"gvisor.dev/gvisor/pkg/seccomp"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

var (
	// globalPool tracks all subprocesses in various states: active or
	// available for reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the initial attempt link address for the stub.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because
	//  * we don't want to reveal a stub address;
	//  * it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)

// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// These constants are only used in subprocess.go.
const (
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)

// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	tgid int32
	tid  int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers

	logPrefix atomic.Pointer[string]
}

// requestThread is used to request a new sysmsg thread. A thread identifier
// will be sent into the thread channel.
type requestThread struct {
	thread chan *thread
}
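// The following is an illustrative sketch, not part of the original file: it
// shows the request/response round trip that requestThread implies. A caller
// sends a requestThread carrying a fresh channel over subprocess.requests,
// and the servicing goroutine (started in newSubprocess below) replies on
// that channel with the new thread, or nil on failure. This mirrors the real
// usage in createSysmsgThread later in this file.
func exampleRequestThread(s *subprocess) (*thread, error) {
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r // Serviced by the goroutine started in newSubprocess.
	t := <-r.thread // The handler replies with nil if cloning failed.
	if t == nil {
		return nil, fmt.Errorf("exampleRequestThread: failed to get clone")
	}
	return t, nil
}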
// requestStub is used to request a new stub process.
type requestStub struct {
	done chan *thread
}

// maxSysmsgThreads is the maximum number of sysmsg threads that a subprocess
// can create. It is based on GOMAXPROCS and set once, so it must be set after
// GOMAXPROCS has been adjusted (see loader.go:Args.NumCPU).
var maxSysmsgThreads = 0

// maxChildThreads is the maximum number of child system threads that a
// subprocess can create, including sysmsg threads.
var maxChildThreads = 0

const (
	// maxGuestContexts specifies the maximum number of task contexts that
	// a subprocess can handle.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being
	// worked on by any sysmsg thread.
	invalidThreadID uint32 = 0xfefefefe
)

// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// platformContext.lastFaultSP == this subprocess.
	faultedContexts map[*platformContext]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the start of the ThreadContext memory
	// region within the sentry address space.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	syscallThreadMu sync.Mutex
	syscallThread   *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads.
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use
	// a counter instead of len(sysmsgThreads) because the number of
	// threads to be created has to be reserved _before_ the creation
	// actually happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch
	// back to user mode.
	contextQueue *contextQueue

	// dead indicates that the subprocess is no longer alive.
	dead atomicbitops.Bool
}
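// An illustrative sketch, not part of the original file: the two pools above
// hand out small integer IDs via pool.Pool.Get, the allocation pattern used
// by initSyscallThread and handlePtraceSyscallRequest below. Get is assumed
// to report failure once the pool is exhausted (here, after maxChildThreads
// stack IDs are outstanding).
func exampleAllocStackID(s *subprocess) (uint64, bool) {
	// On success, the caller owns the returned stack ID until it is
	// returned to the pool.
	return s.sysmsgStackPool.Get()
}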
func (s *subprocess) initSyscallThread(ptraceThread *thread, seccompNotify bool) error {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	id, ok := s.sysmsgStackPool.Get()
	if !ok {
		panic("unable to allocate a sysmsg stub thread")
	}

	ptraceThread.sysmsgStackID = id
	t := syscallThread{
		subproc: s,
		thread:  ptraceThread,
	}

	if err := t.init(seccompNotify); err != nil {
		panic(fmt.Sprintf("failed to create a syscall thread: %v", err))
	}
	s.syscallThread = &t

	s.syscallThread.detach()

	return nil
}

func handlePtraceSyscallRequestError(req any, format string, values ...any) {
	switch r := req.(type) {
	case requestThread:
		r.thread <- nil
	case requestStub:
		r.done <- nil
	}
	log.Warningf("handlePtraceSyscallRequest failed: "+format, values...)
}

// handlePtraceSyscallRequest executes system calls that can't be run via
// syscallThread without using ptrace. Look at the description of syscallThread
// to get more details about its limitations.
func (s *subprocess) handlePtraceSyscallRequest(req any) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	if err := s.syscallThread.attach(); err != nil {
		handlePtraceSyscallRequestError(req, "%s", err.Error())
		return
	}
	defer s.syscallThread.detach()

	ptraceThread := s.syscallThread.thread

	switch r := req.(type) {
	case requestThread:
		t, err := ptraceThread.clone()
		if err != nil {
			handlePtraceSyscallRequestError(req, "error initializing thread: %v", err)
			return
		}

		// Since the new thread was created with clone(CLONE_PTRACE),
		// it will begin execution with SIGSTOP pending and with this
		// thread as its tracer. (Hopefully nobody tgkilled it with a
		// signal < SIGSTOP before the SIGSTOP was delivered, in which
		// case that signal would be delivered before SIGSTOP.)
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			handlePtraceSyscallRequestError(req, "error waiting for new clone: expected SIGSTOP, got %v", sig)
			return
		}

		t.initRegs = ptraceThread.initRegs
		// Set the parent death signal to SIGKILL.
		_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
			arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
			arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
		)
		if err != nil {
			handlePtraceSyscallRequestError(req, "prctl: %v", err)
			return
		}

		id, ok := s.sysmsgStackPool.Get()
		if !ok {
			handlePtraceSyscallRequestError(req, "unable to allocate a sysmsg stub thread")
			return
		}
		t.sysmsgStackID = id

		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
			handlePtraceSyscallRequestError(req, "tgkill failed: %v", e)
			return
		}

		// Detach the thread.
		t.detach()

		// Return the thread.
		r.thread <- t
	case requestStub:
		t, err := ptraceThread.createStub()
		if err != nil {
			handlePtraceSyscallRequestError(req, "unable to create a stub process: %v", err)
			return
		}
		r.done <- t
	}
}
// newSubprocess returns a usable subprocess.
//
// This will either be a newly created subprocess, or one from the global
// pool. The create function will be called in the former case, which is
// guaranteed to happen with the runtime thread locked.
//
// seccompNotify indicates how the Sentry communicates with syscall threads:
// if it is false, futexes are used; otherwise, seccomp-unotify is used.
// seccomp-unotify can't be used for the source pool process, because it is a
// parent of all other stub processes, but only one filter can be installed
// with SECCOMP_FILTER_FLAG_NEW_LISTENER.
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile, seccompNotify bool) (*subprocess, error) {
	if sp := globalPool.fetchAvailable(); sp != nil {
		sp.subprocessRefs.InitRefs()
		sp.usertrap = usertrap.New()
		return sp, nil
	}

	// The requests channel is used to signal the goroutine below, which
	// is responsible for responding to requests to make additional
	// threads in the traced process.
	requests := make(chan any)

	// Ready.
	sp := &subprocess{
		requests:          requests,
		faultedContexts:   make(map[*platformContext]struct{}),
		sysmsgStackPool:   pool.Pool{Start: 0, Limit: uint64(maxChildThreads)},
		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
		memoryFile:        memoryFile,
		sysmsgThreads:     make(map[uint32]*sysmsgThread),
	}
	sp.subprocessRefs.InitRefs()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Initialize the syscall thread.
	ptraceThread, err := create()
	if err != nil {
		return nil, err
	}
	sp.sysmsgInitRegs = ptraceThread.initRegs

	if err := sp.initSyscallThread(ptraceThread, seccompNotify); err != nil {
		return nil, err
	}

	go func() { // S/R-SAFE: Platform-related.
		// Wait for requests to create threads.
		for req := range requests {
			sp.handlePtraceSyscallRequest(req)
		}

		// The requests channel is never closed.
		panic("unreachable")
	}()

	sp.unmap()
	sp.usertrap = usertrap.New()
	sp.mapSharedRegions()
	sp.mapPrivateRegions()

	// The source pool process (seccompNotify == false) doesn't need
	// sysmsg threads.
	if seccompNotify {
		// Create the initial sysmsg thread.
		atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
		if err := sp.createSysmsgThread(); err != nil {
			return nil, err
		}
		sp.numSysmsgThreads++
	}

	return sp, nil
}
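// A hedged usage sketch, not part of the original file: callers hand
// newSubprocess a create function that produces the first traced thread. The
// createStub method used here exists elsewhere in this file; the idea that a
// pool-source thread is forked to seed a new subprocess is an assumption for
// illustration only. seccompNotify is true because this is not the source
// pool process.
func exampleNewSubprocess(poolSource *thread, mf *pgalloc.MemoryFile) (*subprocess, error) {
	return newSubprocess(func() (*thread, error) {
		// Called with the runtime thread locked.
		return poolSource.createStub()
	}, mf, true /* seccompNotify */)
}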
// mapSharedRegions maps the shared regions that are used between the
// subprocess and ALL of the subsequently created sysmsg threads into both the
// sentry and the syscall thread.
//
// Should be called before any sysmsg threads are created.
// Initializes s.contextQueue and s.threadContextRegion.
func (s *subprocess) mapSharedRegions() {
	if s.contextQueue != nil || s.threadContextRegion != 0 {
		panic("contextQueue or threadContextRegion was already initialized")
	}

	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}

	// Map the context queue region into the sentry.
	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
	contextQueue.init()

	// Map the context queue region into the syscall thread.
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
	}

	s.contextQueue = contextQueue

	// Map the thread context region into the sentry.
	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
	if err != nil {
		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region: %v", err))
	}
	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		uintptr(threadContextFR.Length()),
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
	}

	// Map the thread context region into the syscall thread.
	if _, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
		panic(fmt.Sprintf("failed to mmap thread context region into syscall thread: %v", err))
	}

	s.threadContextRegion = sentryThreadContextRegionAddr
}

func (s *subprocess) mapPrivateRegions() {
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: 0},
		arch.SyscallArgument{Value: 0})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
	}
}

// unmap unmaps non-stub regions of the process.
//
// This will panic on failure (which should never happen).
func (s *subprocess) unmap() {
	s.Unmap(0, uint64(stubStart))
	if maximumUserAddress != stubEnd {
		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
	}
}

// Release kills the subprocess.
//
// Just kidding! We can't safely coordinate the detaching of all the tracees
// (since the tracers are random runtime threads, and the process won't exit
// until tracers have been notified).
//
// Therefore we simply unmap everything in the subprocess and return it to the
// globalPool. This has the added benefit of reducing creation time for new
// subprocesses.
func (s *subprocess) Release() {
	if !s.alive() {
		return
	}
	s.unmap()
	s.DecRef(s.release)
}
// release returns the subprocess to the global pool.
func (s *subprocess) release() {
	if s.alive() {
		globalPool.markAvailable(s)
		return
	}
	if s.syscallThread != nil && s.syscallThread.seccompNotify != nil {
		s.syscallThread.seccompNotify.Close()
	}
}

// attach attaches to the thread.
func (t *thread) attach() error {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
		return fmt.Errorf("unable to attach: %v", errno)
	}

	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
	// newSubprocess), so we always expect to see signal-delivery-stop with
	// SIGSTOP.
	if sig := t.wait(stopped); sig != unix.SIGSTOP {
		return fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
	}

	// Initialize options.
	t.init()
	return nil
}

func (t *thread) grabInitRegs() {
	// Grab registers.
	//
	// Note that we adjust the current register RIP value to be just before
	// the current system call executed. This depends on the definition of
	// the stub itself.
	if err := t.getRegs(&t.initRegs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	t.adjustInitRegsRip()
	t.initRegs.SetStackPointer(0)
}

// detach detaches from the thread.
//
// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
func (t *thread) detach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}
}

// waitOutcome is used for wait below.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed.
	killed
)

func (t *thread) loadLogPrefix() *string {
	p := t.logPrefix.Load()
	if p == nil {
		prefix := fmt.Sprintf("[% 4d:% 4d] ", t.tgid, t.tid)
		t.logPrefix.Store(&prefix)
		p = &prefix
	}
	return p
}

// Debugf logs with the debugging severity.
func (t *thread) Debugf(format string, v ...any) {
	if log.IsLogging(log.Debug) {
		log.DebugfAtDepth(1, *t.loadLogPrefix()+format, v...)
	}
}

// Warningf logs with the warning severity.
func (t *thread) Warningf(format string, v ...any) {
	if log.IsLogging(log.Warning) {
		log.WarningfAtDepth(1, *t.loadLogPrefix()+format, v...)
	}
}

func (t *thread) dumpAndPanic(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
	panic(message)
}

func (t *thread) dumpRegs(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	log.Infof("%s", message)
}
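// An illustrative sketch, not part of the original file: attach and detach
// are always paired around injected ptrace work, and ptrace requests must all
// come from the same OS thread, so the bracket below mirrors the structure of
// handlePtraceSyscallRequest above.
func exampleAttachBracket(t *thread, work func() error) error {
	runtime.LockOSThread() // ptrace requires a single tracer OS thread.
	defer runtime.UnlockOSThread()
	if err := t.attach(); err != nil {
		return err
	}
	defer t.detach() // The thread re-enters group-stop on detach.
	return work()    // Injected ptrace operations happen here.
}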
func (t *thread) unexpectedStubExit() {
	msg, err := t.getEventMessage()
	status := unix.WaitStatus(msg)
	if status.Signaled() && status.Signal() == unix.SIGKILL {
		// SIGKILL can only be sent by a user or the OOM-killer. In
		// both these cases, we don't need to panic; there is no reason
		// to think that something is wrong in gVisor.
		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
		pid := os.Getpid()
		unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL))
	}
	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
}

// wait waits for a stop event.
//
// Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal {
	var status unix.WaitStatus

	for {
		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
		if err == unix.EINTR || err == unix.EAGAIN {
			// Wait was interrupted; wait again.
			continue
		} else if err != nil {
			panic(fmt.Sprintf("ptrace wait failed: %v", err))
		}
		if int(r) != int(t.tid) {
			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
		}
		switch outcome {
		case stopped:
			if !status.Stopped() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
			}
			stopSig := status.StopSignal()
			if stopSig == 0 {
				continue // Spurious stop.
			}
			if stopSig == unix.SIGTRAP {
				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
					t.unexpectedStubExit()
				}
				// Re-encode the trap cause the way it's expected.
				return stopSig | unix.Signal(status.TrapCause()<<8)
			}
			// Not a trap signal.
			return stopSig
		case killed:
			if !status.Exited() && !status.Signaled() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
			}
			return unix.Signal(status.ExitStatus())
		default:
			// Should not happen.
			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
		}
	}
}

// kill kills the thread.
func (t *thread) kill() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
}

// destroy kills and waits on the thread.
//
// Note that this should not be used in the general case; the death of threads
// will typically cause the death of the parent. This is a utility method for
// manually created threads.
func (t *thread) destroy() {
	t.detach()
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
	t.wait(killed)
}

// init initializes trace options.
func (t *thread) init() {
	// Set the TRACESYSGOOD option to differentiate real SIGTRAP, and set
	// PTRACE_O_EXITKILL to ensure that an unexpected exit of the sentry
	// will immediately kill the associated stubs.
	_, _, errno := unix.RawSyscall6(
		unix.SYS_PTRACE,
		unix.PTRACE_SETOPTIONS,
		uintptr(t.tid),
		0,
		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
		0, 0)
	if errno != 0 {
		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
	}
}
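// A minimal sketch, not part of the original file: injecting a single system
// call into the traced thread builds a register set with createSyscallRegs
// and runs one syscall cycle. Real callers in this file use
// syscallIgnoreInterrupt (below) instead, which additionally restarts the
// call on the ERESTART* errnos.
func exampleInjectMunmap(t *thread, addr, length uintptr) error {
	regs := createSyscallRegs(&t.initRegs, unix.SYS_MUNMAP,
		arch.SyscallArgument{Value: addr},
		arch.SyscallArgument{Value: length})
	_, err := t.syscall(&regs) // One PTRACE_CONT/wait cycle in the stub.
	return err
}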
// syscall executes a system call cycle in the traced context.
//
// This is _not_ for use by application system calls, rather it is for use
// when a system call must be injected into the remote context (e.g. mmap,
// munmap). Note that clones are handled separately.
func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
	// Set registers.
	if err := t.setRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}

	for {
		// Execute the syscall instruction. The task has to stop on the
		// trap instruction which is right after the syscall
		// instruction.
		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
			panic(fmt.Sprintf("ptrace cont failed: %v", errno))
		}

		sig := t.wait(stopped)
		if sig == unix.SIGTRAP {
			// The syscall has executed and the stub stopped on the
			// trap instruction.
			break
		}
		// Some other signal caused a thread stop; ignore.
		if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
			log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
		}
	}

	// Grab registers.
	if err := t.getRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	return syscallReturnValue(regs)
}

// syscallIgnoreInterrupt ignores interrupts on the system call thread and
// restarts the syscall if the kernel indicates that should happen.
func (t *thread) syscallIgnoreInterrupt(
	initRegs *arch.Registers,
	sysno uintptr,
	args ...arch.SyscallArgument) (uintptr, error) {
	for {
		regs := createSyscallRegs(initRegs, sysno, args...)
		rval, err := t.syscall(&regs)
		switch err {
		case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND:
			continue
		default:
			return rval, err
		}
	}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (t *thread) NotifyInterrupt() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
}

func (s *subprocess) incAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
	if nr > uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(1)
}

func (s *subprocess) decAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
	if nr >= uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(^uint32(0))
}
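// A note on the decrement pattern above, with a tiny self-contained
// illustration (not part of the original file): atomic.AddUint32(p,
// ^uint32(0)) decrements *p by one, because ^uint32(0) == 0xffffffff and
// unsigned addition wraps around modulo 2^32.
func exampleAtomicDecrement() uint32 {
	var n uint32 = 2
	atomic.AddUint32(&n, ^uint32(0)) // n is now 1.
	return n
}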
// switchToApp is called from the main SwitchToApp entrypoint.
//
// This function returns true on a system call, false on a signal.
// The second return value is true if the syscall instruction can be replaced
// with a function call.
func (s *subprocess) switchToApp(c *platformContext, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err *platform.ContextError) {
	// Reset necessary registers.
	regs := &ac.StateData().Regs
	s.resetSysemuRegs(regs)
	ctx := c.sharedContext
	ctx.shared.Regs = regs.PtraceRegs
	restoreArchSpecificState(ctx.shared, ac)

	// Check for interrupts, and ensure that future interrupts signal the
	// context.
	if !c.interrupt.Enable(c.sharedContext) {
		// Pending interrupt; simulate.
		ctx.clearInterrupt()
		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
		return false, false, nil
	}
	defer func() {
		ctx.clearInterrupt()
		c.interrupt.Disable()
	}()

	restoreFPState(ctx, c, ac)

	// Place the context onto the context queue.
	if ctx.sleeping {
		ctx.sleeping = false
		s.incAwakeContexts()
	}
	ctx.setState(sysmsg.ContextStateNone)
	if err := s.contextQueue.add(ctx); err != nil {
		return false, false, err
	}

	if err := s.waitOnState(ctx); err != nil {
		return false, false, corruptedSharedMemoryErr(err.Error())
	}

	// Check if there's been an error.
	threadID := ctx.threadID()
	if threadID != invalidThreadID {
		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
			return false, false, sysThread.msg.ConvertSysmsgErr()
		}
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID))
	}

	// Copy register state locally.
	regs.PtraceRegs = ctx.shared.Regs
	retrieveArchSpecificState(ctx.shared, ac)
	c.needToPullFullState = true
	// We have a signal. We verify, however, that the signal was either
	// delivered from the kernel or from this process; we don't respect
	// other signals.
	c.signalInfo = ctx.shared.SignalInfo
	ctxState := ctx.state()
	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
		ctxState = sysmsg.ContextStateSyscall
		shouldPatchSyscall = true
	}

	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
		if maybePatchSignalInfo(regs, &c.signalInfo) {
			return false, false, nil
		}
		updateSyscallRegs(regs)
		return true, shouldPatchSyscall, nil
	} else if ctxState != sysmsg.ContextStateFault {
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("unknown context state: %v", ctxState))
	}

	return false, false, nil
}

func (s *subprocess) waitOnState(ctx *sharedContext) error {
	ctx.kicked = false
	slowPath := false
	if !s.contextQueue.fastPathEnabled() || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake then it
			// knows it's being worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			if err := ctx.sleepOnState(curState); err != nil {
				return err
			}
		}
	}

	ctx.recordLatency()
	ctx.resetLatencyMeasures()
	ctx.enableSentryFastPath()

	return nil
}
// canKickSysmsgThread returns true if a new thread can be kicked.
// The second return value is the expected number of threads after kicking a
// new one.
func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
	// numActiveContexts and numActiveThreads can be changed from stub
	// threads that handle the contextQueue without any locks. The idea
	// here is that any stub thread that gets CPU time can make some
	// progress. In stub threads, we can use only spinlock-like
	// synchronization, but it doesn't work well because a thread that
	// holds a lock can be preempted by another thread that is waiting for
	// the same lock.
	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)

	nrActiveThreads += nrThreadsToWakeup + 1
	if nrActiveThreads > nrActiveContexts {
		// This can happen when one or more stub threads are waiting
		// for CPU time. The host probably has more runnable tasks than
		// CPUs.
		return false, nrActiveThreads
	}
	return true, nrActiveThreads
}

// kickSysmsgThread returns true if it was able to wake up or create a new
// sysmsg stub thread.
func (s *subprocess) kickSysmsgThread() bool {
	kick, _ := s.canKickSysmsgThread()
	if !kick {
		return false
	}

	s.sysmsgThreadsMu.Lock()
	kick, nrThreads := s.canKickSysmsgThread()
	if !kick {
		s.sysmsgThreadsMu.Unlock()
		return false
	}
	numTimesStubKicked.Increment()
	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
		s.numSysmsgThreads++
		s.sysmsgThreadsMu.Unlock()
		if err := s.createSysmsgThread(); err != nil {
			log.Warningf("Unable to create a new stub thread: %s", err)
			s.sysmsgThreadsMu.Lock()
			s.numSysmsgThreads--
			s.sysmsgThreadsMu.Unlock()
		}
	} else {
		s.sysmsgThreadsMu.Unlock()
	}
	s.contextQueue.wakeupSysmsgThread()

	return true
}

// syscall executes the given system call without handling interruptions.
func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	return s.syscallThread.syscall(sysno, args...)
}

// MapFile implements platform.AddressSpace.MapFile.
func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	var flags int
	if precommit {
		flags |= unix.MAP_POPULATE
	}
	_, err := s.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(fr.Length())},
		arch.SyscallArgument{Value: uintptr(at.Prot())},
		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(f.FD())},
		arch.SyscallArgument{Value: uintptr(fr.Start)})
	return err
}
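// A small usage sketch, not part of the original file: mapping a memory-file
// range into the subprocess at a fixed address. Passing precommit=true makes
// MapFile add MAP_POPULATE, so the host faults the pages in eagerly instead
// of on first access.
func exampleMapWithPrecommit(s *subprocess, f memmap.File, fr memmap.FileRange, addr hostarch.Addr) error {
	return s.MapFile(addr, f, fr, hostarch.ReadWrite, true /* precommit */)
}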
// Unmap implements platform.AddressSpace.Unmap.
func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
	ar, ok := addr.ToRange(length)
	if !ok {
		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
	}
	s.mu.Lock()
	for c := range s.faultedContexts {
		c.mu.Lock()
		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
			// Forget the last fault so that if c faults again, the
			// fault isn't incorrectly reported as a write fault. If
			// this is being called due to munmap() of the
			// corresponding vma, handling of the second fault will
			// fail anyway.
			c.lastFaultSP = nil
			delete(s.faultedContexts, c)
		}
		c.mu.Unlock()
	}
	s.mu.Unlock()
	_, err := s.syscall(
		unix.SYS_MUNMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(length)})
	if err != nil && err != errDeadSubprocess {
		// We never expect this to happen.
		panic(fmt.Sprintf("munmap(%x, %x) failed: %v", addr, length, err))
	}
}

func (s *subprocess) PullFullState(c *platformContext, ac *arch.Context64) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		panic("Attempted to PullFullState for a context that is not used in the subprocess")
	}
	saveFPState(c.sharedContext, ac)
	return nil
}

var (
	sysmsgThreadPriorityOnce sync.Once
	sysmsgThreadPriority     int
)

// initSysmsgThreadPriority looks at the current priority of the process
// and updates `sysmsgThreadPriority` accordingly.
func initSysmsgThreadPriority() {
	sysmsgThreadPriorityOnce.Do(func() {
		prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
		if err != nil {
			panic(fmt.Sprintf("unable to get current scheduling priority: %v", err))
		}
		// Sysmsg threads are executed with a priority one lower than
		// the Sentry.
		sysmsgThreadPriority = 20 - prio + 1
	})
}
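// The arithmetic above deserves a worked example (an illustration, not part
// of the original file). The raw getpriority syscall, which unix.Getpriority
// wraps, returns 20-nice rather than the nice value itself, so that the
// result is never confused with a negative errno. If the Sentry runs at
// nice 0, Getpriority yields 20 and sysmsgThreadPriority becomes
// 20-20+1 = 1, i.e. nice 1: one level below the Sentry, which is what
// Setpriority in createSysmsgThread below expects.
func exampleSysmsgPriority(prio int) int {
	nice := 20 - prio // Recover the nice value from Getpriority's encoding.
	return nice + 1   // Sysmsg threads run one priority level lower.
}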
// createSysmsgThread creates a new sysmsg thread.
// The thread starts processing any available context in the context queue.
func (s *subprocess) createSysmsgThread() error {
	// Create a new seccomp process.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	p := <-r.thread
	if p == nil {
		return fmt.Errorf("createSysmsgThread: failed to get clone")
	}

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	if err := p.attach(); err != nil {
		return err
	}

	// Skip SIGSTOP.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
	}
	sig := p.wait(stopped)
	if sig != unix.SIGSTOP {
		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
	}

	// Allocate a new stack for the BPF process.
	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}
	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
	if err != nil {
		// TODO(b/144063246): Need to fail the clone system call.
		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
	}
	sysThread := &sysmsgThread{
		thread:     p,
		subproc:    s,
		stackRange: fr,
	}
	// Use the sysmsgStackID as a handle on this thread instead of the host
	// tid in order to be able to reliably specify invalidThreadID.
	threadID := uint32(p.sysmsgStackID)

	// Map the stack into the sentry.
	sentryStackAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		sysmsg.PerThreadSharedStackSize,
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed: %v", errno))
	}

	// Before installing the stub syscall filters, we need to call a few
	// system calls (e.g. sigaltstack, sigaction) which have in-memory
	// arguments. We need to prevent other stub threads from changing
	// these parameters, so let's map the future BPF stack as read-only
	// and fill syscall arguments from the Sentry.
	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
	err = sysThread.mapStack(sysmsgStackAddr, true)
	if err != nil {
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.init(sentryStackAddr, sysmsgStackAddr)

	// Map the shared stack into the BPF process.
	err = sysThread.mapStack(sysmsgStackAddr, false)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	// Map the private stack into the BPF process.
	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
	sysThread.msg.Init(threadID)
	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))

	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)

	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
		log.Warningf("Unable to change priority of a stub thread: %s", err)
	}

	// Install the pre-compiled seccomp rules for the BPF process.
	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
		arch.SyscallArgument{Value: uintptr(1)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)})
	if err != nil {
		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
	}

	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: stubSysmsgRules})
	if err != nil {
		panic(fmt.Sprintf("seccomp failed: %v", err))
	}

	// Prepare to start the BPF process.
	tregs := &arch.Registers{}
	s.resetSysemuRegs(tregs)
	setArchSpecificRegs(sysThread, tregs)
	if err := p.setRegs(tregs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}
	archSpecificSysmsgThreadInit(sysThread)
	// Skip SIGSTOP.
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
		panic(fmt.Sprintf("tgkill failed: %v", e))
	}
	// Resume the BPF process.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}

	s.sysmsgThreadsMu.Lock()
	s.sysmsgThreads[threadID] = sysThread
	s.sysmsgThreadsMu.Unlock()

	return nil
}

// PreFork implements platform.AddressSpace.PreFork.
// We need to take the usertrap lock to be sure that fork() will not be in the
// middle of applying a binary patch.
func (s *subprocess) PreFork() {
	s.usertrap.PreFork()
}
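// An illustrative sketch, not part of the original file: PreFork and
// PostFork are expected to bracket the host fork so that the usertrap lock
// is held while the address space is duplicated; the doFork callback here is
// a hypothetical stand-in for the caller's fork logic.
func exampleForkBracket(s *subprocess, doFork func() error) error {
	s.PreFork()
	defer s.PostFork() // Releases the lock taken by PreFork.
	return doFork()    // The actual fork() is performed by the caller.
}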
// PostFork implements platform.AddressSpace.PostFork.
func (s *subprocess) PostFork() {
	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
}

// activateContext activates the context in this subprocess.
// No-op if the context is already active within the subprocess; if not, it
// releases the context from its last subprocess and acquires a fresh shared
// context from this one.
func (s *subprocess) activateContext(c *platformContext) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		c.sharedContext.release()
		c.sharedContext = nil

		shared, err := s.getSharedContext()
		if err != nil {
			return err
		}
		c.sharedContext = shared
	}
	return nil
}