// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systrap

import (
	"fmt"
	"os"
	"runtime"
	"sync"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/pool"
	"github.com/metacubex/gvisor/pkg/seccomp"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/usertrap"
	"github.com/metacubex/gvisor/pkg/sentry/usage"
)

var (
	// globalPool tracks all subprocesses in various states: active or
	// available for reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the initial attempt link address for the stub.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because
	//   - we don't want to reveal a stub address;
	//   - it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)

// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// These constants are only used in subprocess.go.
const (
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)

// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	tgid int32
	tid  int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers
}

// requestThread is used to request a new sysmsg thread. A thread identifier
// will be sent into the thread channel.
type requestThread struct {
	thread chan *thread
}
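
// For illustration, the round-trip for a requestThread (served by
// handlePtraceSyscallRequest and used by createSysmsgThread below) is
// roughly:
//
//	r := requestThread{thread: make(chan *thread)}
//	s.requests <- r  // handled by the goroutine started in newSubprocess
//	t := <-r.thread  // nil if thread creation failed
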
// requestStub is used to request a new stub process.
type requestStub struct {
	done chan *thread
}

// maxSysmsgThreads specifies the maximum number of system threads that a
// subprocess can create in context decoupled mode.
// TODO(b/268366549): Replace maxSystemThreads below.
var maxSysmsgThreads = runtime.GOMAXPROCS(0)

const (
	// maxSystemThreads specifies the maximum number of system threads that a
	// subprocess may create in order to process the contexts.
	maxSystemThreads = 4096
	// maxGuestContexts specifies the maximum number of task contexts that a
	// subprocess can handle.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being worked
	// on by any sysmsg thread.
	invalidThreadID uint32 = 0xfefefefe
)

// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// platformContext.lastFaultSP == this subprocess.
	faultedContexts map[*platformContext]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the ThreadContext memory region start
	// within the sentry address space.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	syscallThreadMu sync.Mutex
	syscallThread   *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads.
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use a
	// counter instead of len(sysmsgThreads) because we need to synchronize
	// how many threads get created _before_ the creation happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch back
	// to user mode.
	contextQueue *contextQueue

	// dead indicates whether the subprocess is alive or not.
	dead atomicbitops.Bool
}

func (s *subprocess) initSyscallThread(ptraceThread *thread) error {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	id, ok := s.sysmsgStackPool.Get()
	if !ok {
		panic("unable to allocate a sysmsg stub thread")
	}

	ptraceThread.sysmsgStackID = id
	t := syscallThread{
		subproc: s,
		thread:  ptraceThread,
	}

	if err := t.init(); err != nil {
		panic(fmt.Sprintf("failed to create a syscall thread: %v", err))
	}
	s.syscallThread = &t

	s.syscallThread.detach()

	return nil
}

func handlePtraceSyscallRequestError(req any, format string, values ...any) {
	switch r := req.(type) {
	case requestThread:
		r.thread <- nil
	case requestStub:
		r.done <- nil
	}
	log.Warningf("handlePtraceSyscallRequest failed: "+format, values...)
}

// handlePtraceSyscallRequest executes system calls that can't be run via
// syscallThread without using ptrace. Look at the description of syscallThread
// to get more details about its limitations.
func (s *subprocess) handlePtraceSyscallRequest(req any) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	if err := s.syscallThread.attach(); err != nil {
		handlePtraceSyscallRequestError(req, "%v", err)
		return
	}
	defer s.syscallThread.detach()

	ptraceThread := s.syscallThread.thread

	switch r := req.(type) {
	case requestThread:
		t, err := ptraceThread.clone()
		if err != nil {
			handlePtraceSyscallRequestError(req, "error initializing thread: %v", err)
			return
		}

		// Since the new thread was created with clone(CLONE_PTRACE), it
		// will begin execution with SIGSTOP pending and with this thread
		// as its tracer. (Hopefully nobody tgkilled it with a signal <
		// SIGSTOP before the SIGSTOP was delivered, in which case that
		// signal would be delivered before SIGSTOP.)
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			handlePtraceSyscallRequestError(req, "error waiting for new clone: expected SIGSTOP, got %v", sig)
			return
		}

		t.initRegs = ptraceThread.initRegs
		// Set the parent death signal to SIGKILL.
		_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
			arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
			arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
			arch.SyscallArgument{Value: 0},
		)
		if err != nil {
			handlePtraceSyscallRequestError(req, "prctl: %v", err)
			return
		}

		id, ok := s.sysmsgStackPool.Get()
		if !ok {
			handlePtraceSyscallRequestError(req, "unable to allocate a sysmsg stub thread")
			return
		}
		t.sysmsgStackID = id

		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
			handlePtraceSyscallRequestError(req, "tgkill failed: %v", e)
			return
		}

		// Detach the thread.
		t.detach()

		// Return the thread.
		r.thread <- t
	case requestStub:
		t, err := ptraceThread.createStub()
		if err != nil {
			handlePtraceSyscallRequestError(req, "unable to create a stub process: %v", err)
			return
		}
		r.done <- t
	}
}
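
// The thread-creation handshake above, in order (a summary, not extra API):
//
//	1. clone(CLONE_PTRACE) in the stub     -> child stops with SIGSTOP pending
//	2. wait(stopped)                       -> observe the SIGSTOP
//	3. prctl(PR_SET_PDEATHSIG, SIGKILL)    -> child dies with its parent
//	4. tgkill(SIGSTOP) + PTRACE_DETACH     -> leave the child in group-stop
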
// newSubprocess returns a usable subprocess.
//
// This will either be a newly created subprocess, or one from the global
// pool. The create function will be called in the former case, which is
// guaranteed to happen with the runtime thread locked.
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile) (*subprocess, error) {
	if sp := globalPool.fetchAvailable(); sp != nil {
		sp.subprocessRefs.InitRefs()
		sp.usertrap = usertrap.New()
		return sp, nil
	}

	// The following goroutine is responsible for creating the first traced
	// thread, and responding to requests to make additional threads in the
	// traced process. The requests channel is never closed; instead of
	// being destroyed, subprocesses are returned to the globalPool in
	// Release below.
	requests := make(chan any)

	// Ready.
	sp := &subprocess{
		requests:          requests,
		faultedContexts:   make(map[*platformContext]struct{}),
		sysmsgStackPool:   pool.Pool{Start: 0, Limit: maxSystemThreads},
		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
		memoryFile:        memoryFile,
		sysmsgThreads:     make(map[uint32]*sysmsgThread),
	}
	sp.subprocessRefs.InitRefs()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Initialize the syscall thread.
	ptraceThread, err := create()
	if err != nil {
		return nil, err
	}
	sp.sysmsgInitRegs = ptraceThread.initRegs

	if err := sp.initSyscallThread(ptraceThread); err != nil {
		return nil, err
	}

	go func() { // S/R-SAFE: Platform-related.
		// Wait for requests to create threads.
		for req := range requests {
			sp.handlePtraceSyscallRequest(req)
		}

		// Requests should never be closed.
		panic("unreachable")
	}()

	sp.unmap()
	sp.usertrap = usertrap.New()
	sp.mapSharedRegions()
	sp.mapPrivateRegions()

	// Create the initial sysmsg thread.
	atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
	if err := sp.createSysmsgThread(); err != nil {
		return nil, err
	}
	sp.numSysmsgThreads++

	return sp, nil
}
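
// Lifecycle sketch (derived from the code above and Release below): a
// subprocess is created at most once per pool slot; Release unmaps its
// non-stub regions and returns it to globalPool, and a later newSubprocess
// call may fetch and reinitialize it instead of spawning a new stub process.
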
// mapSharedRegions maps the shared regions that are used between the
// subprocess and ALL of the subsequently created sysmsg threads into both the
// sentry and the syscall thread.
//
// Should be called before any sysmsg threads are created.
// Initializes s.contextQueue and s.threadContextRegion.
func (s *subprocess) mapSharedRegions() {
	if s.contextQueue != nil || s.threadContextRegion != 0 {
		panic("contextQueue or threadContextRegion was already initialized")
	}

	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}

	// Map the context queue region into the sentry.
	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
	contextQueue.init()

	// Map the context queue region into the syscall thread.
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
	}

	s.contextQueue = contextQueue

	// Map the thread context region into the sentry.
	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
	if err != nil {
		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region: %v", err))
	}
	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		uintptr(threadContextFR.Length()),
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
	}

	// Map the thread context region into the syscall thread.
	if _, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
		panic(fmt.Sprintf("failed to mmap thread context region into syscall thread: %v", err))
	}

	s.threadContextRegion = sentryThreadContextRegionAddr
}

func (s *subprocess) mapPrivateRegions() {
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: 0},
		arch.SyscallArgument{Value: 0})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
	}
}

// unmap unmaps non-stub regions of the process.
//
// This will panic on failure (which should never happen).
func (s *subprocess) unmap() {
	s.Unmap(0, uint64(stubStart))
	if maximumUserAddress != stubEnd {
		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
	}
}

// Release kills the subprocess.
//
// Just kidding! We can't safely coordinate the detaching of all the
// tracees (since the tracers are random runtime threads, and the process
// won't exit until tracers have been notified).
//
// Therefore we simply unmap everything in the subprocess and return it to the
// globalPool. This has the added benefit of reducing creation time for new
// subprocesses.
func (s *subprocess) Release() {
	if !s.alive() {
		return
	}
	s.unmap()
	s.DecRef(s.release)
}
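
// Shared-region layout after mapSharedRegions above (informal; addresses are
// the stub-side constants used there):
//
//	context queue   -> sentry (mmapContextQueueForSentry) and
//	                   stub at stubContextQueueRegion
//	thread contexts -> sentry at s.threadContextRegion and
//	                   stub at stubContextRegion
//
// Both sides mmap the same pgalloc.MemoryFile ranges, so the Sentry and the
// sysmsg threads exchange context state through shared memory.
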
// release returns the subprocess to the global pool.
func (s *subprocess) release() {
	if s.alive() {
		globalPool.markAvailable(s)
	}
}

// attach attaches to the thread.
func (t *thread) attach() error {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
		return fmt.Errorf("unable to attach: %v", errno)
	}

	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
	// newSubprocess), so we always expect to see signal-delivery-stop with
	// SIGSTOP.
	if sig := t.wait(stopped); sig != unix.SIGSTOP {
		return fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
	}

	// Initialize options.
	t.init()
	return nil
}

func (t *thread) grabInitRegs() {
	// Grab registers.
	//
	// Note that we adjust the current register RIP value to be just before
	// the current system call executed. This depends on the definition of
	// the stub itself.
	if err := t.getRegs(&t.initRegs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	t.adjustInitRegsRip()
	t.initRegs.SetStackPointer(0)
}

// detach detaches from the thread.
//
// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
func (t *thread) detach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}
}
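
// Note on detach() above: passing SIGSTOP as the data argument to
// PTRACE_DETACH delivers the stop signal on detach, so a detached stub
// thread parks itself in group-stop rather than running. attach() relies on
// this: its PTRACE_ATTACH + wait(stopped) pair picks the thread back up from
// that stop.
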
// waitOutcome is used for wait below.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed.
	killed
)

func (t *thread) Debugf(format string, v ...any) {
	prefix := fmt.Sprintf("%8d:", t.tid)
	log.DebugfAtDepth(1, prefix+format, v...)
}

func (t *thread) dumpAndPanic(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
	panic(message)
}

func (t *thread) dumpRegs(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	log.Infof("%s", message)
}

func (t *thread) unexpectedStubExit() {
	msg, err := t.getEventMessage()
	status := unix.WaitStatus(msg)
	if status.Signaled() && status.Signal() == unix.SIGKILL {
		// SIGKILL can only be sent by a user or the OOM-killer. In both
		// cases, we don't need to panic: there is no reason to think
		// that something is wrong in gVisor.
		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
		pid := os.Getpid()
		unix.Tgkill(pid, pid, unix.SIGKILL)
	}
	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
}

// wait waits for a stop event.
//
// Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal {
	var status unix.WaitStatus

	for {
		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
		if err == unix.EINTR || err == unix.EAGAIN {
			// Wait was interrupted; wait again.
			continue
		} else if err != nil {
			panic(fmt.Sprintf("ptrace wait failed: %v", err))
		}
		if int(r) != int(t.tid) {
			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
		}
		switch outcome {
		case stopped:
			if !status.Stopped() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
			}
			stopSig := status.StopSignal()
			if stopSig == 0 {
				continue // Spurious stop.
			}
			if stopSig == unix.SIGTRAP {
				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
					t.unexpectedStubExit()
				}
				// Re-encode the trap cause the way it's expected.
				return stopSig | unix.Signal(status.TrapCause()<<8)
			}
			// Not a trap signal.
			return stopSig
		case killed:
			if !status.Exited() && !status.Signaled() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
			}
			return unix.Signal(status.ExitStatus())
		default:
			// Should not happen.
			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
		}
	}
}

// destroy kills the thread.
//
// Note that this should not be used in the general case; the death of threads
// will typically cause the death of the parent. This is a utility method for
// manually created threads.
func (t *thread) destroy() {
	t.detach()
	unix.Tgkill(int(t.tgid), int(t.tid), unix.SIGKILL)
	t.wait(killed)
}

// init initializes trace options.
func (t *thread) init() {
	// Set the TRACESYSGOOD option to differentiate real SIGTRAP, and set
	// PTRACE_O_EXITKILL to ensure that an unexpected exit of the sentry
	// will immediately kill the associated stubs.
	_, _, errno := unix.RawSyscall6(
		unix.SYS_PTRACE,
		unix.PTRACE_SETOPTIONS,
		uintptr(t.tid),
		0,
		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
		0, 0)
	if errno != 0 {
		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
	}
}
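
// A note on wait (above): its return value packs ptrace events into the
// signal. For a trap with cause PTRACE_EVENT_EXIT (6), it returns
// SIGTRAP | 6<<8 = 0x605, so callers that compare against plain unix.SIGSTOP
// or unix.SIGTRAP only match stops with no event attached.
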
// syscall executes a system call cycle in the traced context.
//
// This is _not_ for use by application system calls, rather it is for use when
// a system call must be injected into the remote context (e.g. mmap, munmap).
// Note that clones are handled separately.
func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
	// Set registers.
	if err := t.setRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}

	for {
		// Execute the syscall instruction. The task has to stop on the
		// trap instruction which is right after the syscall
		// instruction.
		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
		}

		sig := t.wait(stopped)
		if sig == unix.SIGTRAP {
			// Reached syscall-enter-stop.
			break
		}
		// Some other signal caused a thread stop; ignore.
		if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
			log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
		}
	}

	// Grab registers.
	if err := t.getRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	return syscallReturnValue(regs)
}

// syscallIgnoreInterrupt ignores interrupts on the system call thread and
// restarts the syscall if the kernel indicates that should happen.
func (t *thread) syscallIgnoreInterrupt(
	initRegs *arch.Registers,
	sysno uintptr,
	args ...arch.SyscallArgument) (uintptr, error) {
	for {
		regs := createSyscallRegs(initRegs, sysno, args...)
		rval, err := t.syscall(&regs)
		switch err {
		case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND:
			continue
		default:
			return rval, err
		}
	}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (t *thread) NotifyInterrupt() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
}

func (s *subprocess) incAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
	if nr > uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(1)
}

func (s *subprocess) decAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
	if nr >= uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(^uint32(0))
}
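
// Taken together, incAwakeContexts and decAwakeContexts keep
// fastpath.nrMaxAwakeStubThreads equal to min(numAwakeContexts,
// maxSysmsgThreads): the counter only moves while the awake-context count is
// at or below the thread cap, since extra awake contexts cannot be served by
// more than maxSysmsgThreads stub threads anyway.
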
// switchToApp is called from the main SwitchToApp entrypoint.
//
// This function returns true on a system call, false on a signal.
// The second return value is true if a syscall instruction can be replaced on
// a function call.
func (s *subprocess) switchToApp(c *platformContext, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err *platform.ContextError) {
	// Reset necessary registers.
	regs := &ac.StateData().Regs
	s.resetSysemuRegs(regs)
	ctx := c.sharedContext
	ctx.shared.Regs = regs.PtraceRegs
	restoreArchSpecificState(ctx.shared, ac)

	// Check for interrupts, and ensure that future interrupts signal the
	// context.
	if !c.interrupt.Enable(c.sharedContext) {
		// Pending interrupt; simulate.
		ctx.clearInterrupt()
		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
		return false, false, nil
	}
	defer func() {
		ctx.clearInterrupt()
		c.interrupt.Disable()
	}()

	restoreFPState(ctx, c, ac)

	// Place the context onto the context queue.
	if ctx.sleeping {
		ctx.sleeping = false
		s.incAwakeContexts()
	}
	ctx.setState(sysmsg.ContextStateNone)
	if err := s.contextQueue.add(ctx); err != nil {
		return false, false, err
	}

	if err := s.waitOnState(ctx); err != nil {
		return false, false, corruptedSharedMemoryErr(err.Error())
	}

	// Check if there's been an error.
	threadID := ctx.threadID()
	if threadID != invalidThreadID {
		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
			return false, false, sysThread.msg.ConvertSysmsgErr()
		}
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID))
	}

	// Copy register state locally.
	regs.PtraceRegs = ctx.shared.Regs
	retrieveArchSpecificState(ctx.shared, ac)
	c.needToPullFullState = true

	// We have a signal. We verify, however, that the signal was delivered
	// either by the kernel or by this process; we don't respect other
	// signals.
	c.signalInfo = ctx.shared.SignalInfo
	ctxState := ctx.state()
	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
		ctxState = sysmsg.ContextStateSyscall
		shouldPatchSyscall = true
	}

	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
		if maybePatchSignalInfo(regs, &c.signalInfo) {
			return false, false, nil
		}
		updateSyscallRegs(regs)
		return true, shouldPatchSyscall, nil
	} else if ctxState != sysmsg.ContextStateFault {
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("unknown context state: %v", ctxState))
	}

	return false, false, nil
}

func (s *subprocess) waitOnState(ctx *sharedContext) error {
	ctx.kicked = false
	slowPath := false
	if !s.contextQueue.fastPathEnabled() || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake then it knows
			// it's being worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			if err := ctx.sleepOnState(curState); err != nil {
				return err
			}
		}
	}

	ctx.recordLatency()
	ctx.resetLatencyMeasures()
	ctx.enableSentryFastPath()

	return nil
}

// canKickSysmsgThread returns true if a new thread can be kicked.
// The second return value is the expected number of threads after kicking a
// new one.
func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
	// numActiveContexts and numActiveThreads can be changed from stub
	// threads that handle the contextQueue without any locks. The idea
	// here is that any stub thread that gets CPU time can make some
	// progress. In stub threads, we can use only spinlock-like
	// synchronization, but it doesn't work well because a thread that
	// holds a lock can be preempted by another thread that is waiting for
	// the same lock.
	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)

	nrActiveThreads += nrThreadsToWakeup + 1
	if nrActiveThreads > nrActiveContexts {
		// This can happen when one or more stub threads are waiting for
		// CPU time. The host probably has more running tasks than the
		// number of CPUs.
		return false, nrActiveThreads
	}
	return true, nrActiveThreads
}
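
// A worked example of the kick decision above: with 3 active threads, 1
// thread already marked to wake up, and 5 active contexts, the projected
// thread count is 3+1+1 = 5, which is not greater than 5, so a kick is
// allowed. With only 4 active contexts it would be refused, since a fifth
// thread would have no context to run.
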
// kickSysmsgThread returns true if it was able to wake up or create a new
// sysmsg stub thread.
func (s *subprocess) kickSysmsgThread() bool {
	kick, _ := s.canKickSysmsgThread()
	if !kick {
		return false
	}

	s.sysmsgThreadsMu.Lock()
	kick, nrThreads := s.canKickSysmsgThread()
	if !kick {
		s.sysmsgThreadsMu.Unlock()
		return false
	}
	numTimesStubKicked.Increment()
	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
		s.numSysmsgThreads++
		s.sysmsgThreadsMu.Unlock()
		if err := s.createSysmsgThread(); err != nil {
			log.Warningf("Unable to create a new stub thread: %s", err)
			s.sysmsgThreadsMu.Lock()
			s.numSysmsgThreads--
			s.sysmsgThreadsMu.Unlock()
		}
	} else {
		s.sysmsgThreadsMu.Unlock()
	}
	s.contextQueue.wakeupSysmsgThread()

	return true
}

// syscall executes the given system call without handling interruptions.
func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	return s.syscallThread.syscall(sysno, args...)
}

// MapFile implements platform.AddressSpace.MapFile.
func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	var flags int
	if precommit {
		flags |= unix.MAP_POPULATE
	}
	_, err := s.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(fr.Length())},
		arch.SyscallArgument{Value: uintptr(at.Prot())},
		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(f.FD())},
		arch.SyscallArgument{Value: uintptr(fr.Start)})
	return err
}

// Unmap implements platform.AddressSpace.Unmap.
func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
	ar, ok := addr.ToRange(length)
	if !ok {
		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
	}
	s.mu.Lock()
	for c := range s.faultedContexts {
		c.mu.Lock()
		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
			// Forget the last fault so that if c faults again, the fault
			// isn't incorrectly reported as a write fault. If this is
			// being called due to munmap() of the corresponding vma,
			// handling of the second fault will fail anyway.
			c.lastFaultSP = nil
			delete(s.faultedContexts, c)
		}
		c.mu.Unlock()
	}
	s.mu.Unlock()
	_, err := s.syscall(
		unix.SYS_MUNMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(length)})
	if err != nil && err != errDeadSubprocess {
		// We never expect this to happen.
		panic(fmt.Sprintf("munmap(%x, %x) failed: %v", addr, length, err))
	}
}

func (s *subprocess) PullFullState(c *platformContext, ac *arch.Context64) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		panic("Attempted to PullFullState for context that is not used in subprocess")
	}
	saveFPState(c.sharedContext, ac)
	return nil
}

var (
	sysmsgThreadPriorityOnce sync.Once
	sysmsgThreadPriority     int
)

// initSysmsgThreadPriority looks at the current priority of the process
// and updates `sysmsgThreadPriority` accordingly.
func initSysmsgThreadPriority() {
	sysmsgThreadPriorityOnce.Do(func() {
		prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
		if err != nil {
			panic(fmt.Sprintf("unable to get current scheduling priority: %v", err))
		}
		// Sysmsg threads are executed with a priority one lower than the
		// Sentry.
		sysmsgThreadPriority = 20 - prio + 1
	})
}
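
// For example (assuming the usual Linux convention where the raw getpriority
// syscall reports 20-nice): a Sentry running at nice 0 yields prio == 20, so
// sysmsgThreadPriority becomes 20-20+1 = 1, i.e. nice 1, one step below the
// Sentry's priority.
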
// createSysmsgThread creates a new sysmsg thread.
// The thread starts processing any available context in the context queue.
func (s *subprocess) createSysmsgThread() error {
	// Create a new seccomp process.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	p := <-r.thread
	if p == nil {
		return fmt.Errorf("createSysmsgThread: failed to get clone")
	}

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	if err := p.attach(); err != nil {
		return err
	}

	// Skip SIGSTOP.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
	}
	sig := p.wait(stopped)
	if sig != unix.SIGSTOP {
		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
	}

	// Allocate a new stack for the BPF process.
	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}
	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
	if err != nil {
		// TODO(b/144063246): Need to fail the clone system call.
		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
	}
	sysThread := &sysmsgThread{
		thread:     p,
		subproc:    s,
		stackRange: fr,
	}
	// Use the sysmsgStackID as a handle on this thread instead of host tid
	// in order to be able to reliably specify invalidThreadID.
	threadID := uint32(p.sysmsgStackID)

	// Map the stack into the sentry.
	sentryStackAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		sysmsg.PerThreadSharedStackSize,
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed: %v", errno))
	}

	// Before installing the stub syscall filters, we need to call a few
	// system calls (e.g. sigaltstack, sigaction) which have in-memory
	// arguments. We need to prevent changing these parameters by other
	// stub threads, so let's map the future BPF stack as read-only and
	// fill syscall arguments from the Sentry.
	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
	err = sysThread.mapStack(sysmsgStackAddr, true)
	if err != nil {
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.init(sentryStackAddr, sysmsgStackAddr)

	// Map the stack into the BPF process.
	err = sysThread.mapStack(sysmsgStackAddr, false)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	// Map the private stack into the BPF process.
	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
	sysThread.msg.Init(threadID)
	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))

	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)

	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
		log.Warningf("Unable to change priority of a stub thread: %s", err)
	}

	// Install pre-compiled seccomp rules for the BPF process.
	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
		arch.SyscallArgument{Value: uintptr(1)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)})
	if err != nil {
		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
	}

	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: stubSysmsgRules})
	if err != nil {
		panic(fmt.Sprintf("seccomp failed: %v", err))
	}

	// Prepare to start the BPF process.
	tregs := &arch.Registers{}
	s.resetSysemuRegs(tregs)
	setArchSpecificRegs(sysThread, tregs)
	if err := p.setRegs(tregs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}
	archSpecificSysmsgThreadInit(sysThread)
	// Queue SIGCONT so that the thread resumes once it is detached below.
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
		panic(fmt.Sprintf("tgkill failed: %v", e))
	}
	// Resume the BPF process.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}

	s.sysmsgThreadsMu.Lock()
	s.sysmsgThreads[threadID] = sysThread
	s.sysmsgThreadsMu.Unlock()

	return nil
}

// PreFork implements platform.AddressSpace.PreFork.
// We need to take the usertrap lock to be sure that fork() will not be in the
// middle of applying a binary patch.
func (s *subprocess) PreFork() {
	s.usertrap.PreFork()
}

// PostFork implements platform.AddressSpace.PostFork.
func (s *subprocess) PostFork() {
	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
}

// activateContext activates the context in this subprocess.
// No-op if the context is already active within the subprocess; if not,
// releases the context's shared state in its last subprocess and takes a
// fresh shared context in this one.
func (s *subprocess) activateContext(c *platformContext) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		c.sharedContext.release()
		c.sharedContext = nil

		shared, err := s.getSharedContext()
		if err != nil {
			return err
		}
		c.sharedContext = shared
	}
	return nil
}