github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/systrap/subprocess.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systrap

import (
	"fmt"
	"os"
	"runtime"
	"sync"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	"github.com/nicocha30/gvisor-ligolo/pkg/pool"
	"github.com/nicocha30/gvisor-ligolo/pkg/seccomp"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform/systrap/sysmsg"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform/systrap/usertrap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
)

var (
	// globalPool tracks all subprocesses in various states: active or
	// available for reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the initial attempt link address for the stub.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because
	// * we don't want to reveal a stub address.
	// * it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)

// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// These constants are only used in subprocess.go.
const (
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)

// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	tgid int32
	tid  int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers
}

// requestThread is used to request a new sysmsg thread. A thread identifier
// will be sent into the thread channel.
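//
// A minimal usage sketch (this is exactly what newThread below does):
//
//	r := requestThread{thread: make(chan *thread)}
//	s.requests <- r // handled by handlePtraceSyscallRequest
//	t := <-r.thread // the freshly cloned, already detached thread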
type requestThread struct {
	thread chan *thread
}

// requestStub is used to request a new stub process.
type requestStub struct {
	done chan *thread
}

// maxSysmsgThreads specifies the maximum number of system threads that a
// subprocess can create in context decoupled mode.
// TODO(b/268366549): Replace maxSystemThreads below.
var maxSysmsgThreads = runtime.GOMAXPROCS(0)

const (
	// maxSystemThreads specifies the maximum number of system threads that a
	// subprocess may create in order to process the contexts.
	maxSystemThreads = 4096
	// maxGuestContexts specifies the maximum number of task contexts that a
	// subprocess can handle.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being worked
	// on by any sysmsg thread.
	invalidThreadID uint32 = 0xfefefefe
)

// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// context.lastFaultSP == this subprocess.
	faultedContexts map[*context]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the ThreadContext memory region start
	// within the sentry address space.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	syscallThreadMu sync.Mutex
	syscallThread   *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads.
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use a
	// counter instead of using len(sysmsgThreads) because we need to
	// synchronize how many threads get created _before_ the creation happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch back
	// to user mode.
	contextQueue *contextQueue
}

func (s *subprocess) initSyscallThread(ptraceThread *thread) error {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	id, ok := s.sysmsgStackPool.Get()
	if !ok {
		panic("unable to allocate a sysmsg stub thread")
	}

	ptraceThread.sysmsgStackID = id
	t := syscallThread{
		subproc: s,
		thread:  ptraceThread,
	}

	if err := t.init(); err != nil {
		panic(fmt.Sprintf("failed to create a syscall thread: %v", err))
	}
	s.syscallThread = &t

	s.syscallThread.detach()

	return nil
}

// handlePtraceSyscallRequest executes system calls that can't be run via
// syscallThread without using ptrace.
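// In practice these are the thread- and process-creation requests: clone()
// for a new stub thread (requestThread) and the fork-like createStub() for a
// new stub process (requestStub); both need an attach/detach cycle.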
// See the description of syscallThread for more details about its limitations.
func (s *subprocess) handlePtraceSyscallRequest(req any) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	s.syscallThread.attach()
	defer s.syscallThread.detach()

	ptraceThread := s.syscallThread.thread

	switch r := req.(type) {
	case requestThread:
		t, err := ptraceThread.clone()
		if err != nil {
			// Should not happen: not recoverable.
			panic(fmt.Sprintf("error initializing first thread: %v", err))
		}

		// Since the new thread was created with
		// clone(CLONE_PTRACE), it will begin execution with
		// SIGSTOP pending and with this thread as its tracer.
		// (Hopefully nobody tgkilled it with a signal <
		// SIGSTOP before the SIGSTOP was delivered, in which
		// case that signal would be delivered before SIGSTOP.)
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
		}

		id, ok := s.sysmsgStackPool.Get()
		if !ok {
			panic("unable to allocate a sysmsg stub thread")
		}
		t.sysmsgStackID = id

		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
			panic(fmt.Sprintf("tgkill failed: %v", e))
		}

		// Detach the thread.
		t.detach()
		t.initRegs = ptraceThread.initRegs

		// Return the thread.
		r.thread <- t
	case requestStub:
		t, err := ptraceThread.createStub()
		if err != nil {
			panic(fmt.Sprintf("unable to create a stub process: %s", err))
		}
		r.done <- t
	}
}

// newSubprocess returns a usable subprocess.
//
// This will either be a newly created subprocess, or one from the global pool.
// The create function will be called in the former case, which is guaranteed
// to happen with the runtime thread locked.
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile) (*subprocess, error) {
	if sp := globalPool.fetchAvailable(); sp != nil {
		sp.subprocessRefs.InitRefs()
		sp.usertrap = usertrap.New()
		return sp, nil
	}

	// The following goroutine is responsible for creating the first traced
	// thread, and responding to requests to make additional threads in the
	// traced process. Subprocesses are returned to the globalPool in
	// Release below rather than torn down, so the request channel is never
	// closed.
	requests := make(chan any)

	// Ready.
	sp := &subprocess{
		requests:          requests,
		faultedContexts:   make(map[*context]struct{}),
		sysmsgStackPool:   pool.Pool{Start: 0, Limit: maxSystemThreads},
		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
		memoryFile:        memoryFile,
		sysmsgThreads:     make(map[uint32]*sysmsgThread),
	}
	sp.subprocessRefs.InitRefs()
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Initialize the syscall thread.
	ptraceThread, err := create()
	if err != nil {
		return nil, err
	}
	sp.sysmsgInitRegs = ptraceThread.initRegs

	if err := sp.initSyscallThread(ptraceThread); err != nil {
		return nil, err
	}

	go func() { // S/R-SAFE: Platform-related.
		// Wait for requests to create threads.
		for req := range requests {
			sp.handlePtraceSyscallRequest(req)
		}

		// Requests should never be closed.
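		// If this panic fires, something closed s.requests; since
		// subprocesses are pooled and reused, the channel must stay
		// open for the life of the sentry.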
		panic("unreachable")
	}()

	sp.unmap()
	sp.usertrap = usertrap.New()
	sp.mapSharedRegions()
	sp.mapPrivateRegions()

	// Create the initial sysmsg thread.
	atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
	if err := sp.createSysmsgThread(); err != nil {
		return nil, err
	}
	sp.numSysmsgThreads++

	return sp, nil
}

// mapSharedRegions maps the shared regions that are used between the
// subprocess and ALL of the subsequently created sysmsg threads into both the
// sentry and the syscall thread.
//
// Should be called before any sysmsg threads are created.
// Initializes s.contextQueue and s.threadContextRegion.
func (s *subprocess) mapSharedRegions() {
	if s.contextQueue != nil || s.threadContextRegion != 0 {
		panic("contextQueue or threadContextRegion was already initialized")
	}

	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}

	// Map the context queue region into the sentry.
	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
	contextQueue.init()

	// Map the context queue region into the syscall thread.
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
	}

	s.contextQueue = contextQueue

	// Map the thread context region into the sentry.
	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
	if err != nil {
		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region: %v", err))
	}
	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		uintptr(threadContextFR.Length()),
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
	}

	// Map the thread context region into the syscall thread.
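	// stubContextRegion is a fixed address in the stub's address-space
	// layout (an assumption based on how the other stub* addresses are
	// used in this file), so MAP_FIXED below replaces that reservation
	// rather than clobbering an unrelated mapping.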
	if _, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
		panic(fmt.Sprintf("failed to mmap thread context region into syscall thread: %v", err))
	}

	s.threadContextRegion = sentryThreadContextRegionAddr
}

func (s *subprocess) mapPrivateRegions() {
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: 0},
		arch.SyscallArgument{Value: 0})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
	}
}

// unmap unmaps non-stub regions of the process.
//
// This will panic on failure (which should never happen).
func (s *subprocess) unmap() {
	s.Unmap(0, uint64(stubStart))
	if maximumUserAddress != stubEnd {
		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
	}
}

// Release kills the subprocess.
//
// Just kidding! We can't safely coordinate the detaching of all the
// tracees (since the tracers are random runtime threads, and the process
// won't exit until tracers have been notified).
//
// Therefore we simply unmap everything in the subprocess and return it to the
// globalPool. This has the added benefit of reducing creation time for new
// subprocesses.
func (s *subprocess) Release() {
	s.unmap()
	s.DecRef(s.release)
}

// release returns the subprocess to the global pool.
func (s *subprocess) release() {
	globalPool.markAvailable(s)
}

// newThread creates a new traced thread.
//
// Precondition: the OS thread must be locked.
func (s *subprocess) newThread() *thread {
	// Ask the first thread to create a new one.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	t := <-r.thread

	// Attach to the new thread.
	t.attach()

	// Return the new thread, which is now bound.
	return t
}

// attach attaches to the thread.
func (t *thread) attach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("unable to attach: %v", errno))
	}

	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
	// newSubprocess), so we always expect to see signal-delivery-stop with
	// SIGSTOP.
	if sig := t.wait(stopped); sig != unix.SIGSTOP {
		panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
	}

	// Initialize options.
	t.init()
}

func (t *thread) grabInitRegs() {
	// Grab registers.
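	// These become t.initRegs, the template that createSyscallRegs and
	// syscallIgnoreInterrupt use when injecting system calls later.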
	//
	// Note that we adjust the current register RIP value to be just before
	// the current system call executed. This depends on the definition of
	// the stub itself.
	if err := t.getRegs(&t.initRegs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	t.adjustInitRegsRip()
	t.initRegs.SetStackPointer(0)
}

// detach detaches from the thread.
//
// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
func (t *thread) detach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}
}

// waitOutcome is used for wait below.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed.
	killed
)

func (t *thread) Debugf(format string, v ...any) {
	prefix := fmt.Sprintf("%8d:", t.tid)
	log.DebugfAtDepth(1, prefix+format, v...)
}

func (t *thread) dumpAndPanic(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
	panic(message)
}

func (t *thread) dumpRegs(message string) {
	var regs arch.Registers
	message += "\n"
	if err := t.getRegs(&regs); err == nil {
		message += dumpRegs(&regs)
	} else {
		log.Warningf("unable to get registers: %v", err)
	}
	log.Infof("%s", message)
}

func (t *thread) unexpectedStubExit() {
	msg, err := t.getEventMessage()
	status := unix.WaitStatus(msg)
	if status.Signaled() && status.Signal() == unix.SIGKILL {
		// SIGKILL can only be sent by a user or the OOM-killer. In both
		// cases, we don't need to panic; there is no reason to think
		// that something is wrong in gVisor.
		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
		pid := os.Getpid()
		unix.Tgkill(pid, pid, unix.SIGKILL)
	}
	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
}

// wait waits for a stop event.
//
// Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal {
	var status unix.WaitStatus

	for {
		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
		if err == unix.EINTR || err == unix.EAGAIN {
			// Wait was interrupted; wait again.
			continue
		} else if err != nil {
			panic(fmt.Sprintf("ptrace wait failed: %v", err))
		}
		if int(r) != int(t.tid) {
			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
		}
		switch outcome {
		case stopped:
			if !status.Stopped() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
			}
			stopSig := status.StopSignal()
			if stopSig == 0 {
				continue // Spurious stop.
			}
			if stopSig == unix.SIGTRAP {
				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
					t.unexpectedStubExit()
				}
				// Re-encode the trap cause the way it's expected.
				return stopSig | unix.Signal(status.TrapCause()<<8)
			}
			// Not a trap signal.
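			// This covers both group-stops and ordinary
			// signal-delivery-stops; callers such as attach and
			// syscall decide how to react to the specific signal.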
			return stopSig
		case killed:
			if !status.Exited() && !status.Signaled() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
			}
			return unix.Signal(status.ExitStatus())
		default:
			// Should not happen.
			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
		}
	}
}

// destroy kills the thread.
//
// Note that this should not be used in the general case; the death of threads
// will typically cause the death of the parent. This is a utility method for
// manually created threads.
func (t *thread) destroy() {
	t.detach()
	unix.Tgkill(int(t.tgid), int(t.tid), unix.SIGKILL)
	t.wait(killed)
}

// init initializes trace options.
func (t *thread) init() {
	// Set the PTRACE_O_TRACESYSGOOD option to differentiate real SIGTRAP,
	// and PTRACE_O_EXITKILL to ensure that an unexpected exit of the
	// sentry immediately kills the associated stubs.
	_, _, errno := unix.RawSyscall6(
		unix.SYS_PTRACE,
		unix.PTRACE_SETOPTIONS,
		uintptr(t.tid),
		0,
		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
		0, 0)
	if errno != 0 {
		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
	}
}

// syscall executes a system call cycle in the traced context.
//
// This is _not_ for use by application system calls, rather it is for use when
// a system call must be injected into the remote context (e.g. mmap, munmap).
// Note that clones are handled separately.
func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
	// Set registers.
	if err := t.setRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}

	for {
		// Execute the syscall instruction. The task has to stop on the
		// trap instruction which is right after the syscall
		// instruction.
		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
			panic(fmt.Sprintf("ptrace cont failed: %v", errno))
		}

		sig := t.wait(stopped)
		if sig == unix.SIGTRAP {
			// Reached the trap after the syscall instruction.
			break
		}
		// Some other signal caused a thread stop; ignore.
		if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
			log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
		}
	}

	// Grab registers.
	if err := t.getRegs(regs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	return syscallReturnValue(regs)
}

// syscallIgnoreInterrupt ignores interrupts on the system call thread and
// restarts the syscall if the kernel indicates that should happen.
func (t *thread) syscallIgnoreInterrupt(
	initRegs *arch.Registers,
	sysno uintptr,
	args ...arch.SyscallArgument) (uintptr, error) {
	for {
		regs := createSyscallRegs(initRegs, sysno, args...)
		rval, err := t.syscall(&regs)
		switch err {
		case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND:
			continue
		default:
			return rval, err
		}
	}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
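//
// It does so by sending SignalInterrupt to the stub thread with tgkill(2);
// presumably this knocks the thread out of a blocking state so that control
// returns to the sentry.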
func (t *thread) NotifyInterrupt() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
}

func (s *subprocess) incAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
	if nr > uint32(maxSysmsgThreads) {
		return
	}
	nr = nrMaxAwakeStubThreads.Add(1)
	if nr > fastPathContextLimit {
		dispatcher.disableStubFastPath()
	}
}

func (s *subprocess) decAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
	if nr >= uint32(maxSysmsgThreads) {
		return
	}
	nrMaxAwakeStubThreads.Add(^uint32(0))
}

// switchToApp is called from the main SwitchToApp entrypoint.
//
// This function returns true on a system call, false on a signal.
// The second return value is true if a syscall instruction can be replaced
// with a function call.
func (s *subprocess) switchToApp(c *context, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err error) {
	// Reset necessary registers.
	regs := &ac.StateData().Regs
	s.resetSysemuRegs(regs)
	ctx := c.sharedContext
	ctx.shared.Regs = regs.PtraceRegs
	restoreArchSpecificState(ctx.shared, ac)

	// Check for interrupts, and ensure that future interrupts signal the
	// context.
	if !c.interrupt.Enable(c.sharedContext) {
		// Pending interrupt; simulate.
		ctx.clearInterrupt()
		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
		return false, false, nil
	}
	defer func() {
		ctx.clearInterrupt()
		c.interrupt.Disable()
	}()

	restoreFPState(ctx, c, ac)

	// Place the context onto the context queue.
	if ctx.sleeping {
		ctx.sleeping = false
		s.incAwakeContexts()
	}
	stubFastPathEnabled := dispatcher.stubFastPathEnabled()
	ctx.setState(sysmsg.ContextStateNone)
	s.contextQueue.add(ctx, stubFastPathEnabled)
	s.waitOnState(ctx, stubFastPathEnabled)

	// Check if there's been an error.
	threadID := ctx.threadID()
	if threadID != invalidThreadID {
		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
			msg := sysThread.msg
			panic(fmt.Sprintf("stub thread %d failed: err 0x%x line %d: %s", sysThread.thread.tid, msg.Err, msg.Line, msg))
		}
		log.Warningf("systrap: found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID)
	}

	// Copy register state locally.
	regs.PtraceRegs = ctx.shared.Regs
	retrieveArchSpecificState(ctx.shared, ac)
	c.needToPullFullState = true
	// We have a signal. However, we verify that the signal was delivered
	// either by the kernel or by this process; we don't respect other
	// signals.
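	// (That verification happens in maybePatchSignalInfo below, which may
	// rewrite the signal info; when it reports true, we deliver a signal
	// rather than a syscall.)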
	c.signalInfo = ctx.shared.SignalInfo
	ctxState := ctx.state()
	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
		ctxState = sysmsg.ContextStateSyscall
		shouldPatchSyscall = true
	}

	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
		if maybePatchSignalInfo(regs, &c.signalInfo) {
			return false, false, nil
		}
		updateSyscallRegs(regs)
		return true, shouldPatchSyscall, nil
	} else if ctxState != sysmsg.ContextStateFault {
		panic(fmt.Sprintf("unknown context state: %v", ctxState))
	}

	return false, false, nil
}

func (s *subprocess) waitOnState(ctx *sharedContext, stubFastPathEnabled bool) {
	ctx.kicked = false
	slowPath := false
	start := cputicks()
	ctx.startWaitingTS = start
	if !stubFastPathEnabled || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake then it
			// knows it's being worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			ctx.sleepOnState(curState)
		}
	}

	ctx.resetAcked()
	ctx.enableSentryFastPath()
}

// canKickSysmsgThread returns true if a new thread can be kicked.
// The second return value is the expected number of threads after kicking a
// new one.
func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
	// numActiveContexts and numActiveThreads can be changed from stub
	// threads that handle the contextQueue without any locks. The idea
	// here is that any stub thread that gets CPU time can make some
	// progress. In stub threads, we can use only spinlock-like
	// synchronizations, but they don't work well because a thread that
	// holds a lock can be preempted by another thread that is waiting for
	// the same lock.
	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)

	nrActiveThreads += nrThreadsToWakeup + 1
	if nrActiveThreads > nrActiveContexts {
		// This can happen when one or more stub threads are waiting
		// for CPU time. The host probably has more running tasks than
		// the number of CPUs.
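		// For example: 3 active threads, 1 pending wakeup, and 4
		// active contexts give 3+1+1 = 5 > 4, so a newly kicked
		// thread would have no context left to pick up.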
		return false, nrActiveThreads
	}
	return true, nrActiveThreads
}

// kickSysmsgThread wakes up a sysmsg thread to process the context queue,
// creating a new one if needed. It reports whether a thread was kicked.
func (s *subprocess) kickSysmsgThread() bool {
	kick, _ := s.canKickSysmsgThread()
	if !kick {
		return false
	}

	s.sysmsgThreadsMu.Lock()
	kick, nrThreads := s.canKickSysmsgThread()
	if !kick {
		s.sysmsgThreadsMu.Unlock()
		return false
	}
	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
		s.numSysmsgThreads++
		s.sysmsgThreadsMu.Unlock()
		if err := s.createSysmsgThread(); err != nil {
			log.Warningf("Unable to create a new stub thread: %s", err)
			s.sysmsgThreadsMu.Lock()
			s.numSysmsgThreads--
			s.sysmsgThreadsMu.Unlock()
		}
	} else {
		s.sysmsgThreadsMu.Unlock()
	}
	s.contextQueue.wakeupSysmsgThread()

	return true
}

// syscall executes the given system call without handling interruptions.
func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	return s.syscallThread.syscall(sysno, args...)
}

// MapFile implements platform.AddressSpace.MapFile.
func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	var flags int
	if precommit {
		flags |= unix.MAP_POPULATE
	}
	_, err := s.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(fr.Length())},
		arch.SyscallArgument{Value: uintptr(at.Prot())},
		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: uintptr(f.FD())},
		arch.SyscallArgument{Value: uintptr(fr.Start)})
	return err
}

// Unmap implements platform.AddressSpace.Unmap.
func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
	ar, ok := addr.ToRange(length)
	if !ok {
		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
	}
	s.mu.Lock()
	for c := range s.faultedContexts {
		c.mu.Lock()
		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
			// Forget the last fault so that if c faults again, the
			// fault isn't incorrectly reported as a write fault. If
			// this is being called due to munmap() of the
			// corresponding vma, handling of the second fault will
			// fail anyway.
			c.lastFaultSP = nil
			delete(s.faultedContexts, c)
		}
		c.mu.Unlock()
	}
	s.mu.Unlock()
	_, err := s.syscall(
		unix.SYS_MUNMAP,
		arch.SyscallArgument{Value: uintptr(addr)},
		arch.SyscallArgument{Value: uintptr(length)})
	if err != nil {
		// We never expect this to happen.
		panic(fmt.Sprintf("munmap(%x, %x) failed: %v", addr, length, err))
	}
}

func (s *subprocess) PullFullState(c *context, ac *arch.Context64) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		panic("Attempted to PullFullState for context that is not used in subprocess")
	}
	saveFPState(c.sharedContext, ac)
	return nil
}

var sysmsgThreadPriority int

func initSysmsgThreadPriority() {
	prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
	if err != nil {
		panic(fmt.Sprintf("unable to get current scheduling priority: %v", err))
	}
	// Sysmsg threads are executed with a priority one lower than the
	// Sentry.
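	// The raw getpriority(2) syscall (which unix.Getpriority wraps)
	// returns 20-nice rather than the nice value itself, so nice is
	// 20 - prio, and one level lower is nice+1 = 20 - prio + 1, the value
	// Setpriority expects below.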
	sysmsgThreadPriority = 20 - prio + 1
}

// createSysmsgThread creates a new sysmsg thread.
// The thread starts processing any available context in the context queue.
func (s *subprocess) createSysmsgThread() error {
	// Create a new seccomp process.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	p := <-r.thread

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	p.attach()

	// Skip SIGSTOP.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
	}
	sig := p.wait(stopped)
	if sig != unix.SIGSTOP {
		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
	}

	// Allocate a new stack for the BPF process.
	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}
	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
	if err != nil {
		// TODO(b/144063246): Need to fail the clone system call.
		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
	}
	sysThread := &sysmsgThread{
		thread:     p,
		subproc:    s,
		stackRange: fr,
	}
	// Use the sysmsgStackID as a handle on this thread instead of host tid
	// in order to be able to reliably specify invalidThreadID.
	threadID := uint32(p.sysmsgStackID)

	// Map the stack into the sentry.
	sentryStackAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		sysmsg.PerThreadSharedStackSize,
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed: %v", errno))
	}

	// Before installing the stub syscall filters, we need to call a few
	// system calls (e.g. sigaltstack, sigaction) which have in-memory
	// arguments. We need to prevent other stub threads from changing these
	// parameters, so let's map the future BPF stack as read-only and fill
	// syscall arguments from the Sentry.
	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
	err = sysThread.mapStack(sysmsgStackAddr, true)
	if err != nil {
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.init(sentryStackAddr, sysmsgStackAddr)

	// Map the stack into the BPF process.
	err = sysThread.mapStack(sysmsgStackAddr, false)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	// Map the private stack into the BPF process.
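	// A sketch of the per-thread region layout implied by the offsets used
	// here (base = sysThread.sysmsgPerThreadMemAddr(); the authoritative
	// layout lives in the sysmsg package):
	//
	//	base + PerThreadSharedStackOffset   shared stack (sentry <-> stub)
	//	base + PerThreadPrivateStackOffset  private syshandler stack
	//	shared stack + MsgOffsetFromSharedStack  the sysmsg.Msg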
	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
	sysThread.msg.Init(threadID)
	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))

	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)

	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
		log.Warningf("Unable to change priority of a stub thread: %s", err)
	}

	// Install pre-compiled seccomp rules for the BPF process.
	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
		arch.SyscallArgument{Value: uintptr(1)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)})
	if err != nil {
		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
	}

	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: stubSysmsgRules})
	if err != nil {
		panic(fmt.Sprintf("seccomp failed: %v", err))
	}

	// Prepare to start the BPF process.
	tregs := &arch.Registers{}
	s.resetSysemuRegs(tregs)
	setArchSpecificRegs(sysThread, tregs)
	if err := p.setRegs(tregs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}
	archSpecificSysmsgThreadInit(sysThread)
	// Skip SIGSTOP.
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
		panic(fmt.Sprintf("tgkill failed: %v", e))
	}
	// Resume the BPF process.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}

	s.sysmsgThreadsMu.Lock()
	s.sysmsgThreads[threadID] = sysThread
	s.sysmsgThreadsMu.Unlock()

	return nil
}

// PreFork implements platform.AddressSpace.PreFork.
// We need to take the usertrap lock to be sure that fork() will not be in the
// middle of applying a binary patch.
func (s *subprocess) PreFork() {
	s.usertrap.PreFork()
}

// PostFork implements platform.AddressSpace.PostFork.
func (s *subprocess) PostFork() {
	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
}

// activateContext activates the context in this subprocess.
// No-op if the context is already active within the subprocess; if not,
// deactivates it from its last subprocess.
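//
// The swap is simple: release the old sharedContext (detaching it from its
// previous subprocess), then allocate a fresh one from this subprocess via
// getSharedContext.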
func (s *subprocess) activateContext(c *context) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		c.sharedContext.release()
		c.sharedContext = nil

		shared, err := s.getSharedContext()
		if err != nil {
			return err
		}
		c.sharedContext = shared
	}
	return nil
}