// Source: gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/systrap.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package systrap provides a seccomp-based implementation of the platform
// interface.
//
// In a nutshell, it works as follows:
//
// The creation of a new address space creates a new child process.
//
// The creation of a new stub thread creates a new system thread with a
// specified address space. To initialize this thread, the following actions
// are performed:
//   - install a signal stack which is shared with the Sentry.
//   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
//     This signal handler is a key part of the systrap platform. Any stub event
//     which has to be handled in a privileged mode (by the Sentry) triggers one
//     of the previous signals. The signal handler runs on the separate stack
//     which is shared with the Sentry. There is the sysmsg structure to
//     synchronize the Sentry and a stub thread.
//   - install seccomp filters to trap user system calls.
//   - send a fake SIGSEGV to stop the thread in the signal handler.
//
// A platformContext is just a collection of temporary variables. Calling Switch on a
// platformContext does the following:
//
//	Set up proper registers and an FPU state on a stub signal frame.
39 // Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE. 40 // Wait for new stub event by polling sysmsg->stage. 41 // 42 // Lock order: 43 // 44 // subprocessPool.mu 45 // subprocess.mu 46 // platformContext.mu 47 // 48 // +checkalignedignore 49 package systrap 50 51 import ( 52 "fmt" 53 "os" 54 "runtime" 55 "sync" 56 57 "golang.org/x/sys/unix" 58 "gvisor.dev/gvisor/pkg/abi/linux" 59 pkgcontext "gvisor.dev/gvisor/pkg/context" 60 "gvisor.dev/gvisor/pkg/fd" 61 "gvisor.dev/gvisor/pkg/hostarch" 62 "gvisor.dev/gvisor/pkg/memutil" 63 "gvisor.dev/gvisor/pkg/sentry/arch" 64 "gvisor.dev/gvisor/pkg/sentry/pgalloc" 65 "gvisor.dev/gvisor/pkg/sentry/platform" 66 "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" 67 "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" 68 "gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap" 69 ) 70 71 var ( 72 // stubStart is the link address for our stub, and determines the 73 // maximum user address. This is valid only after a call to stubInit. 74 // 75 // We attempt to link the stub here, and adjust downward as needed. 76 stubStart uintptr = stubInitAddress 77 78 stubInitProcess uintptr 79 80 // Memory region to store thread specific stacks. 81 stubSysmsgStack uintptr 82 stubSysmsgStart uintptr 83 stubSysmsgEnd uintptr 84 // Memory region to store the contextQueue. 85 stubContextQueueRegion uintptr 86 stubContextQueueRegionLen uintptr 87 // Memory region to store instances of sysmsg.ThreadContext. 88 stubContextRegion uintptr 89 stubContextRegionLen uintptr 90 // The memory blob with precompiled seccomp rules. 91 stubSysmsgRules uintptr 92 stubSysmsgRulesLen uintptr 93 stubSyscallRules uintptr 94 stubSyscallRulesLen uintptr 95 96 stubSpinningThreadQueueAddr uintptr 97 stubSpinningThreadQueueSize uintptr 98 99 // stubROMapEnd is the end address of the read-only stub region that 100 // contains the code and precompiled seccomp rules. 
101 stubROMapEnd uintptr 102 103 // stubEnd is the first byte past the end of the stub, as with 104 // stubStart this is valid only after a call to stubInit. 105 stubEnd uintptr 106 107 // stubInitialized controls one-time stub initialization. 108 stubInitialized sync.Once 109 110 // latencyMonitoring controls one-time initialization of the fastpath 111 // control goroutine. 112 latencyMonitoring sync.Once 113 114 // archState stores architecture-specific details used in the platform. 115 archState sysmsg.ArchState 116 ) 117 118 // platformContext is an implementation of the platform context. 119 type platformContext struct { 120 // signalInfo is the signal info, if and when a signal is received. 121 signalInfo linux.SignalInfo 122 123 // interrupt is the interrupt platformContext. 124 interrupt interrupt.Forwarder 125 126 // sharedContext is everything related to this platformContext that is resident in 127 // shared memory with the stub thread. 128 // sharedContext is only accessed on the Task goroutine, therefore it is not 129 // mutex protected. 130 sharedContext *sharedContext 131 132 // mu protects the following fields. 133 mu sync.Mutex 134 135 // If lastFaultSP is non-nil, the last platformContext switch was due to a fault 136 // received while executing lastFaultSP. Only platformContext.Switch may set 137 // lastFaultSP to a non-nil value. 138 lastFaultSP *subprocess 139 140 // lastFaultAddr is the last faulting address; this is only meaningful if 141 // lastFaultSP is non-nil. 142 lastFaultAddr hostarch.Addr 143 144 // lastFaultIP is the address of the last faulting instruction; 145 // this is also only meaningful if lastFaultSP is non-nil. 146 lastFaultIP hostarch.Addr 147 148 // needRestoreFPState indicates that the FPU state has been changed by 149 // the Sentry and has to be updated on the stub thread. 150 needRestoreFPState bool 151 152 // needToPullFullState indicates that the Sentry doesn't have a full 153 // state of the thread. 
154 needToPullFullState bool 155 } 156 157 // PullFullState implements platform.Context.PullFullState. 158 func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error { 159 if !c.needToPullFullState { 160 return nil 161 } 162 s := as.(*subprocess) 163 if err := s.PullFullState(c, ac); err != nil { 164 return err 165 } 166 c.needToPullFullState = false 167 return nil 168 } 169 170 // FullStateChanged implements platform.Context.FullStateChanged. 171 func (c *platformContext) FullStateChanged() { 172 c.needRestoreFPState = true 173 c.needToPullFullState = false 174 } 175 176 // Switch runs the provided platformContext in the given address space. 177 func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) { 178 as := mm.AddressSpace() 179 s := as.(*subprocess) 180 if err := s.activateContext(c); err != nil { 181 return nil, hostarch.NoAccess, err 182 } 183 184 restart: 185 isSyscall, needPatch, err := s.switchToApp(c, ac) 186 if err != nil { 187 return nil, hostarch.NoAccess, err 188 } 189 if needPatch { 190 s.usertrap.PatchSyscall(ctx, ac, mm) 191 } 192 if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL { 193 err := s.usertrap.HandleFault(ctx, ac, mm) 194 if err == usertrap.ErrFaultSyscall { 195 isSyscall = true 196 } else if err == usertrap.ErrFaultRestart { 197 goto restart 198 } else if err != nil { 199 ctx.Warningf("usertrap.HandleFault failed: %v", err) 200 } 201 } 202 var ( 203 faultSP *subprocess 204 faultAddr hostarch.Addr 205 faultIP hostarch.Addr 206 ) 207 if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV { 208 faultSP = s 209 faultAddr = hostarch.Addr(c.signalInfo.Addr()) 210 faultIP = hostarch.Addr(ac.IP()) 211 } 212 213 // Update the platformContext to reflect the outcome of this context switch. 
214 c.mu.Lock() 215 lastFaultSP := c.lastFaultSP 216 lastFaultAddr := c.lastFaultAddr 217 lastFaultIP := c.lastFaultIP 218 // At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't 219 // be updated by s.Unmap(). This is fine; we only need to synchronize with 220 // calls to s.Unmap() that occur after the handling of this fault. 221 c.lastFaultSP = faultSP 222 c.lastFaultAddr = faultAddr 223 c.lastFaultIP = faultIP 224 c.mu.Unlock() 225 226 // Update subprocesses to reflect the outcome of this context switch. 227 if lastFaultSP != faultSP { 228 if lastFaultSP != nil { 229 lastFaultSP.mu.Lock() 230 delete(lastFaultSP.faultedContexts, c) 231 lastFaultSP.mu.Unlock() 232 } 233 if faultSP != nil { 234 faultSP.mu.Lock() 235 faultSP.faultedContexts[c] = struct{}{} 236 faultSP.mu.Unlock() 237 } 238 } 239 240 if isSyscall { 241 return nil, hostarch.NoAccess, nil 242 } 243 244 si := c.signalInfo 245 if faultSP == nil { 246 // Non-fault signal. 247 return &si, hostarch.NoAccess, platform.ErrContextSignal 248 } 249 250 // See if this can be handled as a CPUID exception. 251 if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) { 252 goto restart 253 } 254 255 // Got a page fault. Ideally, we'd get real fault type here, but ptrace 256 // doesn't expose this information. Instead, we use a simple heuristic: 257 // 258 // It was an instruction fault iff the faulting addr == instruction 259 // pointer. 260 // 261 // It was a write fault if the fault is immediately repeated. 262 at := hostarch.Read 263 if faultAddr == faultIP { 264 at.Execute = true 265 } 266 if lastFaultSP == faultSP && 267 lastFaultAddr == faultAddr && 268 lastFaultIP == faultIP { 269 at.Write = true 270 } 271 272 // Handle as a signal. 273 return &si, at, platform.ErrContextSignal 274 } 275 276 // Interrupt interrupts the running guest application associated with this platformContext. 
277 func (c *platformContext) Interrupt() { 278 c.interrupt.NotifyInterrupt() 279 } 280 281 // Release releases all platform resources used by the platformContext. 282 func (c *platformContext) Release() { 283 if c.sharedContext != nil { 284 c.sharedContext.release() 285 c.sharedContext = nil 286 } 287 } 288 289 // PrepareSleep implements platform.Context.platform.PrepareSleep. 290 func (c *platformContext) PrepareSleep() { 291 ctx := c.sharedContext 292 if ctx == nil { 293 return 294 } 295 if !ctx.sleeping { 296 ctx.sleeping = true 297 ctx.subprocess.decAwakeContexts() 298 } 299 } 300 301 // Systrap represents a collection of seccomp subprocesses. 302 type Systrap struct { 303 platform.NoCPUPreemptionDetection 304 platform.UseHostGlobalMemoryBarrier 305 platform.DoesNotOwnPageTables 306 307 // memoryFile is used to create a stub sysmsg stack 308 // which is shared with the Sentry. 309 memoryFile *pgalloc.MemoryFile 310 } 311 312 // MinUserAddress implements platform.MinUserAddress. 313 func (*Systrap) MinUserAddress() hostarch.Addr { 314 return platform.SystemMMapMinAddr() 315 } 316 317 // New returns a new seccomp-based implementation of the platform interface. 318 func New() (*Systrap, error) { 319 if maxSysmsgThreads == 0 { 320 // CPUID information has been initialized at this point. 321 archState.Init() 322 // GOMAXPROCS has been set at this point. 323 maxSysmsgThreads = runtime.GOMAXPROCS(0) 324 // Account for syscall thread. 325 maxChildThreads = maxSysmsgThreads + 1 326 } 327 328 mf, err := createMemoryFile() 329 if err != nil { 330 return nil, err 331 } 332 333 stubInitialized.Do(func() { 334 // Don't use sentry and stub fast paths if here is just one cpu. 335 neverEnableFastPath = min(runtime.NumCPU(), runtime.GOMAXPROCS(0)) == 1 336 337 // Initialize the stub. 338 stubInit() 339 340 // Create the source process for the global pool. This must be 341 // done before initializing any other processes. 
342 source, err := newSubprocess(createStub, mf, false) 343 if err != nil { 344 // Should never happen. 345 panic("unable to initialize systrap source: " + err.Error()) 346 } 347 // The source subprocess is never released explicitly by a MM. 348 source.DecRef(nil) 349 350 globalPool.source = source 351 352 initSysmsgThreadPriority() 353 }) 354 355 latencyMonitoring.Do(func() { 356 go controlFastPath() 357 }) 358 359 return &Systrap{memoryFile: mf}, nil 360 } 361 362 // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. 363 func (*Systrap) SupportsAddressSpaceIO() bool { 364 return false 365 } 366 367 // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. 368 func (*Systrap) CooperativelySchedulesAddressSpace() bool { 369 return false 370 } 371 372 // MapUnit implements platform.Platform.MapUnit. 373 func (*Systrap) MapUnit() uint64 { 374 // The host kernel manages page tables and arbitrary-sized mappings 375 // have effectively the same cost. 376 return 0 377 } 378 379 // MaxUserAddress returns the first address that may not be used by user 380 // applications. 381 func (*Systrap) MaxUserAddress() hostarch.Addr { 382 return hostarch.Addr(maxStubUserAddress) 383 } 384 385 // NewAddressSpace returns a new subprocess. 386 func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) { 387 as, err := newSubprocess(globalPool.source.createStub, p.memoryFile, true) 388 return as, nil, err 389 } 390 391 // NewContext returns an interruptible platformContext. 
392 func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context { 393 return &platformContext{ 394 needRestoreFPState: true, 395 needToPullFullState: false, 396 } 397 } 398 399 type constructor struct{} 400 401 func (*constructor) New(_ *fd.FD) (platform.Platform, error) { 402 return New() 403 } 404 405 func (*constructor) OpenDevice(_ string) (*fd.FD, error) { 406 return nil, nil 407 } 408 409 // Requirements implements platform.Constructor.Requirements(). 410 func (*constructor) Requirements() platform.Requirements { 411 // TODO(b/75837838): Also set a new PID namespace so that we limit 412 // access to other host processes. 413 return platform.Requirements{ 414 RequiresCapSysPtrace: true, 415 RequiresCurrentPIDNS: true, 416 } 417 } 418 419 func init() { 420 platform.Register("systrap", &constructor{}) 421 } 422 423 func createMemoryFile() (*pgalloc.MemoryFile, error) { 424 const memfileName = "systrap-memory" 425 fd, err := memutil.CreateMemFD(memfileName, 0) 426 if err != nil { 427 return nil, fmt.Errorf("error creating memfd: %v", err) 428 } 429 memfile := os.NewFile(uintptr(fd), memfileName) 430 mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{ 431 EnforceMaximumAllocatable: true, 432 }) 433 if err != nil { 434 memfile.Close() 435 return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) 436 } 437 return mf, nil 438 } 439 440 func corruptedSharedMemoryErr(additional string) *platform.ContextError { 441 return &platform.ContextError{ 442 Err: fmt.Errorf("systrap corrupted memory: %s", additional), 443 Errno: unix.EPERM, 444 } 445 }