// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package systrap provides a seccomp-based implementation of the platform
// interface.
//
// In a nutshell, it works as follows:
//
// The creation of a new address space creates a new child process.
//
// The creation of a new stub thread creates a new system thread with a
// specified address space. To initialize this thread, the following actions
// are performed:
//   - install a signal stack which is shared with the Sentry.
//   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
//     This signal handler is a key part of the systrap platform. Any stub event
//     which has to be handled in a privileged mode (by the Sentry) triggers one
//     of the signals above. The signal handler runs on the separate stack which
//     is shared with the Sentry. There is the sysmsg structure to synchronize
//     the Sentry and a stub thread.
//   - install seccomp filters to trap user system calls.
//   - send a fake SIGSEGV to stop the thread in the signal handler.
//
// A platformContext is just a collection of temporary variables. Calling Switch on a
// platformContext does the following:
//
//	Set up proper registers and an FPU state on a stub signal frame.
//	Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE.
//	Wait for new stub event by polling sysmsg->stage.
//
// Lock order:
//
//	subprocessPool.mu
//	subprocess.mu
//	platformContext.mu
//
// +checkalignedignore
package systrap

import (
	"fmt"
	"os"
	"sync"

	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/abi/linux"
	pkgcontext "github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/memutil"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
	"github.com/metacubex/gvisor/pkg/sentry/platform/interrupt"
	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/usertrap"
)

var (
	// stubStart is the link address for our stub, and determines the
	// maximum user address. This is valid only after a call to stubInit.
	//
	// We attempt to link the stub here, and adjust downward as needed.
	stubStart uintptr = stubInitAddress

	// NOTE(review): undocumented in this file; presumably the address of the
	// stub's process-initialization entry point — confirm against stubInit.
	stubInitProcess uintptr

	// Memory region to store thread specific stacks.
	stubSysmsgStack uintptr
	stubSysmsgStart uintptr
	stubSysmsgEnd   uintptr
	// Memory region to store the contextQueue.
	stubContextQueueRegion    uintptr
	stubContextQueueRegionLen uintptr
	// Memory region to store instances of sysmsg.ThreadContext.
	stubContextRegion    uintptr
	stubContextRegionLen uintptr
	// The memory blob with precompiled seccomp rules.
	stubSysmsgRules    uintptr
	stubSysmsgRulesLen uintptr

	// NOTE(review): undocumented in this file; presumably the address and
	// size of the region holding the spinning-thread queue — confirm against
	// stubInit.
	stubSpinningThreadQueueAddr uintptr
	stubSpinningThreadQueueSize uintptr

	// stubROMapEnd is the end address of the read-only stub region that
	// contains the code and precompiled seccomp rules.
	stubROMapEnd uintptr

	// stubEnd is the first byte past the end of the stub, as with
	// stubStart this is valid only after a call to stubInit.
	stubEnd uintptr

	// stubInitialized controls one-time stub initialization.
	stubInitialized sync.Once

	// latencyMonitoring controls one-time initialization of the fastpath
	// control goroutine.
	latencyMonitoring sync.Once

	// archState stores architecture-specific details used in the platform.
	archState sysmsg.ArchState
)

// platformContext is an implementation of the platform context.
type platformContext struct {
	// signalInfo is the signal info, if and when a signal is received.
	signalInfo linux.SignalInfo

	// interrupt is the interrupt platformContext.
	interrupt interrupt.Forwarder

	// sharedContext is everything related to this platformContext that is resident in
	// shared memory with the stub thread.
	// sharedContext is only accessed on the Task goroutine, therefore it is not
	// mutex protected.
	sharedContext *sharedContext

	// mu protects the following fields.
	//
	// NOTE(review): within this file only lastFaultSP, lastFaultAddr and
	// lastFaultIP are accessed under mu; needRestoreFPState and
	// needToPullFullState are read and written by PullFullState and
	// FullStateChanged without taking it. Confirm which fields mu is meant
	// to cover.
	mu sync.Mutex

	// If lastFaultSP is non-nil, the last platformContext switch was due to a fault
	// received while executing lastFaultSP. Only platformContext.Switch may set
	// lastFaultSP to a non-nil value.
	lastFaultSP *subprocess

	// lastFaultAddr is the last faulting address; this is only meaningful if
	// lastFaultSP is non-nil.
	lastFaultAddr hostarch.Addr

	// lastFaultIP is the address of the last faulting instruction;
	// this is also only meaningful if lastFaultSP is non-nil.
	lastFaultIP hostarch.Addr

	// needRestoreFPState indicates that the FPU state has been changed by
	// the Sentry and has to be updated on the stub thread.
	needRestoreFPState bool

	// needToPullFullState indicates that the Sentry doesn't have a full
	// state of the thread.
	needToPullFullState bool
}

// PullFullState implements platform.Context.PullFullState.
//
// It is a no-op unless needToPullFullState is set. Otherwise it delegates to
// subprocess.PullFullState and clears the flag only if that call succeeds.
// as must be a *subprocess; any other dynamic type makes the type assertion
// panic.
func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error {
	if !c.needToPullFullState {
		return nil
	}
	s := as.(*subprocess)
	if err := s.PullFullState(c, ac); err != nil {
		return err
	}
	c.needToPullFullState = false
	return nil
}

// FullStateChanged implements platform.Context.FullStateChanged.
//
// It marks the FPU state as needing to be restored on the stub thread and
// clears any pending request to pull the full state.
func (c *platformContext) FullStateChanged() {
	c.needRestoreFPState = true
	c.needToPullFullState = false
}

// Switch runs the provided platformContext in the given address space.
//
// mm.AddressSpace() must return a *subprocess; any other dynamic type makes
// the type assertion panic. The cpu argument is not used by this
// implementation.
//
// Return values, as established by the code below:
//   - (nil, hostarch.NoAccess, err) if activating the context or switching to
//     the application fails.
//   - (nil, hostarch.NoAccess, nil) if the stop is treated as a system call.
//   - (&si, hostarch.NoAccess, platform.ErrContextSignal) for a signal other
//     than SIGSEGV.
//   - (&si, at, platform.ErrContextSignal) for a SIGSEGV, where at starts as
//     hostarch.Read and gains Execute/Write according to the heuristic
//     documented near the end of the function.
func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) {
	as := mm.AddressSpace()
	s := as.(*subprocess)
	if err := s.activateContext(c); err != nil {
		return nil, hostarch.NoAccess, err
	}

	// Every "goto restart" below re-enters the application from here.
restart:
	isSyscall, needPatch, err := s.switchToApp(c, ac)
	if err != nil {
		return nil, hostarch.NoAccess, err
	}
	if needPatch {
		// The local variable restart is distinct from the label of the same
		// name (labels live in their own namespace). The second result of
		// PatchSyscall is deliberately discarded here.
		restart, _ := s.usertrap.PatchSyscall(ctx, ac, mm)
		if restart {
			goto restart
		}
	}
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL {
		// Give usertrap a chance to interpret the SIGILL: it may turn out to
		// be a system call, require a restart, or fail (logged, then the
		// signal is handled normally below).
		err := s.usertrap.HandleFault(ctx, ac, mm)
		if err == usertrap.ErrFaultSyscall {
			isSyscall = true
		} else if err == usertrap.ErrFaultRestart {
			goto restart
		} else if err != nil {
			ctx.Warningf("usertrap.HandleFault failed: %v", err)
		}
	}
	var (
		faultSP   *subprocess
		faultAddr hostarch.Addr
		faultIP   hostarch.Addr
	)
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
		faultSP = s
		faultAddr = hostarch.Addr(c.signalInfo.Addr())
		faultIP = hostarch.Addr(ac.IP())
	}

	// Update the platformContext to reflect the outcome of this context switch.
	c.mu.Lock()
	lastFaultSP := c.lastFaultSP
	lastFaultAddr := c.lastFaultAddr
	lastFaultIP := c.lastFaultIP
	// At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't
	// be updated by s.Unmap(). This is fine; we only need to synchronize with
	// calls to s.Unmap() that occur after the handling of this fault.
	c.lastFaultSP = faultSP
	c.lastFaultAddr = faultAddr
	c.lastFaultIP = faultIP
	c.mu.Unlock()

	// Update subprocesses to reflect the outcome of this context switch.
	// c.mu has been released above, so it is never held together with a
	// subprocess mutex here.
	if lastFaultSP != faultSP {
		if lastFaultSP != nil {
			lastFaultSP.mu.Lock()
			delete(lastFaultSP.faultedContexts, c)
			lastFaultSP.mu.Unlock()
		}
		if faultSP != nil {
			faultSP.mu.Lock()
			faultSP.faultedContexts[c] = struct{}{}
			faultSP.mu.Unlock()
		}
	}

	if isSyscall {
		return nil, hostarch.NoAccess, nil
	}

	// Return a copy so the caller does not alias c.signalInfo.
	si := c.signalInfo
	if faultSP == nil {
		// Non-fault signal.
		return &si, hostarch.NoAccess, platform.ErrContextSignal
	}

	// See if this can be handled as a CPUID exception.
	//
	// NOTE(review): faultSP is only non-nil when the signal is SIGSEGV, so
	// the signal check below looks redundant at this point — confirm.
	if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) {
		goto restart
	}

	// Got a page fault. Ideally, we'd get real fault type here, but ptrace
	// doesn't expose this information. Instead, we use a simple heuristic:
	//
	// It was an instruction fault iff the faulting addr == instruction
	// pointer.
	//
	// It was a write fault if the fault is immediately repeated.
	at := hostarch.Read
	if faultAddr == faultIP {
		at.Execute = true
	}
	if lastFaultSP == faultSP &&
		lastFaultAddr == faultAddr &&
		lastFaultIP == faultIP {
		at.Write = true
	}

	// Handle as a signal.
	return &si, at, platform.ErrContextSignal
}

// Interrupt interrupts the running guest application associated with this platformContext.
276 func (c *platformContext) Interrupt() { 277 c.interrupt.NotifyInterrupt() 278 } 279 280 // Release releases all platform resources used by the platformContext. 281 func (c *platformContext) Release() { 282 if c.sharedContext != nil { 283 c.sharedContext.release() 284 c.sharedContext = nil 285 } 286 } 287 288 // PrepareSleep implements platform.Context.platform.PrepareSleep. 289 func (c *platformContext) PrepareSleep() { 290 ctx := c.sharedContext 291 if ctx == nil { 292 return 293 } 294 if !ctx.sleeping { 295 ctx.sleeping = true 296 ctx.subprocess.decAwakeContexts() 297 } 298 } 299 300 // Systrap represents a collection of seccomp subprocesses. 301 type Systrap struct { 302 platform.NoCPUPreemptionDetection 303 platform.UseHostGlobalMemoryBarrier 304 platform.DoesNotOwnPageTables 305 306 // memoryFile is used to create a stub sysmsg stack 307 // which is shared with the Sentry. 308 memoryFile *pgalloc.MemoryFile 309 } 310 311 // MinUserAddress implements platform.MinUserAddress. 312 func (*Systrap) MinUserAddress() hostarch.Addr { 313 return platform.SystemMMapMinAddr() 314 } 315 316 // New returns a new seccomp-based implementation of the platform interface. 317 func New() (*Systrap, error) { 318 // CPUID information has been initialized at this point. 319 archState.Init() 320 321 mf, err := createMemoryFile() 322 if err != nil { 323 return nil, err 324 } 325 326 stubInitialized.Do(func() { 327 // Initialize the stub. 328 stubInit() 329 330 // Create the source process for the global pool. This must be 331 // done before initializing any other processes. 332 source, err := newSubprocess(createStub, mf) 333 if err != nil { 334 // Should never happen. 335 panic("unable to initialize systrap source: " + err.Error()) 336 } 337 // The source subprocess is never released explicitly by a MM. 
338 source.DecRef(nil) 339 340 globalPool.source = source 341 342 initSysmsgThreadPriority() 343 }) 344 345 latencyMonitoring.Do(func() { 346 go controlFastPath() 347 }) 348 349 return &Systrap{memoryFile: mf}, nil 350 } 351 352 // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. 353 func (*Systrap) SupportsAddressSpaceIO() bool { 354 return false 355 } 356 357 // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. 358 func (*Systrap) CooperativelySchedulesAddressSpace() bool { 359 return false 360 } 361 362 // MapUnit implements platform.Platform.MapUnit. 363 func (*Systrap) MapUnit() uint64 { 364 // The host kernel manages page tables and arbitrary-sized mappings 365 // have effectively the same cost. 366 return 0 367 } 368 369 // MaxUserAddress returns the first address that may not be used by user 370 // applications. 371 func (*Systrap) MaxUserAddress() hostarch.Addr { 372 return hostarch.Addr(maxStubUserAddress) 373 } 374 375 // NewAddressSpace returns a new subprocess. 376 func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) { 377 as, err := newSubprocess(globalPool.source.createStub, p.memoryFile) 378 return as, nil, err 379 } 380 381 // NewContext returns an interruptible platformContext. 382 func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context { 383 return &platformContext{ 384 needRestoreFPState: true, 385 needToPullFullState: false, 386 } 387 } 388 389 type constructor struct{} 390 391 func (*constructor) New(_ *os.File) (platform.Platform, error) { 392 return New() 393 } 394 395 func (*constructor) OpenDevice(_ string) (*os.File, error) { 396 return nil, nil 397 } 398 399 // Requirements implements platform.Constructor.Requirements(). 400 func (*constructor) Requirements() platform.Requirements { 401 // TODO(b/75837838): Also set a new PID namespace so that we limit 402 // access to other host processes. 
403 return platform.Requirements{ 404 RequiresCapSysPtrace: true, 405 RequiresCurrentPIDNS: true, 406 } 407 } 408 409 func init() { 410 platform.Register("systrap", &constructor{}) 411 } 412 413 func createMemoryFile() (*pgalloc.MemoryFile, error) { 414 const memfileName = "systrap-memory" 415 fd, err := memutil.CreateMemFD(memfileName, 0) 416 if err != nil { 417 return nil, fmt.Errorf("error creating memfd: %v", err) 418 } 419 memfile := os.NewFile(uintptr(fd), memfileName) 420 mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) 421 if err != nil { 422 memfile.Close() 423 return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) 424 } 425 return mf, nil 426 } 427 428 func corruptedSharedMemoryErr(additional string) *platform.ContextError { 429 return &platform.ContextError{ 430 Err: fmt.Errorf("systrap corrupted memory: %s", additional), 431 Errno: unix.EPERM, 432 } 433 }