// Source: github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/systrap.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package systrap provides a seccomp-based implementation of the platform
// interface.
//
// In a nutshell, it works as follows:
//
// The creation of a new address space creates a new child process.
//
// The creation of a new stub thread creates a new system thread with a
// specified address space. To initialize this thread, the following actions
// are performed:
//   - install a signal stack which is shared with the Sentry.
//   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
//     This signal handler is a key part of the systrap platform. Any stub event
//     which has to be handled in a privileged mode (by the Sentry) triggers one
//     of the signals above. The signal handler runs on a separate stack which
//     is shared with the Sentry. The sysmsg structure is used to synchronize
//     the Sentry and a stub thread.
//   - install seccomp filters to trap user system calls.
//   - send a fake SIGSEGV to stop the thread in the signal handler.
//
// A context is just a collection of temporary variables. Calling Switch on a
// context does the following:
//
//	Set up proper registers and an FPU state on a stub signal frame.
//	Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE.
//	Wait for a new stub event by polling sysmsg->stage.
//
// Lock order:
//
//	subprocessPool.mu
//		subprocess.mu
//			context.mu
//
// +checkalignedignore
package systrap

import (
	"fmt"
	"os"
	"sync"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	pkgcontext "github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/memutil"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/interrupt"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/usertrap"
)

var (
	// stubStart is the link address for our stub, and determines the
	// maximum user address. This is valid only after a call to stubInit.
	//
	// We attempt to link the stub here, and adjust downward as needed.
	stubStart uintptr = stubInitAddress

	// NOTE(review): stubInitProcess is not documented in this file;
	// presumably it is an address inside the stub that is filled in by
	// stubInit like the other stub* variables — confirm.
	stubInitProcess uintptr

	// Memory region to store thread specific stacks.
	stubSysmsgStack uintptr
	stubSysmsgStart uintptr
	stubSysmsgEnd   uintptr
	// Memory region to store the contextQueue.
	stubContextQueueRegion    uintptr
	stubContextQueueRegionLen uintptr
	// Memory region to store instances of sysmsg.ThreadContext.
	stubContextRegion    uintptr
	stubContextRegionLen uintptr
	// The memory blob with precompiled seccomp rules.
	stubSysmsgRules    uintptr
	stubSysmsgRulesLen uintptr

	// NOTE(review): judging by the names, the address and size of the memory
	// region holding the spinning thread queue — confirm where these are set.
	stubSpinningThreadQueueAddr uintptr
	stubSpinningThreadQueueSize uintptr

	// stubROMapEnd is the end address of the read-only stub region that
	// contains the code and precompiled seccomp rules.
	stubROMapEnd uintptr

	// stubEnd is the first byte past the end of the stub, as with
	// stubStart this is valid only after a call to stubInit.
	stubEnd uintptr

	// stubInitialized controls one-time stub initialization.
	stubInitialized sync.Once

	// archState stores architecture-specific details used in the platform.
	archState sysmsg.ArchState
)

// context is an implementation of the platform context.
type context struct {
	// signalInfo is the signal info, if and when a signal is received.
	signalInfo linux.SignalInfo

	// interrupt is the interrupt context.
	interrupt interrupt.Forwarder

	// sharedContext is everything related to this context that is resident in
	// shared memory with the stub thread.
	// sharedContext is only accessed on the Task goroutine, therefore it is not
	// mutex protected.
	sharedContext *sharedContext

	// mu protects the following fields.
	//
	// In this file, Switch holds mu while it reads and updates lastFaultSP,
	// lastFaultAddr and lastFaultIP. needRestoreFPState and
	// needToPullFullState are written by FullStateChanged and PullFullState
	// without taking mu.
	mu sync.Mutex

	// If lastFaultSP is non-nil, the last context switch was due to a fault
	// received while executing lastFaultSP. Only context.Switch may set
	// lastFaultSP to a non-nil value.
	lastFaultSP *subprocess

	// lastFaultAddr is the last faulting address; this is only meaningful if
	// lastFaultSP is non-nil.
	lastFaultAddr hostarch.Addr

	// lastFaultIP is the address of the last faulting instruction;
	// this is also only meaningful if lastFaultSP is non-nil.
	lastFaultIP hostarch.Addr

	// needRestoreFPState indicates that the FPU state has been changed by
	// the Sentry and has to be updated on the stub thread.
	needRestoreFPState bool

	// needToPullFullState indicates that the Sentry doesn't have a full
	// state of the thread.
	needToPullFullState bool
}

// PullFullState implements platform.Context.PullFullState.
func (c *context) PullFullState(as platform.AddressSpace, ac *arch.Context64) error {
	// Nothing to do when the Sentry already holds the full thread state.
	if !c.needToPullFullState {
		return nil
	}
	// as is expected to be a *subprocess; any other type makes this
	// unchecked assertion panic.
	s := as.(*subprocess)
	if err := s.PullFullState(c, ac); err != nil {
		return err
	}
	// Clear the flag only after a successful pull so a failed attempt is
	// retried on the next call.
	c.needToPullFullState = false
	return nil
}

// FullStateChanged implements platform.Context.FullStateChanged.
func (c *context) FullStateChanged() {
	// Flag the FPU state for restore on the stub thread, and make any
	// subsequent PullFullState call a no-op.
	c.needRestoreFPState = true
	c.needToPullFullState = false
}

// Switch runs the provided context in the given address space.
//
// The address space of mm must be a *subprocess. The cpu argument is not
// used by this implementation.
//
// Return values:
//   - (nil, hostarch.NoAccess, nil) when the application stopped on a system
//     call.
//   - (info, hostarch.NoAccess, platform.ErrContextSignal) when it stopped on
//     a signal that is not treated as a fault.
//   - (info, accessType, platform.ErrContextSignal) when it stopped on a
//     SIGSEGV fault; accessType is a heuristic guess (see below).
//   - (nil, hostarch.NoAccess, err) when activating the context or switching
//     to the application fails.
func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) {
	as := mm.AddressSpace()
	s := as.(*subprocess)
	if err := s.activateContext(c); err != nil {
		return nil, hostarch.NoAccess, err
	}

restart:
	isSyscall, needPatch, err := s.switchToApp(c, ac)
	if err != nil {
		return nil, hostarch.NoAccess, err
	}
	if needPatch {
		// NOTE(review): the error from PatchSyscall is discarded; presumably
		// patching is a best-effort optimization — confirm.
		restart, _ := s.usertrap.PatchSyscall(ctx, ac, mm)
		if restart {
			goto restart
		}
	}
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL {
		// Give usertrap a chance to interpret the SIGILL: it may turn out to
		// be a system call, or ask for the application to be resumed. Any
		// other error is only logged and the SIGILL is delivered as a signal.
		err := s.usertrap.HandleFault(ctx, ac, mm)
		if err == usertrap.ErrFaultSyscall {
			isSyscall = true
		} else if err == usertrap.ErrFaultRestart {
			goto restart
		} else if err != nil {
			ctx.Warningf("usertrap.HandleFault failed: %v", err)
		}
	}
	// The fault variables stay at their zero values unless this switch ended
	// with a SIGSEGV that is not a system call.
	var (
		faultSP   *subprocess
		faultAddr hostarch.Addr
		faultIP   hostarch.Addr
	)
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
		faultSP = s
		faultAddr = hostarch.Addr(c.signalInfo.Addr())
		faultIP = hostarch.Addr(ac.IP())
	}

	// Update the context to reflect the outcome of this context switch.
	// The previous fault state is saved first: it is needed below both for
	// the faultedContexts bookkeeping and for the write-fault heuristic.
	c.mu.Lock()
	lastFaultSP := c.lastFaultSP
	lastFaultAddr := c.lastFaultAddr
	lastFaultIP := c.lastFaultIP
	// At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't
	// be updated by s.Unmap(). This is fine; we only need to synchronize with
	// calls to s.Unmap() that occur after the handling of this fault.
	c.lastFaultSP = faultSP
	c.lastFaultAddr = faultAddr
	c.lastFaultIP = faultIP
	c.mu.Unlock()

	// Update subprocesses to reflect the outcome of this context switch.
	// c.mu has already been released, so subprocess.mu is never acquired
	// while c.mu is held here.
	if lastFaultSP != faultSP {
		if lastFaultSP != nil {
			lastFaultSP.mu.Lock()
			delete(lastFaultSP.faultedContexts, c)
			lastFaultSP.mu.Unlock()
		}
		if faultSP != nil {
			faultSP.mu.Lock()
			faultSP.faultedContexts[c] = struct{}{}
			faultSP.mu.Unlock()
		}
	}

	if isSyscall {
		return nil, hostarch.NoAccess, nil
	}

	// Return a pointer to a copy so the caller does not alias c.signalInfo.
	si := c.signalInfo
	if faultSP == nil {
		// Non-fault signal.
		return &si, hostarch.NoAccess, platform.ErrContextSignal
	}

	// See if this can be handled as a CPUID exception.
	if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) {
		goto restart
	}

	// Got a page fault. Ideally, we'd get real fault type here, but ptrace
	// doesn't expose this information. Instead, we use a simple heuristic:
	//
	// It was an instruction fault iff the faulting addr == instruction
	// pointer.
	//
	// It was a write fault if the fault is immediately repeated.
	at := hostarch.Read
	if faultAddr == faultIP {
		at.Execute = true
	}
	if lastFaultSP == faultSP &&
		lastFaultAddr == faultAddr &&
		lastFaultIP == faultIP {
		at.Write = true
	}

	// Handle as a signal.
	return &si, at, platform.ErrContextSignal
}

// Interrupt interrupts the running guest application associated with this context.
271 func (c *context) Interrupt() { 272 c.interrupt.NotifyInterrupt() 273 } 274 275 // Release releases all platform resources used by the context. 276 func (c *context) Release() { 277 if c.sharedContext != nil { 278 c.sharedContext.release() 279 c.sharedContext = nil 280 } 281 } 282 283 // PrepareSleep implements platform.Context.platform.PrepareSleep. 284 func (c *context) PrepareSleep() { 285 ctx := c.sharedContext 286 if ctx == nil { 287 return 288 } 289 if !ctx.sleeping { 290 ctx.sleeping = true 291 ctx.subprocess.decAwakeContexts() 292 } 293 } 294 295 // Systrap represents a collection of seccomp subprocesses. 296 type Systrap struct { 297 platform.NoCPUPreemptionDetection 298 platform.UseHostGlobalMemoryBarrier 299 platform.DoesNotOwnPageTables 300 301 // memoryFile is used to create a stub sysmsg stack 302 // which is shared with the Sentry. 303 memoryFile *pgalloc.MemoryFile 304 } 305 306 // MinUserAddress implements platform.MinUserAddress. 307 func (*Systrap) MinUserAddress() hostarch.Addr { 308 return platform.SystemMMapMinAddr() 309 } 310 311 // New returns a new seccomp-based implementation of the platform interface. 312 func New() (*Systrap, error) { 313 // CPUID information has been initialized at this point. 314 archState.Init() 315 316 mf, err := createMemoryFile() 317 if err != nil { 318 return nil, err 319 } 320 321 stubInitialized.Do(func() { 322 // Initialize the stub. 323 stubInit() 324 325 // Create the source process for the global pool. This must be 326 // done before initializing any other processes. 327 source, err := newSubprocess(createStub, mf) 328 if err != nil { 329 // Should never happen. 330 panic("unable to initialize systrap source: " + err.Error()) 331 } 332 // The source subprocess is never released explicitly by a MM. 
333 source.DecRef(nil) 334 335 globalPool.source = source 336 337 initSysmsgThreadPriority() 338 }) 339 340 return &Systrap{memoryFile: mf}, nil 341 } 342 343 // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. 344 func (*Systrap) SupportsAddressSpaceIO() bool { 345 return false 346 } 347 348 // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. 349 func (*Systrap) CooperativelySchedulesAddressSpace() bool { 350 return false 351 } 352 353 // MapUnit implements platform.Platform.MapUnit. 354 func (*Systrap) MapUnit() uint64 { 355 // The host kernel manages page tables and arbitrary-sized mappings 356 // have effectively the same cost. 357 return 0 358 } 359 360 // MaxUserAddress returns the first address that may not be used by user 361 // applications. 362 func (*Systrap) MaxUserAddress() hostarch.Addr { 363 return hostarch.Addr(maxStubUserAddress) 364 } 365 366 // NewAddressSpace returns a new subprocess. 367 func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) { 368 as, err := newSubprocess(globalPool.source.createStub, p.memoryFile) 369 return as, nil, err 370 } 371 372 // NewContext returns an interruptible context. 373 func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context { 374 return &context{ 375 needRestoreFPState: true, 376 needToPullFullState: false, 377 } 378 } 379 380 type constructor struct{} 381 382 func (*constructor) New(_ *os.File) (platform.Platform, error) { 383 return New() 384 } 385 386 func (*constructor) OpenDevice(_ string) (*os.File, error) { 387 return nil, nil 388 } 389 390 // Requirements implements platform.Constructor.Requirements(). 391 func (*constructor) Requirements() platform.Requirements { 392 // TODO(b/75837838): Also set a new PID namespace so that we limit 393 // access to other host processes. 
394 return platform.Requirements{ 395 RequiresCapSysPtrace: true, 396 RequiresCurrentPIDNS: true, 397 } 398 } 399 400 func init() { 401 platform.Register("systrap", &constructor{}) 402 } 403 404 func createMemoryFile() (*pgalloc.MemoryFile, error) { 405 const memfileName = "systrap-memory" 406 fd, err := memutil.CreateMemFD(memfileName, 0) 407 if err != nil { 408 return nil, fmt.Errorf("error creating memfd: %v", err) 409 } 410 memfile := os.NewFile(uintptr(fd), memfileName) 411 mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) 412 if err != nil { 413 memfile.Close() 414 return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) 415 } 416 return mf, nil 417 }