gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/systrap.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package systrap provides a seccomp-based implementation of the platform
    16  // interface.
    17  //
    18  // In a nutshell, it works as follows:
    19  //
    20  // The creation of a new address space creates a new child process.
    21  //
    22  // The creation of a new stub thread creates a new system thread with a
    23  // specified address space. To initialize this thread, the following action
    24  // will be done:
    25  //   - install a signal stack which is shared with the Sentry.
    26  //   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
    27  //     This signal handler is a key part of the systrap platform. Any stub event
    28  //     which has to be handled in privileged mode (by the Sentry) triggers one of
    29  //     the signals listed above. The signal handler runs on a separate stack which
    30  //     is shared with the Sentry. The sysmsg structure is used to synchronize the
    31  //     Sentry and a stub thread.
    32  //   - install seccomp filters to trap user system calls.
    33  //   - send a fake SIGSEGV to stop the thread in the signal handler.
    34  //
    35  // A platformContext is just a collection of temporary variables. Calling Switch on a
    36  // platformContext does the following:
    37  //
    38  //	Set up proper registers and an FPU state on a stub signal frame.
    39  //	Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE.
    40  //	Wait for new stub event by polling sysmsg->stage.
    41  //
    42  // Lock order:
    43  //
    44  //	subprocessPool.mu
    45  //		subprocess.mu
    46  //			platformContext.mu
    47  //
    48  // +checkalignedignore
    49  package systrap
    50  
    51  import (
    52  	"fmt"
    53  	"os"
    54  	"runtime"
    55  	"sync"
    56  
    57  	"golang.org/x/sys/unix"
    58  	"gvisor.dev/gvisor/pkg/abi/linux"
    59  	pkgcontext "gvisor.dev/gvisor/pkg/context"
    60  	"gvisor.dev/gvisor/pkg/fd"
    61  	"gvisor.dev/gvisor/pkg/hostarch"
    62  	"gvisor.dev/gvisor/pkg/memutil"
    63  	"gvisor.dev/gvisor/pkg/sentry/arch"
    64  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    65  	"gvisor.dev/gvisor/pkg/sentry/platform"
    66  	"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
    67  	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg"
    68  	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap"
    69  )
    70  
    71  var (
    72  	// stubStart is the link address for our stub, and determines the
    73  	// maximum user address. This is valid only after a call to stubInit.
    74  	//
    75  	// We attempt to link the stub here, and adjust downward as needed.
    76  	stubStart uintptr = stubInitAddress
    77  
    78  	stubInitProcess uintptr
    79  
    80  	// Memory region to store thread specific stacks.
    81  	stubSysmsgStack uintptr
    82  	stubSysmsgStart uintptr
    83  	stubSysmsgEnd   uintptr
    84  	// Memory region to store the contextQueue.
    85  	stubContextQueueRegion    uintptr
    86  	stubContextQueueRegionLen uintptr
    87  	// Memory region to store instances of sysmsg.ThreadContext.
    88  	stubContextRegion    uintptr
    89  	stubContextRegionLen uintptr
    90  	// The memory blob with precompiled seccomp rules.
    91  	stubSysmsgRules     uintptr
    92  	stubSysmsgRulesLen  uintptr
    93  	stubSyscallRules    uintptr
    94  	stubSyscallRulesLen uintptr
    95  
    96  	stubSpinningThreadQueueAddr uintptr
    97  	stubSpinningThreadQueueSize uintptr
    98  
    99  	// stubROMapEnd is the end address of the read-only stub region that
   100  	// contains the code and precompiled seccomp rules.
   101  	stubROMapEnd uintptr
   102  
   103  	// stubEnd is the first byte past the end of the stub, as with
   104  	// stubStart this is valid only after a call to stubInit.
   105  	stubEnd uintptr
   106  
   107  	// stubInitialized controls one-time stub initialization.
   108  	stubInitialized sync.Once
   109  
   110  	// latencyMonitoring controls one-time initialization of the fastpath
   111  	// control goroutine.
   112  	latencyMonitoring sync.Once
   113  
   114  	// archState stores architecture-specific details used in the platform.
   115  	archState sysmsg.ArchState
   116  )
   117  
   118  // platformContext is an implementation of the platform context.
   119  type platformContext struct {
   120  	// signalInfo is the signal info, if and when a signal is received.
   121  	signalInfo linux.SignalInfo
   122  
   123  	// interrupt is the interrupt platformContext.
   124  	interrupt interrupt.Forwarder
   125  
   126  	// sharedContext is everything related to this platformContext that is resident in
   127  	// shared memory with the stub thread.
   128  	// sharedContext is only accessed on the Task goroutine, therefore it is not
   129  	// mutex protected.
   130  	sharedContext *sharedContext
   131  
   132  	// mu protects the following fields.
   133  	mu sync.Mutex
   134  
   135  	// If lastFaultSP is non-nil, the last platformContext switch was due to a fault
   136  	// received while executing lastFaultSP. Only platformContext.Switch may set
   137  	// lastFaultSP to a non-nil value.
   138  	lastFaultSP *subprocess
   139  
   140  	// lastFaultAddr is the last faulting address; this is only meaningful if
   141  	// lastFaultSP is non-nil.
   142  	lastFaultAddr hostarch.Addr
   143  
   144  	// lastFaultIP is the address of the last faulting instruction;
   145  	// this is also only meaningful if lastFaultSP is non-nil.
   146  	lastFaultIP hostarch.Addr
   147  
   148  	// needRestoreFPState indicates that the FPU state has been changed by
   149  	// the Sentry and has to be updated on the stub thread.
   150  	needRestoreFPState bool
   151  
   152  	// needToPullFullState indicates that the Sentry doesn't have a full
   153  	// state of the thread.
   154  	needToPullFullState bool
   155  }
   156  
   157  // PullFullState implements platform.Context.PullFullState.
   158  func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error {
   159  	if !c.needToPullFullState {
   160  		return nil
   161  	}
   162  	s := as.(*subprocess)
   163  	if err := s.PullFullState(c, ac); err != nil {
   164  		return err
   165  	}
   166  	c.needToPullFullState = false
   167  	return nil
   168  }
   169  
   170  // FullStateChanged implements platform.Context.FullStateChanged.
   171  func (c *platformContext) FullStateChanged() {
   172  	c.needRestoreFPState = true
   173  	c.needToPullFullState = false
   174  }
   175  
   176  // Switch runs the provided platformContext in the given address space.
   177  func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) {
   178  	as := mm.AddressSpace()
   179  	s := as.(*subprocess)
   180  	if err := s.activateContext(c); err != nil {
   181  		return nil, hostarch.NoAccess, err
   182  	}
   183  
   184  restart:
   185  	isSyscall, needPatch, err := s.switchToApp(c, ac)
   186  	if err != nil {
   187  		return nil, hostarch.NoAccess, err
   188  	}
   189  	if needPatch {
   190  		s.usertrap.PatchSyscall(ctx, ac, mm)
   191  	}
   192  	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL {
   193  		err := s.usertrap.HandleFault(ctx, ac, mm)
   194  		if err == usertrap.ErrFaultSyscall {
   195  			isSyscall = true
   196  		} else if err == usertrap.ErrFaultRestart {
   197  			goto restart
   198  		} else if err != nil {
   199  			ctx.Warningf("usertrap.HandleFault failed: %v", err)
   200  		}
   201  	}
   202  	var (
   203  		faultSP   *subprocess
   204  		faultAddr hostarch.Addr
   205  		faultIP   hostarch.Addr
   206  	)
   207  	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
   208  		faultSP = s
   209  		faultAddr = hostarch.Addr(c.signalInfo.Addr())
   210  		faultIP = hostarch.Addr(ac.IP())
   211  	}
   212  
   213  	// Update the platformContext to reflect the outcome of this context switch.
   214  	c.mu.Lock()
   215  	lastFaultSP := c.lastFaultSP
   216  	lastFaultAddr := c.lastFaultAddr
   217  	lastFaultIP := c.lastFaultIP
   218  	// At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't
   219  	// be updated by s.Unmap(). This is fine; we only need to synchronize with
   220  	// calls to s.Unmap() that occur after the handling of this fault.
   221  	c.lastFaultSP = faultSP
   222  	c.lastFaultAddr = faultAddr
   223  	c.lastFaultIP = faultIP
   224  	c.mu.Unlock()
   225  
   226  	// Update subprocesses to reflect the outcome of this context switch.
   227  	if lastFaultSP != faultSP {
   228  		if lastFaultSP != nil {
   229  			lastFaultSP.mu.Lock()
   230  			delete(lastFaultSP.faultedContexts, c)
   231  			lastFaultSP.mu.Unlock()
   232  		}
   233  		if faultSP != nil {
   234  			faultSP.mu.Lock()
   235  			faultSP.faultedContexts[c] = struct{}{}
   236  			faultSP.mu.Unlock()
   237  		}
   238  	}
   239  
   240  	if isSyscall {
   241  		return nil, hostarch.NoAccess, nil
   242  	}
   243  
   244  	si := c.signalInfo
   245  	if faultSP == nil {
   246  		// Non-fault signal.
   247  		return &si, hostarch.NoAccess, platform.ErrContextSignal
   248  	}
   249  
   250  	// See if this can be handled as a CPUID exception.
   251  	if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) {
   252  		goto restart
   253  	}
   254  
   255  	// Got a page fault. Ideally, we'd get real fault type here, but ptrace
   256  	// doesn't expose this information. Instead, we use a simple heuristic:
   257  	//
   258  	// It was an instruction fault iff the faulting addr == instruction
   259  	// pointer.
   260  	//
   261  	// It was a write fault if the fault is immediately repeated.
   262  	at := hostarch.Read
   263  	if faultAddr == faultIP {
   264  		at.Execute = true
   265  	}
   266  	if lastFaultSP == faultSP &&
   267  		lastFaultAddr == faultAddr &&
   268  		lastFaultIP == faultIP {
   269  		at.Write = true
   270  	}
   271  
   272  	// Handle as a signal.
   273  	return &si, at, platform.ErrContextSignal
   274  }
   275  
   276  // Interrupt interrupts the running guest application associated with this platformContext.
   277  func (c *platformContext) Interrupt() {
   278  	c.interrupt.NotifyInterrupt()
   279  }
   280  
   281  // Release releases all platform resources used by the platformContext.
   282  func (c *platformContext) Release() {
   283  	if c.sharedContext != nil {
   284  		c.sharedContext.release()
   285  		c.sharedContext = nil
   286  	}
   287  }
   288  
   289  // PrepareSleep implements platform.Context.platform.PrepareSleep.
   290  func (c *platformContext) PrepareSleep() {
   291  	ctx := c.sharedContext
   292  	if ctx == nil {
   293  		return
   294  	}
   295  	if !ctx.sleeping {
   296  		ctx.sleeping = true
   297  		ctx.subprocess.decAwakeContexts()
   298  	}
   299  }
   300  
   301  // Systrap represents a collection of seccomp subprocesses.
   302  type Systrap struct {
   303  	platform.NoCPUPreemptionDetection
   304  	platform.UseHostGlobalMemoryBarrier
   305  	platform.DoesNotOwnPageTables
   306  
   307  	// memoryFile is used to create a stub sysmsg stack
   308  	// which is shared with the Sentry.
   309  	memoryFile *pgalloc.MemoryFile
   310  }
   311  
   312  // MinUserAddress implements platform.MinUserAddress.
   313  func (*Systrap) MinUserAddress() hostarch.Addr {
   314  	return platform.SystemMMapMinAddr()
   315  }
   316  
   317  // New returns a new seccomp-based implementation of the platform interface.
   318  func New() (*Systrap, error) {
   319  	if maxSysmsgThreads == 0 {
   320  		// CPUID information has been initialized at this point.
   321  		archState.Init()
   322  		// GOMAXPROCS has been set at this point.
   323  		maxSysmsgThreads = runtime.GOMAXPROCS(0)
   324  		// Account for syscall thread.
   325  		maxChildThreads = maxSysmsgThreads + 1
   326  	}
   327  
   328  	mf, err := createMemoryFile()
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  
   333  	stubInitialized.Do(func() {
   334  		// Don't use sentry and stub fast paths if here is just one cpu.
   335  		neverEnableFastPath = min(runtime.NumCPU(), runtime.GOMAXPROCS(0)) == 1
   336  
   337  		// Initialize the stub.
   338  		stubInit()
   339  
   340  		// Create the source process for the global pool. This must be
   341  		// done before initializing any other processes.
   342  		source, err := newSubprocess(createStub, mf, false)
   343  		if err != nil {
   344  			// Should never happen.
   345  			panic("unable to initialize systrap source: " + err.Error())
   346  		}
   347  		// The source subprocess is never released explicitly by a MM.
   348  		source.DecRef(nil)
   349  
   350  		globalPool.source = source
   351  
   352  		initSysmsgThreadPriority()
   353  	})
   354  
   355  	latencyMonitoring.Do(func() {
   356  		go controlFastPath()
   357  	})
   358  
   359  	return &Systrap{memoryFile: mf}, nil
   360  }
   361  
   362  // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
   363  func (*Systrap) SupportsAddressSpaceIO() bool {
   364  	return false
   365  }
   366  
   367  // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
   368  func (*Systrap) CooperativelySchedulesAddressSpace() bool {
   369  	return false
   370  }
   371  
   372  // MapUnit implements platform.Platform.MapUnit.
   373  func (*Systrap) MapUnit() uint64 {
   374  	// The host kernel manages page tables and arbitrary-sized mappings
   375  	// have effectively the same cost.
   376  	return 0
   377  }
   378  
   379  // MaxUserAddress returns the first address that may not be used by user
   380  // applications.
   381  func (*Systrap) MaxUserAddress() hostarch.Addr {
   382  	return hostarch.Addr(maxStubUserAddress)
   383  }
   384  
   385  // NewAddressSpace returns a new subprocess.
   386  func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) {
   387  	as, err := newSubprocess(globalPool.source.createStub, p.memoryFile, true)
   388  	return as, nil, err
   389  }
   390  
   391  // NewContext returns an interruptible platformContext.
   392  func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context {
   393  	return &platformContext{
   394  		needRestoreFPState:  true,
   395  		needToPullFullState: false,
   396  	}
   397  }
   398  
   399  type constructor struct{}
   400  
   401  func (*constructor) New(_ *fd.FD) (platform.Platform, error) {
   402  	return New()
   403  }
   404  
   405  func (*constructor) OpenDevice(_ string) (*fd.FD, error) {
   406  	return nil, nil
   407  }
   408  
   409  // Requirements implements platform.Constructor.Requirements().
   410  func (*constructor) Requirements() platform.Requirements {
   411  	// TODO(b/75837838): Also set a new PID namespace so that we limit
   412  	// access to other host processes.
   413  	return platform.Requirements{
   414  		RequiresCapSysPtrace: true,
   415  		RequiresCurrentPIDNS: true,
   416  	}
   417  }
   418  
   419  func init() {
   420  	platform.Register("systrap", &constructor{})
   421  }
   422  
   423  func createMemoryFile() (*pgalloc.MemoryFile, error) {
   424  	const memfileName = "systrap-memory"
   425  	fd, err := memutil.CreateMemFD(memfileName, 0)
   426  	if err != nil {
   427  		return nil, fmt.Errorf("error creating memfd: %v", err)
   428  	}
   429  	memfile := os.NewFile(uintptr(fd), memfileName)
   430  	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{
   431  		EnforceMaximumAllocatable: true,
   432  	})
   433  	if err != nil {
   434  		memfile.Close()
   435  		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
   436  	}
   437  	return mf, nil
   438  }
   439  
   440  func corruptedSharedMemoryErr(additional string) *platform.ContextError {
   441  	return &platform.ContextError{
   442  		Err:   fmt.Errorf("systrap corrupted memory: %s", additional),
   443  		Errno: unix.EPERM,
   444  	}
   445  }