github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/systrap/systrap.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package systrap provides a seccomp-based implementation of the platform
    16  // interface.
    17  //
    18  // In a nutshell, it works as follows:
    19  //
    20  // The creation of a new address space creates a new child process.
    21  //
    22  // The creation of a new stub thread creates a new system thread with a
    23  // specified address space. To initialize this thread, the following actions
    24  // are performed:
    25  //   - install a signal stack which is shared with the Sentry.
    26  //   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
    27  //     This signal handler is a key part of the systrap platform. Any stub event
    28  //     which has to be handled in privileged mode (by the Sentry) triggers one of
    29  //     the signals above. The signal handler runs on the separate stack which
    30  //     is shared with the Sentry. The sysmsg structure is used to synchronize
    31  //     the Sentry and a stub thread.
    32  //   - install seccomp filters to trap user system calls.
    33  //   - send a fake SIGSEGV to stop the thread in the signal handler.
    34  //
    35  // A platformContext is just a collection of temporary variables. Calling Switch on a
    36  // platformContext does the following:
    37  //
    38  //	Set up proper registers and an FPU state on a stub signal frame.
    39  //	Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE.
    40  //	Wait for new stub event by polling sysmsg->stage.
    41  //
    42  // Lock order:
    43  //
    44  //	subprocessPool.mu
    45  //		subprocess.mu
    46  //			platformContext.mu
    47  //
    48  // +checkalignedignore
    49  package systrap
    50  
    51  import (
    52  	"fmt"
    53  	"os"
    54  	"sync"
    55  
    56  	"golang.org/x/sys/unix"
    57  	"github.com/metacubex/gvisor/pkg/abi/linux"
    58  	pkgcontext "github.com/metacubex/gvisor/pkg/context"
    59  	"github.com/metacubex/gvisor/pkg/hostarch"
    60  	"github.com/metacubex/gvisor/pkg/memutil"
    61  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    62  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    63  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    64  	"github.com/metacubex/gvisor/pkg/sentry/platform/interrupt"
    65  	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/sysmsg"
    66  	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/usertrap"
    67  )
    68  
    69  var (
    70  	// stubStart is the link address for our stub, and determines the
    71  	// maximum user address. This is valid only after a call to stubInit.
    72  	//
    73  	// We attempt to link the stub here, and adjust downward as needed.
    74  	stubStart uintptr = stubInitAddress
    75  
        	// NOTE(review): stubInitProcess is neither assigned nor read in the
        	// code visible here; presumably it is set up by stubInit — confirm.
    76  	stubInitProcess uintptr
    77  
    78  	// Memory region to store thread specific stacks.
    79  	stubSysmsgStack uintptr
    80  	stubSysmsgStart uintptr
    81  	stubSysmsgEnd   uintptr
    82  	// Memory region to store the contextQueue.
    83  	stubContextQueueRegion    uintptr
    84  	stubContextQueueRegionLen uintptr
    85  	// Memory region to store instances of sysmsg.ThreadContext.
    86  	stubContextRegion    uintptr
    87  	stubContextRegionLen uintptr
    88  	// The memory blob with precompiled seccomp rules.
    89  	stubSysmsgRules    uintptr
    90  	stubSysmsgRulesLen uintptr
    91  
        	// NOTE(review): presumably the address and size of the memory region
        	// holding the spinning thread queue — confirm against stubInit.
    92  	stubSpinningThreadQueueAddr uintptr
    93  	stubSpinningThreadQueueSize uintptr
    94  
    95  	// stubROMapEnd is the end address of the read-only stub region that
    96  	// contains the code and precompiled seccomp rules.
    97  	stubROMapEnd uintptr
    98  
    99  	// stubEnd is the first byte past the end of the stub. As with
   100  	// stubStart, this is valid only after a call to stubInit.
   101  	stubEnd uintptr
   102  
   103  	// stubInitialized controls one-time stub initialization (see New).
   104  	stubInitialized sync.Once
   105  
   106  	// latencyMonitoring controls one-time initialization of the fastpath
   107  	// control goroutine, which New starts by running controlFastPath.
   108  	latencyMonitoring sync.Once
   109  
   110  	// archState stores architecture-specific details used in the platform.
        	// New calls its Init method on every invocation.
   111  	archState sysmsg.ArchState
   112  )
   113  
   114  // platformContext is the platform.Context implementation returned by
        // Systrap.NewContext.
   115  type platformContext struct {
   116  	// signalInfo is the signal info, if and when a signal is received.
   117  	signalInfo linux.SignalInfo
   118  
   119  	// interrupt is the forwarder that Interrupt notifies via NotifyInterrupt.
   120  	interrupt interrupt.Forwarder
   121  
   122  	// sharedContext is everything related to this platformContext that is resident in
   123  	// shared memory with the stub thread.
   124  	// sharedContext is only accessed on the Task goroutine, therefore it is not
   125  	// mutex protected.
   126  	sharedContext *sharedContext
   127  
   128  	// mu protects the following fields.
        	//
        	// NOTE(review): Switch holds mu while updating lastFaultSP,
        	// lastFaultAddr and lastFaultIP, but PullFullState and FullStateChanged
        	// access needRestoreFPState and needToPullFullState without taking mu.
        	// Confirm whether the two flags are really meant to be covered.
   129  	mu sync.Mutex
   130  
   131  	// If lastFaultSP is non-nil, the last platformContext switch was due to a fault
   132  	// received while executing lastFaultSP. Only platformContext.Switch may set
   133  	// lastFaultSP to a non-nil value.
   134  	lastFaultSP *subprocess
   135  
   136  	// lastFaultAddr is the last faulting address; this is only meaningful if
   137  	// lastFaultSP is non-nil.
   138  	lastFaultAddr hostarch.Addr
   139  
   140  	// lastFaultIP is the address of the last faulting instruction;
   141  	// this is also only meaningful if lastFaultSP is non-nil.
   142  	lastFaultIP hostarch.Addr
   143  
   144  	// needRestoreFPState indicates that the FPU state has been changed by
   145  	// the Sentry and has to be updated on the stub thread.
   146  	needRestoreFPState bool
   147  
   148  	// needToPullFullState indicates that the Sentry doesn't have a full
   149  	// state of the thread.
   150  	needToPullFullState bool
   151  }
   152  
   153  // PullFullState implements platform.Context.PullFullState.
   154  func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error {
   155  	if !c.needToPullFullState {
   156  		return nil
   157  	}
   158  	s := as.(*subprocess)
   159  	if err := s.PullFullState(c, ac); err != nil {
   160  		return err
   161  	}
   162  	c.needToPullFullState = false
   163  	return nil
   164  }
   165  
   166  // FullStateChanged implements platform.Context.FullStateChanged.
   167  func (c *platformContext) FullStateChanged() {
   168  	c.needRestoreFPState = true
   169  	c.needToPullFullState = false
   170  }
   171  
   172  // Switch runs the provided platformContext in the given address space.
   173  func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) {
   174  	as := mm.AddressSpace()
   175  	s := as.(*subprocess)
   176  	if err := s.activateContext(c); err != nil {
   177  		return nil, hostarch.NoAccess, err
   178  	}
   179  
   180  restart:
   181  	isSyscall, needPatch, err := s.switchToApp(c, ac)
   182  	if err != nil {
   183  		return nil, hostarch.NoAccess, err
   184  	}
   185  	if needPatch {
   186  		restart, _ := s.usertrap.PatchSyscall(ctx, ac, mm)
   187  		if restart {
   188  			goto restart
   189  		}
   190  	}
   191  	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL {
   192  		err := s.usertrap.HandleFault(ctx, ac, mm)
   193  		if err == usertrap.ErrFaultSyscall {
   194  			isSyscall = true
   195  		} else if err == usertrap.ErrFaultRestart {
   196  			goto restart
   197  		} else if err != nil {
   198  			ctx.Warningf("usertrap.HandleFault failed: %v", err)
   199  		}
   200  	}
   201  	var (
   202  		faultSP   *subprocess
   203  		faultAddr hostarch.Addr
   204  		faultIP   hostarch.Addr
   205  	)
   206  	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
   207  		faultSP = s
   208  		faultAddr = hostarch.Addr(c.signalInfo.Addr())
   209  		faultIP = hostarch.Addr(ac.IP())
   210  	}
   211  
   212  	// Update the platformContext to reflect the outcome of this context switch.
   213  	c.mu.Lock()
   214  	lastFaultSP := c.lastFaultSP
   215  	lastFaultAddr := c.lastFaultAddr
   216  	lastFaultIP := c.lastFaultIP
   217  	// At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't
   218  	// be updated by s.Unmap(). This is fine; we only need to synchronize with
   219  	// calls to s.Unmap() that occur after the handling of this fault.
   220  	c.lastFaultSP = faultSP
   221  	c.lastFaultAddr = faultAddr
   222  	c.lastFaultIP = faultIP
   223  	c.mu.Unlock()
   224  
   225  	// Update subprocesses to reflect the outcome of this context switch.
   226  	if lastFaultSP != faultSP {
   227  		if lastFaultSP != nil {
   228  			lastFaultSP.mu.Lock()
   229  			delete(lastFaultSP.faultedContexts, c)
   230  			lastFaultSP.mu.Unlock()
   231  		}
   232  		if faultSP != nil {
   233  			faultSP.mu.Lock()
   234  			faultSP.faultedContexts[c] = struct{}{}
   235  			faultSP.mu.Unlock()
   236  		}
   237  	}
   238  
   239  	if isSyscall {
   240  		return nil, hostarch.NoAccess, nil
   241  	}
   242  
   243  	si := c.signalInfo
   244  	if faultSP == nil {
   245  		// Non-fault signal.
   246  		return &si, hostarch.NoAccess, platform.ErrContextSignal
   247  	}
   248  
   249  	// See if this can be handled as a CPUID exception.
   250  	if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) {
   251  		goto restart
   252  	}
   253  
   254  	// Got a page fault. Ideally, we'd get real fault type here, but ptrace
   255  	// doesn't expose this information. Instead, we use a simple heuristic:
   256  	//
   257  	// It was an instruction fault iff the faulting addr == instruction
   258  	// pointer.
   259  	//
   260  	// It was a write fault if the fault is immediately repeated.
   261  	at := hostarch.Read
   262  	if faultAddr == faultIP {
   263  		at.Execute = true
   264  	}
   265  	if lastFaultSP == faultSP &&
   266  		lastFaultAddr == faultAddr &&
   267  		lastFaultIP == faultIP {
   268  		at.Write = true
   269  	}
   270  
   271  	// Handle as a signal.
   272  	return &si, at, platform.ErrContextSignal
   273  }
   274  
   275  // Interrupt calls NotifyInterrupt on the interrupt.Forwarder owned by this platformContext.
   276  func (c *platformContext) Interrupt() {
   277  	c.interrupt.NotifyInterrupt()
   278  }
   279  
   280  // Release releases all platform resources used by the platformContext.
   281  func (c *platformContext) Release() {
   282  	if c.sharedContext != nil {
   283  		c.sharedContext.release()
   284  		c.sharedContext = nil
   285  	}
   286  }
   287  
   288  // PrepareSleep implements platform.Context.platform.PrepareSleep.
   289  func (c *platformContext) PrepareSleep() {
   290  	ctx := c.sharedContext
   291  	if ctx == nil {
   292  		return
   293  	}
   294  	if !ctx.sleeping {
   295  		ctx.sleeping = true
   296  		ctx.subprocess.decAwakeContexts()
   297  	}
   298  }
   299  
   300  // Systrap represents a collection of seccomp subprocesses.
   301  type Systrap struct {
   302  	platform.NoCPUPreemptionDetection
   303  	platform.UseHostGlobalMemoryBarrier
   304  	platform.DoesNotOwnPageTables
   305  
   306  	// memoryFile is used to create a stub sysmsg stack
   307  	// which is shared with the Sentry.
        	// It is created by New and handed to newSubprocess by NewAddressSpace.
   308  	memoryFile *pgalloc.MemoryFile
   309  }
   310  
   311  // MinUserAddress implements platform.MinUserAddress by returning platform.SystemMMapMinAddr().
   312  func (*Systrap) MinUserAddress() hostarch.Addr {
   313  	return platform.SystemMMapMinAddr()
   314  }
   315  
   316  // New returns a new seccomp-based implementation of the platform interface.
   317  func New() (*Systrap, error) {
   318  	// CPUID information has been initialized at this point.
   319  	archState.Init()
   320  
   321  	mf, err := createMemoryFile()
   322  	if err != nil {
   323  		return nil, err
   324  	}
   325  
   326  	stubInitialized.Do(func() {
   327  		// Initialize the stub.
   328  		stubInit()
   329  
   330  		// Create the source process for the global pool. This must be
   331  		// done before initializing any other processes.
   332  		source, err := newSubprocess(createStub, mf)
   333  		if err != nil {
   334  			// Should never happen.
   335  			panic("unable to initialize systrap source: " + err.Error())
   336  		}
   337  		// The source subprocess is never released explicitly by a MM.
   338  		source.DecRef(nil)
   339  
   340  		globalPool.source = source
   341  
   342  		initSysmsgThreadPriority()
   343  	})
   344  
   345  	latencyMonitoring.Do(func() {
   346  		go controlFastPath()
   347  	})
   348  
   349  	return &Systrap{memoryFile: mf}, nil
   350  }
   351  
   352  // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO; it always reports false.
   353  func (*Systrap) SupportsAddressSpaceIO() bool {
   354  	return false
   355  }
   356  
   357  // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace; it always reports false.
   358  func (*Systrap) CooperativelySchedulesAddressSpace() bool {
   359  	return false
   360  }
   361  
   362  // MapUnit implements platform.Platform.MapUnit; it always returns 0.
   363  func (*Systrap) MapUnit() uint64 {
   364  	// The host kernel manages page tables and arbitrary-sized mappings
   365  	// have effectively the same cost.
   366  	return 0
   367  }
   368  
   369  // MaxUserAddress returns the first address that may not be used by user
   370  // applications, i.e. maxStubUserAddress converted to hostarch.Addr.
   371  func (*Systrap) MaxUserAddress() hostarch.Addr {
   372  	return hostarch.Addr(maxStubUserAddress)
   373  }
   374  
   375  // NewAddressSpace returns a new subprocess.
   376  func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) {
   377  	as, err := newSubprocess(globalPool.source.createStub, p.memoryFile)
   378  	return as, nil, err
   379  }
   380  
   381  // NewContext returns an interruptible platformContext.
   382  func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context {
   383  	return &platformContext{
   384  		needRestoreFPState:  true,
   385  		needToPullFullState: false,
   386  	}
   387  }
   388  
        // constructor is the value that init registers with the platform package
        // under the name "systrap".
   389  type constructor struct{}
   390  
   391  func (*constructor) New(_ *os.File) (platform.Platform, error) {
   392  	return New()
   393  }
   394  
        // OpenDevice always returns a nil file and a nil error; the path argument is ignored.
   395  func (*constructor) OpenDevice(_ string) (*os.File, error) {
   396  	return nil, nil
   397  }
   398  
   399  // Requirements implements platform.Constructor.Requirements().
   400  func (*constructor) Requirements() platform.Requirements {
   401  	// TODO(b/75837838): Also set a new PID namespace so that we limit
   402  	// access to other host processes.
   403  	return platform.Requirements{
   404  		RequiresCapSysPtrace: true,
   405  		RequiresCurrentPIDNS: true,
   406  	}
   407  }
   408  
        // init registers a constructor with the platform package under the name "systrap".
   409  func init() {
   410  	platform.Register("systrap", &constructor{})
   411  }
   412  
   413  func createMemoryFile() (*pgalloc.MemoryFile, error) {
   414  	const memfileName = "systrap-memory"
   415  	fd, err := memutil.CreateMemFD(memfileName, 0)
   416  	if err != nil {
   417  		return nil, fmt.Errorf("error creating memfd: %v", err)
   418  	}
   419  	memfile := os.NewFile(uintptr(fd), memfileName)
   420  	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
   421  	if err != nil {
   422  		memfile.Close()
   423  		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
   424  	}
   425  	return mf, nil
   426  }
   427  
   428  func corruptedSharedMemoryErr(additional string) *platform.ContextError {
   429  	return &platform.ContextError{
   430  		Err:   fmt.Errorf("systrap corrupted memory: %s", additional),
   431  		Errno: unix.EPERM,
   432  	}
   433  }