github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/systrap.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package systrap provides a seccomp-based implementation of the platform
    16  // interface.
    17  //
    18  // In a nutshell, it works as follows:
    19  //
    20  // The creation of a new address space creates a new child process.
    21  //
    22  // The creation of a new stub thread creates a new system thread with a
    23  // specified address space. To initialize this thread, the following actions
    24  // are performed:
    25  //   - install a signal stack which is shared with the Sentry.
    26  //   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
    27  //     This signal handler is a key part of the systrap platform. Any stub event
    28  //     which has to be handled in privileged mode (by the Sentry) triggers one of
    29  //     these signals. The signal handler runs on a separate stack which
    30  //     is shared with the Sentry. The sysmsg structure is used to synchronize the
    31  //     Sentry and a stub thread.
    32  //   - install seccomp filters to trap user system calls.
    33  //   - send a fake SIGSEGV to stop the thread in the signal handler.
    34  //
    35  // A context is just a collection of temporary variables. Calling Switch on a
    36  // context does the following:
    37  //
    38  //	Set up proper registers and an FPU state on a stub signal frame.
    39  //	Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE.
    40  //	Wait for new stub event by polling sysmsg->stage.
    41  //
    42  // Lock order:
    43  //
    44  //	subprocessPool.mu
    45  //		subprocess.mu
    46  //			context.mu
    47  //
    48  // +checkalignedignore
    49  package systrap
    50  
    51  import (
    52  	"fmt"
    53  	"os"
    54  	"sync"
    55  
    56  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    57  	pkgcontext "github.com/MerlinKodo/gvisor/pkg/context"
    58  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    59  	"github.com/MerlinKodo/gvisor/pkg/memutil"
    60  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    61  	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
    62  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
    63  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/interrupt"
    64  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/sysmsg"
    65  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/usertrap"
    66  )
    67  
var (
	// stubStart is the link address for our stub, and determines the
	// maximum user address. This is valid only after a call to stubInit.
	//
	// We attempt to link the stub here, and adjust downward as needed.
	stubStart uintptr = stubInitAddress

	// stubInitProcess is not assigned anywhere in this file.
	// NOTE(review): presumably the stub address of the process
	// initialization entry point, filled in by stubInit — confirm.
	stubInitProcess uintptr

	// Memory region to store thread specific stacks.
	stubSysmsgStack uintptr
	stubSysmsgStart uintptr
	stubSysmsgEnd   uintptr
	// Memory region to store the contextQueue.
	stubContextQueueRegion    uintptr
	stubContextQueueRegionLen uintptr
	// Memory region to store instances of sysmsg.ThreadContext.
	stubContextRegion    uintptr
	stubContextRegionLen uintptr
	// The memory blob with precompiled seccomp rules.
	stubSysmsgRules    uintptr
	stubSysmsgRulesLen uintptr

	// stubSpinningThreadQueueAddr and stubSpinningThreadQueueSize are not
	// assigned anywhere in this file.
	// NOTE(review): presumably the address and size of the stub region
	// holding the spinning thread queue — confirm against stubInit.
	stubSpinningThreadQueueAddr uintptr
	stubSpinningThreadQueueSize uintptr

	// stubROMapEnd is the end address of the read-only stub region that
	// contains the code and precompiled seccomp rules.
	stubROMapEnd uintptr

	// stubEnd is the first byte past the end of the stub, as with
	// stubStart this is valid only after a call to stubInit.
	stubEnd uintptr

	// stubInitialized controls one-time stub initialization. New runs
	// stubInit, creates the global pool's source subprocess and calls
	// initSysmsgThreadPriority under it.
	stubInitialized sync.Once

	// archState stores architecture-specific details used in the platform.
	// New calls archState.Init on every invocation.
	archState sysmsg.ArchState
)
   108  
// context is an implementation of the platform context.
//
// Instances are created by Systrap.NewContext with needRestoreFPState set.
type context struct {
	// signalInfo is the signal info, if and when a signal is received.
	// Switch reads it after switchToApp returns to decide how the stop is
	// reported to the caller.
	signalInfo linux.SignalInfo

	// interrupt is the interrupt context. Interrupt calls NotifyInterrupt
	// on it.
	interrupt interrupt.Forwarder

	// sharedContext is everything related to this context that is resident in
	// shared memory with the stub thread.
	// sharedContext is only accessed on the Task goroutine, therefore it is not
	// mutex protected.
	//
	// It may be nil: Release resets it to nil after releasing it, and
	// PrepareSleep is a no-op while it is nil.
	sharedContext *sharedContext

	// mu protects the following fields.
	//
	// NOTE(review): within this file only the lastFault* fields are accessed
	// with mu held (in Switch); needRestoreFPState and needToPullFullState
	// are read and written by PullFullState and FullStateChanged without
	// taking mu — confirm whether they are meant to be covered by it.
	mu sync.Mutex

	// If lastFaultSP is non-nil, the last context switch was due to a fault
	// received while executing lastFaultSP. Only context.Switch may set
	// lastFaultSP to a non-nil value.
	lastFaultSP *subprocess

	// lastFaultAddr is the last faulting address; this is only meaningful if
	// lastFaultSP is non-nil.
	lastFaultAddr hostarch.Addr

	// lastFaultIP is the address of the last faulting instruction;
	// this is also only meaningful if lastFaultSP is non-nil.
	lastFaultIP hostarch.Addr

	// needRestoreFPState indicates that the FPU state has been changed by
	// the Sentry and has to be updated on the stub thread. It starts out
	// true (see NewContext) and is set again by FullStateChanged.
	needRestoreFPState bool

	// needToPullFullState indicates that the Sentry doesn't have a full
	// state of the thread. It is cleared by a successful PullFullState and
	// by FullStateChanged.
	needToPullFullState bool
}
   147  
   148  // PullFullState implements platform.Context.PullFullState.
   149  func (c *context) PullFullState(as platform.AddressSpace, ac *arch.Context64) error {
   150  	if !c.needToPullFullState {
   151  		return nil
   152  	}
   153  	s := as.(*subprocess)
   154  	if err := s.PullFullState(c, ac); err != nil {
   155  		return err
   156  	}
   157  	c.needToPullFullState = false
   158  	return nil
   159  }
   160  
   161  // FullStateChanged implements platform.Context.FullStateChanged.
   162  func (c *context) FullStateChanged() {
   163  	c.needRestoreFPState = true
   164  	c.needToPullFullState = false
   165  }
   166  
// Switch runs the provided context in the given address space.
//
// The context is activated on the subprocess backing mm's address space and
// the application is then run through switchToApp. Possible results:
//   - (nil, hostarch.NoAccess, nil): execution stopped for a system call.
//   - (&si, hostarch.NoAccess, platform.ErrContextSignal): a signal that is
//     not recorded as a fault was received.
//   - (&si, at, platform.ErrContextSignal): a SIGSEGV fault was received; at
//     is the access type guessed by the heuristic at the end of the function.
//   - (nil, hostarch.NoAccess, err): activateContext or switchToApp failed.
//
// si is a copy of c.signalInfo. The cpu argument is not used by this
// implementation.
func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) {
	as := mm.AddressSpace()
	// The address space has to be a *subprocess; the unchecked assertion
	// panics for anything else.
	s := as.(*subprocess)
	if err := s.activateContext(c); err != nil {
		return nil, hostarch.NoAccess, err
	}

	// restart is jumped to whenever the application has to be resumed without
	// reporting anything to the caller.
restart:
	isSyscall, needPatch, err := s.switchToApp(c, ac)
	if err != nil {
		return nil, hostarch.NoAccess, err
	}
	if needPatch {
		// The error returned by PatchSyscall is discarded; only its restart
		// result is acted upon.
		restart, _ := s.usertrap.PatchSyscall(ctx, ac, mm)
		if restart {
			goto restart
		}
	}
	// A SIGILL that was not reported as a syscall is first offered to
	// usertrap: it may turn out to be a syscall, may require resuming the
	// application, or may fail, in which case the failure is only logged and
	// the signal is processed like any other below.
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL {
		err := s.usertrap.HandleFault(ctx, ac, mm)
		if err == usertrap.ErrFaultSyscall {
			isSyscall = true
		} else if err == usertrap.ErrFaultRestart {
			goto restart
		} else if err != nil {
			ctx.Warningf("usertrap.HandleFault failed: %v", err)
		}
	}
	// Only a non-syscall SIGSEGV is recorded as a fault; in every other case
	// the three variables keep their zero values.
	var (
		faultSP   *subprocess
		faultAddr hostarch.Addr
		faultIP   hostarch.Addr
	)
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
		faultSP = s
		faultAddr = hostarch.Addr(c.signalInfo.Addr())
		faultIP = hostarch.Addr(ac.IP())
	}

	// Update the context to reflect the outcome of this context switch.
	// The previous fault record is saved in locals first: it is needed both
	// to update the subprocesses and for the write-fault heuristic below.
	c.mu.Lock()
	lastFaultSP := c.lastFaultSP
	lastFaultAddr := c.lastFaultAddr
	lastFaultIP := c.lastFaultIP
	// At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't
	// be updated by s.Unmap(). This is fine; we only need to synchronize with
	// calls to s.Unmap() that occur after the handling of this fault.
	c.lastFaultSP = faultSP
	c.lastFaultAddr = faultAddr
	c.lastFaultIP = faultIP
	c.mu.Unlock()

	// Update subprocesses to reflect the outcome of this context switch.
	// c.mu has already been released here, so the subprocess locks are never
	// taken while c.mu is held.
	if lastFaultSP != faultSP {
		if lastFaultSP != nil {
			lastFaultSP.mu.Lock()
			delete(lastFaultSP.faultedContexts, c)
			lastFaultSP.mu.Unlock()
		}
		if faultSP != nil {
			faultSP.mu.Lock()
			faultSP.faultedContexts[c] = struct{}{}
			faultSP.mu.Unlock()
		}
	}

	if isSyscall {
		return nil, hostarch.NoAccess, nil
	}

	// Hand out a copy so the caller does not get a pointer into the context.
	si := c.signalInfo
	if faultSP == nil {
		// Non-fault signal.
		return &si, hostarch.NoAccess, platform.ErrContextSignal
	}

	// See if this can be handled as a CPUID exception.
	// faultSP is only non-nil for SIGSEGV, so the signal check below always
	// holds when this point is reached.
	if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) {
		goto restart
	}

	// Got a page fault. Ideally, we'd get real fault type here, but ptrace
	// doesn't expose this information. Instead, we use a simple heuristic:
	//
	// It was an instruction fault iff the faulting addr == instruction
	// pointer.
	//
	// It was a write fault if the fault is immediately repeated.
	//
	// NOTE(review): the mention of ptrace looks inherited from another
	// platform implementation — confirm it still applies here.
	at := hostarch.Read
	if faultAddr == faultIP {
		at.Execute = true
	}
	if lastFaultSP == faultSP &&
		lastFaultAddr == faultAddr &&
		lastFaultIP == faultIP {
		at.Write = true
	}

	// Handle as a signal.
	return &si, at, platform.ErrContextSignal
}
   269  
// Interrupt interrupts the running guest application associated with this context.
//
// It does so by calling NotifyInterrupt on the context's interrupt.Forwarder.
func (c *context) Interrupt() {
	c.interrupt.NotifyInterrupt()
}
   274  
   275  // Release releases all platform resources used by the context.
   276  func (c *context) Release() {
   277  	if c.sharedContext != nil {
   278  		c.sharedContext.release()
   279  		c.sharedContext = nil
   280  	}
   281  }
   282  
   283  // PrepareSleep implements platform.Context.platform.PrepareSleep.
   284  func (c *context) PrepareSleep() {
   285  	ctx := c.sharedContext
   286  	if ctx == nil {
   287  		return
   288  	}
   289  	if !ctx.sleeping {
   290  		ctx.sleeping = true
   291  		ctx.subprocess.decAwakeContexts()
   292  	}
   293  }
   294  
// Systrap represents a collection of seccomp subprocesses.
//
// It embeds platform.NoCPUPreemptionDetection,
// platform.UseHostGlobalMemoryBarrier and platform.DoesNotOwnPageTables.
// NOTE(review): presumably these supply default implementations of the
// corresponding platform.Platform methods — confirm in package platform.
type Systrap struct {
	platform.NoCPUPreemptionDetection
	platform.UseHostGlobalMemoryBarrier
	platform.DoesNotOwnPageTables

	// memoryFile is used to create a stub sysmsg stack
	// which is shared with the Sentry.
	//
	// It is created by New via createMemoryFile and handed to newSubprocess
	// by NewAddressSpace.
	memoryFile *pgalloc.MemoryFile
}
   305  
// MinUserAddress implements platform.MinUserAddress.
//
// It returns the value reported by platform.SystemMMapMinAddr.
func (*Systrap) MinUserAddress() hostarch.Addr {
	return platform.SystemMMapMinAddr()
}
   310  
   311  // New returns a new seccomp-based implementation of the platform interface.
   312  func New() (*Systrap, error) {
   313  	// CPUID information has been initialized at this point.
   314  	archState.Init()
   315  
   316  	mf, err := createMemoryFile()
   317  	if err != nil {
   318  		return nil, err
   319  	}
   320  
   321  	stubInitialized.Do(func() {
   322  		// Initialize the stub.
   323  		stubInit()
   324  
   325  		// Create the source process for the global pool. This must be
   326  		// done before initializing any other processes.
   327  		source, err := newSubprocess(createStub, mf)
   328  		if err != nil {
   329  			// Should never happen.
   330  			panic("unable to initialize systrap source: " + err.Error())
   331  		}
   332  		// The source subprocess is never released explicitly by a MM.
   333  		source.DecRef(nil)
   334  
   335  		globalPool.source = source
   336  
   337  		initSysmsgThreadPriority()
   338  	})
   339  
   340  	return &Systrap{memoryFile: mf}, nil
   341  }
   342  
// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
//
// It always reports false.
func (*Systrap) SupportsAddressSpaceIO() bool {
	return false
}
   347  
// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
//
// It always reports false.
func (*Systrap) CooperativelySchedulesAddressSpace() bool {
	return false
}
   352  
// MapUnit implements platform.Platform.MapUnit.
//
// It always returns 0.
func (*Systrap) MapUnit() uint64 {
	// The host kernel manages page tables and arbitrary-sized mappings
	// have effectively the same cost.
	return 0
}
   359  
// MaxUserAddress returns the first address that may not be used by user
// applications.
//
// The value is maxStubUserAddress converted to a hostarch.Addr.
func (*Systrap) MaxUserAddress() hostarch.Addr {
	return hostarch.Addr(maxStubUserAddress)
}
   365  
   366  // NewAddressSpace returns a new subprocess.
   367  func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) {
   368  	as, err := newSubprocess(globalPool.source.createStub, p.memoryFile)
   369  	return as, nil, err
   370  }
   371  
   372  // NewContext returns an interruptible context.
   373  func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context {
   374  	return &context{
   375  		needRestoreFPState:  true,
   376  		needToPullFullState: false,
   377  	}
   378  }
   379  
// constructor is the platform constructor that init registers under the name
// "systrap".
type constructor struct{}
   381  
   382  func (*constructor) New(_ *os.File) (platform.Platform, error) {
   383  	return New()
   384  }
   385  
// OpenDevice ignores its argument and always returns a nil file together with
// a nil error.
func (*constructor) OpenDevice(_ string) (*os.File, error) {
	return nil, nil
}
   389  
   390  // Requirements implements platform.Constructor.Requirements().
   391  func (*constructor) Requirements() platform.Requirements {
   392  	// TODO(b/75837838): Also set a new PID namespace so that we limit
   393  	// access to other host processes.
   394  	return platform.Requirements{
   395  		RequiresCapSysPtrace: true,
   396  		RequiresCurrentPIDNS: true,
   397  	}
   398  }
   399  
// init registers the systrap constructor with the platform package under the
// name "systrap".
func init() {
	platform.Register("systrap", &constructor{})
}
   403  
   404  func createMemoryFile() (*pgalloc.MemoryFile, error) {
   405  	const memfileName = "systrap-memory"
   406  	fd, err := memutil.CreateMemFD(memfileName, 0)
   407  	if err != nil {
   408  		return nil, fmt.Errorf("error creating memfd: %v", err)
   409  	}
   410  	memfile := os.NewFile(uintptr(fd), memfileName)
   411  	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
   412  	if err != nil {
   413  		memfile.Close()
   414  		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
   415  	}
   416  	return mf, nil
   417  }