github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/platform.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package platform provides a Platform abstraction.
    16  //
    17  // See Platform for more information.
    18  package platform
    19  
    20  import (
    21  	"fmt"
    22  	"os"
    23  
    24  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    25  	"github.com/MerlinKodo/gvisor/pkg/context"
    26  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    27  	"github.com/MerlinKodo/gvisor/pkg/seccomp"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/hostmm"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
    31  	"github.com/MerlinKodo/gvisor/pkg/usermem"
    32  )
    33  
    34  // Platform provides abstractions for execution contexts (Context,
    35  // AddressSpace).
    36  type Platform interface {
    37  	// SupportsAddressSpaceIO returns true if AddressSpaces returned by this
    38  	// Platform support AddressSpaceIO methods.
    39  	//
    40  	// The value returned by SupportsAddressSpaceIO is guaranteed to remain
    41  	// unchanged over the lifetime of the Platform.
    42  	SupportsAddressSpaceIO() bool
    43  
    44  	// CooperativelySchedulesAddressSpace returns true if the Platform has a
    45  	// limited number of AddressSpaces, such that mm.MemoryManager.Deactivate
    46  	// should call AddressSpace.Release when there are no goroutines that
    47  	// require the mm.MemoryManager to have an active AddressSpace.
    48  	//
    49  	// The value returned by CooperativelySchedulesAddressSpace is guaranteed
    50  	// to remain unchanged over the lifetime of the Platform.
    51  	CooperativelySchedulesAddressSpace() bool
    52  
    53  	// DetectsCPUPreemption returns true if Contexts returned by the Platform
    54  	// can reliably return ErrContextCPUPreempted.
    55  	DetectsCPUPreemption() bool
    56  
    57  	// HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method
    58  	// is supported.
    59  	HaveGlobalMemoryBarrier() bool
    60  
    61  	// OwnsPageTables returns true if the Platform implementation manages any
    62  	// page tables directly (rather than via host mmap(2) etc.). As of this
    63  	// writing, this property is relevant because the AddressSpace interface
    64  	// does not support specification of memory type (cacheability), such that
    65  	// host FDs specifying memory types (e.g. device drivers) can only set them
    66  	// correctly in host-managed page tables.
    67  	OwnsPageTables() bool
    68  
    69  	// MapUnit returns the alignment used for optional mappings into this
    70  	// platform's AddressSpaces. Higher values indicate lower per-page costs
    71  	// for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
    72  	// that the cost of AddressSpace.MapFile is effectively independent of the
    73  	// number of pages mapped. If MapUnit is non-zero, it must be a power-of-2
    74  	// multiple of hostarch.PageSize.
    75  	MapUnit() uint64
    76  
    77  	// MinUserAddress returns the minimum mappable address on this
    78  	// platform.
    79  	MinUserAddress() hostarch.Addr
    80  
    81  	// MaxUserAddress returns the maximum mappable address on this
    82  	// platform.
    83  	MaxUserAddress() hostarch.Addr
    84  
    85  	// NewAddressSpace returns a new memory context for this platform.
    86  	//
    87  	// If mappingsID is not nil, the platform may assume that (1) all calls
    88  	// to NewAddressSpace with the same mappingsID represent the same
    89  	// (mutable) set of mappings, and (2) the set of mappings has not
    90  	// changed since the last time AddressSpace.Release was called on an
    91  	// AddressSpace returned by a call to NewAddressSpace with the same
    92  	// mappingsID.
    93  	//
    94  	// If a new AddressSpace cannot be created immediately, a nil
    95  	// AddressSpace is returned, along with a channel that is closed when
    96  	// the caller should retry a call to NewAddressSpace.
    97  	//
    98  	// In general, this blocking behavior only occurs when
    99  	// CooperativelySchedulesAddressSpace (above) returns false.
   100  	NewAddressSpace(mappingsID any) (AddressSpace, <-chan struct{}, error)
   101  
   102  	// NewContext returns a new execution context.
   103  	NewContext(context.Context) Context
   104  
   105  	// PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well
   106  	// as the first following call to Context.Switch() for each Context, to
   107  	// return ErrContextCPUPreempted.
   108  	//
   109  	// PreemptAllCPUs is only supported if DetectsCPUPreemption() == true.
   110  	// Platforms for which this does not hold may panic if PreemptAllCPUs is
   111  	// called.
   112  	PreemptAllCPUs() error
   113  
   114  	// GlobalMemoryBarrier blocks until all threads running application code
   115  	// (via Context.Switch) and all task goroutines "have passed through a
   116  	// state where all memory accesses to user-space addresses match program
   117  	// order between entry to and return from [GlobalMemoryBarrier]", as for
   118  	// membarrier(2).
   119  	//
   120  	// Preconditions: HaveGlobalMemoryBarrier() == true.
   121  	GlobalMemoryBarrier() error
   122  
   123  	// SyscallFilters returns syscalls made exclusively by this platform.
   124  	SyscallFilters() seccomp.SyscallRules
   125  }
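
// acquireAddressSpaceSketch is an illustrative sketch, not part of the
// original file: it shows how a caller might drive the NewAddressSpace retry
// protocol described above. When no AddressSpace is immediately available,
// the platform returns a nil AddressSpace and a channel that is closed once a
// retry may succeed. The function name is hypothetical.
func acquireAddressSpaceSketch(p Platform, mappingsID any) (AddressSpace, error) {
	for {
		as, retry, err := p.NewAddressSpace(mappingsID)
		if err != nil {
			return nil, err
		}
		if as != nil {
			return as, nil
		}
		// No AddressSpace could be created right now; block until the
		// platform signals that another attempt is worthwhile.
		<-retry
	}
}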
   126  
   127  // NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and
   128  // dependent methods for Platforms that do not support this feature.
   129  type NoCPUPreemptionDetection struct{}
   130  
   131  // DetectsCPUPreemption implements Platform.DetectsCPUPreemption.
   132  func (NoCPUPreemptionDetection) DetectsCPUPreemption() bool {
   133  	return false
   134  }
   135  
   136  // PreemptAllCPUs implements Platform.PreemptAllCPUs.
   137  func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
   138  	panic("This platform does not support CPU preemption detection")
   139  }
   140  
   141  // UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
   142  // Platform.GlobalMemoryBarrier by invoking equivalent functionality on the
   143  // host.
   144  type UseHostGlobalMemoryBarrier struct{}
   145  
   146  // HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
   147  func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool {
   148  	return hostmm.HaveGlobalMemoryBarrier()
   149  }
   150  
   151  // GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
   152  func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error {
   153  	return hostmm.GlobalMemoryBarrier()
   154  }
   155  
   156  // UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
   157  // Platform.GlobalMemoryBarrier by invoking a process-local memory barrier.
   158  // This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for
   159  // platforms for which application code executes while using the sentry's
   160  // mm_struct.
   161  type UseHostProcessMemoryBarrier struct{}
   162  
   163  // HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
   164  func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool {
   165  	// Fall back to a global memory barrier if a process-local one isn't
   166  	// available.
   167  	return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier()
   168  }
   169  
   170  // GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
   171  func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error {
   172  	if hostmm.HaveProcessMemoryBarrier() {
   173  		return hostmm.ProcessMemoryBarrier()
   174  	}
   175  	return hostmm.GlobalMemoryBarrier()
   176  }
   177  
   178  // DoesOwnPageTables implements Platform.OwnsPageTables in the positive.
   179  type DoesOwnPageTables struct{}
   180  
   181  // OwnsPageTables implements Platform.OwnsPageTables.
   182  func (DoesOwnPageTables) OwnsPageTables() bool {
   183  	return true
   184  }
   185  
   186  // DoesNotOwnPageTables implements Platform.OwnsPageTables in the negative.
   187  type DoesNotOwnPageTables struct{}
   188  
   189  // OwnsPageTables implements Platform.OwnsPageTables.
   190  func (DoesNotOwnPageTables) OwnsPageTables() bool {
   191  	return false
   192  }
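
// examplePlatformMixins is an illustrative sketch, not part of the original
// file: Platform implementations are expected to embed the helper types above
// that match their capabilities, so that only the remaining Platform methods
// need to be written by hand. The type name is hypothetical and the embedded
// selection is just one plausible combination.
type examplePlatformMixins struct {
	NoCPUPreemptionDetection   // DetectsCPUPreemption() reports false; PreemptAllCPUs() panics.
	UseHostGlobalMemoryBarrier // GlobalMemoryBarrier() delegates to the host's membarrier support.
	DoesNotOwnPageTables       // OwnsPageTables() reports false (page tables come from host mmap).
	// The remaining Platform methods (SupportsAddressSpaceIO, MapUnit,
	// NewAddressSpace, NewContext, ...) would be implemented directly.
}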
   193  
   194  // MemoryManager represents an abstraction above the platform address space
   195  // which manages memory mappings and their contents.
   196  type MemoryManager interface {
   197  	// usermem.IO provides access to the contents of a virtual memory space.
   198  	usermem.IO
   199  	// MMap establishes a memory mapping.
   200  	MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error)
   201  	// AddressSpace returns the AddressSpace bound to mm.
   202  	AddressSpace() AddressSpace
   203  	// FindVMAByName finds a vma with the specified name.
   204  	FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error)
   205  }
   206  
   207  // Context represents the execution context for a single thread.
   208  type Context interface {
   209  	// Switch resumes execution of the thread specified by the arch.Context64
   210  	// in the provided address space. This call will block while the thread
   211  	// is executing.
   212  	//
   213  	// If cpu is non-negative, and it is not the number of the CPU that the
   214  	// thread executes on, Context should return ErrContextCPUPreempted. cpu
   215  	// can only be non-negative if Platform.DetectsCPUPreemption() is true;
   216  	// Contexts from Platforms for which this does not hold may ignore cpu, or
   217  	// panic if cpu is non-negative.
   218  	//
   219  	// Switch may return one of the following special errors:
   220  	//
   221  	//	- nil: The Context invoked a system call.
   222  	//
   223  	//	- ErrContextSignal: The Context was interrupted by a signal. The
   224  	//		returned *linux.SignalInfo contains information about the signal. If
   225  	//		linux.SignalInfo.Signo == SIGSEGV, the returned hostarch.AccessType
   226  	//		contains the access type of the triggering fault. The caller owns
   227  	//		the returned SignalInfo.
   228  	//
   229  	//	- ErrContextInterrupt: The Context was interrupted by a call to
   230  	//		Interrupt(). Switch() may return ErrContextInterrupt spuriously. In
   231  	//		particular, most implementations of Interrupt() will cause the first
   232  	//		following call to Switch() to return ErrContextInterrupt if there is no
   233  	//		concurrent call to Switch().
   234  	//
   235  	//	- ErrContextCPUPreempted: See the definition of that error for details.
   236  	Switch(ctx context.Context, mm MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error)
   237  
   238  	// PullFullState() pulls the full state of the application thread.
   239  	//
   240  	// A platform can support lazy loading/restoring of a thread's state,
   241  	// which includes its registers and floating point state.
   242  	//
   243  	// For example, when the Sentry handles a system call, it may have only
   244  	// the syscall arguments and not the other registers or the floating
   245  	// point state. In that case, if the Sentry needs to construct a signal
   246  	// frame to call a signal handler, it must first call PullFullState() to
   247  	// load all registers and the FPU state.
   248  	//
   249  	// Preconditions: The caller must be running on the task goroutine.
   250  	PullFullState(as AddressSpace, ac *arch.Context64) error
   251  
   252  	// FullStateChanged() indicates that the thread state has been changed by
   253  	// the Sentry. This happens, for example, on rt_sigreturn, execve, etc.
   254  	//
   255  	// First, it indicates that the Sentry has the full state of the thread
   256  	// and PullFullState() has to do nothing if it is called after
   257  	// FullStateChanged().
   258  	//
   259  	// Second, it forces restoring the full state of the application
   260  	// thread. A platform can support lazy loading/restoring of a thread
   261  	// state. This means that if the Sentry has not changed a thread state,
   262  	// the platform may not restore it.
   263  	//
   264  	// Preconditions: The caller must be running on the task goroutine.
   265  	FullStateChanged()
   266  
   267  	// Interrupt interrupts a concurrent call to Switch(), causing it to return
   268  	// ErrContextInterrupt.
   269  	Interrupt()
   270  
   271  	// Release() releases any resources associated with this context.
   272  	Release()
   273  
   274  	// PrepareSleep() is called when the thread switches to the
   275  	// interruptible sleep state.
   276  	PrepareSleep()
   277  }
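
// switchLoopSketch is an illustrative sketch, not part of the original file:
// it shows how a caller might dispatch on the special errors that Switch is
// documented to return above. The surrounding task logic is elided and the
// function name is hypothetical.
func switchLoopSketch(ctx context.Context, c Context, mm MemoryManager, ac *arch.Context64) error {
	for {
		// cpu == -1: this sketch does not use CPU preemption detection.
		si, at, err := c.Switch(ctx, mm, ac, -1)
		switch err {
		case nil:
			// The application made a system call; the caller would dispatch
			// it here before resuming via Switch.
		case ErrContextSignal:
			// si describes the signal; for SIGSEGV, at is the faulting
			// access type. The caller would handle the fault or deliver the
			// signal.
			_ = si
			_ = at
		case ErrContextInterrupt:
			// Interrupt() was called, possibly spuriously; re-check task
			// state before resuming.
		case ErrContextCPUPreempted:
			// Only possible when Platform.DetectsCPUPreemption() is true.
		default:
			return err
		}
	}
}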
   278  
   279  var (
   280  	// ErrContextSignal is returned by Context.Switch() to indicate that the
   281  	// Context was interrupted by a signal.
   282  	ErrContextSignal = fmt.Errorf("interrupted by signal")
   283  
   284  	// ErrContextInterrupt is returned by Context.Switch() to indicate that the
   285  	// Context was interrupted by a call to Context.Interrupt().
   286  	ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()")
   287  
   288  	// ErrContextCPUPreempted is returned by Context.Switch() to indicate that
   289  	// one of the following occurred:
   290  	//
   291  	//	- The CPU executing the Context is not the CPU passed to
   292  	//		Context.Switch().
   293  	//
   294  	//	- The CPU executing the Context may have executed another Context since
   295  	//		the last time it executed this one; or the CPU has previously executed
   296  	//		another Context, and has never executed this one.
   297  	//
   298  	//	- Platform.PreemptAllCPUs() was called since the last return from
   299  	//		Context.Switch().
   300  	ErrContextCPUPreempted = fmt.Errorf("interrupted by CPU preemption")
   301  )
   302  
   303  // SignalInterrupt is a signal reserved for use by implementations of
   304  // Context.Interrupt(). The sentry guarantees that it will ignore delivery of
   305  // this signal both to Contexts and to the sentry itself, under the assumption
   306  // that they originate from races with Context.Interrupt().
   307  //
   308  // NOTE(b/23420492): The Go runtime only guarantees that a small subset
   309  // of signals will always be unblocked on all threads, one of which
   310  // is SIGCHLD.
   311  const SignalInterrupt = linux.SIGCHLD
   312  
   313  // AddressSpace represents a virtual address space in which a Context can
   314  // execute.
   315  type AddressSpace interface {
   316  	// MapFile creates a shared mapping of offsets fr from f at address addr.
   317  	// Any existing overlapping mappings are silently replaced.
   318  	//
   319  	// If precommit is true, the platform should eagerly commit resources (e.g.
   320  	// physical memory) to the mapping. The precommit flag is advisory and
   321  	// implementations may choose to ignore it.
   322  	//
   323  	// Preconditions:
   324  	//	* addr and fr must be page-aligned.
   325  	//	* fr.Length() > 0.
   326  	//	* at.Any() == true.
   327  	//	* At least one reference must be held on all pages in fr, and must
   328  	//		continue to be held as long as pages are mapped.
   329  	MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error
   330  
   331  	// Unmap unmaps the given range.
   332  	//
   333  	// Preconditions:
   334  	//	* addr is page-aligned.
   335  	//	* length > 0.
   336  	Unmap(addr hostarch.Addr, length uint64)
   337  
   338  	// Release releases this address space. After releasing, a new AddressSpace
   339  	// must be acquired via platform.NewAddressSpace().
   340  	Release()
   341  
   342  	// PreFork() is called before creating a copy of AddressSpace. It ensures
   343  	// that this address space is in a consistent state for the copy.
   344  	PreFork()
   345  
   346  	// PostFork() is called after creating a copy of AddressSpace.
   347  	PostFork()
   348  
   349  	// AddressSpaceIO methods are supported iff the associated platform's
   350  	// Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this
   351  	// does not hold may panic if AddressSpaceIO methods are invoked.
   352  	AddressSpaceIO
   353  }
   354  
   355  // AddressSpaceIO supports IO through the memory mappings installed in an
   356  // AddressSpace.
   357  //
   358  // AddressSpaceIO implementors are responsible for ensuring that address ranges
   359  // are application-mappable.
   360  type AddressSpaceIO interface {
   361  	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
   362  	// returns the number of bytes copied. If the number of bytes copied is <
   363  	// len(src), it returns a non-nil error explaining why.
   364  	CopyOut(addr hostarch.Addr, src []byte) (int, error)
   365  
   366  	// CopyIn copies len(dst) bytes from the memory mapped at addr to dst.
   367  	// It returns the number of bytes copied. If the number of bytes copied is
   368  	// < len(dst), it returns a non-nil error explaining why.
   369  	CopyIn(addr hostarch.Addr, dst []byte) (int, error)
   370  
   371  	// ZeroOut sets toZero bytes to 0, starting at addr. It returns the number
   372  	// of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
   373  	// non-nil error explaining why.
   374  	ZeroOut(addr hostarch.Addr, toZero uintptr) (uintptr, error)
   375  
   376  	// SwapUint32 atomically sets the uint32 value at addr to new and returns
   377  	// the previous value.
   378  	//
   379  	// Preconditions: addr must be aligned to a 4-byte boundary.
   380  	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)
   381  
   382  	// CompareAndSwapUint32 atomically compares the uint32 value at addr to
   383  	// old; if they are equal, the value in memory is replaced by new. In
   384  	// either case, the previous value stored in memory is returned.
   385  	//
   386  	// Preconditions: addr must be aligned to a 4-byte boundary.
   387  	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)
   388  
   389  	// LoadUint32 atomically loads the uint32 value at addr and returns it.
   390  	//
   391  	// Preconditions: addr must be aligned to a 4-byte boundary.
   392  	LoadUint32(addr hostarch.Addr) (uint32, error)
   393  }
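
// addUint32Sketch is an illustrative sketch, not part of the original file:
// it shows how the atomic AddressSpaceIO methods compose, atomically adding
// delta to the 4-byte-aligned uint32 at addr via a load/compare-and-swap
// retry loop and returning the resulting value. The function name is
// hypothetical.
func addUint32Sketch(asio AddressSpaceIO, addr hostarch.Addr, delta uint32) (uint32, error) {
	for {
		old, err := asio.LoadUint32(addr)
		if err != nil {
			return 0, err
		}
		prev, err := asio.CompareAndSwapUint32(addr, old, old+delta)
		if err != nil {
			return 0, err
		}
		if prev == old {
			// The swap took effect; the value stored at addr is now old+delta.
			return old + delta, nil
		}
		// Another writer raced with this update; retry against the new value.
	}
}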
   394  
   395  // NoAddressSpaceIO implements AddressSpaceIO methods by panicking.
   396  type NoAddressSpaceIO struct{}
   397  
   398  // CopyOut implements AddressSpaceIO.CopyOut.
   399  func (NoAddressSpaceIO) CopyOut(addr hostarch.Addr, src []byte) (int, error) {
   400  	panic("This platform does not support AddressSpaceIO")
   401  }
   402  
   403  // CopyIn implements AddressSpaceIO.CopyIn.
   404  func (NoAddressSpaceIO) CopyIn(addr hostarch.Addr, dst []byte) (int, error) {
   405  	panic("This platform does not support AddressSpaceIO")
   406  }
   407  
   408  // ZeroOut implements AddressSpaceIO.ZeroOut.
   409  func (NoAddressSpaceIO) ZeroOut(addr hostarch.Addr, toZero uintptr) (uintptr, error) {
   410  	panic("This platform does not support AddressSpaceIO")
   411  }
   412  
   413  // SwapUint32 implements AddressSpaceIO.SwapUint32.
   414  func (NoAddressSpaceIO) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) {
   415  	panic("This platform does not support AddressSpaceIO")
   416  }
   417  
   418  // CompareAndSwapUint32 implements AddressSpaceIO.CompareAndSwapUint32.
   419  func (NoAddressSpaceIO) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) {
   420  	panic("This platform does not support AddressSpaceIO")
   421  }
   422  
   423  // LoadUint32 implements AddressSpaceIO.LoadUint32.
   424  func (NoAddressSpaceIO) LoadUint32(addr hostarch.Addr) (uint32, error) {
   425  	panic("This platform does not support AddressSpaceIO")
   426  }
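
// exampleNoIOAddressSpace is an illustrative sketch, not part of the original
// file: an AddressSpace for a platform whose SupportsAddressSpaceIO() reports
// false embeds NoAddressSpaceIO so that the AddressSpaceIO methods panic if
// they are ever reached. The type name is hypothetical.
type exampleNoIOAddressSpace struct {
	NoAddressSpaceIO
	// MapFile, Unmap, Release, PreFork, and PostFork would be implemented
	// directly by the platform's address space type.
}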
   427  
   428  // SegmentationFault is an error returned by AddressSpaceIO methods when IO
   429  // fails due to access of an unmapped page, or a mapped page with insufficient
   430  // permissions.
   431  type SegmentationFault struct {
   432  	// Addr is the address at which the fault occurred.
   433  	Addr hostarch.Addr
   434  }
   435  
   436  // Error implements error.Error.
   437  func (f SegmentationFault) Error() string {
   438  	return fmt.Sprintf("segmentation fault at %#x", f.Addr)
   439  }
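
// faultAddrSketch is an illustrative sketch, not part of the original file:
// it shows how a caller of AddressSpaceIO might recover the faulting address
// from an error. SegmentationFault is returned by value, so a plain type
// assertion suffices. The function name is hypothetical.
func faultAddrSketch(err error) (hostarch.Addr, bool) {
	if sf, ok := err.(SegmentationFault); ok {
		return sf.Addr, true
	}
	return 0, false
}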
   440  
   441  // Requirements is used to specify platform specific requirements.
   442  // Requirements is used to specify platform-specific requirements.
   443  	// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
   444  	// current pid namespace.
   445  	RequiresCurrentPIDNS bool
   446  	// RequiresCapSysPtrace indicates that the sandbox has to be started with
   447  	// the CAP_SYS_PTRACE capability.
   448  	RequiresCapSysPtrace bool
   449  }
   450  
   451  // Constructor represents a platform type.
   452  type Constructor interface {
   453  	// New returns a new platform instance.
   454  	//
   455  	// Arguments:
   456  	//
   457  	//	* deviceFile - the device file (e.g. /dev/kvm for the KVM platform).
   458  	New(deviceFile *os.File) (Platform, error)
   459  
   460  	// OpenDevice opens the path to the device used by the platform.
   461  	// Passing in an empty string will use the default path for the device,
   462  	// e.g. "/dev/kvm" for the KVM platform.
   463  	OpenDevice(devicePath string) (*os.File, error)
   464  
   465  	// Requirements returns platform specific requirements.
   466  	Requirements() Requirements
   467  }
   468  
   469  // platforms contains all available platform types.
   470  var platforms = map[string]Constructor{}
   471  
   472  // Register registers a new platform type.
   473  func Register(name string, platform Constructor) {
   474  	platforms[name] = platform
   475  }
   476  
   477  // List lists available platforms.
   478  func List() (available []string) {
   479  	for name := range platforms {
   480  		available = append(available, name)
   481  	}
   482  	return
   483  }
   484  
   485  // Lookup looks up the platform constructor by name.
   486  func Lookup(name string) (Constructor, error) {
   487  	p, ok := platforms[name]
   488  	if !ok {
   489  		return nil, fmt.Errorf("unknown platform: %v", name)
   490  	}
   491  	return p, nil
   492  }
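
// usePlatformSketch is an illustrative sketch, not part of the original file:
// it shows the intended flow for consumers of this registry, with the name
// typically coming from runtime configuration. An empty device path selects
// the Constructor's default device (e.g. /dev/kvm for the KVM platform). The
// function name is hypothetical.
func usePlatformSketch(name string) (Platform, error) {
	ctor, err := Lookup(name)
	if err != nil {
		return nil, err
	}
	deviceFile, err := ctor.OpenDevice("")
	if err != nil {
		return nil, err
	}
	return ctor.New(deviceFile)
}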