github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/platform.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package platform provides a Platform abstraction.
//
// See Platform for more information.
package platform

import (
	"fmt"
	"os"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/seccomp"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/hostmm"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/usermem"
)

// Platform provides abstractions for execution contexts (Context,
// AddressSpace).
type Platform interface {
	// SupportsAddressSpaceIO returns true if AddressSpaces returned by this
	// Platform support AddressSpaceIO methods.
	//
	// The value returned by SupportsAddressSpaceIO is guaranteed to remain
	// unchanged over the lifetime of the Platform.
	SupportsAddressSpaceIO() bool

	// CooperativelySchedulesAddressSpace returns true if the Platform has a
	// limited number of AddressSpaces, such that mm.MemoryManager.Deactivate
	// should call AddressSpace.Release when there are no goroutines that
	// require the mm.MemoryManager to have an active AddressSpace.
	//
	// The value returned by CooperativelySchedulesAddressSpace is guaranteed
	// to remain unchanged over the lifetime of the Platform.
	CooperativelySchedulesAddressSpace() bool

	// DetectsCPUPreemption returns true if Contexts returned by the Platform
	// can reliably return ErrContextCPUPreempted.
	DetectsCPUPreemption() bool

	// HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method
	// is supported.
	HaveGlobalMemoryBarrier() bool

	// MapUnit returns the alignment used for optional mappings into this
	// platform's AddressSpaces. Higher values indicate lower per-page costs
	// for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
	// that the cost of AddressSpace.MapFile is effectively independent of the
	// number of pages mapped. If MapUnit is non-zero, it must be a power-of-2
	// multiple of hostarch.PageSize.
	MapUnit() uint64

	// MinUserAddress returns the minimum mappable address on this
	// platform.
	MinUserAddress() hostarch.Addr

	// MaxUserAddress returns the maximum mappable address on this
	// platform.
	MaxUserAddress() hostarch.Addr

	// NewAddressSpace returns a new memory context for this platform.
	//
	// If mappingsID is not nil, the platform may assume that (1) all calls
	// to NewAddressSpace with the same mappingsID represent the same
	// (mutable) set of mappings, and (2) the set of mappings has not
	// changed since the last time AddressSpace.Release was called on an
	// AddressSpace returned by a call to NewAddressSpace with the same
	// mappingsID.
	//
	// If a new AddressSpace cannot be created immediately, a nil
	// AddressSpace is returned, along with a channel that is closed when
	// the caller should retry a call to NewAddressSpace.
	//
	// In general, this blocking behavior only occurs when
	// CooperativelySchedulesAddressSpace (above) returns false.
	NewAddressSpace(mappingsID interface{}) (AddressSpace, <-chan struct{}, error)

	// NewContext returns a new execution context.
	NewContext() Context

	// PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well
	// as the first following call to Context.Switch() for each Context, to
	// return ErrContextCPUPreempted.
	//
	// PreemptAllCPUs is only supported if DetectsCPUPreemption() == true.
	// Platforms for which this does not hold may panic if PreemptAllCPUs is
	// called.
	PreemptAllCPUs() error

	// GlobalMemoryBarrier blocks until all threads running application code
	// (via Context.Switch) and all task goroutines "have passed through a
	// state where all memory accesses to user-space addresses match program
	// order between entry to and return from [GlobalMemoryBarrier]", as for
	// membarrier(2).
	//
	// Preconditions: HaveGlobalMemoryBarrier() == true.
	GlobalMemoryBarrier() error

	// SyscallFilters returns syscalls made exclusively by this platform.
	SyscallFilters() seccomp.SyscallRules
}
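
// acquireAddressSpace is a minimal sketch (a hypothetical helper, not part of
// gVisor's API) of how a caller might honor the retry channel returned by
// Platform.NewAddressSpace when no AddressSpace is immediately available. It
// assumes only the semantics documented above: a nil AddressSpace plus a
// channel means "wait until the channel is closed, then try again".
func acquireAddressSpace(p Platform, mappingsID interface{}) (AddressSpace, error) {
	for {
		as, c, err := p.NewAddressSpace(mappingsID)
		if err != nil {
			return nil, err
		}
		if as != nil {
			return as, nil
		}
		// No AddressSpace could be created immediately; wait for the platform
		// to signal that a retry may succeed.
		<-c
	}
}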

// NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and
// dependent methods for Platforms that do not support this feature.
type NoCPUPreemptionDetection struct{}

// DetectsCPUPreemption implements Platform.DetectsCPUPreemption.
func (NoCPUPreemptionDetection) DetectsCPUPreemption() bool {
	return false
}

// PreemptAllCPUs implements Platform.PreemptAllCPUs.
func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
	panic("This platform does not support CPU preemption detection")
}

// UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
// Platform.GlobalMemoryBarrier by invoking equivalent functionality on the
// host.
type UseHostGlobalMemoryBarrier struct{}

// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool {
	return hostmm.HaveGlobalMemoryBarrier()
}

// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error {
	return hostmm.GlobalMemoryBarrier()
}

// UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
// Platform.GlobalMemoryBarrier by invoking a process-local memory barrier.
// This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for
// platforms for which application code executes while using the sentry's
// mm_struct.
type UseHostProcessMemoryBarrier struct{}

// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool {
	// Fall back to a global memory barrier if a process-local one isn't
	// available.
	return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier()
}

// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error {
	if hostmm.HaveProcessMemoryBarrier() {
		return hostmm.ProcessMemoryBarrier()
	}
	return hostmm.GlobalMemoryBarrier()
}
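
// examplePlatform is a minimal sketch (hypothetical; not a real platform) of
// how the helper types above are intended to be used: a Platform
// implementation embeds them to satisfy the methods for features it does not
// provide itself, and implements the remaining Platform methods directly
// (omitted here).
type examplePlatform struct {
	NoCPUPreemptionDetection
	UseHostGlobalMemoryBarrier
}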

// MemoryManager represents an abstraction above the platform address space
// which manages memory mappings and their contents.
type MemoryManager interface {
	// usermem.IO provides access to the contents of a virtual memory space.
	usermem.IO
	// MMap establishes a memory mapping.
	MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error)
	// AddressSpace returns the AddressSpace bound to mm.
	AddressSpace() AddressSpace
}
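
// mapAnonymous is a minimal sketch (a hypothetical helper, not part of the
// package) of establishing a private anonymous mapping through a
// MemoryManager. The memmap.MMapOpts fields used here (Length, Private,
// Perms, MaxPerms) are assumed to match this version's definition of
// MMapOpts.
func mapAnonymous(ctx context.Context, mm MemoryManager, length uint64) (hostarch.Addr, error) {
	return mm.MMap(ctx, memmap.MMapOpts{
		Length:   length,
		Private:  true,
		Perms:    hostarch.ReadWrite,
		MaxPerms: hostarch.AnyAccess,
	})
}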

// Context represents the execution context for a single thread.
type Context interface {
	// Switch resumes execution of the thread specified by the arch.Context
	// in the provided address space. This call will block while the thread
	// is executing.
	//
	// If cpu is non-negative, and it is not the number of the CPU that the
	// thread executes on, Context should return ErrContextCPUPreempted. cpu
	// can only be non-negative if Platform.DetectsCPUPreemption() is true;
	// Contexts from Platforms for which this does not hold may ignore cpu, or
	// panic if cpu is non-negative.
	//
	// Switch may return one of the following special errors:
	//
	// - nil: The Context invoked a system call.
	//
	// - ErrContextSignal: The Context was interrupted by a signal. The
	// returned *linux.SignalInfo contains information about the signal. If
	// linux.SignalInfo.Signo == SIGSEGV, the returned hostarch.AccessType
	// contains the access type of the triggering fault. The caller owns
	// the returned SignalInfo.
	//
	// - ErrContextInterrupt: The Context was interrupted by a call to
	// Interrupt(). Switch() may return ErrContextInterrupt spuriously. In
	// particular, most implementations of Interrupt() will cause the first
	// following call to Switch() to return ErrContextInterrupt if there is no
	// concurrent call to Switch().
	//
	// - ErrContextCPUPreempted: See the definition of that error for details.
	Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error)

	// PullFullState() pulls the full state of the application thread.
	//
	// A platform can support lazy loading/restoring of a thread state,
	// which includes registers and the floating point state.
	//
	// For example, when the Sentry handles a system call, it may have only
	// the syscall arguments, without the other registers or the floating
	// point state. In that case, if the Sentry needs to construct a signal
	// frame to call a signal handler, it must call PullFullState() to load
	// all registers and the FPU state.
	//
	// Preconditions: The caller must be running on the task goroutine.
	PullFullState(as AddressSpace, ac arch.Context)

	// FullStateChanged() indicates that the thread state has been changed by
	// the Sentry. This happens, for example, on rt_sigreturn and execve.
	//
	// First, it indicates that the Sentry has the full state of the thread,
	// so PullFullState() does not need to do anything if it is called after
	// FullStateChanged().
	//
	// Second, it forces the platform to restore the full state of the
	// application thread. A platform can support lazy loading/restoring of
	// a thread state, so if the Sentry has not changed a thread state, the
	// platform may skip restoring it.
	//
	// Preconditions: The caller must be running on the task goroutine.
	FullStateChanged()

	// Interrupt interrupts a concurrent call to Switch(), causing it to return
	// ErrContextInterrupt.
	Interrupt()

	// Release() releases any resources associated with this context.
	Release()
}

var (
	// ErrContextSignal is returned by Context.Switch() to indicate that the
	// Context was interrupted by a signal.
	ErrContextSignal = fmt.Errorf("interrupted by signal")

	// ErrContextSignalCPUID is equivalent to ErrContextSignal, except that
	// a check should be done for execution of the CPUID instruction. If
	// the current instruction pointer is a CPUID instruction, then this
	// should be emulated appropriately. If not, then the given signal
	// should be handled per above.
	ErrContextSignalCPUID = fmt.Errorf("interrupted by signal, possible CPUID")

	// ErrContextInterrupt is returned by Context.Switch() to indicate that the
	// Context was interrupted by a call to Context.Interrupt().
	ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()")

	// ErrContextCPUPreempted is returned by Context.Switch() to indicate that
	// one of the following occurred:
	//
	// - The CPU executing the Context is not the CPU passed to
	// Context.Switch().
	//
	// - The CPU executing the Context may have executed another Context since
	// the last time it executed this one; or the CPU has previously executed
	// another Context, and has never executed this one.
	//
	// - Platform.PreemptAllCPUs() was called since the last return from
	// Context.Switch().
	ErrContextCPUPreempted = fmt.Errorf("interrupted by CPU preemption")
)
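
// runThread is a minimal sketch (hypothetical; the real dispatch loop lives in
// the sentry's task goroutine) showing how the special errors returned by
// Context.Switch might be distinguished by a caller.
func runThread(ctx context.Context, c Context, mm MemoryManager, ac arch.Context) {
	defer c.Release()
	for {
		si, at, err := c.Switch(ctx, mm, ac, -1 /* cpu */)
		switch err {
		case nil:
			// The thread invoked a system call; handle it and resume.
		case ErrContextInterrupt:
			// Interrupted by Context.Interrupt(), possibly spuriously; check
			// for pending work and resume.
		case ErrContextCPUPreempted:
			// The thread may have changed CPUs; invalidate any CPU-local
			// state and resume.
		case ErrContextSignal, ErrContextSignalCPUID:
			// A signal was delivered; si describes the signal and, for
			// SIGSEGV, at describes the faulting access.
			_, _ = si, at
		default:
			// Unexpected platform error; stop running this thread.
			return
		}
	}
}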

// SignalInterrupt is a signal reserved for use by implementations of
// Context.Interrupt(). The sentry guarantees that it will ignore delivery of
// this signal both to Contexts and to the sentry itself, under the assumption
// that they originate from races with Context.Interrupt().
//
// NOTE(b/23420492): The Go runtime only guarantees that a small subset
// of signals will always be unblocked on all threads, one of which
// is SIGCHLD.
const SignalInterrupt = linux.SIGCHLD

// AddressSpace represents a virtual address space in which a Context can
// execute.
type AddressSpace interface {
	// MapFile creates a shared mapping of offsets fr from f at address addr.
	// Any existing overlapping mappings are silently replaced.
	//
	// If precommit is true, the platform should eagerly commit resources (e.g.
	// physical memory) to the mapping. The precommit flag is advisory and
	// implementations may choose to ignore it.
	//
	// Preconditions:
	// * addr and fr must be page-aligned.
	// * fr.Length() > 0.
	// * at.Any() == true.
	// * At least one reference must be held on all pages in fr, and must
	//   continue to be held as long as pages are mapped.
	MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error

	// Unmap unmaps the given range.
	//
	// Preconditions:
	// * addr is page-aligned.
	// * length > 0.
	Unmap(addr hostarch.Addr, length uint64)

	// Release releases this address space. After releasing, a new AddressSpace
	// must be acquired via platform.NewAddressSpace().
	Release()

	// PreFork() is called before creating a copy of AddressSpace. This
	// guarantees that this address space will be in a consistent state.
	PreFork()

	// PostFork() is called after creating a copy of AddressSpace.
	PostFork()

	// AddressSpaceIO methods are supported iff the associated platform's
	// Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this
	// does not hold may panic if AddressSpaceIO methods are invoked.
	AddressSpaceIO
}
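
// mapAndCall is a minimal sketch (a hypothetical helper, not part of the
// package) showing the intended pairing of MapFile and Unmap: the mapping is
// removed once the work that required it has finished. It assumes the caller
// satisfies the MapFile preconditions above, including holding references on
// the pages in fr for the duration of the mapping.
func mapAndCall(as AddressSpace, addr hostarch.Addr, f memmap.File, fr memmap.FileRange, fn func()) error {
	if err := as.MapFile(addr, f, fr, hostarch.ReadWrite, false /* precommit */); err != nil {
		return err
	}
	defer as.Unmap(addr, fr.Length())
	fn()
	return nil
}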

// AddressSpaceIO supports IO through the memory mappings installed in an
// AddressSpace.
//
// AddressSpaceIO implementors are responsible for ensuring that address ranges
// are application-mappable.
type AddressSpaceIO interface {
	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
	// returns the number of bytes copied. If the number of bytes copied is <
	// len(src), it returns a non-nil error explaining why.
	CopyOut(addr hostarch.Addr, src []byte) (int, error)

	// CopyIn copies len(dst) bytes from the memory mapped at addr to dst.
	// It returns the number of bytes copied. If the number of bytes copied is
	// < len(dst), it returns a non-nil error explaining why.
	CopyIn(addr hostarch.Addr, dst []byte) (int, error)

	// ZeroOut sets toZero bytes to 0, starting at addr. It returns the number
	// of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
	// non-nil error explaining why.
	ZeroOut(addr hostarch.Addr, toZero uintptr) (uintptr, error)

	// SwapUint32 atomically sets the uint32 value at addr to new and returns
	// the previous value.
	//
	// Preconditions: addr must be aligned to a 4-byte boundary.
	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)

	// CompareAndSwapUint32 atomically compares the uint32 value at addr to
	// old; if they are equal, the value in memory is replaced by new. In
	// either case, the previous value stored in memory is returned.
	//
	// Preconditions: addr must be aligned to a 4-byte boundary.
	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)

	// LoadUint32 atomically loads the uint32 value at addr and returns it.
	//
	// Preconditions: addr must be aligned to a 4-byte boundary.
	LoadUint32(addr hostarch.Addr) (uint32, error)
}
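
// addUint32 is a minimal sketch (a hypothetical helper, not part of the
// package) showing the usual compare-and-swap retry loop built from the
// primitives above: atomically add delta to the uint32 at addr and return the
// new value.
func addUint32(io AddressSpaceIO, addr hostarch.Addr, delta uint32) (uint32, error) {
	for {
		old, err := io.LoadUint32(addr)
		if err != nil {
			return 0, err
		}
		prev, err := io.CompareAndSwapUint32(addr, old, old+delta)
		if err != nil {
			return 0, err
		}
		if prev == old {
			return old + delta, nil
		}
		// Another writer changed the value between the load and the CAS;
		// retry with the current value.
	}
}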

// NoAddressSpaceIO implements AddressSpaceIO methods by panicking.
type NoAddressSpaceIO struct{}

// CopyOut implements AddressSpaceIO.CopyOut.
func (NoAddressSpaceIO) CopyOut(addr hostarch.Addr, src []byte) (int, error) {
	panic("This platform does not support AddressSpaceIO")
}

// CopyIn implements AddressSpaceIO.CopyIn.
func (NoAddressSpaceIO) CopyIn(addr hostarch.Addr, dst []byte) (int, error) {
	panic("This platform does not support AddressSpaceIO")
}

// ZeroOut implements AddressSpaceIO.ZeroOut.
func (NoAddressSpaceIO) ZeroOut(addr hostarch.Addr, toZero uintptr) (uintptr, error) {
	panic("This platform does not support AddressSpaceIO")
}

// SwapUint32 implements AddressSpaceIO.SwapUint32.
func (NoAddressSpaceIO) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) {
	panic("This platform does not support AddressSpaceIO")
}

// CompareAndSwapUint32 implements AddressSpaceIO.CompareAndSwapUint32.
func (NoAddressSpaceIO) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) {
	panic("This platform does not support AddressSpaceIO")
}

// LoadUint32 implements AddressSpaceIO.LoadUint32.
func (NoAddressSpaceIO) LoadUint32(addr hostarch.Addr) (uint32, error) {
	panic("This platform does not support AddressSpaceIO")
}

// SegmentationFault is an error returned by AddressSpaceIO methods when IO
// fails due to access of an unmapped page, or a mapped page with insufficient
// permissions.
type SegmentationFault struct {
	// Addr is the address at which the fault occurred.
	Addr hostarch.Addr
}

// Error implements error.Error.
func (f SegmentationFault) Error() string {
	return fmt.Sprintf("segmentation fault at %#x", f.Addr)
}
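
// faultAddr is a minimal sketch (a hypothetical helper, not part of the
// package) showing how a caller of AddressSpaceIO methods might recover the
// faulting address from a returned error.
func faultAddr(err error) (hostarch.Addr, bool) {
	if sf, ok := err.(SegmentationFault); ok {
		return sf.Addr, true
	}
	return 0, false
}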

// Requirements is used to specify platform-specific requirements.
type Requirements struct {
	// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
	// current pid namespace.
	RequiresCurrentPIDNS bool
	// RequiresCapSysPtrace indicates that the sandbox has to be started with
	// the CAP_SYS_PTRACE capability.
	RequiresCapSysPtrace bool
}

// Constructor represents a platform type.
type Constructor interface {
	// New returns a new platform instance.
	//
	// Arguments:
	//
	// * deviceFile - the device file (e.g. /dev/kvm for the KVM platform).
	New(deviceFile *os.File) (Platform, error)

	// OpenDevice opens the device file used by the platform, if any.
	OpenDevice() (*os.File, error)

	// Requirements returns platform-specific requirements.
	Requirements() Requirements
}
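
// exampleConstructor is a minimal sketch (hypothetical; not a real platform)
// of a Constructor for a platform that needs no device file and has no
// special requirements. A real implementation would return a working Platform
// from New and would typically be registered via Register from an init
// function in its own package.
type exampleConstructor struct{}

// New implements Constructor.New.
func (exampleConstructor) New(deviceFile *os.File) (Platform, error) {
	return nil, fmt.Errorf("exampleConstructor is illustrative only")
}

// OpenDevice implements Constructor.OpenDevice.
func (exampleConstructor) OpenDevice() (*os.File, error) {
	return nil, nil // No device file is required.
}

// Requirements implements Constructor.Requirements.
func (exampleConstructor) Requirements() Requirements {
	return Requirements{}
}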

// platforms contains all available platform types.
var platforms = map[string]Constructor{}

// Register registers a new platform type.
func Register(name string, platform Constructor) {
	platforms[name] = platform
}

// Lookup looks up the platform constructor by name.
func Lookup(name string) (Constructor, error) {
	p, ok := platforms[name]
	if !ok {
		return nil, fmt.Errorf("unknown platform: %v", name)
	}
	return p, nil
}
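
// selectPlatform is a minimal sketch (a hypothetical helper, not part of the
// package) of the intended lookup flow: resolve a registered Constructor by
// name, open its device file, and construct the Platform from it.
func selectPlatform(name string) (Platform, error) {
	ctor, err := Lookup(name)
	if err != nil {
		return nil, err
	}
	deviceFile, err := ctor.OpenDevice()
	if err != nil {
		return nil, err
	}
	return ctor.New(deviceFile)
}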