github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/systrap/sysmsg/sysmsg.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sysmsg provides a stub signal handler and a communication protocol
    16  // between stub threads and the Sentry.
    17  //
    18  // Note that this package is allowlisted for use of sync/atomic.
    19  //
    20  // +checkalignedignore
    21  package sysmsg
    22  
    23  import (
    24  	"fmt"
    25  	"strings"
    26  	"sync/atomic"
    27  
    28  	"golang.org/x/sys/unix"
    29  	"github.com/metacubex/gvisor/pkg/abi/linux"
    30  	"github.com/metacubex/gvisor/pkg/hostarch"
    31  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    32  )
    33  
    34  // LINT.IfChange
    35  // Per-thread stack layout:
    36  //
    37  // *------------*
    38  // | guard page |
    39  // |------------|
    40  // |            |
    41  // |  sysstack  |
    42  // |            |
    43  // *------------*
    44  // | guard page |
    45  // |------------|
    46  // |            |
    47  // |     ^      |
    48  // |    / \     |
    49  // |     |      |
    50  // |  altstack  |
    51  // |------------|
    52  // |   sysmsg   |
    53  // *------------*
const (
	// PerThreadMemSize is the size of a per-thread memory region.
	PerThreadMemSize = 8 * hostarch.PageSize
	// GuardSize is the size of an unmapped region which is placed right
	// before the signal stack.
	//
	// PerThreadPrivateStackOffset and PerThreadPrivateStackSize are the
	// offset and the size of the per-thread private stack: the offset equals
	// one guard region and the size is two pages.
	// StackAddrToSyshandlerStack adds both of them to a stack address to
	// obtain the syshandler stack address.
	GuardSize                   = hostarch.PageSize
	PerThreadPrivateStackOffset = GuardSize
	PerThreadPrivateStackSize   = 2 * hostarch.PageSize
	// PerThreadSharedStackSize is the size of a per-thread shared stack
	// region and PerThreadSharedStackOffset is its offset; both are four
	// pages.
	// NOTE(review): the offsets are presumably relative to the start of the
	// per-thread memory region - confirm against sysmsg.h.
	PerThreadSharedStackSize   = 4 * hostarch.PageSize
	PerThreadSharedStackOffset = 4 * hostarch.PageSize
	// MsgOffsetFromSharedStack is the offset of the Msg structure on
	// the thread stack. It is computed as the per-thread region size minus
	// one page minus PerThreadSharedStackOffset, i.e. three pages with the
	// values above. StackAddrToMsg and MsgToStackAddr add and subtract it.
	MsgOffsetFromSharedStack = PerThreadMemSize - hostarch.PageSize - PerThreadSharedStackOffset

	// SpinningQueueMemSize is the size of a spinning queue memory region.
	SpinningQueueMemSize = hostarch.PageSize
)
    72  
    73  // StackAddrToMsg returns an address of a sysmsg structure.
    74  func StackAddrToMsg(sp uintptr) uintptr {
    75  	return sp + MsgOffsetFromSharedStack
    76  }
    77  
    78  // StackAddrToSyshandlerStack returns an address of a syshandler stack.
    79  func StackAddrToSyshandlerStack(sp uintptr) uintptr {
    80  	return sp + PerThreadPrivateStackOffset + PerThreadPrivateStackSize
    81  }
    82  
    83  // MsgToStackAddr returns a start address of a stack.
    84  func MsgToStackAddr(msg uintptr) uintptr {
    85  	return msg - MsgOffsetFromSharedStack
    86  }
    87  
    88  // ThreadState is used to store a state of the sysmsg thread.
    89  type ThreadState uint32
    90  
    91  // Set atomicaly sets the state value.
    92  func (s *ThreadState) Set(state ThreadState) {
    93  	atomic.StoreUint32((*uint32)(s), uint32(state))
    94  }
    95  
    96  // CompareAndSwap atomicaly compares and swaps the state value.
    97  func (s *ThreadState) CompareAndSwap(old, state ThreadState) bool {
    98  	return atomic.CompareAndSwapUint32((*uint32)(s), uint32(old), uint32(state))
    99  }
   100  
   101  // Get returns the current state value.
   102  //
   103  //go:nosplit
   104  func (s *ThreadState) Get() ThreadState {
   105  	return ThreadState(atomic.LoadUint32((*uint32)(s)))
   106  }
   107  
// Values of ThreadState. They are numbered consecutively from zero via iota,
// so their order must not change.
const (
	// ThreadStateNone means that the thread is executing the user workload.
	ThreadStateNone ThreadState = iota
	// ThreadStateDone means that the last event has been handled and the stub
	// thread can be resumed.
	ThreadStateDone
	// ThreadStatePrep means that syshandler started filling the sysmsg struct.
	ThreadStatePrep
	// ThreadStateAsleep means that this thread fell asleep because there were
	// not enough contexts to process in the context queue.
	ThreadStateAsleep
	// ThreadStateInitializing is only set once at sysmsg thread creation time. It
	// is used to tell the signal handler that the thread does not yet have a
	// context.
	ThreadStateInitializing
)
   124  
// Msg contains the current state of the sysmsg thread.
//
// The field order and sizes are part of the protocol with the stub code (see
// the LINT.IfChange/ThenChange markers referring to sysmsg.h), so fields must
// not be reordered or resized here alone.
type Msg struct {
	// The next batch of fields is used to call the syshandler stub
	// function. A system call can be replaced with a function call. When
	// a function call is executed, it can't change the current process
	// stack, so it needs to save stack and instruction registers, switch
	// on its syshandler stack and call the jmp instruction to the syshandler
	// address.
	//
	// Self is a pointer to itself in a process address space.
	Self uint64
	// RetAddr is a return address from the syshandler function.
	RetAddr uint64
	// Syshandler is an address of the syshandler function.
	Syshandler uint64
	// SyshandlerStack is an address of the thread syshandler stack.
	SyshandlerStack uint64
	// AppStack is a value of the stack register before calling the syshandler
	// function.
	AppStack uint64
	// interrupt is non-zero if there is a postponed interrupt.
	interrupt uint32
	// State indicates to the sentry what the sysmsg thread is doing at a given
	// moment.
	State ThreadState
	// Context is a pointer to the ThreadContext struct that the current sysmsg
	// thread is processing. Init resets it to zero.
	Context uint64

	// FaultJump is the size of a faulted instruction.
	FaultJump int32
	// Err is the error value with which the {sig|sys}handler crashes the stub
	// thread (see sysmsg.h:__panic). ConvertSysmsgErr interprets it as a
	// StubError.
	Err int32
	// ErrAdditional is an error value that gives additional information
	// about the panic.
	ErrAdditional int32
	// Line is the code line on which the {sig|sys}handler crashed the stub thread
	// (see sysmsg.h:panic). Init sets it to -1.
	Line int32
	// Debug is a variable to use to get visibility into the stub from the sentry.
	Debug uint64
	// ThreadID is the ID of the sysmsg thread.
	ThreadID uint32
}
   170  
   171  // ContextState defines the reason the context has exited back to the sentry,
   172  // or ContextStateNone if running/ready-to-run.
   173  type ContextState uint32
   174  
   175  // Set atomicaly sets the state value.
   176  func (s *ContextState) Set(state ContextState) {
   177  	atomic.StoreUint32((*uint32)(s), uint32(state))
   178  }
   179  
   180  // Get returns the current state value.
   181  //
   182  //go:nosplit
   183  func (s *ContextState) Get() ContextState {
   184  	return ContextState(atomic.LoadUint32((*uint32)(s)))
   185  }
   186  
// Context State types. They are numbered consecutively from zero via iota, so
// their order must not change.
const (
	// ContextStateNone means that the context is either running in the user
	// task or is ready to run in the user task.
	ContextStateNone ContextState = iota
	// ContextStateSyscall means that a syscall event is triggered from the
	// sighandler.
	ContextStateSyscall
	// ContextStateFault means that there is a fault event that needs to be
	// handled.
	ContextStateFault
	// ContextStateSyscallTrap means that a syscall event is triggered from
	// a function call (syshandler).
	ContextStateSyscallTrap
	// ContextStateSyscallCanBePatched means that the syscall can be replaced
	// with a function call.
	ContextStateSyscallCanBePatched
	// ContextStateInvalid is an invalid state that the sentry should never see.
	ContextStateInvalid
)
   207  
// Sizing constants for ThreadContext.
const (
	// MaxFPStateLen is the largest possible FPState that we will save. It is
	// the length of the ThreadContext.FPState array.
	// Note: This value was chosen to be able to fit ThreadContext into one page.
	MaxFPStateLen uint32 = 3584

	// AllocatedSizeofThreadContextStruct defines how much memory to allocate for
	// one instance of ThreadContext.
	// We over-allocate the memory for it because:
	//   - The next instance needs to be aligned to 64 bytes for purposes of xsave.
	//   - It's nice to align it to the page boundary.
	AllocatedSizeofThreadContextStruct uintptr = 4096
)
   220  
// ThreadContext contains the current context of the sysmsg thread. The struct
// facilitates switching contexts by allowing the sentry to switch pointers to
// this struct as it needs to.
//
// The field order and sizes are part of the protocol with the stub code (see
// the LINT.IfChange/ThenChange markers referring to sysmsg.h), so fields must
// not be reordered or resized here alone.
type ThreadContext struct {
	// FPState is a region of memory where:
	//   - syshandler saves FPU state to using xsave/fxsave
	//   - sighandler copies FPU state to from ucontext->uc_mcontext.fpregs
	// Note that xsave requires this region of memory to be 64 byte aligned;
	// therefore allocations of ThreadContext must be too.
	FPState [MaxFPStateLen]byte
	// FPStateChanged is a flag telling the stub thread that it needs to
	// restore FPState because the sentry changed it. It is stored as an
	// integer rather than a bool; Init sets it to 1.
	FPStateChanged uint64
	// Regs is the context's GP register set. The {sig|sys}handler will save and
	// restore the user app's registers here.
	Regs linux.PtraceRegs

	// SignalInfo is the siginfo struct.
	SignalInfo linux.SignalInfo
	// Signo is the signal that the stub is requesting the sentry to handle.
	Signo int64
	// State indicates the reason why the context has exited back to the sentry.
	State ContextState
	// Interrupt is set to indicate that this context has been interrupted.
	Interrupt uint32
	// ThreadID is the ID of the sysmsg thread that's currently working on the
	// context.
	ThreadID uint32
	// LastThreadID is the ID of the previous sysmsg thread that ran the context
	// (not the one currently working on it). This field is used by sysmsg threads
	// to detect whether fpstate may have changed since the last time they ran a
	// context.
	LastThreadID uint32
	// SentryFastPath is used to indicate to the stub thread that the sentry
	// goroutine used for this thread context is busy-polling for a response
	// instead of using FUTEX_WAIT.
	SentryFastPath uint32
	// AckedTime is used by sysmsg threads to signal to the sentry that this context
	// has been picked up from the context queue and is actively being worked on.
	// The stub thread puts down the timestamp at which it has started processing
	// this context.
	AckedTime uint64
	// StateChangedTime is the time when the ThreadContext.State changed, as
	// recorded by the stub thread when it gave it back to the sentry
	// (the sentry does not populate this field except to reset it).
	StateChangedTime uint64
	// TLS is a pointer to a thread local storage.
	// It is only populated on ARM64.
	TLS uint64
	// Debug is a variable to use to get visibility into the stub from the sentry.
	Debug uint64
}
   273  
// StubError values represent known stub-thread failure modes.
// Since these errors originate from the stub threads, look at
// sysmsg.h:stub_error.
type StubError int32

// Known StubError values. They are numbered consecutively starting at
// 0x0bad0000, so their order must not change.
const (
	// StubErrorBadSysmsg indicates sysmsg->self did not match sysmsg.
	StubErrorBadSysmsg StubError = 0x0bad0000 + iota
	// StubErrorBadThreadState indicates sysmsg->state was invalid.
	StubErrorBadThreadState
	// StubErrorBadSpinningQueueDecref indicates stubs removed more threads
	// from spinning queue than were put in.
	StubErrorBadSpinningQueueDecref
	// StubErrorArchPrctl indicates an error when calling arch_prctl.
	StubErrorArchPrctl
	// StubErrorFutex indicates an error when calling futex.
	StubErrorFutex
	// StubErrorBadContextID indicates a context received from the context
	// queue was of unexpected value.
	StubErrorBadContextID
	// StubErrorFpStateBadHeader indicates that the floating point state
	// header did not match the expected value.
	StubErrorFpStateBadHeader
)
   298  
   299  // LINT.ThenChange(sysmsg.h)
   300  
   301  // Init initializes the message.
   302  func (m *Msg) Init(threadID uint32) {
   303  	m.Err = 0
   304  	m.ErrAdditional = 0
   305  	m.Line = -1
   306  	m.ThreadID = threadID
   307  	m.Context = 0
   308  }
   309  
   310  // Init initializes the ThreadContext instance.
   311  func (c *ThreadContext) Init(initialThreadID uint32) {
   312  	c.FPStateChanged = 1
   313  	c.Regs = linux.PtraceRegs{}
   314  	c.Signo = 0
   315  	c.SignalInfo = linux.SignalInfo{}
   316  	c.State = ContextStateNone
   317  	c.ThreadID = initialThreadID
   318  }
   319  
   320  // ConvertSysmsgErr converts m.Err to platform.ContextError.
   321  func (m *Msg) ConvertSysmsgErr() *platform.ContextError {
   322  	err := &platform.ContextError{
   323  		Errno: unix.EPERM,
   324  	}
   325  
   326  	const prefix = "systrap stub thread failure:"
   327  	suffix := fmt.Sprintf("(failed on line %d; %s)", atomic.LoadInt32(&m.Line), m.String())
   328  	switch StubError(atomic.LoadInt32(&m.Err)) {
   329  	case StubErrorBadSysmsg:
   330  		err.Err = fmt.Errorf("%s sysmsg->self did not match sysmsg during sig/sys-handler %s", prefix, suffix)
   331  	case StubErrorBadThreadState:
   332  		err.Err = fmt.Errorf("%s sysmsg->state was invalid during sys-handler %s", prefix, suffix)
   333  	case StubErrorBadSpinningQueueDecref:
   334  		err.Err = fmt.Errorf("%s imbalanced use of spinning queue %s", prefix, suffix)
   335  	case StubErrorArchPrctl:
   336  		err.Err = fmt.Errorf("%s arch_prctl error=0x%x %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
   337  	case StubErrorFutex:
   338  		err.Err = fmt.Errorf("%s futex error=0x%x %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
   339  	case StubErrorBadContextID:
   340  		err.Err = fmt.Errorf("%s unexpected context ID (%d) from context queue %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
   341  	case StubErrorFpStateBadHeader:
   342  		err.Err = fmt.Errorf("%s FP state context magic header (%d) does not match expected FPSIMD_MAGIC %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix)
   343  	default:
   344  		err.Err = fmt.Errorf("%s unknown reason (0x%x) (possible shared memory corruption) %s", prefix, atomic.LoadInt32(&m.Err), suffix)
   345  	}
   346  
   347  	return err
   348  }
   349  
   350  func (m *Msg) String() string {
   351  	var b strings.Builder
   352  	fmt.Fprintf(&b, "sysmsg.Msg{msg: %x state %d", m.Self, m.State)
   353  	fmt.Fprintf(&b, " err %x line %d debug %x", m.Err, m.Line, m.Debug)
   354  	fmt.Fprintf(&b, " app stack %x", m.AppStack)
   355  	fmt.Fprintf(&b, " context %x", m.Context)
   356  	fmt.Fprintf(&b, " ThreadID %d", m.ThreadID)
   357  	b.WriteString("}")
   358  
   359  	return b.String()
   360  }
   361  
   362  func (c *ThreadContext) String() string {
   363  	var b strings.Builder
   364  	fmt.Fprintf(&b, "sysmsg.ThreadContext{state %d", c.State.Get())
   365  	fmt.Fprintf(&b, " fault addr %x syscall %d", c.SignalInfo.Addr(), c.SignalInfo.Syscall())
   366  	fmt.Fprintf(&b, " ip %x sp %x", c.Regs.InstructionPointer(), c.Regs.StackPointer())
   367  	fmt.Fprintf(&b, " FPStateChanged %d Regs %+v", c.FPStateChanged, c.Regs)
   368  	fmt.Fprintf(&b, " Interrupt %d", c.Interrupt)
   369  	fmt.Fprintf(&b, " ThreadID %d LastThreadID %d", c.ThreadID, c.LastThreadID)
   370  	fmt.Fprintf(&b, " SentryFastPath %d Acked %d", c.SentryFastPath, c.AckedTime)
   371  	fmt.Fprintf(&b, " signo: %d, siginfo: %+v", c.Signo, c.SignalInfo)
   372  	fmt.Fprintf(&b, " debug %d", atomic.LoadUint64(&c.Debug))
   373  	b.WriteString("}")
   374  
   375  	return b.String()
   376  }