github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/arch/fpu/fpu_amd64.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build amd64 || i386
    16  // +build amd64 i386
    17  
    18  package fpu
    19  
    20  import (
    21  	"fmt"
    22  	"io"
    23  
    24  	"golang.org/x/sys/unix"
    25  	"github.com/metacubex/gvisor/pkg/cpuid"
    26  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    27  	"github.com/metacubex/gvisor/pkg/hostarch"
    28  	"github.com/metacubex/gvisor/pkg/safecopy"
    29  	"github.com/metacubex/gvisor/pkg/sync"
    30  )
    31  
// FPSoftwareFrame is equivalent to struct _fpx_sw_bytes, the data stored by
// Linux in bytes 464:511 of the fxsave/xsave frame.
//
// +marshal
type FPSoftwareFrame struct {
	// Magic1 identifies the frame format; see FP_XSTATE_MAGIC1.
	Magic1 uint32
	// ExtendedSize is the size of the extended signal frame.
	ExtendedSize uint32
	// Xfeatures is the XSAVE feature (state-component) bitmask.
	Xfeatures uint64
	// XstateSize is the size of the xsave area.
	XstateSize uint32
	// Padding pads the struct to the full 48-byte _fpx_sw_bytes size.
	Padding [7]uint32
}
    43  
// From Linux's arch/x86/include/uapi/asm/sigcontext.h.
const (
	// FP_XSTATE_MAGIC1 is the value of FPSoftwareFrame.Magic1 ("XSPF" in
	// little-endian ASCII).
	FP_XSTATE_MAGIC1 = 0x46505853
	// FP_SW_FRAME_OFFSET is the offset of FPSoftwareFrame in the
	// fxsave/xsave area.
	FP_SW_FRAME_OFFSET = 464

	// FP_XSTATE_MAGIC2 is the value written to the 4 bytes inserted by
	// Linux after the fxsave/xsave area in the signal frame.
	FP_XSTATE_MAGIC2 = 0x46505845
	// FP_XSTATE_MAGIC2_SIZE is the size of FP_XSTATE_MAGIC2 in bytes.
	FP_XSTATE_MAGIC2_SIZE = 4
)
    58  
// From Linux's arch/x86/include/asm/fpu/types.h.
const (
	// XFEATURE_MASK_FPSSE is xsave features that are always enabled in
	// signal frame fpstate (x87 FP and SSE state).
	XFEATURE_MASK_FPSSE = 0x3

	// FXSAVE_AREA_SIZE is the size of the FXSAVE area in bytes.
	FXSAVE_AREA_SIZE = 512
)
    68  
// initX86FPState (defined in asm files) sets up initial state in the buffer
// pointed to by data. useXsave indicates whether the caller wants the
// XSAVE-format layout (callers pass cpuid.HostFeatureSet().UseXsave(), or
// false when only the legacy FXSAVE area is needed).
func initX86FPState(data *byte, useXsave bool)
    71  
    72  func newX86FPStateSlice() State {
    73  	maxsize, align := cpuid.HostFeatureSet().ExtendedStateSize()
    74  	// We need capacity to be large enough to hold AMX bytes because of
    75  	// ptrace. PTRACE_SETREGSET/GETREGSET assume that AMX portions should
    76  	// always be used.
    77  	// TODO(gvisor.dev/issues/9896): Implement AMX Support.
    78  	capacity := maxsize + FP_XSTATE_MAGIC2_SIZE
    79  	size := maxsize - cpuid.HostFeatureSet().AMXExtendedStateSize()
    80  	// Always use at least 4096 bytes.
    81  	//
    82  	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
    83  	// that the underlying array is at _least_ that size otherwise we will
    84  	// corrupt random memory. This is not a pleasant thing to debug.
    85  	if capacity < 4096 {
    86  		capacity = 4096
    87  	}
    88  	return alignedBytes(capacity, align)[:size+FP_XSTATE_MAGIC2_SIZE]
    89  }
    90  
    91  // Slice returns the byte array that contains only the fpu state. `s` has the
    92  // fpu state and FP_XSTATE_MAGIC2.
    93  func (s State) Slice() []byte {
    94  	return s[:len(s)-FP_XSTATE_MAGIC2_SIZE]
    95  }
    96  
    97  // NewState returns an initialized floating point state.
    98  //
    99  // The returned state is large enough to store all floating point state
   100  // supported by host, even if the app won't use much of it due to a restricted
   101  // FeatureSet. Since they may still be able to see state not advertised by
   102  // CPUID we must ensure it does not contain any sentry state.
   103  func NewState() State {
   104  	f := newX86FPStateSlice()
   105  	initX86FPState(&f[0], cpuid.HostFeatureSet().UseXsave())
   106  	return f
   107  }
   108  
   109  // Fork creates and returns an identical copy of the x86 floating point state.
   110  func (s *State) Fork() State {
   111  	n := newX86FPStateSlice()
   112  	copy(n, *s)
   113  	return n
   114  }
   115  
   116  // Reset resets s to its initial state.
   117  func (s *State) Reset() {
   118  	f := *s
   119  	for i := range f {
   120  		f[i] = 0
   121  	}
   122  	initX86FPState(&f[0], cpuid.HostFeatureSet().UseXsave())
   123  }
   124  
// Host FP parameters, populated exactly once by InitHostState.
var (
	// hostXCR0Mask is the host's valid XCR0 mask.
	hostXCR0Mask uint64
	// hostFPSize is the host's extended state size, excluding the AMX
	// portion.
	hostFPSize uint
	// hostUseXsave is whether the host uses XSAVE.
	hostUseXsave bool
	// initHostStateOnce guards one-time initialization of the above.
	initHostStateOnce sync.Once
)
   131  
   132  // InitHostState initializes host parameters.
   133  func InitHostState() {
   134  	initHostStateOnce.Do(func() {
   135  		featureSet := cpuid.HostFeatureSet()
   136  		hostXCR0Mask = featureSet.ValidXCR0Mask()
   137  		hostUseXsave = featureSet.UseXsave()
   138  		hostFPSize, _ = featureSet.ExtendedStateSize()
   139  		// TODO(gvisor.dev/issues/9896): Implement AMX Support.
   140  		hostFPSize = hostFPSize - featureSet.AMXExtendedStateSize()
   141  	})
   142  }
   143  
// ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type
// manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently,
// ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area (it equals
// FXSAVE_AREA_SIZE).
const ptraceFPRegsSize = 512
   148  
   149  // PtraceGetFPRegs implements Context.PtraceGetFPRegs.
   150  func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) {
   151  	if maxlen < ptraceFPRegsSize {
   152  		return 0, linuxerr.EFAULT
   153  	}
   154  
   155  	return dst.Write((*s)[:ptraceFPRegsSize])
   156  }
   157  
   158  // PtraceSetFPRegs implements Context.PtraceSetFPRegs.
   159  func (s *State) PtraceSetFPRegs(src io.Reader, maxlen int) (int, error) {
   160  	if maxlen < ptraceFPRegsSize {
   161  		return 0, linuxerr.EFAULT
   162  	}
   163  
   164  	var f [ptraceFPRegsSize]byte
   165  	n, err := io.ReadFull(src, f[:])
   166  	if err != nil {
   167  		return 0, err
   168  	}
   169  	// Force reserved bits in MXCSR to 0. This is consistent with Linux.
   170  	sanitizeMXCSR(State(f[:]))
   171  	// N.B. this only copies the beginning of the FP state, which
   172  	// corresponds to the FXSAVE area.
   173  	copy(*s, f[:])
   174  	return n, nil
   175  }
   176  
const (
	// mxcsrOffset is the offset in bytes of the MXCSR field from the start of
	// the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE
	// Area")
	mxcsrOffset = 24

	// mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field from the
	// start of the FXSAVE area.
	mxcsrMaskOffset = 28
)
   187  
const (
	// minXstateBytes is the minimum size in bytes of an x86 XSAVE area, equal
	// to the size of the XSAVE legacy area (512 bytes) plus the size of the
	// XSAVE header (64 bytes). Equivalently, minXstateBytes is GDB's
	// X86_XSTATE_SSE_SIZE.
	minXstateBytes = 512 + 64

	// userXstateXCR0Offset is the offset in bytes of the USER_XSTATE_XCR0_WORD
	// field in Linux's struct user_xstateregs, which is the type manipulated
	// by ptrace(PTRACE_GET/SETREGSET, NT_X86_XSTATE). Equivalently,
	// userXstateXCR0Offset is GDB's I386_LINUX_XSAVE_XCR0_OFFSET.
	userXstateXCR0Offset = 464

	// xstateBVOffset is the offset in bytes of the XSTATE_BV field in an x86
	// XSAVE area.
	xstateBVOffset = 512

	// xcompBVOffset is the offset in bytes of the XCOMP_BV field in an x86
	// XSAVE area.
	xcompBVOffset = 520

	// xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts of the
	// XSAVE header that we coerce to zero: "Bytes 15:8 of the XSAVE header is
	// a state-component bitmap called XCOMP_BV. ... Bytes 63:16 of the XSAVE
	// header are reserved." - Intel SDM Vol. 1, Section 13.4.2 "XSAVE Header".
	// Linux ignores XCOMP_BV, but it's able to recover from XRSTOR #GP
	// exceptions resulting from invalid values; we aren't. Linux also never
	// uses the compacted format when doing XSAVE and doesn't even define the
	// compaction extensions to XSAVE as a CPU feature, so for simplicity we
	// assume no one is using them.
	xsaveHeaderZeroedOffset = 512 + 8
	xsaveHeaderZeroedBytes  = 64 - 8
)
   218  
   219  // PtraceGetXstateRegs implements ptrace(PTRACE_GETREGS, NT_X86_XSTATE) by
   220  // writing the floating point registers from this state to dst and returning the
   221  // number of bytes written, which must be less than or equal to maxlen.
   222  func (s *State) PtraceGetXstateRegs(dst io.Writer, maxlen int, featureSet cpuid.FeatureSet) (int, error) {
   223  	// N.B. s.x86FPState may contain more state than the application
   224  	// expects. We only copy the subset that would be in their XSAVE area.
   225  	ess, _ := featureSet.ExtendedStateSize()
   226  	f := make([]byte, ess)
   227  	copy(f, *s)
   228  	// "The XSAVE feature set does not use bytes 511:416; bytes 463:416 are
   229  	// reserved." - Intel SDM Vol 1., Section 13.4.1 "Legacy Region of an XSAVE
   230  	// Area". Linux uses the first 8 bytes of this area to store the OS XSTATE
   231  	// mask. GDB relies on this: see
   232  	// gdb/x86-linux-nat.c:x86_linux_read_description().
   233  	hostarch.ByteOrder.PutUint64(f[userXstateXCR0Offset:], featureSet.ValidXCR0Mask())
   234  	if len(f) > maxlen {
   235  		f = f[:maxlen]
   236  	}
   237  	return dst.Write(f)
   238  }
   239  
   240  // PtraceSetXstateRegs implements ptrace(PTRACE_SETREGS, NT_X86_XSTATE) by
   241  // reading floating point registers from src and returning the number of bytes
   242  // read, which must be less than or equal to maxlen.
   243  func (s *State) PtraceSetXstateRegs(src io.Reader, maxlen int, featureSet cpuid.FeatureSet) (int, error) {
   244  	// Allow users to pass an xstate register set smaller than ours (they can
   245  	// mask bits out of XSTATE_BV), as long as it's at least minXstateBytes.
   246  	// Also allow users to pass a register set larger than ours; anything after
   247  	// their ExtendedStateSize will be ignored. (I think Linux technically
   248  	// permits setting a register set smaller than minXstateBytes, but it has
   249  	// the same silent truncation behavior in kernel/ptrace.c:ptrace_regset().)
   250  	if maxlen < minXstateBytes {
   251  		return 0, unix.EFAULT
   252  	}
   253  	ess, _ := featureSet.ExtendedStateSize()
   254  	if maxlen > int(ess) {
   255  		maxlen = int(ess)
   256  	}
   257  	f := make([]byte, maxlen)
   258  	if _, err := io.ReadFull(src, f); err != nil {
   259  		return 0, err
   260  	}
   261  	n := copy(*s, f)
   262  	s.SanitizeUser(featureSet)
   263  	return n, nil
   264  }
   265  
   266  // SanitizeUser mutates s to ensure that restoring it is safe.
   267  func (s *State) SanitizeUser(featureSet cpuid.FeatureSet) {
   268  	f := *s
   269  
   270  	// Force reserved bits in MXCSR to 0. This is consistent with Linux.
   271  	sanitizeMXCSR(f)
   272  
   273  	if len(f) >= minXstateBytes {
   274  		// Users can't enable *more* XCR0 bits than what we, and the CPU, support.
   275  		xstateBV := hostarch.ByteOrder.Uint64(f[xstateBVOffset:])
   276  		xstateBV &= featureSet.ValidXCR0Mask()
   277  		hostarch.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV)
   278  		// Force XCOMP_BV and reserved bytes in the XSAVE header to 0.
   279  		reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes]
   280  		for i := range reserved {
   281  			reserved[i] = 0
   282  		}
   283  	}
   284  }
   285  
var (
	// mxcsrMask is the host's MXCSR_MASK, computed lazily by sanitizeMXCSR.
	mxcsrMask uint32
	// initMXCSRMask guards one-time computation of mxcsrMask.
	initMXCSRMask sync.Once
)
   290  
   291  // sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0. ("FXRSTOR
   292  // generates a general-protection fault (#GP) in response to an attempt to set
   293  // any of the reserved bits of the MXCSR register." - Intel SDM Vol. 1, Section
   294  // 10.5.1.2 "SSE State")
   295  func sanitizeMXCSR(f State) {
   296  	mxcsr := hostarch.ByteOrder.Uint32(f[mxcsrOffset:])
   297  	initMXCSRMask.Do(func() {
   298  		temp := State(alignedBytes(uint(ptraceFPRegsSize), 16))
   299  		initX86FPState(&temp[0], false /* useXsave */)
   300  		mxcsrMask = hostarch.ByteOrder.Uint32(temp[mxcsrMaskOffset:])
   301  		if mxcsrMask == 0 {
   302  			// "If the value of the MXCSR_MASK field is 00000000H, then the
   303  			// MXCSR_MASK value is the default value of 0000FFBFH." - Intel SDM
   304  			// Vol. 1, Section 11.6.6 "Guidelines for Writing to the MXCSR
   305  			// Register"
   306  			mxcsrMask = 0xffbf
   307  		}
   308  	})
   309  	mxcsr &= mxcsrMask
   310  	hostarch.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr)
   311  }
   312  
   313  // SetMXCSR sets the MXCSR control/status register in the state.
   314  func (s *State) SetMXCSR(mxcsr uint32) {
   315  	hostarch.ByteOrder.PutUint32((*s)[mxcsrOffset:], mxcsr)
   316  }
   317  
   318  // GetMXCSR gets the MXCSR control/status register in the state.
   319  func (s *State) GetMXCSR() uint32 {
   320  	return hostarch.ByteOrder.Uint32((*s)[mxcsrOffset:])
   321  }
   322  
// BytePointer returns a pointer to the first byte of the state.
//
// The state must be non-empty; indexing panics otherwise.
//
//go:nosplit
func (s *State) BytePointer() *byte {
	return &(*s)[0]
}
   329  
// fxsaveBV is the XSTATE_BV value equivalent to an FXSAVE frame: XSTATE_BV
// does not exist if FXSAVE is used, but FXSAVE implicitly saves x87 and SSE
// state, so this is the equivalent XSTATE_BV value.
const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE
   333  
// AfterLoad converts the loaded state to the format that compatible with the
// current processor. It panics (via ErrLoadingState or a formatted message)
// if the saved state cannot be represented or restored on this host.
func (s *State) AfterLoad() {
	// Keep the loaded bytes (minus the FP_XSTATE_MAGIC2 suffix) for copying
	// into the fresh slice below.
	old := s.Slice()

	// Recreate the slice. This is done to ensure that it is aligned
	// appropriately in memory, and large enough to accommodate any new
	// state that may be saved by the new CPU. Even if extraneous new state
	// is saved, the state we care about is guaranteed to be a subset of
	// new state. Later optimizations can use less space when using a
	// smaller state component bitmap. Intel SDM Volume 1 Chapter 13 has
	// more info.
	*s = NewState()

	// x86FPState always contains all the FP state supported by the host.
	// We may have come from a newer machine that supports additional state
	// which we cannot restore.
	//
	// The x86 FP state areas are backwards compatible, so we can simply
	// truncate the additional floating point state.
	//
	// Applications should not depend on the truncated state because it
	// should relate only to features that were not exposed in the app
	// FeatureSet. However, because we do not *prevent* them from using
	// this state, we must verify here that there is no in-use state
	// (according to XSTATE_BV) which we do not support.
	// What do we support?
	supportedBV := fxsaveBV
	hostFeatureSet := cpuid.HostFeatureSet()
	if hostFeatureSet.UseXsave() {
		supportedBV = hostFeatureSet.ValidXCR0Mask()
	}

	// What was in use? If the saved state is too short to contain an XSAVE
	// header, it was an FXSAVE frame, which implies x87+SSE only.
	savedBV := fxsaveBV
	if len(old) >= xstateBVOffset+8 {
		savedBV = hostarch.ByteOrder.Uint64(old[xstateBVOffset:])
	}

	// Supported features must be a superset of saved features.
	if savedBV&^supportedBV != 0 {
		panic(ErrLoadingState{supportedFeatures: supportedBV, savedFeatures: savedBV})
	}

	// Copy to the new, aligned location.
	copy(*s, old)

	// If sanitizeMXCSR changes the value, the saved MXCSR had reserved bits
	// set that this host's MXCSR_MASK does not allow; restoring it would
	// #GP, so fail loudly instead.
	mxcsrBefore := s.GetMXCSR()
	sanitizeMXCSR(*s)
	mxcsrAfter := s.GetMXCSR()
	if mxcsrBefore != mxcsrAfter {
		panic(fmt.Sprintf("incompatible mxcsr value: %x (%x)", mxcsrBefore, mxcsrAfter))
	}
	if hostFeatureSet.UseXsave() {
		// Ask the host to validate the xstate; on failure, include the saved
		// header fields in the panic message for diagnosis.
		if err := safecopy.CheckXstate(s.BytePointer()); err != nil {
			xcompBV := uint64(0)
			if len(old) >= xcompBVOffset+8 {
				xcompBV = hostarch.ByteOrder.Uint64(old[xcompBVOffset:])
			}
			panic(fmt.Sprintf("incompatible state: %s\nlen(old)=%d len(new)=%d supportedBV=%#x XSTATE_BV=%#x XCOMP_BV=%#x", err, len(old), len(*s), supportedBV, savedBV, xcompBV))
		}
	}
}