github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/kvm/machine_amd64.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build amd64
    16  // +build amd64
    17  
    18  package kvm
    19  
    20  import (
    21  	"fmt"
    22  	"math/big"
    23  	"reflect"
    24  	"runtime"
    25  	"runtime/debug"
    26  
    27  	"golang.org/x/sys/unix"
    28  	"github.com/metacubex/gvisor/pkg/abi/linux"
    29  	"github.com/metacubex/gvisor/pkg/cpuid"
    30  	"github.com/metacubex/gvisor/pkg/hostarch"
    31  	"github.com/metacubex/gvisor/pkg/ring0"
    32  	"github.com/metacubex/gvisor/pkg/ring0/pagetables"
    33  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    34  	ktime "github.com/metacubex/gvisor/pkg/sentry/time"
    35  )
    36  
    37  // initArchState initializes architecture-specific state.
    38  func (m *machine) initArchState() error {
    39  	// Set the legacy TSS address. This address is covered by the reserved
     40  	// range (up to 4GB). In fact, this is one of the main reasons that range exists.
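         	// KVM's legacy TSS region spans three pages, which is why the address
         	// sits three pages below the top of the reserved range.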
    41  	if _, _, errno := unix.RawSyscall(
    42  		unix.SYS_IOCTL,
    43  		uintptr(m.fd),
    44  		KVM_SET_TSS_ADDR,
    45  		uintptr(reservedMemory-(3*hostarch.PageSize))); errno != 0 {
    46  		return errno
    47  	}
    48  
     49  	// Initialize all vCPUs now, to minimize the set of KVM ioctls allowed by the seccomp filters.
    50  	m.mu.Lock()
    51  	for i := 0; i < m.maxVCPUs; i++ {
    52  		m.createVCPU(i)
    53  	}
    54  	m.mu.Unlock()
    55  
    56  	c := m.Get()
    57  	defer m.Put(c)
    58  	// Enable CPUID faulting, if possible. Note that this also serves as a
     59  	// basic platform sanity test, since we will enter guest mode for the
    60  	// first time here. The recovery is necessary, since if we fail to read
     61  	// the platform info register, we will return to host mode and
    62  	// ultimately need to handle a segmentation fault.
    63  	old := debug.SetPanicOnFault(true)
    64  	defer func() {
    65  		recover()
    66  		debug.SetPanicOnFault(old)
    67  	}()
    68  
    69  	bluepill(c)
    70  	ring0.SetCPUIDFaulting(true)
    71  
    72  	return nil
    73  }
    74  
    75  type vCPUArchState struct {
    76  	// PCIDs is the set of PCIDs for this vCPU.
    77  	//
    78  	// This starts above fixedKernelPCID.
    79  	PCIDs *pagetables.PCIDs
    80  }
    81  
    82  const (
    83  	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
    84  	// tables. We must start allocating user PCIDs above this in order to
    85  	// avoid any conflict (see below).
    86  	fixedKernelPCID = 1
    87  
    88  	// poolPCIDs is the number of PCIDs to record in the database. As this
    89  	// grows, assignment can take longer, since it is a simple linear scan.
     90  	// Beyond a relatively small number, there are likely few performance
    91  	// benefits, since the TLB has likely long since lost any translations
    92  	// from more than a few PCIDs past.
    93  	poolPCIDs = 8
    94  )
    95  
    96  // initArchState initializes architecture-specific state.
    97  func (c *vCPU) initArchState() error {
    98  	var (
    99  		kernelSystemRegs systemRegs
   100  		kernelUserRegs   userRegs
   101  	)
   102  
   103  	// Set base control registers.
   104  	kernelSystemRegs.CR0 = c.CR0()
   105  	kernelSystemRegs.CR4 = c.CR4()
   106  	kernelSystemRegs.EFER = c.EFER()
   107  
   108  	// Set the IDT & GDT in the registers.
   109  	kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
   110  	kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
   111  	kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
   112  	kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
   113  	kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
   114  	kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
   115  	kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
   116  	kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
   117  	tssBase, tssLimit, tss := c.TSS()
   118  	kernelSystemRegs.TR.Load(tss, ring0.Tss)
   119  	kernelSystemRegs.TR.base = tssBase
   120  	kernelSystemRegs.TR.limit = uint32(tssLimit)
   121  
   122  	// Point to kernel page tables, with no initial PCID.
   123  	kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)
   124  
   125  	// Initialize the PCID database.
   126  	if hasGuestPCID {
   127  		// Note that NewPCIDs may return a nil table here, in which
   128  		// case we simply don't use PCID support (see below). In
   129  		// practice, this should not happen, however.
   130  		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
   131  	}
   132  
   133  	// Set the CPUID; this is required before setting system registers,
   134  	// since KVM will reject several CR4 bits if the CPUID does not
   135  	// indicate the support is available.
   136  	if err := c.setCPUID(); err != nil {
   137  		return err
   138  	}
   139  
   140  	// Set the entrypoint for the kernel.
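         	// Start receives the address of this vCPU's ring0.CPU in RAX and runs
         	// on the vCPU's kernel stack with kernel flags set.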
   141  	kernelUserRegs.RIP = uint64(ring0.AddrOfStart())
   142  	kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
   143  	kernelUserRegs.RSP = c.StackTop()
   144  	kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
   145  
   146  	// Set the system registers.
   147  	if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
   148  		return err
   149  	}
   150  
   151  	// Set the user registers.
   152  	if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 {
   153  		return fmt.Errorf("error setting user registers: %v", errno)
   154  	}
   155  
   156  	// Set the time offset to the host native time.
   157  	return c.setSystemTime()
   158  }
   159  
    160  // bitsForScaling is the number of bits available for storing the fraction
    161  // component of the TSC scaling ratio.
   162  // It is set using getBitsForScaling when the KVM platform is initialized.
   163  var bitsForScaling int64
   164  
   165  // getBitsForScaling returns the bits available for storing the fraction component
   166  // of the TSC scaling ratio. This allows us to replicate the (bad) math done by
   167  // the kernel below in scaledTSC, and ensure we can compute an exact zero
   168  // offset in setSystemTime.
   169  //
   170  // These constants correspond to kvm_tsc_scaling_ratio_frac_bits.
   171  func getBitsForScaling() int64 {
   172  	fs := cpuid.HostFeatureSet()
   173  	if fs.Intel() {
   174  		return 48 // See vmx.c (kvm sources).
   175  	} else if fs.AMD() {
    176  		return 32 // See svm.c (kvm sources).
   177  	} else {
   178  		return 63 // Unknown: theoretical maximum.
   179  	}
   180  }
   181  
   182  // scaledTSC returns the host TSC scaled by the given frequency.
   183  //
   184  // This assumes a current frequency of 1. We require only the unitless ratio of
   185  // rawFreq to some current frequency. See setSystemTime for context.
   186  //
   187  // The kernel math guarantees that all bits of the multiplication and division
    188  // will be correctly preserved and applied. However, it is not possible to
    189  // store the ratio exactly, so we need to use the same scheme in order to
    190  // calculate the scaled frequency and get the same result.
   191  //
   192  // We can assume that the current frequency is (1), so we are calculating a
   193  // strict inverse of this value. This simplifies this function considerably.
   194  //
   195  // Roughly, the returned value "scaledTSC" will have:
   196  // scaledTSC/hostTSC == 1/rawFreq
   197  //
   198  //go:nosplit
   199  func scaledTSC(rawFreq uintptr) int64 {
   200  	scale := int64(1 << bitsForScaling)
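         	// ratio holds 1/rawFreq in fixed point with bitsForScaling fractional
         	// bits, truncated by integer division just as the kernel truncates its
         	// scaling ratio.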
   201  	ratio := big.NewInt(scale / int64(rawFreq))
   202  	ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc())))
   203  	ratio.Div(ratio, big.NewInt(scale))
   204  	return ratio.Int64()
   205  }
   206  
   207  // setSystemTime sets the vCPU to the system time.
   208  func (c *vCPU) setSystemTime() error {
   209  	// Attempt to set the offset directly. This is supported as of Linux 5.16,
   210  	// or commit 828ca89628bfcb1b8f27535025f69dd00eb55207.
   211  	if err := c.setTSCOffset(); err == nil {
   212  		return err
   213  	}
   214  
    215  	// If TSC scaling is not supported, fall back to legacy mode.
   216  	if !c.machine.tscControl {
   217  		return c.setSystemTimeLegacy()
   218  	}
   219  
   220  	// First, scale down the clock frequency to the lowest value allowed by
   221  	// the API itself.  How low we can go depends on the underlying
   222  	// hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD.
   223  	// Even the lower bound here will take a 4GHz frequency down to 1Hz,
    224  	// meaning that everything should be able to handle a kHz setting of 1
   225  	// with bits to spare.
   226  	//
   227  	// Note that reducing the clock does not typically require special
   228  	// capabilities as it is emulated in KVM. We don't actually use this
   229  	// capability, but it means that this method should be robust to
   230  	// different hardware configurations.
   231  	rawFreq, err := c.getTSCFreq()
   232  	if err != nil {
   233  		return c.setSystemTimeLegacy()
   234  	}
   235  	if err := c.setTSCFreq(1); err != nil {
   236  		return c.setSystemTimeLegacy()
   237  	}
   238  
   239  	// Always restore the original frequency.
   240  	defer func() {
   241  		if err := c.setTSCFreq(rawFreq); err != nil {
   242  			panic(err.Error())
   243  		}
   244  	}()
   245  
   246  	// Attempt to set the system time in this compressed world. The
   247  	// calculation for offset normally looks like:
   248  	//
   249  	//	offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc());
   250  	//
   251  	// So as long as the kvm_scale_tsc component is constant before and
    252  	// after the call to set the TSC value (and it is passed as the
   253  	// target_tsc), we will compute an offset value of zero.
   254  	//
   255  	// This is effectively cheating to make our "setSystemTime" call so
   256  	// unbelievably, incredibly fast that we do it "instantly" and all the
   257  	// calculations result in an offset of zero.
   258  	lastTSC := scaledTSC(rawFreq)
   259  	for {
   260  		if err := c.setTSC(uint64(lastTSC)); err != nil {
   261  			return err
   262  		}
   263  		nextTSC := scaledTSC(rawFreq)
   264  		if lastTSC == nextTSC {
   265  			return nil
   266  		}
   267  		lastTSC = nextTSC // Try again.
   268  	}
   269  }
   270  
    271  // nonCanonical generates the signal return for a non-canonical address.
   272  //
   273  //go:nosplit
   274  func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
   275  	*info = linux.SignalInfo{
   276  		Signo: signal,
   277  		Code:  linux.SI_KERNEL,
   278  	}
   279  	info.SetAddr(addr) // Include address.
   280  	return hostarch.NoAccess, platform.ErrContextSignal
   281  }
   282  
   283  // fault generates an appropriate fault return.
   284  //
   285  //go:nosplit
   286  func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
   287  	bluepill(c) // Probably no-op, but may not be.
   288  	faultAddr := ring0.ReadCR2()
   289  	code, user := c.ErrorCode()
   290  	if !user {
   291  		// The last fault serviced by this CPU was not a user
   292  		// fault, so we can't reliably trust the faultAddr or
   293  		// the code provided here. We need to re-execute.
   294  		return hostarch.NoAccess, platform.ErrContextInterrupt
   295  	}
   296  	// Reset the pointed SignalInfo.
   297  	*info = linux.SignalInfo{Signo: signal}
   298  	info.SetAddr(uint64(faultAddr))
   299  	accessType := hostarch.AccessType{}
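         	// In the x86 page fault error code, bit 1 is set for writes and bit 4
         	// is set for instruction fetches.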
   300  	if signal == int32(unix.SIGSEGV) {
   301  		accessType = hostarch.AccessType{
   302  			Read:    code&(1<<1) == 0,
   303  			Write:   code&(1<<1) != 0,
   304  			Execute: code&(1<<4) != 0,
   305  		}
   306  	}
   307  	if !accessType.Write && !accessType.Execute {
   308  		info.Code = 1 // SEGV_MAPERR.
   309  	} else {
   310  		info.Code = 2 // SEGV_ACCERR.
   311  	}
   312  	return accessType, platform.ErrContextSignal
   313  }
   314  
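         // loadByte returns the byte at ptr. It is kept out of line so that the
         // memory read is always performed.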
   315  //go:nosplit
   316  //go:noinline
   317  func loadByte(ptr *byte) byte {
   318  	return *ptr
   319  }
   320  
    321  // SwitchToUser unpacks architectural details.
   322  func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
   323  	// Check for canonical addresses.
   324  	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) {
   325  		return nonCanonical(regs.Rip, int32(unix.SIGSEGV), info)
   326  	} else if !ring0.IsCanonical(regs.Rsp) {
   327  		return nonCanonical(regs.Rsp, int32(unix.SIGBUS), info)
   328  	} else if !ring0.IsCanonical(regs.Fs_base) {
   329  		return nonCanonical(regs.Fs_base, int32(unix.SIGBUS), info)
   330  	} else if !ring0.IsCanonical(regs.Gs_base) {
   331  		return nonCanonical(regs.Gs_base, int32(unix.SIGBUS), info)
   332  	}
   333  
   334  	// Assign PCIDs.
   335  	if c.PCIDs != nil {
   336  		var requireFlushPCID bool // Force a flush?
   337  		switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
   338  		switchOpts.KernelPCID = fixedKernelPCID
   339  		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
   340  	}
   341  
   342  	// See below.
   343  	var vector ring0.Vector
   344  
   345  	// Past this point, stack growth can cause system calls (and a break
   346  	// from guest mode). So we need to ensure that between the bluepill
   347  	// call here and the switch call immediately below, no additional
   348  	// allocations occur.
   349  	entersyscall()
   350  	bluepill(c)
   351  	vector = c.CPU.SwitchToUser(switchOpts)
   352  	exitsyscall()
   353  
   354  	switch vector {
   355  	case ring0.Syscall, ring0.SyscallInt80:
   356  		// Fast path: system call executed.
   357  		return hostarch.NoAccess, nil
   358  
   359  	case ring0.PageFault:
   360  		return c.fault(int32(unix.SIGSEGV), info)
   361  
   362  	case ring0.Debug, ring0.Breakpoint:
   363  		*info = linux.SignalInfo{
   364  			Signo: int32(unix.SIGTRAP),
   365  			Code:  1, // TRAP_BRKPT (breakpoint).
   366  		}
   367  		info.SetAddr(switchOpts.Registers.Rip) // Include address.
   368  		return hostarch.AccessType{}, platform.ErrContextSignal
   369  
   370  	case ring0.GeneralProtectionFault,
   371  		ring0.SegmentNotPresent,
   372  		ring0.BoundRangeExceeded,
   373  		ring0.InvalidTSS,
   374  		ring0.StackSegmentFault:
   375  		*info = linux.SignalInfo{
   376  			Signo: int32(unix.SIGSEGV),
   377  			Code:  linux.SI_KERNEL,
   378  		}
   379  		info.SetAddr(switchOpts.Registers.Rip) // Include address.
   380  		if vector == ring0.GeneralProtectionFault {
   381  			// When CPUID faulting is enabled, we will generate a #GP(0) when
   382  			// userspace executes a CPUID instruction. This is handled above,
   383  			// because we need to be able to map and read user memory.
   384  			return hostarch.AccessType{}, tryCPUIDError{}
   385  		}
   386  		return hostarch.AccessType{}, platform.ErrContextSignal
   387  
   388  	case ring0.InvalidOpcode:
   389  		*info = linux.SignalInfo{
   390  			Signo: int32(unix.SIGILL),
   391  			Code:  1, // ILL_ILLOPC (illegal opcode).
   392  		}
   393  		info.SetAddr(switchOpts.Registers.Rip) // Include address.
   394  		return hostarch.AccessType{}, platform.ErrContextSignal
   395  
   396  	case ring0.DivideByZero:
   397  		*info = linux.SignalInfo{
   398  			Signo: int32(unix.SIGFPE),
   399  			Code:  1, // FPE_INTDIV (divide by zero).
   400  		}
   401  		info.SetAddr(switchOpts.Registers.Rip) // Include address.
   402  		return hostarch.AccessType{}, platform.ErrContextSignal
   403  
   404  	case ring0.Overflow:
   405  		*info = linux.SignalInfo{
   406  			Signo: int32(unix.SIGFPE),
   407  			Code:  2, // FPE_INTOVF (integer overflow).
   408  		}
   409  		info.SetAddr(switchOpts.Registers.Rip) // Include address.
   410  		return hostarch.AccessType{}, platform.ErrContextSignal
   411  
   412  	case ring0.X87FloatingPointException,
   413  		ring0.SIMDFloatingPointException:
   414  		*info = linux.SignalInfo{
   415  			Signo: int32(unix.SIGFPE),
   416  			Code:  7, // FPE_FLTINV (invalid operation).
   417  		}
   418  		info.SetAddr(switchOpts.Registers.Rip) // Include address.
   419  		return hostarch.AccessType{}, platform.ErrContextSignal
   420  
   421  	case ring0.Vector(bounce): // ring0.VirtualizationException
   422  		return hostarch.NoAccess, platform.ErrContextInterrupt
   423  
   424  	case ring0.AlignmentCheck:
   425  		*info = linux.SignalInfo{
   426  			Signo: int32(unix.SIGBUS),
   427  			Code:  2, // BUS_ADRERR (physical address does not exist).
   428  		}
   429  		return hostarch.NoAccess, platform.ErrContextSignal
   430  
   431  	case ring0.NMI:
    432  	// An NMI is generated only when a fault is not serviceable by
   433  		// KVM itself, so we think some mapping is writeable but it's
   434  		// really not. This could happen, e.g. if some file is
   435  		// truncated (and would generate a SIGBUS) and we map it
   436  		// directly into the instance.
   437  		return c.fault(int32(unix.SIGBUS), info)
   438  
   439  	case ring0.DeviceNotAvailable,
   440  		ring0.DoubleFault,
   441  		ring0.CoprocessorSegmentOverrun,
   442  		ring0.MachineCheck,
   443  		ring0.SecurityException:
   444  		fallthrough
   445  	default:
   446  		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
   447  	}
   448  }
   449  
   450  func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
   451  	// Map all the executable regions so that all the entry functions
   452  	// are mapped in the upper half.
   453  	if err := applyVirtualRegions(func(vr virtualRegion) {
   454  		if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
   455  			return
   456  		}
   457  
   458  		if vr.accessType.Execute {
   459  			r := vr.region
   460  			physical, length, ok := translateToPhysical(r.virtual)
   461  			if !ok || length < r.length {
   462  				panic("impossible translation")
   463  			}
   464  			pageTable.Map(
   465  				hostarch.Addr(ring0.KernelStartAddress|r.virtual),
   466  				r.length,
   467  				pagetables.MapOpts{AccessType: hostarch.Execute, Global: true},
   468  				physical)
   469  		}
   470  	}); err != nil {
   471  		panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err))
   472  	}
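         	// The kernel's entry regions must also be present in the upper half;
         	// map them read/write with global mappings.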
   473  	for start, end := range m.kernel.EntryRegions() {
   474  		regionLen := end - start
   475  		physical, length, ok := translateToPhysical(start)
   476  		if !ok || length < regionLen {
   477  			panic("impossible translation")
   478  		}
   479  		pageTable.Map(
   480  			hostarch.Addr(ring0.KernelStartAddress|start),
   481  			regionLen,
   482  			pagetables.MapOpts{AccessType: hostarch.ReadWrite, Global: true},
   483  			physical)
   484  	}
   485  }
   486  
    487  // getMaxVCPU determines the maximum number of vCPUs and stores it in m.maxVCPUs.
   488  func (m *machine) getMaxVCPU() {
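         	// Query KVM for the maximum supported vCPU count; if the extension
         	// check fails, fall back to the fixed default _KVM_NR_VCPUS.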
   489  	maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
   490  	if errno != 0 {
   491  		m.maxVCPUs = _KVM_NR_VCPUS
   492  	} else {
   493  		m.maxVCPUs = int(maxVCPUs)
   494  	}
   495  
   496  	// The goal here is to avoid vCPU contentions for reasonable workloads.
   497  	// But "reasonable" isn't defined well in this case. Let's say that CPU
   498  	// overcommit with factor 2 is still acceptable. We allocate a set of
    499  	// vCPUs for each Go runtime processor (P) and two sets of vCPUs to run
   500  	// user code.
   501  	rCPUs := runtime.GOMAXPROCS(0)
   502  	if 3*rCPUs < m.maxVCPUs {
   503  		m.maxVCPUs = 3 * rCPUs
   504  	}
   505  }
   506  
   507  func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
   508  	return physicalRegions
   509  }