github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package kvm
import (
	"fmt"
	"math/big"
	"reflect"
	"runtime/debug"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/ring0"
	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/time"
	"golang.org/x/sys/unix"
)

// initArchState initializes architecture-specific state.
func (m *machine) initArchState() error {
	// Set the legacy TSS address. This address is covered by the reserved
	// range (up to 4GB). In fact, this is one of the main reasons that
	// range exists.
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		_KVM_SET_TSS_ADDR,
		uintptr(reservedMemory-(3*hostarch.PageSize))); errno != 0 {
		return errno
	}

	// Enable CPUID faulting, if possible. Note that this also serves as a
	// basic platform sanity test, since we enter guest mode for the first
	// time here. The recovery is necessary: if we fail to read the
	// platform info register, we will return to host mode and ultimately
	// need to handle a segmentation fault.
	old := debug.SetPanicOnFault(true)
	defer func() {
		recover()
		debug.SetPanicOnFault(old)
	}()
	c := m.Get()
	defer m.Put(c)
	bluepill(c)
	ring0.SetCPUIDFaulting(true)

	return nil
}

type machineArchState struct {
}

type vCPUArchState struct {
	// PCIDs is the set of PCIDs for this vCPU.
	//
	// This starts above fixedKernelPCID.
	PCIDs *pagetables.PCIDs

	// floatingPointState is the floating point state buffer used in guest
	// to host transitions. See usage in bluepill_amd64.go.
	floatingPointState fpu.State
}

const (
	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
	// tables. We must start allocating user PCIDs above this in order to
	// avoid any conflict (see below).
	fixedKernelPCID = 1

	// poolPCIDs is the number of PCIDs to record in the database. As this
	// grows, assignment can take longer, since it is a simple linear scan.
	// Beyond a relatively small number, there are likely few performance
	// benefits, since the TLB has likely long since lost any translations
	// from more than a few PCIDs past.
	poolPCIDs = 8
)

// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
	var (
		kernelSystemRegs systemRegs
		kernelUserRegs   userRegs
	)

	// Set base control registers.
	kernelSystemRegs.CR0 = c.CR0()
	kernelSystemRegs.CR4 = c.CR4()
	kernelSystemRegs.EFER = c.EFER()

	// Set the IDT & GDT in the registers.
	kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
	kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
	kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
	kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
	kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
	tssBase, tssLimit, tss := c.TSS()
	kernelSystemRegs.TR.Load(tss, ring0.Tss)
	kernelSystemRegs.TR.base = tssBase
	kernelSystemRegs.TR.limit = uint32(tssLimit)

	// Point to kernel page tables, with no initial PCID.
	kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)

	// Initialize the PCID database.
	if hasGuestPCID {
		// Note that NewPCIDs may return a nil table here, in which
		// case we simply don't use PCID support (see below). In
		// practice, this should not happen, however.
		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
	}

	// Set the CPUID; this is required before setting system registers,
	// since KVM will reject several CR4 bits if the CPUID does not
	// indicate the support is available.
	if err := c.setCPUID(); err != nil {
		return err
	}

	// Set the entrypoint for the kernel.
	kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
	kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
	kernelUserRegs.RSP = c.StackTop()
	kernelUserRegs.RFLAGS = ring0.KernelFlagsSet

	// Set the system registers.
	if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
		return err
	}

	// Set the user registers.
	if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 {
		return fmt.Errorf("error setting user registers: %v", errno)
	}

	// Allocate a floating point state save area for the local vCPU. This
	// is saved prior to leaving the guest, and we always restore from it.
	// We cannot rely on the pointer in the context alone, because we do
	// not know how large the save area behind it actually is.
	c.floatingPointState = fpu.NewState()

	// Set the time offset to the host native time.
	return c.setSystemTime()
}

// bitsForScaling returns the bits available for storing the fraction component
// of the TSC scaling ratio. This allows us to replicate the (bad) math done by
// the kernel below in scaledTSC, and ensure we can compute an exact zero
// offset in setSystemTime.
//
// These constants correspond to kvm_tsc_scaling_ratio_frac_bits.
var bitsForScaling = func() int64 {
	fs := cpuid.HostFeatureSet()
	if fs.Intel() {
		return 48 // See vmx.c (kvm sources).
	} else if fs.AMD() {
		return 32 // See svm.c (kvm sources).
	} else {
		return 63 // Unknown: theoretical maximum.
	}
}()

// scaledTSC returns the host TSC scaled by the given frequency.
//
// This assumes a current frequency of 1. We require only the unitless ratio of
// rawFreq to some current frequency. See setSystemTime for context.
//
// The kernel math guarantees that all bits of the multiplication and division
// will be correctly preserved and applied. However, it is not possible to
// store the ratio with full precision, so we need to use the same scheme in
// order to calculate the scaled frequency and get the same result.
//
// We can assume that the current frequency is (1), so we are calculating a
// strict inverse of this value. This simplifies this function considerably.
//
// Roughly, the returned value "scaledTSC" will have:
//	scaledTSC/hostTSC == 1/rawFreq
//
//go:nosplit
func scaledTSC(rawFreq uintptr) int64 {
	scale := int64(1 << bitsForScaling)
	ratio := big.NewInt(scale / int64(rawFreq))
	ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc())))
	ratio.Div(ratio, big.NewInt(scale))
	return ratio.Int64()
}

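// A worked example of the fixed-point math above, with illustrative numbers
// only (nothing here is read from real hardware), and assuming rawFreq is
// reported in KHz as the discussion in setSystemTime suggests: on Intel,
// bitsForScaling is 48, so for a 4GHz host TSC, rawFreq = 4000000 and
//
//	scale  = 1 << 48         = 281474976710656
//	ratio  = scale / rawFreq = 70368744 (truncating integer division; this
//	         truncation is exactly the precision loss described above)
//	result = ratio * hostTSC / scale ≈ hostTSC / 4000000
//
// With the guest frequency forced to 1 (KHz) in setSystemTime, the scaled
// value then advances roughly 1000 times per second, i.e. about once per
// millisecond of wall time.
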
// setSystemTime sets the vCPU to the system time.
func (c *vCPU) setSystemTime() error {
	// First, scale down the clock frequency to the lowest value allowed by
	// the API itself. How low we can go depends on the underlying
	// hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD.
	// Even the lower bound here will take a 4GHz frequency down to 1Hz,
	// meaning that everything should be able to handle a KHz setting of 1
	// with bits to spare.
	//
	// Note that reducing the clock does not typically require special
	// capabilities as it is emulated in KVM. We don't actually use this
	// capability, but it means that this method should be robust to
	// different hardware configurations.

	// If TSC scaling is not supported, fall back to legacy mode.
	if !c.machine.tscControl {
		return c.setSystemTimeLegacy()
	}
	rawFreq, err := c.getTSCFreq()
	if err != nil {
		return c.setSystemTimeLegacy()
	}
	if err := c.setTSCFreq(1); err != nil {
		return c.setSystemTimeLegacy()
	}

	// Always restore the original frequency.
	defer func() {
		if err := c.setTSCFreq(rawFreq); err != nil {
			panic(err.Error())
		}
	}()

	// Attempt to set the system time in this compressed world. The
	// calculation for offset normally looks like:
	//
	//	offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc());
	//
	// So as long as the kvm_scale_tsc component is constant before and
	// after the call to set the TSC value (and it is passed as the
	// target_tsc), we will compute an offset value of zero.
	//
	// This is effectively cheating to make our "setSystemTime" call so
	// unbelievably, incredibly fast that we do it "instantly" and all the
	// calculations result in an offset of zero.
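	//
	// Because the scaled TSC advances only about once per millisecond at
	// this clamped frequency (see the worked example following scaledTSC
	// above), while each iteration of the loop below takes far less time
	// than that, lastTSC == nextTSC will typically hold after the very
	// first setTSC call, and the loop converges almost immediately.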
	lastTSC := scaledTSC(rawFreq)
	for {
		if err := c.setTSC(uint64(lastTSC)); err != nil {
			return err
		}
		nextTSC := scaledTSC(rawFreq)
		if lastTSC == nextTSC {
			return nil
		}
		lastTSC = nextTSC // Try again.
	}
}

// nonCanonical generates the fault return for a non-canonical address.
//
//go:nosplit
func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	*info = linux.SignalInfo{
		Signo: signal,
		Code:  linux.SI_KERNEL,
	}
	info.SetAddr(addr) // Include address.
	return hostarch.NoAccess, platform.ErrContextSignal
}

// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	bluepill(c) // Probably no-op, but may not be.
	faultAddr := ring0.ReadCR2()
	code, user := c.ErrorCode()
	if !user {
		// The last fault serviced by this CPU was not a user
		// fault, so we can't reliably trust the faultAddr or
		// the code provided here. We need to re-execute.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	}
	// Reset the pointed SignalInfo.
	*info = linux.SignalInfo{Signo: signal}
	info.SetAddr(uint64(faultAddr))
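	// Decode the x86 page-fault error code: bit 1 (W/R) is set for a
	// write access and bit 4 (I/D) is set for an instruction fetch; a
	// fault with neither bit set was a read.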
	accessType := hostarch.AccessType{
		Read:    code&(1<<1) == 0,
		Write:   code&(1<<1) != 0,
		Execute: code&(1<<4) != 0,
	}
	if !accessType.Write && !accessType.Execute {
		info.Code = 1 // SEGV_MAPERR.
	} else {
		info.Code = 2 // SEGV_ACCERR.
	}
	return accessType, platform.ErrContextSignal
}

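// loadByte forces a real one-byte load from ptr. It is marked noinline so
// that the load cannot be optimized away when it is used only for its
// page-faulting side effect (see prefaultFloatingPointState below).
//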
//go:nosplit
//go:noinline
func loadByte(ptr *byte) byte {
	return *ptr
}

// prefaultFloatingPointState touches each page of the floating point state to
// be sure that its physical pages are mapped.
//
// Otherwise, touching this state from guest mode can trigger KVM_EXIT_MMIO:
// the faulting instruction would have to be emulated by the KVM kernel code,
// but KVM cannot emulate instructions like xsave and xrstor.
//
//go:nosplit
func prefaultFloatingPointState(data *fpu.State) {
	size := len(*data)
	for i := 0; i < size; i += hostarch.PageSize {
		loadByte(&(*data)[i])
	}
	loadByte(&(*data)[size-1])
}

// SwitchToUser unpacks architectural details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) {
		return nonCanonical(regs.Rip, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Rsp) {
		return nonCanonical(regs.Rsp, int32(unix.SIGBUS), info)
	} else if !ring0.IsCanonical(regs.Fs_base) {
		return nonCanonical(regs.Fs_base, int32(unix.SIGBUS), info)
	} else if !ring0.IsCanonical(regs.Gs_base) {
		return nonCanonical(regs.Gs_base, int32(unix.SIGBUS), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
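		// Assign also reports, in its second result, whether the
		// returned PCID requires a TLB flush before use, presumably
		// because the PCID has been recycled from an earlier set of
		// page tables.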
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.KernelPCID = fixedKernelPCID
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	// See below.
	var vector ring0.Vector

	// Past this point, stack growth can cause system calls (and a break
	// from guest mode). So we need to ensure that between the bluepill
	// call here and the switch call immediately below, no additional
	// allocations occur.
	entersyscall()
	bluepill(c)
	// The root table physical page has to be mapped to not fault in iret
	// or sysret after switching into a user address space. sysret and
	// iret are in the upper half that is global and already mapped.
	switchOpts.PageTables.PrefaultRootTable()
	prefaultFloatingPointState(switchOpts.FloatingPointState)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()

	switch vector {
	case ring0.Syscall, ring0.SyscallInt80:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil

	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)

	case ring0.Debug, ring0.Breakpoint:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.GeneralProtectionFault,
		ring0.SegmentNotPresent,
		ring0.BoundRangeExceeded,
		ring0.InvalidTSS,
		ring0.StackSegmentFault:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGSEGV),
			Code:  linux.SI_KERNEL,
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		if vector == ring0.GeneralProtectionFault {
			// When CPUID faulting is enabled, we will generate a #GP(0) when
			// userspace executes a CPUID instruction. This is handled above,
			// because we need to be able to map and read user memory.
			return hostarch.AccessType{}, platform.ErrContextSignalCPUID
		}
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.InvalidOpcode:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGILL),
			Code:  1, // ILL_ILLOPC (illegal opcode).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.DivideByZero:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  1, // FPE_INTDIV (divide by zero).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.Overflow:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  2, // FPE_INTOVF (integer overflow).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.X87FloatingPointException,
		ring0.SIMDFloatingPointException:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  7, // FPE_FLTINV (invalid operation).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.Vector(bounce): // ring0.VirtualizationException
		return hostarch.NoAccess, platform.ErrContextInterrupt

	case ring0.AlignmentCheck:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal

	case ring0.NMI:
		// An NMI is generated only when a fault is not serviceable by
		// KVM itself, i.e. we believe some mapping is writable when it
		// really is not. This could happen, e.g. if some file is
		// truncated (and would generate a SIGBUS) and we map it
		// directly into the instance.
		return c.fault(int32(unix.SIGBUS), info)

	case ring0.DeviceNotAvailable,
		ring0.DoubleFault,
		ring0.CoprocessorSegmentOverrun,
		ring0.MachineCheck,
		ring0.SecurityException:
		fallthrough
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}
}

// On the x86 platform, the flags for "setMemoryRegion" can always be set to
// 0. There is no need to return read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
	return nil
}

func availableRegionsForSetMem() (phyRegions []physicalRegion) {
	return physicalRegions
}

func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
	// Map all the executable regions so that all the entry functions
	// are mapped in the upper half.
	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
			return
		}

		if vr.accessType.Execute {
			r := vr.region
			physical, length, ok := translateToPhysical(r.virtual)
			if !ok || length < r.length {
				panic("impossible translation")
			}
			pageTable.Map(
				hostarch.Addr(ring0.KernelStartAddress|r.virtual),
				r.length,
				pagetables.MapOpts{AccessType: hostarch.Execute, Global: true},
				physical)
		}
	})
	for start, end := range m.kernel.EntryRegions() {
		regionLen := end - start
		physical, length, ok := translateToPhysical(start)
		if !ok || length < regionLen {
			panic("impossible translation")
		}
		pageTable.Map(
			hostarch.Addr(ring0.KernelStartAddress|start),
			regionLen,
			pagetables.MapOpts{AccessType: hostarch.ReadWrite, Global: true},
			physical)
	}
}

// getMaxVCPU queries KVM for the maximum number of vCPUs.
func (m *machine) getMaxVCPU() {
	maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
	if errno != 0 {
		m.maxVCPUs = _KVM_NR_VCPUS
	} else {
		m.maxVCPUs = int(maxVCPUs)
	}
}

// getNewVCPU creates a new vCPU (maybe).
func (m *machine) getNewVCPU() *vCPU {
	if int(m.nextID) < m.maxVCPUs {
		return m.newVCPU()
	}
	return nil
}