github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"fmt"
	"runtime"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/atomicbitops"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/procid"
	"github.com/SagerNet/gvisor/pkg/ring0"
	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/time"
	"github.com/SagerNet/gvisor/pkg/sync"
	"golang.org/x/sys/unix"
)

// machine contains state associated with the VM as a whole.
type machine struct {
	// fd is the vm fd.
	fd int

	// nextSlot is the next slot for setMemoryRegion.
	//
	// This must be accessed atomically. If nextSlot is ^uint32(0), then
	// slots are currently being updated, and the caller should retry.
	nextSlot uint32

	// upperSharedPageTables tracks the read-only shared upper half of all
	// the pagetables.
	upperSharedPageTables *pagetables.PageTables

	// kernel is the set of global structures.
	kernel ring0.Kernel

	// mu protects vCPUs.
	mu sync.RWMutex

	// available is notified when vCPUs are available.
	available sync.Cond

	// vCPUsByTID are the machine vCPUs.
	//
	// These are populated dynamically.
	vCPUsByTID map[uint64]*vCPU

	// vCPUsByID are the machine vCPUs, indexed by vCPU ID.
	vCPUsByID []*vCPU

	// maxVCPUs is the maximum number of vCPUs supported by the machine.
	maxVCPUs int

	// maxSlots is the maximum number of memory slots supported by the machine.
	maxSlots int

	// tscControl indicates whether the CPU supports TSC scaling.
	tscControl bool

	// usedSlots is the set of used physical addresses (sorted).
	usedSlots []uintptr

	// nextID is the next vCPU ID.
	nextID uint32

	// machineArchState is the architecture-specific state.
	machineArchState
}

const (
	// vCPUReady indicates that none of the flags below are set.
	vCPUReady uint32 = 0

	// vCPUUser indicates that the vCPU is in or about to enter user mode.
	vCPUUser uint32 = 1 << 0

	// vCPUGuest indicates the vCPU is in guest mode.
	vCPUGuest uint32 = 1 << 1

	// vCPUWaiter indicates that there is a waiter.
	//
	// If this is set, then notify must be called on any state transitions.
	vCPUWaiter uint32 = 1 << 2
)
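
// A vCPU's state field is a bitmask of the flags above: for example,
// vCPUUser|vCPUGuest means the vCPU is executing user code in guest mode,
// and vCPUWaiter may additionally be set to request a notification on the
// next state transition.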

// vCPU is a single KVM vCPU.
type vCPU struct {
	// CPU is the kernel CPU data.
	//
	// This must be the first element of this structure; it is referenced
	// by the bluepill code (see bluepill_amd64.s).
	ring0.CPU

	// id is the vCPU id.
	id int

	// fd is the vCPU fd.
	fd int

	// tid is the last set tid.
	tid uint64

	// userExits is the count of user exits.
	userExits uint64

	// guestExits is the count of guest to host world switches.
	guestExits uint64

	// faults is a count of world faults (informational only).
	faults uint32

	// state is the vCPU state.
	//
	// This is a bitmask of the three flags (vCPU*) described above.
	state uint32

	// runData for this vCPU.
	runData *runData

	// machine associated with this vCPU.
	machine *machine

	// active is the current addressSpace: this is set and read atomically;
	// it is used to elide unnecessary interrupts due to invalidations.
	active atomicAddressSpace

	// vCPUArchState is the architecture-specific state.
	vCPUArchState

	// dieState holds state related to vCPU death.
	dieState dieState
}

type dieState struct {
	// message is thrown from die.
	message string

	// guestRegs is used to store register state during vCPU.die() to prevent
	// allocation inside nosplit function.
	guestRegs userRegs
}

// newVCPU creates and returns a new vCPU.
//
// Precondition: mu must be held.
func (m *machine) newVCPU() *vCPU {
	// Create the vCPU.
	id := int(atomic.AddUint32(&m.nextID, 1) - 1)
	fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
	if errno != 0 {
		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
	}

	c := &vCPU{
		id:      id,
		fd:      int(fd),
		machine: m,
	}
	c.CPU.Init(&m.kernel, c.id, c)
	m.vCPUsByID[c.id] = c

	// Ensure the signal mask is correct.
	if err := c.setSignalMask(); err != nil {
		panic(fmt.Sprintf("error setting signal mask: %v", err))
	}

	// Map the run data.
	runData, err := mapRunData(int(fd))
	if err != nil {
		panic(fmt.Sprintf("error mapping run data: %v", err))
	}
	c.runData = runData

	// Initialize architecture state.
	if err := c.initArchState(); err != nil {
		panic(fmt.Sprintf("error initializing vCPU state: %v", err))
	}

	return c // Done.
}

// newMachine returns a new VM context.
func newMachine(vm int) (*machine, error) {
	// Create the machine.
	m := &machine{fd: vm}
	m.available.L = &m.mu

	// Pull the maximum vCPUs.
	m.getMaxVCPU()
	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
	m.vCPUsByTID = make(map[uint64]*vCPU)
	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
	m.kernel.Init(m.maxVCPUs)

	// Pull the maximum slots.
	maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
	if errno != 0 {
		m.maxSlots = _KVM_NR_MEMSLOTS
	} else {
		m.maxSlots = int(maxSlots)
	}
	log.Debugf("The maximum number of slots is %d.", m.maxSlots)
	m.usedSlots = make([]uintptr, m.maxSlots)

	// Check for TSC scaling support.
	hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL)
	m.tscControl = errno == 0 && hasTSCControl == 1
	log.Debugf("TSC scaling support: %t.", m.tscControl)

	// Create the upper shared pagetables and kernel (sentry) pagetables.
	m.upperSharedPageTables = pagetables.New(newAllocator())
	m.mapUpperHalf(m.upperSharedPageTables)
	m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
	m.upperSharedPageTables.MarkReadOnlyShared()
	m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)

	// Apply the physical mappings. Note that these mappings may point to
	// guest physical addresses that are not actually available. These
	// physical pages are mapped on demand, see kernel_unsafe.go.
	applyPhysicalRegions(func(pr physicalRegion) bool {
		// Map everything in the lower half.
		m.kernel.PageTables.Map(
			hostarch.Addr(pr.virtual),
			pr.length,
			pagetables.MapOpts{AccessType: hostarch.AnyAccess},
			pr.physical)

		return true // Keep iterating.
	})

	physicalRegionsReadOnly := rdonlyRegionsForSetMem()
	physicalRegionsAvailable := availableRegionsForSetMem()

	// Map all read-only regions.
	for _, r := range physicalRegionsReadOnly {
		m.mapPhysical(r.physical, r.length, physicalRegionsReadOnly, _KVM_MEM_READONLY)
	}

	// Ensure that the currently mapped virtual regions are actually
	// available in the VM. Note that this doesn't guarantee no future
	// faults; however, it should guarantee that everything is available to
	// ensure successful vCPU entry.
	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) {
			return // skip region.
		}

		for _, r := range physicalRegionsReadOnly {
			if vr.virtual == r.virtual {
				return
			}
		}

		for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
			physical, length, ok := translateToPhysical(virtual)
			if !ok {
				// This must be an invalid region that was
				// knocked out by creation of the physical map.
				return
			}
			if virtual+length > vr.virtual+vr.length {
				// Cap the length to the end of the area.
				length = vr.virtual + vr.length - virtual
			}

			// Ensure the physical range is mapped.
			m.mapPhysical(physical, length, physicalRegionsAvailable, _KVM_MEM_FLAGS_NONE)
			virtual += length
		}
	})

	// Initialize architecture state.
	if err := m.initArchState(); err != nil {
		m.Destroy()
		return nil, err
	}

	// Ensure the machine is cleaned up properly.
	runtime.SetFinalizer(m, (*machine).Destroy)
	return m, nil
}

// hasSlot returns true iff the given address is mapped.
//
// This must be done via a linear scan.
//
//go:nosplit
func (m *machine) hasSlot(physical uintptr) bool {
	for i := 0; i < len(m.usedSlots); i++ {
		if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
			return true
		}
	}
	return false
}

// mapPhysical checks for the mapping of a physical range, and installs one if
// not available. This attempts to be efficient for calls in the hot path.
//
// This panics on error.
//
//go:nosplit
func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
	for end := physical + length; physical < end; {
		_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
		if !ok {
			// Should never happen.
			panic("mapPhysical on unknown physical address")
		}

		// Is this already mapped? Check the usedSlots.
		if !m.hasSlot(physicalStart) {
			if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok {
				panic("handleBluepillFault failed")
			}
		}

		// Move to the next chunk.
		physical = physicalStart + length
	}
}

// Destroy frees associated resources.
//
// Destroy should only be called once all active users of the machine are gone.
// The machine object should not be used after calling Destroy.
//
// Precondition: all vCPUs must be returned to the machine.
func (m *machine) Destroy() {
	runtime.SetFinalizer(m, nil)

	// Destroy vCPUs.
	for _, c := range m.vCPUsByID {
		if c == nil {
			continue
		}

		// Ensure the vCPU is not still running in guest mode. This is
		// possible iff teardown has been done by other threads, and
		// somehow a single thread has not executed any system calls.
		c.BounceToHost()

		// Note that the runData may not be mapped if an error occurs
		// during the middle of initialization.
		if c.runData != nil {
			if err := unmapRunData(c.runData); err != nil {
				panic(fmt.Sprintf("error unmapping rundata: %v", err))
			}
		}
		if err := unix.Close(int(c.fd)); err != nil {
			panic(fmt.Sprintf("error closing vCPU fd: %v", err))
		}
	}

	// vCPUs are gone: teardown machine state.
	if err := unix.Close(m.fd); err != nil {
		panic(fmt.Sprintf("error closing VM fd: %v", err))
	}
}

// Get gets an available vCPU.
//
// This will return with the OS thread locked.
//
// It is guaranteed that if any OS thread TID is in guest mode, then
// m.vCPUsByTID[TID] points to the vCPU in which that OS thread is running.
// So if Get() returns while the current context is in guest mode, its vCPU
// must be the same as the one Get() returns.
func (m *machine) Get() *vCPU {
	m.mu.RLock()
	runtime.LockOSThread()
	tid := procid.Current()

	// Check for an exact match.
	if c := m.vCPUsByTID[tid]; c != nil {
		c.lock()
		m.mu.RUnlock()
		return c
	}

	// The happy path failed. We now proceed to acquire an exclusive lock
	// (because the vCPU map may change), and scan all available vCPUs.
	// In this case, we first unlock the OS thread. Otherwise, if mu is
	// not available, the current system thread will be parked and a new
	// system thread spawned. We avoid this situation by simply refreshing
	// tid after relocking the system thread.
	m.mu.RUnlock()
	runtime.UnlockOSThread()
	m.mu.Lock()
	runtime.LockOSThread()
	tid = procid.Current()

	// Recheck for an exact match.
	if c := m.vCPUsByTID[tid]; c != nil {
		c.lock()
		m.mu.Unlock()
		return c
	}

	for {
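		// Slow path: claim an idle vCPU, create a new one, or wait for a
		// running vCPU to be released, retrying until one is acquired.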
		// Scan for an available vCPU.
		for origTID, c := range m.vCPUsByTID {
			if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
				delete(m.vCPUsByTID, origTID)
				m.vCPUsByTID[tid] = c
				m.mu.Unlock()
				c.loadSegments(tid)
				return c
			}
		}

		// Get a new vCPU (maybe).
		if c := m.getNewVCPU(); c != nil {
			c.lock()
			m.vCPUsByTID[tid] = c
			m.mu.Unlock()
			c.loadSegments(tid)
			return c
		}

		// Scan for something not in user mode.
		for origTID, c := range m.vCPUsByTID {
			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
				continue
			}

			// The vCPU is not able to transition to
			// vCPUGuest|vCPUWaiter or to vCPUUser because that
			// transition requires holding the machine mutex, as we
			// do now. There is no path to register a waiter on
			// just the vCPUReady state.
			for {
				c.waitUntilNot(vCPUGuest | vCPUWaiter)
				if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
					break
				}
			}

			// Steal the vCPU.
			delete(m.vCPUsByTID, origTID)
			m.vCPUsByTID[tid] = c
			m.mu.Unlock()
			c.loadSegments(tid)
			return c
		}

		// Everything is executing in user mode. Wait until something
		// is available. Note that signaling the condition variable
		// will have the extra effect of kicking the vCPUs out of guest
		// mode if that's where they were.
		m.available.Wait()
	}
}

// Put puts the current vCPU.
func (m *machine) Put(c *vCPU) {
	c.unlock()
	runtime.UnlockOSThread()

	m.mu.RLock()
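	// Signaling under the lock ensures that any Get() that failed to find a
	// vCPU has already released mu in available.Wait(), so the wakeup below
	// is not missed.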
	m.available.Signal()
	m.mu.RUnlock()
}

// newDirtySet returns a new dirty set.
func (m *machine) newDirtySet() *dirtySet {
	return &dirtySet{
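		// One mask bit per vCPU, rounded up to a whole number of
		// 64-bit words.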
		vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
	}
}

// dropPageTables drops cached page table entries.
func (m *machine) dropPageTables(pt *pagetables.PageTables) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Clear from all PCIDs.
	for _, c := range m.vCPUsByID {
		if c != nil && c.PCIDs != nil {
			c.PCIDs.Drop(pt)
		}
	}
}

// lock marks the vCPU as in user mode.
//
// This should only be called directly when known to be safe, i.e. when
// the vCPU is owned by the current TID with no chance of theft.
//
//go:nosplit
func (c *vCPU) lock() {
	atomicbitops.OrUint32(&c.state, vCPUUser)
}

// unlock clears the vCPUUser bit.
//
//go:nosplit
func (c *vCPU) unlock() {
	if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) {
		// Happy path: no exits are forced, and we can continue
		// executing on our merry way with a single atomic access.
		return
	}

	// Clear the lock.
	origState := atomic.LoadUint32(&c.state)
	atomicbitops.AndUint32(&c.state, ^vCPUUser)
	switch origState {
	case vCPUUser:
		// Normal state.
	case vCPUUser | vCPUGuest | vCPUWaiter:
		// Force a transition: this must trigger a notification when we
		// return from guest mode. We must clear vCPUWaiter here
		// anyway, because BounceToKernel will force a transition only
		// from ring3 to ring0, which will not clear this bit. Halt may
		// work around the issue, but if there is no exception or
		// syscall in this period, BounceToKernel will hang.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
		c.notify()
	case vCPUUser | vCPUWaiter:
		// Waiting for the lock to be released; the responsibility is
		// on us to notify the waiter and clear the associated bit.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
		c.notify()
	default:
		panic("invalid state")
	}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
//
//go:nosplit
func (c *vCPU) NotifyInterrupt() {
	c.BounceToKernel()
}

// pid is used below in bounce.
var pid = unix.Getpid()

// bounce forces a return to the kernel or to host mode.
//
// This effectively unwinds the state machine.
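//
// If forceGuestExit is true, the vCPU is forced all the way back to host
// mode; otherwise it only needs to return from user mode to the guest
// kernel.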
func (c *vCPU) bounce(forceGuestExit bool) {
	origGuestExits := atomic.LoadUint64(&c.guestExits)
	origUserExits := atomic.LoadUint64(&c.userExits)
	for {
		switch state := atomic.LoadUint32(&c.state); state {
		case vCPUReady, vCPUWaiter:
			// There is nothing to be done; we're already in the
			// kernel pre-acquisition. The Bounce criteria have
			// been satisfied.
			return
		case vCPUUser:
			// We need to register a waiter for the actual guest
			// transition. When the transition takes place, then we
			// can inject an interrupt to ensure a return to host
			// mode.
			atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter)
		case vCPUUser | vCPUWaiter:
			// Wait for the transition to guest mode. This should
			// come from the bluepill handler.
			c.waitUntilNot(state)
		case vCPUGuest, vCPUUser | vCPUGuest:
			if state == vCPUGuest && !forceGuestExit {
				// The vCPU is already not acquired, so there's
				// no need to do a fresh injection here.
				return
			}
			// The vCPU is in user or kernel mode. Attempt to
			// register a notification on change.
			if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) {
				break // Retry.
			}
			for {
				// We need to spin here until the signal is
				// delivered, because Tgkill can return EAGAIN
				// under memory pressure. Since we already
				// marked ourselves as a waiter, we need to
				// ensure that a signal is actually delivered.
				if err := unix.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil {
					break
				} else if err.(unix.Errno) == unix.EAGAIN {
					continue
				} else {
					// Nothing else should be returned by tgkill.
					panic(fmt.Sprintf("unexpected tgkill error: %v", err))
				}
			}
		case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
			if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
				// See above.
				return
			}
			// Wait for the transition. This again should happen
			// from the bluepill handler, but on the way out.
			c.waitUntilNot(state)
		default:
			// Should not happen: the above is exhaustive.
			panic("invalid state")
		}

		// Check if we've missed the state transition, but
		// we can safely return at this point in time.
		newGuestExits := atomic.LoadUint64(&c.guestExits)
		newUserExits := atomic.LoadUint64(&c.userExits)
		if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
			return
		}
	}
}

// BounceToKernel ensures that the vCPU bounces back to the kernel.
//
//go:nosplit
func (c *vCPU) BounceToKernel() {
	c.bounce(false)
}

// BounceToHost ensures that the vCPU is in host mode.
//
//go:nosplit
func (c *vCPU) BounceToHost() {
	c.bounce(true)
}

// setSystemTimeLegacy calibrates and sets an approximate system time.
func (c *vCPU) setSystemTimeLegacy() error {
	const minIterations = 10
	minimum := uint64(0)
	for iter := 0; ; iter++ {
		// Try to set the TSC to an estimate of where it will be
		// on the host during a "fast" system call iteration.
		start := uint64(ktime.Rdtsc())
		if err := c.setTSC(start + (minimum / 2)); err != nil {
			return err
		}
		// See if this is our new minimum call time. Note that this
		// serves two functions: first, we make sure that we are
		// accurately predicting the offset we need to set. Second, we
		// don't want to do the final set on a slow call, which could
		// produce a really bad result.
		end := uint64(ktime.Rdtsc())
		if end < start {
			continue // Totally bogus: unstable TSC?
		}
		current := end - start
		if current < minimum || iter == 0 {
			minimum = current // Set our new minimum.
		}
		// Is this past minIterations and within ~10% of minimum?
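		// (upperThreshold works out to minimum*9/8, i.e. 12.5% above
		// the minimum.)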
		upperThreshold := (((minimum << 3) + minimum) >> 3)
		if iter >= minIterations && current <= upperThreshold {
			return nil
		}
	}
}