// github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/machine.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"fmt"
	"runtime"
	gosync "sync"
	"sync/atomic"
	"time"

	"golang.org/x/sys/unix"
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/hosttid"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	"github.com/nicocha30/gvisor-ligolo/pkg/metric"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0/pagetables"
	"github.com/nicocha30/gvisor-ligolo/pkg/seccomp"
	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
	"github.com/nicocha30/gvisor-ligolo/pkg/sighandling"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

// machine contains state associated with the VM as a whole.
type machine struct {
	// fd is the vm fd.
	fd int

	// machinePoolIndex is the index in the machinePool array.
	machinePoolIndex uint32

	// nextSlot is the next slot for setMemoryRegion.
	//
	// If nextSlot is ^uint32(0), then slots are currently being updated, and the
	// caller should retry.
	nextSlot atomicbitops.Uint32

	// upperSharedPageTables tracks the read-only shared upper of all the pagetables.
	upperSharedPageTables *pagetables.PageTables

	// kernel is the set of global structures.
	kernel ring0.Kernel

	// mu protects vCPUs.
	mu sync.RWMutex

	// available is notified when vCPUs are available.
	available sync.Cond

	// vCPUsByTID are the machine vCPUs.
	//
	// These are populated dynamically.
	vCPUsByTID map[uint64]*vCPU

	// vCPUsByID are the machine vCPUs, indexed by the vCPU's ID.
	vCPUsByID []*vCPU

	// usedVCPUs is the number of vCPUs that have been used from the
	// vCPUsByID pool.
	usedVCPUs int

	// maxVCPUs is the maximum number of vCPUs supported by the machine.
	maxVCPUs int

	// maxSlots is the maximum number of memory slots supported by the machine.
	maxSlots int

	// tscControl checks whether the cpu supports TSC scaling.
	tscControl bool

	// usedSlots is the set of used physical addresses (not sorted).
	usedSlots []uintptr
}

const (
	// vCPUReady is an alias for all the below clear.
	vCPUReady uint32 = 0

	// vCPUUser indicates that the vCPU is in or about to enter user mode.
	vCPUUser uint32 = 1 << 0

	// vCPUGuest indicates the vCPU is in guest mode.
	vCPUGuest uint32 = 1 << 1

	// vCPUWaiter indicates that there is a waiter.
	//
	// If this is set, then notify must be called on any state transitions.
	vCPUWaiter uint32 = 1 << 2
)
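
// Illustrative summary (editorial, not part of the original source): vCPU.state
// is a bitmask of the flags above. Based on the code and comments in this file,
// the typical transitions are roughly:
//
//	vCPUReady            --Get()/lock()-->       vCPUUser
//	vCPUUser             --bluepill entry-->     vCPUUser|vCPUGuest
//	vCPUUser|vCPUGuest   --unlock() fast path--> vCPUGuest
//	vCPUUser             --unlock()-->           vCPUReady
//
// Get() and bounce() may additionally OR in vCPUWaiter on any busy state;
// whichever path clears the observed state is then responsible for calling
// notify() (see unlock() and the comments in bounce()).
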
// Possible values for the "acquisition_type" field of the get_vcpu metric.
var (
	getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"}
	getVCPUAcquisitionReused     = metric.FieldValue{"reused"}
	getVCPUAcquisitionUnused     = metric.FieldValue{"unused"}
	getVCPUAcquisitionStolen     = metric.FieldValue{"stolen"}
)

var (
	// hostExitCounter is a metric that tracks how many times the sentry
	// performed a host to guest world switch.
	hostExitCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.")

	// userExitCounter is a metric that tracks how many times the sentry has
	// had an exit from userspace. Analogous to vCPU.userExits.
	userExitCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.")

	// interruptCounter is a metric that tracks how many times execution returned
	// to the KVM host to handle a pending signal.
	interruptCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/interrupts", false, "The number of times the signal handler was invoked.")

	// mmapCallCounter is a metric that tracks how many times the function
	// seccompMmapSyscall has been called.
	mmapCallCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.")

	// getVCPUCounter is a metric that tracks how many times different paths of
	// machine.Get() are triggered.
	getVCPUCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.",
		metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen))

	// asInvalidateDuration are durations of calling addressSpace.invalidate().
	asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate",
		metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2),
		"Duration of calling addressSpace.invalidate().")
)
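
// Editorial note (not part of the original source): getVCPUCounter is a
// fielded metric, so each increment in machine.Get() below reports which
// acquisition path was taken, e.g.:
//
//	getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
//
// This lets the exported /kvm/get_vcpu counter be broken down by the
// "acquisition_type" field values defined above.
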
// vCPU is a single KVM vCPU.
type vCPU struct {
	// CPU is the kernel CPU data.
	//
	// This must be the first element of this structure, it is referenced
	// by the bluepill code (see bluepill_amd64.s).
	ring0.CPU

	// id is the vCPU id.
	id int

	// fd is the vCPU fd.
	fd int

	// tid is the last set tid.
	tid atomicbitops.Uint64

	// userExits is the count of user exits.
	userExits atomicbitops.Uint64

	// guestExits is the count of guest to host world switches.
	guestExits atomicbitops.Uint64

	// faults is a count of world faults (informational only).
	faults uint32

	// state is the vCPU state.
	//
	// This is a bitmask of the three fields (vCPU*) described above.
	state atomicbitops.Uint32

	// runData for this vCPU.
	runData *runData

	// machine associated with this vCPU.
	machine *machine

	// active is the current addressSpace: this is set and read atomically;
	// it is used to elide unnecessary interrupts due to invalidations.
	active atomicAddressSpace

	// vCPUArchState is the architecture-specific state.
	vCPUArchState

	// dieState holds state related to vCPU death.
	dieState dieState
}

type dieState struct {
	// message is thrown from die.
	message string

	// guestRegs is used to store register state during vCPU.die() to prevent
	// allocation inside the nosplit function.
	guestRegs userRegs
}

// createVCPU creates and returns a new vCPU.
//
// Precondition: mu must be held.
func (m *machine) createVCPU(id int) *vCPU {
	// Create the vCPU.
	fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
	if errno != 0 {
		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
	}

	c := &vCPU{
		id:      id,
		fd:      int(fd),
		machine: m,
	}
	c.CPU.Init(&m.kernel, c.id, c)
	m.vCPUsByID[c.id] = c

	// Ensure the signal mask is correct.
	if err := c.setSignalMask(); err != nil {
		panic(fmt.Sprintf("error setting signal mask: %v", err))
	}

	// Map the run data.
	runData, err := mapRunData(int(fd))
	if err != nil {
		panic(fmt.Sprintf("error mapping run data: %v", err))
	}
	c.runData = runData

	// Initialize architecture state.
	if err := c.initArchState(); err != nil {
		panic(fmt.Sprintf("error initializing vCPU state: %v", err))
	}

	return c // Done.
}

// newMachine returns a new VM context.
func newMachine(vm int) (*machine, error) {
	// Create the machine.
	m := &machine{fd: vm}
	m.available.L = &m.mu

	// Pull the maximum vCPUs.
	m.getMaxVCPU()
	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
	m.vCPUsByTID = make(map[uint64]*vCPU)
	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
	m.kernel.Init(m.maxVCPUs)

	// Pull the maximum slots.
	maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
	if errno != 0 {
		m.maxSlots = _KVM_NR_MEMSLOTS
	} else {
		m.maxSlots = int(maxSlots)
	}
	log.Debugf("The maximum number of slots is %d.", m.maxSlots)
	m.usedSlots = make([]uintptr, m.maxSlots)

	// Check TSC scaling support.
	hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL)
	m.tscControl = errno == 0 && hasTSCControl == 1
	log.Debugf("TSC scaling support: %t.", m.tscControl)

	// Create the upper shared pagetables and kernel (sentry) pagetables.
	m.upperSharedPageTables = pagetables.New(newAllocator())
	m.mapUpperHalf(m.upperSharedPageTables)
	m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
	m.upperSharedPageTables.MarkReadOnlyShared()
	m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)

	// Install seccomp rules to trap runtime mmap system calls. They will
	// be handled by seccompMmapHandler.
	seccompMmapRules(m)

	// Apply the physical mappings. Note that these mappings may point to
	// guest physical addresses that are not actually available. These
	// physical pages are mapped on demand, see kernel_unsafe.go.
	applyPhysicalRegions(func(pr physicalRegion) bool {
		// Map everything in the lower half.
		m.kernel.PageTables.Map(
			hostarch.Addr(pr.virtual),
			pr.length,
			pagetables.MapOpts{AccessType: hostarch.ReadWrite},
			pr.physical)

		return true // Keep iterating.
	})
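
	// Editorial note (not part of the original source): at this point the
	// sentry page tables have two halves. The upper half, rooted at
	// ring0.KernelStartAddress and marked read-only shared above, carries
	// the mappings installed by mapUpperHalf and is shared by all address
	// spaces; the loop above fills the lower half with read/write mappings
	// of every physical region. The guest-physical pages themselves are
	// only installed as KVM memory slots on demand (see mapPhysical and
	// hasSlot below, and kernel_unsafe.go).
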
	// Ensure that the currently mapped virtual regions are actually
	// available in the VM. Note that this doesn't guarantee no future
	// faults, however it should guarantee that everything is available to
	// ensure successful vCPU entry.
	mapRegion := func(vr virtualRegion, flags uint32) {
		for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
			physical, length, ok := translateToPhysical(virtual)
			if !ok {
				// This must be an invalid region that was
				// knocked out by creation of the physical map.
				return
			}
			if virtual+length > vr.virtual+vr.length {
				// Cap the length to the end of the area.
				length = vr.virtual + vr.length - virtual
			}
			// Update page tables for executable mappings.
			if vr.accessType.Execute {
				if vr.accessType.Write {
					panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr))
				}
				m.kernel.PageTables.Map(
					hostarch.Addr(virtual),
					length,
					pagetables.MapOpts{AccessType: vr.accessType},
					physical)
			}

			// Ensure the physical range is mapped.
			m.mapPhysical(physical, length, physicalRegions)
			virtual += length
		}
	}

	// handleBluepillFault takes the slot spinlock and it is called from
	// seccompMmapHandler, so here we have to guarantee that mmap is not
	// called while we hold the slot spinlock.
	disableAsyncPreemption()
	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) {
			return // skip region.
		}
		// Take into account that the stack can grow down.
		if vr.filename == "[stack]" {
			vr.virtual -= 1 << 20
			vr.length += 1 << 20
		}

		mapRegion(vr, 0)
	})
	enableAsyncPreemption()

	// Initialize architecture state.
	if err := m.initArchState(); err != nil {
		m.Destroy()
		return nil, err
	}

	// Ensure the machine is cleaned up properly.
	runtime.SetFinalizer(m, (*machine).Destroy)
	return m, nil
}

// hasSlot returns true if the given address is mapped.
//
// This must be done via a linear scan.
//
//go:nosplit
func (m *machine) hasSlot(physical uintptr) bool {
	slotLen := int(m.nextSlot.Load())
	// When slots are being updated, nextSlot is ^uint32(0). As this situation
	// is unlikely to happen, we just set slotLen to m.maxSlots and scan
	// the whole usedSlots array.
	if slotLen == int(^uint32(0)) {
		slotLen = m.maxSlots
	}
	for i := 0; i < slotLen; i++ {
		if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
			return true
		}
	}
	return false
}

// mapPhysical checks for the mapping of a physical range, and installs one if
// not available. This attempts to be efficient for calls in the hot path.
//
// This throws on error.
//
//go:nosplit
func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) {
	for end := physical + length; physical < end; {
		_, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions)
		if pr == nil {
			// Should never happen.
			throw("mapPhysical on unknown physical address")
		}

		// Is this already mapped? Check the usedSlots.
		if !m.hasSlot(physicalStart) {
			if _, ok := handleBluepillFault(m, physical, phyRegions); !ok {
				throw("handleBluepillFault failed")
			}
		}

		// Move to the next chunk.
		physical = physicalStart + length
	}
}
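
// Editorial sketch (not part of the original source): the ^uint32(0) sentinel
// read by hasSlot above is the reader side of a small publication protocol.
// The slot allocator in handleBluepillFault (defined elsewhere in this
// package) is assumed to claim exclusive slot ownership, publish the slot,
// and then advance nextSlot, roughly:
//
//	slot := m.nextSlot.Swap(^uint32(0))            // claim; readers now scan all of usedSlots
//	// ... setMemoryRegion(...) for the new range ...
//	atomic.StoreUintptr(&m.usedSlots[slot], start) // make hasSlot find it
//	m.nextSlot.Store(slot + 1)                     // release and bump the slot count
//
// This is only a sketch of the assumed contract; the authoritative code lives
// in the bluepill fault handling.
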
// Destroy frees associated resources.
//
// Destroy should only be called once all active users of the machine are gone.
// The machine object should not be used after calling Destroy.
//
// Precondition: all vCPUs must be returned to the machine.
func (m *machine) Destroy() {
	runtime.SetFinalizer(m, nil)

	// Destroy vCPUs.
	for _, c := range m.vCPUsByID {
		if c == nil {
			continue
		}

		// Ensure the vCPU is not still running in guest mode. This is
		// possible iff teardown has been done by other threads, and
		// somehow a single thread has not executed any system calls.
		c.BounceToHost()

		// Note that the runData may not be mapped if an error occurs
		// during the middle of initialization.
		if c.runData != nil {
			if err := unmapRunData(c.runData); err != nil {
				panic(fmt.Sprintf("error unmapping rundata: %v", err))
			}
		}
		if err := unix.Close(int(c.fd)); err != nil {
			panic(fmt.Sprintf("error closing vCPU fd: %v", err))
		}
	}

	machinePool[m.machinePoolIndex].Store(nil)
	seccompMmapSync()

	// vCPUs are gone: teardown machine state.
	if err := unix.Close(m.fd); err != nil {
		panic(fmt.Sprintf("error closing VM fd: %v", err))
	}
}

// Get gets an available vCPU.
//
// This will return with the OS thread locked.
//
// It is guaranteed that if any OS thread TID is in guest, m.vCPUsByTID[TID]
// points to the vCPU in which the OS thread TID is running. So if Get()
// returns with the current context in guest, its vCPU must be the same as
// what Get() returns.
func (m *machine) Get() *vCPU {
	m.mu.RLock()
	runtime.LockOSThread()
	tid := hosttid.Current()

	// Check for an exact match.
	if c := m.vCPUsByTID[tid]; c != nil {
		c.lock()
		m.mu.RUnlock()
		getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
		return c
	}

	// The happy path failed. We now proceed to acquire an exclusive lock
	// (because the vCPU map may change), and scan all available vCPUs.
	// In this case, we first unlock the OS thread. Otherwise, if mu is
	// not available, the current system thread will be parked and a new
	// system thread spawned. We avoid this situation by simply refreshing
	// tid after re-locking the OS thread.
	m.mu.RUnlock()
	runtime.UnlockOSThread()
	m.mu.Lock()
	runtime.LockOSThread()
	tid = hosttid.Current()

	// Recheck for an exact match.
	if c := m.vCPUsByTID[tid]; c != nil {
		c.lock()
		m.mu.Unlock()
		getVCPUCounter.Increment(&getVCPUAcquisitionReused)
		return c
	}

	for {
		// Get a vCPU from the m.vCPUsByID pool.
		if m.usedVCPUs < m.maxVCPUs {
			c := m.vCPUsByID[m.usedVCPUs]
			m.usedVCPUs++
			c.lock()
			m.vCPUsByTID[tid] = c
			m.mu.Unlock()
			c.loadSegments(tid)
			getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
			return c
		}

		// Scan for an available vCPU.
		for origTID, c := range m.vCPUsByTID {
			if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
				delete(m.vCPUsByTID, origTID)
				m.vCPUsByTID[tid] = c
				m.mu.Unlock()
				c.loadSegments(tid)
				getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
				return c
			}
		}

		// Scan for something not in user mode.
		for origTID, c := range m.vCPUsByTID {
			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
				continue
			}

			// The vCPU is not able to transition to
			// vCPUGuest|vCPUWaiter or to vCPUUser because that
			// transition requires holding the machine mutex, as we
			// do now. There is no path to register a waiter on
			// just the vCPUReady state.
			for {
				c.waitUntilNot(vCPUGuest | vCPUWaiter)
				if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
					break
				}
			}

			// Steal the vCPU.
			delete(m.vCPUsByTID, origTID)
			m.vCPUsByTID[tid] = c
			m.mu.Unlock()
			c.loadSegments(tid)
			getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
			return c
		}

		// Everything is executing in user mode. Wait until something
		// is available. Note that signaling the condition variable
		// will have the extra effect of kicking the vCPUs out of guest
		// mode if that's where they were.
		m.available.Wait()
	}
}

// Put puts the current vCPU.
func (m *machine) Put(c *vCPU) {
	c.unlock()
	runtime.UnlockOSThread()

	m.mu.RLock()
	m.available.Signal()
	m.mu.RUnlock()
}

// newDirtySet returns a new dirty set.
func (m *machine) newDirtySet() *dirtySet {
	return &dirtySet{
		vCPUMasks: make([]atomicbitops.Uint64,
			(m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
	}
}

// dropPageTables drops cached page table entries.
func (m *machine) dropPageTables(pt *pagetables.PageTables) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Clear from all PCIDs.
	for _, c := range m.vCPUsByID {
		if c != nil && c.PCIDs != nil {
			c.PCIDs.Drop(pt)
		}
	}
}

// lock marks the vCPU as in user mode.
//
// This should only be called directly when known to be safe, i.e. when
// the vCPU is owned by the current TID with no chance of theft.
//
//go:nosplit
func (c *vCPU) lock() {
	atomicbitops.OrUint32(&c.state, vCPUUser)
}

// unlock clears the vCPUUser bit.
//
//go:nosplit
func (c *vCPU) unlock() {
	origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
	if origState == vCPUUser|vCPUGuest {
		// Happy path: no exits are forced, and we can continue
		// executing on our merry way with a single atomic access.
		return
	}

	// Clear the lock.
	for {
		state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser)
		if state == origState {
			break
		}
		origState = state
	}
	switch origState {
	case vCPUUser:
		// Normal state.
	case vCPUUser | vCPUGuest | vCPUWaiter:
		// Force a transition: this must trigger a notification when we
		// return from guest mode. We must clear vCPUWaiter here
		// anyway, because BounceToKernel will force a transition only
		// from ring3 to ring0, which will not clear this bit. Halt may
		// work around the issue, but if there is no exception or
		// syscall in this period, BounceToKernel will hang.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
		c.notify()
	case vCPUUser | vCPUWaiter:
		// Waiting for the lock to be released; the responsibility is
		// on us to notify the waiter and clear the associated bit.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
		c.notify()
	default:
		panic("invalid state")
	}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
//
//go:nosplit
func (c *vCPU) NotifyInterrupt() {
	c.BounceToKernel()
}

// pid is used below in bounce.
var pid = unix.Getpid()
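
// Usage sketch (editorial, not part of the original source): callers in this
// package pair Get and Put around guest execution, roughly:
//
//	cpu := m.Get()   // locks the OS thread and marks the vCPU vCPUUser
//	defer m.Put(cpu) // clears vCPUUser and signals m.available
//	// ... enter the guest, e.g. via the bluepill path defined elsewhere ...
//
// bounce() below is the asynchronous counterpart: it forces a vCPU owned by
// another thread back to kernel or host mode by setting vCPUWaiter and, when
// the vCPU is in guest mode, delivering bounceSignal via tgkill.
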
// bounce forces a return to the kernel or to host mode.
//
// This effectively unwinds the state machine.
func (c *vCPU) bounce(forceGuestExit bool) {
	origGuestExits := c.guestExits.Load()
	origUserExits := c.userExits.Load()
	for {
		switch state := c.state.Load(); state {
		case vCPUReady, vCPUWaiter:
			// There is nothing to be done; we're already in the
			// kernel pre-acquisition. The bounce criteria have
			// been satisfied.
			return
		case vCPUUser:
			// We need to register a waiter for the actual guest
			// transition. When the transition takes place, then we
			// can inject an interrupt to ensure a return to host
			// mode.
			c.state.CompareAndSwap(state, state|vCPUWaiter)
		case vCPUUser | vCPUWaiter:
			// Wait for the transition to guest mode. This should
			// come from the bluepill handler.
			c.waitUntilNot(state)
		case vCPUGuest, vCPUUser | vCPUGuest:
			if state == vCPUGuest && !forceGuestExit {
				// The vCPU is already not acquired, so there's
				// no need to do a fresh injection here.
				return
			}
			// The vCPU is running in the guest (in either guest
			// user or guest kernel mode). Attempt to register a
			// notification on change.
			if !c.state.CompareAndSwap(state, state|vCPUWaiter) {
				break // Retry.
			}
			for {
				// We need to spin here until the signal is
				// delivered, because Tgkill can return EAGAIN
				// under memory pressure. Since we already
				// marked ourselves as a waiter, we need to
				// ensure that a signal is actually delivered.
				if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil {
					break
				} else if err.(unix.Errno) == unix.EAGAIN {
					continue
				} else {
					// Nothing else should be returned by tgkill.
					panic(fmt.Sprintf("unexpected tgkill error: %v", err))
				}
			}
		case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
			if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
				// See above.
				return
			}
			// Wait for the transition. This again should happen
			// from the bluepill handler, but on the way out.
			c.waitUntilNot(state)
		default:
			// Should not happen: the above is exhaustive.
			panic("invalid state")
		}

		// Check whether we missed the state transition; if so, we can
		// safely return at this point.
		newGuestExits := c.guestExits.Load()
		newUserExits := c.userExits.Load()
		if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
			return
		}
	}
}

// BounceToKernel ensures that the vCPU bounces back to the kernel.
//
//go:nosplit
func (c *vCPU) BounceToKernel() {
	c.bounce(false)
}

// BounceToHost ensures that the vCPU is in host mode.
//
//go:nosplit
func (c *vCPU) BounceToHost() {
	c.bounce(true)
}

// setSystemTimeLegacy calibrates and sets an approximate system time.
func (c *vCPU) setSystemTimeLegacy() error {
	const minIterations = 10
	minimum := uint64(0)
	for iter := 0; ; iter++ {
		// Try to set the TSC to an estimate of where it will be
		// on the host during a "fast" system call iteration.
		start := uint64(ktime.Rdtsc())
		if err := c.setTSC(start + (minimum / 2)); err != nil {
			return err
		}
		// See if this is our new minimum call time. Note that this
		// serves two functions: one, we make sure that we are
		// accurately predicting the offset we need to set. Second, we
		// don't want to do the final set on a slow call, which could
		// produce a really bad result.
		end := uint64(ktime.Rdtsc())
		if end < start {
			continue // Totally bogus: unstable TSC?
		}
		current := end - start
		if current < minimum || iter == 0 {
			minimum = current // Set our new minimum.
		}
		// Is this past minIterations and within ~12.5% (9/8) of the minimum?
		upperThreshold := (((minimum << 3) + minimum) >> 3)
		if iter >= minIterations && current <= upperThreshold {
			return nil
		}
	}
}

const machinePoolSize = 16

// machinePool is enumerated from the seccompMmapHandler signal handler.
var (
	machinePool          [machinePoolSize]machineAtomicPtr
	machinePoolLen       atomicbitops.Uint32
	machinePoolMu        sync.Mutex
	seccompMmapRulesOnce gosync.Once
)

func sigsysHandler()
func addrOfSigsysHandler() uintptr

// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
// handled in seccompMmapHandler.
func seccompMmapRules(m *machine) {
	seccompMmapRulesOnce.Do(func() {
		// Install the handler.
		if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
			panic(fmt.Sprintf("Unable to set handler for signal %d: %v", unix.SIGSYS, err))
		}
		rules := []seccomp.RuleSet{}
		rules = append(rules, []seccomp.RuleSet{
			// Trap mmap system calls and handle them in sigsysGoHandler.
			{
				Rules: seccomp.SyscallRules{
					unix.SYS_MMAP: {
						{
							seccomp.MatchAny{},
							seccomp.MatchAny{},
							seccomp.MaskedEqual(unix.PROT_EXEC, 0),
							/* MAP_DENYWRITE is ignored and used only for filtering. */
							seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
						},
					},
				},
				Action: linux.SECCOMP_RET_TRAP,
			},
		}...)
		instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
		if err != nil {
			panic(fmt.Sprintf("failed to build rules: %v", err))
		}
		// Perform the actual installation.
		if err := seccomp.SetFilter(instrs); err != nil {
			panic(fmt.Sprintf("failed to set filter: %v", err))
		}
	})

	machinePoolMu.Lock()
	n := machinePoolLen.Load()
	i := uint32(0)
	for ; i < n; i++ {
		if machinePool[i].Load() == nil {
			break
		}
	}
	if i == n {
		if i == machinePoolSize {
			machinePoolMu.Unlock()
			panic("machinePool is full")
		}
		machinePoolLen.Add(1)
	}
	machinePool[i].Store(m)
	m.machinePoolIndex = i
	machinePoolMu.Unlock()
}
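
// Editorial sketch (not part of the original source): seccompMmapHandler,
// defined elsewhere in this package, runs in signal context for every mmap
// trapped by the filter above. Per the comments in newMachine, it is expected
// to enumerate machinePool and install the newly mapped host region into each
// live machine (ultimately via handleBluepillFault), along these lines:
//
//	for i := uint32(0); i < machinePoolLen.Load(); i++ {
//		m := machinePool[i].Load()
//		if m == nil {
//			continue
//		}
//		// ... map the new region into m's guest physical space ...
//	}
//
// seccompMmapSync, called from Destroy above, is presumably what makes it safe
// to clear a machinePool slot while such handlers may still be running.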