github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/kvm/machine_unsafe.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build go1.18
    16  // +build go1.18
    17  
    18  // //go:linkname directives type-checked by checklinkname. Any other
    19  // non-linkname assumptions outside the Go 1 compatibility guarantee should
    20  // have an accompanied vet check or version guard build tag.
    21  
    22  package kvm
    23  
    24  import (
    25  	"fmt"
    26  	"math"
    27  	"runtime"
    28  	"sync/atomic"
    29  	"syscall"
    30  	"unsafe"
    31  
    32  	"golang.org/x/sys/unix"
    33  	"github.com/metacubex/gvisor/pkg/abi/linux"
    34  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    35  )
    36  
// entersyscall is linked to runtime.entersyscall. It tells the Go runtime
// that the calling goroutine is about to block in a syscall (used here to
// cooperate with the scheduler around long-running KVM ioctls).
//
//go:linkname entersyscall runtime.entersyscall
func entersyscall()

// exitsyscall is linked to runtime.exitsyscall; it is the counterpart of
// entersyscall and must be called when the syscall completes.
//
//go:linkname exitsyscall runtime.exitsyscall
func exitsyscall()
    42  
// setMemoryRegion initializes a region.
//
// It installs the [virtual, virtual+length) host range as guest physical
// memory at `physical` in the given KVM memory slot, via the
// KVM_SET_USER_MEMORY_REGION ioctl on the VM fd.
//
// This may be called from bluepillHandler, and therefore returns an errno
// directly (instead of wrapping in an error) to avoid allocations.
//
//go:nosplit
func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr, flags uint32) unix.Errno {
	// Kernel-expected layout of struct kvm_userspace_memory_region.
	userRegion := userMemoryRegion{
		slot:          uint32(slot),
		flags:         uint32(flags),
		guestPhysAddr: uint64(physical),
		memorySize:    uint64(length),
		userspaceAddr: uint64(virtual),
	}

	// Set the region.
	// Note: syscall.RawSyscall is used to fit the nosplit stack limit.
	_, _, errno := syscall.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		KVM_SET_USER_MEMORY_REGION,
		uintptr(unsafe.Pointer(&userRegion)))
	return errno
}
    67  
    68  // mapRunData maps the vCPU run data.
    69  func mapRunData(fd int) (*runData, error) {
    70  	r, _, errno := unix.RawSyscall6(
    71  		unix.SYS_MMAP,
    72  		0,
    73  		uintptr(runDataSize),
    74  		unix.PROT_READ|unix.PROT_WRITE,
    75  		unix.MAP_SHARED,
    76  		uintptr(fd),
    77  		0)
    78  	if errno != 0 {
    79  		return nil, fmt.Errorf("error mapping runData: %v", errno)
    80  	}
    81  	return (*runData)(unsafe.Pointer(r)), nil
    82  }
    83  
    84  // unmapRunData unmaps the vCPU run data.
    85  func unmapRunData(r *runData) error {
    86  	if _, _, errno := unix.RawSyscall(
    87  		unix.SYS_MUNMAP,
    88  		uintptr(unsafe.Pointer(r)),
    89  		uintptr(runDataSize),
    90  		0); errno != 0 {
    91  		return fmt.Errorf("error unmapping runData: %v", errno)
    92  	}
    93  	return nil
    94  }
    95  
// atomicAddressSpace is an atomic address space pointer.
//
// It wraps an unsafe.Pointer (rather than a typed atomic) so that both
// accessors can remain nosplit-safe.
type atomicAddressSpace struct {
	pointer unsafe.Pointer
}

// set sets the address space value.
//
//go:nosplit
func (a *atomicAddressSpace) set(as *addressSpace) {
	atomic.StorePointer(&a.pointer, unsafe.Pointer(as))
}

// get gets the address space value.
//
// Note that this should be considered best-effort, and may have changed by the
// time this function returns.
//
//go:nosplit
func (a *atomicAddressSpace) get() *addressSpace {
	return (*addressSpace)(atomic.LoadPointer(&a.pointer))
}
   117  
// notify notifies that the vCPU has transitioned modes.
//
// It wakes every waiter blocked in waitUntilNot on c.state via
// FUTEX_WAKE (math.MaxInt32 = wake all waiters).
//
// This may be called by a signal handler and therefore throws on error.
//
//go:nosplit
func (c *vCPU) notify() {
	_, _, errno := unix.RawSyscall6( // escapes: no.
		unix.SYS_FUTEX,
		uintptr(unsafe.Pointer(&c.state)),
		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
		math.MaxInt32, // Number of waiters.
		0, 0, 0)
	if errno != 0 {
		throw("futex wake error")
	}
}
   134  
   135  // waitUntilNot waits for the vCPU to transition modes.
   136  //
   137  // The state should have been previously set to vCPUWaiter after performing an
   138  // appropriate action to cause a transition (e.g. interrupt injection).
   139  //
   140  // This panics on error.
   141  func (c *vCPU) waitUntilNot(state uint32) {
   142  	_, _, errno := unix.Syscall6(
   143  		unix.SYS_FUTEX,
   144  		uintptr(unsafe.Pointer(&c.state)),
   145  		linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG,
   146  		uintptr(state),
   147  		0, 0, 0)
   148  	if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN {
   149  		panic("futex wait error")
   150  	}
   151  }
   152  
   153  // setSignalMask sets the vCPU signal mask.
   154  //
   155  // This must be called prior to running the vCPU.
   156  func (c *vCPU) setSignalMask() error {
   157  	// The layout of this structure implies that it will not necessarily be
   158  	// the same layout chosen by the Go compiler. It gets fudged here.
   159  	var data struct {
   160  		length uint32
   161  		mask1  uint32
   162  		mask2  uint32
   163  		_      uint32
   164  	}
   165  	data.length = 8 // Fixed sigset size.
   166  	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
   167  	data.mask2 = ^uint32(bounceSignalMask >> 32)
   168  	if _, _, errno := unix.RawSyscall(
   169  		unix.SYS_IOCTL,
   170  		uintptr(c.fd),
   171  		KVM_SET_SIGNAL_MASK,
   172  		uintptr(unsafe.Pointer(&data))); errno != 0 {
   173  		return fmt.Errorf("error setting signal mask: %v", errno)
   174  	}
   175  
   176  	return nil
   177  }
   178  
// seccompMmapHandlerCnt is the number of currently running seccompMmapHandler
// instances.
var seccompMmapHandlerCnt atomicbitops.Int64

// seccompMmapSync waits for all currently running seccompMmapHandler
// instances to finish.
//
// The standard locking primitives can't be used in this case since
// seccompMmapHandler is executed in a signal handler context.
//
// It could be implemented with FUTEX calls, but that would require calling
// FUTEX_WAKE from seccompMmapHandler. Considering that machine.Destroy is
// called only once and the probability of racing with seccompMmapHandler is
// very low, the spinlock-like approach looks more reasonable.
func seccompMmapSync() {
	for seccompMmapHandlerCnt.Load() != 0 {
		runtime.Gosched()
	}
}
   198  
   199  // seccompMmapHandler is a signal handler for runtime mmap system calls
   200  // that are trapped by seccomp.
   201  //
   202  // It executes the mmap syscall with specified arguments and maps a new region
   203  // to the guest.
   204  //
   205  //go:nosplit
   206  func seccompMmapHandler(context unsafe.Pointer) {
   207  	mmapCallCounter.Increment()
   208  
   209  	addr, length, errno := seccompMmapSyscall(context)
   210  	if errno != 0 {
   211  		return
   212  	}
   213  
   214  	seccompMmapHandlerCnt.Add(1)
   215  	for i := uint32(0); i < machinePoolLen.Load(); i++ {
   216  		m := machinePool[i].Load()
   217  		if m == nil {
   218  			continue
   219  		}
   220  
   221  		// Map the new region to the guest.
   222  		vr := region{
   223  			virtual: addr,
   224  			length:  length,
   225  		}
   226  		for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
   227  			physical, length, ok := translateToPhysical(virtual)
   228  			if !ok {
   229  				// This must be an invalid region that was
   230  				// knocked out by creation of the physical map.
   231  				return
   232  			}
   233  			if virtual+length > vr.virtual+vr.length {
   234  				// Cap the length to the end of the area.
   235  				length = vr.virtual + vr.length - virtual
   236  			}
   237  
   238  			// Ensure the physical range is mapped.
   239  			m.mapPhysical(physical, length, physicalRegions)
   240  			virtual += length
   241  		}
   242  	}
   243  	seccompMmapHandlerCnt.Add(-1)
   244  }
   245  
   246  // disableAsyncPreemption disables asynchronous preemption of go-routines.
   247  func disableAsyncPreemption() {
   248  	set := linux.MakeSignalSet(linux.SIGURG)
   249  	_, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_BLOCK,
   250  		uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
   251  	if errno != 0 {
   252  		panic(fmt.Sprintf("sigprocmask failed: %d", errno))
   253  	}
   254  }
   255  
   256  // enableAsyncPreemption enables asynchronous preemption of go-routines.
   257  func enableAsyncPreemption() {
   258  	set := linux.MakeSignalSet(linux.SIGURG)
   259  	_, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK,
   260  		uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
   261  	if errno != 0 {
   262  		panic(fmt.Sprintf("sigprocmask failed: %d", errno))
   263  	}
   264  }