github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/bluepill_unsafe.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // +build go1.12
    16  // +build !go1.18
    17  
    18  // Check go:linkname function signatures when updating Go version.
    19  
    20  package kvm
    21  
    22  import (
    23  	"sync/atomic"
    24  	"unsafe"
    25  
    26  	"golang.org/x/sys/unix"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    28  )
    29  
// throw is bound by the linkname directive below to runtime.throw, the Go
// runtime's fatal-error routine, so the message passed here is reported by the
// runtime itself. The declaration intentionally has no body in this package;
// its signature has to keep matching the runtime's (see the note near the top
// of this file about re-checking go:linkname signatures on Go upgrades).
//
//go:linkname throw runtime.throw
func throw(string)
    32  
    33  // vCPUPtr returns a CPU for the given address.
    34  //
    35  //go:nosplit
    36  func vCPUPtr(addr uintptr) *vCPU {
    37  	return (*vCPU)(unsafe.Pointer(addr))
    38  }
    39  
    40  // bytePtr returns a bytePtr for the given address.
    41  //
    42  //go:nosplit
    43  func bytePtr(addr uintptr) *byte {
    44  	return (*byte)(unsafe.Pointer(addr))
    45  }
    46  
    47  // uintptrValue returns a uintptr for the given address.
    48  //
    49  //go:nosplit
    50  func uintptrValue(addr *byte) uintptr {
    51  	return (uintptr)(unsafe.Pointer(addr))
    52  }
    53  
    54  // bluepillArchContext returns the UContext64.
    55  //
    56  //go:nosplit
    57  func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
    58  	return &((*arch.UContext64)(context).MContext)
    59  }
    60  
    61  // bluepillHandleHlt is reponsible for handling VM-Exit.
    62  //
    63  //go:nosplit
    64  func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
    65  	// Increment our counter.
    66  	atomic.AddUint64(&c.guestExits, 1)
    67  
    68  	// Copy out registers.
    69  	bluepillArchExit(c, bluepillArchContext(context))
    70  
    71  	// Return to the vCPUReady state; notify any waiters.
    72  	user := atomic.LoadUint32(&c.state) & vCPUUser
    73  	switch atomic.SwapUint32(&c.state, user) {
    74  	case user | vCPUGuest: // Expected case.
    75  	case user | vCPUGuest | vCPUWaiter:
    76  		c.notify()
    77  	default:
    78  		throw("invalid state")
    79  	}
    80  }
    81  
// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
// signal stack. It should only execute raw system calls and functions that are
// explicitly marked go:nosplit.
//
// context is the signal ucontext; it is passed to bluepillArchContext to
// obtain the register state, both on entry and whenever the vCPU dies or
// exits. The handler marks the vCPU as being in guest mode and then loops on
// the KVM run ioctl, returning only when an exit reason hands control back
// (guest exit, c.die, or bluepillArchHandleExit); every other path re-runs the
// vCPU.
//
// +checkescape:all
//
//go:nosplit
func bluepillHandler(context unsafe.Pointer) {
	// Sanitize the registers; interrupts must always be disabled.
	c := bluepillArchEnter(bluepillArchContext(context))

	// Mark this as guest mode. The previous state must have been exactly
	// vCPUUser, optionally with a waiter that is notified of the transition;
	// anything else is fatal.
	switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
	case vCPUUser: // Expected case.
	case vCPUUser | vCPUWaiter:
		c.notify()
	default:
		throw("invalid state")
	}

	for {
		// Enter the guest. The first switch below handles the ioctl's errno;
		// only paths that fall out of it (rather than continue) reach the
		// exit-reason switch further down.
		_, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
		switch errno {
		case 0: // Expected case.
		case unix.EINTR:
			// First, we process whatever pending signal
			// interrupted KVM. Since we're in a signal handler
			// currently, all signals are masked and the signal
			// must have been delivered directly to this thread.
			//
			// The zero timeout makes this a non-blocking poll of
			// bounceSignalMask.
			//
			// NOTE(review): this errno shadows the ioctl's errno
			// declared above. The outer value is not read again on
			// this path, so it looks harmless, but confirm it is
			// intentional.
			timeout := unix.Timespec{}
			sig, _, errno := unix.RawSyscall6( // escapes: no.
				unix.SYS_RT_SIGTIMEDWAIT,
				uintptr(unsafe.Pointer(&bounceSignalMask)),
				0,                                 // siginfo.
				uintptr(unsafe.Pointer(&timeout)), // timeout.
				8,                                 // sigset size.
				0, 0)
			if errno == unix.EAGAIN {
				// Nothing was pending; just re-run the vCPU.
				continue
			}
			if errno != 0 {
				throw("error waiting for pending signal")
			}
			if sig != uintptr(bounceSignal) {
				throw("unexpected signal")
			}

			// Check whether the current state of the vCPU is ready
			// for interrupt injection. Because we don't have a
			// PIC, we can't inject an interrupt while they are
			// masked. We need to request a window if it's not
			// ready.
			if bluepillReadyStopGuest(c) {
				// Force injection below; the vCPU is ready.
				// Overwriting exitReason routes control to the
				// _KVM_EXIT_IRQ_WINDOW_OPEN case of the switch
				// that follows.
				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
			} else {
				c.runData.requestInterruptWindow = 1
				continue // Rerun vCPU.
			}
		case unix.EFAULT:
			// If a fault is not serviceable due to the host
			// backing pages having page permissions, instead of an
			// MMIO exit we receive EFAULT from the run ioctl. We
			// always inject an NMI here since we may be in kernel
			// mode and have interrupts disabled.
			bluepillSigBus(c)
			continue // Rerun vCPU.
		case unix.ENOSYS:
			bluepillHandleEnosys(c)
			continue
		default:
			throw("run failed")
		}

		// Dispatch on why the guest stopped. Cases that do not return
		// fall out of the switch and the loop re-runs the vCPU.
		switch c.runData.exitReason {
		case _KVM_EXIT_EXCEPTION:
			c.die(bluepillArchContext(context), "exception")
			return
		case _KVM_EXIT_IO:
			c.die(bluepillArchContext(context), "I/O")
			return
		case _KVM_EXIT_INTERNAL_ERROR:
			// An internal error is typically thrown when emulation
			// fails. This can occur via the MMIO path below (and
			// it might fail because we have multiple regions that
			// are not mapped). We would actually prefer that no
			// emulation occur, and don't mind at all if it fails.
			//
			// The case body is deliberately empty: Go does not
			// fall through, so the vCPU is simply re-run.
		case _KVM_EXIT_HYPERCALL:
			c.die(bluepillArchContext(context), "hypercall")
			return
		case _KVM_EXIT_DEBUG:
			c.die(bluepillArchContext(context), "debug")
			return
		case _KVM_EXIT_HLT:
			bluepillGuestExit(c, context)
			return
		case _KVM_EXIT_MMIO:
			// An MMIO exit whose address decodes to the VMEXIT
			// hypercall is a request to leave guest mode rather
			// than a real fault.
			physical := uintptr(c.runData.data[0])
			if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
				bluepillGuestExit(c, context)
				return
			}

			// Increment the fault count.
			atomic.AddUint32(&c.faults, 1)

			// For MMIO, the physical address is the first data item.
			//
			// NOTE(review): physical was already loaded from
			// data[0] a few lines above; this second read looks
			// redundant unless runData can change in between.
			physical = uintptr(c.runData.data[0])
			virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
			if !ok {
				c.die(bluepillArchContext(context), "invalid physical address")
				return
			}

			// We now need to fill in the data appropriately. KVM
			// expects us to provide the result of the given MMIO
			// operation in the runData struct. This is safe
			// because, if a fault occurs here, the same fault
			// would have occurred in guest mode. The kernel should
			// not create invalid page table mappings.
			//
			// Layout as decoded here: data[1] is viewed as the
			// 8-byte payload, the low 32 bits of data[2] are the
			// access length and bits 32-39 are the is-write flag.
			//
			// NOTE(review): the loop indexes the 8-byte array with
			// i < length and nothing here bounds length; presumably
			// KVM guarantees an MMIO length of at most 8. Confirm.
			data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
			length := (uintptr)((uint32)(c.runData.data[2]))
			write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0
			for i := uintptr(0); i < length; i++ {
				b := bytePtr(uintptr(virtual) + i)
				if write {
					// Write to the given address.
					*b = data[i]
				} else {
					// Read from the given address.
					data[i] = *b
				}
			}
		case _KVM_EXIT_IRQ_WINDOW_OPEN:
			// Reached either from a real interrupt-window exit or
			// because the EINTR path above forced this exit reason.
			bluepillStopGuest(c)
		case _KVM_EXIT_SHUTDOWN:
			c.die(bluepillArchContext(context), "shutdown")
			return
		case _KVM_EXIT_FAIL_ENTRY:
			c.die(bluepillArchContext(context), "entry failed")
			return
		default:
			// Anything not handled above is delegated to the
			// architecture-specific handler.
			bluepillArchHandleExit(c, context)
			return
		}
	}
}