gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/kvm/bluepill_unsafe.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build go1.18 16 // +build go1.18 17 18 // //go:linkname directives type-checked by checklinkname. Any other 19 // non-linkname assumptions outside the Go 1 compatibility guarantee should 20 // have an accompanied vet check or version guard build tag. 21 22 package kvm 23 24 import ( 25 "unsafe" 26 27 "golang.org/x/sys/unix" 28 "gvisor.dev/gvisor/pkg/sentry/arch" 29 ) 30 31 //go:linkname throw runtime.throw 32 func throw(s string) 33 34 // vCPUPtr returns a CPU for the given address. 35 // 36 //go:nosplit 37 func vCPUPtr(addr uintptr) *vCPU { 38 return (*vCPU)(unsafe.Pointer(addr)) 39 } 40 41 // bytePtr returns a bytePtr for the given address. 42 // 43 //go:nosplit 44 func bytePtr(addr uintptr) *byte { 45 return (*byte)(unsafe.Pointer(addr)) 46 } 47 48 // uintptrValue returns a uintptr for the given address. 49 // 50 //go:nosplit 51 func uintptrValue(addr *byte) uintptr { 52 return (uintptr)(unsafe.Pointer(addr)) 53 } 54 55 // bluepillArchContext returns the UContext64. 56 // 57 //go:nosplit 58 func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { 59 return &((*arch.UContext64)(context).MContext) 60 } 61 62 // bluepillHandleHlt is responsible for handling VM-Exit. 63 // 64 //go:nosplit 65 func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { 66 // Increment our counter. 67 c.guestExits.Add(1) 68 69 // Copy out registers. 70 bluepillArchExit(c, bluepillArchContext(context)) 71 72 // Return to the vCPUReady state; notify any waiters. 73 user := c.state.Load() & vCPUUser 74 switch c.state.Swap(user) { 75 case user | vCPUGuest: // Expected case. 76 case user | vCPUGuest | vCPUWaiter: 77 c.notify() 78 default: 79 throw("invalid state") 80 } 81 } 82 83 var hexSyms = []byte("0123456789abcdef") 84 85 //go:nosplit 86 func printHex(title []byte, val uint64) { 87 var str [18]byte 88 for i := 0; i < 16; i++ { 89 str[16-i] = hexSyms[val&0xf] 90 val = val >> 4 91 } 92 str[0] = ' ' 93 str[17] = '\n' 94 unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title))) 95 unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18) 96 } 97 98 // bluepillHandler is called from the signal stub. 99 // 100 // The world may be stopped while this is executing, and it executes on the 101 // signal stack. It should only execute raw system calls and functions that are 102 // explicitly marked go:nosplit. 103 // 104 // Ideally, this function should switch to gsignal, as runtime.sigtramp does, 105 // but that is tedious given all the runtime internals. That said, using 106 // gsignal inside a signal handler is not _required_, provided we avoid stack 107 // splits and allocations. Note that calling any splittable function here will 108 // be flaky; if the signal stack is below the G stack then we will trigger a 109 // split and crash. If above, we won't trigger a split. 110 // 111 // +checkescape:all 112 // 113 //go:nosplit 114 func bluepillHandler(context unsafe.Pointer) { 115 // Sanitize the registers; interrupts must always be disabled. 116 c := bluepillArchEnter(bluepillArchContext(context)) 117 118 // Mark this as guest mode. 119 switch c.state.Swap(vCPUGuest | vCPUUser) { 120 case vCPUUser: // Expected case. 121 case vCPUUser | vCPUWaiter: 122 c.notify() 123 default: 124 throw("invalid state") 125 } 126 127 for { 128 hostExitCounter.Increment() 129 _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), KVM_RUN, 0) // escapes: no. 130 switch errno { 131 case 0: // Expected case. 132 case unix.EINTR: 133 interruptCounter.Increment() 134 // First, we process whatever pending signal 135 // interrupted KVM. Since we're in a signal handler 136 // currently, all signals are masked and the signal 137 // must have been delivered directly to this thread. 138 timeout := unix.Timespec{} 139 sig, _, errno := unix.RawSyscall6( // escapes: no. 140 unix.SYS_RT_SIGTIMEDWAIT, 141 uintptr(unsafe.Pointer(&bounceSignalMask)), 142 0, // siginfo. 143 uintptr(unsafe.Pointer(&timeout)), // timeout. 144 8, // sigset size. 145 0, 0) 146 if errno == unix.EAGAIN { 147 continue 148 } 149 if errno != 0 { 150 throw("error waiting for pending signal") 151 } 152 if sig != uintptr(bounceSignal) { 153 throw("unexpected signal") 154 } 155 156 // Check whether the current state of the vCPU is ready 157 // for interrupt injection. Because we don't have a 158 // PIC, we can't inject an interrupt while they are 159 // masked. We need to request a window if it's not 160 // ready. 161 if bluepillReadyStopGuest(c) { 162 // Force injection below; the vCPU is ready. 163 c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN 164 } else { 165 c.runData.requestInterruptWindow = 1 166 continue // Rerun vCPU. 167 } 168 case unix.EFAULT: 169 // If a fault is not serviceable due to the host 170 // backing pages having page permissions, instead of an 171 // MMIO exit we receive EFAULT from the run ioctl. We 172 // always inject an NMI here since we may be in kernel 173 // mode and have interrupts disabled. 174 bluepillSigBus(c) 175 continue // Rerun vCPU. 176 case unix.ENOSYS: 177 bluepillHandleEnosys(c) 178 continue 179 default: 180 throw("run failed") 181 } 182 183 switch c.runData.exitReason { 184 case _KVM_EXIT_EXCEPTION: 185 c.die(bluepillArchContext(context), "exception") 186 return 187 case _KVM_EXIT_IO: 188 c.die(bluepillArchContext(context), "I/O") 189 return 190 case _KVM_EXIT_INTERNAL_ERROR: 191 // An internal error is typically thrown when emulation 192 // fails. This can occur via the MMIO path below (and 193 // it might fail because we have multiple regions that 194 // are not mapped). We would actually prefer that no 195 // emulation occur, and don't mind at all if it fails. 196 case _KVM_EXIT_HYPERCALL: 197 c.die(bluepillArchContext(context), "hypercall") 198 return 199 case _KVM_EXIT_DEBUG: 200 c.die(bluepillArchContext(context), "debug") 201 return 202 case _KVM_EXIT_HLT: 203 c.hltSanityCheck() 204 bluepillGuestExit(c, context) 205 return 206 case _KVM_EXIT_MMIO: 207 physical := uintptr(c.runData.data[0]) 208 if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { 209 bluepillGuestExit(c, context) 210 return 211 } 212 213 c.die(bluepillArchContext(context), "exit_mmio") 214 return 215 case _KVM_EXIT_IRQ_WINDOW_OPEN: 216 bluepillStopGuest(c) 217 case _KVM_EXIT_SHUTDOWN: 218 c.die(bluepillArchContext(context), "shutdown") 219 return 220 case _KVM_EXIT_FAIL_ENTRY: 221 c.die(bluepillArchContext(context), "entry failed") 222 return 223 default: 224 bluepillArchHandleExit(c, context) 225 return 226 } 227 } 228 }