github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/machine_unsafe.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build go1.18 16 // +build go1.18 17 18 // //go:linkname directives type-checked by checklinkname. Any other 19 // non-linkname assumptions outside the Go 1 compatibility guarantee should 20 // have an accompanied vet check or version guard build tag. 21 22 package kvm 23 24 import ( 25 "fmt" 26 "math" 27 "runtime" 28 "sync/atomic" 29 "syscall" 30 "unsafe" 31 32 "golang.org/x/sys/unix" 33 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 34 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 35 ) 36 37 //go:linkname entersyscall runtime.entersyscall 38 func entersyscall() 39 40 //go:linkname exitsyscall runtime.exitsyscall 41 func exitsyscall() 42 43 // setMemoryRegion initializes a region. 44 // 45 // This may be called from bluepillHandler, and therefore returns an errno 46 // directly (instead of wrapping in an error) to avoid allocations. 47 // 48 //go:nosplit 49 func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr, flags uint32) unix.Errno { 50 userRegion := userMemoryRegion{ 51 slot: uint32(slot), 52 flags: uint32(flags), 53 guestPhysAddr: uint64(physical), 54 memorySize: uint64(length), 55 userspaceAddr: uint64(virtual), 56 } 57 58 // Set the region. 59 // Note: syscall.RawSyscall is used to fit the nosplit stack limit. 60 _, _, errno := syscall.RawSyscall( 61 unix.SYS_IOCTL, 62 uintptr(m.fd), 63 _KVM_SET_USER_MEMORY_REGION, 64 uintptr(unsafe.Pointer(&userRegion))) 65 return errno 66 } 67 68 // mapRunData maps the vCPU run data. 69 func mapRunData(fd int) (*runData, error) { 70 r, _, errno := unix.RawSyscall6( 71 unix.SYS_MMAP, 72 0, 73 uintptr(runDataSize), 74 unix.PROT_READ|unix.PROT_WRITE, 75 unix.MAP_SHARED, 76 uintptr(fd), 77 0) 78 if errno != 0 { 79 return nil, fmt.Errorf("error mapping runData: %v", errno) 80 } 81 return (*runData)(unsafe.Pointer(r)), nil 82 } 83 84 // unmapRunData unmaps the vCPU run data. 85 func unmapRunData(r *runData) error { 86 if _, _, errno := unix.RawSyscall( 87 unix.SYS_MUNMAP, 88 uintptr(unsafe.Pointer(r)), 89 uintptr(runDataSize), 90 0); errno != 0 { 91 return fmt.Errorf("error unmapping runData: %v", errno) 92 } 93 return nil 94 } 95 96 // atomicAddressSpace is an atomic address space pointer. 97 type atomicAddressSpace struct { 98 pointer unsafe.Pointer 99 } 100 101 // set sets the address space value. 102 // 103 //go:nosplit 104 func (a *atomicAddressSpace) set(as *addressSpace) { 105 atomic.StorePointer(&a.pointer, unsafe.Pointer(as)) 106 } 107 108 // get gets the address space value. 109 // 110 // Note that this should be considered best-effort, and may have changed by the 111 // time this function returns. 112 // 113 //go:nosplit 114 func (a *atomicAddressSpace) get() *addressSpace { 115 return (*addressSpace)(atomic.LoadPointer(&a.pointer)) 116 } 117 118 // notify notifies that the vCPU has transitioned modes. 119 // 120 // This may be called by a signal handler and therefore throws on error. 121 // 122 //go:nosplit 123 func (c *vCPU) notify() { 124 _, _, errno := unix.RawSyscall6( // escapes: no. 125 unix.SYS_FUTEX, 126 uintptr(unsafe.Pointer(&c.state)), 127 linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, 128 math.MaxInt32, // Number of waiters. 129 0, 0, 0) 130 if errno != 0 { 131 throw("futex wake error") 132 } 133 } 134 135 // waitUntilNot waits for the vCPU to transition modes. 136 // 137 // The state should have been previously set to vCPUWaiter after performing an 138 // appropriate action to cause a transition (e.g. interrupt injection). 139 // 140 // This panics on error. 141 func (c *vCPU) waitUntilNot(state uint32) { 142 _, _, errno := unix.Syscall6( 143 unix.SYS_FUTEX, 144 uintptr(unsafe.Pointer(&c.state)), 145 linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, 146 uintptr(state), 147 0, 0, 0) 148 if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN { 149 panic("futex wait error") 150 } 151 } 152 153 // setSignalMask sets the vCPU signal mask. 154 // 155 // This must be called prior to running the vCPU. 156 func (c *vCPU) setSignalMask() error { 157 // The layout of this structure implies that it will not necessarily be 158 // the same layout chosen by the Go compiler. It gets fudged here. 159 var data struct { 160 length uint32 161 mask1 uint32 162 mask2 uint32 163 _ uint32 164 } 165 data.length = 8 // Fixed sigset size. 166 data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) 167 data.mask2 = ^uint32(bounceSignalMask >> 32) 168 if _, _, errno := unix.RawSyscall( 169 unix.SYS_IOCTL, 170 uintptr(c.fd), 171 _KVM_SET_SIGNAL_MASK, 172 uintptr(unsafe.Pointer(&data))); errno != 0 { 173 return fmt.Errorf("error setting signal mask: %v", errno) 174 } 175 176 return nil 177 } 178 179 // seccompMmapHandlerCnt is a number of currently running seccompMmapHandler 180 // instances. 181 var seccompMmapHandlerCnt atomicbitops.Int64 182 183 // seccompMmapSync waits for all currently runnuing seccompMmapHandler 184 // instances. 185 // 186 // The standard locking primitives can't be used in this case since 187 // seccompMmapHandler is executed in a signal handler context. 188 // 189 // It can be implemented by using FUTEX calls, but it will require to call 190 // FUTEX_WAKE from seccompMmapHandler. Consider machine.Destroy is called only 191 // once, and the probability is racing with seccompMmapHandler is very low the 192 // spinlock-like way looks more reasonable. 193 func seccompMmapSync() { 194 for seccompMmapHandlerCnt.Load() != 0 { 195 runtime.Gosched() 196 } 197 } 198 199 // seccompMmapHandler is a signal handler for runtime mmap system calls 200 // that are trapped by seccomp. 201 // 202 // It executes the mmap syscall with specified arguments and maps a new region 203 // to the guest. 204 // 205 //go:nosplit 206 func seccompMmapHandler(context unsafe.Pointer) { 207 mmapCallCounter.Increment() 208 209 addr, length, errno := seccompMmapSyscall(context) 210 if errno != 0 { 211 return 212 } 213 214 seccompMmapHandlerCnt.Add(1) 215 for i := uint32(0); i < machinePoolLen.Load(); i++ { 216 m := machinePool[i].Load() 217 if m == nil { 218 continue 219 } 220 221 // Map the new region to the guest. 222 vr := region{ 223 virtual: addr, 224 length: length, 225 } 226 for virtual := vr.virtual; virtual < vr.virtual+vr.length; { 227 physical, length, ok := translateToPhysical(virtual) 228 if !ok { 229 // This must be an invalid region that was 230 // knocked out by creation of the physical map. 231 return 232 } 233 if virtual+length > vr.virtual+vr.length { 234 // Cap the length to the end of the area. 235 length = vr.virtual + vr.length - virtual 236 } 237 238 // Ensure the physical range is mapped. 239 m.mapPhysical(physical, length, physicalRegions) 240 virtual += length 241 } 242 } 243 seccompMmapHandlerCnt.Add(-1) 244 } 245 246 // disableAsyncPreemption disables asynchronous preemption of go-routines. 247 func disableAsyncPreemption() { 248 set := linux.MakeSignalSet(linux.SIGURG) 249 _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_BLOCK, 250 uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) 251 if errno != 0 { 252 panic(fmt.Sprintf("sigprocmask failed: %d", errno)) 253 } 254 } 255 256 // enableAsyncPreemption enables asynchronous preemption of go-routines. 257 func enableAsyncPreemption() { 258 set := linux.MakeSignalSet(linux.SIGURG) 259 _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, 260 uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) 261 if errno != 0 { 262 panic(fmt.Sprintf("sigprocmask failed: %d", errno)) 263 } 264 }