// github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/machine.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"fmt"
	"runtime"
	gosync "sync"
	"sync/atomic"
	"time"

	"golang.org/x/sys/unix"
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/hosttid"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	"github.com/nicocha30/gvisor-ligolo/pkg/metric"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0/pagetables"
	"github.com/nicocha30/gvisor-ligolo/pkg/seccomp"
	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
	"github.com/nicocha30/gvisor-ligolo/pkg/sighandling"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

// machine contains state associated with the VM as a whole.
type machine struct {
	// fd is the vm fd.
	fd int

	// machinePoolIndex is the index in the machinePool array.
	machinePoolIndex uint32

	// nextSlot is the next slot for setMemoryRegion.
	//
	// If nextSlot is ^uint32(0), then slots are currently being updated, and the
	// caller should retry.
	nextSlot atomicbitops.Uint32

	// upperSharedPageTables tracks the read-only shared upper of all the pagetables.
	upperSharedPageTables *pagetables.PageTables

	// kernel is the set of global structures.
	kernel ring0.Kernel

	// mu protects vCPUs.
	mu sync.RWMutex

	// available is notified when vCPUs are available.
	available sync.Cond

	// vCPUsByTID are the machine vCPUs.
	//
	// These are populated dynamically.
	vCPUsByTID map[uint64]*vCPU

	// vCPUsByID are the machine vCPUs, indexed by the vCPU's ID.
	vCPUsByID []*vCPU

	// usedVCPUs is the number of vCPUs that have been used from the
	// vCPUsByID pool.
	usedVCPUs int

	// maxVCPUs is the maximum number of vCPUs supported by the machine.
	maxVCPUs int

	// maxSlots is the maximum number of memory slots supported by the machine.
	maxSlots int

	// tscControl checks whether the cpu supports TSC scaling.
	tscControl bool

	// usedSlots is the set of used physical addresses (not sorted).
	usedSlots []uintptr
}

const (
	// vCPUReady is an alias for all the below clear.
	vCPUReady uint32 = 0

	// vCPUUser indicates that the vCPU is in or about to enter user mode.
	vCPUUser uint32 = 1 << 0

	// vCPUGuest indicates the vCPU is in guest mode.
	vCPUGuest uint32 = 1 << 1

	// vCPUWaiter indicates that there is a waiter.
	//
	// If this is set, then notify must be called on any state transitions.
	vCPUWaiter uint32 = 1 << 2
)
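
// Illustrative summary (editorial, not part of the original source): vCPU.state
// is a bitmask of the flags above. Based on the code and comments in this file,
// the typical transitions are roughly:
//
//	vCPUReady            --Get()/lock()-->       vCPUUser
//	vCPUUser             --bluepill entry-->     vCPUUser|vCPUGuest
//	vCPUUser|vCPUGuest   --unlock() fast path--> vCPUGuest
//	vCPUUser             --unlock()-->           vCPUReady
//
// Get() and bounce() may additionally OR in vCPUWaiter on any busy state;
// whichever path clears the observed state is then responsible for calling
// notify() (see unlock() and the comments in bounce()).
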
// Possible values for the "acquisition_type" field of the get_vcpu metric.
var (
	getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"}
	getVCPUAcquisitionReused     = metric.FieldValue{"reused"}
	getVCPUAcquisitionUnused     = metric.FieldValue{"unused"}
	getVCPUAcquisitionStolen     = metric.FieldValue{"stolen"}
)

var (
	// hostExitCounter is a metric that tracks how many times the sentry
	// performed a host to guest world switch.
	hostExitCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.")

	// userExitCounter is a metric that tracks how many times the sentry has
	// had an exit from userspace. Analogous to vCPU.userExits.
	userExitCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.")

	// interruptCounter is a metric that tracks how many times execution returned
	// to the KVM host to handle a pending signal.
	interruptCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/interrupts", false, "The number of times the signal handler was invoked.")

	// mmapCallCounter is a metric that tracks how many times the function
	// seccompMmapSyscall has been called.
	mmapCallCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.")

	// getVCPUCounter is a metric that tracks how many times different paths of
	// machine.Get() are triggered.
	getVCPUCounter = metric.MustCreateNewProfilingUint64Metric(
		"/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.",
		metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen))

	// asInvalidateDuration are durations of calling addressSpace.invalidate().
	asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate",
		metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2),
		"Duration of calling addressSpace.invalidate().")
)
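
// Editorial note (not part of the original source): getVCPUCounter is a
// fielded metric, so each increment in machine.Get() below reports which
// acquisition path was taken, e.g.:
//
//	getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
//
// This lets the exported /kvm/get_vcpu counter be broken down by the
// "acquisition_type" field values defined above.
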
// vCPU is a single KVM vCPU.
type vCPU struct {
	// CPU is the kernel CPU data.
	//
	// This must be the first element of this structure, it is referenced
	// by the bluepill code (see bluepill_amd64.s).
	ring0.CPU

	// id is the vCPU id.
	id int

	// fd is the vCPU fd.
	fd int

	// tid is the last set tid.
	tid atomicbitops.Uint64

	// userExits is the count of user exits.
	userExits atomicbitops.Uint64

	// guestExits is the count of guest to host world switches.
	guestExits atomicbitops.Uint64

	// faults is a count of world faults (informational only).
	faults uint32

	// state is the vCPU state.
	//
	// This is a bitmask of the three fields (vCPU*) described above.
	state atomicbitops.Uint32

	// runData for this vCPU.
	runData *runData

	// machine associated with this vCPU.
	machine *machine

	// active is the current addressSpace: this is set and read atomically;
	// it is used to elide unnecessary interrupts due to invalidations.
	active atomicAddressSpace

	// vCPUArchState is the architecture-specific state.
	vCPUArchState

	// dieState holds state related to vCPU death.
	dieState dieState
}

type dieState struct {
	// message is thrown from die.
	message string

	// guestRegs is used to store register state during vCPU.die() to prevent
	// allocation inside the nosplit function.
	guestRegs userRegs
}

// createVCPU creates and returns a new vCPU.
//
// Precondition: mu must be held.
func (m *machine) createVCPU(id int) *vCPU {
	// Create the vCPU.
	fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
	if errno != 0 {
		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
	}

	c := &vCPU{
		id:      id,
		fd:      int(fd),
		machine: m,
	}
	c.CPU.Init(&m.kernel, c.id, c)
	m.vCPUsByID[c.id] = c

	// Ensure the signal mask is correct.
	if err := c.setSignalMask(); err != nil {
		panic(fmt.Sprintf("error setting signal mask: %v", err))
	}

	// Map the run data.
	runData, err := mapRunData(int(fd))
	if err != nil {
		panic(fmt.Sprintf("error mapping run data: %v", err))
	}
	c.runData = runData

	// Initialize architecture state.
	if err := c.initArchState(); err != nil {
		panic(fmt.Sprintf("error initializing vCPU state: %v", err))
	}

	return c // Done.
}

// newMachine returns a new VM context.
func newMachine(vm int) (*machine, error) {
	// Create the machine.
	m := &machine{fd: vm}
	m.available.L = &m.mu

	// Pull the maximum vCPUs.
	m.getMaxVCPU()
	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
	m.vCPUsByTID = make(map[uint64]*vCPU)
	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
	m.kernel.Init(m.maxVCPUs)

	// Pull the maximum slots.
	maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
	if errno != 0 {
		m.maxSlots = _KVM_NR_MEMSLOTS
	} else {
		m.maxSlots = int(maxSlots)
	}
	log.Debugf("The maximum number of slots is %d.", m.maxSlots)
	m.usedSlots = make([]uintptr, m.maxSlots)

	// Check TSC scaling support.
	hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL)
	m.tscControl = errno == 0 && hasTSCControl == 1
	log.Debugf("TSC scaling support: %t.", m.tscControl)

	// Create the upper shared pagetables and kernel (sentry) pagetables.
	m.upperSharedPageTables = pagetables.New(newAllocator())
	m.mapUpperHalf(m.upperSharedPageTables)
	m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
	m.upperSharedPageTables.MarkReadOnlyShared()
	m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)

	// Install seccomp rules to trap runtime mmap system calls. They will
	// be handled by seccompMmapHandler.
	seccompMmapRules(m)

	// Apply the physical mappings. Note that these mappings may point to
	// guest physical addresses that are not actually available. These
	// physical pages are mapped on demand, see kernel_unsafe.go.
	applyPhysicalRegions(func(pr physicalRegion) bool {
		// Map everything in the lower half.
		m.kernel.PageTables.Map(
			hostarch.Addr(pr.virtual),
			pr.length,
			pagetables.MapOpts{AccessType: hostarch.ReadWrite},
			pr.physical)

		return true // Keep iterating.
	})
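
	// Editorial note (not part of the original source): at this point the
	// sentry page tables have two halves. The upper half, rooted at
	// ring0.KernelStartAddress and marked read-only shared above, carries
	// the mappings installed by mapUpperHalf and is shared by all address
	// spaces; the loop above fills the lower half with read/write mappings
	// of every physical region. The guest-physical pages themselves are
	// only installed as KVM memory slots on demand (see mapPhysical and
	// hasSlot below, and kernel_unsafe.go).
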
	// Ensure that the currently mapped virtual regions are actually
	// available in the VM. Note that this doesn't guarantee no future
	// faults, however it should guarantee that everything is available to
	// ensure successful vCPU entry.
	mapRegion := func(vr virtualRegion, flags uint32) {
		for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
			physical, length, ok := translateToPhysical(virtual)
			if !ok {
				// This must be an invalid region that was
				// knocked out by creation of the physical map.
				return
			}
			if virtual+length > vr.virtual+vr.length {
				// Cap the length to the end of the area.
				length = vr.virtual + vr.length - virtual
			}
			// Update page tables for executable mappings.
			if vr.accessType.Execute {
				if vr.accessType.Write {
					panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr))
				}
				m.kernel.PageTables.Map(
					hostarch.Addr(virtual),
					length,
					pagetables.MapOpts{AccessType: vr.accessType},
					physical)
			}

			// Ensure the physical range is mapped.
			m.mapPhysical(physical, length, physicalRegions)
			virtual += length
		}
	}

	// handleBluepillFault takes the slot spinlock and it is called from
	// seccompMmapHandler, so here we have to guarantee that mmap is not
	// called while we hold the slot spinlock.
	disableAsyncPreemption()
	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) {
			return // skip region.
		}
		// Take into account that the stack can grow down.
		if vr.filename == "[stack]" {
			vr.virtual -= 1 << 20
			vr.length += 1 << 20
		}

		mapRegion(vr, 0)
	})
	enableAsyncPreemption()

	// Initialize architecture state.
	if err := m.initArchState(); err != nil {
		m.Destroy()
		return nil, err
	}

	// Ensure the machine is cleaned up properly.
	runtime.SetFinalizer(m, (*machine).Destroy)
	return m, nil
}

// hasSlot returns true if the given address is mapped.
//
// This must be done via a linear scan.
//
//go:nosplit
func (m *machine) hasSlot(physical uintptr) bool {
	slotLen := int(m.nextSlot.Load())
	// When slots are being updated, nextSlot is ^uint32(0). As this situation
	// is unlikely to happen, we just set slotLen to m.maxSlots and scan
	// the whole usedSlots array.
	if slotLen == int(^uint32(0)) {
		slotLen = m.maxSlots
	}
	for i := 0; i < slotLen; i++ {
		if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
			return true
		}
	}
	return false
}

// mapPhysical checks for the mapping of a physical range, and installs one if
// not available. This attempts to be efficient for calls in the hot path.
//
// This throws on error.
//
//go:nosplit
func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) {
	for end := physical + length; physical < end; {
		_, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions)
		if pr == nil {
			// Should never happen.
			throw("mapPhysical on unknown physical address")
		}

		// Is this already mapped? Check the usedSlots.
		if !m.hasSlot(physicalStart) {
			if _, ok := handleBluepillFault(m, physical, phyRegions); !ok {
				throw("handleBluepillFault failed")
			}
		}

		// Move to the next chunk.
		physical = physicalStart + length
	}
}
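
// Editorial sketch (not part of the original source): the ^uint32(0) sentinel
// read by hasSlot above is the reader side of a small publication protocol.
// The slot allocator in handleBluepillFault (defined elsewhere in this
// package) is assumed to claim exclusive slot ownership, publish the slot,
// and then advance nextSlot, roughly:
//
//	slot := m.nextSlot.Swap(^uint32(0))            // claim; readers now scan all of usedSlots
//	// ... setMemoryRegion(...) for the new range ...
//	atomic.StoreUintptr(&m.usedSlots[slot], start) // make hasSlot find it
//	m.nextSlot.Store(slot + 1)                     // release and bump the slot count
//
// This is only a sketch of the assumed contract; the authoritative code lives
// in the bluepill fault handling.
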
// Destroy frees associated resources.
//
// Destroy should only be called once all active users of the machine are gone.
// The machine object should not be used after calling Destroy.
//
// Precondition: all vCPUs must be returned to the machine.
func (m *machine) Destroy() {
	runtime.SetFinalizer(m, nil)

	// Destroy vCPUs.
	for _, c := range m.vCPUsByID {
		if c == nil {
			continue
		}

		// Ensure the vCPU is not still running in guest mode. This is
		// possible iff teardown has been done by other threads, and
		// somehow a single thread has not executed any system calls.
		c.BounceToHost()

		// Note that the runData may not be mapped if an error occurs
		// during the middle of initialization.
		if c.runData != nil {
			if err := unmapRunData(c.runData); err != nil {
				panic(fmt.Sprintf("error unmapping rundata: %v", err))
			}
		}
		if err := unix.Close(int(c.fd)); err != nil {
			panic(fmt.Sprintf("error closing vCPU fd: %v", err))
		}
	}

	machinePool[m.machinePoolIndex].Store(nil)
	seccompMmapSync()

	// vCPUs are gone: teardown machine state.
	if err := unix.Close(m.fd); err != nil {
		panic(fmt.Sprintf("error closing VM fd: %v", err))
	}
}

// Get gets an available vCPU.
//
// This will return with the OS thread locked.
//
// It is guaranteed that if any OS thread TID is in guest, m.vCPUsByTID[TID]
// points to the vCPU in which the OS thread TID is running. So if Get()
// returns with the current context in guest, its vCPU must be the same as
// what Get() returns.
func (m *machine) Get() *vCPU {
	m.mu.RLock()
	runtime.LockOSThread()
	tid := hosttid.Current()

	// Check for an exact match.
	if c := m.vCPUsByTID[tid]; c != nil {
		c.lock()
		m.mu.RUnlock()
		getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
		return c
	}

	// The happy path failed. We now proceed to acquire an exclusive lock
	// (because the vCPU map may change), and scan all available vCPUs.
	// In this case, we first unlock the OS thread. Otherwise, if mu is
	// not available, the current system thread will be parked and a new
	// system thread spawned. We avoid this situation by simply refreshing
	// tid after re-locking the OS thread.
	m.mu.RUnlock()
	runtime.UnlockOSThread()
	m.mu.Lock()
	runtime.LockOSThread()
	tid = hosttid.Current()

	// Recheck for an exact match.
	if c := m.vCPUsByTID[tid]; c != nil {
		c.lock()
		m.mu.Unlock()
		getVCPUCounter.Increment(&getVCPUAcquisitionReused)
		return c
	}

	for {
		// Get a vCPU from the m.vCPUsByID pool.
		if m.usedVCPUs < m.maxVCPUs {
			c := m.vCPUsByID[m.usedVCPUs]
			m.usedVCPUs++
			c.lock()
			m.vCPUsByTID[tid] = c
			m.mu.Unlock()
			c.loadSegments(tid)
			getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
			return c
		}

		// Scan for an available vCPU.
		for origTID, c := range m.vCPUsByTID {
			if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
				delete(m.vCPUsByTID, origTID)
				m.vCPUsByTID[tid] = c
				m.mu.Unlock()
				c.loadSegments(tid)
				getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
				return c
			}
		}

		// Scan for something not in user mode.
		for origTID, c := range m.vCPUsByTID {
			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
				continue
			}

			// The vCPU is not able to transition to
			// vCPUGuest|vCPUWaiter or to vCPUUser because that
			// transition requires holding the machine mutex, as we
			// do now. There is no path to register a waiter on
			// just the vCPUReady state.
			for {
				c.waitUntilNot(vCPUGuest | vCPUWaiter)
				if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
					break
				}
			}

			// Steal the vCPU.
			delete(m.vCPUsByTID, origTID)
			m.vCPUsByTID[tid] = c
			m.mu.Unlock()
			c.loadSegments(tid)
			getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
			return c
		}

		// Everything is executing in user mode. Wait until something
		// is available. Note that signaling the condition variable
		// will have the extra effect of kicking the vCPUs out of guest
		// mode if that's where they were.
		m.available.Wait()
	}
}

// Put puts the current vCPU.
func (m *machine) Put(c *vCPU) {
	c.unlock()
	runtime.UnlockOSThread()

	m.mu.RLock()
	m.available.Signal()
	m.mu.RUnlock()
}

// newDirtySet returns a new dirty set.
func (m *machine) newDirtySet() *dirtySet {
	return &dirtySet{
		vCPUMasks: make([]atomicbitops.Uint64,
			(m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
	}
}

// dropPageTables drops cached page table entries.
func (m *machine) dropPageTables(pt *pagetables.PageTables) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Clear from all PCIDs.
	for _, c := range m.vCPUsByID {
		if c != nil && c.PCIDs != nil {
			c.PCIDs.Drop(pt)
		}
	}
}

// lock marks the vCPU as in user mode.
//
// This should only be called directly when known to be safe, i.e. when
// the vCPU is owned by the current TID with no chance of theft.
//
//go:nosplit
func (c *vCPU) lock() {
	atomicbitops.OrUint32(&c.state, vCPUUser)
}

// unlock clears the vCPUUser bit.
//
//go:nosplit
func (c *vCPU) unlock() {
	origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
	if origState == vCPUUser|vCPUGuest {
		// Happy path: no exits are forced, and we can continue
		// executing on our merry way with a single atomic access.
		return
	}

	// Clear the lock.
	for {
		state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser)
		if state == origState {
			break
		}
		origState = state
	}
	switch origState {
	case vCPUUser:
		// Normal state.
	case vCPUUser | vCPUGuest | vCPUWaiter:
		// Force a transition: this must trigger a notification when we
		// return from guest mode. We must clear vCPUWaiter here
		// anyway, because BounceToKernel will force a transition only
		// from ring3 to ring0, which will not clear this bit. Halt may
		// work around the issue, but if there is no exception or
		// syscall in this period, BounceToKernel will hang.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
		c.notify()
	case vCPUUser | vCPUWaiter:
		// Waiting for the lock to be released; the responsibility is
		// on us to notify the waiter and clear the associated bit.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
		c.notify()
	default:
		panic("invalid state")
	}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
//
//go:nosplit
func (c *vCPU) NotifyInterrupt() {
	c.BounceToKernel()
}

// pid is used below in bounce.
var pid = unix.Getpid()
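
// Usage sketch (editorial, not part of the original source): callers in this
// package pair Get and Put around guest execution, roughly:
//
//	cpu := m.Get()   // locks the OS thread and marks the vCPU vCPUUser
//	defer m.Put(cpu) // clears vCPUUser and signals m.available
//	// ... enter the guest, e.g. via the bluepill path defined elsewhere ...
//
// bounce() below is the asynchronous counterpart: it forces a vCPU owned by
// another thread back to kernel or host mode by setting vCPUWaiter and, when
// the vCPU is in guest mode, delivering bounceSignal via tgkill.
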
// bounce forces a return to the kernel or to host mode.
//
// This effectively unwinds the state machine.
func (c *vCPU) bounce(forceGuestExit bool) {
	origGuestExits := c.guestExits.Load()
	origUserExits := c.userExits.Load()
	for {
		switch state := c.state.Load(); state {
		case vCPUReady, vCPUWaiter:
			// There is nothing to be done; we're already in the
			// kernel pre-acquisition. The bounce criteria have
			// been satisfied.
			return
		case vCPUUser:
			// We need to register a waiter for the actual guest
			// transition. When the transition takes place, then we
			// can inject an interrupt to ensure a return to host
			// mode.
			c.state.CompareAndSwap(state, state|vCPUWaiter)
		case vCPUUser | vCPUWaiter:
			// Wait for the transition to guest mode. This should
			// come from the bluepill handler.
			c.waitUntilNot(state)
		case vCPUGuest, vCPUUser | vCPUGuest:
			if state == vCPUGuest && !forceGuestExit {
				// The vCPU is already not acquired, so there's
				// no need to do a fresh injection here.
				return
			}
			// The vCPU is running in the guest (in either guest
			// user or guest kernel mode). Attempt to register a
			// notification on change.
			if !c.state.CompareAndSwap(state, state|vCPUWaiter) {
				break // Retry.
			}
			for {
				// We need to spin here until the signal is
				// delivered, because Tgkill can return EAGAIN
				// under memory pressure. Since we already
				// marked ourselves as a waiter, we need to
				// ensure that a signal is actually delivered.
				if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil {
					break
				} else if err.(unix.Errno) == unix.EAGAIN {
					continue
				} else {
					// Nothing else should be returned by tgkill.
					panic(fmt.Sprintf("unexpected tgkill error: %v", err))
				}
			}
		case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
			if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
				// See above.
				return
			}
			// Wait for the transition. This again should happen
			// from the bluepill handler, but on the way out.
			c.waitUntilNot(state)
		default:
			// Should not happen: the above is exhaustive.
			panic("invalid state")
		}

		// Check whether we missed the state transition; if so, we can
		// safely return at this point.
		newGuestExits := c.guestExits.Load()
		newUserExits := c.userExits.Load()
		if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
			return
		}
	}
}

// BounceToKernel ensures that the vCPU bounces back to the kernel.
//
//go:nosplit
func (c *vCPU) BounceToKernel() {
	c.bounce(false)
}

// BounceToHost ensures that the vCPU is in host mode.
//
//go:nosplit
func (c *vCPU) BounceToHost() {
	c.bounce(true)
}

// setSystemTimeLegacy calibrates and sets an approximate system time.
func (c *vCPU) setSystemTimeLegacy() error {
	const minIterations = 10
	minimum := uint64(0)
	for iter := 0; ; iter++ {
		// Try to set the TSC to an estimate of where it will be
		// on the host during a "fast" system call iteration.
		start := uint64(ktime.Rdtsc())
		if err := c.setTSC(start + (minimum / 2)); err != nil {
			return err
		}
		// See if this is our new minimum call time. Note that this
		// serves two functions: one, we make sure that we are
		// accurately predicting the offset we need to set. Second, we
		// don't want to do the final set on a slow call, which could
		// produce a really bad result.
		end := uint64(ktime.Rdtsc())
		if end < start {
			continue // Totally bogus: unstable TSC?
		}
		current := end - start
		if current < minimum || iter == 0 {
			minimum = current // Set our new minimum.
		}
		// Is this past minIterations and within ~12.5% (9/8) of the minimum?
		upperThreshold := (((minimum << 3) + minimum) >> 3)
		if iter >= minIterations && current <= upperThreshold {
			return nil
		}
	}
}

const machinePoolSize = 16

// machinePool is enumerated from the seccompMmapHandler signal handler.
var (
	machinePool          [machinePoolSize]machineAtomicPtr
	machinePoolLen       atomicbitops.Uint32
	machinePoolMu        sync.Mutex
	seccompMmapRulesOnce gosync.Once
)

func sigsysHandler()
func addrOfSigsysHandler() uintptr

// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
// handled in seccompMmapHandler.
func seccompMmapRules(m *machine) {
	seccompMmapRulesOnce.Do(func() {
		// Install the handler.
		if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
			panic(fmt.Sprintf("Unable to set handler for signal %d: %v", unix.SIGSYS, err))
		}
		rules := []seccomp.RuleSet{}
		rules = append(rules, []seccomp.RuleSet{
			// Trap mmap system calls and handle them in sigsysGoHandler.
			{
				Rules: seccomp.SyscallRules{
					unix.SYS_MMAP: {
						{
							seccomp.MatchAny{},
							seccomp.MatchAny{},
							seccomp.MaskedEqual(unix.PROT_EXEC, 0),
							/* MAP_DENYWRITE is ignored and used only for filtering. */
							seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
						},
					},
				},
				Action: linux.SECCOMP_RET_TRAP,
			},
		}...)
		instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
		if err != nil {
			panic(fmt.Sprintf("failed to build rules: %v", err))
		}
		// Perform the actual installation.
		if err := seccomp.SetFilter(instrs); err != nil {
			panic(fmt.Sprintf("failed to set filter: %v", err))
		}
	})

	machinePoolMu.Lock()
	n := machinePoolLen.Load()
	i := uint32(0)
	for ; i < n; i++ {
		if machinePool[i].Load() == nil {
			break
		}
	}
	if i == n {
		if i == machinePoolSize {
			machinePoolMu.Unlock()
			panic("machinePool is full")
		}
		machinePoolLen.Add(1)
	}
	machinePool[i].Store(m)
	m.machinePoolIndex = i
	machinePoolMu.Unlock()
}
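
// Editorial sketch (not part of the original source): seccompMmapHandler,
// defined elsewhere in this package, runs in signal context for every mmap
// trapped by the filter above. Per the comments in newMachine, it is expected
// to enumerate machinePool and install the newly mapped host region into each
// live machine (ultimately via handleBluepillFault), along these lines:
//
//	for i := uint32(0); i < machinePoolLen.Load(); i++ {
//		m := machinePool[i].Load()
//		if m == nil {
//			continue
//		}
//		// ... map the new region into m's guest physical space ...
//	}
//
// seccompMmapSync, called from Destroy above, is presumably what makes it safe
// to clear a machinePool slot while such handlers may still be running.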