github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kvm 16 17 import ( 18 "fmt" 19 "runtime" 20 "sync/atomic" 21 22 "golang.org/x/sys/unix" 23 "github.com/SagerNet/gvisor/pkg/atomicbitops" 24 "github.com/SagerNet/gvisor/pkg/hostarch" 25 "github.com/SagerNet/gvisor/pkg/log" 26 "github.com/SagerNet/gvisor/pkg/procid" 27 "github.com/SagerNet/gvisor/pkg/ring0" 28 "github.com/SagerNet/gvisor/pkg/ring0/pagetables" 29 ktime "github.com/SagerNet/gvisor/pkg/sentry/time" 30 "github.com/SagerNet/gvisor/pkg/sync" 31 ) 32 33 // machine contains state associated with the VM as a whole. 34 type machine struct { 35 // fd is the vm fd. 36 fd int 37 38 // nextSlot is the next slot for setMemoryRegion. 39 // 40 // This must be accessed atomically. If nextSlot is ^uint32(0), then 41 // slots are currently being updated, and the caller should retry. 42 nextSlot uint32 43 44 // upperSharedPageTables tracks the read-only shared upper of all the pagetables. 45 upperSharedPageTables *pagetables.PageTables 46 47 // kernel is the set of global structures. 48 kernel ring0.Kernel 49 50 // mu protects vCPUs. 51 mu sync.RWMutex 52 53 // available is notified when vCPUs are available. 54 available sync.Cond 55 56 // vCPUsByTID are the machine vCPUs. 57 // 58 // These are populated dynamically. 59 vCPUsByTID map[uint64]*vCPU 60 61 // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID. 62 vCPUsByID []*vCPU 63 64 // maxVCPUs is the maximum number of vCPUs supported by the machine. 65 maxVCPUs int 66 67 // maxSlots is the maximum number of memory slots supported by the machine. 68 maxSlots int 69 70 // tscControl checks whether cpu supports TSC scaling 71 tscControl bool 72 73 // usedSlots is the set of used physical addresses (sorted). 74 usedSlots []uintptr 75 76 // nextID is the next vCPU ID. 77 nextID uint32 78 79 // machineArchState is the architecture-specific state. 80 machineArchState 81 } 82 83 const ( 84 // vCPUReady is an alias for all the below clear. 85 vCPUReady uint32 = 0 86 87 // vCPUser indicates that the vCPU is in or about to enter user mode. 88 vCPUUser uint32 = 1 << 0 89 90 // vCPUGuest indicates the vCPU is in guest mode. 91 vCPUGuest uint32 = 1 << 1 92 93 // vCPUWaiter indicates that there is a waiter. 94 // 95 // If this is set, then notify must be called on any state transitions. 96 vCPUWaiter uint32 = 1 << 2 97 ) 98 99 // vCPU is a single KVM vCPU. 100 type vCPU struct { 101 // CPU is the kernel CPU data. 102 // 103 // This must be the first element of this structure, it is referenced 104 // by the bluepill code (see bluepill_amd64.s). 105 ring0.CPU 106 107 // id is the vCPU id. 108 id int 109 110 // fd is the vCPU fd. 111 fd int 112 113 // tid is the last set tid. 114 tid uint64 115 116 // userExits is the count of user exits. 117 userExits uint64 118 119 // guestExits is the count of guest to host world switches. 120 guestExits uint64 121 122 // faults is a count of world faults (informational only). 123 faults uint32 124 125 // state is the vCPU state. 126 // 127 // This is a bitmask of the three fields (vCPU*) described above. 128 state uint32 129 130 // runData for this vCPU. 131 runData *runData 132 133 // machine associated with this vCPU. 134 machine *machine 135 136 // active is the current addressSpace: this is set and read atomically, 137 // it is used to elide unnecessary interrupts due to invalidations. 138 active atomicAddressSpace 139 140 // vCPUArchState is the architecture-specific state. 141 vCPUArchState 142 143 // dieState holds state related to vCPU death. 144 dieState dieState 145 } 146 147 type dieState struct { 148 // message is thrown from die. 149 message string 150 151 // guestRegs is used to store register state during vCPU.die() to prevent 152 // allocation inside nosplit function. 153 guestRegs userRegs 154 } 155 156 // newVCPU creates a returns a new vCPU. 157 // 158 // Precondition: mu must be held. 159 func (m *machine) newVCPU() *vCPU { 160 // Create the vCPU. 161 id := int(atomic.AddUint32(&m.nextID, 1) - 1) 162 fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) 163 if errno != 0 { 164 panic(fmt.Sprintf("error creating new vCPU: %v", errno)) 165 } 166 167 c := &vCPU{ 168 id: id, 169 fd: int(fd), 170 machine: m, 171 } 172 c.CPU.Init(&m.kernel, c.id, c) 173 m.vCPUsByID[c.id] = c 174 175 // Ensure the signal mask is correct. 176 if err := c.setSignalMask(); err != nil { 177 panic(fmt.Sprintf("error setting signal mask: %v", err)) 178 } 179 180 // Map the run data. 181 runData, err := mapRunData(int(fd)) 182 if err != nil { 183 panic(fmt.Sprintf("error mapping run data: %v", err)) 184 } 185 c.runData = runData 186 187 // Initialize architecture state. 188 if err := c.initArchState(); err != nil { 189 panic(fmt.Sprintf("error initialization vCPU state: %v", err)) 190 } 191 192 return c // Done. 193 } 194 195 // newMachine returns a new VM context. 196 func newMachine(vm int) (*machine, error) { 197 // Create the machine. 198 m := &machine{fd: vm} 199 m.available.L = &m.mu 200 201 // Pull the maximum vCPUs. 202 m.getMaxVCPU() 203 log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) 204 m.vCPUsByTID = make(map[uint64]*vCPU) 205 m.vCPUsByID = make([]*vCPU, m.maxVCPUs) 206 m.kernel.Init(m.maxVCPUs) 207 208 // Pull the maximum slots. 209 maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) 210 if errno != 0 { 211 m.maxSlots = _KVM_NR_MEMSLOTS 212 } else { 213 m.maxSlots = int(maxSlots) 214 } 215 log.Debugf("The maximum number of slots is %d.", m.maxSlots) 216 m.usedSlots = make([]uintptr, m.maxSlots) 217 218 // Check TSC Scaling 219 hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL) 220 m.tscControl = errno == 0 && hasTSCControl == 1 221 log.Debugf("TSC scaling support: %t.", m.tscControl) 222 223 // Create the upper shared pagetables and kernel(sentry) pagetables. 224 m.upperSharedPageTables = pagetables.New(newAllocator()) 225 m.mapUpperHalf(m.upperSharedPageTables) 226 m.upperSharedPageTables.Allocator.(*allocator).base.Drain() 227 m.upperSharedPageTables.MarkReadOnlyShared() 228 m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) 229 230 // Apply the physical mappings. Note that these mappings may point to 231 // guest physical addresses that are not actually available. These 232 // physical pages are mapped on demand, see kernel_unsafe.go. 233 applyPhysicalRegions(func(pr physicalRegion) bool { 234 // Map everything in the lower half. 235 m.kernel.PageTables.Map( 236 hostarch.Addr(pr.virtual), 237 pr.length, 238 pagetables.MapOpts{AccessType: hostarch.AnyAccess}, 239 pr.physical) 240 241 return true // Keep iterating. 242 }) 243 244 var physicalRegionsReadOnly []physicalRegion 245 var physicalRegionsAvailable []physicalRegion 246 247 physicalRegionsReadOnly = rdonlyRegionsForSetMem() 248 physicalRegionsAvailable = availableRegionsForSetMem() 249 250 // Map all read-only regions. 251 for _, r := range physicalRegionsReadOnly { 252 m.mapPhysical(r.physical, r.length, physicalRegionsReadOnly, _KVM_MEM_READONLY) 253 } 254 255 // Ensure that the currently mapped virtual regions are actually 256 // available in the VM. Note that this doesn't guarantee no future 257 // faults, however it should guarantee that everything is available to 258 // ensure successful vCPU entry. 259 applyVirtualRegions(func(vr virtualRegion) { 260 if excludeVirtualRegion(vr) { 261 return // skip region. 262 } 263 264 for _, r := range physicalRegionsReadOnly { 265 if vr.virtual == r.virtual { 266 return 267 } 268 } 269 270 for virtual := vr.virtual; virtual < vr.virtual+vr.length; { 271 physical, length, ok := translateToPhysical(virtual) 272 if !ok { 273 // This must be an invalid region that was 274 // knocked out by creation of the physical map. 275 return 276 } 277 if virtual+length > vr.virtual+vr.length { 278 // Cap the length to the end of the area. 279 length = vr.virtual + vr.length - virtual 280 } 281 282 // Ensure the physical range is mapped. 283 m.mapPhysical(physical, length, physicalRegionsAvailable, _KVM_MEM_FLAGS_NONE) 284 virtual += length 285 } 286 }) 287 288 // Initialize architecture state. 289 if err := m.initArchState(); err != nil { 290 m.Destroy() 291 return nil, err 292 } 293 294 // Ensure the machine is cleaned up properly. 295 runtime.SetFinalizer(m, (*machine).Destroy) 296 return m, nil 297 } 298 299 // hasSlot returns true iff the given address is mapped. 300 // 301 // This must be done via a linear scan. 302 // 303 //go:nosplit 304 func (m *machine) hasSlot(physical uintptr) bool { 305 for i := 0; i < len(m.usedSlots); i++ { 306 if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { 307 return true 308 } 309 } 310 return false 311 } 312 313 // mapPhysical checks for the mapping of a physical range, and installs one if 314 // not available. This attempts to be efficient for calls in the hot path. 315 // 316 // This panics on error. 317 // 318 //go:nosplit 319 func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) { 320 for end := physical + length; physical < end; { 321 _, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions) 322 if !ok { 323 // Should never happen. 324 panic("mapPhysical on unknown physical address") 325 } 326 327 // Is this already mapped? Check the usedSlots. 328 if !m.hasSlot(physicalStart) { 329 if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok { 330 panic("handleBluepillFault failed") 331 } 332 } 333 334 // Move to the next chunk. 335 physical = physicalStart + length 336 } 337 } 338 339 // Destroy frees associated resources. 340 // 341 // Destroy should only be called once all active users of the machine are gone. 342 // The machine object should not be used after calling Destroy. 343 // 344 // Precondition: all vCPUs must be returned to the machine. 345 func (m *machine) Destroy() { 346 runtime.SetFinalizer(m, nil) 347 348 // Destroy vCPUs. 349 for _, c := range m.vCPUsByID { 350 if c == nil { 351 continue 352 } 353 354 // Ensure the vCPU is not still running in guest mode. This is 355 // possible iff teardown has been done by other threads, and 356 // somehow a single thread has not executed any system calls. 357 c.BounceToHost() 358 359 // Note that the runData may not be mapped if an error occurs 360 // during the middle of initialization. 361 if c.runData != nil { 362 if err := unmapRunData(c.runData); err != nil { 363 panic(fmt.Sprintf("error unmapping rundata: %v", err)) 364 } 365 } 366 if err := unix.Close(int(c.fd)); err != nil { 367 panic(fmt.Sprintf("error closing vCPU fd: %v", err)) 368 } 369 } 370 371 // vCPUs are gone: teardown machine state. 372 if err := unix.Close(m.fd); err != nil { 373 panic(fmt.Sprintf("error closing VM fd: %v", err)) 374 } 375 } 376 377 // Get gets an available vCPU. 378 // 379 // This will return with the OS thread locked. 380 // 381 // It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points 382 // to the vCPU in which the OS thread TID is running. So if Get() returns with 383 // the corrent context in guest, the vCPU of it must be the same as what 384 // Get() returns. 385 func (m *machine) Get() *vCPU { 386 m.mu.RLock() 387 runtime.LockOSThread() 388 tid := procid.Current() 389 390 // Check for an exact match. 391 if c := m.vCPUsByTID[tid]; c != nil { 392 c.lock() 393 m.mu.RUnlock() 394 return c 395 } 396 397 // The happy path failed. We now proceed to acquire an exclusive lock 398 // (because the vCPU map may change), and scan all available vCPUs. 399 // In this case, we first unlock the OS thread. Otherwise, if mu is 400 // not available, the current system thread will be parked and a new 401 // system thread spawned. We avoid this situation by simply refreshing 402 // tid after relocking the system thread. 403 m.mu.RUnlock() 404 runtime.UnlockOSThread() 405 m.mu.Lock() 406 runtime.LockOSThread() 407 tid = procid.Current() 408 409 // Recheck for an exact match. 410 if c := m.vCPUsByTID[tid]; c != nil { 411 c.lock() 412 m.mu.Unlock() 413 return c 414 } 415 416 for { 417 // Scan for an available vCPU. 418 for origTID, c := range m.vCPUsByTID { 419 if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { 420 delete(m.vCPUsByTID, origTID) 421 m.vCPUsByTID[tid] = c 422 m.mu.Unlock() 423 c.loadSegments(tid) 424 return c 425 } 426 } 427 428 // Get a new vCPU (maybe). 429 if c := m.getNewVCPU(); c != nil { 430 c.lock() 431 m.vCPUsByTID[tid] = c 432 m.mu.Unlock() 433 c.loadSegments(tid) 434 return c 435 } 436 437 // Scan for something not in user mode. 438 for origTID, c := range m.vCPUsByTID { 439 if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) { 440 continue 441 } 442 443 // The vCPU is not be able to transition to 444 // vCPUGuest|vCPUWaiter or to vCPUUser because that 445 // transition requires holding the machine mutex, as we 446 // do now. There is no path to register a waiter on 447 // just the vCPUReady state. 448 for { 449 c.waitUntilNot(vCPUGuest | vCPUWaiter) 450 if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { 451 break 452 } 453 } 454 455 // Steal the vCPU. 456 delete(m.vCPUsByTID, origTID) 457 m.vCPUsByTID[tid] = c 458 m.mu.Unlock() 459 c.loadSegments(tid) 460 return c 461 } 462 463 // Everything is executing in user mode. Wait until something 464 // is available. Note that signaling the condition variable 465 // will have the extra effect of kicking the vCPUs out of guest 466 // mode if that's where they were. 467 m.available.Wait() 468 } 469 } 470 471 // Put puts the current vCPU. 472 func (m *machine) Put(c *vCPU) { 473 c.unlock() 474 runtime.UnlockOSThread() 475 476 m.mu.RLock() 477 m.available.Signal() 478 m.mu.RUnlock() 479 } 480 481 // newDirtySet returns a new dirty set. 482 func (m *machine) newDirtySet() *dirtySet { 483 return &dirtySet{ 484 vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), 485 } 486 } 487 488 // dropPageTables drops cached page table entries. 489 func (m *machine) dropPageTables(pt *pagetables.PageTables) { 490 m.mu.Lock() 491 defer m.mu.Unlock() 492 493 // Clear from all PCIDs. 494 for _, c := range m.vCPUsByID { 495 if c != nil && c.PCIDs != nil { 496 c.PCIDs.Drop(pt) 497 } 498 } 499 } 500 501 // lock marks the vCPU as in user mode. 502 // 503 // This should only be called directly when known to be safe, i.e. when 504 // the vCPU is owned by the current TID with no chance of theft. 505 // 506 //go:nosplit 507 func (c *vCPU) lock() { 508 atomicbitops.OrUint32(&c.state, vCPUUser) 509 } 510 511 // unlock clears the vCPUUser bit. 512 // 513 //go:nosplit 514 func (c *vCPU) unlock() { 515 if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) { 516 // Happy path: no exits are forced, and we can continue 517 // executing on our merry way with a single atomic access. 518 return 519 } 520 521 // Clear the lock. 522 origState := atomic.LoadUint32(&c.state) 523 atomicbitops.AndUint32(&c.state, ^vCPUUser) 524 switch origState { 525 case vCPUUser: 526 // Normal state. 527 case vCPUUser | vCPUGuest | vCPUWaiter: 528 // Force a transition: this must trigger a notification when we 529 // return from guest mode. We must clear vCPUWaiter here 530 // anyways, because BounceToKernel will force a transition only 531 // from ring3 to ring0, which will not clear this bit. Halt may 532 // workaround the issue, but if there is no exception or 533 // syscall in this period, BounceToKernel will hang. 534 atomicbitops.AndUint32(&c.state, ^vCPUWaiter) 535 c.notify() 536 case vCPUUser | vCPUWaiter: 537 // Waiting for the lock to be released; the responsibility is 538 // on us to notify the waiter and clear the associated bit. 539 atomicbitops.AndUint32(&c.state, ^vCPUWaiter) 540 c.notify() 541 default: 542 panic("invalid state") 543 } 544 } 545 546 // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. 547 // 548 //go:nosplit 549 func (c *vCPU) NotifyInterrupt() { 550 c.BounceToKernel() 551 } 552 553 // pid is used below in bounce. 554 var pid = unix.Getpid() 555 556 // bounce forces a return to the kernel or to host mode. 557 // 558 // This effectively unwinds the state machine. 559 func (c *vCPU) bounce(forceGuestExit bool) { 560 origGuestExits := atomic.LoadUint64(&c.guestExits) 561 origUserExits := atomic.LoadUint64(&c.userExits) 562 for { 563 switch state := atomic.LoadUint32(&c.state); state { 564 case vCPUReady, vCPUWaiter: 565 // There is nothing to be done, we're already in the 566 // kernel pre-acquisition. The Bounce criteria have 567 // been satisfied. 568 return 569 case vCPUUser: 570 // We need to register a waiter for the actual guest 571 // transition. When the transition takes place, then we 572 // can inject an interrupt to ensure a return to host 573 // mode. 574 atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) 575 case vCPUUser | vCPUWaiter: 576 // Wait for the transition to guest mode. This should 577 // come from the bluepill handler. 578 c.waitUntilNot(state) 579 case vCPUGuest, vCPUUser | vCPUGuest: 580 if state == vCPUGuest && !forceGuestExit { 581 // The vCPU is already not acquired, so there's 582 // no need to do a fresh injection here. 583 return 584 } 585 // The vCPU is in user or kernel mode. Attempt to 586 // register a notification on change. 587 if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) { 588 break // Retry. 589 } 590 for { 591 // We need to spin here until the signal is 592 // delivered, because Tgkill can return EAGAIN 593 // under memory pressure. Since we already 594 // marked ourselves as a waiter, we need to 595 // ensure that a signal is actually delivered. 596 if err := unix.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil { 597 break 598 } else if err.(unix.Errno) == unix.EAGAIN { 599 continue 600 } else { 601 // Nothing else should be returned by tgkill. 602 panic(fmt.Sprintf("unexpected tgkill error: %v", err)) 603 } 604 } 605 case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: 606 if state == vCPUGuest|vCPUWaiter && !forceGuestExit { 607 // See above. 608 return 609 } 610 // Wait for the transition. This again should happen 611 // from the bluepill handler, but on the way out. 612 c.waitUntilNot(state) 613 default: 614 // Should not happen: the above is exhaustive. 615 panic("invalid state") 616 } 617 618 // Check if we've missed the state transition, but 619 // we can safely return at this point in time. 620 newGuestExits := atomic.LoadUint64(&c.guestExits) 621 newUserExits := atomic.LoadUint64(&c.userExits) 622 if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { 623 return 624 } 625 } 626 } 627 628 // BounceToKernel ensures that the vCPU bounces back to the kernel. 629 // 630 //go:nosplit 631 func (c *vCPU) BounceToKernel() { 632 c.bounce(false) 633 } 634 635 // BounceToHost ensures that the vCPU is in host mode. 636 // 637 //go:nosplit 638 func (c *vCPU) BounceToHost() { 639 c.bounce(true) 640 } 641 642 // setSystemTimeLegacy calibrates and sets an approximate system time. 643 func (c *vCPU) setSystemTimeLegacy() error { 644 const minIterations = 10 645 minimum := uint64(0) 646 for iter := 0; ; iter++ { 647 // Try to set the TSC to an estimate of where it will be 648 // on the host during a "fast" system call iteration. 649 start := uint64(ktime.Rdtsc()) 650 if err := c.setTSC(start + (minimum / 2)); err != nil { 651 return err 652 } 653 // See if this is our new minimum call time. Note that this 654 // serves two functions: one, we make sure that we are 655 // accurately predicting the offset we need to set. Second, we 656 // don't want to do the final set on a slow call, which could 657 // produce a really bad result. 658 end := uint64(ktime.Rdtsc()) 659 if end < start { 660 continue // Totally bogus: unstable TSC? 661 } 662 current := end - start 663 if current < minimum || iter == 0 { 664 minimum = current // Set our new minimum. 665 } 666 // Is this past minIterations and within ~10% of minimum? 667 upperThreshold := (((minimum << 3) + minimum) >> 3) 668 if iter >= minIterations && current <= upperThreshold { 669 return nil 670 } 671 } 672 }