github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package kvm

import (
	"fmt"
	"math/big"
	"reflect"
	"runtime/debug"

	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/ring0"
	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/time"
)

// initArchState initializes architecture-specific state.
func (m *machine) initArchState() error {
	// Set the legacy TSS address. This address is covered by the reserved
	// range (up to 4GB). In fact, this is the main reason it exists.
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		_KVM_SET_TSS_ADDR,
		uintptr(reservedMemory-(3*hostarch.PageSize))); errno != 0 {
		return errno
	}

	// Enable CPUID faulting, if possible. Note that this also serves as a
	// basic platform sanity test, since we will enter guest mode for the
	// first time here. The recovery is necessary, since if we fail to read
	// the platform info register, we will return to host mode and
	// ultimately need to handle a segmentation fault.
	old := debug.SetPanicOnFault(true)
	defer func() {
		recover()
		debug.SetPanicOnFault(old)
	}()
	c := m.Get()
	defer m.Put(c)
	bluepill(c)
	ring0.SetCPUIDFaulting(true)

	return nil
}

type machineArchState struct {
}

type vCPUArchState struct {
	// PCIDs is the set of PCIDs for this vCPU.
	//
	// This starts above fixedKernelPCID.
	PCIDs *pagetables.PCIDs

	// floatingPointState is the floating point state buffer used in guest
	// to host transitions. See usage in bluepill_amd64.go.
	floatingPointState fpu.State
}

const (
	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
	// tables. We must start allocating user PCIDs above this in order to
	// avoid any conflict (see below).
	fixedKernelPCID = 1

	// poolPCIDs is the number of PCIDs to record in the database. As this
	// grows, assignment can take longer, since it is a simple linear scan.
	// Beyond a relatively small number, there are likely few performance
	// benefits, since the TLB has likely long since lost any translations
	// from more than a few PCIDs past.
	poolPCIDs = 8
)

// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
	var (
		kernelSystemRegs systemRegs
		kernelUserRegs   userRegs
	)

	// Set base control registers.
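	//
	// Note: these values come from the ring0 package. Roughly, CR0 turns
	// on protected mode and paging, CR4 enables the paging/feature bits
	// the guest needs (PAE and, where supported, bits such as PCIDE and
	// FSGSBASE), and EFER enables long mode, SYSCALL and NX. The exact bit
	// sets are defined in ring0 and are validated by KVM against the CPUID
	// configured below.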
	kernelSystemRegs.CR0 = c.CR0()
	kernelSystemRegs.CR4 = c.CR4()
	kernelSystemRegs.EFER = c.EFER()

	// Set the IDT & GDT in the registers.
	kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
	kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
	kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
	kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
	kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
	tssBase, tssLimit, tss := c.TSS()
	kernelSystemRegs.TR.Load(tss, ring0.Tss)
	kernelSystemRegs.TR.base = tssBase
	kernelSystemRegs.TR.limit = uint32(tssLimit)

	// Point to kernel page tables, with no initial PCID.
	kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)

	// Initialize the PCID database.
	if hasGuestPCID {
		// Note that NewPCIDs may return a nil table here, in which
		// case we simply don't use PCID support (see below). In
		// practice, this should not happen, however.
		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
	}

	// Set the CPUID; this is required before setting system registers,
	// since KVM will reject several CR4 bits if the CPUID does not
	// indicate the support is available.
	if err := c.setCPUID(); err != nil {
		return err
	}

	// Set the entrypoint for the kernel.
	kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
	kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
	kernelUserRegs.RSP = c.StackTop()
	kernelUserRegs.RFLAGS = ring0.KernelFlagsSet

	// Set the system registers.
	if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
		return err
	}

	// Set the user registers.
	if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 {
		return fmt.Errorf("error setting user registers: %v", errno)
	}

	// Allocate some floating point state save area for the local vCPU.
	// This will be saved prior to leaving the guest, and we restore from
	// this always. We cannot use the pointer in the context alone because
	// we don't know how large the area there is in reality.
	c.floatingPointState = fpu.NewState()

	// Set the time offset to the host native time.
	return c.setSystemTime()
}

// bitsForScaling returns the bits available for storing the fraction component
// of the TSC scaling ratio. This allows us to replicate the (bad) math done by
// the kernel below in scaledTSC, and ensure we can compute an exact zero
// offset in setSystemTime.
//
// These constants correspond to kvm_tsc_scaling_ratio_frac_bits.
var bitsForScaling = func() int64 {
	fs := cpuid.HostFeatureSet()
	if fs.Intel() {
		return 48 // See vmx.c (kvm sources).
	} else if fs.AMD() {
		return 32 // See svm.c (svm sources).
	} else {
		return 63 // Unknown: theoretical maximum.
	}
}()

// scaledTSC returns the host TSC scaled by the given frequency.
//
// This assumes a current frequency of 1. We require only the unitless ratio of
// rawFreq to some current frequency. See setSystemTime for context.
//
// The kernel math guarantees that all bits of the multiplication and division
// will be correctly preserved and applied. However, it is not possible to
// actually store the ratio correctly. So we need to use the same scheme in
// order to calculate the scaled frequency and get the same result.
//
// We can assume that the current frequency is (1), so we are calculating a
// strict inverse of this value. This simplifies this function considerably.
//
// Roughly, the returned value "scaledTSC" will have:
// scaledTSC/hostTSC == 1/rawFreq
//
//go:nosplit
func scaledTSC(rawFreq uintptr) int64 {
	scale := int64(1 << bitsForScaling)
	ratio := big.NewInt(scale / int64(rawFreq))
	ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc())))
	ratio.Div(ratio, big.NewInt(scale))
	return ratio.Int64()
}

// setSystemTime sets the vCPU to the system time.
func (c *vCPU) setSystemTime() error {
	// First, scale down the clock frequency to the lowest value allowed by
	// the API itself. How low we can go depends on the underlying
	// hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD.
	// Even the lower bound here will take a 4GHz frequency down to 1Hz,
	// meaning that everything should be able to handle a KHz setting of 1
	// with bits to spare.
	//
	// Note that reducing the clock does not typically require special
	// capabilities as it is emulated in KVM. We don't actually use this
	// capability, but it means that this method should be robust to
	// different hardware configurations.

	// If TSC scaling is not supported, fall back to legacy mode.
	if !c.machine.tscControl {
		return c.setSystemTimeLegacy()
	}
	rawFreq, err := c.getTSCFreq()
	if err != nil {
		return c.setSystemTimeLegacy()
	}
	if err := c.setTSCFreq(1); err != nil {
		return c.setSystemTimeLegacy()
	}

	// Always restore the original frequency.
	defer func() {
		if err := c.setTSCFreq(rawFreq); err != nil {
			panic(err.Error())
		}
	}()

	// Attempt to set the system time in this compressed world. The
	// calculation for offset normally looks like:
	//
	//	offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc());
	//
	// So as long as the kvm_scale_tsc component is constant before and
	// after the call to set the TSC value (and it is passed as the
	// target_tsc), we will compute an offset value of zero.
	//
	// This is effectively cheating to make our "setSystemTime" call so
	// unbelievably, incredibly fast that we do it "instantly" and all the
	// calculations result in an offset of zero.
	lastTSC := scaledTSC(rawFreq)
	for {
		if err := c.setTSC(uint64(lastTSC)); err != nil {
			return err
		}
		nextTSC := scaledTSC(rawFreq)
		if lastTSC == nextTSC {
			return nil
		}
		lastTSC = nextTSC // Try again.
	}
}

// nonCanonical generates the fault return for a non-canonical address.
//
//go:nosplit
func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	*info = linux.SignalInfo{
		Signo: signal,
		Code:  linux.SI_KERNEL,
	}
	info.SetAddr(addr) // Include address.
	return hostarch.NoAccess, platform.ErrContextSignal
}

// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	bluepill(c) // Probably no-op, but may not be.
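	// CR2 holds the faulting virtual address of the most recent page
	// fault, and reading it is a privileged operation, so it is only
	// accessible from guest ring 0; the bluepill call above ensures we
	// are actually in guest mode before ReadCR2 below.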
	faultAddr := ring0.ReadCR2()
	code, user := c.ErrorCode()
	if !user {
		// The last fault serviced by this CPU was not a user
		// fault, so we can't reliably trust the faultAddr or
		// the code provided here. We need to re-execute.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	}
	// Reset the pointed SignalInfo.
	*info = linux.SignalInfo{Signo: signal}
	info.SetAddr(uint64(faultAddr))
	accessType := hostarch.AccessType{
		Read:    code&(1<<1) == 0,
		Write:   code&(1<<1) != 0,
		Execute: code&(1<<4) != 0,
	}
	if !accessType.Write && !accessType.Execute {
		info.Code = 1 // SEGV_MAPERR.
	} else {
		info.Code = 2 // SEGV_ACCERR.
	}
	return accessType, platform.ErrContextSignal
}

//go:nosplit
//go:noinline
func loadByte(ptr *byte) byte {
	return *ptr
}

// prefaultFloatingPointState touches each page of the floating point state to
// be sure that its physical pages are mapped.
//
// Otherwise the kernel can trigger a KVM_EXIT_MMIO: the instruction that
// triggered the fault would then be emulated by KVM's kernel code, but it
// cannot emulate instructions like xsave and xrstor.
//
//go:nosplit
func prefaultFloatingPointState(data *fpu.State) {
	size := len(*data)
	for i := 0; i < size; i += hostarch.PageSize {
		loadByte(&(*data)[i])
	}
	loadByte(&(*data)[size-1])
}

// SwitchToUser unpacks architectural details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) {
		return nonCanonical(regs.Rip, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Rsp) {
		return nonCanonical(regs.Rsp, int32(unix.SIGBUS), info)
	} else if !ring0.IsCanonical(regs.Fs_base) {
		return nonCanonical(regs.Fs_base, int32(unix.SIGBUS), info)
	} else if !ring0.IsCanonical(regs.Gs_base) {
		return nonCanonical(regs.Gs_base, int32(unix.SIGBUS), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.KernelPCID = fixedKernelPCID
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	// See below.
	var vector ring0.Vector

	// Past this point, stack growth can cause system calls (and a break
	// from guest mode). So we need to ensure that between the bluepill
	// call here and the switch call immediately below, no additional
	// allocations occur.
	entersyscall()
	bluepill(c)
	// The root table physical page has to be mapped to not fault in iret
	// or sysret after switching into a user address space. sysret and
	// iret are in the upper half that is global and already mapped.
	switchOpts.PageTables.PrefaultRootTable()
	prefaultFloatingPointState(switchOpts.FloatingPointState)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()

	switch vector {
	case ring0.Syscall, ring0.SyscallInt80:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil

	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)

	case ring0.Debug, ring0.Breakpoint:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.GeneralProtectionFault,
		ring0.SegmentNotPresent,
		ring0.BoundRangeExceeded,
		ring0.InvalidTSS,
		ring0.StackSegmentFault:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGSEGV),
			Code:  linux.SI_KERNEL,
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		if vector == ring0.GeneralProtectionFault {
			// When CPUID faulting is enabled, we will generate a #GP(0) when
			// userspace executes a CPUID instruction. This is handled above,
			// because we need to be able to map and read user memory.
			return hostarch.AccessType{}, platform.ErrContextSignalCPUID
		}
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.InvalidOpcode:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGILL),
			Code:  1, // ILL_ILLOPC (illegal opcode).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.DivideByZero:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  1, // FPE_INTDIV (divide by zero).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.Overflow:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  2, // FPE_INTOVF (integer overflow).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.X87FloatingPointException,
		ring0.SIMDFloatingPointException:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  7, // FPE_FLTINV (invalid operation).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.Vector(bounce): // ring0.VirtualizationException
		return hostarch.NoAccess, platform.ErrContextInterrupt

	case ring0.AlignmentCheck:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal

	case ring0.NMI:
		// An NMI is generated only when a fault is not serviceable by
		// KVM itself, so we think some mapping is writable but it's
		// really not. This could happen, e.g. if some file is
		// truncated (and would generate a SIGBUS) and we map it
		// directly into the instance.
		return c.fault(int32(unix.SIGBUS), info)

	case ring0.DeviceNotAvailable,
		ring0.DoubleFault,
		ring0.CoprocessorSegmentOverrun,
		ring0.MachineCheck,
		ring0.SecurityException:
		fallthrough
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}
}

// On the x86 platform, the flags for "setMemoryRegion" can always be set as 0.
// There is no need to return read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
	return nil
}

func availableRegionsForSetMem() (phyRegions []physicalRegion) {
	return physicalRegions
}

func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
	// Map all the executable regions so that all the entry functions
	// are mapped in the upper half.
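	//
	// The upper half is the kernel half of the address space (addresses
	// with ring0.KernelStartAddress set). Its entries are marked Global
	// and are shared with the user page tables, so the entry/exception
	// code stays mapped after CR3 is switched to a user address space
	// (see the iret/sysret comment in SwitchToUser).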
	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
			return
		}

		if vr.accessType.Execute {
			r := vr.region
			physical, length, ok := translateToPhysical(r.virtual)
			if !ok || length < r.length {
				panic("impossible translation")
			}
			pageTable.Map(
				hostarch.Addr(ring0.KernelStartAddress|r.virtual),
				r.length,
				pagetables.MapOpts{AccessType: hostarch.Execute, Global: true},
				physical)
		}
	})
	for start, end := range m.kernel.EntryRegions() {
		regionLen := end - start
		physical, length, ok := translateToPhysical(start)
		if !ok || length < regionLen {
			panic("impossible translation")
		}
		pageTable.Map(
			hostarch.Addr(ring0.KernelStartAddress|start),
			regionLen,
			pagetables.MapOpts{AccessType: hostarch.ReadWrite, Global: true},
			physical)
	}
}

// getMaxVCPU gets the maximum number of vCPUs.
func (m *machine) getMaxVCPU() {
	maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
	if errno != 0 {
		m.maxVCPUs = _KVM_NR_VCPUS
	} else {
		m.maxVCPUs = int(maxVCPUs)
	}
}

// getNewVCPU creates a new vCPU (maybe).
func (m *machine) getNewVCPU() *vCPU {
	if int(m.nextID) < m.maxVCPUs {
		c := m.newVCPU()
		return c
	}
	return nil
}
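
// The sketch below is illustrative only and is not part of the original file:
// it restates the fixed-point scheme that scaledTSC replicates from KVM, with
// the TSC value passed in explicitly instead of read via ktime.Rdtsc. The
// scaling ratio is stored with bitsForScaling fractional bits, so the result
// is (tsc * (2^bits / rawFreq)) / 2^bits; the truncation in the first division
// is the precision loss that forces us to mirror the kernel's math exactly.
// The function name and the explicit tsc parameter are hypothetical.
func exampleScaledTSC(tsc int64, rawFreq uintptr) int64 {
	scale := int64(1 << bitsForScaling)         // Fixed-point "one".
	ratio := big.NewInt(scale / int64(rawFreq)) // 1/rawFreq, truncated to the fraction bits.
	ratio.Mul(ratio, big.NewInt(tsc))           // tsc * (1/rawFreq), still in fixed point.
	ratio.Div(ratio, big.NewInt(scale))         // Drop the fraction bits, as KVM does.
	return ratio.Int64()
}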