github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/kvm/machine_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64
// +build amd64

package kvm

import (
	"fmt"
	"math/big"
	"reflect"
	"runtime"
	"runtime/debug"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/cpuid"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/ring0"
	"github.com/MerlinKodo/gvisor/pkg/ring0/pagetables"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/time"
	"golang.org/x/sys/unix"
)

// initArchState initializes architecture-specific state.
func (m *machine) initArchState() error {
	// Set the legacy TSS address. This address is covered by the reserved
	// range (up to 4GB). In fact, this is the main reason it exists.
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		KVM_SET_TSS_ADDR,
		uintptr(reservedMemory-(3*hostarch.PageSize))); errno != 0 {
		return errno
	}

	// Initialize all vCPUs up front to minimize the set of KVM ioctls
	// that must be allowed by the seccomp filters.
	m.mu.Lock()
	for i := 0; i < m.maxVCPUs; i++ {
		m.createVCPU(i)
	}
	m.mu.Unlock()

	c := m.Get()
	defer m.Put(c)
	// Enable CPUID faulting, if possible. Note that this also serves as a
	// basic platform sanity test, since we will enter guest mode for the
	// first time here. The deferred recover is necessary: if we fail to
	// read the platform info register, we will return to host mode and
	// ultimately need to handle a segmentation fault.
	old := debug.SetPanicOnFault(true)
	defer func() {
		recover()
		debug.SetPanicOnFault(old)
	}()

	bluepill(c)
	ring0.SetCPUIDFaulting(true)

	return nil
}

type vCPUArchState struct {
	// PCIDs is the set of PCIDs for this vCPU.
	//
	// This starts above fixedKernelPCID.
	PCIDs *pagetables.PCIDs
}

const (
	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
	// tables. We must start allocating user PCIDs above this in order to
	// avoid any conflict (see below).
	fixedKernelPCID = 1

	// poolPCIDs is the number of PCIDs to record in the database. As this
	// grows, assignment can take longer, since it is a simple linear scan.
	// Beyond a relatively small number, there are likely few performance
	// benefits, since the TLB has likely long since lost any translations
	// from more than a few PCIDs back.
	poolPCIDs = 8
)

// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
	var (
		kernelSystemRegs systemRegs
		kernelUserRegs   userRegs
	)

	// Set base control registers.
	kernelSystemRegs.CR0 = c.CR0()
	kernelSystemRegs.CR4 = c.CR4()
	kernelSystemRegs.EFER = c.EFER()

	// Set the IDT & GDT in the registers.
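	// Note that CS/SS are loaded with the ring0 kernel code and data
	// segments, while the remaining selectors point at the user data
	// segment; the TSS loaded below supplies the kernel stack pointers
	// (RSP0/IST) used when trapping from user to kernel mode.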
	kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
	kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
	kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
	kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
	kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
	tssBase, tssLimit, tss := c.TSS()
	kernelSystemRegs.TR.Load(tss, ring0.Tss)
	kernelSystemRegs.TR.base = tssBase
	kernelSystemRegs.TR.limit = uint32(tssLimit)

	// Point to kernel page tables, with no initial PCID.
	kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)

	// Initialize the PCID database.
	if hasGuestPCID {
		// Note that NewPCIDs may return a nil table here, in which
		// case we simply don't use PCID support (see below). In
		// practice, this should not happen, however.
		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
	}

	// Set the CPUID; this is required before setting system registers,
	// since KVM will reject several CR4 bits if the CPUID does not
	// indicate the support is available.
	if err := c.setCPUID(); err != nil {
		return err
	}

	// Set the entrypoint for the kernel.
	kernelUserRegs.RIP = uint64(ring0.AddrOfStart())
	kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
	kernelUserRegs.RSP = c.StackTop()
	kernelUserRegs.RFLAGS = ring0.KernelFlagsSet

	// Set the system registers.
	if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
		return err
	}

	// Set the user registers.
	if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 {
		return fmt.Errorf("error setting user registers: %v", errno)
	}

	// Set the time offset to the host native time.
	return c.setSystemTime()
}

// bitsForScaling is the number of bits available for storing the fraction
// component of the TSC scaling ratio.
// It is set using getBitsForScaling when the KVM platform is initialized.
var bitsForScaling int64

// getBitsForScaling returns the bits available for storing the fraction
// component of the TSC scaling ratio. This allows us to replicate the (bad)
// math done by the kernel below in scaledTSC, and ensure we can compute an
// exact zero offset in setSystemTime.
//
// These constants correspond to kvm_tsc_scaling_ratio_frac_bits.
func getBitsForScaling() int64 {
	fs := cpuid.HostFeatureSet()
	if fs.Intel() {
		return 48 // See vmx.c (kvm sources).
	} else if fs.AMD() {
		return 32 // See svm.c (kvm sources).
	} else {
		return 63 // Unknown: theoretical maximum.
	}
}

// scaledTSC returns the host TSC scaled by the given frequency.
//
// This assumes a current frequency of 1. We require only the unitless ratio
// of rawFreq to some current frequency. See setSystemTime for context.
//
// The kernel math guarantees that all bits of the multiplication and division
// will be correctly preserved and applied. However, it is not possible to
// actually store the ratio correctly. So we need to use the same schema in
// order to calculate the scaled frequency and get the same result.
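//
// Concretely (a sketch of the kernel math, assuming the ratio is built as in
// kvm_set_tsc_khz and applied as in kvm_scale_tsc):
//
//	ratio  = (guest_khz << frac_bits) / host_khz  // truncating division
//	scaled = (host_tsc * ratio) >> frac_bits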
//
// We can assume that the current frequency is (1), so we are calculating a
// strict inverse of this value. This simplifies this function considerably.
//
// Roughly, the returned value "scaledTSC" will have:
//
//	scaledTSC/hostTSC == 1/rawFreq
//
//go:nosplit
func scaledTSC(rawFreq uintptr) int64 {
	scale := int64(1 << bitsForScaling)
	ratio := big.NewInt(scale / int64(rawFreq))
	ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc())))
	ratio.Div(ratio, big.NewInt(scale))
	return ratio.Int64()
}

// setSystemTime sets the vCPU to the system time.
func (c *vCPU) setSystemTime() error {
	// Attempt to set the offset directly. This is supported as of Linux 5.16,
	// or commit 828ca89628bfcb1b8f27535025f69dd00eb55207.
	if err := c.setTSCOffset(); err == nil {
		return err
	}

	// If TSC scaling is not supported, fall back to legacy mode.
	if !c.machine.tscControl {
		return c.setSystemTimeLegacy()
	}

	// First, scale down the clock frequency to the lowest value allowed by
	// the API itself. How low we can go depends on the underlying
	// hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD.
	// Even the lower bound here will take a 4GHz frequency down to 1Hz,
	// meaning that everything should be able to handle a KHz setting of 1
	// with bits to spare.
	//
	// Note that reducing the clock does not typically require special
	// capabilities as it is emulated in KVM. We don't actually use this
	// capability, but it means that this method should be robust to
	// different hardware configurations.
	rawFreq, err := c.getTSCFreq()
	if err != nil {
		return c.setSystemTimeLegacy()
	}
	if err := c.setTSCFreq(1); err != nil {
		return c.setSystemTimeLegacy()
	}

	// Always restore the original frequency.
	defer func() {
		if err := c.setTSCFreq(rawFreq); err != nil {
			panic(err.Error())
		}
	}()

	// Attempt to set the system time in this compressed world. The
	// calculation for offset normally looks like:
	//
	//	offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc());
	//
	// So as long as the kvm_scale_tsc component is constant before and
	// after the call to set the TSC value (and it is passed as the
	// target_tsc), we will compute an offset value of zero.
	//
	// This is effectively cheating to make our "setSystemTime" call so
	// unbelievably, incredibly fast that we do it "instantly" and all the
	// calculations result in an offset of zero.
	lastTSC := scaledTSC(rawFreq)
	for {
		if err := c.setTSC(uint64(lastTSC)); err != nil {
			return err
		}
		nextTSC := scaledTSC(rawFreq)
		if lastTSC == nextTSC {
			return nil
		}
		lastTSC = nextTSC // Try again.
	}
}

// nonCanonical generates a fault return for a non-canonical address.
//
//go:nosplit
func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	*info = linux.SignalInfo{
		Signo: signal,
		Code:  linux.SI_KERNEL,
	}
	info.SetAddr(addr) // Include address.
	return hostarch.NoAccess, platform.ErrContextSignal
}

// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	bluepill(c) // Probably no-op, but may not be.
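	// We must be in guest mode to read CR2: it is a privileged register,
	// and it holds the faulting address recorded by the last guest page
	// fault, which is what we report below.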
	faultAddr := ring0.ReadCR2()
	code, user := c.ErrorCode()
	if !user {
		// The last fault serviced by this CPU was not a user
		// fault, so we can't reliably trust the faultAddr or
		// the code provided here. We need to re-execute.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	}
	// Reset the pointed-to SignalInfo.
	*info = linux.SignalInfo{Signo: signal}
	info.SetAddr(uint64(faultAddr))
	accessType := hostarch.AccessType{}
	if signal == int32(unix.SIGSEGV) {
		accessType = hostarch.AccessType{
			Read:    code&(1<<1) == 0,
			Write:   code&(1<<1) != 0,
			Execute: code&(1<<4) != 0,
		}
	}
	if !accessType.Write && !accessType.Execute {
		info.Code = 1 // SEGV_MAPERR.
	} else {
		info.Code = 2 // SEGV_ACCERR.
	}
	return accessType, platform.ErrContextSignal
}

//go:nosplit
//go:noinline
func loadByte(ptr *byte) byte {
	return *ptr
}

// SwitchToUser unpacks architectural details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) {
		return nonCanonical(regs.Rip, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Rsp) {
		return nonCanonical(regs.Rsp, int32(unix.SIGBUS), info)
	} else if !ring0.IsCanonical(regs.Fs_base) {
		return nonCanonical(regs.Fs_base, int32(unix.SIGBUS), info)
	} else if !ring0.IsCanonical(regs.Gs_base) {
		return nonCanonical(regs.Gs_base, int32(unix.SIGBUS), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.KernelPCID = fixedKernelPCID
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	// See below.
	var vector ring0.Vector

	// Past this point, stack growth can cause system calls (and a break
	// from guest mode). So we need to ensure that between the bluepill
	// call here and the switch call immediately below, no additional
	// allocations occur.
	entersyscall()
	bluepill(c)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()

	switch vector {
	case ring0.Syscall, ring0.SyscallInt80:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil

	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)

	case ring0.Debug, ring0.Breakpoint:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.GeneralProtectionFault,
		ring0.SegmentNotPresent,
		ring0.BoundRangeExceeded,
		ring0.InvalidTSS,
		ring0.StackSegmentFault:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGSEGV),
			Code:  linux.SI_KERNEL,
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		if vector == ring0.GeneralProtectionFault {
			// When CPUID faulting is enabled, we will generate a #GP(0) when
			// userspace executes a CPUID instruction. This is handled above,
			// because we need to be able to map and read user memory.
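			// Returning tryCPUIDError lets the caller check whether the
			// faulting instruction is a CPUID and handle it before any
			// SIGSEGV is delivered.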
			return hostarch.AccessType{}, tryCPUIDError{}
		}
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.InvalidOpcode:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGILL),
			Code:  1, // ILL_ILLOPC (illegal opcode).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.DivideByZero:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  1, // FPE_INTDIV (divide by zero).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.Overflow:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  2, // FPE_INTOVF (integer overflow).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.X87FloatingPointException,
		ring0.SIMDFloatingPointException:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGFPE),
			Code:  7, // FPE_FLTINV (invalid operation).
		}
		info.SetAddr(switchOpts.Registers.Rip) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal

	case ring0.Vector(bounce): // ring0.VirtualizationException
		return hostarch.NoAccess, platform.ErrContextInterrupt

	case ring0.AlignmentCheck:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal

	case ring0.NMI:
		// An NMI is generated only when a fault is not serviceable by
		// KVM itself, so we think some mapping is writeable but it's
		// really not. This could happen, e.g. if some file is
		// truncated (and would generate a SIGBUS) and we map it
		// directly into the instance.
		return c.fault(int32(unix.SIGBUS), info)

	case ring0.DeviceNotAvailable,
		ring0.DoubleFault,
		ring0.CoprocessorSegmentOverrun,
		ring0.MachineCheck,
		ring0.SecurityException:
		fallthrough
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}
}

func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
	// Map all the executable regions so that all the entry functions
	// are mapped in the upper half.
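	// The regions are aliased at KernelStartAddress|virtual so that the
	// ring0 entry code stays reachable from the kernel's upper-half
	// address space, and the mappings are marked Global so address-space
	// switches do not flush them from the TLB.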
	if err := applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
			return
		}

		if vr.accessType.Execute {
			r := vr.region
			physical, length, ok := translateToPhysical(r.virtual)
			if !ok || length < r.length {
				panic("impossible translation")
			}
			pageTable.Map(
				hostarch.Addr(ring0.KernelStartAddress|r.virtual),
				r.length,
				pagetables.MapOpts{AccessType: hostarch.Execute, Global: true},
				physical)
		}
	}); err != nil {
		panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err))
	}
	for start, end := range m.kernel.EntryRegions() {
		regionLen := end - start
		physical, length, ok := translateToPhysical(start)
		if !ok || length < regionLen {
			panic("impossible translation")
		}
		pageTable.Map(
			hostarch.Addr(ring0.KernelStartAddress|start),
			regionLen,
			pagetables.MapOpts{AccessType: hostarch.ReadWrite, Global: true},
			physical)
	}
}

// getMaxVCPU queries KVM for the maximum number of vCPUs and sets m.maxVCPUs.
func (m *machine) getMaxVCPU() {
	maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
	if errno != 0 {
		m.maxVCPUs = _KVM_NR_VCPUS
	} else {
		m.maxVCPUs = int(maxVCPUs)
	}

	// The goal here is to avoid vCPU contention for reasonable workloads.
	// But "reasonable" isn't well defined in this case. Let's say that CPU
	// overcommit with a factor of 2 is still acceptable. We allocate one
	// set of vCPUs for each Go runtime processor (P) and two sets of
	// vCPUs to run user code.
	rCPUs := runtime.GOMAXPROCS(0)
	if 3*rCPUs < m.maxVCPUs {
		m.maxVCPUs = 3 * rCPUs
	}
}

func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
	return physicalRegions
}