github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/ring0/kernel_amd64.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build amd64 16 // +build amd64 17 18 package ring0 19 20 import ( 21 "encoding/binary" 22 "reflect" 23 24 "github.com/metacubex/gvisor/pkg/cpuid" 25 "github.com/metacubex/gvisor/pkg/hostarch" 26 "github.com/metacubex/gvisor/pkg/sentry/arch" 27 ) 28 29 // HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the 30 // value in regs. 31 func HaltAndWriteFSBase(regs *arch.Registers) 32 33 // init initializes architecture-specific state. 34 func (k *Kernel) init(maxCPUs int) { 35 entrySize := reflect.TypeOf(kernelEntry{}).Size() 36 var ( 37 entries []kernelEntry 38 padding = 1 39 ) 40 for { 41 entries = make([]kernelEntry, maxCPUs+padding-1) 42 totalSize := entrySize * uintptr(maxCPUs+padding-1) 43 addr := reflect.ValueOf(&entries[0]).Pointer() 44 if addr&(hostarch.PageSize-1) == 0 && totalSize >= hostarch.PageSize { 45 // The runtime forces power-of-2 alignment for allocations, and we are therefore 46 // safe once the first address is aligned and the chunk is at least a full page. 47 break 48 } 49 padding = padding << 1 50 } 51 k.cpuEntries = entries 52 53 k.globalIDT = &idt64{} 54 if reflect.TypeOf(idt64{}).Size() != hostarch.PageSize { 55 panic("Size of globalIDT should be PageSize") 56 } 57 if reflect.ValueOf(k.globalIDT).Pointer()&(hostarch.PageSize-1) != 0 { 58 panic("Allocated globalIDT should be page aligned") 59 } 60 61 // Setup the IDT, which is uniform. 62 for v, handler := range handlers { 63 // Allow Breakpoint and Overflow to be called from all 64 // privilege levels. 65 dpl := 0 66 if v == Breakpoint || v == Overflow { 67 dpl = 3 68 } 69 // Note that we set all traps to use the interrupt stack, this 70 // is defined below when setting up the TSS. 71 k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */) 72 } 73 } 74 75 // EntryRegions returns the set of kernel entry regions (must be mapped). 76 func (k *Kernel) EntryRegions() map[uintptr]uintptr { 77 regions := make(map[uintptr]uintptr) 78 79 addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer() 80 size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries)) 81 end, _ := hostarch.Addr(addr + size).RoundUp() 82 regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end) 83 84 addr = reflect.ValueOf(k.globalIDT).Pointer() 85 size = reflect.TypeOf(idt64{}).Size() 86 end, _ = hostarch.Addr(addr + size).RoundUp() 87 regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end) 88 89 return regions 90 } 91 92 // init initializes architecture-specific state. 93 func (c *CPU) init(cpuID int) { 94 c.kernelEntry = &c.kernel.cpuEntries[cpuID] 95 c.cpuSelf = c 96 // Null segment. 97 c.gdt[0].setNull() 98 99 // Kernel & user segments. 100 c.gdt[segKcode] = KernelCodeSegment 101 c.gdt[segKdata] = KernelDataSegment 102 c.gdt[segUcode32] = UserCodeSegment32 103 c.gdt[segUdata] = UserDataSegment 104 c.gdt[segUcode64] = UserCodeSegment64 105 106 // The task segment, this spans two entries. 107 tssBase, tssLimit, _ := c.TSS() 108 c.gdt[segTss].set( 109 uint32(tssBase), 110 uint32(tssLimit), 111 0, // Privilege level zero. 112 SegmentDescriptorPresent| 113 SegmentDescriptorAccess| 114 SegmentDescriptorWrite| 115 SegmentDescriptorExecute) 116 c.gdt[segTssHi].setHi(uint32((tssBase) >> 32)) 117 118 // Set the kernel stack pointer in the TSS (virtual address). 119 stackAddr := c.StackTop() 120 c.stackTop = stackAddr 121 c.tss.rsp0Lo = uint32(stackAddr) 122 c.tss.rsp0Hi = uint32(stackAddr >> 32) 123 c.tss.ist1Lo = uint32(stackAddr) 124 c.tss.ist1Hi = uint32(stackAddr >> 32) 125 126 // Set the I/O bitmap base address beyond the last byte in the TSS 127 // to block access to the entire I/O address range. 128 // 129 // From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1: 130 // I/O addresses not spanned by the map are treated as if they had set 131 // bits in the map. 132 c.tss.ioPerm = tssLimit + 1 133 134 // Permanently set the kernel segments. 135 c.registers.Cs = uint64(Kcode) 136 c.registers.Ds = uint64(Kdata) 137 c.registers.Es = uint64(Kdata) 138 c.registers.Ss = uint64(Kdata) 139 c.registers.Fs = uint64(Kdata) 140 c.registers.Gs = uint64(Kdata) 141 142 // Set mandatory flags. 143 c.registers.Eflags = KernelFlagsSet 144 145 c.hasXSAVE = hasXSAVE 146 c.hasXSAVEOPT = hasXSAVEOPT 147 c.hasFSGSBASE = hasFSGSBASE 148 } 149 150 // StackTop returns the kernel's stack address. 151 // 152 //go:nosplit 153 func (c *CPU) StackTop() uint64 { 154 return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) 155 } 156 157 // IDT returns the CPU's IDT base and limit. 158 // 159 //go:nosplit 160 func (c *CPU) IDT() (uint64, uint16) { 161 return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1) 162 } 163 164 // GDT returns the CPU's GDT base and limit. 165 // 166 //go:nosplit 167 func (c *CPU) GDT() (uint64, uint16) { 168 return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1) 169 } 170 171 // TSS returns the CPU's TSS base, limit and value. 172 // 173 //go:nosplit 174 func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { 175 return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss] 176 } 177 178 // CR0 returns the CPU's CR0 value. 179 // 180 //go:nosplit 181 func (c *CPU) CR0() uint64 { 182 return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE 183 } 184 185 // CR4 returns the CPU's CR4 value. 186 // 187 //go:nosplit 188 func (c *CPU) CR4() uint64 { 189 cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT) 190 if hasPCID { 191 cr4 |= _CR4_PCIDE 192 } 193 if hasXSAVE { 194 cr4 |= _CR4_OSXSAVE 195 } 196 if hasSMEP { 197 cr4 |= _CR4_SMEP 198 } 199 if hasSMAP { 200 cr4 |= _CR4_SMAP 201 } 202 if hasFSGSBASE { 203 cr4 |= _CR4_FSGSBASE 204 } 205 return cr4 206 } 207 208 // EFER returns the CPU's EFER value. 209 // 210 //go:nosplit 211 func (c *CPU) EFER() uint64 { 212 return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX 213 } 214 215 // IsCanonical indicates whether addr is canonical per the amd64 spec. 216 // 217 //go:nosplit 218 func IsCanonical(addr uint64) bool { 219 return addr <= 0x00007fffffffffff || addr >= 0xffff800000000000 220 } 221 222 // SwitchToUser performs either a sysret or an iret. 223 // 224 // The return value is the vector that interrupted execution. 225 // 226 // This function will not split the stack. Callers will probably want to call 227 // runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to 228 // calling this function. 229 // 230 // When this is done, this region is quite sensitive to things like system 231 // calls. After calling entersyscall, any memory used must have been allocated 232 // and no function calls without go:nosplit are permitted. Any calls made here 233 // are protected appropriately (e.g. IsCanonical and CR3). 234 // 235 // Also note that this function transitively depends on the compiler generating 236 // code that uses IP-relative addressing inside of absolute addresses. That's 237 // the case for amd64, but may not be the case for other architectures. 238 // 239 // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. 240 // 241 // +checkescape:all 242 // 243 //go:nosplit 244 func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { 245 userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) 246 c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) 247 248 // Sanitize registers. 249 regs := switchOpts.Registers 250 regs.Eflags &= ^uint64(UserFlagsClear) 251 regs.Eflags |= UserFlagsSet 252 regs.Cs = uint64(Ucode64) // Required for iret. 253 regs.Ss = uint64(Udata) // Ditto. 254 255 // Perform the switch. 256 needIRET := uint64(0) 257 if switchOpts.FullRestore { 258 needIRET = 1 259 } 260 vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no. 261 return 262 } 263 264 func doSwitchToUser( 265 cpu *CPU, // +0(FP) 266 regs *arch.Registers, // +8(FP) 267 fpState *byte, // +16(FP) 268 userCR3 uint64, // +24(FP) 269 needIRET uint64) Vector // +32(FP), +40(FP) 270 271 // startGo is the CPU entrypoint. 272 // 273 // This is called from the start asm stub (see entry_amd64.go); on return the 274 // registers in c.registers will be restored (not segments). 275 // 276 // Note that any code written in Go should adhere to Go expected environment: 277 // - Initialized floating point state (required for optimizations using 278 // floating point instructions). 279 // - Go TLS in FS_BASE (this is required by splittable functions, calls into 280 // the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access 281 // TLS)). 282 // 283 //go:nosplit 284 func startGo(c *CPU) { 285 // Save per-cpu. 286 writeGS(kernelAddr(c.kernelEntry)) 287 288 // 289 // TODO(mpratt): Note that per the note above, this should be done 290 // before entering Go code. However for simplicity we leave it here for 291 // now, since the small critical sections with undefined FPU state 292 // should only contain very limited use of floating point instructions 293 // (notably, use of XMM15 as a zero register). 294 fninit() 295 // Need to sync XCR0 with the host, because xsave and xrstor can be 296 // called from different contexts. 297 if hasXSAVE { 298 // Exclude MPX bits. MPX has been deprecated and we have seen 299 // cases when it isn't supported in VM. 300 xcr0 := localXCR0 &^ (cpuid.XSAVEFeatureBNDCSR | cpuid.XSAVEFeatureBNDREGS) 301 xsetbv(0, xcr0) 302 } 303 304 // Set the syscall target. 305 wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter())) 306 wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) 307 308 // NOTE: This depends on having the 64-bit segments immediately 309 // following the 32-bit user segments. This is simply the way the 310 // sysret instruction is designed to work (it assumes they follow). 311 wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) 312 wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter())) 313 } 314 315 // SetCPUIDFaulting sets CPUID faulting per the boolean value. 316 // 317 // True is returned if faulting could be set. 318 // 319 //go:nosplit 320 func SetCPUIDFaulting(on bool) bool { 321 // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support 322 // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR. 323 if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 { 324 features := rdmsr(_MSR_MISC_FEATURES) 325 if on { 326 features |= _MISC_FEATURE_CPUID_TRAP 327 } else { 328 features &^= _MISC_FEATURE_CPUID_TRAP 329 } 330 wrmsr(_MSR_MISC_FEATURES, features) 331 return true // Setting successful. 332 } 333 return false 334 } 335 336 // ReadCR2 reads the current CR2 value. 337 // 338 //go:nosplit 339 func ReadCR2() uintptr { 340 return readCR2() 341 }