github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/ring0/kernel_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64
// +build amd64

package ring0

import (
	"encoding/binary"
	"reflect"

	"github.com/MerlinKodo/gvisor/pkg/cpuid"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
)

// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the
// value in regs.
func HaltAndWriteFSBase(regs *arch.Registers)

// init initializes architecture-specific state.
func (k *Kernel) init(maxCPUs int) {
	entrySize := reflect.TypeOf(kernelEntry{}).Size()
	var (
		entries []kernelEntry
		padding = 1
	)
	for {
		entries = make([]kernelEntry, maxCPUs+padding-1)
		totalSize := entrySize * uintptr(maxCPUs+padding-1)
		addr := reflect.ValueOf(&entries[0]).Pointer()
		if addr&(hostarch.PageSize-1) == 0 && totalSize >= hostarch.PageSize {
			// The runtime forces power-of-2 alignment for allocations, and we are therefore
			// safe once the first address is aligned and the chunk is at least a full page.
			break
		}
		padding = padding << 1
	}
	k.cpuEntries = entries

	k.globalIDT = &idt64{}
	if reflect.TypeOf(idt64{}).Size() != hostarch.PageSize {
		panic("Size of globalIDT should be PageSize")
	}
	if reflect.ValueOf(k.globalIDT).Pointer()&(hostarch.PageSize-1) != 0 {
		panic("Allocated globalIDT should be page aligned")
	}

	// Setup the IDT, which is uniform.
	for v, handler := range handlers {
		// Allow Breakpoint and Overflow to be called from all
		// privilege levels.
		dpl := 0
		if v == Breakpoint || v == Overflow {
			dpl = 3
		}
		// Note that we set all traps to use the interrupt stack, this
		// is defined below when setting up the TSS.
		k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */)
	}
}

// EntryRegions returns the set of kernel entry regions (must be mapped).
func (k *Kernel) EntryRegions() map[uintptr]uintptr {
	regions := make(map[uintptr]uintptr)

	addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
	size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
	end, _ := hostarch.Addr(addr + size).RoundUp()
	regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end)

	addr = reflect.ValueOf(k.globalIDT).Pointer()
	size = reflect.TypeOf(idt64{}).Size()
	end, _ = hostarch.Addr(addr + size).RoundUp()
	regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end)

	return regions
}
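// ensureEntryRegionsMapped is an illustrative sketch only, not part of this
// package's API: it shows one way a platform might consume EntryRegions,
// mapping every returned [start, end) range before the CPUs are used, as the
// doc comment above requires. The mapRegion callback is a hypothetical
// stand-in for whatever mapping primitive the platform provides.
func ensureEntryRegionsMapped(k *Kernel, mapRegion func(start, end uintptr) error) error {
	for start, end := range k.EntryRegions() {
		// Each key/value pair is a page-rounded [start, end) region.
		if err := mapRegion(start, end); err != nil {
			return err
		}
	}
	return nil
}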
// init initializes architecture-specific state.
func (c *CPU) init(cpuID int) {
	c.kernelEntry = &c.kernel.cpuEntries[cpuID]
	c.cpuSelf = c
	// Null segment.
	c.gdt[0].setNull()

	// Kernel & user segments.
	c.gdt[segKcode] = KernelCodeSegment
	c.gdt[segKdata] = KernelDataSegment
	c.gdt[segUcode32] = UserCodeSegment32
	c.gdt[segUdata] = UserDataSegment
	c.gdt[segUcode64] = UserCodeSegment64

	// The task segment; this spans two entries.
	tssBase, tssLimit, _ := c.TSS()
	c.gdt[segTss].set(
		uint32(tssBase),
		uint32(tssLimit),
		0, // Privilege level zero.
		SegmentDescriptorPresent|
			SegmentDescriptorAccess|
			SegmentDescriptorWrite|
			SegmentDescriptorExecute)
	c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))

	// Set the kernel stack pointer in the TSS (virtual address).
	stackAddr := c.StackTop()
	c.stackTop = stackAddr
	c.tss.rsp0Lo = uint32(stackAddr)
	c.tss.rsp0Hi = uint32(stackAddr >> 32)
	c.tss.ist1Lo = uint32(stackAddr)
	c.tss.ist1Hi = uint32(stackAddr >> 32)

	// Set the I/O bitmap base address beyond the last byte in the TSS
	// to block access to the entire I/O address range.
	//
	// From section 18.5.2 "I/O Permission Bit Map" of Intel SDM Vol. 1:
	// I/O addresses not spanned by the map are treated as if they had set
	// bits in the map.
	c.tss.ioPerm = tssLimit + 1

	// Permanently set the kernel segments.
	c.registers.Cs = uint64(Kcode)
	c.registers.Ds = uint64(Kdata)
	c.registers.Es = uint64(Kdata)
	c.registers.Ss = uint64(Kdata)
	c.registers.Fs = uint64(Kdata)
	c.registers.Gs = uint64(Kdata)

	// Set mandatory flags.
	c.registers.Eflags = KernelFlagsSet

	c.hasXSAVE = hasXSAVE
	c.hasXSAVEOPT = hasXSAVEOPT
}

// StackTop returns the kernel's stack address.
//
//go:nosplit
func (c *CPU) StackTop() uint64 {
	return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
}

// IDT returns the CPU's IDT base and limit.
//
//go:nosplit
func (c *CPU) IDT() (uint64, uint16) {
	return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1)
}

// GDT returns the CPU's GDT base and limit.
//
//go:nosplit
func (c *CPU) GDT() (uint64, uint16) {
	return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
}

// TSS returns the CPU's TSS base, limit and segment descriptor.
//
//go:nosplit
func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
	return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss]
}

// CR0 returns the CPU's CR0 value.
//
//go:nosplit
func (c *CPU) CR0() uint64 {
	return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE
}

// CR4 returns the CPU's CR4 value.
//
//go:nosplit
func (c *CPU) CR4() uint64 {
	cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
	if hasPCID {
		cr4 |= _CR4_PCIDE
	}
	if hasXSAVE {
		cr4 |= _CR4_OSXSAVE
	}
	if hasSMEP {
		cr4 |= _CR4_SMEP
	}
	if hasSMAP {
		cr4 |= _CR4_SMAP
	}
	if hasFSGSBASE {
		cr4 |= _CR4_FSGSBASE
	}
	return cr4
}

// EFER returns the CPU's EFER value.
//
//go:nosplit
func (c *CPU) EFER() uint64 {
	return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX
}

// IsCanonical indicates whether addr is canonical per the amd64 spec.
//
//go:nosplit
func IsCanonical(addr uint64) bool {
	return addr <= 0x00007fffffffffff || addr >= 0xffff800000000000
}
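// isCanonicalExamples is an illustrative sketch only, not used by this
// package: with 48-bit virtual addresses, the canonical space splits into a
// lower half ending at 0x00007fffffffffff and an upper half starting at
// 0xffff800000000000, with a non-canonical hole in between, which is exactly
// the check IsCanonical above performs.
func isCanonicalExamples() [3]bool {
	return [3]bool{
		IsCanonical(0x00007fffffffffff), // true: top of the lower half.
		IsCanonical(0x0000800000000000), // false: inside the non-canonical hole.
		IsCanonical(0xffff800000000000), // true: base of the upper half.
	}
}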
// SwitchToUser performs either a sysret or an iret.
//
// The return value is the vector that interrupted execution.
//
// This function will not split the stack. Callers will probably want to call
// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
// calling this function.
//
// When this is done, this region is quite sensitive to things like system
// calls. After calling entersyscall, any memory used must have been allocated
// and no function calls without go:nosplit are permitted. Any calls made here
// are protected appropriately (e.g. IsCanonical and CR3).
//
// Also note that this function transitively depends on the compiler generating
// code that uses IP-relative addressing instead of absolute addresses. That's
// the case for amd64, but may not be the case for other architectures.
//
// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
//
// +checkescape:all
//
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
	c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))

	// Sanitize registers.
	regs := switchOpts.Registers
	regs.Eflags &= ^uint64(UserFlagsClear)
	regs.Eflags |= UserFlagsSet
	regs.Cs = uint64(Ucode64) // Required for iret.
	regs.Ss = uint64(Udata)   // Ditto.

	// Perform the switch.
	needIRET := uint64(0)
	if switchOpts.FullRestore {
		needIRET = 1
	}
	vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no.
	return
}

func doSwitchToUser(
	cpu *CPU, // +0(FP)
	regs *arch.Registers, // +8(FP)
	fpState *byte, // +16(FP)
	userCR3 uint64, // +24(FP)
	needIRET uint64) Vector // +32(FP), +40(FP)
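// switchToUserSketch is an illustrative, hypothetical sketch, not part of
// this package: it shows the calling pattern described in the SwitchToUser
// comment above. The caller performs any allocation first, enters the syscall
// window, switches, and then exits the window. The enter and exit hooks stand
// in for runtime.entersyscall and runtime.exitsyscall, which real platform
// implementations typically reach via go:linkname.
func switchToUserSketch(c *CPU, opts SwitchOpts, enter, exit func()) Vector {
	enter()                        // e.g. runtime.entersyscall.
	vector := c.SwitchToUser(opts) // nosplit; no further allocation or split-stack calls.
	exit()                         // e.g. runtime.exitsyscall.
	return vector
}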
// startGo is the CPU entrypoint.
//
// This is called from the start asm stub (see entry_amd64.go); on return the
// registers in c.registers will be restored (not segments).
//
// Note that any code written in Go should adhere to the expected Go
// environment:
//   - Initialized floating point state (required for optimizations using
//     floating point instructions).
//   - Go TLS in FS_BASE (this is required by splittable functions, calls into
//     the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access
//     TLS)).
//
//go:nosplit
func startGo(c *CPU) {
	// Save per-cpu.
	writeGS(kernelAddr(c.kernelEntry))

	//
	// TODO(mpratt): Note that per the note above, this should be done
	// before entering Go code. However for simplicity we leave it here for
	// now, since the small critical sections with undefined FPU state
	// should only contain very limited use of floating point instructions
	// (notably, use of XMM15 as a zero register).
	fninit()
	// Need to sync XCR0 with the host, because xsave and xrstor can be
	// called from different contexts.
	if hasXSAVE {
		// Exclude MPX bits. MPX has been deprecated and we have seen
		// cases when it isn't supported in VM.
		xcr0 := localXCR0 &^ (cpuid.XSAVEFeatureBNDCSR | cpuid.XSAVEFeatureBNDREGS)
		xsetbv(0, xcr0)
	}

	// Set the syscall target.
	wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter()))
	wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)

	// NOTE: This depends on having the 64-bit segments immediately
	// following the 32-bit user segments. This is simply the way the
	// sysret instruction is designed to work (it assumes they follow).
	wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
	wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter()))
}

// SetCPUIDFaulting sets CPUID faulting per the boolean value.
//
// True is returned if faulting could be set.
//
//go:nosplit
func SetCPUIDFaulting(on bool) bool {
	// Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
	// for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR.
	if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 {
		features := rdmsr(_MSR_MISC_FEATURES)
		if on {
			features |= _MISC_FEATURE_CPUID_TRAP
		} else {
			features &^= _MISC_FEATURE_CPUID_TRAP
		}
		wrmsr(_MSR_MISC_FEATURES, features)
		return true // Setting successful.
	}
	return false
}

// ReadCR2 reads the current CR2 value.
//
//go:nosplit
func ReadCR2() uintptr {
	return readCR2()
}
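// starSelectorsSketch is an illustrative sketch only, not used by this
// package: it shows how the hardware derives segment selectors from the
// MSR_STAR value that startGo programs above. Per the SDM, syscall loads CS
// from STAR[47:32] and SS from that value plus 8, while sysret to 64-bit mode
// loads CS from STAR[63:48] plus 16 and SS from STAR[63:48] plus 8, forcing
// RPL 3 on both; this is why the NOTE in startGo requires the 64-bit user
// code segment to sit 16 bytes after the 32-bit one in the GDT. The
// "expected" selectors in the comments are assumptions based on that layout.
func starSelectorsSketch() (syscallCS, syscallSS, sysretCS, sysretSS uint64) {
	star := uint64(Kcode)<<32 | uint64(Ucode32)<<48
	syscallCS = (star >> 32) & 0xffff       // Expected to equal Kcode.
	syscallSS = ((star >> 32) & 0xffff) + 8 // Expected to equal Kdata.
	sysretCS = ((star >> 48) & 0xffff) + 16 // Expected to equal Ucode64 (hardware forces RPL 3).
	sysretSS = ((star >> 48) & 0xffff) + 8  // Expected to equal Udata (hardware forces RPL 3).
	return
}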