gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/ring0/kernel_amd64.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build amd64
    16  // +build amd64
    17  
    18  package ring0
    19  
    20  import (
    21  	"encoding/binary"
    22  	"reflect"
    23  
    24  	"gvisor.dev/gvisor/pkg/cpuid"
    25  	"gvisor.dev/gvisor/pkg/hostarch"
    26  	"gvisor.dev/gvisor/pkg/sentry/arch"
    27  )
    28  
    29  // HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the
    30  // value in regs.
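        //
        // It is declared without a Go body; the implementation is provided in
        // this package's assembly.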
    31  func HaltAndWriteFSBase(regs *arch.Registers)
    32  
    33  // init initializes architecture-specific state.
    34  func (k *Kernel) init(maxCPUs int) {
    35  	entrySize := reflect.TypeOf(kernelEntry{}).Size()
    36  	var (
    37  		entries []kernelEntry
    38  		padding = 1
    39  	)
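        	// Probe for a page-aligned allocation: the per-CPU entries are
        	// exposed via EntryRegions() below and must cover whole pages, so
        	// retry with growing padding until the slice starts on a page
        	// boundary and spans at least one full page.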
    40  	for {
    41  		entries = make([]kernelEntry, maxCPUs+padding-1)
    42  		totalSize := entrySize * uintptr(maxCPUs+padding-1)
    43  		addr := reflect.ValueOf(&entries[0]).Pointer()
    44  		if addr&(hostarch.PageSize-1) == 0 && totalSize >= hostarch.PageSize {
    45  			// The runtime forces power-of-2 alignment for allocations, and we are therefore
    46  			// safe once the first address is aligned and the chunk is at least a full page.
    47  			break
    48  		}
    49  		padding = padding << 1
    50  	}
    51  	k.cpuEntries = entries
    52  
    53  	k.globalIDT = &idt64{}
    54  	if reflect.TypeOf(idt64{}).Size() != hostarch.PageSize {
    55  		panic("Size of globalIDT should be PageSize")
    56  	}
    57  	if reflect.ValueOf(k.globalIDT).Pointer()&(hostarch.PageSize-1) != 0 {
    58  		panic("Allocated globalIDT should be page aligned")
    59  	}
    60  
    61  	// Set up the IDT, which is uniform.
    62  	for v, handler := range handlers {
    63  		// Allow Breakpoint and Overflow to be called from all
    64  		// privilege levels.
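        		// Without DPL 3, an int3 or into executed from user mode
        		// would raise #GP instead of reaching these handlers.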
    65  		dpl := 0
    66  		if v == Breakpoint || v == Overflow {
    67  			dpl = 3
    68  		}
    69  	// Note that we set all traps to use the interrupt stack; this
    70  	// is configured below when setting up the TSS (see CPU.init).
    71  		k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */)
    72  	}
    73  }
    74  
    75  // EntryRegions returns the set of kernel entry regions (must be mapped).
    76  func (k *Kernel) EntryRegions() map[uintptr]uintptr {
    77  	regions := make(map[uintptr]uintptr)
    78  
    79  	addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
    80  	size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
    81  	end, _ := hostarch.Addr(addr + size).RoundUp()
    82  	regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end)
    83  
    84  	addr = reflect.ValueOf(k.globalIDT).Pointer()
    85  	size = reflect.TypeOf(idt64{}).Size()
    86  	end, _ = hostarch.Addr(addr + size).RoundUp()
    87  	regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end)
    88  
    89  	return regions
    90  }
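        // An illustrative use of EntryRegions (not part of the original file):
        // a platform is expected to keep each returned [start, end) range
        // mapped, e.g.:
        //
        //	for start, end := range k.EntryRegions() {
        //		mapKernelRange(start, end-start) // hypothetical mapping helper
        //	}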
    91  
    92  // init initializes architecture-specific state.
    93  func (c *CPU) init(cpuID int) {
    94  	c.kernelEntry = &c.kernel.cpuEntries[cpuID]
    95  	c.cpuSelf = c
    96  	// Null segment.
    97  	c.gdt[0].setNull()
    98  
    99  	// Kernel & user segments.
   100  	c.gdt[segKcode] = KernelCodeSegment
   101  	c.gdt[segKdata] = KernelDataSegment
   102  	c.gdt[segUcode32] = UserCodeSegment32
   103  	c.gdt[segUdata] = UserDataSegment
   104  	c.gdt[segUcode64] = UserCodeSegment64
   105  
   106  	// The task segment; it spans two GDT entries.
   107  	tssBase, tssLimit, _ := c.TSS()
   108  	c.gdt[segTss].set(
   109  		uint32(tssBase),
   110  		uint32(tssLimit),
   111  		0, // Privilege level zero.
   112  		SegmentDescriptorPresent|
   113  			SegmentDescriptorAccess|
   114  			SegmentDescriptorWrite|
   115  			SegmentDescriptorExecute)
   116  	c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))
   117  
   118  	// Set the kernel stack pointer in the TSS (virtual address).
   119  	stackAddr := c.StackTop()
   120  	c.stackTop = stackAddr
   121  	c.tss.rsp0Lo = uint32(stackAddr)
   122  	c.tss.rsp0Hi = uint32(stackAddr >> 32)
   123  	c.tss.ist1Lo = uint32(stackAddr)
   124  	c.tss.ist1Hi = uint32(stackAddr >> 32)
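        	// IST1 is the stack selected by the IDT entries installed in
        	// Kernel.init (ist == 1), so every trap and interrupt switches to
        	// this same per-CPU kernel stack.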
   125  
   126  	// Set the I/O bitmap base address beyond the last byte in the TSS
   127  	// to block access to the entire I/O address range.
   128  	//
   129  	// Per section 18.5.2 "I/O Permission Bit Map" of the Intel SDM, Vol. 1:
   130  	// I/O addresses not spanned by the map are treated as if they had set
   131  	// bits in the map.
   132  	c.tss.ioPerm = tssLimit + 1
   133  
   134  	// Permanently set the kernel segments.
   135  	c.registers.Cs = uint64(Kcode)
   136  	c.registers.Ds = uint64(Kdata)
   137  	c.registers.Es = uint64(Kdata)
   138  	c.registers.Ss = uint64(Kdata)
   139  	c.registers.Fs = uint64(Kdata)
   140  	c.registers.Gs = uint64(Kdata)
   141  
   142  	// Set mandatory flags.
   143  	c.registers.Eflags = KernelFlagsSet
   144  
   145  	c.hasXSAVE = hasXSAVE
   146  	c.hasXSAVEOPT = hasXSAVEOPT
   147  	c.hasFSGSBASE = hasFSGSBASE
   148  }
   149  
   150  // StackTop returns the kernel's stack address.
   151  //
   152  //go:nosplit
   153  func (c *CPU) StackTop() uint64 {
   154  	return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
   155  }
   156  
   157  // IDT returns the CPU's IDT base and limit.
   158  //
   159  //go:nosplit
   160  func (c *CPU) IDT() (uint64, uint16) {
   161  	return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1)
   162  }
   163  
   164  // GDT returns the CPU's GDT base and limit.
   165  //
   166  //go:nosplit
   167  func (c *CPU) GDT() (uint64, uint16) {
   168  	return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
   169  }
   170  
   171  // TSS returns the CPU's TSS base, limit and value.
   172  //
   173  //go:nosplit
   174  func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
   175  	return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss]
   176  }
   177  
   178  // CR0 returns the CPU's CR0 value.
   179  //
   180  //go:nosplit
   181  func (c *CPU) CR0() uint64 {
   182  	return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE
   183  }
   184  
   185  // CR4 returns the CPU's CR4 value.
   186  //
   187  //go:nosplit
   188  func (c *CPU) CR4() uint64 {
   189  	cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
   190  	if hasPCID {
   191  		cr4 |= _CR4_PCIDE
   192  	}
   193  	if hasXSAVE {
   194  		cr4 |= _CR4_OSXSAVE
   195  	}
   196  	if hasSMEP {
   197  		cr4 |= _CR4_SMEP
   198  	}
   199  	if hasSMAP {
   200  		cr4 |= _CR4_SMAP
   201  	}
   202  	if hasFSGSBASE {
   203  		cr4 |= _CR4_FSGSBASE
   204  	}
   205  	return cr4
   206  }
   207  
   208  // EFER returns the CPU's EFER value.
   209  //
   210  //go:nosplit
   211  func (c *CPU) EFER() uint64 {
   212  	return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX
   213  }
   214  
   215  // IsCanonical indicates whether addr is canonical per the amd64 spec.
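        // With 48-bit virtual addresses, a canonical address has bits 63:47 all
        // equal to bit 47, which is what the two bounds below encode.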
   216  //
   217  //go:nosplit
   218  func IsCanonical(addr uint64) bool {
   219  	return addr <= 0x00007fffffffffff || addr >= 0xffff800000000000
   220  }
   221  
   222  // SwitchToUser performs either a sysret or an iret.
   223  //
   224  // The return value is the vector that interrupted execution.
   225  //
   226  // This function will not split the stack. Callers will probably want to call
   227  // runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
   228  // calling this function.
   229  //
   230  // When this is done, this region becomes quite sensitive to things like
   231  // system calls: after calling entersyscall, any memory used must already have
   232  // been allocated, and only calls to go:nosplit functions are permitted. Any
   233  // calls made here are protected appropriately (e.g. IsCanonical and CR3).
   234  //
   235  // Also note that this function transitively depends on the compiler generating
   236  // code that uses IP-relative addressing instead of absolute addresses. That's
   237  // the case for amd64, but may not be the case for other architectures.
   238  //
   239  // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
   240  //
   241  // +checkescape:all
   242  //
   243  //go:nosplit
   244  func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
   245  	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
   246  	c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))
   247  
   248  	// Sanitize registers.
   249  	regs := switchOpts.Registers
   250  	regs.Eflags &= ^uint64(UserFlagsClear)
   251  	regs.Eflags |= UserFlagsSet
   252  	regs.Cs = uint64(Ucode64) // Required for iret.
   253  	regs.Ss = uint64(Udata)   // Ditto.
   254  
   255  	// Perform the switch.
   256  	needIRET := uint64(0)
   257  	if switchOpts.FullRestore {
   258  		needIRET = 1
   259  	}
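        	// FullRestore forces a full iret return (restoring all flags and
        	// segment state); otherwise the faster sysret path is used.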
   260  	vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no.
   261  	return
   262  }
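        // An illustrative caller sketch (not part of the original file), with
        // entersyscall/exitsyscall reached via linkname in real callers:
        //
        //	entersyscall()
        //	vector := c.SwitchToUser(opts) // opts is a prepared SwitchOpts.
        //	exitsyscall()
        //	switch vector {
        //	case Syscall:
        //		// Dispatch the system call held in opts.Registers.
        //	case PageFault:
        //		// The faulting address is available via ReadCR2().
        //	}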
   263  
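        // doSwitchToUser has no Go body; it is implemented in assembly, and the
        // +N(FP) annotations below record the argument frame offsets used there.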
   264  func doSwitchToUser(
   265  	cpu *CPU, // +0(FP)
   266  	regs *arch.Registers, // +8(FP)
   267  	fpState *byte, // +16(FP)
   268  	userCR3 uint64, // +24(FP)
   269  	needIRET uint64) Vector // +32(FP), +40(FP)
   270  
   271  // startGo is the CPU entrypoint.
   272  //
   273  // This is called from the start asm stub (see entry_amd64.go); on return the
   274  // registers in c.registers will be restored (not segments).
   275  //
   276  // Note that any code written in Go should adhere to Go expected environment:
   277  //   - Initialized floating point state (required for optimizations using
   278  //     floating point instructions).
   279  //   - Go TLS in FS_BASE (this is required by splittable functions, calls into
   280  //     the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access
   281  //     TLS)).
   282  //
   283  //go:nosplit
   284  func startGo(c *CPU) {
   285  	// Save per-cpu.
   286  	writeGS(kernelAddr(c.kernelEntry))
   287  
   288  	// Initialize floating point state.
   289  	// TODO(mpratt): Note that per the note above, this should be done
   290  	// before entering Go code. However for simplicity we leave it here for
   291  	// now, since the small critical sections with undefined FPU state
   292  	// should only contain very limited use of floating point instructions
   293  	// (notably, use of XMM15 as a zero register).
   294  	fninit()
   295  	// Need to sync XCR0 with the host, because xsave and xrstor can be
   296  	// called from different contexts.
   297  	if hasXSAVE {
   298  		// Exclude MPX bits. MPX has been deprecated, and we have seen
   299  		// cases where it isn't supported inside a VM.
   300  		xcr0 := localXCR0 &^ (cpuid.XSAVEFeatureBNDCSR | cpuid.XSAVEFeatureBNDREGS)
   301  		xsetbv(0, xcr0)
   302  	}
   303  
   304  	// Set the syscall target.
   305  	wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter()))
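        	// SYSCALL_MASK (FMASK) lists the RFLAGS bits cleared on syscall
        	// entry; DF is included so the entry code starts with the
        	// direction flag clear, as the ABI requires.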
   306  	wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)
   307  
   308  	// NOTE: This depends on having the 64-bit segments immediately
   309  	// following the 32-bit user segments. This is simply the way the
   310  	// sysret instruction is designed to work (it assumes they follow).
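        	// Concretely, STAR[47:32] supplies the syscall CS (with SS = CS+8)
        	// and STAR[63:48] supplies the sysret base: 32-bit CS = base,
        	// SS = base+8, 64-bit CS = base+16.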
   311  	wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
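        	// CSTAR is the syscall target for 32-bit compatibility mode (used
        	// by AMD CPUs; Intel CPUs treat syscall as #UD there), so pointing
        	// it at the same entry is a safe default.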
   312  	wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter()))
   313  }
   314  
   315  // SetCPUIDFaulting sets CPUID faulting per the boolean value.
   316  //
   317  // True is returned if faulting could be set.
   318  //
   319  //go:nosplit
   320  func SetCPUIDFaulting(on bool) bool {
   321  	// Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
   322  	// for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR.
   323  	if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 {
   324  		features := rdmsr(_MSR_MISC_FEATURES)
   325  		if on {
   326  			features |= _MISC_FEATURE_CPUID_TRAP
   327  		} else {
   328  			features &^= _MISC_FEATURE_CPUID_TRAP
   329  		}
   330  		wrmsr(_MSR_MISC_FEATURES, features)
   331  		return true // Setting successful.
   332  	}
   333  	return false
   334  }
   335  
   336  // ReadCR2 reads the current CR2 value.
   337  //
   338  //go:nosplit
   339  func ReadCR2() uintptr {
   340  	return readCR2()
   341  }