github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/ring0/kernel_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64
// +build amd64

package ring0

import (
	"encoding/binary"
	"reflect"

	"github.com/nicocha30/gvisor-ligolo/pkg/cpuid"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
)

// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the
// value in regs.
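//
// It is implemented in assembly.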
func HaltAndWriteFSBase(regs *arch.Registers)

// init initializes architecture-specific state.
func (k *Kernel) init(maxCPUs int) {
	entrySize := reflect.TypeOf(kernelEntry{}).Size()
	var (
		entries []kernelEntry
		padding = 1
	)
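	// Allocate the per-CPU entry array so that its backing storage starts on
	// a page boundary and spans at least one full page; EntryRegions below
	// exposes it to be mapped as whole pages.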
	for {
		entries = make([]kernelEntry, maxCPUs+padding-1)
		totalSize := entrySize * uintptr(maxCPUs+padding-1)
		addr := reflect.ValueOf(&entries[0]).Pointer()
		if addr&(hostarch.PageSize-1) == 0 && totalSize >= hostarch.PageSize {
			// The runtime forces power-of-2 alignment for allocations, and we are therefore
			// safe once the first address is aligned and the chunk is at least a full page.
			break
		}
		padding = padding << 1
	}
	k.cpuEntries = entries

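	// The global IDT is shared by all CPUs and must also be mapped (see
	// EntryRegions); the checks below ensure that it occupies exactly one
	// page-aligned page.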
	k.globalIDT = &idt64{}
	if reflect.TypeOf(idt64{}).Size() != hostarch.PageSize {
		panic("Size of globalIDT should be PageSize")
	}
	if reflect.ValueOf(k.globalIDT).Pointer()&(hostarch.PageSize-1) != 0 {
		panic("Allocated globalIDT should be page aligned")
	}

	// Set up the IDT, which is uniform across all CPUs.
	for v, handler := range handlers {
		// Allow Breakpoint and Overflow to be called from all
		// privilege levels.
		dpl := 0
		if v == Breakpoint || v == Overflow {
			dpl = 3
		}
		// Note that we set all traps to use the interrupt stack (IST1); the
		// stack itself is configured when the TSS is set up in CPU.init.
		k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */)
	}
}

// EntryRegions returns the set of kernel entry regions (must be mapped).
func (k *Kernel) EntryRegions() map[uintptr]uintptr {
	regions := make(map[uintptr]uintptr)

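	// The per-CPU kernelEntry array, rounded out to whole pages.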
	addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
	size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
	end, _ := hostarch.Addr(addr + size).RoundUp()
	regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end)

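	// The shared IDT page.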
	addr = reflect.ValueOf(k.globalIDT).Pointer()
	size = reflect.TypeOf(idt64{}).Size()
	end, _ = hostarch.Addr(addr + size).RoundUp()
	regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end)

	return regions
}

// init initializes architecture-specific state.
func (c *CPU) init(cpuID int) {
	c.kernelEntry = &c.kernel.cpuEntries[cpuID]
	c.cpuSelf = c
	// Null segment.
	c.gdt[0].setNull()

	// Kernel & user segments.
	c.gdt[segKcode] = KernelCodeSegment
	c.gdt[segKdata] = KernelDataSegment
	c.gdt[segUcode32] = UserCodeSegment32
	c.gdt[segUdata] = UserDataSegment
	c.gdt[segUcode64] = UserCodeSegment64

	// The task segment; its 64-bit descriptor spans two GDT entries.
	tssBase, tssLimit, _ := c.TSS()
	c.gdt[segTss].set(
		uint32(tssBase),
		uint32(tssLimit),
		0, // Privilege level zero.
		SegmentDescriptorPresent|
			SegmentDescriptorAccess|
			SegmentDescriptorWrite|
			SegmentDescriptorExecute)
	c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))

	// Set the kernel stack pointer in the TSS (virtual address).
	stackAddr := c.StackTop()
	c.stackTop = stackAddr
	c.tss.rsp0Lo = uint32(stackAddr)
	c.tss.rsp0Hi = uint32(stackAddr >> 32)
	c.tss.ist1Lo = uint32(stackAddr)
	c.tss.ist1Hi = uint32(stackAddr >> 32)
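	// IST1 is used by every IDT entry (see Kernel.init), so all interrupts
	// and traps switch onto this known-good kernel stack.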

	// Set the I/O bitmap base address beyond the last byte in the TSS
	// to block access to the entire I/O address range.
	//
	// From section 18.5.2 "I/O Permission Bit Map" of the Intel SDM, Vol. 1:
	// I/O addresses not spanned by the map are treated as if they had set
	// bits in the map.
	c.tss.ioPerm = tssLimit + 1

	// Permanently set the kernel segments.
	c.registers.Cs = uint64(Kcode)
	c.registers.Ds = uint64(Kdata)
	c.registers.Es = uint64(Kdata)
	c.registers.Ss = uint64(Kdata)
	c.registers.Fs = uint64(Kdata)
	c.registers.Gs = uint64(Kdata)

	// Set mandatory flags.
	c.registers.Eflags = KernelFlagsSet

	c.hasXSAVE = hasXSAVE
	c.hasXSAVEOPT = hasXSAVEOPT
}

// StackTop returns the top of the CPU's kernel stack.
//
//go:nosplit
func (c *CPU) StackTop() uint64 {
	return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
}

// IDT returns the CPU's IDT base and limit.
//
//go:nosplit
func (c *CPU) IDT() (uint64, uint16) {
	return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(c.kernel.globalIDT) - 1)
}

// GDT returns the CPU's GDT base and limit.
//
//go:nosplit
func (c *CPU) GDT() (uint64, uint16) {
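	// Each descriptor is 8 bytes; the limit is the offset of the last valid
	// byte in the table.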
	return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
}

// TSS returns the CPU's TSS base, limit and value.
//
//go:nosplit
func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
	return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss]
}

// CR0 returns the CPU's CR0 value.
//
//go:nosplit
func (c *CPU) CR0() uint64 {
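	// Protected mode and paging, plus the alignment-mask, extension-type and
	// native x87 error reporting (NE) bits.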
	return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE
}

// CR4 returns the CPU's CR4 value.
//
//go:nosplit
func (c *CPU) CR4() uint64 {
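	// Baseline: PAE (required for long mode) and PSE, plus OSFXSR and
	// OSXMMEXCPT so that SSE state and SIMD exceptions are usable.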
	cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
	if hasPCID {
		cr4 |= _CR4_PCIDE
	}
	if hasXSAVE {
		cr4 |= _CR4_OSXSAVE
	}
	if hasSMEP {
		cr4 |= _CR4_SMEP
	}
	if hasSMAP {
		cr4 |= _CR4_SMAP
	}
	if hasFSGSBASE {
		cr4 |= _CR4_FSGSBASE
	}
	return cr4
}

// EFER returns the CPU's EFER value.
//
//go:nosplit
func (c *CPU) EFER() uint64 {
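	// Long mode enabled and active, SYSCALL/SYSRET enabled, and no-execute
	// page protection enabled.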
	return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX
}

// IsCanonical indicates whether addr is canonical per the amd64 spec.
//
//go:nosplit
func IsCanonical(addr uint64) bool {
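	// With 48-bit virtual addresses, bits 63:47 must all be equal, i.e. the
	// address must lie in the lower or upper half of the address space.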
	return addr <= 0x00007fffffffffff || addr >= 0xffff800000000000
}

// SwitchToUser performs either a sysret or an iret.
//
// The return value is the vector that interrupted execution.
//
// This function will not split the stack. Callers will probably want to call
// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
// calling this function.
//
// Once entersyscall has been called, this region is quite sensitive to things
// like system calls: any memory used must already have been allocated, and
// only calls to go:nosplit functions are permitted. Any calls made here are
// protected appropriately (e.g. IsCanonical and CR3).
//
// Also note that this function transitively depends on the compiler generating
// code that uses IP-relative addressing instead of absolute addresses. That's
// the case for amd64, but may not be the case for other architectures.
//
// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
//
// +checkescape:all
//
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
	c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))

	// Sanitize registers.
	regs := switchOpts.Registers
	regs.Eflags &= ^uint64(UserFlagsClear)
	regs.Eflags |= UserFlagsSet
	regs.Cs = uint64(Ucode64) // Required for iret.
	regs.Ss = uint64(Udata)   // Ditto.

	// Perform the switch.
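	// FullRestore forces the iret path: sysret reloads RIP and RFLAGS from
	// RCX and R11, so it cannot restore those user registers exactly.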
	needIRET := uint64(0)
	if switchOpts.FullRestore {
		needIRET = 1
	}
	vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no.
	return
}

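// doSwitchToUser is implemented in assembly; the +N(FP) annotations document
// the frame offsets of its arguments and result.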
func doSwitchToUser(
	cpu *CPU, // +0(FP)
	regs *arch.Registers, // +8(FP)
	fpState *byte, // +16(FP)
	userCR3 uint64, // +24(FP)
	needIRET uint64) Vector // +32(FP), +40(FP)

// startGo is the CPU entrypoint.
//
// This is called from the start asm stub (see entry_amd64.go); on return the
// registers in c.registers will be restored (not segments).
//
// Note that any code written in Go should adhere to the environment that Go
// expects:
//   - Initialized floating point state (required for optimizations using
//     floating point instructions).
//   - Go TLS in FS_BASE (this is required by splittable functions, calls into
//     the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access
//     TLS)).
//
//go:nosplit
func startGo(c *CPU) {
	// Save per-cpu.
	writeGS(kernelAddr(c.kernelEntry))

	//
	// TODO(mpratt): Note that per the note above, this should be done
	// before entering Go code. However for simplicity we leave it here for
	// now, since the small critical sections with undefined FPU state
	// should only contain very limited use of floating point instructions
	// (notably, use of XMM15 as a zero register).
	fninit()
	// Need to sync XCR0 with the host, because xsave and xrstor can be
	// called from different contexts.
	if hasXSAVE {
		// Exclude MPX bits. MPX has been deprecated and we have seen
		// cases when it isn't supported in VM.
		xcr0 := localXCR0 &^ (cpuid.XSAVEFeatureBNDCSR | cpuid.XSAVEFeatureBNDREGS)
		xsetbv(0, xcr0)
	}

	// Set the syscall target.
	wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter()))
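	// Bits set in the syscall mask are cleared in RFLAGS on syscall entry,
	// so the handler starts with a known flags state (including a clear DF).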
	wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)

	// NOTE: This depends on having the 64-bit segments immediately
	// following the 32-bit user segments. This is simply the way the
	// sysret instruction is designed to work (it assumes they follow).
	wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
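	// CSTAR is the syscall target for 32-bit compatibility mode; reuse the
	// same entry point.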
	wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter()))
}

// SetCPUIDFaulting sets CPUID faulting per the boolean value.
//
// True is returned if faulting could be set.
//
//go:nosplit
func SetCPUIDFaulting(on bool) bool {
	// Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
	// for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR.
	if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 {
		features := rdmsr(_MSR_MISC_FEATURES)
		if on {
			features |= _MISC_FEATURE_CPUID_TRAP
		} else {
			features &^= _MISC_FEATURE_CPUID_TRAP
		}
		wrmsr(_MSR_MISC_FEATURES, features)
		return true // Setting successful.
	}
	return false
}

// ReadCR2 reads the current CR2 value.
//
//go:nosplit
func ReadCR2() uintptr {
	return readCR2()
}