github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/machine_arm64_unsafe.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build arm64
// +build arm64

package kvm

import (
	"fmt"
	"reflect"
	"unsafe"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0/pagetables"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
	"golang.org/x/sys/unix"
)

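// kvmVcpuInit mirrors the layout of the kernel's struct kvm_vcpu_init (a
// uint32 target plus seven uint32 feature words). It is filled in by the
// KVM_ARM_PREFERRED_TARGET ioctl and later passed to KVM_ARM_VCPU_INIT.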
type kvmVcpuInit struct {
	target   uint32
	features [7]uint32
}

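// vcpuInit holds the host's preferred vCPU target. It is queried once in
// (*machine).initArchState and reused, with features added, for every vCPU.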
var vcpuInit kvmVcpuInit

// initArchState initializes architecture-specific state.
func (m *machine) initArchState() error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		_KVM_ARM_PREFERRED_TARGET,
		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
		panic(fmt.Sprintf("KVM_ARM_PREFERRED_TARGET failed: %v", errno))
	}

	// Initialize all vCPUs up front on ARM64; this does not happen on
	// x86_64. The two architectures have different KVM timer mechanisms:
	// if vCPUs were created dynamically on ARM64, a new vCPU's timer
	// would be skewed for a short period. For more detail, see
	// https://github.com/google/gvisor/issues/5739.
	m.mu.Lock()
	for i := 0; i < m.maxVCPUs; i++ {
		m.createVCPU(i)
	}
	m.mu.Unlock()
	return nil
}

// initArchState initializes architecture-specific state for a single vCPU.
func (c *vCPU) initArchState() error {
	var (
		reg     kvmOneReg
		data    uint64
		regGet  kvmOneReg
		dataGet uint64
	)

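	// KVM_SET_ONE_REG and KVM_GET_ONE_REG transfer values through the
	// addr field, which points at the local data/dataGet variables.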
	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())

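	// Enable the PSCI 0.2 interface for this vCPU.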
	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_ARM_VCPU_INIT,
		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
		panic(fmt.Sprintf("KVM_ARM_VCPU_INIT failed: %v", errno))
	}

	// tcr_el1
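	// 48-bit virtual addresses, 16-bit ASIDs, and a 40-bit intermediate
	// physical address space, as encoded in the flag names below.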
	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
	reg.id = _KVM_ARM64_REGS_TCR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// mair_el1
	data = _MT_EL1_INIT
	reg.id = _KVM_ARM64_REGS_MAIR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// ttbr0_el1
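	// Install the sentry kernel's page tables. SetTtbr0Kvm below records
	// the value for the ring0 world-switch code.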
	data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)

	reg.id = _KVM_ARM64_REGS_TTBR0_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	c.SetTtbr0Kvm(uintptr(data))

	// ttbr1_el1
	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)

	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// cntkctl_el1
	data = _CNTKCTL_EL1_DEFAULT
	reg.id = _KVM_ARM64_REGS_CNTKCTL_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// cpacr_el1
	data = 0
	reg.id = _KVM_ARM64_REGS_CPACR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// sctlr_el1
	data = _SCTLR_EL1_DEFAULT
	reg.id = _KVM_ARM64_REGS_SCTLR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// tpidr_el1
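	// Store a tagged pointer to this vCPU's ring0.CPU so that kernel
	// entry paths can locate per-CPU state.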
	reg.id = _KVM_ARM64_REGS_TPIDR_EL1
	data = uint64(reflect.ValueOf(&c.CPU).Pointer() | ring0.KernelStartAddress)
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// sp_el1
	data = c.CPU.StackTop()
	reg.id = _KVM_ARM64_REGS_SP_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// pc
	reg.id = _KVM_ARM64_REGS_PC
	data = uint64(ring0.AddrOfStart())
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// vbar_el1
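	// The vector table is addressed through the kernel's upper half,
	// hence the KernelStartAddress tag.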
	reg.id = _KVM_ARM64_REGS_VBAR_EL1
	vectorLocation := ring0.AddrOfVectors()
	data = uint64(ring0.KernelStartAddress | vectorLocation)
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// Use the address of the exception vector table as the MMIO address
	// base.
	vectorLocationPhys, _, _ := translateToPhysical(vectorLocation)
	arm64HypercallMMIOBase = vectorLocationPhys

	// Initialize the PCID database.
	if hasGuestPCID {
		// Note that NewPCIDs may return a nil table here, in which
		// case we simply don't use PCID support (see below). In
		// practice, this should not happen, however.
		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
	}

	return c.setSystemTime()
}

// setTSC sets the counter's Virtual Offset by writing the virtual counter
// register.
func (c *vCPU) setTSC(value uint64) error {
	var (
		reg  kvmOneReg
		data uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	reg.id = _KVM_ARM64_REGS_TIMER_CNT
	data = value

	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	return nil
}

// getTSC reads the virtual counter (the Physical Counter minus the Virtual
// Offset). The value itself is discarded; setSystemTime uses this call only
// to time the ioctl round trip.
func (c *vCPU) getTSC() error {
	var (
		reg  kvmOneReg
		data uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	reg.id = _KVM_ARM64_REGS_TIMER_CNT

	if err := c.getOneRegister(&reg); err != nil {
		return err
	}

	return nil
}

// setSystemTime sets the vCPU's counter to the current system time.
func (c *vCPU) setSystemTime() error {
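	// Calibrate by timing the KVM_GET_ONE_REG round trip: once a
	// sufficiently fast iteration is observed, set the guest counter to
	// the host counter plus half the best-case latency, so that guest
	// time closely tracks host time.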
	const minIterations = 10
	minimum := uint64(0)
	for iter := 0; ; iter++ {
		// Use getTSC to estimate where the counter will be on the host
		// during a "fast" system call iteration. (Could replacing
		// getTSC with another setOneRegister syscall yield a more
		// accurate value?)
		start := uint64(ktime.Rdtsc())
		if err := c.getTSC(); err != nil {
			return err
		}
		// See if this is our new minimum call time. Note that this
		// serves two functions: one, we make sure that we are
		// accurately predicting the offset we need to set. Second, we
		// don't want to do the final set on a slow call, which could
		// produce a really bad result.
		end := uint64(ktime.Rdtsc())
		if end < start {
			continue // Totally bogus: unstable TSC?
		}
		current := end - start
		if current < minimum || iter == 0 {
			minimum = current // Set our new minimum.
		}
		// Is this past minIterations and within ~12.5% (9/8) of the
		// minimum?
		upperThreshold := (((minimum << 3) + minimum) >> 3)
		if iter >= minIterations && (current <= upperThreshold || minimum < 50) {
			// Try to set the TSC.
			if err := c.setTSC(end + (minimum / 2)); err != nil {
				return err
			}
			return nil
		}
	}
}

//go:nosplit
func (c *vCPU) loadSegments(tid uint64) {
	// TODO(gvisor.dev/issue/1238): TLS is not supported.
	// Get TLS from tpidr_el0.
	c.tid.Store(tid)
}

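// setOneRegister writes a single guest register via the KVM_SET_ONE_REG
// ioctl.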
func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_SET_ONE_REG,
		uintptr(unsafe.Pointer(reg))); errno != 0 {
		return fmt.Errorf("error setting one register: %v", errno)
	}
	return nil
}

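// getOneRegister reads a single guest register via the KVM_GET_ONE_REG
// ioctl.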
func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_GET_ONE_REG,
		uintptr(unsafe.Pointer(reg))); errno != 0 {
		return fmt.Errorf("error getting one register: %v", errno)
	}
	return nil
}

// SwitchToUser unpacks architectural details and switches the vCPU to
// application mode.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
		return nonCanonical(regs.Pc, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Sp) {
		return nonCanonical(regs.Sp, int32(unix.SIGSEGV), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	var vector ring0.Vector
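	// Install the application's page tables for the coming switch.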
	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
	c.SetTtbr0App(uintptr(ttbr0App))

	// Full context switch support for ARM64. The user-mode execution
	// state consists of:
	// x0-x30
	// PC, SP, PSTATE
	// V0-V31: 32 128-bit registers for floating point and SIMD
	// FPSR, FPCR
	// TPIDR_EL0, used for TLS
	appRegs := switchOpts.Registers
	c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))

	entersyscall()
	bluepill(c)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()

	switch vector {
	case ring0.Syscall:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil
	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)
	case ring0.El0ErrNMI:
		return c.fault(int32(unix.SIGBUS), info)
	case ring0.Vector(bounce): // ring0.VirtualizationException.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	case ring0.El0SyncUndef:
		return c.fault(int32(unix.SIGILL), info)
	case ring0.El0SyncDbg:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Pc) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal
	case ring0.El0SyncSpPc:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal
	case ring0.El0SyncSys,
		ring0.El0SyncWfx:
		return hostarch.NoAccess, nil // Skip for now.
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}
}

//go:nosplit
func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) {
	ctx := bluepillArchContext(context)

	// MAP_DENYWRITE is deprecated and ignored by the kernel; it is passed
	// here only so that the seccomp filters can match on it.
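	// On arm64, the syscall number is carried in x8 (Regs[8]) and the
	// arguments in x0-x5.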
	addr, _, e := unix.RawSyscall6(uintptr(ctx.Regs[8]), uintptr(ctx.Regs[0]), uintptr(ctx.Regs[1]),
		uintptr(ctx.Regs[2]), uintptr(ctx.Regs[3])|unix.MAP_DENYWRITE, uintptr(ctx.Regs[4]), uintptr(ctx.Regs[5]))
	ctx.Regs[0] = uint64(addr)

	return addr, uintptr(ctx.Regs[1]), e
}