github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine_arm64_unsafe.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // +build arm64
    16  
    17  package kvm
    18  
    19  import (
    20  	"fmt"
    21  	"reflect"
    22  	"sync/atomic"
    23  	"unsafe"
    24  
    25  	"golang.org/x/sys/unix"
    26  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    27  	"github.com/SagerNet/gvisor/pkg/hostarch"
    28  	"github.com/SagerNet/gvisor/pkg/ring0"
    29  	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/platform"
    32  	ktime "github.com/SagerNet/gvisor/pkg/sentry/time"
    33  )
    34  
// kvmVcpuInit mirrors the kernel's struct kvm_vcpu_init, passed to the
// KVM_ARM_PREFERRED_TARGET and KVM_ARM_VCPU_INIT ioctls.
type kvmVcpuInit struct {
	// target is the requested CPU target type, filled in by the
	// KVM_ARM_PREFERRED_TARGET ioctl in machine.initArchState.
	target uint32
	// features is the vCPU feature bitmap; bit _KVM_ARM_VCPU_PSCI_0_2
	// is set in vCPU.initArchState before KVM_ARM_VCPU_INIT.
	features [7]uint32
}

// vcpuInit is populated once by machine.initArchState and then reused
// (with PSCI 0.2 enabled) to initialize every vCPU.
var vcpuInit kvmVcpuInit
    41  
    42  // initArchState initializes architecture-specific state.
    43  func (m *machine) initArchState() error {
    44  	if _, _, errno := unix.RawSyscall(
    45  		unix.SYS_IOCTL,
    46  		uintptr(m.fd),
    47  		_KVM_ARM_PREFERRED_TARGET,
    48  		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
    49  		panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
    50  	}
    51  
    52  	// Initialize all vCPUs on ARM64, while this does not happen on x86_64.
    53  	// The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms.
    54  	// If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time.
    55  	// For more detail, please refer to https://github.com/google/gvisor/issues/5739
    56  	m.initialvCPUs = make(map[int]*vCPU)
    57  	m.mu.Lock()
    58  	for int(m.nextID) < m.maxVCPUs-1 {
    59  		c := m.newVCPU()
    60  		c.state = 0
    61  		m.initialvCPUs[c.id] = c
    62  	}
    63  	m.mu.Unlock()
    64  	return nil
    65  }
    66  
    67  // initArchState initializes architecture-specific state.
    68  func (c *vCPU) initArchState() error {
    69  	var (
    70  		reg     kvmOneReg
    71  		data    uint64
    72  		regGet  kvmOneReg
    73  		dataGet uint64
    74  	)
    75  
    76  	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
    77  	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
    78  
    79  	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
    80  	if _, _, errno := unix.RawSyscall(
    81  		unix.SYS_IOCTL,
    82  		uintptr(c.fd),
    83  		_KVM_ARM_VCPU_INIT,
    84  		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
    85  		panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno))
    86  	}
    87  
    88  	// tcr_el1
    89  	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
    90  	reg.id = _KVM_ARM64_REGS_TCR_EL1
    91  	if err := c.setOneRegister(&reg); err != nil {
    92  		return err
    93  	}
    94  
    95  	// mair_el1
    96  	data = _MT_EL1_INIT
    97  	reg.id = _KVM_ARM64_REGS_MAIR_EL1
    98  	if err := c.setOneRegister(&reg); err != nil {
    99  		return err
   100  	}
   101  
   102  	// ttbr0_el1
   103  	data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)
   104  
   105  	reg.id = _KVM_ARM64_REGS_TTBR0_EL1
   106  	if err := c.setOneRegister(&reg); err != nil {
   107  		return err
   108  	}
   109  
   110  	c.SetTtbr0Kvm(uintptr(data))
   111  
   112  	// ttbr1_el1
   113  	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
   114  
   115  	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
   116  	if err := c.setOneRegister(&reg); err != nil {
   117  		return err
   118  	}
   119  
   120  	// sp_el1
   121  	data = c.CPU.StackTop()
   122  	reg.id = _KVM_ARM64_REGS_SP_EL1
   123  	if err := c.setOneRegister(&reg); err != nil {
   124  		return err
   125  	}
   126  
   127  	// pc
   128  	reg.id = _KVM_ARM64_REGS_PC
   129  	data = uint64(reflect.ValueOf(ring0.Start).Pointer())
   130  	if err := c.setOneRegister(&reg); err != nil {
   131  		return err
   132  	}
   133  
   134  	// r8
   135  	reg.id = _KVM_ARM64_REGS_R8
   136  	data = uint64(reflect.ValueOf(&c.CPU).Pointer())
   137  	if err := c.setOneRegister(&reg); err != nil {
   138  		return err
   139  	}
   140  
   141  	// vbar_el1
   142  	reg.id = _KVM_ARM64_REGS_VBAR_EL1
   143  	vectorLocation := reflect.ValueOf(ring0.Vectors).Pointer()
   144  	data = uint64(ring0.KernelStartAddress | vectorLocation)
   145  	if err := c.setOneRegister(&reg); err != nil {
   146  		return err
   147  	}
   148  
   149  	// Use the address of the exception vector table as
   150  	// the MMIO address base.
   151  	arm64HypercallMMIOBase = vectorLocation
   152  
   153  	// Initialize the PCID database.
   154  	if hasGuestPCID {
   155  		// Note that NewPCIDs may return a nil table here, in which
   156  		// case we simply don't use PCID support (see below). In
   157  		// practice, this should not happen, however.
   158  		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
   159  	}
   160  
   161  	c.floatingPointState = fpu.NewState()
   162  
   163  	return c.setSystemTime()
   164  }
   165  
   166  // setTSC sets the counter Virtual Offset.
   167  func (c *vCPU) setTSC(value uint64) error {
   168  	var (
   169  		reg  kvmOneReg
   170  		data uint64
   171  	)
   172  
   173  	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
   174  	reg.id = _KVM_ARM64_REGS_TIMER_CNT
   175  	data = uint64(value)
   176  
   177  	if err := c.setOneRegister(&reg); err != nil {
   178  		return err
   179  	}
   180  
   181  	return nil
   182  }
   183  
   184  // getTSC gets the counter Physical Counter minus Virtual Offset.
   185  func (c *vCPU) getTSC() error {
   186  	var (
   187  		reg  kvmOneReg
   188  		data uint64
   189  	)
   190  
   191  	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
   192  	reg.id = _KVM_ARM64_REGS_TIMER_CNT
   193  
   194  	if err := c.getOneRegister(&reg); err != nil {
   195  		return err
   196  	}
   197  
   198  	return nil
   199  }
   200  
   201  // setSystemTime sets the vCPU to the system time.
   202  func (c *vCPU) setSystemTime() error {
   203  	const minIterations = 10
   204  	minimum := uint64(0)
   205  	for iter := 0; ; iter++ {
   206  		// Use get the TSC to an estimate of where it will be
   207  		// on the host during a "fast" system call iteration.
   208  		// replace getTSC to another setOneRegister syscall can get more accurate value?
   209  		start := uint64(ktime.Rdtsc())
   210  		if err := c.getTSC(); err != nil {
   211  			return err
   212  		}
   213  		// See if this is our new minimum call time. Note that this
   214  		// serves two functions: one, we make sure that we are
   215  		// accurately predicting the offset we need to set. Second, we
   216  		// don't want to do the final set on a slow call, which could
   217  		// produce a really bad result.
   218  		end := uint64(ktime.Rdtsc())
   219  		if end < start {
   220  			continue // Totally bogus: unstable TSC?
   221  		}
   222  		current := end - start
   223  		if current < minimum || iter == 0 {
   224  			minimum = current // Set our new minimum.
   225  		}
   226  		// Is this past minIterations and within ~10% of minimum?
   227  		upperThreshold := (((minimum << 3) + minimum) >> 3)
   228  		if iter >= minIterations && (current <= upperThreshold || minimum < 50) {
   229  			// Try to set the TSC
   230  			if err := c.setTSC(end + (minimum / 2)); err != nil {
   231  				return err
   232  			}
   233  			return nil
   234  		}
   235  	}
   236  }
   237  
// loadSegments records the host thread that currently runs this vCPU.
//
// NOTE(review): unlike the x86_64 variant, no segment/TLS state is
// loaded here — see the TODO below.
//
//go:nosplit
func (c *vCPU) loadSegments(tid uint64) {
	// TODO(github.com/SagerNet/issue/1238):  TLS is not supported.
	// Get TLS from tpidr_el0.
	// Atomic store: c.tid is presumably read concurrently elsewhere —
	// readers are not visible in this file; confirm against callers.
	atomic.StoreUint64(&c.tid, tid)
}
   244  
   245  func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
   246  	if _, _, errno := unix.RawSyscall(
   247  		unix.SYS_IOCTL,
   248  		uintptr(c.fd),
   249  		_KVM_SET_ONE_REG,
   250  		uintptr(unsafe.Pointer(reg))); errno != 0 {
   251  		return fmt.Errorf("error setting one register: %v", errno)
   252  	}
   253  	return nil
   254  }
   255  
   256  func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
   257  	if _, _, errno := unix.RawSyscall(
   258  		unix.SYS_IOCTL,
   259  		uintptr(c.fd),
   260  		_KVM_GET_ONE_REG,
   261  		uintptr(unsafe.Pointer(reg))); errno != 0 {
   262  		return fmt.Errorf("error getting one register: %v", errno)
   263  	}
   264  	return nil
   265  }
   266  
// SwitchToUser unpacks architectural-details.
//
// It validates the user PC/SP, assigns a PCID/ASID, installs the user
// TTBR0 and register-save area, runs the guest via bluepill, and maps
// the resulting ring0 exit vector to a (AccessType, error) pair the
// platform layer understands.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
		return nonCanonical(regs.Pc, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Sp) {
		return nonCanonical(regs.Sp, int32(unix.SIGSEGV), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	var vector ring0.Vector
	// Stage the user TTBR0 so the ring0 switch can install it.
	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
	c.SetTtbr0App(uintptr(ttbr0App))

	// Full context-switch supporting for Arm64.
	// The Arm64 user-mode execution state consists of:
	// x0-x30
	// PC, SP, PSTATE
	// V0-V31: 32 128-bit registers for floating point, and simd
	// FPSR, FPCR
	// TPIDR_EL0, used for TLS
	// The register block is addressed through the kernel alias so it is
	// reachable from the guest's kernel mappings.
	appRegs := switchOpts.Registers
	c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))

	// entersyscall/exitsyscall bracket guest execution; bluepill(c)
	// transitions this thread into the guest before the ring0 switch.
	// The ordering of these three calls is load-bearing — do not reorder.
	entersyscall()
	bluepill(c)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()

	// Translate the exit vector into the platform-level result.
	switch vector {
	case ring0.Syscall:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil
	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)
	case ring0.El0ErrNMI:
		return c.fault(int32(unix.SIGBUS), info)
	case ring0.Vector(bounce): // ring0.VirtualizationException.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	case ring0.El0SyncUndef:
		return c.fault(int32(unix.SIGILL), info)
	case ring0.El0SyncDbg:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Pc) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal
	case ring0.El0SyncSpPc:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal
	case ring0.El0SyncSys,
		ring0.El0SyncWfx:
		return hostarch.NoAccess, nil // skip for now.
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}

}