github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine_arm64_unsafe.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build arm64

package kvm

import (
	"fmt"
	"reflect"
	"sync/atomic"
	"unsafe"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/ring0"
	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/time"
	"golang.org/x/sys/unix"
)

// kvmVcpuInit mirrors struct kvm_vcpu_init, the argument to
// KVM_ARM_PREFERRED_TARGET and KVM_ARM_VCPU_INIT.
type kvmVcpuInit struct {
	target   uint32
	features [7]uint32
}

var vcpuInit kvmVcpuInit

// initArchState initializes architecture-specific state.
func (m *machine) initArchState() error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		_KVM_ARM_PREFERRED_TARGET,
		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
		panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET: %v", errno))
	}

	// Initialize all vCPUs up front on ARM64; this does not happen on
	// x86_64. The difference comes down to the KVM timer mechanisms: if
	// vCPUs are created dynamically on ARM64, the vCPU timer is skewed
	// for a short time. For details, see
	// https://github.com/google/gvisor/issues/5739.
	m.initialvCPUs = make(map[int]*vCPU)
	m.mu.Lock()
	for int(m.nextID) < m.maxVCPUs-1 {
		c := m.newVCPU()
		c.state = 0
		m.initialvCPUs[c.id] = c
	}
	m.mu.Unlock()
	return nil
}
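
// All per-register setup below goes through the KVM_SET_ONE_REG /
// KVM_GET_ONE_REG ioctls, which take a (register ID, userspace pointer)
// pair. A minimal sketch of the pattern, assuming the kvmOneReg layout
// defined elsewhere in this package (id and addr fields mirroring
// struct kvm_one_reg):
//
//	var data uint64 // Value buffer shared with the kernel.
//	reg := kvmOneReg{
//		id:   _KVM_ARM64_REGS_TCR_EL1,                  // Which register.
//		addr: uint64(reflect.ValueOf(&data).Pointer()), // Where the value lives.
//	}
//	// KVM_SET_ONE_REG copies data into the guest register;
//	// KVM_GET_ONE_REG copies the guest register back into data.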

// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
	var (
		reg     kvmOneReg
		data    uint64
		regGet  kvmOneReg
		dataGet uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())

	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_ARM_VCPU_INIT,
		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
		panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT: %v", errno))
	}

	// tcr_el1
	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
	reg.id = _KVM_ARM64_REGS_TCR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// mair_el1
	data = _MT_EL1_INIT
	reg.id = _KVM_ARM64_REGS_MAIR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// ttbr0_el1
	data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)
	reg.id = _KVM_ARM64_REGS_TTBR0_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	c.SetTtbr0Kvm(uintptr(data))

	// ttbr1_el1
	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// sp_el1
	data = c.CPU.StackTop()
	reg.id = _KVM_ARM64_REGS_SP_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// pc
	reg.id = _KVM_ARM64_REGS_PC
	data = uint64(reflect.ValueOf(ring0.Start).Pointer())
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// r8
	reg.id = _KVM_ARM64_REGS_R8
	data = uint64(reflect.ValueOf(&c.CPU).Pointer())
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// vbar_el1
	reg.id = _KVM_ARM64_REGS_VBAR_EL1
	vectorLocation := reflect.ValueOf(ring0.Vectors).Pointer()
	data = uint64(ring0.KernelStartAddress | vectorLocation)
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// Use the address of the exception vector table as
	// the MMIO address base.
	arm64HypercallMMIOBase = vectorLocation

	// Initialize the PCID database.
	if hasGuestPCID {
		// Note that NewPCIDs may return a nil table here, in which
		// case we simply don't use PCID support (see below). In
		// practice, this should not happen, however.
		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
	}

	c.floatingPointState = fpu.NewState()

	return c.setSystemTime()
}

// setTSC sets the virtual counter offset so that the guest counter
// reads the given value.
func (c *vCPU) setTSC(value uint64) error {
	var (
		reg  kvmOneReg
		data uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	reg.id = _KVM_ARM64_REGS_TIMER_CNT
	data = value

	return c.setOneRegister(&reg)
}

// getTSC reads the guest counter (the physical counter minus the
// virtual offset). The value read is discarded; setSystemTime uses
// this call only to time the ioctl round trip.
func (c *vCPU) getTSC() error {
	var (
		reg  kvmOneReg
		data uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	reg.id = _KVM_ARM64_REGS_TIMER_CNT

	return c.getOneRegister(&reg)
}
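
// A worked example of the calibration arithmetic in setSystemTime below
// (cycle counts are illustrative, not measured): suppose the cheapest
// getTSC round trip observed so far is minimum = 800 cycles. Then
//
//	upperThreshold = ((800 << 3) + 800) >> 3 = (6400 + 800) / 8 = 900
//
// i.e. minimum + 12.5%, which the loop comment rounds down to "~10%".
// An iteration with current <= 900 is considered fast enough for the
// final set, and setTSC(end + minimum/2) credits half the best-case
// round trip to the guest counter, splitting the measurement latency
// between the get and the set halves of the calibration.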

// setSystemTime calibrates and sets the vCPU counter to the host's
// system time.
func (c *vCPU) setSystemTime() error {
	const minIterations = 10
	minimum := uint64(0)
	for iter := 0; ; iter++ {
		// Use getTSC to get an estimate of where the guest counter
		// will be on the host during a "fast" system call iteration.
		// TODO: could replacing getTSC with another setOneRegister
		// syscall yield a more accurate value?
		start := uint64(ktime.Rdtsc())
		if err := c.getTSC(); err != nil {
			return err
		}
		// See if this is our new minimum call time. Note that this
		// serves two functions: one, we make sure that we are
		// accurately predicting the offset we need to set. Second, we
		// don't want to do the final set on a slow call, which could
		// produce a really bad result.
		end := uint64(ktime.Rdtsc())
		if end < start {
			continue // Totally bogus: unstable TSC?
		}
		current := end - start
		if current < minimum || iter == 0 {
			minimum = current // Set our new minimum.
		}
		// Is this past minIterations and within ~10% of minimum?
		upperThreshold := ((minimum << 3) + minimum) >> 3
		if iter >= minIterations && (current <= upperThreshold || minimum < 50) {
			// Try to set the TSC.
			if err := c.setTSC(end + (minimum / 2)); err != nil {
				return err
			}
			return nil
		}
	}
}

//go:nosplit
func (c *vCPU) loadSegments(tid uint64) {
	// TODO(github.com/SagerNet/issue/1238): TLS is not supported.
	// Get TLS from tpidr_el0.
	atomic.StoreUint64(&c.tid, tid)
}

// setOneRegister sets one register via KVM_SET_ONE_REG.
func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_SET_ONE_REG,
		uintptr(unsafe.Pointer(reg))); errno != 0 {
		return fmt.Errorf("error setting one register: %v", errno)
	}
	return nil
}

// getOneRegister gets one register via KVM_GET_ONE_REG.
func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_GET_ONE_REG,
		uintptr(unsafe.Pointer(reg))); errno != 0 {
		return fmt.Errorf("error getting one register: %v", errno)
	}
	return nil
}

// SwitchToUser unpacks architectural details and switches to user mode.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
		return nonCanonical(regs.Pc, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Sp) {
		return nonCanonical(regs.Sp, int32(unix.SIGSEGV), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	var vector ring0.Vector
	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
	c.SetTtbr0App(uintptr(ttbr0App))

	// Full context-switch support for Arm64.
	// The Arm64 user-mode execution state consists of:
	//	x0-x30
	//	PC, SP, PSTATE
	//	V0-V31: 32 128-bit registers for floating point and SIMD
	//	FPSR, FPCR
	//	TPIDR_EL0, used for TLS
	appRegs := switchOpts.Registers
	c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))

	entersyscall()
	bluepill(c)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()
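
	// Dispatch on the exit vector: Syscall is the fast path; faulting
	// vectors are converted into host signals via c.fault; a bounce
	// means the context was interrupted and should be retried by the
	// caller.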
	switch vector {
	case ring0.Syscall:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil
	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)
	case ring0.El0ErrNMI:
		return c.fault(int32(unix.SIGBUS), info)
	case ring0.Vector(bounce): // ring0.VirtualizationException.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	case ring0.El0SyncUndef:
		return c.fault(int32(unix.SIGILL), info)
	case ring0.El0SyncDbg:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Pc) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal
	case ring0.El0SyncSpPc:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal
	case ring0.El0SyncSys,
		ring0.El0SyncWfx:
		return hostarch.NoAccess, nil // Skip for now.
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}
}
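
// A hedged sketch of how a caller might drive SwitchToUser (variable
// names and the surrounding loop are illustrative; the real call sites
// live in the platform context implementation, not in this file):
//
//	var si linux.SignalInfo
//	access, err := c.SwitchToUser(switchOpts, &si)
//	switch err {
//	case nil:
//		// Syscall (or ignorable trap): service it and re-enter.
//	case platform.ErrContextInterrupt:
//		// Bounced out (e.g. for preemption): simply retry.
//	case platform.ErrContextSignal:
//		// Fault: deliver si/access to the task as a signal.
//	}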