github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/machine_arm64_unsafe.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build arm64
// +build arm64

package kvm

import (
	"fmt"
	"reflect"
	"unsafe"

	"golang.org/x/sys/unix"
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0/pagetables"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
)

type kvmVcpuInit struct {
	target   uint32
	features [7]uint32
}

var vcpuInit kvmVcpuInit

// initArchState initializes architecture-specific state.
func (m *machine) initArchState() error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(m.fd),
		_KVM_ARM_PREFERRED_TARGET,
		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
		panic(fmt.Sprintf("KVM_ARM_PREFERRED_TARGET failed: %v", errno))
	}

	// Initialize all vCPUs up front on ARM64; this does not happen on
	// x86_64. The two architectures have different KVM timer mechanisms,
	// and if vCPUs were created dynamically on ARM64, a new vCPU's timer
	// would be skewed for a short time. For more detail, see
	// https://github.com/google/gvisor/issues/5739.
	m.mu.Lock()
	for i := 0; i < m.maxVCPUs; i++ {
		m.createVCPU(i)
	}
	m.mu.Unlock()
	return nil
}
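// Every system-register write below follows the same KVM "one reg" pattern:
// fill a kvmOneReg with the register ID and the userspace address holding
// the value, then issue the KVM_SET_ONE_REG ioctl on the vCPU fd. A minimal
// sketch of that pattern (setReg is a hypothetical helper, not part of this
// file):
//
//	func setReg(c *vCPU, id, value uint64) error {
//		data := value
//		reg := kvmOneReg{
//			id:   id,
//			addr: uint64(reflect.ValueOf(&data).Pointer()),
//		}
//		return c.setOneRegister(&reg) // ioctl(c.fd, KVM_SET_ONE_REG, &reg)
//	}
//
// initArchState below instead reuses a single reg/data pair across all of
// its calls, rebinding reg.id and data before each setOneRegister.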
// initArchState initializes architecture-specific state.
func (c *vCPU) initArchState() error {
	var (
		reg     kvmOneReg
		data    uint64
		regGet  kvmOneReg
		dataGet uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())

	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_ARM_VCPU_INIT,
		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
		panic(fmt.Sprintf("KVM_ARM_VCPU_INIT failed: %v", errno))
	}

	// tcr_el1
	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
	reg.id = _KVM_ARM64_REGS_TCR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// mair_el1
	data = _MT_EL1_INIT
	reg.id = _KVM_ARM64_REGS_MAIR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// ttbr0_el1
	data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)
	reg.id = _KVM_ARM64_REGS_TTBR0_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	c.SetTtbr0Kvm(uintptr(data))

	// ttbr1_el1
	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// cntkctl_el1
	data = _CNTKCTL_EL1_DEFAULT
	reg.id = _KVM_ARM64_REGS_CNTKCTL_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// cpacr_el1
	data = 0
	reg.id = _KVM_ARM64_REGS_CPACR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// sctlr_el1
	data = _SCTLR_EL1_DEFAULT
	reg.id = _KVM_ARM64_REGS_SCTLR_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// tpidr_el1
	reg.id = _KVM_ARM64_REGS_TPIDR_EL1
	data = uint64(reflect.ValueOf(&c.CPU).Pointer() | ring0.KernelStartAddress)
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// sp_el1
	data = c.CPU.StackTop()
	reg.id = _KVM_ARM64_REGS_SP_EL1
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// pc
	reg.id = _KVM_ARM64_REGS_PC
	data = uint64(ring0.AddrOfStart())
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// vbar_el1
	reg.id = _KVM_ARM64_REGS_VBAR_EL1
	vectorLocation := ring0.AddrOfVectors()
	data = uint64(ring0.KernelStartAddress | vectorLocation)
	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	// Use the address of the exception vector table as the MMIO address
	// base.
	vectorLocationPhys, _, _ := translateToPhysical(vectorLocation)
	arm64HypercallMMIOBase = vectorLocationPhys

	// Initialize the PCID database.
	if hasGuestPCID {
		// Note that NewPCIDs may return a nil table here, in which
		// case we simply don't use PCID support (see below). In
		// practice, this should not happen, however.
		c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
	}

	return c.setSystemTime()
}

// setTSC sets the counter's Virtual Offset.
func (c *vCPU) setTSC(value uint64) error {
	var (
		reg  kvmOneReg
		data uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	reg.id = _KVM_ARM64_REGS_TIMER_CNT
	data = value

	if err := c.setOneRegister(&reg); err != nil {
		return err
	}

	return nil
}
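// A note on setTSC: _KVM_ARM64_REGS_TIMER_CNT exposes the guest's virtual
// counter, and writing it causes KVM to recompute the virtual offset so
// that, at the instant of the write,
//
//	guest CNTVCT = host physical counter - virtual offset = value
//
// after which guest reads advance at the host counter's rate. This is a
// summary of the Arm generic-timer behavior that setSystemTime relies on
// below; the offset bookkeeping itself lives in KVM, not in this file.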
// getTSC reads the counter: the Physical Counter minus the Virtual Offset.
func (c *vCPU) getTSC() error {
	var (
		reg  kvmOneReg
		data uint64
	)

	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
	reg.id = _KVM_ARM64_REGS_TIMER_CNT

	if err := c.getOneRegister(&reg); err != nil {
		return err
	}

	return nil
}

// setSystemTime sets the vCPU to the system time.
func (c *vCPU) setSystemTime() error {
	const minIterations = 10
	minimum := uint64(0)
	for iter := 0; ; iter++ {
		// Use getTSC to get an estimate of where the counter will be
		// on the host during a "fast" system call iteration.
		// Could replacing getTSC with another setOneRegister syscall
		// give a more accurate value?
		start := uint64(ktime.Rdtsc())
		if err := c.getTSC(); err != nil {
			return err
		}
		// See if this is our new minimum call time. Note that this
		// serves two functions: first, we make sure that we are
		// accurately predicting the offset we need to set. Second, we
		// don't want to do the final set on a slow call, which could
		// produce a really bad result.
		end := uint64(ktime.Rdtsc())
		if end < start {
			continue // Totally bogus: unstable TSC?
		}
		current := end - start
		if current < minimum || iter == 0 {
			minimum = current // Set our new minimum.
		}
		// Is this past minIterations and within ~10% of minimum?
		upperThreshold := (((minimum << 3) + minimum) >> 3)
		if iter >= minIterations && (current <= upperThreshold || minimum < 50) {
			// Try to set the TSC.
			if err := c.setTSC(end + (minimum / 2)); err != nil {
				return err
			}
			return nil
		}
	}
}
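// The "within ~10%" test above is implemented with shifts and adds:
//
//	upperThreshold = ((minimum << 3) + minimum) >> 3
//	               = (8*minimum + minimum) / 8
//	               = 1.125 * minimum
//
// so the final setTSC is attempted once the sampled round trip is within
// 12.5% of the best observed round trip, or once minimum is below 50
// counter ticks, where a percentage threshold is too coarse to be useful.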
//go:nosplit
func (c *vCPU) loadSegments(tid uint64) {
	// TODO(gvisor.dev/issue/1238): TLS is not supported.
	// Get TLS from tpidr_el0.
	c.tid.Store(tid)
}

func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_SET_ONE_REG,
		uintptr(unsafe.Pointer(reg))); errno != 0 {
		return fmt.Errorf("error setting one register: %v", errno)
	}
	return nil
}

func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_IOCTL,
		uintptr(c.fd),
		_KVM_GET_ONE_REG,
		uintptr(unsafe.Pointer(reg))); errno != 0 {
		return fmt.Errorf("error getting one register: %v", errno)
	}
	return nil
}

// SwitchToUser unpacks architectural details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
	// Check for canonical addresses.
	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
		return nonCanonical(regs.Pc, int32(unix.SIGSEGV), info)
	} else if !ring0.IsCanonical(regs.Sp) {
		return nonCanonical(regs.Sp, int32(unix.SIGSEGV), info)
	}

	// Assign PCIDs.
	if c.PCIDs != nil {
		var requireFlushPCID bool // Force a flush?
		switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
		switchOpts.Flush = switchOpts.Flush || requireFlushPCID
	}

	var vector ring0.Vector
	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
	c.SetTtbr0App(uintptr(ttbr0App))

	// Full context switch support for Arm64.
	// The Arm64 user-mode execution state consists of:
	//   x0-x30
	//   PC, SP, PSTATE
	//   V0-V31: 32 128-bit registers for floating point and SIMD
	//   FPSR, FPCR
	//   TPIDR_EL0, used for TLS
	appRegs := switchOpts.Registers
	c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))

	entersyscall()
	bluepill(c)
	vector = c.CPU.SwitchToUser(switchOpts)
	exitsyscall()

	switch vector {
	case ring0.Syscall:
		// Fast path: system call executed.
		return hostarch.NoAccess, nil
	case ring0.PageFault:
		return c.fault(int32(unix.SIGSEGV), info)
	case ring0.El0ErrNMI:
		return c.fault(int32(unix.SIGBUS), info)
	case ring0.Vector(bounce): // ring0.VirtualizationException.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	case ring0.El0SyncUndef:
		return c.fault(int32(unix.SIGILL), info)
	case ring0.El0SyncDbg:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGTRAP),
			Code:  1, // TRAP_BRKPT (breakpoint).
		}
		info.SetAddr(switchOpts.Registers.Pc) // Include address.
		return hostarch.AccessType{}, platform.ErrContextSignal
	case ring0.El0SyncSpPc:
		*info = linux.SignalInfo{
			Signo: int32(unix.SIGBUS),
			Code:  2, // BUS_ADRERR (physical address does not exist).
		}
		return hostarch.NoAccess, platform.ErrContextSignal
	case ring0.El0SyncSys,
		ring0.El0SyncWfx:
		return hostarch.NoAccess, nil // Skip for now.
	default:
		panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
	}
}

//go:nosplit
func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) {
	ctx := bluepillArchContext(context)

	// MAP_DENYWRITE is deprecated and ignored by the kernel. We use it
	// only for the seccomp filters.
	addr, _, e := unix.RawSyscall6(uintptr(ctx.Regs[8]), uintptr(ctx.Regs[0]), uintptr(ctx.Regs[1]),
		uintptr(ctx.Regs[2]), uintptr(ctx.Regs[3])|unix.MAP_DENYWRITE, uintptr(ctx.Regs[4]), uintptr(ctx.Regs[5]))
	ctx.Regs[0] = uint64(addr)

	return addr, uintptr(ctx.Regs[1]), e
}
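// For reference, seccompMmapSyscall decodes the trapped frame using the
// arm64 Linux syscall convention:
//
//	ctx.Regs[8] (x8)       - syscall number (mmap here)
//	ctx.Regs[0..5] (x0-x5) - the six syscall arguments
//	ctx.Regs[0] (x0)       - return value, written back after the replay
//
// It replays the interrupted mmap on the host with MAP_DENYWRITE OR-ed into
// the flags argument (Regs[3]), the marker noted above as existing only for
// the seccomp filters, and stores the result where the guest expects it.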