github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/ring0/entry_amd64.s

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "funcdata.h"
#include "textflag.h"

// CPU offsets.
#define CPU_REGISTERS 72 // +checkoffset . CPU.registers
#define CPU_FPU_STATE 288 // +checkoffset . CPU.floatingPointState
#define CPU_ARCH_STATE 16 // +checkoffset . CPU.CPUArchState
#define CPU_ERROR_CODE CPU_ARCH_STATE+0 // +checkoffset . CPUArchState.errorCode
#define CPU_ERROR_TYPE CPU_ARCH_STATE+8 // +checkoffset . CPUArchState.errorType
#define CPU_VECTOR CPU_ARCH_STATE+16 // +checkoffset . CPUArchState.vector
#define CPU_FAULT_ADDR CPU_ARCH_STATE+24 // +checkoffset . CPUArchState.faultAddr
#define CPU_ENTRY CPU_ARCH_STATE+32 // +checkoffset . CPUArchState.kernelEntry
#define CPU_APP_GS_BASE CPU_ARCH_STATE+40 // +checkoffset . CPUArchState.appGsBase
#define CPU_HAS_XSAVE CPU_ARCH_STATE+48 // +checkoffset . CPUArchState.hasXSAVE
#define CPU_HAS_XSAVEOPT CPU_ARCH_STATE+49 // +checkoffset . CPUArchState.hasXSAVEOPT
#define CPU_HAS_FSGSBASE CPU_ARCH_STATE+50 // +checkoffset . CPUArchState.hasFSGSBASE

#define ENTRY_SCRATCH0 256 // +checkoffset . kernelEntry.scratch0
#define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop
#define ENTRY_CPU_SELF 272 // +checkoffset . kernelEntry.cpuSelf
#define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3

// Bits.
#define _RFLAGS_IF 512 // +checkconst . _RFLAGS_IF
#define _RFLAGS_IOPL0 4096 // +checkconst . _RFLAGS_IOPL0
#define _KERNEL_FLAGS 2 // +checkconst . KernelFlagsSet

// Vectors.
#define DivideByZero 0 // +checkconst . DivideByZero
#define Debug 1 // +checkconst . Debug
#define NMI 2 // +checkconst . NMI
#define Breakpoint 3 // +checkconst . Breakpoint
#define Overflow 4 // +checkconst . Overflow
#define BoundRangeExceeded 5 // +checkconst . BoundRangeExceeded
#define InvalidOpcode 6 // +checkconst . InvalidOpcode
#define DeviceNotAvailable 7 // +checkconst . DeviceNotAvailable
#define DoubleFault 8 // +checkconst . DoubleFault
#define CoprocessorSegmentOverrun 9 // +checkconst . CoprocessorSegmentOverrun
#define InvalidTSS 10 // +checkconst . InvalidTSS
#define SegmentNotPresent 11 // +checkconst . SegmentNotPresent
#define StackSegmentFault 12 // +checkconst . StackSegmentFault
#define GeneralProtectionFault 13 // +checkconst . GeneralProtectionFault
#define PageFault 14 // +checkconst . PageFault
#define X87FloatingPointException 16 // +checkconst . X87FloatingPointException
#define AlignmentCheck 17 // +checkconst . AlignmentCheck
#define MachineCheck 18 // +checkconst . MachineCheck
#define SIMDFloatingPointException 19 // +checkconst . SIMDFloatingPointException
#define VirtualizationException 20 // +checkconst . VirtualizationException
#define SecurityException 30 // +checkconst . SecurityException
#define SyscallInt80 128 // +checkconst . SyscallInt80
#define Syscall 256 // +checkconst . Syscall
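
// N.B. (editor's reference note) Vectors 0-30 above are the
// architecturally-defined x86-64 exception vectors. SyscallInt80 (128)
// corresponds to the legacy int $0x80 syscall gate, while Syscall (256)
// lies outside the 8-bit hardware vector space and serves as a synthetic
// vector reporting 64-bit syscall entry (see ·sysenter below).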

#define PTRACE_R15 0 // +checkoffset linux PtraceRegs.R15
#define PTRACE_R14 8 // +checkoffset linux PtraceRegs.R14
#define PTRACE_R13 16 // +checkoffset linux PtraceRegs.R13
#define PTRACE_R12 24 // +checkoffset linux PtraceRegs.R12
#define PTRACE_RBP 32 // +checkoffset linux PtraceRegs.Rbp
#define PTRACE_RBX 40 // +checkoffset linux PtraceRegs.Rbx
#define PTRACE_R11 48 // +checkoffset linux PtraceRegs.R11
#define PTRACE_R10 56 // +checkoffset linux PtraceRegs.R10
#define PTRACE_R9 64 // +checkoffset linux PtraceRegs.R9
#define PTRACE_R8 72 // +checkoffset linux PtraceRegs.R8
#define PTRACE_RAX 80 // +checkoffset linux PtraceRegs.Rax
#define PTRACE_RCX 88 // +checkoffset linux PtraceRegs.Rcx
#define PTRACE_RDX 96 // +checkoffset linux PtraceRegs.Rdx
#define PTRACE_RSI 104 // +checkoffset linux PtraceRegs.Rsi
#define PTRACE_RDI 112 // +checkoffset linux PtraceRegs.Rdi
#define PTRACE_ORIGRAX 120 // +checkoffset linux PtraceRegs.Orig_rax
#define PTRACE_RIP 128 // +checkoffset linux PtraceRegs.Rip
#define PTRACE_CS 136 // +checkoffset linux PtraceRegs.Cs
#define PTRACE_FLAGS 144 // +checkoffset linux PtraceRegs.Eflags
#define PTRACE_RSP 152 // +checkoffset linux PtraceRegs.Rsp
#define PTRACE_SS 160 // +checkoffset linux PtraceRegs.Ss
#define PTRACE_FS_BASE 168 // +checkoffset linux PtraceRegs.Fs_base
#define PTRACE_GS_BASE 176 // +checkoffset linux PtraceRegs.Gs_base

// The value for XCR0 is defined to xsave/xrstor everything except for the
// PKRU and AMX regions.
// TODO(gvisor.dev/issues/9896): Implement AMX support.
// TODO(gvisor.dev/issues/10087): Implement PKRU support.
#define XCR0_DISABLED_MASK ((1 << 9) | (1 << 17) | (1 << 18))
#define XCR0_EAX (0xffffffff ^ XCR0_DISABLED_MASK)
#define XCR0_EDX 0xffffffff
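
// For reference, the disabled bits are XCR0.PKRU (bit 9) and the AMX
// XTILECFG/XTILEDATA bits (17 and 18), so XCR0_DISABLED_MASK is 0x60200 and
// XCR0_EAX evaluates to 0xfff9fdff. Leaving the remaining mask bits set is
// harmless: xsave/xrstor operate on the intersection of EDX:EAX and XCR0.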

// Saves a register set.
//
// This is a macro because it may need to be executed in contexts where a
// stack is not available for calls.
//
// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
#define REGISTERS_SAVE(reg, offset) \
    MOVQ R15, offset+PTRACE_R15(reg); \
    MOVQ R14, offset+PTRACE_R14(reg); \
    MOVQ R13, offset+PTRACE_R13(reg); \
    MOVQ R12, offset+PTRACE_R12(reg); \
    MOVQ BP, offset+PTRACE_RBP(reg); \
    MOVQ BX, offset+PTRACE_RBX(reg); \
    MOVQ CX, offset+PTRACE_RCX(reg); \
    MOVQ DX, offset+PTRACE_RDX(reg); \
    MOVQ R11, offset+PTRACE_R11(reg); \
    MOVQ R10, offset+PTRACE_R10(reg); \
    MOVQ R9, offset+PTRACE_R9(reg); \
    MOVQ R8, offset+PTRACE_R8(reg); \
    MOVQ SI, offset+PTRACE_RSI(reg); \
    MOVQ DI, offset+PTRACE_RDI(reg);

// Loads a register set.
//
// This is a macro because it may need to be executed in contexts where a
// stack is not available for calls.
//
// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
#define REGISTERS_LOAD(reg, offset) \
    MOVQ offset+PTRACE_R15(reg), R15; \
    MOVQ offset+PTRACE_R14(reg), R14; \
    MOVQ offset+PTRACE_R13(reg), R13; \
    MOVQ offset+PTRACE_R12(reg), R12; \
    MOVQ offset+PTRACE_RBP(reg), BP; \
    MOVQ offset+PTRACE_RBX(reg), BX; \
    MOVQ offset+PTRACE_RCX(reg), CX; \
    MOVQ offset+PTRACE_RDX(reg), DX; \
    MOVQ offset+PTRACE_R11(reg), R11; \
    MOVQ offset+PTRACE_R10(reg), R10; \
    MOVQ offset+PTRACE_R9(reg), R9; \
    MOVQ offset+PTRACE_R8(reg), R8; \
    MOVQ offset+PTRACE_RSI(reg), SI; \
    MOVQ offset+PTRACE_RDI(reg), DI;

// WRITE_CR3() writes the given CR3 value.
//
// The code corresponds to:
//
//	mov %rax, %cr3
//
#define WRITE_CR3() \
    BYTE $0x0f; BYTE $0x22; BYTE $0xd8;

// SWAP_GS swaps the kernel GS (CPU).
#define SWAP_GS() \
    BYTE $0x0F; BYTE $0x01; BYTE $0xf8;

// IRET returns from an interrupt frame.
#define IRET() \
    BYTE $0x48; BYTE $0xcf;

// SYSRET64 executes the sysret instruction.
#define SYSRET64() \
    BYTE $0x48; BYTE $0x0f; BYTE $0x07;
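
// For reference, the raw BYTE sequences above decode as:
//
//	0f 22 d8    mov %rax, %cr3
//	0f 01 f8    swapgs
//	48 cf       iretq (REX.W + iret)
//	48 0f 07    sysretq (REX.W + sysret)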

// LOAD_KERNEL_STACK loads the kernel stack.
#define LOAD_KERNEL_STACK(entry) \
    MOVQ ENTRY_STACK_TOP(entry), SP;

// ADDR_OF_FUNC defines a function named 'name' that returns the address of
// 'symbol'.
#define ADDR_OF_FUNC(name, symbol) \
    TEXT name,$0-8; \
    MOVQ $symbol, AX; \
    MOVQ AX, ret+0(FP); \
    RET

// See kernel.go.
TEXT ·Halt(SB),NOSPLIT|NOFRAME,$0
    HLT
    RET

// See kernel_amd64.go.
TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8
    HLT

    // Restore FS_BASE.
    MOVQ regs+0(FP), AX
    MOVQ PTRACE_FS_BASE(AX), AX

    PUSHQ AX // First argument (FS_BASE)
    CALL ·writeFS(SB)
    POPQ AX

    RET

// jumpToKernel changes execution to the kernel address space.
//
// This works by changing the return value to the kernel version.
TEXT ·jumpToKernel(SB),NOSPLIT|NOFRAME,$0
    MOVQ 0(SP), AX
    ORQ ·KernelStartAddress(SB), AX // Future return value.
    MOVQ AX, 0(SP)
    RET

// jumpToUser changes execution to the user address space.
//
// This works by changing the return value to the user version.
TEXT ·jumpToUser(SB),NOSPLIT|NOFRAME,$0
    // N.B. we can't access KernelStartAddress from the upper half (data
    // pages not available), so just naively clear all the upper bits.
    // We are assuming a 47-bit virtual address space.
    MOVQ $0x00007fffffffffff, AX
    MOVQ 0(SP), BX
    ANDQ BX, AX // Future return value.
    MOVQ AX, 0(SP)
    RET

// See kernel_amd64.go.
//
// The 16-byte frame size is for the saved values of MXCSR and the x87 control
// word.
TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48
    // We are passed pointers to heap objects, but do not store them in our
    // local frame.
    NO_LOCAL_POINTERS

    // MXCSR and the x87 control word are the only floating point state
    // that is callee-save, and thus the only state we must save.
    STMXCSR mxcsr-0(SP)
    FSTCW cw-8(SP)

    // Restore application floating point state.
    MOVQ cpu+0(FP), SI
    MOVQ fpState+16(FP), DI
    MOVB ·hasXSAVE(SB), BX
    TESTB BX, BX
    JZ no_xrstor
    // Use xrstor to restore all available fp state.
    MOVL $XCR0_EAX, AX
    MOVL $XCR0_EDX, DX
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI)
    JMP fprestore_done
no_xrstor:
    // Fall back to fxrstor if xsave is not available.
    FXRSTOR64 0(DI)
fprestore_done:

    // Set application GS.
    MOVQ regs+8(FP), R8
    SWAP_GS()
    MOVQ PTRACE_GS_BASE(R8), AX
    CMPQ AX, CPU_APP_GS_BASE(SI)
    JE skip_gs
    MOVQ AX, CPU_APP_GS_BASE(SI)
    PUSHQ AX
    CALL ·writeGS(SB)
    POPQ AX
skip_gs:
    // Call sysret() or iret().
    MOVQ userCR3+24(FP), CX
    MOVQ needIRET+32(FP), R9
    ADDQ $-32, SP
    MOVQ SI, 0(SP)  // cpu
    MOVQ R8, 8(SP)  // regs
    MOVQ CX, 16(SP) // userCR3
    TESTQ R9, R9
    JNZ do_iret
    CALL ·sysret(SB)
    JMP done_sysret_or_iret
do_iret:
    CALL ·iret(SB)
done_sysret_or_iret:
    MOVQ 24(SP), AX // vector
    ADDQ $32, SP
    MOVQ AX, ret+40(FP)

    // Save application floating point state.
    MOVQ fpState+16(FP), DI
    MOVB ·hasXSAVE(SB), BX
    MOVB ·hasXSAVEOPT(SB), CX
    TESTB BX, BX
    JZ no_xsave
    // Use xsave/xsaveopt to save all extended state.
    MOVL $XCR0_EAX, AX
    MOVL $XCR0_EDX, DX
    TESTB CX, CX
    JZ no_xsaveopt
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
    JMP fpsave_done
no_xsaveopt:
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
    JMP fpsave_done
no_xsave:
    FXSAVE64 0(DI)
fpsave_done:

    // Restore MXCSR and the x87 control word after one of the two floating
    // point save cases above, to ensure the application versions are saved
    // before being clobbered here.
    LDMXCSR mxcsr-0(SP)

    // FLDCW is a "waiting" x87 instruction, meaning it checks for pending
    // unmasked exceptions before executing. Thus if userspace has unmasked
    // an exception and has one pending, it can be raised by FLDCW even
    // though the new control word will mask exceptions. To prevent this,
    // we must first clear pending exceptions (which will be restored by
    // XRSTOR, et al).
    BYTE $0xDB; BYTE $0xE2; // FNCLEX
    FLDCW cw-8(SP)

    RET

// See entry_amd64.go.
TEXT ·sysret(SB),NOSPLIT|NOFRAME,$0-32
    // Set application FS. We can't do this in Go because Go code needs FS.
    MOVQ regs+8(FP), AX
    MOVQ PTRACE_FS_BASE(AX), AX

    PUSHQ AX
    CALL ·writeFS(SB)
    POPQ AX

    CALL ·jumpToKernel(SB)
    // Save original state and stack. sysenter() or exception() from the
    // application (guest ring 3) will switch to this stack, set the return
    // value (vector: 32(SP)) and then do RET, which will also
    // automatically return to the lower half.
    MOVQ cpu+0(FP), BX
    MOVQ regs+8(FP), AX
    MOVQ userCR3+16(FP), CX
    MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
    MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
    MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)

    // Save SP, AX and userCR3 on the kernel stack.
    MOVQ CPU_ENTRY(BX), BX
    LOAD_KERNEL_STACK(BX)
    PUSHQ PTRACE_RSP(AX)
    PUSHQ PTRACE_RAX(AX)
    PUSHQ CX

    // Restore user register state.
    REGISTERS_LOAD(AX, 0)
    MOVQ PTRACE_RIP(AX), CX    // Needed for SYSRET.
    MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.

    // Restore userCR3, AX, SP.
    POPQ AX     // Get userCR3.
    WRITE_CR3() // Switch to userCR3.
    POPQ AX     // Restore AX.
    POPQ SP     // Restore SP.
    SYSRET64()
    // sysenter or exception will write our return value and return to our
    // caller.
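
// N.B. sysretq restores RIP from RCX and RFLAGS from R11 (hence the loads
// into CX and R11 above) but leaves RSP untouched, which is why the user
// stack pointer is restored with an explicit POPQ SP just before the
// instruction.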

// See entry_amd64.go.
TEXT ·iret(SB),NOSPLIT|NOFRAME,$0-32
    // Set application FS. We can't do this in Go because Go code needs FS.
    MOVQ regs+8(FP), AX
    MOVQ PTRACE_FS_BASE(AX), AX

    PUSHQ AX // First argument (FS_BASE)
    CALL ·writeFS(SB)
    POPQ AX

    CALL ·jumpToKernel(SB)
    // Save original state and stack. sysenter() or exception() from the
    // application (guest ring 3) will switch to this stack, set the return
    // value (vector: 32(SP)) and then do RET, which will also
    // automatically return to the lower half.
    MOVQ cpu+0(FP), BX
    MOVQ regs+8(FP), AX
    MOVQ userCR3+16(FP), CX
    MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
    MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
    MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)

    // Build an IRET frame & restore state.
    MOVQ CPU_ENTRY(BX), BX
    LOAD_KERNEL_STACK(BX)
    PUSHQ PTRACE_SS(AX)
    PUSHQ PTRACE_RSP(AX)
    PUSHQ PTRACE_FLAGS(AX)
    PUSHQ PTRACE_CS(AX)
    PUSHQ PTRACE_RIP(AX)
    PUSHQ PTRACE_RAX(AX)  // Save AX on kernel stack.
    PUSHQ CX              // Save userCR3 on kernel stack.
    REGISTERS_LOAD(AX, 0) // Restore most registers.
    POPQ AX               // Get userCR3.
    WRITE_CR3()           // Switch to userCR3.
    POPQ AX               // Restore AX.
    IRET()
    // sysenter or exception will write our return value and return to our
    // caller.
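
// N.B. iretq pops RIP, CS, RFLAGS, RSP and SS (in that order, from the top
// of the stack down), which is exactly the frame constructed in reverse
// order above.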

// See entry_amd64.go.
TEXT ·resume(SB),NOSPLIT|NOFRAME,$0
    // See iret, above.
    MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
    PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
    PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
    PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
    PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
    PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
    REGISTERS_LOAD(AX, CPU_REGISTERS)
    MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
    IRET()

// See entry_amd64.go.
TEXT ·start(SB),NOSPLIT|NOFRAME,$0
    // N.B. This is the vCPU entrypoint. It is not called from Go code and
    // thus pushes and pops values on the stack until calling into Go
    // (startGo), because this is not a typical Go assembly frame.
    PUSHQ $0x0  // Previous frame pointer.
    MOVQ SP, BP // Set frame pointer.
    PUSHQ AX    // Save CPU.

    // Set up environment required by Go before calling startGo: Go needs
    // FS_BASE and floating point initialized.
    MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
    PUSHQ BX // First argument (FS_BASE)
    CALL ·writeFS(SB)
    POPQ BX

    MOVQ CPU_APP_GS_BASE(AX), BX
    PUSHQ BX
    CALL ·writeGS(SB)
    POPQ BX
    SWAP_GS()

    // First argument (CPU) already at bottom of stack.
    CALL ·startGo(SB) // Call Go hook.
    JMP ·resume(SB)   // Restore to registers.

ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB));

// See entry_amd64.go.
TEXT ·sysenter(SB),NOSPLIT|NOFRAME,$0
    // _RFLAGS_IOPL0 is always set in user mode and never set in kernel
    // mode. See the comment on UserFlagsSet for more details.
    TESTL $_RFLAGS_IOPL0, R11
    JZ kernel
user:
    SWAP_GS()
    MOVQ AX, ENTRY_SCRATCH0(GS)   // Save user AX on scratch.
    MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX.
    WRITE_CR3()                   // Switch to kernel cr3.

    MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
    MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
    REGISTERS_SAVE(AX, 0)                 // Save all except IP, FLAGS, SP, AX.
    MOVQ CX, PTRACE_RIP(AX)
    MOVQ R11, PTRACE_FLAGS(AX)
    MOVQ SP, PTRACE_RSP(AX)
    MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value.
    MOVQ CX, PTRACE_RAX(AX)     // Save everything else.
    MOVQ CX, PTRACE_ORIGRAX(AX)

    CMPB CPU_HAS_FSGSBASE(GS), $1
    JNE sysenter_skip_gs
    SWAP_GS()
    BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx
    MOVQ BX, PTRACE_GS_BASE(AX)
    SWAP_GS()

sysenter_skip_gs:
    MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
    MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks.
    MOVQ $0, CPU_ERROR_CODE(AX)           // Clear error code.
    MOVQ $1, CPU_ERROR_TYPE(AX)           // Set error type to user.

    CALL ·jumpToUser(SB)

    // Restore kernel FS_BASE.
    MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
    MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX

    PUSHQ BX // First argument (FS_BASE)
    CALL ·writeFS(SB)
    POPQ BX

    MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.

    // Return to the kernel, where the frame is:
    //
    //	vector      (sp+32)
    //	userCR3     (sp+24)
    //	regs        (sp+16)
    //	cpu         (sp+8)
    //	vcpu.Switch (sp+0)
    //
    MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
    MOVQ $Syscall, 32(SP)                 // Output vector.
    RET

kernel:
    // We can't restore the original stack, but we can access the registers
    // in the CPU state directly. No need for temporary juggling.
    MOVQ AX, ENTRY_SCRATCH0(GS)
    MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
    REGISTERS_SAVE(AX, CPU_REGISTERS)
    MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX)
    MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
    MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX)
    MOVQ ENTRY_SCRATCH0(GS), BX
    MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
    MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
    MOVQ $0, CPU_ERROR_CODE(AX)              // Clear error code.
    MOVQ $0, CPU_ERROR_TYPE(AX)              // Set error type to kernel.
    MOVQ $0xffffffffffffffff, CPU_VECTOR(AX) // Set vector to an invalid value.

    // Save floating point state. CPU.floatingPointState is a slice, so the
    // first word of CPU.floatingPointState is a pointer to the destination
    // array.
    MOVQ CPU_FPU_STATE(AX), DI
    MOVB CPU_HAS_XSAVE(AX), BX
    MOVB CPU_HAS_XSAVEOPT(AX), CX
    TESTB BX, BX
    JZ no_xsave
    // Use xsave/xsaveopt to save all extended state.
    MOVL $XCR0_EAX, AX
    MOVL $XCR0_EDX, DX
    TESTB CX, CX
    JZ no_xsaveopt
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
    JMP fpsave_done
no_xsaveopt:
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
    JMP fpsave_done
no_xsave:
    FXSAVE64 0(DI)
fpsave_done:

    // Call the syscall trampoline.
    LOAD_KERNEL_STACK(GS)
    MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
    PUSHQ AX                    // First argument (vCPU).
    CALL ·kernelSyscall(SB)     // Call the trampoline.
    POPQ AX                     // Pop vCPU.

    // We only trigger a bluepill entry in the bluepill function, and can
    // therefore be guaranteed that there is no floating point state to be
    // loaded on resuming from halt.
    JMP ·resume(SB)

ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB));
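
// N.B. the 64-bit syscall instruction stores the return RIP in RCX and the
// pre-entry RFLAGS in R11, which is why ·sysenter above can test
// _RFLAGS_IOPL0 in R11 directly and record CX/R11 as the saved IP/FLAGS.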

// exception is a generic exception handler.
//
// There are two cases handled:
//
// 1) An exception in kernel mode: this results in saving the state at the time
// of the exception and calling the defined hook.
//
// 2) An exception in user mode: the original kernel frame is restored, and
// the vector & error codes are pushed as return values.
//
// See below for the stubs that call exception.
TEXT ·exception(SB),NOSPLIT|NOFRAME,$0
    // Determine whether the exception occurred in kernel mode or user
    // mode, based on the flags. We expect the following stack:
    //
    //	SS         (sp+48)
    //	SP         (sp+40)
    //	FLAGS      (sp+32)
    //	CS         (sp+24)
    //	IP         (sp+16)
    //	ERROR_CODE (sp+8)
    //	VECTOR     (sp+0)
    //
    TESTL $_RFLAGS_IOPL0, 32(SP)
    JZ kernel

user:
    SWAP_GS()
    ADDQ $-8, SP                            // Adjust for flags.
    MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
    PUSHQ AX                                // Save user AX on stack.
    MOVQ ENTRY_KERNEL_CR3(GS), AX           // Get kernel cr3 on AX.
    WRITE_CR3()                             // Switch to kernel cr3.

    MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
    MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs.
    REGISTERS_SAVE(AX, 0)                 // Save all except IP, FLAGS, SP, AX.
    POPQ BX                               // Restore original AX.
    MOVQ BX, PTRACE_RAX(AX)               // Save it.
    MOVQ BX, PTRACE_ORIGRAX(AX)
    CMPB CPU_HAS_FSGSBASE(GS), $1
    JNE exception_skip_gs
    SWAP_GS()
    BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx
    MOVQ BX, PTRACE_GS_BASE(AX)
    SWAP_GS()
exception_skip_gs:
    MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
    MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
    MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
    MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
    MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)

    CALL ·jumpToUser(SB)

    // Restore kernel FS_BASE.
    MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
    MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX

    PUSHQ BX // First argument (FS_BASE)
    CALL ·writeFS(SB)
    POPQ BX

    // Copy out and return.
    MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
    MOVQ 0(SP), BX                        // Load vector.
    MOVQ 8(SP), CX                        // Load error code.
    MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
    MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
    MOVQ CX, CPU_ERROR_CODE(AX)           // Set error code.
    MOVQ $1, CPU_ERROR_TYPE(AX)           // Set error type to user.
    MOVQ BX, 32(SP)                       // Output vector.
    RET

kernel:
    // As per above, we can save directly.
    PUSHQ AX
    MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
    REGISTERS_SAVE(AX, CPU_REGISTERS)
    POPQ BX
    MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
    MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
    MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
    MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
    MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)

    // Set the error code and adjust the stack.
    MOVQ 8(SP), BX                      // Load the error code.
    MOVQ BX, CPU_ERROR_CODE(AX)         // Copy out to the CPU.
    MOVQ 0(SP), BX                      // Load the vector.
    MOVQ BX, CPU_VECTOR(AX)             // Copy out to the CPU.
    BYTE $0x0f; BYTE $0x20; BYTE $0xd3; // MOV CR2, RBX
    MOVQ BX, CPU_FAULT_ADDR(AX)
    MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.

    // Save floating point state. CPU.floatingPointState is a slice, so the
    // first word of CPU.floatingPointState is a pointer to the destination
    // array.
    MOVQ CPU_FPU_STATE(AX), DI
    MOVB CPU_HAS_XSAVE(AX), BX
    MOVB CPU_HAS_XSAVEOPT(AX), CX
    TESTB BX, BX
    JZ no_xsave
    // Use xsave/xsaveopt to save all extended state.
    MOVL $XCR0_EAX, AX
    MOVL $XCR0_EDX, DX
    TESTB CX, CX
    JZ no_xsaveopt
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
    JMP fpsave_done
no_xsaveopt:
    BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
    JMP fpsave_done
no_xsave:
    FXSAVE64 0(DI)
fpsave_done:

    // Call the exception trampoline.
    MOVQ 0(SP), BX              // BX contains the vector.
    LOAD_KERNEL_STACK(GS)
    MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
    PUSHQ BX                    // Second argument (vector).
    PUSHQ AX                    // First argument (vCPU).
    CALL ·kernelException(SB)   // Call the trampoline.
    POPQ BX                     // Pop vector.
    POPQ AX                     // Pop vCPU.

    // We only trigger a bluepill entry in the bluepill function, and can
    // therefore be guaranteed that there is no floating point state to be
    // loaded on resuming from halt.
    JMP ·resume(SB)

#define EXCEPTION_WITH_ERROR(value, symbol, addr) \
    ADDR_OF_FUNC(addr, symbol); \
    TEXT symbol,NOSPLIT|NOFRAME,$0; \
    PUSHQ $value; \
    JMP ·exception(SB);

#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \
    ADDR_OF_FUNC(addr, symbol); \
    TEXT symbol,NOSPLIT|NOFRAME,$0; \
    PUSHQ $0x0; \
    PUSHQ $value; \
    JMP ·exception(SB);
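
// N.B. the processor pushes a hardware error code only for some vectors
// (double fault, invalid TSS, segment not present, stack segment fault,
// general protection fault, page fault and alignment check, among the
// vectors defined above). EXCEPTION_WITHOUT_ERROR pushes a zero in its
// place so that ·exception always sees the same frame layout.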

EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB))
EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB))
EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB))
EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB))
EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB))
EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB))
EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB))
EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB))
EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB))
EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB))
EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB))
EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB))
EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB))
EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB))
EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB))
EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB))
EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB))
EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB))
EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB))
EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB))
EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB))
EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB))