github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/ring0/entry_amd64.s

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "funcdata.h"
#include "textflag.h"

// CPU offsets.
#define CPU_REGISTERS    72  // +checkoffset . CPU.registers
#define CPU_FPU_STATE    288 // +checkoffset . CPU.floatingPointState
#define CPU_ARCH_STATE   16  // +checkoffset . CPU.CPUArchState
#define CPU_ERROR_CODE   CPU_ARCH_STATE+0  // +checkoffset . CPUArchState.errorCode
#define CPU_ERROR_TYPE   CPU_ARCH_STATE+8  // +checkoffset . CPUArchState.errorType
#define CPU_VECTOR       CPU_ARCH_STATE+16 // +checkoffset . CPUArchState.vector
#define CPU_FAULT_ADDR   CPU_ARCH_STATE+24 // +checkoffset . CPUArchState.faultAddr
#define CPU_ENTRY        CPU_ARCH_STATE+32 // +checkoffset . CPUArchState.kernelEntry
#define CPU_APP_GS_BASE  CPU_ARCH_STATE+40 // +checkoffset . CPUArchState.appGsBase
#define CPU_HAS_XSAVE    CPU_ARCH_STATE+48 // +checkoffset . CPUArchState.hasXSAVE
#define CPU_HAS_XSAVEOPT CPU_ARCH_STATE+49 // +checkoffset . CPUArchState.hasXSAVEOPT
#define CPU_HAS_FSGSBASE CPU_ARCH_STATE+50 // +checkoffset . CPUArchState.hasFSGSBASE

#define ENTRY_SCRATCH0   256 // +checkoffset . kernelEntry.scratch0
#define ENTRY_STACK_TOP  264 // +checkoffset . kernelEntry.stackTop
#define ENTRY_CPU_SELF   272 // +checkoffset . kernelEntry.cpuSelf
#define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3

// Bits.
#define _RFLAGS_IF    512  // +checkconst . _RFLAGS_IF
#define _RFLAGS_IOPL0 4096 // +checkconst . _RFLAGS_IOPL0
#define _KERNEL_FLAGS 2    // +checkconst . KernelFlagsSet

// Vectors.
#define DivideByZero               0   // +checkconst . DivideByZero
#define Debug                      1   // +checkconst . Debug
#define NMI                        2   // +checkconst . NMI
#define Breakpoint                 3   // +checkconst . Breakpoint
#define Overflow                   4   // +checkconst . Overflow
#define BoundRangeExceeded         5   // +checkconst . BoundRangeExceeded
#define InvalidOpcode              6   // +checkconst . InvalidOpcode
#define DeviceNotAvailable         7   // +checkconst . DeviceNotAvailable
#define DoubleFault                8   // +checkconst . DoubleFault
#define CoprocessorSegmentOverrun  9   // +checkconst . CoprocessorSegmentOverrun
#define InvalidTSS                 10  // +checkconst . InvalidTSS
#define SegmentNotPresent          11  // +checkconst . SegmentNotPresent
#define StackSegmentFault          12  // +checkconst . StackSegmentFault
#define GeneralProtectionFault     13  // +checkconst . GeneralProtectionFault
#define PageFault                  14  // +checkconst . PageFault
#define X87FloatingPointException  16  // +checkconst . X87FloatingPointException
#define AlignmentCheck             17  // +checkconst . AlignmentCheck
#define MachineCheck               18  // +checkconst . MachineCheck
#define SIMDFloatingPointException 19  // +checkconst . SIMDFloatingPointException
#define VirtualizationException    20  // +checkconst . VirtualizationException
#define SecurityException          30  // +checkconst . SecurityException
#define SyscallInt80               128 // +checkconst . SyscallInt80
#define Syscall                    256 // +checkconst . Syscall

#define PTRACE_R15      0   // +checkoffset linux PtraceRegs.R15
#define PTRACE_R14      8   // +checkoffset linux PtraceRegs.R14
#define PTRACE_R13      16  // +checkoffset linux PtraceRegs.R13
#define PTRACE_R12      24  // +checkoffset linux PtraceRegs.R12
#define PTRACE_RBP      32  // +checkoffset linux PtraceRegs.Rbp
#define PTRACE_RBX      40  // +checkoffset linux PtraceRegs.Rbx
#define PTRACE_R11      48  // +checkoffset linux PtraceRegs.R11
#define PTRACE_R10      56  // +checkoffset linux PtraceRegs.R10
#define PTRACE_R9       64  // +checkoffset linux PtraceRegs.R9
#define PTRACE_R8       72  // +checkoffset linux PtraceRegs.R8
#define PTRACE_RAX      80  // +checkoffset linux PtraceRegs.Rax
#define PTRACE_RCX      88  // +checkoffset linux PtraceRegs.Rcx
#define PTRACE_RDX      96  // +checkoffset linux PtraceRegs.Rdx
#define PTRACE_RSI      104 // +checkoffset linux PtraceRegs.Rsi
#define PTRACE_RDI      112 // +checkoffset linux PtraceRegs.Rdi
#define PTRACE_ORIGRAX  120 // +checkoffset linux PtraceRegs.Orig_rax
#define PTRACE_RIP      128 // +checkoffset linux PtraceRegs.Rip
#define PTRACE_CS       136 // +checkoffset linux PtraceRegs.Cs
#define PTRACE_FLAGS    144 // +checkoffset linux PtraceRegs.Eflags
#define PTRACE_RSP      152 // +checkoffset linux PtraceRegs.Rsp
#define PTRACE_SS       160 // +checkoffset linux PtraceRegs.Ss
#define PTRACE_FS_BASE  168 // +checkoffset linux PtraceRegs.Fs_base
#define PTRACE_GS_BASE  176 // +checkoffset linux PtraceRegs.Gs_base
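
// For reference, these offsets walk a flat sequence of 8-byte fields matching
// the Linux user_regs_struct layout; a sketch (not the actual Go source) of
// the corresponding struct:
//
//	type PtraceRegs struct {
//		R15, R14, R13, R12, Rbp, Rbx, R11, R10    uint64
//		R9, R8, Rax, Rcx, Rdx, Rsi, Rdi, Orig_rax uint64
//		Rip, Cs, Eflags, Rsp, Ss                  uint64
//		Fs_base, Gs_base                          uint64
//	}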

// The XCR0 value below is defined to xsave/xrstor everything except the PKRU
// and AMX regions.
// TODO(gvisor.dev/issues/9896): Implement AMX support.
// TODO(gvisor.dev/issues/10087): Implement PKRU support.
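// In XCR0, bit 9 is the PKRU region and bits 17 and 18 are the AMX XTILECFG
// and XTILEDATA regions. XCR0_EDX:XCR0_EAX form the 64-bit requested-feature
// bitmap passed in EDX:EAX to the XSAVE family of instructions.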
#define XCR0_DISABLED_MASK ((1 << 9) | (1 << 17) | (1 << 18))
#define XCR0_EAX (0xffffffff ^ XCR0_DISABLED_MASK)
#define XCR0_EDX 0xffffffff

// Saves a register set.
//
// This is a macro because it may need to be executed in contexts where a
// stack is not available for calls.
//
// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
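// (AX is excluded because callers use it to hold the register-set pointer;
// they save the original AX separately, e.g. via ENTRY_SCRATCH0 or a push.)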
#define REGISTERS_SAVE(reg, offset) \
  MOVQ R15, offset+PTRACE_R15(reg); \
  MOVQ R14, offset+PTRACE_R14(reg); \
  MOVQ R13, offset+PTRACE_R13(reg); \
  MOVQ R12, offset+PTRACE_R12(reg); \
  MOVQ BP,  offset+PTRACE_RBP(reg); \
  MOVQ BX,  offset+PTRACE_RBX(reg); \
  MOVQ CX,  offset+PTRACE_RCX(reg); \
  MOVQ DX,  offset+PTRACE_RDX(reg); \
  MOVQ R11, offset+PTRACE_R11(reg); \
  MOVQ R10, offset+PTRACE_R10(reg); \
  MOVQ R9,  offset+PTRACE_R9(reg); \
  MOVQ R8,  offset+PTRACE_R8(reg); \
  MOVQ SI,  offset+PTRACE_RSI(reg); \
  MOVQ DI,  offset+PTRACE_RDI(reg);

// Loads a register set.
//
// This is a macro because it may need to be executed in contexts where a
// stack is not available for calls.
//
// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
#define REGISTERS_LOAD(reg, offset) \
  MOVQ offset+PTRACE_R15(reg), R15; \
  MOVQ offset+PTRACE_R14(reg), R14; \
  MOVQ offset+PTRACE_R13(reg), R13; \
  MOVQ offset+PTRACE_R12(reg), R12; \
  MOVQ offset+PTRACE_RBP(reg), BP; \
  MOVQ offset+PTRACE_RBX(reg), BX; \
  MOVQ offset+PTRACE_RCX(reg), CX; \
  MOVQ offset+PTRACE_RDX(reg), DX; \
  MOVQ offset+PTRACE_R11(reg), R11; \
  MOVQ offset+PTRACE_R10(reg), R10; \
  MOVQ offset+PTRACE_R9(reg),  R9; \
  MOVQ offset+PTRACE_R8(reg),  R8; \
  MOVQ offset+PTRACE_RSI(reg), SI; \
  MOVQ offset+PTRACE_RDI(reg), DI;

// WRITE_CR3() writes the given CR3 value.
//
// The code corresponds to:
//
//     mov %rax, %cr3
//
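// The instruction is emitted as raw bytes (0f 22 d8 encodes `mov %rax, %cr3`),
// presumably to avoid depending on assembler support for control-register
// operands.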
#define WRITE_CR3() \
	BYTE $0x0f; BYTE $0x22; BYTE $0xd8;

// SWAP_GS exchanges the active GS base with the kernel GS base (the per-CPU
// kernel entry pointer).
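// (0f 01 f8 encodes the SWAPGS instruction.)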
#define SWAP_GS() \
	BYTE $0x0F; BYTE $0x01; BYTE $0xf8;

// IRET returns from an interrupt frame.
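// (The 0x48 REX.W prefix makes this the 64-bit iretq form.)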
#define IRET() \
	BYTE $0x48; BYTE $0xcf;

// SYSRET64 executes the sysret instruction.
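// (0x48 is the REX.W prefix, selecting the 64-bit sysret form.)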
#define SYSRET64() \
	BYTE $0x48; BYTE $0x0f; BYTE $0x07;

// LOAD_KERNEL_STACK loads the kernel stack.
#define LOAD_KERNEL_STACK(entry) \
	MOVQ ENTRY_STACK_TOP(entry), SP;

// ADDR_OF_FUNC defines a function named 'name' that returns the address of
// 'symbol'.
#define ADDR_OF_FUNC(name, symbol) \
TEXT name,$0-8; \
	MOVQ $symbol, AX; \
	MOVQ AX, ret+0(FP); \
	RET

// See kernel.go.
TEXT ·Halt(SB),NOSPLIT|NOFRAME,$0
	HLT
	RET

// See kernel_amd64.go.
TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8
	HLT

	// Restore FS_BASE.
	MOVQ regs+0(FP), AX
	MOVQ PTRACE_FS_BASE(AX), AX

	PUSHQ AX  // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ AX

	RET

// jumpToKernel changes execution to the kernel address space.
//
// This works by changing the return address to the kernel version.
TEXT ·jumpToKernel(SB),NOSPLIT|NOFRAME,$0
	MOVQ 0(SP), AX
	ORQ ·KernelStartAddress(SB), AX // Future return address.
	MOVQ AX, 0(SP)
	RET

// jumpToUser changes execution to the user address space.
//
// This works by changing the return address to the user version.
TEXT ·jumpToUser(SB),NOSPLIT|NOFRAME,$0
	// N.B. We can't access KernelStartAddress from the upper half (data
	// pages not available), so we just naively clear all the upper bits,
	// assuming a 47-bit virtual address space.
	MOVQ $0x00007fffffffffff, AX
	MOVQ 0(SP), BX
	ANDQ BX, AX // Future return address.
	MOVQ AX, 0(SP)
	RET

// See kernel_amd64.go.
//
// The 16-byte frame size is for the saved values of MXCSR and the x87 control
// word.
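//
// A sketch of the Go declaration this implements (see kernel_amd64.go; the
// parameter names here are assumptions, but the offsets match the frame
// references below):
//
//	func doSwitchToUser(cpu *CPU, regs *arch.Registers,
//		fpState *byte, userCR3 uint64, needIRET uint64) (vector Vector)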
TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48
	// We are passed pointers to heap objects, but do not store them in our
	// local frame.
	NO_LOCAL_POINTERS

	// MXCSR and the x87 control word are the only floating point state
	// that is callee-save, and thus the only state we must save.
	STMXCSR mxcsr-0(SP)
	FSTCW cw-8(SP)

	// Restore application floating point state.
	MOVQ cpu+0(FP), SI
	MOVQ fpState+16(FP), DI
	MOVB ·hasXSAVE(SB), BX
	TESTB BX, BX
	JZ no_xrstor
	// Use xrstor to restore all available fp state.
	MOVL $XCR0_EAX, AX
	MOVL $XCR0_EDX, DX
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI)
	JMP fprestore_done
no_xrstor:
	// Fall back to fxrstor if xsave is not available.
	FXRSTOR64 0(DI)
fprestore_done:

	// Set application GS.
	MOVQ regs+8(FP), R8
	SWAP_GS()
	MOVQ PTRACE_GS_BASE(R8), AX
	CMPQ AX, CPU_APP_GS_BASE(SI)
	JE skip_gs
	MOVQ AX, CPU_APP_GS_BASE(SI)
	PUSHQ AX
	CALL ·writeGS(SB)
	POPQ AX
skip_gs:
	// Call sysret() or iret().
	MOVQ userCR3+24(FP), CX
	MOVQ needIRET+32(FP), R9
	ADDQ $-32, SP
	MOVQ SI, 0(SP)  // cpu
	MOVQ R8, 8(SP)  // regs
	MOVQ CX, 16(SP) // userCR3
	TESTQ R9, R9
	JNZ do_iret
	CALL ·sysret(SB)
	JMP done_sysret_or_iret
do_iret:
	CALL ·iret(SB)
done_sysret_or_iret:
	MOVQ 24(SP), AX // vector
	ADDQ $32, SP
	MOVQ AX, ret+40(FP)

	// Save application floating point state.
	MOVQ fpState+16(FP), DI
	MOVB ·hasXSAVE(SB), BX
	MOVB ·hasXSAVEOPT(SB), CX
	TESTB BX, BX
	JZ no_xsave
	// Use xsave/xsaveopt to save all extended state.
	MOVL $XCR0_EAX, AX
	MOVL $XCR0_EDX, DX
	TESTB CX, CX
	JZ no_xsaveopt
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
	JMP fpsave_done
no_xsaveopt:
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
	JMP fpsave_done
no_xsave:
	FXSAVE64 0(DI)
fpsave_done:

	// Restore MXCSR and the x87 control word after one of the two floating
	// point save cases above, to ensure the application versions are saved
	// before being clobbered here.
	LDMXCSR mxcsr-0(SP)

	// FLDCW is a "waiting" x87 instruction, meaning it checks for pending
	// unmasked exceptions before executing. Thus if userspace has unmasked
	// an exception and has one pending, it can be raised by FLDCW even
	// though the new control word will mask exceptions. To prevent this,
	// we must first clear pending exceptions (which will be restored by
	// XRSTOR, et al).
	BYTE $0xDB; BYTE $0xE2; // FNCLEX
	FLDCW cw-8(SP)

	RET

// See entry_amd64.go.
TEXT ·sysret(SB),NOSPLIT|NOFRAME,$0-32
	// Set application FS. We can't do this in Go because Go code needs FS.
	MOVQ regs+8(FP), AX
	MOVQ PTRACE_FS_BASE(AX), AX

	PUSHQ AX
	CALL ·writeFS(SB)
	POPQ AX

	CALL ·jumpToKernel(SB)
	// Save original state and stack. sysenter() or exception() from the
	// application (guest ring 3) will switch to this stack, set the
	// return value (vector: 32(SP)) and then do RET, which will also
	// automatically return to the lower half.
	MOVQ cpu+0(FP), BX
	MOVQ regs+8(FP), AX
	MOVQ userCR3+16(FP), CX
	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)

	// Save SP, AX and userCR3 on the kernel stack.
	MOVQ CPU_ENTRY(BX), BX
	LOAD_KERNEL_STACK(BX)
	PUSHQ PTRACE_RSP(AX)
	PUSHQ PTRACE_RAX(AX)
	PUSHQ CX

	// Restore user register state.
	REGISTERS_LOAD(AX, 0)
	MOVQ PTRACE_RIP(AX), CX    // Needed for SYSRET.
	MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.

	// Restore userCR3, AX and SP.
	POPQ AX                             // Get userCR3.
	WRITE_CR3()                         // Switch to userCR3.
	POPQ AX                             // Restore AX.
	POPQ SP                             // Restore SP.
	SYSRET64()
	// sysenter or exception will write our return value and return to our
	// caller.

// See entry_amd64.go.
TEXT ·iret(SB),NOSPLIT|NOFRAME,$0-32
	// Set application FS. We can't do this in Go because Go code needs FS.
	MOVQ regs+8(FP), AX
	MOVQ PTRACE_FS_BASE(AX), AX

	PUSHQ AX // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ AX

	CALL ·jumpToKernel(SB)
	// Save original state and stack. sysenter() or exception() from the
	// application (guest ring 3) will switch to this stack, set the
	// return value (vector: 32(SP)) and then do RET, which will also
	// automatically return to the lower half.
	MOVQ cpu+0(FP), BX
	MOVQ regs+8(FP), AX
	MOVQ userCR3+16(FP), CX
	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)

	// Build an IRET frame & restore state.
	MOVQ CPU_ENTRY(BX), BX
	LOAD_KERNEL_STACK(BX)
	PUSHQ PTRACE_SS(AX)
	PUSHQ PTRACE_RSP(AX)
	PUSHQ PTRACE_FLAGS(AX)
	PUSHQ PTRACE_CS(AX)
	PUSHQ PTRACE_RIP(AX)
	PUSHQ PTRACE_RAX(AX)                // Save AX on kernel stack.
	PUSHQ CX                            // Save userCR3 on kernel stack.
	REGISTERS_LOAD(AX, 0)               // Restore most registers.
	POPQ AX                             // Get userCR3.
	WRITE_CR3()                         // Switch to userCR3.
	POPQ AX                             // Restore AX.
	IRET()
	// sysenter or exception will write our return value and return to our
	// caller.

// See entry_amd64.go.
TEXT ·resume(SB),NOSPLIT|NOFRAME,$0
	// See iret, above.
	MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
	PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
	PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
	PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
	PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
	PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
	REGISTERS_LOAD(AX, CPU_REGISTERS)
	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
	IRET()

// See entry_amd64.go.
TEXT ·start(SB),NOSPLIT|NOFRAME,$0
	// N.B. This is the vCPU entrypoint. It is not called from Go code, so
	// it pushes and pops values on the stack directly until it calls into
	// Go (startGo), because it does not have a typical Go assembly frame.
	PUSHQ $0x0  // Previous frame pointer.
	MOVQ SP, BP // Set frame pointer.
	PUSHQ AX    // Save CPU.

	// Set up environment required by Go before calling startGo: Go needs
	// FS_BASE and floating point initialized.
	MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
	PUSHQ BX          // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ BX

	MOVQ CPU_APP_GS_BASE(AX), BX
	PUSHQ BX
	CALL ·writeGS(SB)
	POPQ BX
	SWAP_GS()

	// First argument (CPU) already at bottom of stack.
	CALL ·startGo(SB) // Call Go hook.
	JMP ·resume(SB)   // Restore to registers.

ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB));

// See entry_amd64.go.
TEXT ·sysenter(SB),NOSPLIT|NOFRAME,$0
	// _RFLAGS_IOPL0 is always set in user mode and never set in kernel
	// mode. See the comment on UserFlagsSet for more details.
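	// On SYSCALL, the CPU saves the pre-syscall RFLAGS into R11, which is
	// why the flags of the interrupted context can be tested there.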
	TESTL $_RFLAGS_IOPL0, R11
	JZ kernel
user:
	SWAP_GS()
	MOVQ AX, ENTRY_SCRATCH0(GS)            // Save user AX on scratch.
	MOVQ ENTRY_KERNEL_CR3(GS), AX          // Get kernel cr3 on AX.
	WRITE_CR3()                            // Switch to kernel cr3.

	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX  // Get user regs.
	REGISTERS_SAVE(AX, 0)                  // Save all except IP, FLAGS, SP, AX.
	MOVQ CX,  PTRACE_RIP(AX)
	MOVQ R11, PTRACE_FLAGS(AX)
	MOVQ SP,  PTRACE_RSP(AX)
	MOVQ ENTRY_SCRATCH0(GS), CX            // Load saved user AX value.
	MOVQ CX,  PTRACE_RAX(AX)               // Save user AX.
	MOVQ CX,  PTRACE_ORIGRAX(AX)

	CMPB CPU_HAS_FSGSBASE(GS), $1
	JNE sysenter_skip_gs
	SWAP_GS()
	BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx
	MOVQ BX, PTRACE_GS_BASE(AX)
	SWAP_GS()

sysenter_skip_gs:
	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP  // Get stacks.
	MOVQ $0, CPU_ERROR_CODE(AX)            // Clear error code.
	MOVQ $1, CPU_ERROR_TYPE(AX)            // Set error type to user.

	CALL ·jumpToUser(SB)

	// Restore kernel FS_BASE.
	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX

	PUSHQ BX                               // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ BX

	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.

	// Return to the kernel, where the frame is:
	//
	//	vector      (sp+32)
	//	userCR3     (sp+24)
	//	regs        (sp+16)
	//	cpu         (sp+8)
	//	vcpu.Switch (sp+0)
	//
	MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
	MOVQ $Syscall, 32(SP)                 // Output vector.
	RET

kernel:
	// We can't restore the original stack, but we can access the registers
	// in the CPU state directly. No need for temporary juggling.
	MOVQ AX,  ENTRY_SCRATCH0(GS)
	MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
	REGISTERS_SAVE(AX, CPU_REGISTERS)
	MOVQ CX,  CPU_REGISTERS+PTRACE_RIP(AX)
	MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
	MOVQ SP,  CPU_REGISTERS+PTRACE_RSP(AX)
	MOVQ ENTRY_SCRATCH0(GS), BX
	MOVQ BX,  CPU_REGISTERS+PTRACE_ORIGRAX(AX)
	MOVQ BX,  CPU_REGISTERS+PTRACE_RAX(AX)
	MOVQ $0,  CPU_ERROR_CODE(AX)                // Clear error code.
	MOVQ $0,  CPU_ERROR_TYPE(AX)                // Set error type to kernel.
	MOVQ $0xffffffffffffffff, CPU_VECTOR(AX)    // Invalidate the vector (syscall, not exception).

	// Save floating point state. CPU.floatingPointState is a slice, so the
	// first word of CPU.floatingPointState is a pointer to the destination
	// array.
	MOVQ CPU_FPU_STATE(AX), DI
	MOVB CPU_HAS_XSAVE(AX), BX
	MOVB CPU_HAS_XSAVEOPT(AX), CX
	TESTB BX, BX
	JZ no_xsave
	// Use xsave/xsaveopt to save all extended state.
	MOVL $XCR0_EAX, AX
	MOVL $XCR0_EDX, DX
	TESTB CX, CX
	JZ no_xsaveopt
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
	JMP fpsave_done
no_xsaveopt:
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
	JMP fpsave_done
no_xsave:
	FXSAVE64 0(DI)
fpsave_done:

	// Call the syscall trampoline.
	LOAD_KERNEL_STACK(GS)
	MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
	PUSHQ AX                    // First argument (vCPU).
	CALL ·kernelSyscall(SB)     // Call the trampoline.
	POPQ AX                     // Pop vCPU.

	// We only trigger a bluepill entry in the bluepill function, and can
	// therefore be guaranteed that there is no floating point state to be
	// loaded on resuming from halt.
	JMP ·resume(SB)

ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB));

// exception is a generic exception handler.
//
// There are two cases handled:
//
// 1) An exception in kernel mode: this results in saving the state at the
// time of the exception and calling the defined hook.
//
// 2) An exception in user mode: the original kernel frame is restored, and
// the vector & error codes are pushed as return values.
//
// See below for the stubs that call exception.
TEXT ·exception(SB),NOSPLIT|NOFRAME,$0
	// Determine whether the exception occurred in kernel mode or user
	// mode, based on the flags. We expect the following stack:
	//
	//	SS          (sp+48)
	//	SP          (sp+40)
	//	FLAGS       (sp+32)
	//	CS          (sp+24)
	//	IP          (sp+16)
	//	ERROR_CODE  (sp+8)
	//	VECTOR      (sp+0)
	//
	TESTL $_RFLAGS_IOPL0, 32(SP)
	JZ kernel

user:
	SWAP_GS()
	ADDQ $-8, SP                            // Adjust for flags.
	MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
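	// (_KERNEL_FLAGS does not include _RFLAGS_IF, so interrupts remain
	// disabled from here on.)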
	PUSHQ AX                                // Save user AX on stack.
	MOVQ ENTRY_KERNEL_CR3(GS), AX           // Get kernel cr3 on AX.
	WRITE_CR3()                             // Switch to kernel cr3.

	MOVQ ENTRY_CPU_SELF(GS), AX             // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX   // Get user regs.
	REGISTERS_SAVE(AX, 0)                   // Save all except IP, FLAGS, SP, AX.
	POPQ BX                                 // Restore original AX.
	MOVQ BX, PTRACE_RAX(AX)                 // Save it.
	MOVQ BX, PTRACE_ORIGRAX(AX)
	CMPB CPU_HAS_FSGSBASE(GS), $1
	JNE exception_skip_gs
	SWAP_GS()
	BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx
	MOVQ BX, PTRACE_GS_BASE(AX)
	SWAP_GS()
exception_skip_gs:
	MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
	MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
	MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
	MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
	MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)

	CALL ·jumpToUser(SB)

	// Restore kernel FS_BASE.
	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX

	PUSHQ BX                               // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ BX

	// Copy out and return.
	MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
	MOVQ 0(SP), BX                        // Load vector.
	MOVQ 8(SP), CX                        // Load error code.
	MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
	MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
	MOVQ CX, CPU_ERROR_CODE(AX)           // Set error code.
	MOVQ $1, CPU_ERROR_TYPE(AX)           // Set error type to user.
	MOVQ BX, 32(SP)                       // Output vector.
	RET

kernel:
	// As per above, we can save directly.
	PUSHQ AX
	MOVQ ENTRY_CPU_SELF(GS), AX                        // Load vCPU.
	REGISTERS_SAVE(AX, CPU_REGISTERS)
	POPQ BX
	MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
	MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
	MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
	MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
	MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)

	// Set the error code and adjust the stack.
	MOVQ 8(SP), BX              // Load the error code.
	MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU.
	MOVQ 0(SP), BX              // Load the vector.
	MOVQ BX, CPU_VECTOR(AX)     // Copy out to the CPU.
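	// CR2 holds the faulting linear address when the vector is PageFault.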
	BYTE $0x0f; BYTE $0x20; BYTE $0xd3; // MOV CR2, RBX
	MOVQ BX, CPU_FAULT_ADDR(AX)
	MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.

	// Save floating point state. CPU.floatingPointState is a slice, so the
	// first word of CPU.floatingPointState is a pointer to the destination
	// array.
	MOVQ CPU_FPU_STATE(AX), DI
	MOVB CPU_HAS_XSAVE(AX), BX
	MOVB CPU_HAS_XSAVEOPT(AX), CX
	TESTB BX, BX
	JZ no_xsave
	// Use xsave/xsaveopt to save all extended state.
	MOVL $XCR0_EAX, AX
	MOVL $XCR0_EDX, DX
	TESTB CX, CX
	JZ no_xsaveopt
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
	JMP fpsave_done
no_xsaveopt:
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
	JMP fpsave_done
no_xsave:
	FXSAVE64 0(DI)
fpsave_done:

	// Call the exception trampoline.
	MOVQ 0(SP), BX              // BX contains the vector.
	LOAD_KERNEL_STACK(GS)
	MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
	PUSHQ BX                    // Second argument (vector).
	PUSHQ AX                    // First argument (vCPU).
	CALL ·kernelException(SB)   // Call the trampoline.
	POPQ BX                     // Pop vector.
	POPQ AX                     // Pop vCPU.

	// We only trigger a bluepill entry in the bluepill function, and can
	// therefore be guaranteed that there is no floating point state to be
	// loaded on resuming from halt.
	JMP ·resume(SB)

#define EXCEPTION_WITH_ERROR(value, symbol, addr) \
ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT|NOFRAME,$0; \
	PUSHQ $value; \
	JMP ·exception(SB);

#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \
ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT|NOFRAME,$0; \
	PUSHQ $0x0; \
	PUSHQ $value; \
	JMP ·exception(SB);

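// The CPU pushes a hardware error code for only some vectors; the
// EXCEPTION_WITHOUT_ERROR stubs push a zero in its place so that ·exception
// always sees the same frame layout.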
EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB))
EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB))
EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB))
EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB))
EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB))
EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB))
EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB))
EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB))
EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB))
EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB))
EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB))
EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB))
EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB))
EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB))
EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB))
EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB))
EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB))
EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB))
EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB))
EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB))
EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB))
EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB))