github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/ring0/entry_amd64.s

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "funcdata.h"
#include "textflag.h"

// CPU offsets.
#define CPU_REGISTERS    64  // +checkoffset . CPU.registers
#define CPU_FPU_STATE    280 // +checkoffset . CPU.floatingPointState
#define CPU_ARCH_STATE   16  // +checkoffset . CPU.CPUArchState
#define CPU_ERROR_CODE   CPU_ARCH_STATE+0  // +checkoffset . CPUArchState.errorCode
#define CPU_ERROR_TYPE   CPU_ARCH_STATE+8  // +checkoffset . CPUArchState.errorType
#define CPU_VECTOR       CPU_ARCH_STATE+16 // +checkoffset . CPUArchState.vector
#define CPU_FAULT_ADDR   CPU_ARCH_STATE+24 // +checkoffset . CPUArchState.faultAddr
#define CPU_ENTRY        CPU_ARCH_STATE+32 // +checkoffset . CPUArchState.kernelEntry
#define CPU_HAS_XSAVE    CPU_ARCH_STATE+40 // +checkoffset . CPUArchState.hasXSAVE
#define CPU_HAS_XSAVEOPT CPU_ARCH_STATE+41 // +checkoffset . CPUArchState.hasXSAVEOPT

#define ENTRY_SCRATCH0   256 // +checkoffset . kernelEntry.scratch0
#define ENTRY_STACK_TOP  264 // +checkoffset . kernelEntry.stackTop
#define ENTRY_CPU_SELF   272 // +checkoffset . kernelEntry.cpuSelf
#define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3

// Bits.
#define _RFLAGS_IF    512  // +checkconst . _RFLAGS_IF
#define _RFLAGS_IOPL0 4096 // +checkconst . _RFLAGS_IOPL0
#define _KERNEL_FLAGS 2    // +checkconst . KernelFlagsSet
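
// For reference: _RFLAGS_IF (1 << 9) is the interrupt-enable flag, and
// _RFLAGS_IOPL0 (1 << 12) is the low bit of the two-bit IOPL field; see the
// comment on UserFlagsSet in the Go sources for how these bits are used to
// distinguish user mode from kernel mode.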

// Vectors.
#define DivideByZero               0   // +checkconst . DivideByZero
#define Debug                      1   // +checkconst . Debug
#define NMI                        2   // +checkconst . NMI
#define Breakpoint                 3   // +checkconst . Breakpoint
#define Overflow                   4   // +checkconst . Overflow
#define BoundRangeExceeded         5   // +checkconst . BoundRangeExceeded
#define InvalidOpcode              6   // +checkconst . InvalidOpcode
#define DeviceNotAvailable         7   // +checkconst . DeviceNotAvailable
#define DoubleFault                8   // +checkconst . DoubleFault
#define CoprocessorSegmentOverrun  9   // +checkconst . CoprocessorSegmentOverrun
#define InvalidTSS                 10  // +checkconst . InvalidTSS
#define SegmentNotPresent          11  // +checkconst . SegmentNotPresent
#define StackSegmentFault          12  // +checkconst . StackSegmentFault
#define GeneralProtectionFault     13  // +checkconst . GeneralProtectionFault
#define PageFault                  14  // +checkconst . PageFault
#define X87FloatingPointException  16  // +checkconst . X87FloatingPointException
#define AlignmentCheck             17  // +checkconst . AlignmentCheck
#define MachineCheck               18  // +checkconst . MachineCheck
#define SIMDFloatingPointException 19  // +checkconst . SIMDFloatingPointException
#define VirtualizationException    20  // +checkconst . VirtualizationException
#define SecurityException          30  // +checkconst . SecurityException
#define SyscallInt80               128 // +checkconst . SyscallInt80
#define Syscall                    256 // +checkconst . Syscall
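
// Vectors 0-20 and 30 are the hardware-defined exception vectors, and 128
// (0x80) is the legacy "int $0x80" syscall vector. Syscall (256) lies outside
// the 8-bit hardware vector space: it is a software-defined value used to
// report SYSCALL-instruction entries (see sysenter below, which writes it as
// the output vector).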

#define PTRACE_R15      0   // +checkoffset linux PtraceRegs.R15
#define PTRACE_R14      8   // +checkoffset linux PtraceRegs.R14
#define PTRACE_R13      16  // +checkoffset linux PtraceRegs.R13
#define PTRACE_R12      24  // +checkoffset linux PtraceRegs.R12
#define PTRACE_RBP      32  // +checkoffset linux PtraceRegs.Rbp
#define PTRACE_RBX      40  // +checkoffset linux PtraceRegs.Rbx
#define PTRACE_R11      48  // +checkoffset linux PtraceRegs.R11
#define PTRACE_R10      56  // +checkoffset linux PtraceRegs.R10
#define PTRACE_R9       64  // +checkoffset linux PtraceRegs.R9
#define PTRACE_R8       72  // +checkoffset linux PtraceRegs.R8
#define PTRACE_RAX      80  // +checkoffset linux PtraceRegs.Rax
#define PTRACE_RCX      88  // +checkoffset linux PtraceRegs.Rcx
#define PTRACE_RDX      96  // +checkoffset linux PtraceRegs.Rdx
#define PTRACE_RSI      104 // +checkoffset linux PtraceRegs.Rsi
#define PTRACE_RDI      112 // +checkoffset linux PtraceRegs.Rdi
#define PTRACE_ORIGRAX  120 // +checkoffset linux PtraceRegs.Orig_rax
#define PTRACE_RIP      128 // +checkoffset linux PtraceRegs.Rip
#define PTRACE_CS       136 // +checkoffset linux PtraceRegs.Cs
#define PTRACE_FLAGS    144 // +checkoffset linux PtraceRegs.Eflags
#define PTRACE_RSP      152 // +checkoffset linux PtraceRegs.Rsp
#define PTRACE_SS       160 // +checkoffset linux PtraceRegs.Ss
#define PTRACE_FS_BASE  168 // +checkoffset linux PtraceRegs.Fs_base
#define PTRACE_GS_BASE  176 // +checkoffset linux PtraceRegs.Gs_base

// Saves a register set.
//
// This is a macro because it may need to be executed in contexts where a
// stack is not available for calls.
//
// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
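//
// For example, REGISTERS_SAVE(AX, 0) expands to one store per register, e.g.
//
//	MOVQ R15, 0+PTRACE_R15(AX)
//
// writing each register into the linux.PtraceRegs struct pointed to by AX.
// REGISTERS_LOAD below is the exact inverse.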
#define REGISTERS_SAVE(reg, offset) \
  MOVQ R15, offset+PTRACE_R15(reg); \
  MOVQ R14, offset+PTRACE_R14(reg); \
  MOVQ R13, offset+PTRACE_R13(reg); \
  MOVQ R12, offset+PTRACE_R12(reg); \
  MOVQ BP,  offset+PTRACE_RBP(reg); \
  MOVQ BX,  offset+PTRACE_RBX(reg); \
  MOVQ CX,  offset+PTRACE_RCX(reg); \
  MOVQ DX,  offset+PTRACE_RDX(reg); \
  MOVQ R11, offset+PTRACE_R11(reg); \
  MOVQ R10, offset+PTRACE_R10(reg); \
  MOVQ R9,  offset+PTRACE_R9(reg); \
  MOVQ R8,  offset+PTRACE_R8(reg); \
  MOVQ SI,  offset+PTRACE_RSI(reg); \
  MOVQ DI,  offset+PTRACE_RDI(reg);

// Loads a register set.
//
// This is a macro because it may need to be executed in contexts where a
// stack is not available for calls.
//
// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
#define REGISTERS_LOAD(reg, offset) \
  MOVQ offset+PTRACE_R15(reg), R15; \
  MOVQ offset+PTRACE_R14(reg), R14; \
  MOVQ offset+PTRACE_R13(reg), R13; \
  MOVQ offset+PTRACE_R12(reg), R12; \
  MOVQ offset+PTRACE_RBP(reg), BP; \
  MOVQ offset+PTRACE_RBX(reg), BX; \
  MOVQ offset+PTRACE_RCX(reg), CX; \
  MOVQ offset+PTRACE_RDX(reg), DX; \
  MOVQ offset+PTRACE_R11(reg), R11; \
  MOVQ offset+PTRACE_R10(reg), R10; \
  MOVQ offset+PTRACE_R9(reg),  R9; \
  MOVQ offset+PTRACE_R8(reg),  R8; \
  MOVQ offset+PTRACE_RSI(reg), SI; \
  MOVQ offset+PTRACE_RDI(reg), DI;

// WRITE_CR3() writes the given CR3 value.
//
// The code corresponds to:
//
//     mov %rax, %cr3
//
#define WRITE_CR3() \
	BYTE $0x0f; BYTE $0x22; BYTE $0xd8;

// SWAP_GS executes SWAPGS, exchanging the user and kernel GS bases; in the
// kernel half, GS points at the per-CPU kernelEntry (the ENTRY_* offsets
// above).
#define SWAP_GS() \
	BYTE $0x0F; BYTE $0x01; BYTE $0xf8; // SWAPGS

// IRET returns from an interrupt frame.
#define IRET() \
	BYTE $0x48; BYTE $0xcf; // IRETQ (REX.W IRET)

// SYSRET64 executes the sysret instruction.
#define SYSRET64() \
	BYTE $0x48; BYTE $0x0f; BYTE $0x07; // SYSRETQ (REX.W SYSRET)
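
// A note on the two return paths (per the Intel/AMD manuals): SYSRETQ resumes
// userspace by loading RIP from RCX and RFLAGS from R11, which is why sysret()
// below stages those two registers specially, while IRETQ pops an explicit
// five-word frame of RIP, CS, FLAGS, SP and SS, which iret() and resume()
// build on the stack by hand.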

// LOAD_KERNEL_STACK loads the kernel stack.
#define LOAD_KERNEL_STACK(entry) \
	MOVQ ENTRY_STACK_TOP(entry), SP;

// ADDR_OF_FUNC defines a function named 'name' that returns the address of
// 'symbol'.
#define ADDR_OF_FUNC(name, symbol) \
TEXT name,$0-8; \
	MOVQ $symbol, AX; \
	MOVQ AX, ret+0(FP); \
	RET
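
// For example, ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB)) below lets Go code
// obtain the address of the raw vCPU entrypoint; the matching Go declaration
// would be something like
//
//	func AddrOfStart() uintptr
//
// (an 8-byte result, matching the $0-8 frame). The indirection is needed
// because Go code cannot take the address of an assembly TEXT symbol directly.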

// See kernel.go.
TEXT ·Halt(SB),NOSPLIT|NOFRAME,$0
	HLT
	RET

// See kernel_amd64.go.
TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8
	HLT

	// Restore FS_BASE.
	MOVQ regs+0(FP), AX
	MOVQ PTRACE_FS_BASE(AX), AX

	PUSHQ AX  // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ AX

	RET

// jumpToKernel changes execution to the kernel address space.
//
// This works by changing the return value to the kernel version.
TEXT ·jumpToKernel(SB),NOSPLIT|NOFRAME,$0
	MOVQ 0(SP), AX
	ORQ ·KernelStartAddress(SB), AX // Future return value.
	MOVQ AX, 0(SP)
	RET

// jumpToUser changes execution to the user address space.
//
// This works by changing the return value to the user version.
TEXT ·jumpToUser(SB),NOSPLIT|NOFRAME,$0
	// N.B. We can't access KernelStartAddress from the upper half (data
	// pages are not available there), so we just clear all the upper bits,
	// assuming a 47-bit virtual address space.
	MOVQ $0x00007fffffffffff, AX
	MOVQ 0(SP), BX
	ANDQ BX, AX // Future return value.
	MOVQ AX, 0(SP)
	RET
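
// Note that jumpToUser is the inverse of jumpToKernel: provided that
// KernelStartAddress only sets bits 47 and above, ORing it in (jumpToKernel)
// and then masking with $0x00007fffffffffff (jumpToUser) recovers the original
// lower-half return address. This relies on both halves mapping the code at
// identical lower-47-bit offsets.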

// See kernel_amd64.go.
//
// The 16-byte frame size is for the saved values of MXCSR and the x87 control
// word.
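//
// For reference, the 48-byte argument+result frame below is consistent with a
// Go-side declaration of roughly this shape (a sketch only; see
// kernel_amd64.go for the authoritative one):
//
//	func doSwitchToUser(cpu *CPU, regs *arch.Registers, fpState *byte,
//		userCR3 uintptr, needIRET uint64) (vector Vector)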
TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48
	// We are passed pointers to heap objects, but do not store them in our
	// local frame.
	NO_LOCAL_POINTERS

	// MXCSR and the x87 control word are the only floating point state
	// that is callee-save, and thus the only state we must save here.
	STMXCSR mxcsr-0(SP)
	FSTCW cw-8(SP)

	// Restore application floating point state.
	MOVQ cpu+0(FP), SI
	MOVQ fpState+16(FP), DI
	MOVB ·hasXSAVE(SB), BX
	TESTB BX, BX
	JZ no_xrstor
	// Use xrstor to restore all available fp state. For now, we restore
	// everything unconditionally by setting the implicit operand edx:eax
	// (the "requested feature bitmap") to all 1's.
	MOVL $0xffffffff, AX
	MOVL $0xffffffff, DX
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI)
	JMP fprestore_done
no_xrstor:
	// Fall back to fxrstor if xsave is not available.
	FXRSTOR64 0(DI)
fprestore_done:

	// Set application GS.
	MOVQ regs+8(FP), R8
	SWAP_GS()
	MOVQ PTRACE_GS_BASE(R8), AX
	PUSHQ AX
	CALL ·writeGS(SB)
	POPQ AX

	// Call sysret() or iret().
	MOVQ userCR3+24(FP), CX
	MOVQ needIRET+32(FP), R9
	ADDQ $-32, SP
	MOVQ SI, 0(SP)  // cpu
	MOVQ R8, 8(SP)  // regs
	MOVQ CX, 16(SP) // userCR3
	TESTQ R9, R9
	JNZ do_iret
	CALL ·sysret(SB)
	JMP done_sysret_or_iret
do_iret:
	CALL ·iret(SB)
done_sysret_or_iret:
	MOVQ 24(SP), AX // vector
	ADDQ $32, SP
	MOVQ AX, ret+40(FP)

	// Save application floating point state.
	MOVQ fpState+16(FP), DI
	MOVB ·hasXSAVE(SB), BX
	MOVB ·hasXSAVEOPT(SB), CX
	TESTB BX, BX
	JZ no_xsave
	// Use xsave/xsaveopt to save all extended state.
	// We save everything unconditionally by setting RFBM to all 1's.
	MOVL $0xffffffff, AX
	MOVL $0xffffffff, DX
	TESTB CX, CX
	JZ no_xsaveopt
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
	JMP fpsave_done
no_xsaveopt:
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
	JMP fpsave_done
no_xsave:
	FXSAVE64 0(DI)
fpsave_done:

	// Restore MXCSR and the x87 control word after one of the floating
	// point save cases above, to ensure the application versions are saved
	// before being clobbered here.
	LDMXCSR mxcsr-0(SP)

	// FLDCW is a "waiting" x87 instruction, meaning it checks for pending
	// unmasked exceptions before executing. Thus if userspace has unmasked
	// an exception and has one pending, it can be raised by FLDCW even
	// though the new control word will mask exceptions. To prevent this,
	// we must first clear pending exceptions (which will be restored by
	// XRSTOR, et al).
	BYTE $0xDB; BYTE $0xE2; // FNCLEX
	FLDCW cw-8(SP)

	RET

// See entry_amd64.go.
TEXT ·sysret(SB),NOSPLIT|NOFRAME,$0-32
	// Set application FS. We can't do this in Go because Go code needs FS.
	MOVQ regs+8(FP), AX
	MOVQ PTRACE_FS_BASE(AX), AX

	PUSHQ AX // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ AX

	CALL ·jumpToKernel(SB)
	// Save original state and stack. sysenter() or exception(), entered
	// from the application (guest ring 3), will switch to this stack, set
	// the return value (vector: 32(SP)) and then do RET, which will also
	// automatically return to the lower half.
	MOVQ cpu+0(FP), BX
	MOVQ regs+8(FP), AX
	MOVQ userCR3+16(FP), CX
	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)

	// Save SP, AX and userCR3 on the kernel stack.
	MOVQ CPU_ENTRY(BX), BX
	LOAD_KERNEL_STACK(BX)
	PUSHQ PTRACE_RSP(AX)
	PUSHQ PTRACE_RAX(AX)
	PUSHQ CX

	// Restore user register state.
	REGISTERS_LOAD(AX, 0)
	MOVQ PTRACE_RIP(AX), CX    // Needed for SYSRET.
	MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.

	// Restore userCR3, AX and SP.
	POPQ AX                             // Get userCR3.
	WRITE_CR3()                         // Switch to userCR3.
	POPQ AX                             // Restore AX.
	POPQ SP                             // Restore SP.
	SYSRET64()
	// sysenter or exception will write our return value and return to our
	// caller.

// See entry_amd64.go.
TEXT ·iret(SB),NOSPLIT|NOFRAME,$0-32
	// Set application FS. We can't do this in Go because Go code needs FS.
	MOVQ regs+8(FP), AX
	MOVQ PTRACE_FS_BASE(AX), AX

	PUSHQ AX // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ AX

	CALL ·jumpToKernel(SB)
	// Save original state and stack. sysenter() or exception(), entered
	// from the application (guest ring 3), will switch to this stack, set
	// the return value (vector: 32(SP)) and then do RET, which will also
	// automatically return to the lower half.
	MOVQ cpu+0(FP), BX
	MOVQ regs+8(FP), AX
	MOVQ userCR3+16(FP), CX
	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)

	// Build an IRET frame & restore state.
	MOVQ CPU_ENTRY(BX), BX
	LOAD_KERNEL_STACK(BX)
	PUSHQ PTRACE_SS(AX)
	PUSHQ PTRACE_RSP(AX)
	PUSHQ PTRACE_FLAGS(AX)
	PUSHQ PTRACE_CS(AX)
	PUSHQ PTRACE_RIP(AX)
	PUSHQ PTRACE_RAX(AX)                // Save AX on kernel stack.
	PUSHQ CX                            // Save userCR3 on kernel stack.
	REGISTERS_LOAD(AX, 0)               // Restore most registers.
	POPQ AX                             // Get userCR3.
	WRITE_CR3()                         // Switch to userCR3.
	POPQ AX                             // Restore AX.
	IRET()
	// sysenter or exception will write our return value and return to our
	// caller.

// See entry_amd64.go.
TEXT ·resume(SB),NOSPLIT|NOFRAME,$0
	// See iret, above.
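	//
	// resume rebuilds an IRETQ frame (SS, RSP, FLAGS, CS, RIP) from the
	// state saved in CPU.registers and returns into the kernel at the
	// saved RIP. AX is restored last, after the frame is built, because
	// REGISTERS_LOAD deliberately skips it (AX holds the vCPU pointer
	// here).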
	MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
	PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
	PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
	PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
	PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
	PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
	REGISTERS_LOAD(AX, CPU_REGISTERS)
	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
	IRET()

// See entry_amd64.go.
TEXT ·start(SB),NOSPLIT|NOFRAME,$0
	// N.B. This is the vCPU entrypoint. It is not called from Go code and
	// thus pushes and pops values on the stack until calling into Go
	// (startGo), because we don't have a typical Go assembly frame here.
	PUSHQ $0x0  // Previous frame pointer.
	MOVQ SP, BP // Set frame pointer.
	PUSHQ AX    // Save CPU.

	// Set up the environment required by Go before calling startGo: Go
	// needs FS_BASE and floating point state initialized.
	MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
	PUSHQ BX          // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ BX

	// First argument (CPU) already at bottom of stack.
	CALL ·startGo(SB) // Call Go hook.
	JMP ·resume(SB)   // Restore to registers.

ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB));

// See entry_amd64.go.
TEXT ·sysenter(SB),NOSPLIT|NOFRAME,$0
	// _RFLAGS_IOPL0 is always set in user mode and never set in kernel
	// mode. See the comment on UserFlagsSet for more details.
	TESTL $_RFLAGS_IOPL0, R11 // After SYSCALL, R11 holds the saved RFLAGS.
	JZ kernel
user:
	SWAP_GS()
	MOVQ AX, ENTRY_SCRATCH0(GS)            // Save user AX on scratch.
	MOVQ ENTRY_KERNEL_CR3(GS), AX          // Get kernel cr3 on AX.
	WRITE_CR3()                            // Switch to kernel cr3.

	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX  // Get user regs.
	REGISTERS_SAVE(AX, 0)                  // Save all except IP, FLAGS, SP, AX.
	MOVQ CX,  PTRACE_RIP(AX)
	MOVQ R11, PTRACE_FLAGS(AX)
	MOVQ SP,  PTRACE_RSP(AX)
	MOVQ ENTRY_SCRATCH0(GS), CX            // Load saved user AX value.
	MOVQ CX,  PTRACE_RAX(AX)               // Save user AX.
	MOVQ CX,  PTRACE_ORIGRAX(AX)

	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP  // Get kernel stack.
	MOVQ $0, CPU_ERROR_CODE(AX)            // Clear error code.
	MOVQ $1, CPU_ERROR_TYPE(AX)            // Set error type to user.

	CALL ·jumpToUser(SB)

	// Restore kernel FS_BASE.
	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX

	PUSHQ BX                               // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ BX

	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.

	// Return to the kernel, where the frame is:
	//
	//	vector      (sp+32)
	//	userCR3     (sp+24)
	//	regs        (sp+16)
	//	cpu         (sp+8)
	//	vcpu.Switch (sp+0)
	//
	MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
	MOVQ $Syscall, 32(SP)                 // Output vector.
	RET

kernel:
	// We can't restore the original stack, but we can access the registers
	// in the CPU state directly. No need for temporary juggling.
	MOVQ AX,  ENTRY_SCRATCH0(GS)
	MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
	REGISTERS_SAVE(AX, CPU_REGISTERS)
	MOVQ CX,  CPU_REGISTERS+PTRACE_RIP(AX)
	MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
	MOVQ SP,  CPU_REGISTERS+PTRACE_RSP(AX)
	MOVQ ENTRY_SCRATCH0(GS), BX
	MOVQ BX,  CPU_REGISTERS+PTRACE_ORIGRAX(AX)
	MOVQ BX,  CPU_REGISTERS+PTRACE_RAX(AX)
	MOVQ $0,  CPU_ERROR_CODE(AX)                // Clear error code.
	MOVQ $0,  CPU_ERROR_TYPE(AX)                // Set error type to kernel.
	MOVQ $0xffffffffffffffff, CPU_VECTOR(AX)    // Set vector to -1 (invalid).

	// Save floating point state. CPU.floatingPointState is a slice, so the
	// first word of CPU.floatingPointState is a pointer to the destination
	// array.
	MOVQ CPU_FPU_STATE(AX), DI
	MOVB CPU_HAS_XSAVE(AX), BX
	MOVB CPU_HAS_XSAVEOPT(AX), CX
	TESTB BX, BX
	JZ no_xsave
	// Use xsave/xsaveopt to save all extended state.
	// We save everything unconditionally by setting RFBM to all 1's.
	MOVL $0xffffffff, AX
	MOVL $0xffffffff, DX
	TESTB CX, CX
	JZ no_xsaveopt
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
	JMP fpsave_done
no_xsaveopt:
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
	JMP fpsave_done
no_xsave:
	FXSAVE64 0(DI)
fpsave_done:

	// Call the syscall trampoline.
	LOAD_KERNEL_STACK(GS)
	MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
	PUSHQ AX                    // First argument (vCPU).
	CALL ·kernelSyscall(SB)     // Call the trampoline.
	POPQ AX                     // Pop vCPU.

	// We only trigger a bluepill entry in the bluepill function, and can
	// therefore be guaranteed that there is no floating point state to be
	// loaded on resuming from halt.
	JMP ·resume(SB)

ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB));

// exception is a generic exception handler.
//
// There are two cases handled:
//
// 1) An exception in kernel mode: this results in saving the state at the
// time of the exception and calling the defined hook.
//
// 2) An exception in user mode (the application): the original kernel frame
// is restored, and the vector & error code are pushed as return values.
//
// See below for the stubs that call exception.
TEXT ·exception(SB),NOSPLIT|NOFRAME,$0
	// Determine whether the exception occurred in kernel mode or user
	// mode, based on the flags. We expect the following stack:
	//
	//	SS          (sp+48)
	//	SP          (sp+40)
	//	FLAGS       (sp+32)
	//	CS          (sp+24)
	//	IP          (sp+16)
	//	ERROR_CODE  (sp+8)
	//	VECTOR      (sp+0)
	//
	TESTL $_RFLAGS_IOPL0, 32(SP)
	JZ kernel

user:
	SWAP_GS()
	ADDQ $-8, SP                            // Adjust for flags.
	MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
	PUSHQ AX                                // Save user AX on stack.
	MOVQ ENTRY_KERNEL_CR3(GS), AX           // Get kernel cr3 on AX.
	WRITE_CR3()                             // Switch to kernel cr3.

	MOVQ ENTRY_CPU_SELF(GS), AX             // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX   // Get user regs.
	REGISTERS_SAVE(AX, 0)                   // Save all except IP, FLAGS, SP, AX.
	POPQ BX                                 // Restore original AX.
	MOVQ BX, PTRACE_RAX(AX)                 // Save it.
	MOVQ BX, PTRACE_ORIGRAX(AX)
	MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
	MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
	MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
	MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
	MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)

	CALL ·jumpToUser(SB)

	// Restore kernel FS_BASE.
	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
	MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX

	PUSHQ BX                               // First argument (FS_BASE)
	CALL ·writeFS(SB)
	POPQ BX

	// Copy out and return.
	MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
	MOVQ 0(SP), BX                        // Load vector.
	MOVQ 8(SP), CX                        // Load error code.
	MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
	MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
	MOVQ CX, CPU_ERROR_CODE(AX)           // Set error code.
	MOVQ $1, CPU_ERROR_TYPE(AX)           // Set error type to user.
	MOVQ BX, 32(SP)                       // Output vector.
	RET

kernel:
	// As per above, we can save directly.
	PUSHQ AX
	MOVQ ENTRY_CPU_SELF(GS), AX                        // Load vCPU.
	REGISTERS_SAVE(AX, CPU_REGISTERS)
	POPQ BX
	MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
	MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
	MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
	MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
	MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)

	// Copy the error code, vector and fault address out to the CPU.
	MOVQ 8(SP), BX                      // Load the error code.
	MOVQ BX, CPU_ERROR_CODE(AX)         // Copy out to the CPU.
	MOVQ 0(SP), BX                      // Load the vector.
	MOVQ BX, CPU_VECTOR(AX)             // Copy out to the CPU.
	BYTE $0x0f; BYTE $0x20; BYTE $0xd3; // MOV CR2, RBX (faulting address, for page faults).
	MOVQ BX, CPU_FAULT_ADDR(AX)
	MOVQ $0, CPU_ERROR_TYPE(AX)         // Set error type to kernel.

	// Save floating point state. CPU.floatingPointState is a slice, so the
	// first word of CPU.floatingPointState is a pointer to the destination
	// array.
	MOVQ CPU_FPU_STATE(AX), DI
	MOVB CPU_HAS_XSAVE(AX), BX
	MOVB CPU_HAS_XSAVEOPT(AX), CX
	TESTB BX, BX
	JZ no_xsave
	// Use xsave/xsaveopt to save all extended state.
	// We save everything unconditionally by setting RFBM to all 1's.
	MOVL $0xffffffff, AX
	MOVL $0xffffffff, DX
	TESTB CX, CX
	JZ no_xsaveopt
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI)
	JMP fpsave_done
no_xsaveopt:
	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI)
	JMP fpsave_done
no_xsave:
	FXSAVE64 0(DI)
fpsave_done:

	// Call the exception trampoline.
	MOVQ 0(SP), BX              // BX contains the vector.
	LOAD_KERNEL_STACK(GS)
	MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU.
	PUSHQ BX                    // Second argument (vector).
	PUSHQ AX                    // First argument (vCPU).
	CALL ·kernelException(SB)   // Call the trampoline.
	POPQ BX                     // Pop vector.
	POPQ AX                     // Pop vCPU.

	// We only trigger a bluepill entry in the bluepill function, and can
	// therefore be guaranteed that there is no floating point state to be
	// loaded on resuming from halt.
	JMP ·resume(SB)

#define EXCEPTION_WITH_ERROR(value, symbol, addr) \
ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT|NOFRAME,$0; \
	PUSHQ $value; \
	JMP ·exception(SB);

#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \
ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT|NOFRAME,$0; \
	PUSHQ $0x0; \
	PUSHQ $value; \
	JMP ·exception(SB);
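
// The hardware pushes an error code only for some vectors (e.g. page faults
// and general protection faults), so EXCEPTION_WITHOUT_ERROR pushes a dummy
// zero in its place. Both variants then push the vector itself, giving every
// stub the uniform VECTOR/ERROR_CODE stack layout that ·exception expects
// (see the frame diagram above).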

EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB))
EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB))
EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB))
EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB))
EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB))
EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB))
EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB))
EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB))
EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB))
EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB))
EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB))
EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB))
EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB))
EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB))
EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB))
EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB))
EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB))
EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB))
EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB))
EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB))
EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB))
EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB))