github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/executor/common_kvm_amd64_syzos.h (about)

     1  // Copyright 2025 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  #ifndef EXECUTOR_COMMON_KVM_AMD64_SYZOS_H
     5  #define EXECUTOR_COMMON_KVM_AMD64_SYZOS_H
     6  
     7  // This file provides guest code running inside the AMD64 KVM.
     8  
     9  #include "common_kvm_syzos.h"
    10  #include "kvm.h"
    11  #include <linux/kvm.h>
    12  #include <stdbool.h>
    13  
    14  // There are no particular rules to assign numbers here, but changing them will
    15  // result in losing some existing reproducers. Therefore, we try to leave spaces
    16  // between unrelated IDs.
    17  // Remember these constants must match those in sys/linux/dev_kvm_amd64.txt.
typedef enum {
	// Basic guest control.
	SYZOS_API_UEXIT = 0, // Exit to the host with a user-supplied code.
	SYZOS_API_CODE = 10, // Execute an arbitrary instruction blob.
	// Plain x86 instructions with parameters.
	SYZOS_API_CPUID = 100,
	SYZOS_API_WRMSR = 101,
	SYZOS_API_RDMSR = 102,
	SYZOS_API_WR_CRN = 103, // Write a control register.
	SYZOS_API_WR_DRN = 104, // Write a debug register.
	SYZOS_API_IN_DX = 105, // Port I/O read.
	SYZOS_API_OUT_DX = 106, // Port I/O write.
	// Interrupt handling.
	SYZOS_API_SET_IRQ_HANDLER = 200,
	// Nested virtualization (VMX on Intel, SVM on AMD).
	SYZOS_API_ENABLE_NESTED = 300,
	SYZOS_API_NESTED_CREATE_VM = 301,
	SYZOS_API_NESTED_LOAD_CODE = 302,
	SYZOS_API_NESTED_VMLAUNCH = 303,
	SYZOS_API_NESTED_VMRESUME = 304,
	SYZOS_API_NESTED_INTEL_VMWRITE_MASK = 340,
	SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK = 380,
	SYZOS_API_STOP, // Must be the last one
} syzos_api_id;
    38  
// Common prefix of every SyzOS API command: the call ID and the total size
// of the command (header included) in bytes.
struct api_call_header {
	uint64 call;
	uint64 size;
};

// SYZOS_API_UEXIT: exit to the host with the given code.
struct api_call_uexit {
	struct api_call_header header;
	uint64 exit_code;
};

// SYZOS_API_CODE: instruction bytes follow the header; their length is
// header.size - sizeof(header).
struct api_call_code {
	struct api_call_header header;
	uint8 insns[];
};

// SYZOS_API_NESTED_LOAD_CODE: instructions to copy into nested VM vm_id.
struct api_call_nested_load_code {
	struct api_call_header header;
	uint64 vm_id;
	uint8 insns[];
};

// SYZOS_API_CPUID: input values for EAX (leaf) and ECX (subleaf).
struct api_call_cpuid {
	struct api_call_header header;
	uint32 eax;
	uint32 ecx;
};

// Generic commands carrying 1, 2, 3 or 5 64-bit arguments.
struct api_call_1 {
	struct api_call_header header;
	uint64 arg;
};

struct api_call_2 {
	struct api_call_header header;
	uint64 args[2];
};

struct api_call_3 {
	struct api_call_header header;
	uint64 args[3];
};

struct api_call_5 {
	struct api_call_header header;
	uint64 args[5];
};

// This struct must match the push/pop order in nested_vm_exit_handler_intel_asm().
// NOTE(review): the asm pushes rax first and r15 last, so RSP (passed as the
// struct pointer) points at the r15 slot; the field order here appears to map
// the registers in reverse. Harmless while `regs` is unused in
// nested_vm_exit_handler_intel(), but verify before relying on the fields.
struct l2_guest_regs {
	uint64 rax, rbx, rcx, rdx, rsi, rdi, rbp;
	uint64 r8, r9, r10, r11, r12, r13, r14, r15;
};
    91  
    92  #ifdef __cplusplus
    93  extern "C" {
    94  #endif
    95  GUEST_CODE static void guest_uexit(uint64 exit_code);
    96  GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs);
    97  #ifdef __cplusplus
    98  }
    99  #endif
   100  GUEST_CODE static void guest_execute_code(uint8* insns, uint64 size);
   101  GUEST_CODE static void guest_handle_cpuid(uint32 eax, uint32 ecx);
   102  GUEST_CODE static void guest_handle_wrmsr(uint64 reg, uint64 val);
   103  GUEST_CODE static void guest_handle_rdmsr(uint64 reg);
   104  GUEST_CODE static void guest_handle_wr_crn(struct api_call_2* cmd);
   105  GUEST_CODE static void guest_handle_wr_drn(struct api_call_2* cmd);
   106  GUEST_CODE static void guest_handle_in_dx(struct api_call_2* cmd);
   107  GUEST_CODE static void guest_handle_out_dx(struct api_call_3* cmd);
   108  GUEST_CODE static void guest_handle_set_irq_handler(struct api_call_2* cmd);
   109  GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id);
   110  GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id);
   111  GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id);
   112  GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id);
   113  GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id);
   114  GUEST_CODE static void guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id);
   115  GUEST_CODE static void guest_handle_nested_amd_vmcb_write_mask(struct api_call_5* cmd, uint64 cpu_id);
   116  
// Special user-exit codes reported to the host via guest_uexit().
typedef enum {
	UEXIT_END = (uint64)-1, // Normal end of the guest program.
	UEXIT_IRQ = (uint64)-2, // An interrupt was delivered (see uexit_irq_handler()).
	UEXIT_ASSERT = (uint64)-3, // Unexpected condition in the guest code.
} uexit_code;

// CPU vendor, detected via CPUID leaf 0 (see get_cpu_vendor()).
typedef enum {
	CPU_VENDOR_INTEL,
	CPU_VENDOR_AMD,
} cpu_vendor_id;
   127  
// No-op interrupt handler: immediately returns from the interrupt.
// naked: the compiler must not emit a prologue/epilogue around the bare iretq.
__attribute__((naked))
GUEST_CODE static void
dummy_null_handler()
{
	asm("iretq");
}
   134  
// Interrupt handler that reports the IRQ to the host before returning.
// The immediate $-2 must stay in sync with UEXIT_IRQ above.
// NOTE(review): this calls a C function from a naked handler without
// saving caller-clobbered registers; the interrupted guest code presumably
// tolerates that — confirm before reusing this handler elsewhere.
__attribute__((naked)) GUEST_CODE static void uexit_irq_handler()
{
	asm volatile(R"(
	    // Call guest_uexit(UEXIT_IRQ).
	    movq $-2, %rdi
	    call guest_uexit

	    iretq
	)");
}
   145  
   146  // Main guest function that performs necessary setup and passes the control to the user-provided
   147  // payload.
   148  // The inner loop uses a complex if-statement, because Clang is eager to insert a jump table into
   149  // a switch statement.
   150  
   151  // TODO(glider): executor/style_test.go insists that single-line compound statements should not
   152  // be used e.g. in the following case:
   153  //   if (call == SYZOS_API_UEXIT) {
   154  //     struct api_call_uexit* ucmd = (struct api_call_uexit*)cmd;
   155  //     guest_uexit(ucmd->exit_code);
   156  //   } else if (call == SYZOS_API_WR_CRN) {
   157  //     guest_handle_wr_crn((struct api_call_2*)cmd);  // Style check fails here
   158  //   }
   159  // , i.e. when the braces are consistent with the rest of the code, even despite this violates the
   160  // Google C++ style guide.
   161  // We add single-line comments to justify having the compound statements below.
   162  __attribute__((used))
   163  GUEST_CODE static void
   164  guest_main(uint64 size, uint64 cpu)
   165  {
   166  	uint64 addr = X86_SYZOS_ADDR_USER_CODE + cpu * KVM_PAGE_SIZE;
   167  
   168  	while (size >= sizeof(struct api_call_header)) {
   169  		struct api_call_header* cmd = (struct api_call_header*)addr;
   170  		if (cmd->call >= SYZOS_API_STOP)
   171  			return;
   172  		if (cmd->size > size)
   173  			return;
   174  		volatile uint64 call = cmd->call;
   175  		if (call == SYZOS_API_UEXIT) {
   176  			// Issue a user exit.
   177  			struct api_call_uexit* ucmd = (struct api_call_uexit*)cmd;
   178  			guest_uexit(ucmd->exit_code);
   179  		} else if (call == SYZOS_API_CODE) {
   180  			// Execute an instruction blob.
   181  			struct api_call_code* ccmd = (struct api_call_code*)cmd;
   182  			guest_execute_code(ccmd->insns, cmd->size - sizeof(struct api_call_header));
   183  		} else if (call == SYZOS_API_CPUID) {
   184  			// Issue CPUID.
   185  			struct api_call_cpuid* ccmd = (struct api_call_cpuid*)cmd;
   186  			guest_handle_cpuid(ccmd->eax, ccmd->ecx);
   187  		} else if (call == SYZOS_API_WRMSR) {
   188  			// Write an MSR register.
   189  			struct api_call_2* ccmd = (struct api_call_2*)cmd;
   190  			guest_handle_wrmsr(ccmd->args[0], ccmd->args[1]);
   191  		} else if (call == SYZOS_API_RDMSR) {
   192  			// Read an MSR register.
   193  			struct api_call_1* ccmd = (struct api_call_1*)cmd;
   194  			guest_handle_rdmsr(ccmd->arg);
   195  		} else if (call == SYZOS_API_WR_CRN) {
   196  			// Write value to a control register.
   197  			guest_handle_wr_crn((struct api_call_2*)cmd);
   198  		} else if (call == SYZOS_API_WR_DRN) {
   199  			// Write value to a debug register.
   200  			guest_handle_wr_drn((struct api_call_2*)cmd);
   201  		} else if (call == SYZOS_API_IN_DX) {
   202  			// Read data from an I/O port.
   203  			guest_handle_in_dx((struct api_call_2*)cmd);
   204  		} else if (call == SYZOS_API_OUT_DX) {
   205  			// Write data to an I/O port.
   206  			guest_handle_out_dx((struct api_call_3*)cmd);
   207  		} else if (call == SYZOS_API_SET_IRQ_HANDLER) {
   208  			// Set the handler for a particular IRQ.
   209  			guest_handle_set_irq_handler((struct api_call_2*)cmd);
   210  		} else if (call == SYZOS_API_ENABLE_NESTED) {
   211  			// Enable nested virtualization.
   212  			guest_handle_enable_nested((struct api_call_1*)cmd, cpu);
   213  		} else if (call == SYZOS_API_NESTED_CREATE_VM) {
   214  			// Create a nested VM.
   215  			guest_handle_nested_create_vm((struct api_call_1*)cmd, cpu);
   216  		} else if (call == SYZOS_API_NESTED_LOAD_CODE) {
   217  			// Load code into the nested VM.
   218  			guest_handle_nested_load_code((struct api_call_nested_load_code*)cmd, cpu);
   219  		} else if (call == SYZOS_API_NESTED_VMLAUNCH) {
   220  			// Launch the nested VM.
   221  			guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu);
   222  		} else if (call == SYZOS_API_NESTED_VMRESUME) {
   223  			// Resume a nested VM.
   224  			guest_handle_nested_vmresume((struct api_call_1*)cmd, cpu);
   225  		} else if (call == SYZOS_API_NESTED_INTEL_VMWRITE_MASK) {
   226  			// Write to a VMCS field using masks.
   227  			guest_handle_nested_intel_vmwrite_mask((struct api_call_5*)cmd, cpu);
   228  		} else if (call == SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK) {
   229  			// Write to a VMCB field using masks.
   230  			guest_handle_nested_amd_vmcb_write_mask((struct api_call_5*)cmd, cpu);
   231  		}
   232  		addr += cmd->size;
   233  		size -= cmd->size;
   234  	};
   235  	guest_uexit((uint64)-1);
   236  }
   237  
// Jump into a user-supplied instruction blob by casting the data pointer to
// a function pointer; execution resumes here when the blob returns.
// `size` is currently unused: the blob itself decides when to stop.
GUEST_CODE static noinline void guest_execute_code(uint8* insns, uint64 size)
{
	volatile void (*fn)() = (volatile void (*)())insns;
	fn();
}
   243  
   244  // Perform a userspace exit that can be handled by the host.
   245  // The host returns from ioctl(KVM_RUN) with kvm_run.exit_reason=KVM_EXIT_MMIO,
   246  // and can handle the call depending on the data passed as exit code.
   247  
   248  // Make sure the compiler does not optimize this function away, it is called from
   249  // assembly.
__attribute__((used))
GUEST_CODE static noinline void
guest_uexit(uint64 exit_code)
{
	// X86_SYZOS_ADDR_UEXIT is not backed by guest memory, so this store
	// triggers an MMIO exit carrying exit_code to the host.
	volatile uint64* ptr = (volatile uint64*)X86_SYZOS_ADDR_UEXIT;
	*ptr = exit_code;
}
   257  
// Execute CPUID with the given leaf (EAX) and subleaf (ECX).
GUEST_CODE static noinline void guest_handle_cpuid(uint32 eax, uint32 ecx)
{
	asm volatile(
	    "cpuid\n"
	    : // Currently ignore outputs
	    : "a"(eax), "c"(ecx)
	    : "rbx", "rdx");
}
   266  
// WRMSR helper: MSR address in ECX, value split across EDX:EAX.
GUEST_CODE static noinline void wrmsr(uint64 reg, uint64 val)
{
	asm volatile(
	    "wrmsr"
	    :
	    : "c"(reg),
	      "a"((uint32)val),
	      "d"((uint32)(val >> 32))
	    : "memory");
}
   277  
// Write val into an MSR register reg.
// Thin wrapper so the API dispatcher has a uniform noinline entry point.
GUEST_CODE static noinline void guest_handle_wrmsr(uint64 reg, uint64 val)
{
	wrmsr(reg, val);
}
   283  
// RDMSR helper: returns the 64-bit value of the given MSR.
GUEST_CODE static noinline uint64 rdmsr(uint64 msr_id)
{
	uint32 low = 0, high = 0; // nolint
	// The RDMSR instruction takes the MSR address in ecx.
	// It puts the lower 32 bits of the MSR value into eax, and the upper
	// 32 bits of the MSR value into edx.
	asm volatile("rdmsr" : "=a"(low), "=d"(high) : "c"(msr_id));
	return ((uint64)high << 32) | low;
}
   293  
// Read an MSR register, ignore the result.
// The interesting outcome is the side effect in KVM (e.g. a #GP injection),
// not the value itself.
GUEST_CODE static noinline void guest_handle_rdmsr(uint64 reg)
{
	(void)rdmsr(reg);
}
   299  
// Write to CRn control register.
// Only CR0/CR2/CR3/CR4/CR8 exist; other indices are silently ignored.
// Invalid values may raise #GP in the guest, which the host then observes.
GUEST_CODE static noinline void guest_handle_wr_crn(struct api_call_2* cmd)
{
	uint64 value = cmd->args[1];
	// Prevent the compiler from generating a switch table.
	volatile uint64 reg = cmd->args[0];
	if (reg == 0) {
		// Move value to CR0.
		asm volatile("movq %0, %%cr0" ::"r"(value) : "memory");
		return;
	}
	if (reg == 2) {
		// Move value to CR2.
		asm volatile("movq %0, %%cr2" ::"r"(value) : "memory");
		return;
	}
	if (reg == 3) {
		// Move value to CR3.
		asm volatile("movq %0, %%cr3" ::"r"(value) : "memory");
		return;
	}
	if (reg == 4) {
		// Move value to CR4.
		asm volatile("movq %0, %%cr4" ::"r"(value) : "memory");
		return;
	}
	if (reg == 8) {
		// Move value to CR8 (TPR - Task Priority Register).
		asm volatile("movq %0, %%cr8" ::"r"(value) : "memory");
		return;
	}
}
   332  
// Write to DRn debug register.
// The volatile reg + if-chain prevents the compiler from emitting a jump table.
// NOTE(review): DR4/DR5 are typically aliases of DR6/DR7 (or #UD when CR4.DE
// is set) — exercising them here is presumably intentional for fuzzing.
GUEST_CODE static noinline void guest_handle_wr_drn(struct api_call_2* cmd)
{
	uint64 value = cmd->args[1];
	volatile uint64 reg = cmd->args[0];
	if (reg == 0) {
		asm volatile("movq %0, %%dr0" ::"r"(value) : "memory");
		return;
	}
	if (reg == 1) {
		asm volatile("movq %0, %%dr1" ::"r"(value) : "memory");
		return;
	}
	if (reg == 2) {
		asm volatile("movq %0, %%dr2" ::"r"(value) : "memory");
		return;
	}
	if (reg == 3) {
		asm volatile("movq %0, %%dr3" ::"r"(value) : "memory");
		return;
	}
	if (reg == 4) {
		asm volatile("movq %0, %%dr4" ::"r"(value) : "memory");
		return;
	}
	if (reg == 5) {
		asm volatile("movq %0, %%dr5" ::"r"(value) : "memory");
		return;
	}
	if (reg == 6) {
		asm volatile("movq %0, %%dr6" ::"r"(value) : "memory");
		return;
	}
	if (reg == 7) {
		asm volatile("movq %0, %%dr7" ::"r"(value) : "memory");
		return;
	}
}
   371  
// Read data from an I/O port, should result in KVM_EXIT_IO.
// args[0] is the port, args[1] the access width (1, 2 or 4 bytes);
// any other width is a no-op. The value read is discarded.
GUEST_CODE static noinline void guest_handle_in_dx(struct api_call_2* cmd)
{
	uint16 port = cmd->args[0];
	// volatile keeps the compiler from collapsing the ifs into a jump table.
	volatile int size = cmd->args[1];

	if (size == 1) {
		uint8 unused;
		// Reads 1 byte from the port in DX into AL.
		asm volatile("inb %1, %0" : "=a"(unused) : "d"(port));
		return;
	}
	if (size == 2) {
		uint16 unused;
		// Reads 2 bytes from the port in DX into AX.
		asm volatile("inw %1, %0" : "=a"(unused) : "d"(port));
		return;
	}
	if (size == 4) {
		uint32 unused;
		// Reads 4 bytes from the port in DX into EAX.
		asm volatile("inl %1, %0" : "=a"(unused) : "d"(port));
	}
	return;
}
   397  
// Write data to an I/O port, should result in KVM_EXIT_IO.
// args[0] is the port, args[1] the access width (1, 2 or 4 bytes, others
// ignored), args[2] the value (truncated to the width).
GUEST_CODE static noinline void guest_handle_out_dx(struct api_call_3* cmd)
{
	uint16 port = cmd->args[0];
	// volatile keeps the compiler from collapsing the ifs into a jump table.
	volatile int size = cmd->args[1];
	uint32 data = (uint32)cmd->args[2];

	if (size == 1) {
		// Writes 1 byte from AL to the port in DX.
		asm volatile("outb %b0, %w1" ::"a"(data), "d"(port));
		return;
	}
	if (size == 2) {
		// Writes 2 bytes from AX to the port in DX.
		asm volatile("outw %w0, %w1" ::"a"(data), "d"(port));
		return;
	}
	if (size == 4) {
		// Writes 4 bytes from EAX to the port in DX.
		asm volatile("outl %k0, %w1" ::"a"(data), "d"(port));
		return;
	}
}
   421  
// See https://wiki.osdev.org/Interrupt_Descriptor_Table#Gate_Descriptor_2.
// 16-byte long-mode IDT gate descriptor; the handler address is split across
// the three offset fields.
struct idt_entry_64 {
	uint16 offset_low; // Handler address bits 0..15.
	uint16 selector; // Code segment selector.
	// Interrupt Stack Table offset in bits 0..2
	uint8 ist;
	// Gate Type, P and DPL.
	uint8 type_attr;
	uint16 offset_mid; // Handler address bits 16..31.
	uint32 offset_high; // Handler address bits 32..63.
	uint32 reserved;
} __attribute__((packed));
   434  
   435  // IDT gate setup should be similar to syzos_setup_idt() in the host code.
   436  GUEST_CODE static void set_idt_gate(uint8 vector, uint64 handler)
   437  {
   438  	volatile struct idt_entry_64* idt =
   439  	    (volatile struct idt_entry_64*)(X86_SYZOS_ADDR_VAR_IDT);
   440  	volatile struct idt_entry_64* idt_entry = &idt[vector];
   441  	idt_entry->offset_low = (uint16)handler;
   442  	idt_entry->offset_mid = (uint16)(handler >> 16);
   443  	idt_entry->offset_high = (uint32)(handler >> 32);
   444  	idt_entry->selector = X86_SYZOS_SEL_CODE;
   445  	idt_entry->type_attr = 0x8E;
   446  	idt_entry->ist = 0;
   447  	idt_entry->reserved = 0;
   448  }
   449  
// Install an IRQ handler for vector args[0].
// args[1] selects the handler: 1 - dummy_null_handler (plain iretq),
// 2 - uexit_irq_handler (reports UEXIT_IRQ to the host).
// Any other value installs a gate pointing at address 0.
GUEST_CODE static noinline void guest_handle_set_irq_handler(struct api_call_2* cmd)
{
	uint8 vector = (uint8)cmd->args[0];
	uint64 type = cmd->args[1];
	volatile uint64 handler_addr = 0;
	if (type == 1)
		handler_addr = executor_fn_guest_addr(dummy_null_handler);
	else if (type == 2)
		handler_addr = executor_fn_guest_addr(uexit_irq_handler);
	set_idt_gate(vector, handler_addr);
}
   461  
// Detect the CPU vendor via CPUID leaf 0: EBX returns the first four bytes
// of the vendor string ("Genu" for Intel, "Auth" for AMD).
GUEST_CODE static cpu_vendor_id get_cpu_vendor(void)
{
	uint32 ebx, eax = 0;

	asm volatile(
	    "cpuid"
	    : "+a"(eax), "=b"(ebx)
	    : // No explicit inputs, EAX is handled by +a.
	    : "ecx", "edx");

	if (ebx == 0x756e6547) { // "Genu[ineIntel]".
		return CPU_VENDOR_INTEL;
	} else if (ebx == 0x68747541) { // "Auth[enticAMD]".
		return CPU_VENDOR_AMD;
	} else {
		// Should not happen on AMD64, but for completeness.
		guest_uexit(UEXIT_ASSERT);
		return CPU_VENDOR_INTEL; // Default to Intel if unknown.
	}
}
   482  
// Return the current value of CR0.
GUEST_CODE static inline uint64 read_cr0(void)
{
	uint64 val;
	asm volatile("mov %%cr0, %0" : "=r"(val));
	return val;
}

// Return the current value of CR3 (page table base).
GUEST_CODE static inline uint64 read_cr3(void)
{
	uint64 val;
	asm volatile("mov %%cr3, %0" : "=r"(val));
	return val;
}

// Return the current value of CR4.
GUEST_CODE static inline uint64 read_cr4(void)
{
	uint64 val;
	asm volatile("mov %%cr4, %0" : "=r"(val));
	return val;
}

// Load a new value into CR4 (used to set CR4.VMXE before VMXON).
GUEST_CODE static inline void write_cr4(uint64 val)
{
	asm volatile("mov %0, %%cr4" : : "r"(val));
}
   508  
// Write `value` into VMCS field `field` of the current VMCS.
// A VMfail is reported to the host as UEXIT_ASSERT.
GUEST_CODE static noinline void vmwrite(uint64 field, uint64 value)
{
	uint8 error = 0; // nolint
	// 'setna' sets the byte to 1 if CF=1 or ZF=1 (VMfail)
	asm volatile("vmwrite %%rax, %%rbx; setna %0"
		     : "=q"(error)
		     : "a"(value), "b"(field)
		     : "cc", "memory");
	if (error)
		guest_uexit(UEXIT_ASSERT);
}
   520  
// Read VMCS field `field` from the current VMCS.
// No error checking: on VMfail the returned value is whatever was left in RAX.
GUEST_CODE static noinline uint64 vmread(uint64 field)
{
	uint64 value;
	asm volatile("vmread %%rbx, %%rax"
		     : "=a"(value)
		     : "b"(field)
		     : "cc");
	return value;
}
   530  
// Make the VMCS of nested VM (cpu_id, vm_id) the current VMCS via VMPTRLD.
// On failure, reports the magic code 0xE2BAD2 to the host.
GUEST_CODE static inline void nested_vmptrld(uint64 cpu_id, uint64 vm_id)
{
	uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint8 error = 0; // nolint
	asm volatile("vmptrld %1; setna %0"
		     : "=q"(error)
		     : "m"(vmcs_addr)
		     : "memory", "cc");
	if (error)
		guest_uexit(0xE2BAD2);
}
   542  
// Store a 16-bit value at byte offset `offset` inside the VMCB (AMD).
GUEST_CODE static noinline void vmcb_write16(uint64 vmcb, uint16 offset, uint16 val)
{
	*((volatile uint16*)(vmcb + offset)) = val;
}

// Store a 32-bit value at byte offset `offset` inside the VMCB (AMD).
GUEST_CODE static noinline void vmcb_write32(uint64 vmcb, uint16 offset, uint32 val)
{
	*((volatile uint32*)(vmcb + offset)) = val;
}

// Store a 64-bit value at byte offset `offset` inside the VMCB (AMD).
GUEST_CODE static noinline void vmcb_write64(uint64 vmcb, uint16 offset, uint64 val)
{
	*((volatile uint64*)(vmcb + offset)) = val;
}

// Load a 64-bit value from byte offset `offset` inside the VMCB (AMD).
GUEST_CODE static noinline uint64 vmcb_read64(volatile uint8* vmcb, uint16 offset)
{
	return *((volatile uint64*)(vmcb + offset));
}
   562  
   563  GUEST_CODE static void guest_memset(void* s, uint8 c, int size)
   564  {
   565  	volatile uint8* p = (volatile uint8*)s;
   566  	for (int i = 0; i < size; i++)
   567  		p[i] = c;
   568  }
   569  
   570  GUEST_CODE static void guest_memcpy(void* dst, void* src, int size)
   571  {
   572  	volatile uint8* d = (volatile uint8*)dst;
   573  	volatile uint8* s = (volatile uint8*)src;
   574  	for (int i = 0; i < size; i++)
   575  		d[i] = s[i];
   576  }
   577  
   578  GUEST_CODE static noinline void
   579  nested_enable_vmx_intel(uint64 cpu_id)
   580  {
   581  	uint64 vmxon_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id);
   582  	uint64 cr4 = read_cr4();
   583  	cr4 |= X86_CR4_VMXE;
   584  	write_cr4(cr4);
   585  
   586  	uint64 feature_control = rdmsr(X86_MSR_IA32_FEATURE_CONTROL);
   587  	// Check if Lock bit (bit 0) is clear.
   588  	if ((feature_control & 1) == 0) {
   589  		// If unlocked, set Lock bit (bit 0) and Enable VMX outside SMX bit (bit 2).
   590  		feature_control |= 0b101;
   591  		asm volatile("wrmsr" : : "d"(0x0), "c"(X86_MSR_IA32_FEATURE_CONTROL), "A"(feature_control));
   592  	}
   593  
   594  	// Store revision ID at the beginning of VMXON.
   595  	*(uint32*)vmxon_addr = rdmsr(X86_MSR_IA32_VMX_BASIC);
   596  	uint8 error;
   597  	// Can't use enter_vmx_operation() yet, because VMCS is not valid.
   598  	asm volatile("vmxon %1; setna %0"
   599  		     : "=q"(error)
   600  		     : "m"(vmxon_addr)
   601  		     : "memory", "cc");
   602  	if (error) {
   603  		guest_uexit(0xE2BAD0);
   604  		return;
   605  	}
   606  }
   607  
   608  GUEST_CODE static noinline void
   609  nested_enable_svm_amd(uint64 cpu_id)
   610  {
   611  	// Get the Host Save Area (HSAVE) physical address for this CPU.
   612  	// The HSAVE area stores the host processor's state on VMRUN and is restored on VMEXIT.
   613  	uint64 hsave_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id);
   614  
   615  	// Set the SVM Enable (SVME) bit in EFER. This enables SVM operations.
   616  	uint64 efer = rdmsr(X86_MSR_IA32_EFER);
   617  	efer |= X86_EFER_SVME;
   618  	wrmsr(X86_MSR_IA32_EFER, efer);
   619  
   620  	// Write the physical address of the HSAVE area to the VM_HSAVE_PA MSR.
   621  	// This MSR tells the CPU where to save/restore host state during VMRUN/VMEXIT.
   622  	wrmsr(X86_MSR_VM_HSAVE_PA, hsave_addr);
   623  }
   624  
   625  GUEST_CODE static noinline void
   626  guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id)
   627  {
   628  	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
   629  		nested_enable_vmx_intel(cpu_id);
   630  	} else {
   631  		nested_enable_svm_amd(cpu_id);
   632  	}
   633  }
   634  
// Build identity-mapped 4-level page tables for an L2 guest (EPT on Intel,
// NPT on AMD). Four consecutive pages hold PML4/PDPT/PD/PT and map the first
// 2MB of guest-physical memory with 4KB pages. Also clears the per-VM MSR
// bitmap page.
GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id)
{
	uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
	uint64 l2_pdpt_addr = l2_pml4_addr + KVM_PAGE_SIZE;
	uint64 l2_pd_addr = l2_pml4_addr + 2 * KVM_PAGE_SIZE;
	uint64 l2_pt_addr = l2_pml4_addr + 3 * KVM_PAGE_SIZE;

	volatile uint64* pml4 = (volatile uint64*)l2_pml4_addr;
	volatile uint64* pdpt = (volatile uint64*)l2_pdpt_addr;
	volatile uint64* pd = (volatile uint64*)l2_pd_addr;
	volatile uint64* pt = (volatile uint64*)l2_pt_addr;

	guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE);
	guest_memset((void*)l2_pdpt_addr, 0, KVM_PAGE_SIZE);
	guest_memset((void*)l2_pd_addr, 0, KVM_PAGE_SIZE);
	guest_memset((void*)l2_pt_addr, 0, KVM_PAGE_SIZE);
	guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE);

	// Intel EPT: set Read, Write, Execute.
	// AMD NPT: set Present, Write, User.
	uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER;
	// Create the 4-level page table entries using 4KB pages:
	//    PML4[0] -> points to PDPT
	pml4[0] = l2_pdpt_addr | flags;
	//    PDPT[0] -> points to Page Directory (PD)
	pdpt[0] = l2_pd_addr | flags;
	//    PD[0]   -> points to Page Table (PT) (NO X86_PDE64_PS)
	pd[0] = l2_pt_addr | flags;
	//    PT[0..511] -> maps 512 4KB pages (2MB total) identity
	uint64 pt_flags = flags;
	if (vendor == CPU_VENDOR_INTEL) {
		// Pre-set accessed/dirty and write-back memtype in the leaf entries.
		pt_flags |= EPT_MEMTYPE_WB | EPT_ACCESSED | EPT_DIRTY;
	} else {
		pt_flags |= X86_PDE64_ACCESSED | X86_PDE64_DIRTY;
	}
	for (int i = 0; i < 512; i++)
		pt[i] = (i * KVM_PAGE_SIZE) | pt_flags;
}
   673  
// Initialize the "control" portion of the current VMCS: pin/CPU-based
// execution controls, exit/entry controls, the EPT pointer, CR shadows and
// the various bitmaps. Assumes the target VMCS was made current via vmptrld.
GUEST_CODE static noinline void init_vmcs_control_fields(uint64 cpu_id, uint64 vm_id)
{
	// Read and write Pin-Based controls from TRUE MSR.
	uint64 vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS);
	vmwrite(VMCS_PIN_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);

	// Setup Secondary Processor-Based controls: enable EPT.
	vmx_msr = (uint32)rdmsr(X86_MSR_IA32_VMX_PROCBASED_CTLS2);
	vmx_msr |= SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_ENABLE_RDTSCP;
	vmwrite(VMCS_SECONDARY_VM_EXEC_CONTROL, vmx_msr);

	// Read and write Primary Processor-Based controls from TRUE MSR.
	// We also add the bit to enable the secondary controls.
	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
	vmx_msr |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	// Exit on HLT and RDTSC.
	vmx_msr |= CPU_BASED_HLT_EXITING | CPU_BASED_RDTSC_EXITING;
	vmwrite(VMCS_CPU_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);

	// Set up VM-Exit controls via TRUE MSR: indicate a 64-bit host.
	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS);
	vmwrite(VMCS_VM_EXIT_CONTROLS, (uint32)vmx_msr | VM_EXIT_HOST_ADDR_SPACE_SIZE);
	// Read and write VM-Entry controls from TRUE MSR
	// We add the bit to indicate a 64-bit guest.
	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS);
	vmwrite(VMCS_VM_ENTRY_CONTROLS, (uint32)vmx_msr | VM_ENTRY_IA32E_MODE);

	// Set up the EPT Pointer.
	// We use the L2 PML4 address we calculate in guest_handle_create_nested_vm.
	// The EPT Pointer has:
	// - Memory Type = 6 (Write-Back)
	// - Page-Walk Length = 3 (meaning 4 levels: PML4, PDPT, PD, PT)
	// - Address of the PML4 table
	uint64 eptp = (X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id) & ~0xFFF) | (6 << 0) | (3 << 3);
	vmwrite(VMCS_EPT_POINTER, eptp);

	// Set CR0/CR4 masks and shadows.
	// This simple setup (masks=0) means any guest CR0/CR4 write is allowed
	// and won't cause a VM-Exit.
	vmwrite(VMCS_CR0_GUEST_HOST_MASK, 0);
	vmwrite(VMCS_CR4_GUEST_HOST_MASK, 0);
	vmwrite(VMCS_CR0_READ_SHADOW, read_cr0());
	vmwrite(VMCS_CR4_READ_SHADOW, read_cr4());

	// Disable the bitmaps which we do not use.
	vmwrite(VMCS_MSR_BITMAP, 0);
	vmwrite(VMCS_VMREAD_BITMAP, 0);
	vmwrite(VMCS_VMWRITE_BITMAP, 0);

	// Intercept #UD (Invalid Opcode)
	vmwrite(VMCS_EXCEPTION_BITMAP, (1 << 6));

	// Clear unused/unsupported fields.
	// TODO(glider): do we need these?
	vmwrite(VMCS_VIRTUAL_PROCESSOR_ID, 0);
	vmwrite(VMCS_POSTED_INTR_NV, 0);
	vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MATCH, -1);
	vmwrite(VMCS_CR3_TARGET_COUNT, 0);
	vmwrite(VMCS_VM_EXIT_MSR_STORE_COUNT, 0);
	vmwrite(VMCS_VM_EXIT_MSR_LOAD_COUNT, 0);
	vmwrite(VMCS_VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmwrite(VMCS_VM_ENTRY_INTR_INFO_FIELD, 0);
	vmwrite(VMCS_TPR_THRESHOLD, 0);
}
   739  
// Common L2 exit reasons for Intel and AMD.
// Vendor-specific exit codes are normalized to these values (see
// map_intel_exit_reason()/map_amd_exit_reason()) before being reported.
typedef enum {
	SYZOS_NESTED_EXIT_REASON_HLT = 1,
	SYZOS_NESTED_EXIT_REASON_INVD = 2,
	SYZOS_NESTED_EXIT_REASON_CPUID = 3,
	SYZOS_NESTED_EXIT_REASON_RDTSC = 4,
	SYZOS_NESTED_EXIT_REASON_RDTSCP = 5,
	SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF, // Anything we do not map.
} syz_nested_exit_reason;
   749  
   750  GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason mapped_reason,
   751  				      cpu_vendor_id vendor)
   752  {
   753  	if (mapped_reason != SYZOS_NESTED_EXIT_REASON_UNKNOWN) {
   754  		guest_uexit(0xe2e20000 | mapped_reason);
   755  	} else if (vendor == CPU_VENDOR_INTEL) {
   756  		guest_uexit(0xe2110000 | exit_reason);
   757  	} else {
   758  		guest_uexit(0xe2aa0000 | exit_reason);
   759  	}
   760  }
   761  
// Intel VMX basic exit reasons we recognize (SDM Vol. 3C, Appendix C).
#define EXIT_REASON_CPUID 0xa
#define EXIT_REASON_HLT 0xc
#define EXIT_REASON_INVD 0xd
#define EXIT_REASON_RDTSC 0x10
#define EXIT_REASON_RDTSCP 0x33
   767  
// Normalize an Intel basic exit reason to the vendor-independent
// syz_nested_exit_reason values.
GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reason)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	if (reason == EXIT_REASON_HLT)
		return SYZOS_NESTED_EXIT_REASON_HLT;
	if (reason == EXIT_REASON_INVD)
		return SYZOS_NESTED_EXIT_REASON_INVD;
	if (reason == EXIT_REASON_CPUID)
		return SYZOS_NESTED_EXIT_REASON_CPUID;
	if (reason == EXIT_REASON_RDTSC)
		return SYZOS_NESTED_EXIT_REASON_RDTSC;
	if (reason == EXIT_REASON_RDTSCP)
		return SYZOS_NESTED_EXIT_REASON_RDTSCP;
	return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
   784  
// Move the L2 guest RIP past the instruction that caused the exit, using
// hardcoded opcode lengths (INVD/CPUID/RDTSC are 2 bytes, RDTSCP is 3).
// NOTE(review): HLT (and unknown reasons) leave RIP unchanged, so resuming
// re-executes the instruction — confirm this is intended.
GUEST_CODE static void advance_l2_rip_intel(uint64 basic_reason)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	uint64 rip = vmread(VMCS_GUEST_RIP);
	if ((reason == EXIT_REASON_INVD) || (reason == EXIT_REASON_CPUID) ||
	    (reason == EXIT_REASON_RDTSC)) {
		rip += 2;
	} else if (reason == EXIT_REASON_RDTSCP) {
		// We insist on a single-line compound statement for else-if.
		rip += 3;
	}
	vmwrite(VMCS_GUEST_RIP, rip);
}
   799  
// This function is called from inline assembly.
// C part of the Intel L2 exit handler: report the exit to the host and skip
// the exiting instruction. `regs` (the saved L2 GPRs) is currently unused.
__attribute__((used))
GUEST_CODE static void
nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs)
{
	// Bits 15:0 of the VMCS exit reason hold the basic reason.
	uint64 basic_reason = exit_reason & 0xFFFF;
	syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason);
	guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL);
	advance_l2_rip_intel(basic_reason);
}
   810  
// Defined in the VMLAUNCH/VMRESUME path (not in this file): the L1 code that
// execution rejoins after an L2 exit has been handled.
extern char after_vmentry_label;
// Assembly entry point for Intel L2 VM exits (installed as VMCS host RIP):
// saves guest GPRs, calls the C handler with the exit reason, then jumps
// back to the L1 main flow. naked: no compiler prologue/epilogue.
__attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(void)
{
	asm volatile(R"(
      // Save L2's GPRs. This creates the 'struct l2_guest_regs' on the stack.
      // The order MUST match the struct.
      push %%rax
      push %%rbx
      push %%rcx
      push %%rdx
      push %%rsi
      push %%rdi
      push %%rbp
      push %%r8
      push %%r9
      push %%r10
      push %%r11
      push %%r12
      push %%r13
      push %%r14
      push %%r15

      // Prepare arguments for the C handler:
      //    arg1 (RDI) = exit_reason
      //    arg2 (RSI) = pointer to the saved registers
      mov %%rsp, %%rsi
      mov %[vm_exit_reason], %%rbx
      vmread %%rbx, %%rdi

      // Call the C handler.
      call nested_vm_exit_handler_intel

      // The C handler has processed the exit. Now, return to the L1 command
      // processing loop. VMX remains enabled.
      add %[stack_cleanup_size], %%rsp

      // Jump to L1 main flow
      jmp after_vmentry_label
	)"

		     : : [stack_cleanup_size] "i"(sizeof(struct l2_guest_regs)),
			 [vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi");
}
   854  
// AMD SVM exit codes (VMCB EXITCODE values) for the intercepts this guest
// code maps to syz_nested_exit_reason.
#define VMEXIT_RDTSC 0x6e
#define VMEXIT_CPUID 0x72
#define VMEXIT_INVD 0x76
#define VMEXIT_HLT 0x78
#define VMEXIT_RDTSCP 0x87
   860  
// Translates an AMD SVM EXITCODE into the vendor-neutral
// syz_nested_exit_reason value that is reported to the host.
GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason)
{
	// Disable optimizations.
	// NOTE(review): the volatile copy presumably keeps the compiler from
	// transforming the comparison chain (e.g. into a table) -- TODO confirm.
	volatile uint64 reason = basic_reason;
	if (reason == VMEXIT_HLT)
		return SYZOS_NESTED_EXIT_REASON_HLT;
	if (reason == VMEXIT_INVD)
		return SYZOS_NESTED_EXIT_REASON_INVD;
	if (reason == VMEXIT_CPUID)
		return SYZOS_NESTED_EXIT_REASON_CPUID;
	if (reason == VMEXIT_RDTSC)
		return SYZOS_NESTED_EXIT_REASON_RDTSC;
	if (reason == VMEXIT_RDTSCP)
		return SYZOS_NESTED_EXIT_REASON_RDTSCP;
	return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
   877  
// Advances the L2 guest's RIP (stored in the per-VM VMCB) past the
// intercepted instruction so the next VMRUN does not re-execute it.
// INVD/CPUID/RDTSC are 2-byte opcodes; RDTSCP is 3 bytes; other exit codes
// leave RIP unchanged.
GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, uint64 vm_id)
{
	// Disable optimizations.
	// NOTE(review): the volatile copy presumably keeps the compiler from
	// transforming the comparison chain (e.g. into a table) -- TODO confirm.
	volatile uint64 reason = basic_reason;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint64 rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP);
	if ((reason == VMEXIT_INVD) || (reason == VMEXIT_CPUID) ||
	    (reason == VMEXIT_RDTSC)) {
		rip += 2;
	} else if (reason == VMEXIT_RDTSCP) {
		// We insist on a single-line compound statement for else-if.
		rip += 3;
	}
	vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, rip);
}
   893  
// C-level AMD VM-exit handler: reports the exit to the host via a uexit and
// advances L2's RIP past the intercepted instruction.
__attribute__((used)) GUEST_CODE static void
nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id)
{
	// Keep only the low 16 bits of the exit code for mapping purposes.
	volatile uint64 basic_reason = exit_reason & 0xFFFF;
	syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason);
	guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD);
	advance_l2_rip_amd(basic_reason, cpu_id, vm_id);
}
   902  
// Populates the host-state area of the current VMCS -- the L1 state the CPU
// loads on a VM exit from L2. Most fields mirror L1's live state so that,
// on exit, execution lands in nested_vm_exit_handler_intel_asm with a usable
// environment.
GUEST_CODE static noinline void init_vmcs_host_state(void)
{
	// Segment Selectors.
	vmwrite(VMCS_HOST_CS_SELECTOR, X86_SYZOS_SEL_CODE);
	vmwrite(VMCS_HOST_DS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_ES_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_SS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_FS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_GS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_TR_SELECTOR, X86_SYZOS_SEL_TSS64);

	// Base addresses.
	vmwrite(VMCS_HOST_TR_BASE, 0);
	vmwrite(VMCS_HOST_GDTR_BASE, X86_SYZOS_ADDR_GDT);
	vmwrite(VMCS_HOST_IDTR_BASE, X86_SYZOS_ADDR_VAR_IDT);
	vmwrite(VMCS_HOST_FS_BASE, rdmsr(X86_MSR_FS_BASE));
	vmwrite(VMCS_HOST_GS_BASE, rdmsr(X86_MSR_GS_BASE));

	// RIP and RSP.
	// Capture the current stack pointer: on VM exit the CPU loads RSP from
	// this field, so the exit stub runs on (approximately) this L1 stack.
	uint64 tmpreg = 0; // nolint
	asm volatile("mov %%rsp, %0" : "=r"(tmpreg));
	vmwrite(VMCS_HOST_RSP, tmpreg);
	// On VM exit, execution resumes in the assembly exit stub.
	vmwrite(VMCS_HOST_RIP, (uintptr_t)nested_vm_exit_handler_intel_asm);

	// Control Registers.
	vmwrite(VMCS_HOST_CR0, read_cr0());
	vmwrite(VMCS_HOST_CR3, read_cr3());
	vmwrite(VMCS_HOST_CR4, read_cr4());

	// MSRs.
	vmwrite(VMCS_HOST_IA32_PAT, rdmsr(X86_MSR_IA32_CR_PAT));
	vmwrite(VMCS_HOST_IA32_EFER, rdmsr(X86_MSR_IA32_EFER));
	vmwrite(VMCS_HOST_IA32_PERF_GLOBAL_CTRL, rdmsr(X86_MSR_CORE_PERF_GLOBAL_CTRL));
	vmwrite(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(X86_MSR_IA32_SYSENTER_CS));
	vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(X86_MSR_IA32_SYSENTER_ESP));
	vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(X86_MSR_IA32_SYSENTER_EIP));
}
   940  
   941  #define COPY_VMCS_FIELD(GUEST_FIELD, HOST_FIELD) \
   942  	vmwrite(GUEST_FIELD, vmread(HOST_FIELD))
   943  
   944  #define SETUP_L2_SEGMENT(SEG, SELECTOR, BASE, LIMIT, AR) \
   945  	vmwrite(VMCS_GUEST_##SEG##_SELECTOR, SELECTOR);  \
   946  	vmwrite(VMCS_GUEST_##SEG##_BASE, BASE);          \
   947  	vmwrite(VMCS_GUEST_##SEG##_LIMIT, LIMIT);        \
   948  	vmwrite(VMCS_GUEST_##SEG##_ACCESS_RIGHTS, AR);
   949  
// Populates the guest-state area of the current VMCS: L2 starts in 64-bit
// mode at its per-VM code page, with segment and control registers copied
// from the host-state area (i.e. mirroring L1) and a fresh stack at the top
// of its per-VM stack page.
GUEST_CODE static noinline void init_vmcs_guest_state(uint64 cpu_id, uint64 vm_id)
{
	uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
	uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
	// Segment Registers.
	SETUP_L2_SEGMENT(CS, vmread(VMCS_HOST_CS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_CODE);
	SETUP_L2_SEGMENT(DS, vmread(VMCS_HOST_DS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(ES, vmread(VMCS_HOST_ES_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(SS, vmread(VMCS_HOST_SS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(FS, vmread(VMCS_HOST_FS_SELECTOR), vmread(VMCS_HOST_FS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(GS, vmread(VMCS_HOST_GS_SELECTOR), vmread(VMCS_HOST_GS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);

	// Task and LDT Registers.
	SETUP_L2_SEGMENT(TR, vmread(VMCS_HOST_TR_SELECTOR), vmread(VMCS_HOST_TR_BASE), 0x67, VMX_AR_TSS_BUSY);
	// A null selector with the "unusable" access-rights bit disables LDTR.
	SETUP_L2_SEGMENT(LDTR, 0, 0, 0, VMX_AR_LDTR_UNUSABLE);

	// Control Registers & CPU State.
	vmwrite(VMCS_GUEST_CR0, vmread(VMCS_HOST_CR0));
	vmwrite(VMCS_GUEST_CR3, vmread(VMCS_HOST_CR3));
	vmwrite(VMCS_GUEST_CR4, vmread(VMCS_HOST_CR4));
	vmwrite(VMCS_GUEST_RIP, l2_code_addr);
	// Top of the L2 stack page, minus one 8-byte slot.
	vmwrite(VMCS_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
	// RFLAGS bit 1 is reserved and must always be set.
	vmwrite(VMCS_GUEST_RFLAGS, RFLAGS_1_BIT);
	// TODO: explain the DR7 value (0x400 is the architectural reset value
	// of DR7; bit 10 is reserved-1).
	vmwrite(VMCS_GUEST_DR7, 0x400);

	// MSRs - Copy from host or set to default.
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_EFER, VMCS_HOST_IA32_EFER);
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_PAT, VMCS_HOST_IA32_PAT);
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL, VMCS_HOST_IA32_PERF_GLOBAL_CTRL);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_CS, VMCS_HOST_IA32_SYSENTER_CS);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_ESP, VMCS_HOST_IA32_SYSENTER_ESP);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_EIP, VMCS_HOST_IA32_SYSENTER_EIP);
	vmwrite(VMCS_GUEST_IA32_DEBUGCTL, 0);

	// Descriptor Tables.
	vmwrite(VMCS_GUEST_GDTR_BASE, vmread(VMCS_HOST_GDTR_BASE));
	vmwrite(VMCS_GUEST_GDTR_LIMIT, 0xffff);
	vmwrite(VMCS_GUEST_IDTR_BASE, vmread(VMCS_HOST_IDTR_BASE));
	vmwrite(VMCS_GUEST_IDTR_LIMIT, 0xffff);

	// Miscellaneous Fields.
	// All-ones VMCS link pointer means "no shadow VMCS".
	vmwrite(VMCS_LINK_POINTER, 0xffffffffffffffff);
	// 0 = Active.
	vmwrite(VMCS_GUEST_ACTIVITY_STATE, 0);
	vmwrite(VMCS_GUEST_INTERRUPTIBILITY_INFO, 0);
	vmwrite(VMCS_GUEST_PENDING_DBG_EXCEPTIONS, 0);
	vmwrite(VMCS_VMX_PREEMPTION_TIMER_VALUE, 0);
	vmwrite(VMCS_GUEST_INTR_STATUS, 0);
	vmwrite(VMCS_GUEST_PML_INDEX, 0);
}
  1001  
// Creates an Intel (VMX) L2 VM identified by cmd->arg: writes the VMCS
// revision identifier into the VMCS page, VMCLEARs it, makes it current via
// VMPTRLD, and then fills in the control, host-state and guest-state areas.
GUEST_CODE static noinline void
nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->arg;
	uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint8 error = 0; // nolint

	// The first dword of a VMCS must contain the VMCS revision identifier
	// reported by IA32_VMX_BASIC (value truncated to 32 bits here).
	*(uint32*)vmcs_addr = rdmsr(X86_MSR_IA32_VMX_BASIC);
	// VMCLEAR takes the 64-bit VMCS address as a memory operand;
	// setna records failure (CF or ZF set).
	asm volatile("vmclear %1; setna %0"
		     : "=q"(error)
		     : "m"(vmcs_addr)
		     : "memory", "cc");
	if (error) {
		// Report the VMCLEAR failure to the host and bail out.
		guest_uexit(0xE2BAD1);
		return;
	}
	nested_vmptrld(cpu_id, vm_id);

	setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id);
	init_vmcs_control_fields(cpu_id, vm_id);
	init_vmcs_host_state();
	init_vmcs_guest_state(cpu_id, vm_id);
}
  1025  
  1026  // Helper for setting up a segment in the VMCB
  1027  #define SETUP_L2_SEGMENT_SVM(VMBC_PTR, SEG_NAME, SELECTOR, BASE, LIMIT, ATTR) \
  1028  	vmcb_write16(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_SEL, SELECTOR);        \
  1029  	vmcb_write16(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_ATTR, ATTR);           \
  1030  	vmcb_write32(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_LIM, LIMIT);           \
  1031  	vmcb_write64(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_BASE, BASE);
  1032  
// Fills in the guest-state and control areas of the AMD VMCB for one L2 VM:
// L2 starts in 64-bit mode at its per-VM code page, reusing L1's page tables,
// GDT and IDT, with nested paging (NPT) enabled via a dedicated PML4.
GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_id)
{
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
	uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
	uint64 npt_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
	// Setup Guest Segment Registers.
	// We copy the L1 guest's segment setup, as it's a good 64-bit environment.
	SETUP_L2_SEGMENT_SVM(vmcb_addr, CS, X86_SYZOS_SEL_CODE, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_CODE);
	SETUP_L2_SEGMENT_SVM(vmcb_addr, DS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
	SETUP_L2_SEGMENT_SVM(vmcb_addr, ES, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
	SETUP_L2_SEGMENT_SVM(vmcb_addr, SS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
	SETUP_L2_SEGMENT_SVM(vmcb_addr, FS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
	SETUP_L2_SEGMENT_SVM(vmcb_addr, GS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);

	// Task Register (TR). Must point to a valid, present, 64-bit TSS.
	SETUP_L2_SEGMENT_SVM(vmcb_addr, TR, X86_SYZOS_SEL_TSS64, X86_SYZOS_ADDR_VAR_TSS, 0x67, VMX_AR_TSS_AVAILABLE);

	// LDT Register (LDTR) - Mark as unusable.
	// A null selector and attribute is the correct way to disable LDTR.
	SETUP_L2_SEGMENT_SVM(vmcb_addr, LDTR, 0, 0, 0, SVM_ATTR_LDTR_UNUSABLE);

	// Setup Guest Control Registers & CPU State.
	uint64 efer = rdmsr(X86_MSR_IA32_EFER);
	vmcb_write64(vmcb_addr, VMCB_GUEST_CR0, read_cr0() | X86_CR0_WP);
	// L2 will use L1's page tables.
	vmcb_write64(vmcb_addr, VMCB_GUEST_CR3, read_cr3());
	vmcb_write64(vmcb_addr, VMCB_GUEST_CR4, read_cr4());
	vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, l2_code_addr);
	// Top of the L2 stack page, minus one 8-byte slot.
	vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
	// RFLAGS bit 1 is reserved and must always be set.
	vmcb_write64(vmcb_addr, VMCB_GUEST_RFLAGS, RFLAGS_1_BIT);

	// Setup Guest MSRs.

	// Clear the debug-related state: DEBUGCTL, DR6 and DR7.
	vmcb_write64(vmcb_addr, VMCB_GUEST_DEBUGCTL, 0);
	vmcb_write64(vmcb_addr, VMCB_GUEST_DR6, 0x0);
	vmcb_write64(vmcb_addr, VMCB_GUEST_DR7, 0x0);

	// EFER with SCE cleared: SYSCALL/SYSRET disabled in L2.
	vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, efer & ~X86_EFER_SCE);
	vmcb_write64(vmcb_addr, VMCB_GUEST_PAT, rdmsr(X86_MSR_IA32_CR_PAT));

	// Setup Guest Descriptor Tables.
	// Read L1's live GDTR/IDTR and hand the same tables to L2.
	struct {
		uint16 limit;
		uint64 base;
	} __attribute__((packed)) gdtr, idtr;
	asm volatile("sgdt %0" : "=m"(gdtr));
	asm volatile("sidt %0" : "=m"(idtr));
	vmcb_write64(vmcb_addr, VMCB_GUEST_GDTR_BASE, gdtr.base);
	vmcb_write32(vmcb_addr, VMCB_GUEST_GDTR_LIM, gdtr.limit);
	vmcb_write64(vmcb_addr, VMCB_GUEST_IDTR_BASE, idtr.base);
	vmcb_write32(vmcb_addr, VMCB_GUEST_IDTR_LIM, idtr.limit);

	// Setup VMCB Control Fields.
	vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_VEC3_ALL);
	vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC4, VMCB_CTRL_INTERCEPT_VEC4_ALL);

	// Enable Nested Paging (NPT):
	// Write '1' to the NPT Enable field (0x090).
	vmcb_write64(vmcb_addr, VMCB_CTRL_NP_ENABLE, (1 << VMCB_CTRL_NPT_ENABLE_BIT));

	// Write the NPT root address to N_CR3 (0x098).
	// Unlike Intel's EPTP, AMD's N_CR3 field is *only* the
	// 4K-aligned physical address of the PML4 table.
	// It does not contain any control bits.
	uint64 npt_pointer = (npt_pml4_addr & ~0xFFF);
	vmcb_write64(vmcb_addr, VMCB_CTRL_N_CR3, npt_pointer);

	// Set Guest ASID.
	vmcb_write32(vmcb_addr, VMCB_CTRL_ASID, 1);
}
  1105  
  1106  GUEST_CODE static noinline void
  1107  nested_create_vm_amd(struct api_call_1* cmd, uint64 cpu_id)
  1108  {
  1109  	uint64 vm_id = cmd->arg;
  1110  	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
  1111  
  1112  	guest_memset((void*)vmcb_addr, 0, KVM_PAGE_SIZE);
  1113  	guest_memset((void*)X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id), 0, KVM_PAGE_SIZE);
  1114  
  1115  	// Setup NPT (Nested Page Tables)
  1116  	setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id);
  1117  
  1118  	// Initialize VMCB Control and Guest State
  1119  	init_vmcb_guest_state(cpu_id, vm_id);
  1120  }
  1121  
  1122  GUEST_CODE static noinline void
  1123  guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id)
  1124  {
  1125  	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
  1126  		nested_create_vm_intel(cmd, cpu_id);
  1127  	} else {
  1128  		nested_create_vm_amd(cmd, cpu_id);
  1129  	}
  1130  }
  1131  
  1132  GUEST_CODE static noinline void
  1133  guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id)
  1134  {
  1135  	uint64 vm_id = cmd->vm_id;
  1136  	uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
  1137  	uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
  1138  	// Code size = command size - header size - vm_id size.
  1139  	uint64 l2_code_size = cmd->header.size - sizeof(struct api_call_header) - sizeof(uint64);
  1140  	if (l2_code_size > KVM_PAGE_SIZE)
  1141  		l2_code_size = KVM_PAGE_SIZE;
  1142  	guest_memcpy((void*)l2_code_addr, (void*)cmd->insns,
  1143  		     l2_code_size);
  1144  	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
  1145  		nested_vmptrld(cpu_id, vm_id);
  1146  		vmwrite(VMCS_GUEST_RIP, l2_code_addr);
  1147  		vmwrite(VMCS_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
  1148  	} else {
  1149  		vmcb_write64(X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), VMCB_GUEST_RIP, l2_code_addr);
  1150  		vmcb_write64(X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
  1151  	}
  1152  }
  1153  
// Clang's LTO may ignore noinline and attempt to inline this function into both callers,
// which results in duplicate declaration of after_vmentry_label.
// Applying __optnone should prevent this behavior.
//
// Enters the L2 guest via VMLAUNCH (is_launch == true) or VMRESUME.
// On a successful entry, the L2 guest runs until a VM exit, which lands in
// nested_vm_exit_handler_intel_asm; that stub jumps back to
// after_vmentry_label below with fail_flag == 0.
// On an immediate VMfail (CF or ZF set), the VM-instruction error is read
// from the VMCS and reported to the host via a uexit.
GUEST_CODE static noinline __optnone void
guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch)
{
	uint64 vmx_error_code = 0;
	uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set

	// Make the target VM's VMCS current.
	nested_vmptrld(cpu_id, vm_id);

	if (is_launch) {
		asm volatile(R"(
	// Attempt to launch the L2 guest.
	vmlaunch
	// Set AL to 1 if CF=1 (VMfailValid)
	setc %%al
	// Set BL to 1 if ZF=1 (VMfailInvalid)
	setz %%bl
	or %%bl, %%al)"
			     : "=a"(fail_flag)
			     :
			     : "rbx", "cc", "memory");
	} else {
		asm volatile(R"(
	// Attempt to resume the L2 guest.
	vmresume
	// Set AL to 1 if CF=1 (VMfailValid)
	setc %%al
	// Set BL to 1 if ZF=1 (VMfailInvalid)
	setz %%bl
	or %%bl, %%al)"
			     : "=a"(fail_flag)
			     :
			     : "rbx", "cc", "memory");
	}
	// The VM-exit stub jumps here after handling an L2 exit.
	asm volatile(".globl after_vmentry_label\nafter_vmentry_label:");
	if (fail_flag) {
		// VMLAUNCH/VMRESUME failed, so VMCS is still valid and can be read.
		vmx_error_code = vmread(VMCS_VM_INSTRUCTION_ERROR);
		guest_uexit(0xE2E10000 | (uint32)vmx_error_code);
		return;
	}
	// If we get here, this means VMLAUNCH/VMRESUME truly succeeded (CF=0 and ZF=0)
	// and the L2 guest has run and exited.
}
  1200  
// Runs the AMD L2 guest via VMRUN. Unlike the Intel flow, control returns
// here inline after the L2 VM exit, so the exit is handled synchronously by
// reading EXITCODE from the VMCB.
GUEST_CODE static noinline void
guest_run_amd_vm(uint64 cpu_id, uint64 vm_id)
{
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	volatile uint8* vmcb_ptr = (volatile uint8*)vmcb_addr;
	uint8 fail_flag = 0;

	// VMRUN takes the VMCB address in RAX; CF is set on failure.
	asm volatile(
	    "mov %1, %%rax\n\t" // Load VMCB physical address into RAX
	    "vmrun\n\t" // Launch or resume L2 guest
	    "setc %0\n\t"
	    : "=q"(fail_flag)
	    : "m"(vmcb_addr)
	    : "rax", "cc", "memory");

	if (fail_flag) {
		// VMRUN failed. Report a generic error code: AMD provides no
		// VMX-style instruction-error number to include.
		guest_uexit(0xE2E10000 | 0xFFFF);
		return;
	}

	// VMRUN succeeded and we have a VM-exit.
	uint64 exit_reason = vmcb_read64(vmcb_ptr, VMCB_EXIT_CODE);
	nested_vm_exit_handler_amd(exit_reason, cpu_id, vm_id);
}
  1226  
  1227  GUEST_CODE static noinline void
  1228  guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id)
  1229  {
  1230  	uint64 vm_id = cmd->arg;
  1231  	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
  1232  		guest_handle_nested_vmentry_intel(vm_id, cpu_id, true);
  1233  	} else {
  1234  		guest_run_amd_vm(cpu_id, vm_id);
  1235  	}
  1236  }
  1237  
  1238  GUEST_CODE static noinline void
  1239  guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id)
  1240  {
  1241  	uint64 vm_id = cmd->arg;
  1242  	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
  1243  		guest_handle_nested_vmentry_intel(vm_id, cpu_id, false);
  1244  	} else {
  1245  		guest_run_amd_vm(cpu_id, vm_id);
  1246  	}
  1247  }
  1248  
  1249  GUEST_CODE static noinline void
  1250  guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id)
  1251  {
  1252  	if (get_cpu_vendor() != CPU_VENDOR_INTEL)
  1253  		return;
  1254  	uint64 vm_id = cmd->args[0];
  1255  	nested_vmptrld(cpu_id, vm_id);
  1256  	uint64 field = cmd->args[1];
  1257  	uint64 set_mask = cmd->args[2];
  1258  	uint64 unset_mask = cmd->args[3];
  1259  	uint64 flip_mask = cmd->args[4];
  1260  
  1261  	uint64 current_value = vmread(field);
  1262  	uint64 new_value = (current_value & ~unset_mask) | set_mask;
  1263  	new_value ^= flip_mask;
  1264  	vmwrite(field, new_value);
  1265  }
  1266  
  1267  GUEST_CODE static noinline void
  1268  guest_handle_nested_amd_vmcb_write_mask(struct api_call_5* cmd, uint64 cpu_id)
  1269  {
  1270  	if (get_cpu_vendor() != CPU_VENDOR_AMD)
  1271  		return;
  1272  	uint64 vm_id = cmd->args[0];
  1273  	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
  1274  	uint64 offset = cmd->args[1];
  1275  	uint64 set_mask = cmd->args[2];
  1276  	uint64 unset_mask = cmd->args[3];
  1277  	uint64 flip_mask = cmd->args[4];
  1278  
  1279  	uint64 current_value = vmcb_read64((volatile uint8*)vmcb_addr, offset);
  1280  	uint64 new_value = (current_value & ~unset_mask) | set_mask;
  1281  	new_value ^= flip_mask;
  1282  	vmcb_write64(vmcb_addr, offset, new_value);
  1283  }
  1284  
  1285  #endif // EXECUTOR_COMMON_KVM_AMD64_SYZOS_H