// Copyright 2025 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

#ifndef EXECUTOR_COMMON_KVM_AMD64_SYZOS_H
#define EXECUTOR_COMMON_KVM_AMD64_SYZOS_H

// This file provides guest code running inside the AMD64 KVM.

#include "common_kvm_syzos.h"
#include "kvm.h"
#include <linux/kvm.h>
#include <stdbool.h>

// IDs of the pseudo-calls that the host serializes into guest memory and
// guest_main() below decodes and dispatches.
// There are no particular rules to assign numbers here, but changing them will
// result in losing some existing reproducers. Therefore, we try to leave spaces
// between unrelated IDs.
// Remember these constants must match those in sys/linux/dev_kvm_amd64.txt.
typedef enum {
	SYZOS_API_UEXIT = 0,
	SYZOS_API_CODE = 10,
	SYZOS_API_CPUID = 100,
	SYZOS_API_WRMSR = 101,
	SYZOS_API_RDMSR = 102,
	SYZOS_API_WR_CRN = 103,
	SYZOS_API_WR_DRN = 104,
	SYZOS_API_IN_DX = 105,
	SYZOS_API_OUT_DX = 106,
	SYZOS_API_SET_IRQ_HANDLER = 200,
	SYZOS_API_ENABLE_NESTED = 300,
	SYZOS_API_NESTED_CREATE_VM = 301,
	SYZOS_API_NESTED_LOAD_CODE = 302,
	SYZOS_API_NESTED_VMLAUNCH = 303,
	SYZOS_API_NESTED_VMRESUME = 304,
	SYZOS_API_NESTED_INTEL_VMWRITE_MASK = 340,
	SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK = 380,
	SYZOS_API_STOP, // Must be the last one
} syzos_api_id;

// Common header of every serialized API call: the call ID and the total
// record size in bytes (including this header).
struct api_call_header {
	uint64 call;
	uint64 size;
};

struct api_call_uexit {
	struct api_call_header header;
	uint64 exit_code;
};

// A raw instruction blob to execute in the L1 guest.
struct api_call_code {
	struct api_call_header header;
	uint8 insns[];
};

// A raw instruction blob to copy into the nested (L2) VM identified by vm_id.
struct api_call_nested_load_code {
	struct api_call_header header;
	uint64 vm_id;
	uint8 insns[];
};

struct api_call_cpuid {
	struct api_call_header header;
	uint32 eax;
	uint32 ecx;
};

// Generic call payloads carrying 1, 2, 3 or 5 uint64 arguments.
struct api_call_1 {
	struct api_call_header header;
	uint64 arg;
};

struct api_call_2 {
	struct api_call_header header;
	uint64 args[2];
};

struct api_call_3 {
	struct api_call_header header;
	uint64 args[3];
};

struct api_call_5 {
	struct api_call_header header;
	uint64 args[5];
};

// This struct must match the push/pop order in nested_vm_exit_handler_intel_asm().
struct l2_guest_regs {
	uint64 rax, rbx, rcx, rdx, rsi, rdi, rbp;
	uint64 r8, r9, r10, r11, r12, r13, r14, r15;
};

// These two are referenced from inline assembly, so they need unmangled
// (C linkage) names when compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif
GUEST_CODE static void guest_uexit(uint64 exit_code);
GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs);
#ifdef __cplusplus
}
#endif
GUEST_CODE static void guest_execute_code(uint8* insns, uint64 size);
GUEST_CODE static void guest_handle_cpuid(uint32 eax, uint32 ecx);
GUEST_CODE static void guest_handle_wrmsr(uint64 reg, uint64 val);
GUEST_CODE static void guest_handle_rdmsr(uint64 reg);
GUEST_CODE static void guest_handle_wr_crn(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_wr_drn(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_in_dx(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_out_dx(struct api_call_3* cmd);
GUEST_CODE static void guest_handle_set_irq_handler(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id);
GUEST_CODE static void
guest_handle_nested_amd_vmcb_write_mask(struct api_call_5* cmd, uint64 cpu_id);

// Special values written to the uexit MMIO address (see guest_uexit()).
typedef enum {
	UEXIT_END = (uint64)-1,
	UEXIT_IRQ = (uint64)-2,
	UEXIT_ASSERT = (uint64)-3,
} uexit_code;

typedef enum {
	CPU_VENDOR_INTEL,
	CPU_VENDOR_AMD,
} cpu_vendor_id;

// IRQ handler that immediately returns from the interrupt.
__attribute__((naked))
GUEST_CODE static void
dummy_null_handler()
{
	asm("iretq");
}

// IRQ handler that reports UEXIT_IRQ to the host before returning.
__attribute__((naked)) GUEST_CODE static void uexit_irq_handler()
{
	asm volatile(R"(
		// Call guest_uexit(UEXIT_IRQ).
		movq $-2, %rdi
		call guest_uexit

		iretq
	)");
}

// Main guest function that performs necessary setup and passes the control to the user-provided
// payload.
// The inner loop uses a complex if-statement, because Clang is eager to insert a jump table into
// a switch statement.

// TODO(glider): executor/style_test.go insists that single-line compound statements should not
// be used e.g. in the following case:
//   if (call == SYZOS_API_UEXIT) {
//     struct api_call_uexit* ucmd = (struct api_call_uexit*)cmd;
//     guest_uexit(ucmd->exit_code);
//   } else if (call == SYZOS_API_WR_CRN) {
//     guest_handle_wr_crn((struct api_call_2*)cmd); // Style check fails here
//   }
// , i.e. when the braces are consistent with the rest of the code, even despite this violates the
// Google C++ style guide.
// We add single-line comments to justify having the compound statements below.
__attribute__((used))
GUEST_CODE static void
guest_main(uint64 size, uint64 cpu)
{
	// Each vCPU decodes commands from its own page of serialized calls.
	uint64 addr = X86_SYZOS_ADDR_USER_CODE + cpu * KVM_PAGE_SIZE;

	while (size >= sizeof(struct api_call_header)) {
		struct api_call_header* cmd = (struct api_call_header*)addr;
		if (cmd->call >= SYZOS_API_STOP)
			return;
		if (cmd->size > size)
			return;
		// volatile prevents Clang from lowering the chain below to a jump table.
		volatile uint64 call = cmd->call;
		if (call == SYZOS_API_UEXIT) {
			// Issue a user exit.
			struct api_call_uexit* ucmd = (struct api_call_uexit*)cmd;
			guest_uexit(ucmd->exit_code);
		} else if (call == SYZOS_API_CODE) {
			// Execute an instruction blob.
			struct api_call_code* ccmd = (struct api_call_code*)cmd;
			guest_execute_code(ccmd->insns, cmd->size - sizeof(struct api_call_header));
		} else if (call == SYZOS_API_CPUID) {
			// Issue CPUID.
			struct api_call_cpuid* ccmd = (struct api_call_cpuid*)cmd;
			guest_handle_cpuid(ccmd->eax, ccmd->ecx);
		} else if (call == SYZOS_API_WRMSR) {
			// Write an MSR register.
			struct api_call_2* ccmd = (struct api_call_2*)cmd;
			guest_handle_wrmsr(ccmd->args[0], ccmd->args[1]);
		} else if (call == SYZOS_API_RDMSR) {
			// Read an MSR register.
			struct api_call_1* ccmd = (struct api_call_1*)cmd;
			guest_handle_rdmsr(ccmd->arg);
		} else if (call == SYZOS_API_WR_CRN) {
			// Write value to a control register.
			guest_handle_wr_crn((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_WR_DRN) {
			// Write value to a debug register.
			guest_handle_wr_drn((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_IN_DX) {
			// Read data from an I/O port.
			guest_handle_in_dx((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_OUT_DX) {
			// Write data to an I/O port.
			guest_handle_out_dx((struct api_call_3*)cmd);
		} else if (call == SYZOS_API_SET_IRQ_HANDLER) {
			// Set the handler for a particular IRQ.
			guest_handle_set_irq_handler((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_ENABLE_NESTED) {
			// Enable nested virtualization.
			guest_handle_enable_nested((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_CREATE_VM) {
			// Create a nested VM.
			guest_handle_nested_create_vm((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_LOAD_CODE) {
			// Load code into the nested VM.
			guest_handle_nested_load_code((struct api_call_nested_load_code*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_VMLAUNCH) {
			// Launch the nested VM.
			guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_VMRESUME) {
			// Resume a nested VM.
			guest_handle_nested_vmresume((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_INTEL_VMWRITE_MASK) {
			// Write to a VMCS field using masks.
			guest_handle_nested_intel_vmwrite_mask((struct api_call_5*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK) {
			// Write to a VMCB field using masks.
			guest_handle_nested_amd_vmcb_write_mask((struct api_call_5*)cmd, cpu);
		}
		// Advance to the next serialized call.
		addr += cmd->size;
		size -= cmd->size;
	};
	guest_uexit((uint64)-1);
}

// Jump into a user-provided instruction blob (the blob itself decides how/if
// it returns).
GUEST_CODE static noinline void guest_execute_code(uint8* insns, uint64 size)
{
	volatile void (*fn)() = (volatile void (*)())insns;
	fn();
}

// Perform a userspace exit that can be handled by the host.
// The host returns from ioctl(KVM_RUN) with kvm_run.exit_reason=KVM_EXIT_MMIO,
// and can handle the call depending on the data passed as exit code.

// Make sure the compiler does not optimize this function away, it is called from
// assembly.
__attribute__((used))
GUEST_CODE static noinline void
guest_uexit(uint64 exit_code)
{
	// Writing to this unmapped address triggers KVM_EXIT_MMIO on the host.
	volatile uint64* ptr = (volatile uint64*)X86_SYZOS_ADDR_UEXIT;
	*ptr = exit_code;
}

// Execute CPUID with the given EAX/ECX leaf; outputs are discarded.
GUEST_CODE static noinline void guest_handle_cpuid(uint32 eax, uint32 ecx)
{
	asm volatile(
	    "cpuid\n"
	    : // Currently ignore outputs
	    : "a"(eax), "c"(ecx)
	    : "rbx", "rdx");
}

// Write the 64-bit val to MSR reg (WRMSR takes EDX:EAX and ECX).
GUEST_CODE static noinline void wrmsr(uint64 reg, uint64 val)
{
	asm volatile(
	    "wrmsr"
	    :
	    : "c"(reg),
	      "a"((uint32)val),
	      "d"((uint32)(val >> 32))
	    : "memory");
}

// Write val into an MSR register reg.
GUEST_CODE static noinline void guest_handle_wrmsr(uint64 reg, uint64 val)
{
	wrmsr(reg, val);
}

// Read and return the 64-bit value of MSR msr_id.
GUEST_CODE static noinline uint64 rdmsr(uint64 msr_id)
{
	uint32 low = 0, high = 0; // nolint
	// The RDMSR instruction takes the MSR address in ecx.
	// It puts the lower 32 bits of the MSR value into eax, and the upper.
	// 32 bits of the MSR value into edx.
	asm volatile("rdmsr" : "=a"(low), "=d"(high) : "c"(msr_id));
	return ((uint64)high << 32) | low;
}

// Read an MSR register, ignore the result.
GUEST_CODE static noinline void guest_handle_rdmsr(uint64 reg)
{
	(void)rdmsr(reg);
}

// Write to CRn control register.
// Unknown register numbers are silently ignored; each mov needs a distinct
// asm statement because the CR number is encoded in the instruction.
GUEST_CODE static noinline void guest_handle_wr_crn(struct api_call_2* cmd)
{
	uint64 value = cmd->args[1];
	// Prevent the compiler from generating a switch table.
	volatile uint64 reg = cmd->args[0];
	if (reg == 0) {
		// Move value to CR0.
		asm volatile("movq %0, %%cr0" ::"r"(value) : "memory");
		return;
	}
	if (reg == 2) {
		// Move value to CR2.
		asm volatile("movq %0, %%cr2" ::"r"(value) : "memory");
		return;
	}
	if (reg == 3) {
		// Move value to CR3.
		asm volatile("movq %0, %%cr3" ::"r"(value) : "memory");
		return;
	}
	if (reg == 4) {
		// Move value to CR4.
		asm volatile("movq %0, %%cr4" ::"r"(value) : "memory");
		return;
	}
	if (reg == 8) {
		// Move value to CR8 (TPR - Task Priority Register).
		asm volatile("movq %0, %%cr8" ::"r"(value) : "memory");
		return;
	}
}

// Write to DRn debug register.
// Unknown register numbers are silently ignored. Like the CRn variant, each
// mov is a separate asm statement (the DR number is part of the encoding),
// and the volatile reg keeps Clang from building a jump table.
GUEST_CODE static noinline void guest_handle_wr_drn(struct api_call_2* cmd)
{
	uint64 value = cmd->args[1];
	volatile uint64 reg = cmd->args[0];
	if (reg == 0) {
		asm volatile("movq %0, %%dr0" ::"r"(value) : "memory");
		return;
	}
	if (reg == 1) {
		asm volatile("movq %0, %%dr1" ::"r"(value) : "memory");
		return;
	}
	if (reg == 2) {
		asm volatile("movq %0, %%dr2" ::"r"(value) : "memory");
		return;
	}
	if (reg == 3) {
		asm volatile("movq %0, %%dr3" ::"r"(value) : "memory");
		return;
	}
	if (reg == 4) {
		asm volatile("movq %0, %%dr4" ::"r"(value) : "memory");
		return;
	}
	if (reg == 5) {
		asm volatile("movq %0, %%dr5" ::"r"(value) : "memory");
		return;
	}
	if (reg == 6) {
		asm volatile("movq %0, %%dr6" ::"r"(value) : "memory");
		return;
	}
	if (reg == 7) {
		asm volatile("movq %0, %%dr7" ::"r"(value) : "memory");
		return;
	}
}

// Read data from an I/O port, should result in KVM_EXIT_IO.
// args[0] is the port, args[1] the access size (1, 2 or 4 bytes); other sizes
// are ignored. The read value is discarded.
GUEST_CODE static noinline void guest_handle_in_dx(struct api_call_2* cmd)
{
	uint16 port = cmd->args[0];
	volatile int size = cmd->args[1];

	if (size == 1) {
		uint8 unused;
		// Reads 1 byte from the port in DX into AL.
		asm volatile("inb %1, %0" : "=a"(unused) : "d"(port));
		return;
	}
	if (size == 2) {
		uint16 unused;
		// Reads 2 bytes from the port in DX into AX.
		asm volatile("inw %1, %0" : "=a"(unused) : "d"(port));
		return;
	}
	if (size == 4) {
		uint32 unused;
		// Reads 4 bytes from the port in DX into EAX.
		asm volatile("inl %1, %0" : "=a"(unused) : "d"(port));
	}
	return;
}

// Write data to an I/O port, should result in KVM_EXIT_IO.
// args[0] is the port, args[1] the access size (1, 2 or 4 bytes), args[2] the
// data; other sizes are ignored.
GUEST_CODE static noinline void guest_handle_out_dx(struct api_call_3* cmd)
{
	uint16 port = cmd->args[0];
	volatile int size = cmd->args[1];
	uint32 data = (uint32)cmd->args[2];

	if (size == 1) {
		// Writes 1 byte from AL to the port in DX.
		asm volatile("outb %b0, %w1" ::"a"(data), "d"(port));
		return;
	}
	if (size == 2) {
		// Writes 2 bytes from AX to the port in DX.
		asm volatile("outw %w0, %w1" ::"a"(data), "d"(port));
		return;
	}
	if (size == 4) {
		// Writes 4 bytes from EAX to the port in DX.
		asm volatile("outl %k0, %w1" ::"a"(data), "d"(port));
		return;
	}
}

// A 64-bit IDT gate descriptor.
// See https://wiki.osdev.org/Interrupt_Descriptor_Table#Gate_Descriptor_2.
struct idt_entry_64 {
	uint16 offset_low;
	uint16 selector;
	// Interrupt Stack Table offset in bits 0..2
	uint8 ist;
	// Gate Type, P and DPL.
	uint8 type_attr;
	uint16 offset_mid;
	uint32 offset_high;
	uint32 reserved;
} __attribute__((packed));

// IDT gate setup should be similar to syzos_setup_idt() in the host code.
// Install an interrupt gate for the given vector pointing at handler.
GUEST_CODE static void set_idt_gate(uint8 vector, uint64 handler)
{
	volatile struct idt_entry_64* idt =
	    (volatile struct idt_entry_64*)(X86_SYZOS_ADDR_VAR_IDT);
	volatile struct idt_entry_64* idt_entry = &idt[vector];
	idt_entry->offset_low = (uint16)handler;
	idt_entry->offset_mid = (uint16)(handler >> 16);
	idt_entry->offset_high = (uint32)(handler >> 32);
	idt_entry->selector = X86_SYZOS_SEL_CODE;
	// 0x8E: present, DPL=0, 64-bit interrupt gate.
	idt_entry->type_attr = 0x8E;
	idt_entry->ist = 0;
	idt_entry->reserved = 0;
}

// Point the IDT entry for args[0] at one of the predefined handlers:
// type 1 - dummy_null_handler, type 2 - uexit_irq_handler,
// anything else - a null handler address.
GUEST_CODE static noinline void guest_handle_set_irq_handler(struct api_call_2* cmd)
{
	uint8 vector = (uint8)cmd->args[0];
	uint64 type = cmd->args[1];
	volatile uint64 handler_addr = 0;
	if (type == 1)
		handler_addr = executor_fn_guest_addr(dummy_null_handler);
	else if (type == 2)
		handler_addr = executor_fn_guest_addr(uexit_irq_handler);
	set_idt_gate(vector, handler_addr);
}

// Determine the CPU vendor from the CPUID(0) EBX signature.
GUEST_CODE static cpu_vendor_id get_cpu_vendor(void)
{
	uint32 ebx, eax = 0;

	asm volatile(
	    "cpuid"
	    : "+a"(eax), "=b"(ebx)
	    : // No explicit inputs, EAX is handled by +a.
	    : "ecx", "edx");

	if (ebx == 0x756e6547) { // "Genu[ineIntel]".
		return CPU_VENDOR_INTEL;
	} else if (ebx == 0x68747541) { // "Auth[enticAMD]".
		return CPU_VENDOR_AMD;
	} else {
		// Should not happen on AMD64, but for completeness.
		guest_uexit(UEXIT_ASSERT);
		return CPU_VENDOR_INTEL; // Default to Intel if unknown.
	}
}

GUEST_CODE static inline uint64 read_cr0(void)
{
	uint64 val;
	asm volatile("mov %%cr0, %0" : "=r"(val));
	return val;
}

GUEST_CODE static inline uint64 read_cr3(void)
{
	uint64 val;
	asm volatile("mov %%cr3, %0" : "=r"(val));
	return val;
}

GUEST_CODE static inline uint64 read_cr4(void)
{
	uint64 val;
	asm volatile("mov %%cr4, %0" : "=r"(val));
	return val;
}

GUEST_CODE static inline void write_cr4(uint64 val)
{
	asm volatile("mov %0, %%cr4" : : "r"(val));
}

// Write value into the VMCS field; reports UEXIT_ASSERT on VMfail.
GUEST_CODE static noinline void vmwrite(uint64 field, uint64 value)
{
	uint8 error = 0; // nolint
	// 'setna' sets the byte to 1 if CF=1 or ZF=1 (VMfail)
	asm volatile("vmwrite %%rax, %%rbx; setna %0"
		     : "=q"(error)
		     : "a"(value), "b"(field)
		     : "cc", "memory");
	if (error)
		guest_uexit(UEXIT_ASSERT);
}

// Read a VMCS field; VMfail is not checked here.
GUEST_CODE static noinline uint64 vmread(uint64 field)
{
	uint64 value;
	asm volatile("vmread %%rbx, %%rax"
		     : "=a"(value)
		     : "b"(field)
		     : "cc");
	return value;
}

// Make the per-(cpu, vm) VMCS current via VMPTRLD; reports 0xE2BAD2 on VMfail.
GUEST_CODE static inline void nested_vmptrld(uint64 cpu_id, uint64 vm_id)
{
	uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint8 error = 0; // nolint
	asm volatile("vmptrld %1; setna %0"
		     : "=q"(error)
		     : "m"(vmcs_addr)
		     : "memory", "cc");
	if (error)
		guest_uexit(0xE2BAD2);
}

// Raw accessors for VMCB fields at byte offsets (AMD SVM).
GUEST_CODE static noinline void vmcb_write16(uint64 vmcb, uint16 offset, uint16 val)
{
	*((volatile uint16*)(vmcb + offset)) = val;
}

GUEST_CODE static noinline void vmcb_write32(uint64 vmcb, uint16 offset, uint32 val)
{
	*((volatile uint32*)(vmcb + offset)) = val;
}

GUEST_CODE static noinline void vmcb_write64(uint64 vmcb, uint16 offset, uint64 val)
{
	*((volatile uint64*)(vmcb + offset)) = val;
}

GUEST_CODE static noinline uint64 vmcb_read64(volatile uint8* vmcb, uint16 offset)
{ 560 return *((volatile uint64*)(vmcb + offset)); 561 } 562 563 GUEST_CODE static void guest_memset(void* s, uint8 c, int size) 564 { 565 volatile uint8* p = (volatile uint8*)s; 566 for (int i = 0; i < size; i++) 567 p[i] = c; 568 } 569 570 GUEST_CODE static void guest_memcpy(void* dst, void* src, int size) 571 { 572 volatile uint8* d = (volatile uint8*)dst; 573 volatile uint8* s = (volatile uint8*)src; 574 for (int i = 0; i < size; i++) 575 d[i] = s[i]; 576 } 577 578 GUEST_CODE static noinline void 579 nested_enable_vmx_intel(uint64 cpu_id) 580 { 581 uint64 vmxon_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id); 582 uint64 cr4 = read_cr4(); 583 cr4 |= X86_CR4_VMXE; 584 write_cr4(cr4); 585 586 uint64 feature_control = rdmsr(X86_MSR_IA32_FEATURE_CONTROL); 587 // Check if Lock bit (bit 0) is clear. 588 if ((feature_control & 1) == 0) { 589 // If unlocked, set Lock bit (bit 0) and Enable VMX outside SMX bit (bit 2). 590 feature_control |= 0b101; 591 asm volatile("wrmsr" : : "d"(0x0), "c"(X86_MSR_IA32_FEATURE_CONTROL), "A"(feature_control)); 592 } 593 594 // Store revision ID at the beginning of VMXON. 595 *(uint32*)vmxon_addr = rdmsr(X86_MSR_IA32_VMX_BASIC); 596 uint8 error; 597 // Can't use enter_vmx_operation() yet, because VMCS is not valid. 598 asm volatile("vmxon %1; setna %0" 599 : "=q"(error) 600 : "m"(vmxon_addr) 601 : "memory", "cc"); 602 if (error) { 603 guest_uexit(0xE2BAD0); 604 return; 605 } 606 } 607 608 GUEST_CODE static noinline void 609 nested_enable_svm_amd(uint64 cpu_id) 610 { 611 // Get the Host Save Area (HSAVE) physical address for this CPU. 612 // The HSAVE area stores the host processor's state on VMRUN and is restored on VMEXIT. 613 uint64 hsave_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id); 614 615 // Set the SVM Enable (SVME) bit in EFER. This enables SVM operations. 
616 uint64 efer = rdmsr(X86_MSR_IA32_EFER); 617 efer |= X86_EFER_SVME; 618 wrmsr(X86_MSR_IA32_EFER, efer); 619 620 // Write the physical address of the HSAVE area to the VM_HSAVE_PA MSR. 621 // This MSR tells the CPU where to save/restore host state during VMRUN/VMEXIT. 622 wrmsr(X86_MSR_VM_HSAVE_PA, hsave_addr); 623 } 624 625 GUEST_CODE static noinline void 626 guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id) 627 { 628 if (get_cpu_vendor() == CPU_VENDOR_INTEL) { 629 nested_enable_vmx_intel(cpu_id); 630 } else { 631 nested_enable_svm_amd(cpu_id); 632 } 633 } 634 635 GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id) 636 { 637 uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); 638 uint64 l2_pdpt_addr = l2_pml4_addr + KVM_PAGE_SIZE; 639 uint64 l2_pd_addr = l2_pml4_addr + 2 * KVM_PAGE_SIZE; 640 uint64 l2_pt_addr = l2_pml4_addr + 3 * KVM_PAGE_SIZE; 641 642 volatile uint64* pml4 = (volatile uint64*)l2_pml4_addr; 643 volatile uint64* pdpt = (volatile uint64*)l2_pdpt_addr; 644 volatile uint64* pd = (volatile uint64*)l2_pd_addr; 645 volatile uint64* pt = (volatile uint64*)l2_pt_addr; 646 647 guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE); 648 guest_memset((void*)l2_pdpt_addr, 0, KVM_PAGE_SIZE); 649 guest_memset((void*)l2_pd_addr, 0, KVM_PAGE_SIZE); 650 guest_memset((void*)l2_pt_addr, 0, KVM_PAGE_SIZE); 651 guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE); 652 653 // Intel EPT: set Read, Write, Execute. 654 // AMD NPT: set Present, Write, User. 
	uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER;
	// Create the 4-level page table entries using 4KB pages:
	// PML4[0] -> points to PDPT
	pml4[0] = l2_pdpt_addr | flags;
	// PDPT[0] -> points to Page Directory (PD)
	pdpt[0] = l2_pd_addr | flags;
	// PD[0] -> points to Page Table (PT) (NO X86_PDE64_PS)
	pd[0] = l2_pt_addr | flags;
	// PT[0..511] -> maps 512 4KB pages (2MB total) identity
	uint64 pt_flags = flags;
	if (vendor == CPU_VENDOR_INTEL) {
		pt_flags |= EPT_MEMTYPE_WB | EPT_ACCESSED | EPT_DIRTY;
	} else {
		pt_flags |= X86_PDE64_ACCESSED | X86_PDE64_DIRTY;
	}
	for (int i = 0; i < 512; i++)
		pt[i] = (i * KVM_PAGE_SIZE) | pt_flags;
}

// Program the VM-execution, VM-exit and VM-entry control fields of the
// current VMCS for the L2 guest of (cpu_id, vm_id).
GUEST_CODE static noinline void init_vmcs_control_fields(uint64 cpu_id, uint64 vm_id)
{
	// Read and write Pin-Based controls from TRUE MSR.
	uint64 vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS);
	vmwrite(VMCS_PIN_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);

	// Setup Secondary Processor-Based controls: enable EPT.
	vmx_msr = (uint32)rdmsr(X86_MSR_IA32_VMX_PROCBASED_CTLS2);
	vmx_msr |= SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_ENABLE_RDTSCP;
	vmwrite(VMCS_SECONDARY_VM_EXEC_CONTROL, vmx_msr);

	// Read and write Primary Processor-Based controls from TRUE MSR.
	// We also add the bit to enable the secondary controls.
	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
	vmx_msr |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	// Exit on HLT and RDTSC.
	vmx_msr |= CPU_BASED_HLT_EXITING | CPU_BASED_RDTSC_EXITING;
	vmwrite(VMCS_CPU_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);

	// Set up VM-Exit controls via TRUE MSR: indicate a 64-bit host.
	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS);
	vmwrite(VMCS_VM_EXIT_CONTROLS, (uint32)vmx_msr | VM_EXIT_HOST_ADDR_SPACE_SIZE);
	// Read and write VM-Entry controls from TRUE MSR
	// We add the bit to indicate a 64-bit guest.
	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS);
	vmwrite(VMCS_VM_ENTRY_CONTROLS, (uint32)vmx_msr | VM_ENTRY_IA32E_MODE);

	// Set up the EPT Pointer.
	// We use the L2 PML4 address we calculate in guest_handle_create_nested_vm.
	// The EPT Pointer has:
	// - Memory Type = 6 (Write-Back)
	// - Page-Walk Length = 3 (meaning 4 levels: PML4, PDPT, PD, PT)
	// - Address of the PML4 table
	uint64 eptp = (X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id) & ~0xFFF) | (6 << 0) | (3 << 3);
	vmwrite(VMCS_EPT_POINTER, eptp);

	// Set CR0/CR4 masks and shadows.
	// This simple setup (masks=0) means any guest CR0/CR4 write is allowed
	// and won't cause a VM-Exit.
	vmwrite(VMCS_CR0_GUEST_HOST_MASK, 0);
	vmwrite(VMCS_CR4_GUEST_HOST_MASK, 0);
	vmwrite(VMCS_CR0_READ_SHADOW, read_cr0());
	vmwrite(VMCS_CR4_READ_SHADOW, read_cr4());

	// Disable the bitmaps which we do not use.
	vmwrite(VMCS_MSR_BITMAP, 0);
	vmwrite(VMCS_VMREAD_BITMAP, 0);
	vmwrite(VMCS_VMWRITE_BITMAP, 0);

	// Intercept #UD (Invalid Opcode)
	vmwrite(VMCS_EXCEPTION_BITMAP, (1 << 6));

	// Clear unused/unsupported fields.
	// TODO(glider): do we need these?
	vmwrite(VMCS_VIRTUAL_PROCESSOR_ID, 0);
	vmwrite(VMCS_POSTED_INTR_NV, 0);
	vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MATCH, -1);
	vmwrite(VMCS_CR3_TARGET_COUNT, 0);
	vmwrite(VMCS_VM_EXIT_MSR_STORE_COUNT, 0);
	vmwrite(VMCS_VM_EXIT_MSR_LOAD_COUNT, 0);
	vmwrite(VMCS_VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmwrite(VMCS_VM_ENTRY_INTR_INFO_FIELD, 0);
	vmwrite(VMCS_TPR_THRESHOLD, 0);
}

// Common L2 exit reasons for Intel and AMD.
typedef enum {
	SYZOS_NESTED_EXIT_REASON_HLT = 1,
	SYZOS_NESTED_EXIT_REASON_INVD = 2,
	SYZOS_NESTED_EXIT_REASON_CPUID = 3,
	SYZOS_NESTED_EXIT_REASON_RDTSC = 4,
	SYZOS_NESTED_EXIT_REASON_RDTSCP = 5,
	SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF,
} syz_nested_exit_reason;

// Report an L2 VM exit to the host: 0xe2e2xxxx for reasons common to both
// vendors, otherwise the raw reason tagged 0xe211xxxx (Intel) or
// 0xe2aaxxxx (AMD).
GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason mapped_reason,
				      cpu_vendor_id vendor)
{
	if (mapped_reason != SYZOS_NESTED_EXIT_REASON_UNKNOWN) {
		guest_uexit(0xe2e20000 | mapped_reason);
	} else if (vendor == CPU_VENDOR_INTEL) {
		guest_uexit(0xe2110000 | exit_reason);
	} else {
		guest_uexit(0xe2aa0000 | exit_reason);
	}
}

// Intel basic VM-exit reason codes.
#define EXIT_REASON_CPUID 0xa
#define EXIT_REASON_HLT 0xc
#define EXIT_REASON_INVD 0xd
#define EXIT_REASON_RDTSC 0x10
#define EXIT_REASON_RDTSCP 0x33

// Map an Intel basic exit reason onto the vendor-neutral enum above.
GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reason)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	if (reason == EXIT_REASON_HLT)
		return SYZOS_NESTED_EXIT_REASON_HLT;
	if (reason == EXIT_REASON_INVD)
		return SYZOS_NESTED_EXIT_REASON_INVD;
	if (reason == EXIT_REASON_CPUID)
		return SYZOS_NESTED_EXIT_REASON_CPUID;
	if (reason == EXIT_REASON_RDTSC)
		return SYZOS_NESTED_EXIT_REASON_RDTSC;
	if (reason == EXIT_REASON_RDTSCP)
		return SYZOS_NESTED_EXIT_REASON_RDTSCP;
	return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}

// Advance the L2 guest RIP past the instruction that caused the exit, so a
// subsequent VMRESUME does not re-execute it.
GUEST_CODE static void advance_l2_rip_intel(uint64 basic_reason)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	uint64 rip = vmread(VMCS_GUEST_RIP);
	if ((reason == EXIT_REASON_INVD) || (reason == EXIT_REASON_CPUID) ||
	    (reason == EXIT_REASON_RDTSC)) {
		rip += 2;
	} else if (reason == EXIT_REASON_RDTSCP) {
		// We insist on a single-line compound statement for else-if.
		rip += 3;
	}
	vmwrite(VMCS_GUEST_RIP, rip);
}

// This function is called from inline assembly.
__attribute__((used))
GUEST_CODE static void
nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs)
{
	uint64 basic_reason = exit_reason & 0xFFFF;
	syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason);
	guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL);
	advance_l2_rip_intel(basic_reason);
}

extern char after_vmentry_label;
// VM-exit entry point (VMCS_HOST_RIP): saves L2 GPRs, calls the C handler,
// then jumps back to the L1 flow at after_vmentry_label.
__attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(void)
{
	asm volatile(R"(
		// Save L2's GPRs. This creates the 'struct l2_guest_regs' on the stack.
		// The order MUST match the struct.
		push %%rax
		push %%rbx
		push %%rcx
		push %%rdx
		push %%rsi
		push %%rdi
		push %%rbp
		push %%r8
		push %%r9
		push %%r10
		push %%r11
		push %%r12
		push %%r13
		push %%r14
		push %%r15

		// Prepare arguments for the C handler:
		//   arg1 (RDI) = exit_reason
		//   arg2 (RSI) = pointer to the saved registers
		mov %%rsp, %%rsi
		mov %[vm_exit_reason], %%rbx
		vmread %%rbx, %%rdi

		// Call the C handler.
		call nested_vm_exit_handler_intel

		// The C handler has processed the exit. Now, return to the L1 command
		// processing loop. VMX remains enabled.
		add %[stack_cleanup_size], %%rsp

		// Jump to L1 main flow
		jmp after_vmentry_label
	)"

		     : : [stack_cleanup_size] "i"(sizeof(struct l2_guest_regs)),
		       [vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi");
}

// AMD SVM VMEXIT codes.
#define VMEXIT_RDTSC 0x6e
#define VMEXIT_CPUID 0x72
#define VMEXIT_INVD 0x76
#define VMEXIT_HLT 0x78
#define VMEXIT_RDTSCP 0x87

// Map an AMD exit code onto the vendor-neutral enum above.
GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	if (reason == VMEXIT_HLT)
		return SYZOS_NESTED_EXIT_REASON_HLT;
	if (reason == VMEXIT_INVD)
		return SYZOS_NESTED_EXIT_REASON_INVD;
	if (reason == VMEXIT_CPUID)
		return SYZOS_NESTED_EXIT_REASON_CPUID;
	if (reason == VMEXIT_RDTSC)
		return SYZOS_NESTED_EXIT_REASON_RDTSC;
	if (reason == VMEXIT_RDTSCP)
		return SYZOS_NESTED_EXIT_REASON_RDTSCP;
	return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}

// Advance the L2 guest RIP in the VMCB past the exiting instruction, so a
// subsequent VMRUN does not re-execute it.
GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, uint64 vm_id)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint64 rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP);
	if ((reason == VMEXIT_INVD) || (reason == VMEXIT_CPUID) ||
	    (reason == VMEXIT_RDTSC)) {
		rip += 2;
	} else if (reason == VMEXIT_RDTSCP) {
		// We insist on a single-line compound statement for else-if.
		rip += 3;
	}
	vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, rip);
}

// Report an AMD L2 exit to the host and advance the guest RIP.
__attribute__((used)) GUEST_CODE static void
nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id)
{
	volatile uint64 basic_reason = exit_reason & 0xFFFF;
	syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason);
	guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD);
	advance_l2_rip_amd(basic_reason, cpu_id, vm_id);
}

// Program the host-state area of the current VMCS: the state the CPU loads
// on a VM exit. HOST_RIP points at nested_vm_exit_handler_intel_asm().
GUEST_CODE static noinline void init_vmcs_host_state(void)
{
	// Segment Selectors.
	vmwrite(VMCS_HOST_CS_SELECTOR, X86_SYZOS_SEL_CODE);
	vmwrite(VMCS_HOST_DS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_ES_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_SS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_FS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_GS_SELECTOR, X86_SYZOS_SEL_DATA);
	vmwrite(VMCS_HOST_TR_SELECTOR, X86_SYZOS_SEL_TSS64);

	// Base addresses.
	vmwrite(VMCS_HOST_TR_BASE, 0);
	vmwrite(VMCS_HOST_GDTR_BASE, X86_SYZOS_ADDR_GDT);
	vmwrite(VMCS_HOST_IDTR_BASE, X86_SYZOS_ADDR_VAR_IDT);
	vmwrite(VMCS_HOST_FS_BASE, rdmsr(X86_MSR_FS_BASE));
	vmwrite(VMCS_HOST_GS_BASE, rdmsr(X86_MSR_GS_BASE));

	// RIP and RSP.
	uint64 tmpreg = 0; // nolint
	asm volatile("mov %%rsp, %0" : "=r"(tmpreg));
	vmwrite(VMCS_HOST_RSP, tmpreg);
	vmwrite(VMCS_HOST_RIP, (uintptr_t)nested_vm_exit_handler_intel_asm);

	// Control Registers.
	vmwrite(VMCS_HOST_CR0, read_cr0());
	vmwrite(VMCS_HOST_CR3, read_cr3());
	vmwrite(VMCS_HOST_CR4, read_cr4());

	// MSRs.
	vmwrite(VMCS_HOST_IA32_PAT, rdmsr(X86_MSR_IA32_CR_PAT));
	vmwrite(VMCS_HOST_IA32_EFER, rdmsr(X86_MSR_IA32_EFER));
	vmwrite(VMCS_HOST_IA32_PERF_GLOBAL_CTRL, rdmsr(X86_MSR_CORE_PERF_GLOBAL_CTRL));
	vmwrite(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(X86_MSR_IA32_SYSENTER_CS));
	vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(X86_MSR_IA32_SYSENTER_ESP));
	vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(X86_MSR_IA32_SYSENTER_EIP));
}

// Copy a host-state VMCS field into the corresponding guest-state field.
#define COPY_VMCS_FIELD(GUEST_FIELD, HOST_FIELD) \
	vmwrite(GUEST_FIELD, vmread(HOST_FIELD))

// Program selector/base/limit/access-rights of one guest segment.
#define SETUP_L2_SEGMENT(SEG, SELECTOR, BASE, LIMIT, AR)    \
	vmwrite(VMCS_GUEST_##SEG##_SELECTOR, SELECTOR);     \
	vmwrite(VMCS_GUEST_##SEG##_BASE, BASE);             \
	vmwrite(VMCS_GUEST_##SEG##_LIMIT, LIMIT);           \
	vmwrite(VMCS_GUEST_##SEG##_ACCESS_RIGHTS, AR);

// Program the guest-state area of the current VMCS: the L2 guest starts in
// 64-bit mode at its per-VM code page, mostly mirroring the L1 (host) state.
GUEST_CODE static noinline void init_vmcs_guest_state(uint64 cpu_id, uint64 vm_id)
{
	uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
	uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
	// Segment Registers.
	SETUP_L2_SEGMENT(CS, vmread(VMCS_HOST_CS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_CODE);
	SETUP_L2_SEGMENT(DS, vmread(VMCS_HOST_DS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(ES, vmread(VMCS_HOST_ES_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(SS, vmread(VMCS_HOST_SS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(FS, vmread(VMCS_HOST_FS_SELECTOR), vmread(VMCS_HOST_FS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(GS, vmread(VMCS_HOST_GS_SELECTOR), vmread(VMCS_HOST_GS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);

	// Task and LDT Registers.
	SETUP_L2_SEGMENT(TR, vmread(VMCS_HOST_TR_SELECTOR), vmread(VMCS_HOST_TR_BASE), 0x67, VMX_AR_TSS_BUSY);
	SETUP_L2_SEGMENT(LDTR, 0, 0, 0, VMX_AR_LDTR_UNUSABLE);

	// Control Registers & CPU State.
	vmwrite(VMCS_GUEST_CR0, vmread(VMCS_HOST_CR0));
	vmwrite(VMCS_GUEST_CR3, vmread(VMCS_HOST_CR3));
	vmwrite(VMCS_GUEST_CR4, vmread(VMCS_HOST_CR4));
	vmwrite(VMCS_GUEST_RIP, l2_code_addr);
	vmwrite(VMCS_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
	vmwrite(VMCS_GUEST_RFLAGS, RFLAGS_1_BIT);
	// TODO
	vmwrite(VMCS_GUEST_DR7, 0x400);

	// MSRs - Copy from host or set to default.
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_EFER, VMCS_HOST_IA32_EFER);
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_PAT, VMCS_HOST_IA32_PAT);
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL, VMCS_HOST_IA32_PERF_GLOBAL_CTRL);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_CS, VMCS_HOST_IA32_SYSENTER_CS);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_ESP, VMCS_HOST_IA32_SYSENTER_ESP);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_EIP, VMCS_HOST_IA32_SYSENTER_EIP);
	vmwrite(VMCS_GUEST_IA32_DEBUGCTL, 0);

	// Descriptor Tables.
	vmwrite(VMCS_GUEST_GDTR_BASE, vmread(VMCS_HOST_GDTR_BASE));
	vmwrite(VMCS_GUEST_GDTR_LIMIT, 0xffff);
	vmwrite(VMCS_GUEST_IDTR_BASE, vmread(VMCS_HOST_IDTR_BASE));
	vmwrite(VMCS_GUEST_IDTR_LIMIT, 0xffff);

	// Miscellaneous Fields.
	vmwrite(VMCS_LINK_POINTER, 0xffffffffffffffff);
	// 0 = Active.
	vmwrite(VMCS_GUEST_ACTIVITY_STATE, 0);
	vmwrite(VMCS_GUEST_INTERRUPTIBILITY_INFO, 0);
	vmwrite(VMCS_GUEST_PENDING_DBG_EXCEPTIONS, 0);
	vmwrite(VMCS_VMX_PREEMPTION_TIMER_VALUE, 0);
	vmwrite(VMCS_GUEST_INTR_STATUS, 0);
	vmwrite(VMCS_GUEST_PML_INDEX, 0);
}

// Initialize a fresh VMCS for L2 VM vm_id on cpu_id: write the revision ID,
// VMCLEAR + VMPTRLD it, then program page tables, control, host and guest
// state. Reports 0xE2BAD1 via guest_uexit() if VMCLEAR fails.
GUEST_CODE static noinline void
nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->arg;
	uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint8 error = 0; // nolint

	*(uint32*)vmcs_addr = rdmsr(X86_MSR_IA32_VMX_BASIC);
	asm volatile("vmclear %1; setna %0"
		     : "=q"(error)
		     : "m"(vmcs_addr)
		     : "memory", "cc");
	if (error) {
		guest_uexit(0xE2BAD1);
		return;
	}
	nested_vmptrld(cpu_id, vm_id);

	setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id);
	init_vmcs_control_fields(cpu_id, vm_id);
	init_vmcs_host_state();
	init_vmcs_guest_state(cpu_id, vm_id);
}

// Helper for setting up a segment in the VMCB
#define SETUP_L2_SEGMENT_SVM(VMBC_PTR, SEG_NAME, SELECTOR, BASE, LIMIT, ATTR) \
	vmcb_write16(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_SEL, SELECTOR);        \
	vmcb_write16(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_ATTR, ATTR);           \
	vmcb_write32(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_LIM, LIMIT);           \
	vmcb_write64(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_BASE, BASE);

// Initialize the VMCB guest state for the L2 VM (AMD SVM counterpart of
// init_vmcs_guest_state()).
GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_id)
{
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
	uint64 l2_stack_addr =
X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id); 1038 uint64 npt_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); 1039 // Setup Guest Segment Registers. 1040 // We copy the L1 guest's segment setup, as it's a good 64-bit environment. 1041 SETUP_L2_SEGMENT_SVM(vmcb_addr, CS, X86_SYZOS_SEL_CODE, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_CODE); 1042 SETUP_L2_SEGMENT_SVM(vmcb_addr, DS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); 1043 SETUP_L2_SEGMENT_SVM(vmcb_addr, ES, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); 1044 SETUP_L2_SEGMENT_SVM(vmcb_addr, SS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); 1045 SETUP_L2_SEGMENT_SVM(vmcb_addr, FS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); 1046 SETUP_L2_SEGMENT_SVM(vmcb_addr, GS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); 1047 1048 // Task Register (TR). Must point to a valid, present, 64-bit TSS. 1049 SETUP_L2_SEGMENT_SVM(vmcb_addr, TR, X86_SYZOS_SEL_TSS64, X86_SYZOS_ADDR_VAR_TSS, 0x67, VMX_AR_TSS_AVAILABLE); 1050 1051 // LDT Register (LDTR) - Mark as unusable. 1052 // A null selector and attribute is the correct way to disable LDTR. 1053 SETUP_L2_SEGMENT_SVM(vmcb_addr, LDTR, 0, 0, 0, SVM_ATTR_LDTR_UNUSABLE); 1054 1055 // Setup Guest Control Registers & CPU State. 1056 uint64 efer = rdmsr(X86_MSR_IA32_EFER); 1057 vmcb_write64(vmcb_addr, VMCB_GUEST_CR0, read_cr0() | X86_CR0_WP); 1058 // L2 will use L1's page tables. 1059 vmcb_write64(vmcb_addr, VMCB_GUEST_CR3, read_cr3()); 1060 vmcb_write64(vmcb_addr, VMCB_GUEST_CR4, read_cr4()); 1061 vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, l2_code_addr); 1062 vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); 1063 vmcb_write64(vmcb_addr, VMCB_GUEST_RFLAGS, RFLAGS_1_BIT); 1064 1065 // Setup Guest MSRs. 1066 1067 // SYSCALL/SYSRET MSRs. 
1068 vmcb_write64(vmcb_addr, VMCB_GUEST_DEBUGCTL, 0); 1069 vmcb_write64(vmcb_addr, VMCB_GUEST_DR6, 0x0); 1070 vmcb_write64(vmcb_addr, VMCB_GUEST_DR7, 0x0); 1071 1072 vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, efer & ~X86_EFER_SCE); 1073 vmcb_write64(vmcb_addr, VMCB_GUEST_PAT, rdmsr(X86_MSR_IA32_CR_PAT)); 1074 1075 // Setup Guest Descriptor Tables. 1076 struct { 1077 uint16 limit; 1078 uint64 base; 1079 } __attribute__((packed)) gdtr, idtr; 1080 asm volatile("sgdt %0" : "=m"(gdtr)); 1081 asm volatile("sidt %0" : "=m"(idtr)); 1082 vmcb_write64(vmcb_addr, VMCB_GUEST_GDTR_BASE, gdtr.base); 1083 vmcb_write32(vmcb_addr, VMCB_GUEST_GDTR_LIM, gdtr.limit); 1084 vmcb_write64(vmcb_addr, VMCB_GUEST_IDTR_BASE, idtr.base); 1085 vmcb_write32(vmcb_addr, VMCB_GUEST_IDTR_LIM, idtr.limit); 1086 1087 // Setup VMCB Control Fields. 1088 vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_VEC3_ALL); 1089 vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC4, VMCB_CTRL_INTERCEPT_VEC4_ALL); 1090 1091 // Enable Nested Paging (NPT): 1092 // Write '1' to the NPT Enable field (0x090). 1093 vmcb_write64(vmcb_addr, VMCB_CTRL_NP_ENABLE, (1 << VMCB_CTRL_NPT_ENABLE_BIT)); 1094 1095 // 2Write the NPT root address to N_CR3 (0x098) 1096 // Unlike Intel's EPTP, AMD's N_CR3 field is *only* the 1097 // 4K-aligned physical address of the PML4 table. 1098 // It does not contain any control bits. 1099 uint64 npt_pointer = (npt_pml4_addr & ~0xFFF); 1100 vmcb_write64(vmcb_addr, VMCB_CTRL_N_CR3, npt_pointer); 1101 1102 // Set Guest ASID. 
1103 vmcb_write32(vmcb_addr, VMCB_CTRL_ASID, 1); 1104 } 1105 1106 GUEST_CODE static noinline void 1107 nested_create_vm_amd(struct api_call_1* cmd, uint64 cpu_id) 1108 { 1109 uint64 vm_id = cmd->arg; 1110 uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); 1111 1112 guest_memset((void*)vmcb_addr, 0, KVM_PAGE_SIZE); 1113 guest_memset((void*)X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id), 0, KVM_PAGE_SIZE); 1114 1115 // Setup NPT (Nested Page Tables) 1116 setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id); 1117 1118 // Initialize VMCB Control and Guest State 1119 init_vmcb_guest_state(cpu_id, vm_id); 1120 } 1121 1122 GUEST_CODE static noinline void 1123 guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id) 1124 { 1125 if (get_cpu_vendor() == CPU_VENDOR_INTEL) { 1126 nested_create_vm_intel(cmd, cpu_id); 1127 } else { 1128 nested_create_vm_amd(cmd, cpu_id); 1129 } 1130 } 1131 1132 GUEST_CODE static noinline void 1133 guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id) 1134 { 1135 uint64 vm_id = cmd->vm_id; 1136 uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id); 1137 uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id); 1138 // Code size = command size - header size - vm_id size. 
1139 uint64 l2_code_size = cmd->header.size - sizeof(struct api_call_header) - sizeof(uint64); 1140 if (l2_code_size > KVM_PAGE_SIZE) 1141 l2_code_size = KVM_PAGE_SIZE; 1142 guest_memcpy((void*)l2_code_addr, (void*)cmd->insns, 1143 l2_code_size); 1144 if (get_cpu_vendor() == CPU_VENDOR_INTEL) { 1145 nested_vmptrld(cpu_id, vm_id); 1146 vmwrite(VMCS_GUEST_RIP, l2_code_addr); 1147 vmwrite(VMCS_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); 1148 } else { 1149 vmcb_write64(X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), VMCB_GUEST_RIP, l2_code_addr); 1150 vmcb_write64(X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); 1151 } 1152 } 1153 1154 // Clang's LTO may ignore noinline and attempt to inline this function into both callers, 1155 // which results in duplicate declaration of after_vmentry_label. 1156 // Applying __optnone should prevent this behavior. 1157 GUEST_CODE static noinline __optnone void 1158 guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch) 1159 { 1160 uint64 vmx_error_code = 0; 1161 uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set 1162 1163 nested_vmptrld(cpu_id, vm_id); 1164 1165 if (is_launch) { 1166 asm volatile(R"( 1167 // Attempt to launch the L2 guest. 1168 vmlaunch 1169 // Set AL to 1 if CF=1 (VMfailValid) 1170 setc %%al 1171 // Set BL to 1 if ZF=1 (VMfailInvalid) 1172 setz %%bl 1173 or %%bl, %%al)" 1174 : "=a"(fail_flag) 1175 : 1176 : "rbx", "cc", "memory"); 1177 } else { 1178 asm volatile(R"( 1179 // Attempt to resume the L2 guest. 1180 vmresume 1181 // Set AL to 1 if CF=1 (VMfailValid) 1182 setc %%al 1183 // Set BL to 1 if ZF=1 (VMfailInvalid) 1184 setz %%bl 1185 or %%bl, %%al)" 1186 : "=a"(fail_flag) 1187 : 1188 : "rbx", "cc", "memory"); 1189 } 1190 asm volatile(".globl after_vmentry_label\nafter_vmentry_label:"); 1191 if (fail_flag) { 1192 // VMLAUNCH/VMRESUME failed, so VMCS is still valid and can be read. 
1193 vmx_error_code = vmread(VMCS_VM_INSTRUCTION_ERROR); 1194 guest_uexit(0xE2E10000 | (uint32)vmx_error_code); 1195 return; 1196 } 1197 // If we get here, this means VMLAUNCH/VMRESUME truly succeeded (CF=0 and ZF=0) 1198 // and the L2 guest has run and exited. 1199 } 1200 1201 GUEST_CODE static noinline void 1202 guest_run_amd_vm(uint64 cpu_id, uint64 vm_id) 1203 { 1204 uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); 1205 volatile uint8* vmcb_ptr = (volatile uint8*)vmcb_addr; 1206 uint8 fail_flag = 0; 1207 1208 asm volatile( 1209 "mov %1, %%rax\n\t" // Load VMCB physical address into RAX 1210 "vmrun\n\t" // Launch or resume L2 guest 1211 "setc %0\n\t" 1212 : "=q"(fail_flag) 1213 : "m"(vmcb_addr) 1214 : "rax", "cc", "memory"); 1215 1216 if (fail_flag) { 1217 // VMRUN failed. 1218 guest_uexit(0xE2E10000 | 0xFFFF); 1219 return; 1220 } 1221 1222 // VMRUN succeeded and we have a VM-exit. 1223 uint64 exit_reason = vmcb_read64(vmcb_ptr, VMCB_EXIT_CODE); 1224 nested_vm_exit_handler_amd(exit_reason, cpu_id, vm_id); 1225 } 1226 1227 GUEST_CODE static noinline void 1228 guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id) 1229 { 1230 uint64 vm_id = cmd->arg; 1231 if (get_cpu_vendor() == CPU_VENDOR_INTEL) { 1232 guest_handle_nested_vmentry_intel(vm_id, cpu_id, true); 1233 } else { 1234 guest_run_amd_vm(cpu_id, vm_id); 1235 } 1236 } 1237 1238 GUEST_CODE static noinline void 1239 guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id) 1240 { 1241 uint64 vm_id = cmd->arg; 1242 if (get_cpu_vendor() == CPU_VENDOR_INTEL) { 1243 guest_handle_nested_vmentry_intel(vm_id, cpu_id, false); 1244 } else { 1245 guest_run_amd_vm(cpu_id, vm_id); 1246 } 1247 } 1248 1249 GUEST_CODE static noinline void 1250 guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id) 1251 { 1252 if (get_cpu_vendor() != CPU_VENDOR_INTEL) 1253 return; 1254 uint64 vm_id = cmd->args[0]; 1255 nested_vmptrld(cpu_id, vm_id); 1256 uint64 field = cmd->args[1]; 1257 
uint64 set_mask = cmd->args[2]; 1258 uint64 unset_mask = cmd->args[3]; 1259 uint64 flip_mask = cmd->args[4]; 1260 1261 uint64 current_value = vmread(field); 1262 uint64 new_value = (current_value & ~unset_mask) | set_mask; 1263 new_value ^= flip_mask; 1264 vmwrite(field, new_value); 1265 } 1266 1267 GUEST_CODE static noinline void 1268 guest_handle_nested_amd_vmcb_write_mask(struct api_call_5* cmd, uint64 cpu_id) 1269 { 1270 if (get_cpu_vendor() != CPU_VENDOR_AMD) 1271 return; 1272 uint64 vm_id = cmd->args[0]; 1273 uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); 1274 uint64 offset = cmd->args[1]; 1275 uint64 set_mask = cmd->args[2]; 1276 uint64 unset_mask = cmd->args[3]; 1277 uint64 flip_mask = cmd->args[4]; 1278 1279 uint64 current_value = vmcb_read64((volatile uint8*)vmcb_addr, offset); 1280 uint64 new_value = (current_value & ~unset_mask) | set_mask; 1281 new_value ^= flip_mask; 1282 vmcb_write64(vmcb_addr, offset, new_value); 1283 } 1284 1285 #endif // EXECUTOR_COMMON_KVM_AMD64_SYZOS_H