github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/executor/common_kvm_amd64.h

// Copyright 2017 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

#ifndef EXECUTOR_COMMON_KVM_AMD64_H
#define EXECUTOR_COMMON_KVM_AMD64_H

// This file is shared between executor and csource package.

// Implementation of syz_kvm_setup_cpu pseudo-syscall.
// See Intel Software Developer’s Manual Volume 3: System Programming Guide
// for details on what happens here.

#include "common_kvm.h"
#include "common_kvm_amd64_syzos.h"
#include "kvm.h"
#include "kvm_amd64.S.h"

#ifndef KVM_SMI
#define KVM_SMI _IO(KVMIO, 0xb7)
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
struct tss16 {
	uint16 prev;
	uint16 sp0;
	uint16 ss0;
	uint16 sp1;
	uint16 ss1;
	uint16 sp2;
	uint16 ss2;
	uint16 ip;
	uint16 flags;
	uint16 ax;
	uint16 cx;
	uint16 dx;
	uint16 bx;
	uint16 sp;
	uint16 bp;
	uint16 si;
	uint16 di;
	uint16 es;
	uint16 cs;
	uint16 ss;
	uint16 ds;
	uint16 ldt;
} __attribute__((packed));

struct tss32 {
	uint16 prev, prevh;
	uint32 sp0;
	uint16 ss0, ss0h;
	uint32 sp1;
	uint16 ss1, ss1h;
	uint32 sp2;
	uint16 ss2, ss2h;
	uint32 cr3;
	uint32 ip;
	uint32 flags;
	uint32 ax;
	uint32 cx;
	uint32 dx;
	uint32 bx;
	uint32 sp;
	uint32 bp;
	uint32 si;
	uint32 di;
	uint16 es, esh;
	uint16 cs, csh;
	uint16 ss, ssh;
	uint16 ds, dsh;
	uint16 fs, fsh;
	uint16 gs, gsh;
	uint16 ldt, ldth;
	uint16 trace;
	uint16 io_bitmap;
} __attribute__((packed));
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
struct tss64 {
	uint32 reserved0;
	uint64 rsp[3];
	uint64 reserved1;
	uint64 ist[7];
	uint64 reserved2;
	uint16 reserved3;
	uint16 io_bitmap;
} __attribute__((packed));

static void fill_segment_descriptor(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	uint16 index = seg->selector >> 3;
	uint64 limit = seg->g ? seg->limit >> 12 : seg->limit;
	uint64 sd = (limit & 0xffff) | (seg->base & 0xffffff) << 16 | (uint64)seg->type << 40 | (uint64)seg->s << 44 | (uint64)seg->dpl << 45 | (uint64)seg->present << 47 | (limit & 0xf0000ULL) << 32 | (uint64)seg->avl << 52 | (uint64)seg->l << 53 | (uint64)seg->db << 54 | (uint64)seg->g << 55 | (seg->base & 0xff000000ULL) << 32;
	dt[index] = sd;
	lt[index] = sd;
}
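
// Layout of the legacy 8-byte descriptor assembled above (Intel SDM Vol. 3,
// "Segment Descriptors"), for reference:
//   bits 0-15:  limit[15:0]           bits 16-39: base[23:0]
//   bits 40-43: type                  bit 44:     S (code/data vs. system)
//   bits 45-46: DPL                   bit 47:     P (present)
//   bits 48-51: limit[19:16]          bit 52:     AVL
//   bit 53:     L (64-bit code)       bit 54:     D/B
//   bit 55:     G (4KB granularity)   bits 56-63: base[31:24]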

static void fill_segment_descriptor_dword(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	fill_segment_descriptor(dt, lt, seg);
	uint16 index = seg->selector >> 3;
	dt[index + 1] = 0;
	lt[index + 1] = 0;
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
static void setup_syscall_msrs(int cpufd, uint16 sel_cs, uint16 sel_cs_cpl3)
{
	char buf[sizeof(struct kvm_msrs) + 5 * sizeof(struct kvm_msr_entry)];
	memset(buf, 0, sizeof(buf));
	struct kvm_msrs* msrs = (struct kvm_msrs*)buf;
	struct kvm_msr_entry* entries = msrs->entries;
	msrs->nmsrs = 5;
	entries[0].index = X86_MSR_IA32_SYSENTER_CS;
	entries[0].data = sel_cs;
	entries[1].index = X86_MSR_IA32_SYSENTER_ESP;
	entries[1].data = X86_ADDR_STACK0;
	entries[2].index = X86_MSR_IA32_SYSENTER_EIP;
	entries[2].data = X86_ADDR_VAR_SYSEXIT;
	entries[3].index = X86_MSR_IA32_STAR;
	entries[3].data = ((uint64)sel_cs << 32) | ((uint64)sel_cs_cpl3 << 48);
	entries[4].index = X86_MSR_IA32_LSTAR;
	entries[4].data = X86_ADDR_VAR_SYSRET;
	ioctl(cpufd, KVM_SET_MSRS, msrs);
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
static void setup_32bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + X86_ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	for (int i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = i << 3;
		switch (i % 6) {
		case 0:
			// 16-bit interrupt gate
			gate.type = 6;
			gate.base = X86_SEL_CS16;
			break;
		case 1:
			// 16-bit trap gate
			gate.type = 7;
			gate.base = X86_SEL_CS16;
			break;
		case 2:
			// 16-bit task gate
			gate.type = 3;
			gate.base = X86_SEL_TGATE16;
			break;
		case 3:
			// 32-bit interrupt gate
			gate.type = 14;
			gate.base = X86_SEL_CS32;
			break;
		case 4:
			// 32-bit trap gate
			gate.type = 15;
			gate.base = X86_SEL_CS32;
			break;
		case 5:
			// 32-bit task gate
			gate.type = 11;
			gate.base = X86_SEL_TGATE32;
			break;
		}
		gate.limit = guest_mem + X86_ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor(idt, idt, &gate);
	}
}
#endif
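
// Note that setup_32bit_idt() reuses fill_segment_descriptor() to build gate
// descriptors: in a gate, bits 16-31 hold the target selector and bits 0-15
// the low half of the handler offset, which is why the target selector is
// stored in gate.base and the entry offset in gate.limit above.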

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + X86_ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	for (int i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = (i * 2) << 3;
		gate.type = (i & 1) ? 14 : 15; // interrupt or trap gate
		gate.base = X86_SEL_CS64;
		gate.limit = guest_mem + X86_ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor_dword(idt, idt, &gate);
	}
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm || __NR_syz_kvm_add_vcpu
// Flags for mem_region
#define MEM_REGION_FLAG_USER_CODE (1 << 0)
#define MEM_REGION_FLAG_DIRTY_LOG (1 << 1)
#define MEM_REGION_FLAG_READONLY (1 << 2)
#define MEM_REGION_FLAG_EXECUTOR_CODE (1 << 3)
#define MEM_REGION_FLAG_GPA0 (1 << 5)
#define MEM_REGION_FLAG_NO_HOST_MEM (1 << 6)

struct mem_region {
	uint64 gpa;
	int pages;
	uint32 flags;
};

// SYZOS guest virtual memory layout (must be in sync with executor/kvm.h):
static const struct mem_region syzos_mem_regions[] = {
	// AMD64 data structures (48 pages starting at GPA 0x0, see kvm.h).
	{X86_SYZOS_ADDR_ZERO, 48, MEM_REGION_FLAG_GPA0},
	// SMRAM memory.
	{X86_SYZOS_ADDR_SMRAM, 10, 0},
	// Unmapped region to trigger page faults for uexits etc.
	{X86_SYZOS_ADDR_EXIT, 1, MEM_REGION_FLAG_NO_HOST_MEM},
	// Writable region with KVM_MEM_LOG_DIRTY_PAGES to fuzz the dirty ring.
	{X86_SYZOS_ADDR_DIRTY_PAGES, 2, MEM_REGION_FLAG_DIRTY_LOG},
	// SYZOS user code (generated by the fuzzer).
	{X86_SYZOS_ADDR_USER_CODE, KVM_MAX_VCPU, MEM_REGION_FLAG_READONLY | MEM_REGION_FLAG_USER_CODE},
	// Executor guest code.
	{SYZOS_ADDR_EXECUTOR_CODE, 4, MEM_REGION_FLAG_READONLY | MEM_REGION_FLAG_EXECUTOR_CODE},
	// Scratch memory for code generated at runtime.
	{X86_SYZOS_ADDR_SCRATCH_CODE, 1, 0},
	// CPU stack.
	{X86_SYZOS_ADDR_STACK_BOTTOM, 1, 0},
	// Per-VCPU regions for L2 VMs.
	{X86_SYZOS_PER_VCPU_REGIONS_BASE, (KVM_MAX_VCPU * X86_SYZOS_L1_VCPU_REGION_SIZE) / KVM_PAGE_SIZE, 0},
	// IOAPIC memory.
	{X86_SYZOS_ADDR_IOAPIC, 1, 0},
};
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm || __NR_syz_kvm_setup_cpu || __NR_syz_kvm_add_vcpu
struct kvm_syz_vm {
	int vmfd;
	int next_cpu_id;
	void* host_mem;
	size_t total_pages;
	void* user_text;
	void* gpa0_mem;
};
#endif
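
// The X86_SYZOS_ADDR_EXIT page above intentionally gets no backing memslot:
// guest accesses to it cannot be satisfied from memory, so KVM forwards them
// to userspace as KVM_EXIT_MMIO. syz_kvm_assert_syzos_uexit() below relies on
// this to recognize guest-initiated "uexits" (X86_SYZOS_ADDR_UEXIT is expected
// to fall within this unmapped range, see executor/kvm.h).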

#if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu

#define X86_NUM_IDT_ENTRIES 256
static void syzos_setup_idt(struct kvm_syz_vm* vm, struct kvm_sregs* sregs)
{
	sregs->idt.base = X86_SYZOS_ADDR_VAR_IDT;
	sregs->idt.limit = (X86_NUM_IDT_ENTRIES * sizeof(struct idt_entry_64)) - 1;
	volatile struct idt_entry_64* idt =
	    (volatile struct idt_entry_64*)((uint64)vm->host_mem + sregs->idt.base);
	uint64 handler_addr = executor_fn_guest_addr(dummy_null_handler);
	for (int i = 0; i < X86_NUM_IDT_ENTRIES; i++) {
		idt[i].offset_low = (uint16)(handler_addr & 0xFFFF);
		idt[i].selector = X86_SYZOS_SEL_CODE;
		idt[i].ist = 0;
		// 0x8E is a 64-bit interrupt gate: P=1, DPL=0, type=0xE.
		idt[i].type_attr = 0x8E;
		idt[i].offset_mid = (uint16)((handler_addr >> 16) & 0xFFFF);
		idt[i].offset_high = (uint32)((handler_addr >> 32) & 0xFFFFFFFF);
		idt[i].reserved = 0;
	}
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu || __NR_syz_kvm_add_vcpu
struct kvm_text {
	uintptr_t typ;
	const void* text;
	uintptr_t size;
};
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
struct kvm_opt {
	uint64 typ;
	uint64 val;
};
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
#define PAGE_MASK GENMASK_ULL(51, 12)

typedef struct {
	uint64 next_page;
	uint64 last_page;
} page_alloc_t;

static uint64 pg_alloc(page_alloc_t* alloc)
{
	if (alloc->next_page >= alloc->last_page)
		fail("page table allocation failed");
	uint64 page = alloc->next_page;
	alloc->next_page += KVM_PAGE_SIZE;
	return page;
}

static void map_4k_page(uint64 host_mem, page_alloc_t* alloc, uint64 gpa)
{
	uint64* pml4 = (uint64*)(host_mem + X86_SYZOS_ADDR_PML4);

	// PML4 Entry (Level 4).
	uint64 pml4_idx = (gpa >> 39) & 0x1FF;
	if (pml4[pml4_idx] == 0)
		pml4[pml4_idx] = X86_PDE64_PRESENT | X86_PDE64_RW | pg_alloc(alloc);
	uint64* pdpt = (uint64*)(host_mem + (pml4[pml4_idx] & PAGE_MASK));

	// PDPT Entry (Level 3).
	uint64 pdpt_idx = (gpa >> 30) & 0x1FF;
	if (pdpt[pdpt_idx] == 0)
		pdpt[pdpt_idx] = X86_PDE64_PRESENT | X86_PDE64_RW | pg_alloc(alloc);
	uint64* pd = (uint64*)(host_mem + (pdpt[pdpt_idx] & PAGE_MASK));

	// PD Entry (Level 2).
	uint64 pd_idx = (gpa >> 21) & 0x1FF;
	if (pd[pd_idx] == 0)
		pd[pd_idx] = X86_PDE64_PRESENT | X86_PDE64_RW | pg_alloc(alloc);
	uint64* pt = (uint64*)(host_mem + (pd[pd_idx] & PAGE_MASK));

	// PT Entry (Level 1).
	uint64 pt_idx = (gpa >> 12) & 0x1FF;

	// Set the final 4KB page table entry to map the GPA.
	// This is an identity map: GPA -> GPA.
	pt[pt_idx] = (gpa & PAGE_MASK) | X86_PDE64_PRESENT | X86_PDE64_RW;
}

static int map_4k_region(uint64 host_mem, page_alloc_t* alloc, uint64 gpa_start, int num_pages)
{
	for (int i = 0; i < num_pages; i++)
		map_4k_page(host_mem, alloc, gpa_start + (i * KVM_PAGE_SIZE));
	return num_pages;
}

// We assume a 4-level page table; in the future we could add support for
// n-level paging if needed.
static void setup_pg_table(struct kvm_syz_vm* vm)
{
	int total = vm->total_pages;
	// Page tables are located in the first memory region starting at 0x0.
	uint64 host_mem = (uint64)vm->gpa0_mem;

	page_alloc_t alloc = {.next_page = X86_SYZOS_ADDR_PT_POOL,
			      .last_page = X86_SYZOS_ADDR_PT_POOL + 32 * KVM_PAGE_SIZE};

	// Zero out all page table memory.
	for (uint64 i = 0; i < (alloc.last_page - alloc.next_page); i += KVM_PAGE_SIZE)
		memset((void*)(host_mem + alloc.next_page + i), 0, KVM_PAGE_SIZE);

	// Map all the regions defined in setup_vm().
	for (size_t i = 0; i < sizeof(syzos_mem_regions) / sizeof(syzos_mem_regions[0]); i++)
		total -= map_4k_region(host_mem, &alloc, syzos_mem_regions[i].gpa, syzos_mem_regions[i].pages);
	map_4k_region(host_mem, &alloc, X86_SYZOS_ADDR_UNUSED, total);
}
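
// Illustrative index math performed by map_4k_page(), e.g. for
// gpa = 0x40001000:
//   pml4_idx = (gpa >> 39) & 0x1ff = 0
//   pdpt_idx = (gpa >> 30) & 0x1ff = 1 (second 1GB slot)
//   pd_idx   = (gpa >> 21) & 0x1ff = 0
//   pt_idx   = (gpa >> 12) & 0x1ff = 1 (second 4KB page of that 2MB range)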

// A 64-bit GDT entry for a code or data segment.
// System segments (like TSS) are different and use a 128-bit format.
struct gdt_entry {
	uint16 limit_low;
	uint16 base_low;
	uint8 base_mid;
	uint8 access;
	uint8 limit_high_and_flags;
	uint8 base_high;
} __attribute__((packed));

static void setup_gdt_64(struct gdt_entry* gdt)
{
	// Entry 0: Null
	gdt[0] = (struct gdt_entry){0};

	// Entry 1 (selector 0x08): 64-bit Code Segment
	// P=1, DPL=0, S=1, Type=Execute/Read, L=1, G=1
	gdt[X86_SYZOS_SEL_CODE >> 3] = (struct gdt_entry){
	    .limit_low = 0xFFFF,
	    .base_low = 0,
	    .base_mid = 0,
	    .access = 0x9A, // Present, DPL=0, S=1, Type=Execute/Read
	    .limit_high_and_flags = 0xAF, // Granularity=1, L=1, Limit=0xF
	    .base_high = 0};

	// Entry 2 (selector 0x10): 64-bit Data Segment
	// P=1, DPL=0, S=1, Type=Read/Write, DB=1, G=1
	gdt[X86_SYZOS_SEL_DATA >> 3] = (struct gdt_entry){
	    .limit_low = 0xFFFF,
	    .base_low = (uint16)(X86_SYZOS_ADDR_VAR_TSS & 0xFFFF),
	    .base_mid = (uint8)((X86_SYZOS_ADDR_VAR_TSS >> 16) & 0xFF),
	    .access = 0x92, // Present, DPL=0, S=1, Type=Read/Write
	    .limit_high_and_flags = 0xCF, // Granularity=1, DB=1, Limit=0xF
	    .base_high = (uint8)((X86_SYZOS_ADDR_VAR_TSS >> 24) & 0xFF)};
	// Entry 3 (selector 0x18): 64-bit TSS Segment
	gdt[X86_SYZOS_SEL_TSS64 >> 3] = (struct gdt_entry){
	    .limit_low = 0x67, // Minimal TSS limit
	    .base_low = 0,
	    .base_mid = 0,
	    .access = 0x89, // Present, DPL=0, 64-bit TSS (Available)
	    .limit_high_and_flags = 0x00, // G=0, Limit High = 0
	    .base_high = 0};
	// NOTE: A 64-bit TSS descriptor actually needs a second GDT entry for the high 32 bits of the base.
	// We keep the base at 0 for simplicity, so the second entry (index 4) can remain 0.
}
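
// In the entries above, limit_high_and_flags packs two nibbles: the low nibble
// is limit[19:16] and the high nibble holds the flags. E.g. 0xAF is G=1, D/B=0,
// L=1, AVL=0, limit[19:16]=0xF (a long-mode code segment), while 0xCF is G=1,
// D/B=1, L=0, limit[19:16]=0xF (a 32-bit data segment).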

// This only sets up a 64-bit VCPU.
// TODO: Should add support for other modes.
static void setup_gdt_ldt_pg(struct kvm_syz_vm* vm, int cpufd)
{
	struct kvm_sregs sregs;
	ioctl(cpufd, KVM_GET_SREGS, &sregs);

	sregs.gdt.base = X86_SYZOS_ADDR_GDT;
	sregs.gdt.limit = 5 * sizeof(struct gdt_entry) - 1;
	struct gdt_entry* gdt = (struct gdt_entry*)((uint64)vm->host_mem + sregs.gdt.base);

	struct kvm_segment seg_cs64;
	memset(&seg_cs64, 0, sizeof(seg_cs64));
	seg_cs64.selector = X86_SYZOS_SEL_CODE;
	seg_cs64.type = 11;
	seg_cs64.base = 0;
	seg_cs64.limit = 0xFFFFFFFFu;
	seg_cs64.present = 1;
	seg_cs64.s = 1;
	seg_cs64.g = 1;
	seg_cs64.l = 1;

	sregs.cs = seg_cs64;

	struct kvm_segment seg_ds64;
	memset(&seg_ds64, 0, sizeof(struct kvm_segment));
	seg_ds64.selector = X86_SYZOS_SEL_DATA;
	seg_ds64.type = 3;
	seg_ds64.limit = 0xFFFFFFFFu;
	seg_ds64.present = 1;
	seg_ds64.s = 1;
	seg_ds64.g = 1;
	seg_ds64.db = 1;

	sregs.ds = seg_ds64;
	sregs.es = seg_ds64;
	sregs.fs = seg_ds64;
	sregs.gs = seg_ds64;
	sregs.ss = seg_ds64;

	// The L1 guest (the host for L2) MUST have a valid TR
	// pointing to the 64-bit TSS in the GDT.
	struct kvm_segment seg_tr;
	memset(&seg_tr, 0, sizeof(seg_tr));
	seg_tr.selector = X86_SYZOS_SEL_TSS64; // 0x18
	seg_tr.type = 11; // 64-bit TSS (Busy)
	seg_tr.base = X86_SYZOS_ADDR_VAR_TSS;
	seg_tr.limit = 0x67; // Limit of the TSS descriptor
	seg_tr.present = 1;
	seg_tr.s = 0; // System segment
	sregs.tr = seg_tr;

	// The L1 TSS memory is at (vm->host_mem + X86_SYZOS_ADDR_VAR_TSS).
	volatile uint8* l1_tss =
	    (volatile uint8*)((uint64)vm->host_mem + X86_SYZOS_ADDR_VAR_TSS);

	// Zero out the TSS (104 bytes for 64-bit).
	memset((void*)l1_tss, 0, 104);

	// Set the critical RSP0 field to the L1 guest's main stack.
	// RSP0 is at offset +4 bytes in a 64-bit TSS.
	*(volatile uint64*)(l1_tss + 4) = X86_SYZOS_ADDR_STACK0;

	setup_gdt_64(gdt);

	syzos_setup_idt(vm, &sregs);
	setup_pg_table(vm);

	sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
	sregs.efer |= (X86_EFER_LME | X86_EFER_LMA | X86_EFER_NXE);
	sregs.cr3 = X86_ADDR_PML4;

	ioctl(cpufd, KVM_SET_SREGS, &sregs);
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
static void setup_cpuid(int cpufd)
{
	int kvmfd = open("/dev/kvm", O_RDWR);
	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
	memset(buf, 0, sizeof(buf));
	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
	cpuid->nent = 128;
	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
	close(kvmfd);
}
#endif
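
// setup_cpuid() mirrors the host's supported CPUID leaves into the vCPU
// (KVM_GET_SUPPORTED_CPUID -> KVM_SET_CPUID2), so SyzOS guests see a
// host-like feature set; 128 entries is assumed to be enough to hold
// everything the host reports.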

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
#define KVM_SETUP_PAGING (1 << 0)
#define KVM_SETUP_PAE (1 << 1)
#define KVM_SETUP_PROTECTED (1 << 2)
#define KVM_SETUP_CPL3 (1 << 3)
#define KVM_SETUP_VIRT86 (1 << 4)
#define KVM_SETUP_SMM (1 << 5)
#define KVM_SETUP_VM (1 << 6)

// syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts])
static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4, volatile long a5, volatile long a6, volatile long a7)
{
	const int vmfd = a0;
	const int cpufd = a1;
	char* const host_mem = (char*)a2;
	const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3;
	const uintptr_t text_count = a4;
	const uintptr_t flags = a5;
	const struct kvm_opt* const opt_array_ptr = (struct kvm_opt*)a6;
	uintptr_t opt_count = a7;

	const uintptr_t page_size = 4 << 10;
	const uintptr_t ioapic_page = 10;
	const uintptr_t guest_mem_size = 24 * page_size;
	const uintptr_t guest_mem = 0;

	(void)text_count; // the fuzzer can spoof the count and we only need one text, so ignore text_count
	int text_type = text_array_ptr[0].typ;
	const void* text = text_array_ptr[0].text;
	uintptr_t text_size = text_array_ptr[0].size;

	for (uintptr_t i = 0; i < guest_mem_size / page_size; i++) {
		struct kvm_userspace_memory_region memreg;
		memreg.slot = i;
		memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY
		memreg.guest_phys_addr = guest_mem + i * page_size;
		if (i == ioapic_page)
			memreg.guest_phys_addr = 0xfec00000;
		memreg.memory_size = page_size;
		memreg.userspace_addr = (uintptr_t)host_mem + i * page_size;
		ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
	}
	// SMRAM
	struct kvm_userspace_memory_region memreg;
	memreg.slot = 1 + (1 << 16);
	memreg.flags = 0;
	memreg.guest_phys_addr = 0x30000;
	memreg.memory_size = 64 << 10;
	memreg.userspace_addr = (uintptr_t)host_mem;
	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);

	struct kvm_sregs sregs;
	if (ioctl(cpufd, KVM_GET_SREGS, &sregs))
		return -1;

	struct kvm_regs regs;
	memset(&regs, 0, sizeof(regs));
	regs.rip = guest_mem + X86_ADDR_TEXT;
	regs.rsp = X86_ADDR_STACK0;

	sregs.gdt.base = guest_mem + X86_ADDR_GDT;
	sregs.gdt.limit = 256 * sizeof(uint64) - 1;
	uint64* gdt = (uint64*)(host_mem + sregs.gdt.base);

	struct kvm_segment seg_ldt;
	memset(&seg_ldt, 0, sizeof(seg_ldt));
	seg_ldt.selector = X86_SEL_LDT;
	seg_ldt.type = 2;
	seg_ldt.base = guest_mem + X86_ADDR_LDT;
	seg_ldt.limit = 256 * sizeof(uint64) - 1;
	seg_ldt.present = 1;
	seg_ldt.dpl = 0;
	seg_ldt.s = 0;
	seg_ldt.g = 0;
	seg_ldt.db = 1;
	seg_ldt.l = 0;
	sregs.ldt = seg_ldt;
	uint64* ldt = (uint64*)(host_mem + sregs.ldt.base);

	struct kvm_segment seg_cs16;
	memset(&seg_cs16, 0, sizeof(seg_cs16));
	seg_cs16.selector = X86_SEL_CS16;
	seg_cs16.type = 11;
	seg_cs16.base = 0;
	seg_cs16.limit = 0xfffff;
	seg_cs16.present = 1;
	seg_cs16.dpl = 0;
	seg_cs16.s = 1;
	seg_cs16.g = 0;
	seg_cs16.db = 0;
	seg_cs16.l = 0;

	struct kvm_segment seg_ds16 = seg_cs16;
	seg_ds16.selector = X86_SEL_DS16;
	seg_ds16.type = 3;

	struct kvm_segment seg_cs16_cpl3 = seg_cs16;
	seg_cs16_cpl3.selector = X86_SEL_CS16_CPL3;
	seg_cs16_cpl3.dpl = 3;

	struct kvm_segment seg_ds16_cpl3 = seg_ds16;
	seg_ds16_cpl3.selector = X86_SEL_DS16_CPL3;
	seg_ds16_cpl3.dpl = 3;

	struct kvm_segment seg_cs32 = seg_cs16;
	seg_cs32.selector = X86_SEL_CS32;
	seg_cs32.db = 1;

	struct kvm_segment seg_ds32 = seg_ds16;
	seg_ds32.selector = X86_SEL_DS32;
	seg_ds32.db = 1;

	struct kvm_segment seg_cs32_cpl3 = seg_cs32;
	seg_cs32_cpl3.selector = X86_SEL_CS32_CPL3;
	seg_cs32_cpl3.dpl = 3;

	struct kvm_segment seg_ds32_cpl3 = seg_ds32;
	seg_ds32_cpl3.selector = X86_SEL_DS32_CPL3;
	seg_ds32_cpl3.dpl = 3;

	struct kvm_segment seg_cs64 = seg_cs16;
	seg_cs64.selector = X86_SEL_CS64;
	seg_cs64.l = 1;

	struct kvm_segment seg_ds64 = seg_ds32;
	seg_ds64.selector = X86_SEL_DS64;

	struct kvm_segment seg_cs64_cpl3 = seg_cs64;
	seg_cs64_cpl3.selector = X86_SEL_CS64_CPL3;
	seg_cs64_cpl3.dpl = 3;

	struct kvm_segment seg_ds64_cpl3 = seg_ds64;
	seg_ds64_cpl3.selector = X86_SEL_DS64_CPL3;
	seg_ds64_cpl3.dpl = 3;

	struct kvm_segment seg_tss32;
	memset(&seg_tss32, 0, sizeof(seg_tss32));
	seg_tss32.selector = X86_SEL_TSS32;
	seg_tss32.type = 9;
	seg_tss32.base = X86_ADDR_VAR_TSS32;
	seg_tss32.limit = 0x1ff;
	seg_tss32.present = 1;
	seg_tss32.dpl = 0;
	seg_tss32.s = 0;
	seg_tss32.g = 0;
	seg_tss32.db = 0;
	seg_tss32.l = 0;

	struct kvm_segment seg_tss32_2 = seg_tss32;
	seg_tss32_2.selector = X86_SEL_TSS32_2;
	seg_tss32_2.base = X86_ADDR_VAR_TSS32_2;

	struct kvm_segment seg_tss32_cpl3 = seg_tss32;
	seg_tss32_cpl3.selector = X86_SEL_TSS32_CPL3;
	seg_tss32_cpl3.base = X86_ADDR_VAR_TSS32_CPL3;

	struct kvm_segment seg_tss32_vm86 = seg_tss32;
	seg_tss32_vm86.selector = X86_SEL_TSS32_VM86;
	seg_tss32_vm86.base = X86_ADDR_VAR_TSS32_VM86;
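
	// System-segment types used for the TSS descriptors here (Intel SDM
	// Vol. 3): 9 is an available 32/64-bit TSS and 1 an available 16-bit
	// TSS; the corresponding busy types are 11 and 3.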

	struct kvm_segment seg_tss16 = seg_tss32;
	seg_tss16.selector = X86_SEL_TSS16;
	seg_tss16.base = X86_ADDR_VAR_TSS16;
	seg_tss16.limit = 0xff;
	seg_tss16.type = 1;

	struct kvm_segment seg_tss16_2 = seg_tss16;
	seg_tss16_2.selector = X86_SEL_TSS16_2;
	seg_tss16_2.base = X86_ADDR_VAR_TSS16_2;
	seg_tss16_2.dpl = 0;

	struct kvm_segment seg_tss16_cpl3 = seg_tss16;
	seg_tss16_cpl3.selector = X86_SEL_TSS16_CPL3;
	seg_tss16_cpl3.base = X86_ADDR_VAR_TSS16_CPL3;
	seg_tss16_cpl3.dpl = 3;

	struct kvm_segment seg_tss64 = seg_tss32;
	seg_tss64.selector = X86_SEL_TSS64;
	seg_tss64.base = X86_ADDR_VAR_TSS64;
	seg_tss64.limit = 0x1ff;

	struct kvm_segment seg_tss64_cpl3 = seg_tss64;
	seg_tss64_cpl3.selector = X86_SEL_TSS64_CPL3;
	seg_tss64_cpl3.base = X86_ADDR_VAR_TSS64_CPL3;
	seg_tss64_cpl3.dpl = 3;

	struct kvm_segment seg_cgate16;
	memset(&seg_cgate16, 0, sizeof(seg_cgate16));
	seg_cgate16.selector = X86_SEL_CGATE16;
	seg_cgate16.type = 4;
	seg_cgate16.base = X86_SEL_CS16 | (2 << 16); // selector + param count
	seg_cgate16.limit = X86_ADDR_VAR_USER_CODE2; // entry offset
	seg_cgate16.present = 1;
	seg_cgate16.dpl = 0;
	seg_cgate16.s = 0;
	seg_cgate16.g = 0;
	seg_cgate16.db = 0;
	seg_cgate16.l = 0;
	seg_cgate16.avl = 0;

	struct kvm_segment seg_tgate16 = seg_cgate16;
	seg_tgate16.selector = X86_SEL_TGATE16;
	seg_tgate16.type = 3;
	seg_tgate16.base = X86_SEL_TSS16_2;
	seg_tgate16.limit = 0;

	struct kvm_segment seg_cgate32 = seg_cgate16;
	seg_cgate32.selector = X86_SEL_CGATE32;
	seg_cgate32.type = 12;
	seg_cgate32.base = X86_SEL_CS32 | (2 << 16); // selector + param count

	struct kvm_segment seg_tgate32 = seg_cgate32;
	seg_tgate32.selector = X86_SEL_TGATE32;
	seg_tgate32.type = 11;
	seg_tgate32.base = X86_SEL_TSS32_2;
	seg_tgate32.limit = 0;

	struct kvm_segment seg_cgate64 = seg_cgate16;
	seg_cgate64.selector = X86_SEL_CGATE64;
	seg_cgate64.type = 12;
	seg_cgate64.base = X86_SEL_CS64;

	int kvmfd = open("/dev/kvm", O_RDWR);
	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
	memset(buf, 0, sizeof(buf));
	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
	cpuid->nent = 128;
	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
	close(kvmfd);

	const char* text_prefix = 0;
	int text_prefix_size = 0;
	char* host_text = host_mem + X86_ADDR_TEXT;
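
	// text_type selects the starting CPU mode for the fuzzer-supplied code:
	// 8 is real mode (or SMM/vm86 when the corresponding flags are set),
	// 16 and 32 are protected mode with 16/32-bit segments, and any other
	// value falls through to the long mode setup below.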
	if (text_type == 8) {
		if (flags & KVM_SETUP_SMM) {
			if (flags & KVM_SETUP_PROTECTED) {
				sregs.cs = seg_cs16;
				sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
				sregs.cr0 |= X86_CR0_PE;
			} else {
				sregs.cs.selector = 0;
				sregs.cs.base = 0;
			}

			*(host_mem + X86_ADDR_TEXT) = 0xf4; // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_VIRT86) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
			sregs.cr0 |= X86_CR0_PE;
			sregs.efer |= X86_EFER_SCE;

			setup_syscall_msrs(cpufd, X86_SEL_CS32, X86_SEL_CS32_CPL3);
			setup_32bit_idt(&sregs, host_mem, guest_mem);

			if (flags & KVM_SETUP_PAGING) {
				uint64 pd_addr = guest_mem + X86_ADDR_PD;
				uint64* pd = (uint64*)(host_mem + X86_ADDR_PD);
				// A single 4MB page to cover the memory region.
				pd[0] = X86_PDE32_PRESENT | X86_PDE32_RW | X86_PDE32_USER | X86_PDE32_PS;
				sregs.cr3 = pd_addr;
				sregs.cr4 |= X86_CR4_PSE;

				text_prefix = kvm_asm32_paged_vm86;
				text_prefix_size = sizeof(kvm_asm32_paged_vm86) - 1;
			} else {
				text_prefix = kvm_asm32_vm86;
				text_prefix_size = sizeof(kvm_asm32_vm86) - 1;
			}
		} else {
			sregs.cs.selector = 0;
			sregs.cs.base = 0;
		}
	} else if (text_type == 16) {
		if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;

			text_prefix = kvm_asm16_cpl3;
			text_prefix_size = sizeof(kvm_asm16_cpl3) - 1;
		} else {
			sregs.cr0 |= X86_CR0_PE;
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
		}
	} else if (text_type == 32) {
		sregs.cr0 |= X86_CR0_PE;
		sregs.efer |= X86_EFER_SCE;

		setup_syscall_msrs(cpufd, X86_SEL_CS32, X86_SEL_CS32_CPL3);
		setup_32bit_idt(&sregs, host_mem, guest_mem);

		if (flags & KVM_SETUP_SMM) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			*(host_mem + X86_ADDR_TEXT) = 0xf4; // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_PAGING) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			uint64 pd_addr = guest_mem + X86_ADDR_PD;
			uint64* pd = (uint64*)(host_mem + X86_ADDR_PD);
			// A single 4MB page to cover the memory region.
			pd[0] = X86_PDE32_PRESENT | X86_PDE32_RW | X86_PDE32_USER | X86_PDE32_PS;
			sregs.cr3 = pd_addr;
			sregs.cr4 |= X86_CR4_PSE;

			text_prefix = kvm_asm32_paged;
			text_prefix_size = sizeof(kvm_asm32_paged) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs32_cpl3;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32_cpl3;
		} else {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
		}
	} else {
		sregs.efer |= X86_EFER_LME | X86_EFER_SCE;
		sregs.cr0 |= X86_CR0_PE;

		setup_syscall_msrs(cpufd, X86_SEL_CS64, X86_SEL_CS64_CPL3);
		setup_64bit_idt(&sregs, host_mem, guest_mem);

		sregs.cs = seg_cs32;
		sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

		uint64 pml4_addr = guest_mem + X86_ADDR_PML4;
		uint64* pml4 = (uint64*)(host_mem + X86_ADDR_PML4);
		uint64 pdpt_addr = guest_mem + X86_ADDR_PDP;
		uint64* pdpt = (uint64*)(host_mem + X86_ADDR_PDP);
		uint64 pd_addr = guest_mem + X86_ADDR_PD;
		uint64* pd = (uint64*)(host_mem + X86_ADDR_PD);
		pml4[0] = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER | pdpt_addr;
		pdpt[0] = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER | pd_addr;
		pd[0] = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER | X86_PDE64_PS;
		sregs.cr3 = pml4_addr;
		sregs.cr4 |= X86_CR4_PAE;

		if (flags & KVM_SETUP_VM) {
			sregs.cr0 |= X86_CR0_NE;

			*((uint64*)(host_mem + X86_ADDR_VAR_VMXON_PTR)) = X86_ADDR_VAR_VMXON;
			*((uint64*)(host_mem + X86_ADDR_VAR_VMCS_PTR)) = X86_ADDR_VAR_VMCS;
			memcpy(host_mem + X86_ADDR_VAR_VMEXIT_CODE, kvm_asm64_vm_exit, sizeof(kvm_asm64_vm_exit) - 1);
			*((uint64*)(host_mem + X86_ADDR_VAR_VMEXIT_PTR)) = X86_ADDR_VAR_VMEXIT_CODE;

			text_prefix = kvm_asm64_init_vm;
			text_prefix_size = sizeof(kvm_asm64_init_vm) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			text_prefix = kvm_asm64_cpl3;
			text_prefix_size = sizeof(kvm_asm64_cpl3) - 1;
		} else {
			text_prefix = kvm_asm64_enable_long;
			text_prefix_size = sizeof(kvm_asm64_enable_long) - 1;
		}
	}
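
	// The text_prefix trampolines selected above (defined in kvm_amd64.S.h)
	// are prepended to the fuzzer-supplied text and perform the remaining
	// mode switching at guest run time; kvm_asm64_init_vm additionally uses
	// the VMXON/VMCS pointers initialized above to launch a nested VM.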

	struct tss16 tss16;
	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = X86_SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = X86_ADDR_STACK0;
	tss16.ip = X86_ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = X86_SEL_CS16;
	tss16.es = tss16.ds = tss16.ss = X86_SEL_DS16;
	tss16.ldt = X86_SEL_LDT;
	struct tss16* tss16_addr = (struct tss16*)(host_mem + seg_tss16_2.base);
	memcpy(tss16_addr, &tss16, sizeof(tss16));

	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = X86_SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = X86_ADDR_STACK0;
	tss16.ip = X86_ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = X86_SEL_CS16_CPL3;
	tss16.es = tss16.ds = tss16.ss = X86_SEL_DS16_CPL3;
	tss16.ldt = X86_SEL_LDT;
	struct tss16* tss16_cpl3_addr = (struct tss16*)(host_mem + seg_tss16_cpl3.base);
	memcpy(tss16_cpl3_addr, &tss16, sizeof(tss16));

	struct tss32 tss32;
	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = X86_SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = X86_ADDR_STACK0;
	tss32.ip = X86_ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1) | (1 << 17);
	tss32.ldt = X86_SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_addr = (struct tss32*)(host_mem + seg_tss32_vm86.base);
	memcpy(tss32_addr, &tss32, sizeof(tss32));

	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = X86_SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = X86_ADDR_STACK0;
	tss32.ip = X86_ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1);
	tss32.es = tss32.ds = tss32.ss = tss32.gs = tss32.fs = X86_SEL_DS32;
	tss32.cs = X86_SEL_CS32;
	tss32.ldt = X86_SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_cpl3_addr = (struct tss32*)(host_mem + seg_tss32_2.base);
	memcpy(tss32_cpl3_addr, &tss32, sizeof(tss32));

	struct tss64 tss64;
	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = X86_ADDR_STACK0;
	tss64.rsp[1] = X86_ADDR_STACK0;
	tss64.rsp[2] = X86_ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_addr = (struct tss64*)(host_mem + seg_tss64.base);
	memcpy(tss64_addr, &tss64, sizeof(tss64));

	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = X86_ADDR_STACK0;
	tss64.rsp[1] = X86_ADDR_STACK0;
	tss64.rsp[2] = X86_ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_cpl3_addr = (struct tss64*)(host_mem + seg_tss64_cpl3.base);
	memcpy(tss64_cpl3_addr, &tss64, sizeof(tss64));

	if (text_size > 1000)
		text_size = 1000;
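
	// The prefixes contain two placeholders that are patched below to
	// absolute guest addresses: a little-endian 0x0badc0de inside an LJMP,
	// replaced with the address of the instruction following it, and an
	// X86_PREFIX_SIZE magic word, replaced with the guest address at which
	// the user text starts.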
	if (text_prefix) {
		memcpy(host_text, text_prefix, text_prefix_size);
		// Replace 0xbadc0de in LJMP with the offset of the next instruction.
		void* patch = memmem(host_text, text_prefix_size, "\xde\xc0\xad\x0b", 4);
		if (patch)
			*((uint32*)patch) = guest_mem + X86_ADDR_TEXT + ((char*)patch - host_text) + 6;
		uint16 magic = X86_PREFIX_SIZE;
		patch = memmem(host_text, text_prefix_size, &magic, sizeof(magic));
		if (patch)
			*((uint16*)patch) = guest_mem + X86_ADDR_TEXT + text_prefix_size;
	}
	memcpy((void*)(host_text + text_prefix_size), text, text_size);
	*(host_text + text_prefix_size + text_size) = 0xf4; // hlt

	memcpy(host_mem + X86_ADDR_VAR_USER_CODE, text, text_size);
	*(host_mem + X86_ADDR_VAR_USER_CODE + text_size) = 0xf4; // hlt

	*(host_mem + X86_ADDR_VAR_HLT) = 0xf4; // hlt
	memcpy(host_mem + X86_ADDR_VAR_SYSRET, "\x0f\x07\xf4", 3);
	memcpy(host_mem + X86_ADDR_VAR_SYSEXIT, "\x0f\x35\xf4", 3);

	*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_FLD) = 0;
	*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_VAL) = 0;

	if (opt_count > 2)
		opt_count = 2;
	for (uintptr_t i = 0; i < opt_count; i++) {
		uint64 typ = opt_array_ptr[i].typ;
		uint64 val = opt_array_ptr[i].val;
		switch (typ % 9) {
		case 0:
			sregs.cr0 ^= val & (X86_CR0_MP | X86_CR0_EM | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | X86_CR0_NW | X86_CR0_CD);
			break;
		case 1:
			sregs.cr4 ^= val & (X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | X86_CR4_MCE | X86_CR4_PGE | X86_CR4_PCE |
					    X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT | X86_CR4_UMIP | X86_CR4_VMXE | X86_CR4_SMXE | X86_CR4_FSGSBASE | X86_CR4_PCIDE |
					    X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
			break;
		case 2:
			sregs.efer ^= val & (X86_EFER_SCE | X86_EFER_NXE | X86_EFER_SVME | X86_EFER_LMSLE | X86_EFER_FFXSR | X86_EFER_TCE);
			break;
		case 3:
			val &= ((1 << 8) | (1 << 9) | (1 << 10) | (1 << 12) | (1 << 13) | (1 << 14) |
				(1 << 15) | (1 << 18) | (1 << 19) | (1 << 20) | (1 << 21));
			regs.rflags ^= val;
			tss16_addr->flags ^= val;
			tss16_cpl3_addr->flags ^= val;
			tss32_addr->flags ^= val;
			tss32_cpl3_addr->flags ^= val;
			break;
		case 4:
			seg_cs16.type = val & 0xf;
			seg_cs32.type = val & 0xf;
			seg_cs64.type = val & 0xf;
			break;
		case 5:
			seg_cs16_cpl3.type = val & 0xf;
			seg_cs32_cpl3.type = val & 0xf;
			seg_cs64_cpl3.type = val & 0xf;
			break;
		case 6:
			seg_ds16.type = val & 0xf;
			seg_ds32.type = val & 0xf;
			seg_ds64.type = val & 0xf;
			break;
		case 7:
			seg_ds16_cpl3.type = val & 0xf;
			seg_ds32_cpl3.type = val & 0xf;
			seg_ds64_cpl3.type = val & 0xf;
			break;
		case 8:
			*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_FLD) = (val & 0xffff);
			*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_VAL) = (val >> 16);
			break;
		default:
			fail("bad kvm setup opt");
		}
	}
	regs.rflags |= 2; // bit 1 is always set
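
	// Install each descriptor in both the GDT and the LDT so that the same
	// selectors work through either table. The 64-bit TSS and call-gate
	// descriptors are 16 bytes wide, hence the _dword variant, which zeroes
	// the second half.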
	fill_segment_descriptor(gdt, ldt, &seg_ldt);
	fill_segment_descriptor(gdt, ldt, &seg_cs16);
	fill_segment_descriptor(gdt, ldt, &seg_ds16);
	fill_segment_descriptor(gdt, ldt, &seg_cs16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs32);
	fill_segment_descriptor(gdt, ldt, &seg_ds32);
	fill_segment_descriptor(gdt, ldt, &seg_cs32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs64);
	fill_segment_descriptor(gdt, ldt, &seg_ds64);
	fill_segment_descriptor(gdt, ldt, &seg_cs64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_vm86);
	fill_segment_descriptor(gdt, ldt, &seg_tss16);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_cpl3);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cgate16);
	fill_segment_descriptor(gdt, ldt, &seg_tgate16);
	fill_segment_descriptor(gdt, ldt, &seg_cgate32);
	fill_segment_descriptor(gdt, ldt, &seg_tgate32);
	fill_segment_descriptor_dword(gdt, ldt, &seg_cgate64);

	if (ioctl(cpufd, KVM_SET_SREGS, &sregs))
		return -1;
	if (ioctl(cpufd, KVM_SET_REGS, &regs))
		return -1;
	return 0;
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu

#define RFLAGS_1_BIT (1ULL << 1)
#define RFLAGS_IF_BIT (1ULL << 9)

static void reset_cpu_regs(int cpufd, int cpu_id, size_t text_size)
{
	struct kvm_regs regs;
	memset(&regs, 0, sizeof(regs));

	// RFLAGS.1 must be 1, RFLAGS.IF enables interrupts.
	regs.rflags |= RFLAGS_1_BIT | RFLAGS_IF_BIT;
	// PC points to guest_main() within the guest executor code.
	regs.rip = executor_fn_guest_addr(guest_main);
	regs.rsp = X86_SYZOS_ADDR_STACK0;
	// Pass parameters to guest_main().
	regs.rdi = text_size;
	regs.rsi = cpu_id;
	ioctl(cpufd, KVM_SET_REGS, &regs);
}

static void install_user_code(struct kvm_syz_vm* vm, int cpufd, int cpu_id, const void* text, size_t text_size)
{
	if ((cpu_id < 0) || (cpu_id >= KVM_MAX_VCPU))
		return;
	if (text_size > KVM_PAGE_SIZE)
		text_size = KVM_PAGE_SIZE;
	void* target = (void*)((uint64)vm->user_text + (KVM_PAGE_SIZE * cpu_id));
	memcpy(target, text, text_size);
	setup_gdt_ldt_pg(vm, cpufd);
	setup_cpuid(cpufd);
	reset_cpu_regs(cpufd, cpu_id, text_size);
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm
struct addr_size {
	void* addr;
	size_t size;
};

static struct addr_size alloc_guest_mem(struct addr_size* free, size_t size)
{
	struct addr_size ret = {.addr = NULL, .size = 0};

	if (free->size < size)
		return ret;
	ret.addr = free->addr;
	ret.size = size;
	free->addr = (void*)((char*)free->addr + size);
	free->size -= size;
	return ret;
}
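
// alloc_guest_mem() carves the flat host mapping front to back, so host
// backing is handed out in the order the regions are listed in
// syzos_mem_regions; regions flagged MEM_REGION_FLAG_NO_HOST_MEM are skipped
// and consume no host memory.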

// Call KVM_SET_USER_MEMORY_REGION for the given pages.
static void vm_set_user_memory_region(int vmfd, uint32 slot, uint32 flags, uint64 guest_phys_addr, uint64 memory_size, uint64 userspace_addr)
{
	struct kvm_userspace_memory_region memreg;
	memreg.slot = slot;
	memreg.flags = flags;
	memreg.guest_phys_addr = guest_phys_addr;
	memreg.memory_size = memory_size;
	memreg.userspace_addr = userspace_addr;
	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
}

static void install_syzos_code(void* host_mem, size_t mem_size)
{
	size_t size = (char*)&__stop_guest - (char*)&__start_guest;
	if (size > mem_size)
		fail("SyzOS size exceeds guest memory");
	memcpy(host_mem, &__start_guest, size);
}

static void setup_vm(int vmfd, struct kvm_syz_vm* vm)
{
	struct addr_size allocator = {.addr = vm->host_mem, .size = vm->total_pages * KVM_PAGE_SIZE};
	int slot = 0; // Slot numbers do not matter; they just have to be different.

	for (size_t i = 0; i < sizeof(syzos_mem_regions) / sizeof(syzos_mem_regions[0]); i++) {
		const struct mem_region* r = &syzos_mem_regions[i];
		if (r->flags & MEM_REGION_FLAG_NO_HOST_MEM)
			continue;
		struct addr_size next = alloc_guest_mem(&allocator, r->pages * KVM_PAGE_SIZE);
		uint32 flags = 0;
		if (r->flags & MEM_REGION_FLAG_DIRTY_LOG)
			flags |= KVM_MEM_LOG_DIRTY_PAGES;
		if (r->flags & MEM_REGION_FLAG_READONLY)
			flags |= KVM_MEM_READONLY;
		if (r->flags & MEM_REGION_FLAG_USER_CODE)
			vm->user_text = next.addr;
		if (r->flags & MEM_REGION_FLAG_GPA0)
			vm->gpa0_mem = next.addr;
		if (r->flags & MEM_REGION_FLAG_EXECUTOR_CODE)
			install_syzos_code(next.addr, next.size);
		vm_set_user_memory_region(vmfd, slot++, flags, r->gpa, next.size, (uintptr_t)next.addr);
	}

	// Map the remaining pages at an unused address.
	struct addr_size next = alloc_guest_mem(&allocator, allocator.size);
	vm_set_user_memory_region(vmfd, slot++, 0, X86_SYZOS_ADDR_UNUSED, next.size, (uintptr_t)next.addr);
}
#endif
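
// An illustrative end-to-end flow for the SyzOS pseudo-syscalls below (a
// sketch only: error handling is omitted, and host_mem, user_code,
// user_code_size, run_size and expected are placeholders that the executor
// normally provides; run_size comes from KVM_GET_VCPU_MMAP_SIZE):
//
//	struct kvm_syz_vm* vm = (struct kvm_syz_vm*)syz_kvm_setup_syzos_vm(vmfd, (long)host_mem);
//	struct kvm_text text = {.typ = 0, .text = user_code, .size = user_code_size};
//	int cpufd = syz_kvm_add_vcpu((long)vm, (long)&text);
//	struct kvm_run* run = (struct kvm_run*)mmap(0, run_size, PROT_READ | PROT_WRITE, MAP_SHARED, cpufd, 0);
//	ioctl(cpufd, KVM_RUN, 0);
//	syz_kvm_assert_syzos_uexit((long)run, expected); // 0 on a matching guest uexit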

#if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm
static long syz_kvm_setup_syzos_vm(volatile long a0, volatile long a1)
{
	const int vmfd = a0;
	void* host_mem = (void*)a1;
	struct kvm_syz_vm* ret = (struct kvm_syz_vm*)host_mem;
	ret->host_mem = (void*)((uint64)host_mem + KVM_PAGE_SIZE);
	ret->total_pages = KVM_GUEST_PAGES - 1;
	setup_vm(vmfd, ret);
	ret->vmfd = vmfd;
	ret->next_cpu_id = 0;
	return (long)ret;
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
static long syz_kvm_add_vcpu(volatile long a0, volatile long a1)
{
	struct kvm_syz_vm* vm = (struct kvm_syz_vm*)a0;
	struct kvm_text* utext = (struct kvm_text*)a1;
	const void* text = utext->text;
	size_t text_size = utext->size;

	if (!vm) {
		errno = EINVAL;
		return -1;
	}
	if (vm->next_cpu_id == KVM_MAX_VCPU) {
		errno = ENOMEM;
		return -1;
	}
	int cpu_id = vm->next_cpu_id;
	int cpufd = ioctl(vm->vmfd, KVM_CREATE_VCPU, cpu_id);
	if (cpufd == -1)
		return -1;
	// Only increment next_cpu_id if CPU creation succeeded.
	vm->next_cpu_id++;
	install_user_code(vm, cpufd, cpu_id, text, text_size);
	return cpufd;
}
#endif

#if SYZ_EXECUTOR || __NR_syz_kvm_assert_syzos_uexit
static long syz_kvm_assert_syzos_uexit(volatile long a0, volatile long a1)
{
	struct kvm_run* run = (struct kvm_run*)a0;
	uint64 expect = a1;

	if (!run || (run->exit_reason != KVM_EXIT_MMIO) || (run->mmio.phys_addr != X86_SYZOS_ADDR_UEXIT)) {
		errno = EINVAL;
		return -1;
	}

	if ((((uint64*)(run->mmio.data))[0]) != expect) {
		errno = EDOM;
		return -1;
	}
	return 0;
}
#endif

#endif // EXECUTOR_COMMON_KVM_AMD64_H