github.com/google/syzkaller@v0.0.0-20240517125934-c0f1611a36d6/executor/common_kvm_amd64.h

// Copyright 2017 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

// This file is shared between executor and csource package.

// Implementation of syz_kvm_setup_cpu pseudo-syscall.
// See Intel Software Developer’s Manual Volume 3: System Programming Guide
// for details on what happens here.

#include "kvm.h"
#include "kvm_amd64.S.h"

#ifndef KVM_SMI
#define KVM_SMI _IO(KVMIO, 0xb7)
#endif

#define CR0_PE 1
#define CR0_MP (1 << 1)
#define CR0_EM (1 << 2)
#define CR0_TS (1 << 3)
#define CR0_ET (1 << 4)
#define CR0_NE (1 << 5)
#define CR0_WP (1 << 16)
#define CR0_AM (1 << 18)
#define CR0_NW (1 << 29)
#define CR0_CD (1 << 30)
#define CR0_PG (1 << 31)

#define CR4_VME 1
#define CR4_PVI (1 << 1)
#define CR4_TSD (1 << 2)
#define CR4_DE (1 << 3)
#define CR4_PSE (1 << 4)
#define CR4_PAE (1 << 5)
#define CR4_MCE (1 << 6)
#define CR4_PGE (1 << 7)
#define CR4_PCE (1 << 8)
#define CR4_OSFXSR (1 << 9)
#define CR4_OSXMMEXCPT (1 << 10)
#define CR4_UMIP (1 << 11)
#define CR4_VMXE (1 << 13)
#define CR4_SMXE (1 << 14)
#define CR4_FSGSBASE (1 << 16)
#define CR4_PCIDE (1 << 17)
#define CR4_OSXSAVE (1 << 18)
#define CR4_SMEP (1 << 20)
#define CR4_SMAP (1 << 21)
#define CR4_PKE (1 << 22)

#define EFER_SCE 1
#define EFER_LME (1 << 8)
#define EFER_LMA (1 << 10)
#define EFER_NXE (1 << 11)
#define EFER_SVME (1 << 12)
#define EFER_LMSLE (1 << 13)
#define EFER_FFXSR (1 << 14)
#define EFER_TCE (1 << 15)

// 32-bit page directory entry bits
#define PDE32_PRESENT 1
#define PDE32_RW (1 << 1)
#define PDE32_USER (1 << 2)
#define PDE32_PS (1 << 7)

// 64-bit page * entry bits
#define PDE64_PRESENT 1
#define PDE64_RW (1 << 1)
#define PDE64_USER (1 << 2)
#define PDE64_ACCESSED (1 << 5)
#define PDE64_DIRTY (1 << 6)
#define PDE64_PS (1 << 7)
#define PDE64_G (1 << 8)

struct tss16 {
	uint16 prev;
	uint16 sp0;
	uint16 ss0;
	uint16 sp1;
	uint16 ss1;
	uint16 sp2;
	uint16 ss2;
	uint16 ip;
	uint16 flags;
	uint16 ax;
	uint16 cx;
	uint16 dx;
	uint16 bx;
	uint16 sp;
	uint16 bp;
	uint16 si;
	uint16 di;
	uint16 es;
	uint16 cs;
	uint16 ss;
	uint16 ds;
	uint16 ldt;
} __attribute__((packed));

struct tss32 {
	uint16 prev, prevh;
	uint32 sp0;
	uint16 ss0, ss0h;
	uint32 sp1;
	uint16 ss1, ss1h;
	uint32 sp2;
	uint16 ss2, ss2h;
	uint32 cr3;
	uint32 ip;
	uint32 flags;
	uint32 ax;
	uint32 cx;
	uint32 dx;
	uint32 bx;
	uint32 sp;
	uint32 bp;
	uint32 si;
	uint32 di;
	uint16 es, esh;
	uint16 cs, csh;
	uint16 ss, ssh;
	uint16 ds, dsh;
	uint16 fs, fsh;
	uint16 gs, gsh;
	uint16 ldt, ldth;
	uint16 trace;
	uint16 io_bitmap;
} __attribute__((packed));

struct tss64 {
	uint32 reserved0;
	uint64 rsp[3];
	uint64 reserved1;
	uint64 ist[7];
	uint64 reserved2;
	uint32 reserved3;
	uint32 io_bitmap;
} __attribute__((packed));
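// fill_segment_descriptor() below packs a kvm_segment into the 8-byte GDT/LDT
// descriptor format from the SDM: limit bits 0-15 at bits 0-15, base bits 0-23
// at bits 16-39, type at 40-43, S at 44, DPL at 45-46, P at 47, limit bits
// 16-19 at 48-51, AVL/L/D.B/G at 52-55 and base bits 24-31 at 56-63.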
static void fill_segment_descriptor(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	uint16 index = seg->selector >> 3;
	uint64 limit = seg->g ? seg->limit >> 12 : seg->limit;
	uint64 sd = (limit & 0xffff) | (seg->base & 0xffffff) << 16 | (uint64)seg->type << 40 | (uint64)seg->s << 44 | (uint64)seg->dpl << 45 | (uint64)seg->present << 47 | (limit & 0xf0000ULL) << 32 | (uint64)seg->avl << 52 | (uint64)seg->l << 53 | (uint64)seg->db << 54 | (uint64)seg->g << 55 | (seg->base & 0xff000000ULL) << 32;
	dt[index] = sd;
	lt[index] = sd;
}

static void fill_segment_descriptor_dword(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	fill_segment_descriptor(dt, lt, seg);
	uint16 index = seg->selector >> 3;
	dt[index + 1] = 0;
	lt[index + 1] = 0;
}

static void setup_syscall_msrs(int cpufd, uint16 sel_cs, uint16 sel_cs_cpl3)
{
	char buf[sizeof(struct kvm_msrs) + 5 * sizeof(struct kvm_msr_entry)];
	memset(buf, 0, sizeof(buf));
	struct kvm_msrs* msrs = (struct kvm_msrs*)buf;
	struct kvm_msr_entry* entries = msrs->entries;
	msrs->nmsrs = 5;
	entries[0].index = MSR_IA32_SYSENTER_CS;
	entries[0].data = sel_cs;
	entries[1].index = MSR_IA32_SYSENTER_ESP;
	entries[1].data = ADDR_STACK0;
	entries[2].index = MSR_IA32_SYSENTER_EIP;
	entries[2].data = ADDR_VAR_SYSEXIT;
	entries[3].index = MSR_IA32_STAR;
	entries[3].data = ((uint64)sel_cs << 32) | ((uint64)sel_cs_cpl3 << 48);
	entries[4].index = MSR_IA32_LSTAR;
	entries[4].data = ADDR_VAR_SYSRET;
	ioctl(cpufd, KVM_SET_MSRS, msrs);
}

static void setup_32bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	for (int i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = i << 3;
		switch (i % 6) {
		case 0:
			// 16-bit interrupt gate
			gate.type = 6;
			gate.base = SEL_CS16;
			break;
		case 1:
			// 16-bit trap gate
			gate.type = 7;
			gate.base = SEL_CS16;
			break;
		case 2:
			// 16-bit task gate
			gate.type = 3;
			gate.base = SEL_TGATE16;
			break;
		case 3:
			// 32-bit interrupt gate
			gate.type = 14;
			gate.base = SEL_CS32;
			break;
		case 4:
			// 32-bit trap gate
			gate.type = 15;
			gate.base = SEL_CS32;
			break;
		case 5:
			// 32-bit task gate
			gate.type = 11;
			gate.base = SEL_TGATE32;
			break;
		}
		gate.limit = guest_mem + ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor(idt, idt, &gate);
	}
}
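// In 64-bit mode IDT entries are 16 bytes, so the 64-bit IDT below uses
// fill_segment_descriptor_dword(), which zeroes the second quadword (upper 32
// bits of the handler offset plus reserved bits), and spaces the gates two
// 8-byte slots apart via gate.selector = (i * 2) << 3.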
static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	for (int i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = (i * 2) << 3;
		gate.type = (i & 1) ? 14 : 15; // interrupt or trap gate
		gate.base = SEL_CS64;
		gate.limit = guest_mem + ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor_dword(idt, idt, &gate);
	}
}

struct kvm_text {
	uintptr_t typ;
	const void* text;
	uintptr_t size;
};

struct kvm_opt {
	uint64 typ;
	uint64 val;
};

#define KVM_SETUP_PAGING (1 << 0)
#define KVM_SETUP_PAE (1 << 1)
#define KVM_SETUP_PROTECTED (1 << 2)
#define KVM_SETUP_CPL3 (1 << 3)
#define KVM_SETUP_VIRT86 (1 << 4)
#define KVM_SETUP_SMM (1 << 5)
#define KVM_SETUP_VM (1 << 6)

// syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts])
static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4, volatile long a5, volatile long a6, volatile long a7)
{
	const int vmfd = a0;
	const int cpufd = a1;
	char* const host_mem = (char*)a2;
	const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3;
	const uintptr_t text_count = a4;
	const uintptr_t flags = a5;
	const struct kvm_opt* const opt_array_ptr = (struct kvm_opt*)a6;
	uintptr_t opt_count = a7;

	const uintptr_t page_size = 4 << 10;
	const uintptr_t ioapic_page = 10;
	const uintptr_t guest_mem_size = 24 * page_size;
	const uintptr_t guest_mem = 0;

	(void)text_count; // fuzzer can spoof count and we need just 1 text, so ignore text_count
	int text_type = text_array_ptr[0].typ;
	const void* text = text_array_ptr[0].text;
	uintptr_t text_size = text_array_ptr[0].size;

	for (uintptr_t i = 0; i < guest_mem_size / page_size; i++) {
		struct kvm_userspace_memory_region memreg;
		memreg.slot = i;
		memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY
		memreg.guest_phys_addr = guest_mem + i * page_size;
		if (i == ioapic_page)
			memreg.guest_phys_addr = 0xfec00000;
		memreg.memory_size = page_size;
		memreg.userspace_addr = (uintptr_t)host_mem + i * page_size;
		ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
	}
	// SMRAM
	struct kvm_userspace_memory_region memreg;
	memreg.slot = 1 + (1 << 16);
	memreg.flags = 0;
	memreg.guest_phys_addr = 0x30000;
	memreg.memory_size = 64 << 10;
	memreg.userspace_addr = (uintptr_t)host_mem;
	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);

	struct kvm_sregs sregs;
	if (ioctl(cpufd, KVM_GET_SREGS, &sregs))
		return -1;

	struct kvm_regs regs;
	memset(&regs, 0, sizeof(regs));
	regs.rip = guest_mem + ADDR_TEXT;
	regs.rsp = ADDR_STACK0;

	sregs.gdt.base = guest_mem + ADDR_GDT;
	sregs.gdt.limit = 256 * sizeof(uint64) - 1;
	uint64* gdt = (uint64*)(host_mem + sregs.gdt.base);

	struct kvm_segment seg_ldt;
	memset(&seg_ldt, 0, sizeof(seg_ldt));
	seg_ldt.selector = SEL_LDT;
	seg_ldt.type = 2;
	seg_ldt.base = guest_mem + ADDR_LDT;
	seg_ldt.limit = 256 * sizeof(uint64) - 1;
	seg_ldt.present = 1;
	seg_ldt.dpl = 0;
	seg_ldt.s = 0;
	seg_ldt.g = 0;
	seg_ldt.db = 1;
	seg_ldt.l = 0;
	sregs.ldt = seg_ldt;
	uint64* ldt = (uint64*)(host_mem + sregs.ldt.base);
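	// Build code and data segments for every execution mode (16/32/64-bit)
	// at CPL0 and CPL3, plus TSS descriptors and call/task gates. The SEL_*
	// selectors and ADDR_* guest addresses come from kvm_amd64.S.h and are
	// shared with the guest-side assembly stubs.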
	struct kvm_segment seg_cs16;
	memset(&seg_cs16, 0, sizeof(seg_cs16));
	seg_cs16.selector = SEL_CS16;
	seg_cs16.type = 11;
	seg_cs16.base = 0;
	seg_cs16.limit = 0xfffff;
	seg_cs16.present = 1;
	seg_cs16.dpl = 0;
	seg_cs16.s = 1;
	seg_cs16.g = 0;
	seg_cs16.db = 0;
	seg_cs16.l = 0;

	struct kvm_segment seg_ds16 = seg_cs16;
	seg_ds16.selector = SEL_DS16;
	seg_ds16.type = 3;

	struct kvm_segment seg_cs16_cpl3 = seg_cs16;
	seg_cs16_cpl3.selector = SEL_CS16_CPL3;
	seg_cs16_cpl3.dpl = 3;

	struct kvm_segment seg_ds16_cpl3 = seg_ds16;
	seg_ds16_cpl3.selector = SEL_DS16_CPL3;
	seg_ds16_cpl3.dpl = 3;

	struct kvm_segment seg_cs32 = seg_cs16;
	seg_cs32.selector = SEL_CS32;
	seg_cs32.db = 1;

	struct kvm_segment seg_ds32 = seg_ds16;
	seg_ds32.selector = SEL_DS32;
	seg_ds32.db = 1;

	struct kvm_segment seg_cs32_cpl3 = seg_cs32;
	seg_cs32_cpl3.selector = SEL_CS32_CPL3;
	seg_cs32_cpl3.dpl = 3;

	struct kvm_segment seg_ds32_cpl3 = seg_ds32;
	seg_ds32_cpl3.selector = SEL_DS32_CPL3;
	seg_ds32_cpl3.dpl = 3;

	struct kvm_segment seg_cs64 = seg_cs16;
	seg_cs64.selector = SEL_CS64;
	seg_cs64.l = 1;

	struct kvm_segment seg_ds64 = seg_ds32;
	seg_ds64.selector = SEL_DS64;

	struct kvm_segment seg_cs64_cpl3 = seg_cs64;
	seg_cs64_cpl3.selector = SEL_CS64_CPL3;
	seg_cs64_cpl3.dpl = 3;

	struct kvm_segment seg_ds64_cpl3 = seg_ds64;
	seg_ds64_cpl3.selector = SEL_DS64_CPL3;
	seg_ds64_cpl3.dpl = 3;

	struct kvm_segment seg_tss32;
	memset(&seg_tss32, 0, sizeof(seg_tss32));
	seg_tss32.selector = SEL_TSS32;
	seg_tss32.type = 9;
	seg_tss32.base = ADDR_VAR_TSS32;
	seg_tss32.limit = 0x1ff;
	seg_tss32.present = 1;
	seg_tss32.dpl = 0;
	seg_tss32.s = 0;
	seg_tss32.g = 0;
	seg_tss32.db = 0;
	seg_tss32.l = 0;

	struct kvm_segment seg_tss32_2 = seg_tss32;
	seg_tss32_2.selector = SEL_TSS32_2;
	seg_tss32_2.base = ADDR_VAR_TSS32_2;

	struct kvm_segment seg_tss32_cpl3 = seg_tss32;
	seg_tss32_cpl3.selector = SEL_TSS32_CPL3;
	seg_tss32_cpl3.base = ADDR_VAR_TSS32_CPL3;

	struct kvm_segment seg_tss32_vm86 = seg_tss32;
	seg_tss32_vm86.selector = SEL_TSS32_VM86;
	seg_tss32_vm86.base = ADDR_VAR_TSS32_VM86;

	struct kvm_segment seg_tss16 = seg_tss32;
	seg_tss16.selector = SEL_TSS16;
	seg_tss16.base = ADDR_VAR_TSS16;
	seg_tss16.limit = 0xff;
	seg_tss16.type = 1;

	struct kvm_segment seg_tss16_2 = seg_tss16;
	seg_tss16_2.selector = SEL_TSS16_2;
	seg_tss16_2.base = ADDR_VAR_TSS16_2;
	seg_tss16_2.dpl = 0;

	struct kvm_segment seg_tss16_cpl3 = seg_tss16;
	seg_tss16_cpl3.selector = SEL_TSS16_CPL3;
	seg_tss16_cpl3.base = ADDR_VAR_TSS16_CPL3;
	seg_tss16_cpl3.dpl = 3;

	struct kvm_segment seg_tss64 = seg_tss32;
	seg_tss64.selector = SEL_TSS64;
	seg_tss64.base = ADDR_VAR_TSS64;
	seg_tss64.limit = 0x1ff;

	struct kvm_segment seg_tss64_cpl3 = seg_tss64;
	seg_tss64_cpl3.selector = SEL_TSS64_CPL3;
	seg_tss64_cpl3.base = ADDR_VAR_TSS64_CPL3;
	seg_tss64_cpl3.dpl = 3;

	struct kvm_segment seg_cgate16;
	memset(&seg_cgate16, 0, sizeof(seg_cgate16));
	seg_cgate16.selector = SEL_CGATE16;
	seg_cgate16.type = 4;
	seg_cgate16.base = SEL_CS16 | (2 << 16); // selector + param count
	seg_cgate16.limit = ADDR_VAR_USER_CODE2; // entry offset
	seg_cgate16.present = 1;
	seg_cgate16.dpl = 0;
	seg_cgate16.s = 0;
	seg_cgate16.g = 0;
	seg_cgate16.db = 0;
	seg_cgate16.l = 0;
	seg_cgate16.avl = 0;
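	// For gate descriptors the kvm_segment fields are repurposed:
	// fill_segment_descriptor() places .base bits 0-23 at descriptor bits
	// 16-39, which for a gate holds the target selector (and, for call
	// gates, the parameter count), while .limit supplies the entry offset.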
	struct kvm_segment seg_tgate16 = seg_cgate16;
	seg_tgate16.selector = SEL_TGATE16;
	seg_tgate16.type = 3;
	seg_tgate16.base = SEL_TSS16_2;
	seg_tgate16.limit = 0;

	struct kvm_segment seg_cgate32 = seg_cgate16;
	seg_cgate32.selector = SEL_CGATE32;
	seg_cgate32.type = 12;
	seg_cgate32.base = SEL_CS32 | (2 << 16); // selector + param count

	struct kvm_segment seg_tgate32 = seg_cgate32;
	seg_tgate32.selector = SEL_TGATE32;
	seg_tgate32.type = 11;
	seg_tgate32.base = SEL_TSS32_2;
	seg_tgate32.limit = 0;

	struct kvm_segment seg_cgate64 = seg_cgate16;
	seg_cgate64.selector = SEL_CGATE64;
	seg_cgate64.type = 12;
	seg_cgate64.base = SEL_CS64;
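	// Mirror the host's supported CPUID into the vcpu so guest code sees a
	// realistic feature set: KVM_GET_SUPPORTED_CPUID is a system ioctl on
	// /dev/kvm, and the result is applied to the vcpu with KVM_SET_CPUID2.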
	int kvmfd = open("/dev/kvm", O_RDWR);
	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
	memset(buf, 0, sizeof(buf));
	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
	cpuid->nent = 128;
	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
	close(kvmfd);

	const char* text_prefix = 0;
	int text_prefix_size = 0;
	char* host_text = host_mem + ADDR_TEXT;

	if (text_type == 8) {
		if (flags & KVM_SETUP_SMM) {
			if (flags & KVM_SETUP_PROTECTED) {
				sregs.cs = seg_cs16;
				sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
				sregs.cr0 |= CR0_PE;
			} else {
				sregs.cs.selector = 0;
				sregs.cs.base = 0;
			}

			*(host_mem + ADDR_TEXT) = 0xf4; // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_VIRT86) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
			sregs.cr0 |= CR0_PE;
			sregs.efer |= EFER_SCE;

			setup_syscall_msrs(cpufd, SEL_CS32, SEL_CS32_CPL3);
			setup_32bit_idt(&sregs, host_mem, guest_mem);

			if (flags & KVM_SETUP_PAGING) {
				uint64 pd_addr = guest_mem + ADDR_PD;
				uint64* pd = (uint64*)(host_mem + ADDR_PD);
				// A single 4MB page to cover the memory region
				pd[0] = PDE32_PRESENT | PDE32_RW | PDE32_USER | PDE32_PS;
				sregs.cr3 = pd_addr;
				sregs.cr4 |= CR4_PSE;

				text_prefix = kvm_asm32_paged_vm86;
				text_prefix_size = sizeof(kvm_asm32_paged_vm86) - 1;
			} else {
				text_prefix = kvm_asm32_vm86;
				text_prefix_size = sizeof(kvm_asm32_vm86) - 1;
			}
		} else {
			sregs.cs.selector = 0;
			sregs.cs.base = 0;
		}
	} else if (text_type == 16) {
		if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;

			text_prefix = kvm_asm16_cpl3;
			text_prefix_size = sizeof(kvm_asm16_cpl3) - 1;
		} else {
			sregs.cr0 |= CR0_PE;
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
		}
	} else if (text_type == 32) {
		sregs.cr0 |= CR0_PE;
		sregs.efer |= EFER_SCE;

		setup_syscall_msrs(cpufd, SEL_CS32, SEL_CS32_CPL3);
		setup_32bit_idt(&sregs, host_mem, guest_mem);

		if (flags & KVM_SETUP_SMM) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			*(host_mem + ADDR_TEXT) = 0xf4; // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_PAGING) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			uint64 pd_addr = guest_mem + ADDR_PD;
			uint64* pd = (uint64*)(host_mem + ADDR_PD);
			// A single 4MB page to cover the memory region
			pd[0] = PDE32_PRESENT | PDE32_RW | PDE32_USER | PDE32_PS;
			sregs.cr3 = pd_addr;
			sregs.cr4 |= CR4_PSE;

			text_prefix = kvm_asm32_paged;
			text_prefix_size = sizeof(kvm_asm32_paged) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs32_cpl3;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32_cpl3;
		} else {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
		}
	} else {
		sregs.efer |= EFER_LME | EFER_SCE;
		sregs.cr0 |= CR0_PE;

		setup_syscall_msrs(cpufd, SEL_CS64, SEL_CS64_CPL3);
		setup_64bit_idt(&sregs, host_mem, guest_mem);

		sregs.cs = seg_cs32;
		sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

		uint64 pml4_addr = guest_mem + ADDR_PML4;
		uint64* pml4 = (uint64*)(host_mem + ADDR_PML4);
		uint64 pdpt_addr = guest_mem + ADDR_PDP;
		uint64* pdpt = (uint64*)(host_mem + ADDR_PDP);
		uint64 pd_addr = guest_mem + ADDR_PD;
		uint64* pd = (uint64*)(host_mem + ADDR_PD);
		pml4[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | pdpt_addr;
		pdpt[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | pd_addr;
		pd[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | PDE64_PS;
		sregs.cr3 = pml4_addr;
		sregs.cr4 |= CR4_PAE;

		if (flags & KVM_SETUP_VM) {
			sregs.cr0 |= CR0_NE;

			*((uint64*)(host_mem + ADDR_VAR_VMXON_PTR)) = ADDR_VAR_VMXON;
			*((uint64*)(host_mem + ADDR_VAR_VMCS_PTR)) = ADDR_VAR_VMCS;
			memcpy(host_mem + ADDR_VAR_VMEXIT_CODE, kvm_asm64_vm_exit, sizeof(kvm_asm64_vm_exit) - 1);
			*((uint64*)(host_mem + ADDR_VAR_VMEXIT_PTR)) = ADDR_VAR_VMEXIT_CODE;

			text_prefix = kvm_asm64_init_vm;
			text_prefix_size = sizeof(kvm_asm64_init_vm) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			text_prefix = kvm_asm64_cpl3;
			text_prefix_size = sizeof(kvm_asm64_cpl3) - 1;
		} else {
			text_prefix = kvm_asm64_enable_long;
			text_prefix_size = sizeof(kvm_asm64_enable_long) - 1;
		}
	}
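	// Build the in-memory TSS images that back the TSS descriptors above.
	// flags bit 1 is the always-set EFLAGS bit; bit 17 in the vm86 TSS is
	// EFLAGS.VM, so switching to that task enters virtual-8086 mode.
	// io_bitmap points past the last field, i.e. no I/O permission bitmap.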
	struct tss16 tss16;
	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = ADDR_STACK0;
	tss16.ip = ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = SEL_CS16;
	tss16.es = tss16.ds = tss16.ss = SEL_DS16;
	tss16.ldt = SEL_LDT;
	struct tss16* tss16_addr = (struct tss16*)(host_mem + seg_tss16_2.base);
	memcpy(tss16_addr, &tss16, sizeof(tss16));

	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = ADDR_STACK0;
	tss16.ip = ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = SEL_CS16_CPL3;
	tss16.es = tss16.ds = tss16.ss = SEL_DS16_CPL3;
	tss16.ldt = SEL_LDT;
	struct tss16* tss16_cpl3_addr = (struct tss16*)(host_mem + seg_tss16_cpl3.base);
	memcpy(tss16_cpl3_addr, &tss16, sizeof(tss16));

	struct tss32 tss32;
	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = ADDR_STACK0;
	tss32.ip = ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1) | (1 << 17);
	tss32.ldt = SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_addr = (struct tss32*)(host_mem + seg_tss32_vm86.base);
	memcpy(tss32_addr, &tss32, sizeof(tss32));

	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = ADDR_STACK0;
	tss32.ip = ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1);
	tss32.es = tss32.ds = tss32.ss = tss32.gs = tss32.fs = SEL_DS32;
	tss32.cs = SEL_CS32;
	tss32.ldt = SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_cpl3_addr = (struct tss32*)(host_mem + seg_tss32_2.base);
	memcpy(tss32_cpl3_addr, &tss32, sizeof(tss32));

	struct tss64 tss64;
	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = ADDR_STACK0;
	tss64.rsp[1] = ADDR_STACK0;
	tss64.rsp[2] = ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_addr = (struct tss64*)(host_mem + seg_tss64.base);
	memcpy(tss64_addr, &tss64, sizeof(tss64));

	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = ADDR_STACK0;
	tss64.rsp[1] = ADDR_STACK0;
	tss64.rsp[2] = ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_cpl3_addr = (struct tss64*)(host_mem + seg_tss64_cpl3.base);
	memcpy(tss64_cpl3_addr, &tss64, sizeof(tss64));

	if (text_size > 1000)
		text_size = 1000;
	if (text_prefix) {
		memcpy(host_text, text_prefix, text_prefix_size);
		// Replace the 0x0badc0de placeholder in LJMP with the offset of the next instruction.
		void* patch = memmem(host_text, text_prefix_size, "\xde\xc0\xad\x0b", 4);
		if (patch)
			*((uint32*)patch) = guest_mem + ADDR_TEXT + ((char*)patch - host_text) + 6;
		uint16 magic = PREFIX_SIZE;
		patch = memmem(host_text, text_prefix_size, &magic, sizeof(magic));
		if (patch)
			*((uint16*)patch) = guest_mem + ADDR_TEXT + text_prefix_size;
	}
	memcpy((void*)(host_text + text_prefix_size), text, text_size);
	*(host_text + text_prefix_size + text_size) = 0xf4; // hlt

	memcpy(host_mem + ADDR_VAR_USER_CODE, text, text_size);
	*(host_mem + ADDR_VAR_USER_CODE + text_size) = 0xf4; // hlt

	*(host_mem + ADDR_VAR_HLT) = 0xf4; // hlt
	memcpy(host_mem + ADDR_VAR_SYSRET, "\x0f\x07\xf4", 3); // sysret; hlt
	memcpy(host_mem + ADDR_VAR_SYSEXIT, "\x0f\x35\xf4", 3); // sysexit; hlt

	*(uint64*)(host_mem + ADDR_VAR_VMWRITE_FLD) = 0;
	*(uint64*)(host_mem + ADDR_VAR_VMWRITE_VAL) = 0;
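	// Apply fuzzer-provided options: each kvm_opt either toggles selected
	// CR0/CR4/EFER/RFLAGS bits, overrides segment descriptor types, or seeds
	// the VMWRITE field/value pair at ADDR_VAR_VMWRITE_FLD/VAL consumed by
	// the 64-bit VM-setup guest code.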
	if (opt_count > 2)
		opt_count = 2;
	for (uintptr_t i = 0; i < opt_count; i++) {
		uint64 typ = opt_array_ptr[i].typ;
		uint64 val = opt_array_ptr[i].val;
		switch (typ % 9) {
		case 0:
			sregs.cr0 ^= val & (CR0_MP | CR0_EM | CR0_ET | CR0_NE | CR0_WP | CR0_AM | CR0_NW | CR0_CD);
			break;
		case 1:
			sregs.cr4 ^= val & (CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | CR4_MCE | CR4_PGE | CR4_PCE |
					    CR4_OSFXSR | CR4_OSXMMEXCPT | CR4_UMIP | CR4_VMXE | CR4_SMXE | CR4_FSGSBASE | CR4_PCIDE |
					    CR4_OSXSAVE | CR4_SMEP | CR4_SMAP | CR4_PKE);
			break;
		case 2:
			sregs.efer ^= val & (EFER_SCE | EFER_NXE | EFER_SVME | EFER_LMSLE | EFER_FFXSR | EFER_TCE);
			break;
		case 3:
			val &= ((1 << 8) | (1 << 9) | (1 << 10) | (1 << 12) | (1 << 13) | (1 << 14) |
				(1 << 15) | (1 << 18) | (1 << 19) | (1 << 20) | (1 << 21));
			regs.rflags ^= val;
			tss16_addr->flags ^= val;
			tss16_cpl3_addr->flags ^= val;
			tss32_addr->flags ^= val;
			tss32_cpl3_addr->flags ^= val;
			break;
		case 4:
			seg_cs16.type = val & 0xf;
			seg_cs32.type = val & 0xf;
			seg_cs64.type = val & 0xf;
			break;
		case 5:
			seg_cs16_cpl3.type = val & 0xf;
			seg_cs32_cpl3.type = val & 0xf;
			seg_cs64_cpl3.type = val & 0xf;
			break;
		case 6:
			seg_ds16.type = val & 0xf;
			seg_ds32.type = val & 0xf;
			seg_ds64.type = val & 0xf;
			break;
		case 7:
			seg_ds16_cpl3.type = val & 0xf;
			seg_ds32_cpl3.type = val & 0xf;
			seg_ds64_cpl3.type = val & 0xf;
			break;
		case 8:
			*(uint64*)(host_mem + ADDR_VAR_VMWRITE_FLD) = (val & 0xffff);
			*(uint64*)(host_mem + ADDR_VAR_VMWRITE_VAL) = (val >> 16);
			break;
		default:
			fail("bad kvm setup opt");
		}
	}
	regs.rflags |= 2; // bit 1 is always set

	fill_segment_descriptor(gdt, ldt, &seg_ldt);
	fill_segment_descriptor(gdt, ldt, &seg_cs16);
	fill_segment_descriptor(gdt, ldt, &seg_ds16);
	fill_segment_descriptor(gdt, ldt, &seg_cs16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs32);
	fill_segment_descriptor(gdt, ldt, &seg_ds32);
	fill_segment_descriptor(gdt, ldt, &seg_cs32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs64);
	fill_segment_descriptor(gdt, ldt, &seg_ds64);
	fill_segment_descriptor(gdt, ldt, &seg_cs64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_vm86);
	fill_segment_descriptor(gdt, ldt, &seg_tss16);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_cpl3);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cgate16);
	fill_segment_descriptor(gdt, ldt, &seg_tgate16);
	fill_segment_descriptor(gdt, ldt, &seg_cgate32);
	fill_segment_descriptor(gdt, ldt, &seg_tgate32);
	fill_segment_descriptor_dword(gdt, ldt, &seg_cgate64);

	if (ioctl(cpufd, KVM_SET_SREGS, &sregs))
		return -1;
	if (ioctl(cpufd, KVM_SET_REGS, &regs))
		return -1;
	return 0;
}
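// A minimal sketch of how this pseudo-syscall is typically driven (not part of
// this file; the fd/mmap handling is illustrative and error checks are omitted):
//
//	int kvm = open("/dev/kvm", O_RDWR);
//	int vmfd = ioctl(kvm, KVM_CREATE_VM, 0);
//	int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
//	// 24 pages of backing memory for the guest, matching usermem vma[24].
//	char* mem = (char*)mmap(0, 24 << 12, PROT_READ | PROT_WRITE,
//				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
//	const char insns[] = "\x90"; // nop; setup appends a trailing hlt
//	struct kvm_text text = {.typ = 64, .text = insns, .size = sizeof(insns) - 1};
//	syz_kvm_setup_cpu(vmfd, cpufd, (long)mem, (long)&text, 1, 0, 0, 0);
//	struct kvm_run* run = (struct kvm_run*)mmap(
//	    0, ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0),
//	    PROT_READ | PROT_WRITE, MAP_SHARED, cpufd, 0);
//	ioctl(cpufd, KVM_RUN, 0); // run->exit_reason tells why the vcpu stopped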