github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/executor/common_kvm_amd64.h (about)

     1  // Copyright 2017 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  #ifndef EXECUTOR_COMMON_KVM_AMD64_H
     5  #define EXECUTOR_COMMON_KVM_AMD64_H
     6  
     7  // This file is shared between executor and csource package.
     8  
     9  // Implementation of syz_kvm_setup_cpu pseudo-syscall.
    10  // See Intel Software Developer’s Manual Volume 3: System Programming Guide
    11  // for details on what happens here.
    12  
    13  #include "common_kvm.h"
    14  #include "common_kvm_amd64_syzos.h"
    15  #include "kvm.h"
    16  #include "kvm_amd64.S.h"
    17  
    18  #ifndef KVM_SMI
    19  #define KVM_SMI _IO(KVMIO, 0xb7)
    20  #endif
    21  
    22  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
struct tss16 {
	// 16-bit (80286-style) Task State Segment, loaded by the CPU on a
	// task switch. Field order/sizes must match the hardware format
	// exactly, hence the packed attribute.
	uint16 prev; // back link: selector of the previous task
	uint16 sp0; // stack pointer/segment pairs for privilege levels 0-2
	uint16 ss0;
	uint16 sp1;
	uint16 ss1;
	uint16 sp2;
	uint16 ss2;
	uint16 ip; // saved instruction pointer
	uint16 flags; // saved FLAGS
	uint16 ax; // saved general-purpose registers
	uint16 cx;
	uint16 dx;
	uint16 bx;
	uint16 sp;
	uint16 bp;
	uint16 si;
	uint16 di;
	uint16 es; // saved segment registers
	uint16 cs;
	uint16 ss;
	uint16 ds;
	uint16 ldt; // LDT selector of the task
} __attribute__((packed));
    47  
struct tss32 {
	// 32-bit (80386-style) Task State Segment, loaded by the CPU on a
	// task switch. Must match the hardware layout exactly, hence packed.
	// The *h fields are the unused high halves of 16-bit selector slots.
	uint16 prev, prevh; // back link: selector of the previous task
	uint32 sp0; // stack pointer/segment pairs for privilege levels 0-2
	uint16 ss0, ss0h;
	uint32 sp1;
	uint16 ss1, ss1h;
	uint32 sp2;
	uint16 ss2, ss2h;
	uint32 cr3; // page-directory base used while the task runs
	uint32 ip; // saved EIP
	uint32 flags; // saved EFLAGS
	uint32 ax; // saved general-purpose registers
	uint32 cx;
	uint32 dx;
	uint32 bx;
	uint32 sp;
	uint32 bp;
	uint32 si;
	uint32 di;
	uint16 es, esh; // saved segment registers
	uint16 cs, csh;
	uint16 ss, ssh;
	uint16 ds, dsh;
	uint16 fs, fsh;
	uint16 gs, gsh;
	uint16 ldt, ldth; // LDT selector of the task
	uint16 trace; // bit 0: raise a debug exception on task switch
	uint16 io_bitmap; // offset of the I/O permission bitmap
} __attribute__((packed));
    77  #endif
    78  
    79  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
struct tss64 {
	// 64-bit Task State Segment. Long mode has no hardware task
	// switching; this only supplies stack pointers for privilege
	// transitions and interrupts.
	uint32 reserved0;
	uint64 rsp[3]; // stack pointers for privilege levels 0-2
	uint64 reserved1;
	uint64 ist[7]; // interrupt stack table pointers (IST1-IST7)
	uint64 reserved2;
	uint16 reserved3;
	uint16 io_bitmap; // offset of the I/O permission bitmap
} __attribute__((packed));
    89  
    90  static void fill_segment_descriptor(uint64* dt, uint64* lt, struct kvm_segment* seg)
    91  {
    92  	uint16 index = seg->selector >> 3;
    93  	uint64 limit = seg->g ? seg->limit >> 12 : seg->limit;
    94  	uint64 sd = (limit & 0xffff) | (seg->base & 0xffffff) << 16 | (uint64)seg->type << 40 | (uint64)seg->s << 44 | (uint64)seg->dpl << 45 | (uint64)seg->present << 47 | (limit & 0xf0000ULL) << 48 | (uint64)seg->avl << 52 | (uint64)seg->l << 53 | (uint64)seg->db << 54 | (uint64)seg->g << 55 | (seg->base & 0xff000000ULL) << 56;
    95  	dt[index] = sd;
    96  	lt[index] = sd;
    97  }
    98  
    99  static void fill_segment_descriptor_dword(uint64* dt, uint64* lt, struct kvm_segment* seg)
   100  {
   101  	fill_segment_descriptor(dt, lt, seg);
   102  	uint16 index = seg->selector >> 3;
   103  	dt[index + 1] = 0;
   104  	lt[index + 1] = 0;
   105  }
   106  #endif
   107  
   108  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
   109  static void setup_syscall_msrs(int cpufd, uint16 sel_cs, uint16 sel_cs_cpl3)
   110  {
   111  	char buf[sizeof(struct kvm_msrs) + 5 * sizeof(struct kvm_msr_entry)];
   112  	memset(buf, 0, sizeof(buf));
   113  	struct kvm_msrs* msrs = (struct kvm_msrs*)buf;
   114  	struct kvm_msr_entry* entries = msrs->entries;
   115  	msrs->nmsrs = 5;
   116  	entries[0].index = X86_MSR_IA32_SYSENTER_CS;
   117  	entries[0].data = sel_cs;
   118  	entries[1].index = X86_MSR_IA32_SYSENTER_ESP;
   119  	entries[1].data = X86_ADDR_STACK0;
   120  	entries[2].index = X86_MSR_IA32_SYSENTER_EIP;
   121  	entries[2].data = X86_ADDR_VAR_SYSEXIT;
   122  	entries[3].index = X86_MSR_IA32_STAR;
   123  	entries[3].data = ((uint64)sel_cs << 32) | ((uint64)sel_cs_cpl3 << 48);
   124  	entries[4].index = X86_MSR_IA32_LSTAR;
   125  	entries[4].data = X86_ADDR_VAR_SYSRET;
   126  	ioctl(cpufd, KVM_SET_MSRS, msrs);
   127  }
   128  #endif
   129  
   130  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
   131  static void setup_32bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
   132  {
   133  	sregs->idt.base = guest_mem + X86_ADDR_VAR_IDT;
   134  	sregs->idt.limit = 0x1ff;
   135  	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
   136  	for (int i = 0; i < 32; i++) {
   137  		struct kvm_segment gate;
   138  		gate.selector = i << 3;
   139  		switch (i % 6) {
   140  		case 0:
   141  			// 16-bit interrupt gate
   142  			gate.type = 6;
   143  			gate.base = X86_SEL_CS16;
   144  			break;
   145  		case 1:
   146  			// 16-bit trap gate
   147  			gate.type = 7;
   148  			gate.base = X86_SEL_CS16;
   149  			break;
   150  		case 2:
   151  			// 16-bit task gate
   152  			gate.type = 3;
   153  			gate.base = X86_SEL_TGATE16;
   154  			break;
   155  		case 3:
   156  			// 32-bit interrupt gate
   157  			gate.type = 14;
   158  			gate.base = X86_SEL_CS32;
   159  			break;
   160  		case 4:
   161  			// 32-bit trap gate
   162  			gate.type = 15;
   163  			gate.base = X86_SEL_CS32;
   164  			break;
   165  		case 5:
   166  			// 32-bit task gate
   167  			gate.type = 11;
   168  			gate.base = X86_SEL_TGATE32;
   169  			break;
   170  		}
   171  		gate.limit = guest_mem + X86_ADDR_VAR_USER_CODE2; // entry offset
   172  		gate.present = 1;
   173  		gate.dpl = 0;
   174  		gate.s = 0;
   175  		gate.g = 0;
   176  		gate.db = 0;
   177  		gate.l = 0;
   178  		gate.avl = 0;
   179  		fill_segment_descriptor(idt, idt, &gate);
   180  	}
   181  }
   182  #endif
   183  
   184  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
   185  static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
   186  {
   187  	sregs->idt.base = guest_mem + X86_ADDR_VAR_IDT;
   188  	sregs->idt.limit = 0x1ff;
   189  	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
   190  	for (int i = 0; i < 32; i++) {
   191  		struct kvm_segment gate;
   192  		gate.selector = (i * 2) << 3;
   193  		gate.type = (i & 1) ? 14 : 15; // interrupt or trap gate
   194  		gate.base = X86_SEL_CS64;
   195  		gate.limit = guest_mem + X86_ADDR_VAR_USER_CODE2; // entry offset
   196  		gate.present = 1;
   197  		gate.dpl = 0;
   198  		gate.s = 0;
   199  		gate.g = 0;
   200  		gate.db = 0;
   201  		gate.l = 0;
   202  		gate.avl = 0;
   203  		fill_segment_descriptor_dword(idt, idt, &gate);
   204  	}
   205  }
   206  #endif
   207  
   208  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm || __NR_syz_kvm_add_vcpu
   209  // Flags for mem_region
// Flags for mem_region
#define MEM_REGION_FLAG_USER_CODE (1 << 0) // holds fuzzer-generated guest code
#define MEM_REGION_FLAG_DIRTY_LOG (1 << 1) // mapped with KVM_MEM_LOG_DIRTY_PAGES
#define MEM_REGION_FLAG_READONLY (1 << 2) // mapped read-only for the guest
#define MEM_REGION_FLAG_EXECUTOR_CODE (1 << 3) // holds the executor's guest code
#define MEM_REGION_FLAG_GPA0 (1 << 5) // the region starting at guest physical address 0
#define MEM_REGION_FLAG_NO_HOST_MEM (1 << 6) // no host backing: guest accesses fault

// One guest-physical memory region of the SYZOS layout (see
// syzos_mem_regions below).
struct mem_region {
	uint64 gpa; // guest physical address of the region start
	int pages; // region size in KVM_PAGE_SIZE units
	uint32 flags; // combination of MEM_REGION_FLAG_* bits
};
   222  
// SYZOS guest physical memory layout (must be in sync with executor/kvm.h):
static const struct mem_region syzos_mem_regions[] = {
    // AMD64 data structures (48 pages starting at GPA 0x0, see kvm.h).
    {X86_SYZOS_ADDR_ZERO, 48, MEM_REGION_FLAG_GPA0},
    // SMRAM memory.
    {X86_SYZOS_ADDR_SMRAM, 10, 0},
    // Unmapped region to trigger page faults for uexits etc.
    {X86_SYZOS_ADDR_EXIT, 1, MEM_REGION_FLAG_NO_HOST_MEM},
    // Writable region with KVM_MEM_LOG_DIRTY_PAGES to fuzz dirty ring.
    {X86_SYZOS_ADDR_DIRTY_PAGES, 2, MEM_REGION_FLAG_DIRTY_LOG},
    // SYZOS user code (generated by the fuzzer), one page per VCPU.
    {X86_SYZOS_ADDR_USER_CODE, KVM_MAX_VCPU, MEM_REGION_FLAG_READONLY | MEM_REGION_FLAG_USER_CODE},
    // Executor guest code.
    {SYZOS_ADDR_EXECUTOR_CODE, 4, MEM_REGION_FLAG_READONLY | MEM_REGION_FLAG_EXECUTOR_CODE},
    // Scratch memory for code generated at runtime.
    {X86_SYZOS_ADDR_SCRATCH_CODE, 1, 0},
    // CPU stack.
    {X86_SYZOS_ADDR_STACK_BOTTOM, 1, 0},
    // Per-VCPU regions for L2 VMs.
    {X86_SYZOS_PER_VCPU_REGIONS_BASE, (KVM_MAX_VCPU * X86_SYZOS_L1_VCPU_REGION_SIZE) / KVM_PAGE_SIZE, 0},
    // IOAPIC memory.
    {X86_SYZOS_ADDR_IOAPIC, 1, 0},
};
   246  #endif
   247  
   248  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm || __NR_syz_kvm_setup_cpu || __NR_syz_kvm_add_vcpu
// Per-VM state shared by the syz_kvm_* pseudo-syscalls.
struct kvm_syz_vm {
	int vmfd; // KVM VM file descriptor
	int next_cpu_id; // presumably the id for the next added VCPU — not used in this chunk
	void* host_mem; // host mapping of guest memory; guest addresses are offsets into it
	size_t total_pages; // total number of guest pages to map (see setup_pg_table)
	void* user_text; // presumably host pointer to the user-code region — not used in this chunk
	void* gpa0_mem; // host mapping of the region at guest physical address 0 (holds the page tables)
};
   257  #endif
   258  
   259  #if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
   260  
   261  #define X86_NUM_IDT_ENTRIES 256
   262  static void syzos_setup_idt(struct kvm_syz_vm* vm, struct kvm_sregs* sregs)
   263  {
   264  	sregs->idt.base = X86_SYZOS_ADDR_VAR_IDT;
   265  	sregs->idt.limit = (X86_NUM_IDT_ENTRIES * sizeof(struct idt_entry_64)) - 1;
   266  	volatile struct idt_entry_64* idt =
   267  	    (volatile struct idt_entry_64*)((uint64)vm->host_mem + sregs->idt.base);
   268  	uint64 handler_addr = executor_fn_guest_addr(dummy_null_handler);
   269  	for (int i = 0; i < X86_NUM_IDT_ENTRIES; i++) {
   270  		idt[i].offset_low = (uint16)(handler_addr & 0xFFFF);
   271  		idt[i].selector = X86_SYZOS_SEL_CODE;
   272  		idt[i].ist = 0;
   273  		// 0x8E is a 64-bit interrupt gate: P=1, DPL=0, type=0xE.
   274  		idt[i].type_attr = 0x8E;
   275  		idt[i].offset_mid = (uint16)((handler_addr >> 16) & 0xFFFF);
   276  		idt[i].offset_high = (uint32)((handler_addr >> 32) & 0xFFFFFFFF);
   277  		idt[i].reserved = 0;
   278  	}
   279  }
   280  #endif
   281  
   282  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu || __NR_syz_kvm_add_vcpu
// A guest code blob passed to the setup pseudo-syscalls: 'typ' selects the
// execution mode it targets (8/16/32/64-bit, see the text_type handling in
// syz_kvm_setup_cpu), 'text' points at the machine code, 'size' is its
// length in bytes.
struct kvm_text {
	uintptr_t typ;
	const void* text;
	uintptr_t size;
};
   288  #endif
   289  
   290  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
// A (type, value) option pair passed to syz_kvm_setup_cpu via its 'opts'
// array argument (consumption of the options is not visible in this chunk).
struct kvm_opt {
	uint64 typ;
	uint64 val;
};
   295  #endif
   296  
   297  #if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
// Mask selecting the physical page-frame bits (51:12) of a page-table entry.
#define PAGE_MASK GENMASK_ULL(51, 12)

// Bump allocator over a fixed pool of page-table pages; pages are handed
// out by pg_alloc() below and never freed.
typedef struct {
	uint64 next_page; // GPA of the next free page in the pool
	uint64 last_page; // GPA one past the end of the pool
} page_alloc_t;
   304  
   305  static uint64 pg_alloc(page_alloc_t* alloc)
   306  {
   307  	if (alloc->next_page >= alloc->last_page)
   308  		fail("page table allocation failed");
   309  	uint64 page = alloc->next_page;
   310  	alloc->next_page += KVM_PAGE_SIZE;
   311  	return page;
   312  }
   313  
   314  static void map_4k_page(uint64 host_mem, page_alloc_t* alloc, uint64 gpa)
   315  {
   316  	uint64* pml4 = (uint64*)(host_mem + X86_SYZOS_ADDR_PML4);
   317  
   318  	// PML4 Entry (Level 4).
   319  	uint64 pml4_idx = (gpa >> 39) & 0x1FF;
   320  	if (pml4[pml4_idx] == 0)
   321  		pml4[pml4_idx] = X86_PDE64_PRESENT | X86_PDE64_RW | pg_alloc(alloc);
   322  	uint64* pdpt = (uint64*)(host_mem + (pml4[pml4_idx] & PAGE_MASK));
   323  
   324  	// PDPT Entry (Level 3).
   325  	uint64 pdpt_idx = (gpa >> 30) & 0x1FF;
   326  	if (pdpt[pdpt_idx] == 0)
   327  		pdpt[pdpt_idx] = X86_PDE64_PRESENT | X86_PDE64_RW | pg_alloc(alloc);
   328  	uint64* pd = (uint64*)(host_mem + (pdpt[pdpt_idx] & PAGE_MASK));
   329  
   330  	// PD Entry (Level 2).
   331  	uint64 pd_idx = (gpa >> 21) & 0x1FF;
   332  	if (pd[pd_idx] == 0)
   333  		pd[pd_idx] = X86_PDE64_PRESENT | X86_PDE64_RW | pg_alloc(alloc);
   334  	uint64* pt = (uint64*)(host_mem + (pd[pd_idx] & PAGE_MASK));
   335  
   336  	// PT Entry (Level 1).
   337  	uint64 pt_idx = (gpa >> 12) & 0x1FF;
   338  
   339  	// Set the final 4KB page table entry to map the GPA
   340  	// This is an identity map: GPA -> GPA
   341  	pt[pt_idx] = (gpa & PAGE_MASK) | X86_PDE64_PRESENT | X86_PDE64_RW;
   342  }
   343  
   344  static int map_4k_region(uint64 host_mem, page_alloc_t* alloc, uint64 gpa_start, int num_pages)
   345  {
   346  	for (int i = 0; i < num_pages; i++)
   347  		map_4k_page(host_mem, alloc, gpa_start + (i * KVM_PAGE_SIZE));
   348  	return num_pages;
   349  }
   350  
// We assume a 4-level page table, in the future we could add support for
// n-level if needed.
// Builds identity-mapped page tables for every region in
// syzos_mem_regions, then maps whatever remains of vm->total_pages at
// X86_SYZOS_ADDR_UNUSED. Table pages come from a fixed 32-page pool at
// X86_SYZOS_ADDR_PT_POOL; pg_alloc() aborts if the pool runs out.
static void setup_pg_table(struct kvm_syz_vm* vm)
{
	int total = vm->total_pages;
	// Page tables are located in the first memory region starting at 0x0.
	uint64 host_mem = (uint64)vm->gpa0_mem;

	page_alloc_t alloc = {.next_page = X86_SYZOS_ADDR_PT_POOL,
			      .last_page = X86_SYZOS_ADDR_PT_POOL + 32 * KVM_PAGE_SIZE};

	// Zero-out all page table memory.
	for (uint64 i = 0; i < (alloc.last_page - alloc.next_page); i += KVM_PAGE_SIZE)
		memset((void*)(host_mem + alloc.next_page + i), 0, KVM_PAGE_SIZE);

	// Map all the regions defined in setup_vm()
	for (size_t i = 0; i < sizeof(syzos_mem_regions) / sizeof(syzos_mem_regions[0]); i++)
		total -= map_4k_region(host_mem, &alloc, syzos_mem_regions[i].gpa, syzos_mem_regions[i].pages);
	map_4k_region(host_mem, &alloc, X86_SYZOS_ADDR_UNUSED, total);
}
   371  
// A 64-bit GDT entry for a code or data segment.
// System segments (like TSS) are different and use a 128-bit format.
struct gdt_entry {
	uint16 limit_low; // limit bits 15:0
	uint16 base_low; // base bits 15:0
	uint8 base_mid; // base bits 23:16
	uint8 access; // P, DPL, S and type bits
	uint8 limit_high_and_flags; // limit bits 19:16 plus G/DB/L/AVL flags
	uint8 base_high; // base bits 31:24
} __attribute__((packed));
   382  
   383  static void setup_gdt_64(struct gdt_entry* gdt)
   384  {
   385  	// Entry 0: Null
   386  	gdt[0] = (struct gdt_entry){0};
   387  
   388  	// Entry 1 (selector 0x08): 64-bit Code Segment
   389  	// P=1, DPL=0, S=1, Type=Execute/Read, L=1, G=1
   390  	gdt[X86_SYZOS_SEL_CODE >> 3] = (struct gdt_entry){
   391  	    .limit_low = 0xFFFF,
   392  	    .base_low = 0,
   393  	    .base_mid = 0,
   394  	    .access = 0x9A, // Present, DPL=0, S=1, Type=Execute/Read, Accessed
   395  	    .limit_high_and_flags = 0xAF, // Granularity=1, L=1, Limit=0xF
   396  	    .base_high = 0};
   397  
   398  	// Entry 2 (selector 0x10): 64-bit Data Segment
   399  	// P=1, DPL=0, S=1, Type=Read/Write, DB=1, G=1
   400  	gdt[X86_SYZOS_SEL_DATA >> 3] = (struct gdt_entry){
   401  	    .limit_low = 0xFFFF,
   402  	    .base_low = (uint16)(X86_SYZOS_ADDR_VAR_TSS & 0xFFFF),
   403  	    .base_mid = (uint8)((X86_SYZOS_ADDR_VAR_TSS >> 16) & 0xFF),
   404  	    .access = 0x92, // Present, DPL=0, S=1, Type=Read/Write, Accessed
   405  	    .limit_high_and_flags = 0xCF, // Granularity=1, DB=1, Limit=0xF
   406  	    .base_high = (uint8)((X86_SYZOS_ADDR_VAR_TSS >> 24) & 0xFF)};
   407  	// Entry 3 (selector 0x18): 64-bit TSS Segment
   408  	gdt[X86_SYZOS_SEL_TSS64 >> 3] = (struct gdt_entry){
   409  	    .limit_low = 0x67, // Minimal TSS limit
   410  	    .base_low = 0,
   411  	    .base_mid = 0,
   412  	    .access = 0x89, // Present, DPL=0, 64-bit TSS (Available)
   413  	    .limit_high_and_flags = 0x00, // G=0, Limit High = 0
   414  	    .base_high = 0};
   415  	// NOTE: A 64-bit TSS descriptor actually needs a second GDT entry for the high 32 bits of the base.
   416  	// We'll keep the base 0 for simplicity, so the second entry (index 4) can remain 0.
   417  }
   418  
   419  // This only sets up a 64-bit VCPU.
   420  // TODO: Should add support for other modes.
   421  static void setup_gdt_ldt_pg(struct kvm_syz_vm* vm, int cpufd)
   422  {
   423  	struct kvm_sregs sregs;
   424  	ioctl(cpufd, KVM_GET_SREGS, &sregs);
   425  
   426  	sregs.gdt.base = X86_SYZOS_ADDR_GDT;
   427  	sregs.gdt.limit = 5 * sizeof(struct gdt_entry) - 1;
   428  	struct gdt_entry* gdt = (struct gdt_entry*)((uint64)vm->host_mem + sregs.gdt.base);
   429  
   430  	struct kvm_segment seg_cs64;
   431  	memset(&seg_cs64, 0, sizeof(seg_cs64));
   432  	seg_cs64.selector = X86_SYZOS_SEL_CODE;
   433  	seg_cs64.type = 11;
   434  	seg_cs64.base = 0;
   435  	seg_cs64.limit = 0xFFFFFFFFu;
   436  	seg_cs64.present = 1;
   437  	seg_cs64.s = 1;
   438  	seg_cs64.g = 1;
   439  	seg_cs64.l = 1;
   440  
   441  	sregs.cs = seg_cs64;
   442  
   443  	struct kvm_segment seg_ds64;
   444  	memset(&seg_ds64, 0, sizeof(struct kvm_segment));
   445  	seg_ds64.selector = X86_SYZOS_SEL_DATA;
   446  	seg_ds64.type = 3;
   447  	seg_ds64.limit = 0xFFFFFFFFu;
   448  	seg_ds64.present = 1;
   449  	seg_ds64.s = 1;
   450  	seg_ds64.g = 1;
   451  	seg_ds64.db = 1;
   452  
   453  	sregs.ds = seg_ds64;
   454  	sregs.es = seg_ds64;
   455  	sregs.fs = seg_ds64;
   456  	sregs.gs = seg_ds64;
   457  	sregs.ss = seg_ds64;
   458  
   459  	// The L1 guest (the host for L2) MUST have a valid TR
   460  	// pointing to the 64-bit TSS in the GDT.
   461  	struct kvm_segment seg_tr;
   462  	memset(&seg_tr, 0, sizeof(seg_tr));
   463  	seg_tr.selector = X86_SYZOS_SEL_TSS64; // 0x18
   464  	seg_tr.type = 11; // 64-bit TSS (Busy)
   465  	seg_tr.base = X86_SYZOS_ADDR_VAR_TSS;
   466  	seg_tr.limit = 0x67; // Limit of the TSS descriptor
   467  	seg_tr.present = 1;
   468  	seg_tr.s = 0; // System segment
   469  	sregs.tr = seg_tr;
   470  
   471  	// The L1 TSS memory is at (vm->host_mem + X86_SYZOS_ADDR_VAR_TSS)
   472  	volatile uint8* l1_tss =
   473  	    (volatile uint8*)((uint64)vm->host_mem + X86_SYZOS_ADDR_VAR_TSS);
   474  
   475  	// Zero out the TSS (104 bytes for 64-bit)
   476  	memset((void*)l1_tss, 0, 104);
   477  
   478  	// Set the critical RSP0 field to the L1 guest's main stack.
   479  	// RSP0 is at offset +4 bytes in a 64-bit TSS.
   480  	*(volatile uint64*)(l1_tss + 4) = X86_SYZOS_ADDR_STACK0;
   481  
   482  	setup_gdt_64(gdt);
   483  
   484  	syzos_setup_idt(vm, &sregs);
   485  	setup_pg_table(vm);
   486  
   487  	sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
   488  	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
   489  	sregs.efer |= (X86_EFER_LME | X86_EFER_LMA | X86_EFER_NXE);
   490  	sregs.cr3 = X86_ADDR_PML4;
   491  
   492  	ioctl(cpufd, KVM_SET_SREGS, &sregs);
   493  }
   494  #endif
   495  
   496  #if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
   497  static void setup_cpuid(int cpufd)
   498  {
   499  	int kvmfd = open("/dev/kvm", O_RDWR);
   500  	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
   501  	memset(buf, 0, sizeof(buf));
   502  	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
   503  	cpuid->nent = 128;
   504  	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
   505  	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
   506  	close(kvmfd);
   507  }
   508  #endif
   509  
   510  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
// Bits of the 'flags' argument of syz_kvm_setup_cpu, controlling the
// initial CPU mode (see their handling in syz_kvm_setup_cpu below).
#define KVM_SETUP_PAGING (1 << 0) // build page tables and enable paging
#define KVM_SETUP_PAE (1 << 1) // NOTE(review): defined but not consumed in this chunk
#define KVM_SETUP_PROTECTED (1 << 2) // set CR0.PE (protected mode)
#define KVM_SETUP_CPL3 (1 << 3) // start execution at privilege level 3
#define KVM_SETUP_VIRT86 (1 << 4) // enter virtual-8086 mode
#define KVM_SETUP_SMM (1 << 5) // enter System Management Mode via KVM_SMI
#define KVM_SETUP_VM (1 << 6) // set up VMX and launch a nested VM
   518  
   519  // syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts])
   520  static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4, volatile long a5, volatile long a6, volatile long a7)
   521  {
   522  	const int vmfd = a0;
   523  	const int cpufd = a1;
   524  	char* const host_mem = (char*)a2;
   525  	const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3;
   526  	const uintptr_t text_count = a4;
   527  	const uintptr_t flags = a5;
   528  	const struct kvm_opt* const opt_array_ptr = (struct kvm_opt*)a6;
   529  	uintptr_t opt_count = a7;
   530  
   531  	const uintptr_t page_size = 4 << 10;
   532  	const uintptr_t ioapic_page = 10;
   533  	const uintptr_t guest_mem_size = 24 * page_size;
   534  	const uintptr_t guest_mem = 0;
   535  
   536  	(void)text_count; // fuzzer can spoof count and we need just 1 text, so ignore text_count
   537  	int text_type = text_array_ptr[0].typ;
   538  	const void* text = text_array_ptr[0].text;
   539  	uintptr_t text_size = text_array_ptr[0].size;
   540  
   541  	for (uintptr_t i = 0; i < guest_mem_size / page_size; i++) {
   542  		struct kvm_userspace_memory_region memreg;
   543  		memreg.slot = i;
   544  		memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY
   545  		memreg.guest_phys_addr = guest_mem + i * page_size;
   546  		if (i == ioapic_page)
   547  			memreg.guest_phys_addr = 0xfec00000;
   548  		memreg.memory_size = page_size;
   549  		memreg.userspace_addr = (uintptr_t)host_mem + i * page_size;
   550  		ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
   551  	}
   552  	// SMRAM
   553  	struct kvm_userspace_memory_region memreg;
   554  	memreg.slot = 1 + (1 << 16);
   555  	memreg.flags = 0;
   556  	memreg.guest_phys_addr = 0x30000;
   557  	memreg.memory_size = 64 << 10;
   558  	memreg.userspace_addr = (uintptr_t)host_mem;
   559  	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
   560  
   561  	struct kvm_sregs sregs;
   562  	if (ioctl(cpufd, KVM_GET_SREGS, &sregs))
   563  		return -1;
   564  
   565  	struct kvm_regs regs;
   566  	memset(&regs, 0, sizeof(regs));
   567  	regs.rip = guest_mem + X86_ADDR_TEXT;
   568  	regs.rsp = X86_ADDR_STACK0;
   569  
   570  	sregs.gdt.base = guest_mem + X86_ADDR_GDT;
   571  	sregs.gdt.limit = 256 * sizeof(uint64) - 1;
   572  	uint64* gdt = (uint64*)(host_mem + sregs.gdt.base);
   573  
   574  	struct kvm_segment seg_ldt;
   575  	memset(&seg_ldt, 0, sizeof(seg_ldt));
   576  	seg_ldt.selector = X86_SEL_LDT;
   577  	seg_ldt.type = 2;
   578  	seg_ldt.base = guest_mem + X86_ADDR_LDT;
   579  	seg_ldt.limit = 256 * sizeof(uint64) - 1;
   580  	seg_ldt.present = 1;
   581  	seg_ldt.dpl = 0;
   582  	seg_ldt.s = 0;
   583  	seg_ldt.g = 0;
   584  	seg_ldt.db = 1;
   585  	seg_ldt.l = 0;
   586  	sregs.ldt = seg_ldt;
   587  	uint64* ldt = (uint64*)(host_mem + sregs.ldt.base);
   588  
   589  	struct kvm_segment seg_cs16;
   590  	memset(&seg_cs16, 0, sizeof(seg_cs16));
   591  	seg_cs16.selector = X86_SEL_CS16;
   592  	seg_cs16.type = 11;
   593  	seg_cs16.base = 0;
   594  	seg_cs16.limit = 0xfffff;
   595  	seg_cs16.present = 1;
   596  	seg_cs16.dpl = 0;
   597  	seg_cs16.s = 1;
   598  	seg_cs16.g = 0;
   599  	seg_cs16.db = 0;
   600  	seg_cs16.l = 0;
   601  
   602  	struct kvm_segment seg_ds16 = seg_cs16;
   603  	seg_ds16.selector = X86_SEL_DS16;
   604  	seg_ds16.type = 3;
   605  
   606  	struct kvm_segment seg_cs16_cpl3 = seg_cs16;
   607  	seg_cs16_cpl3.selector = X86_SEL_CS16_CPL3;
   608  	seg_cs16_cpl3.dpl = 3;
   609  
   610  	struct kvm_segment seg_ds16_cpl3 = seg_ds16;
   611  	seg_ds16_cpl3.selector = X86_SEL_DS16_CPL3;
   612  	seg_ds16_cpl3.dpl = 3;
   613  
   614  	struct kvm_segment seg_cs32 = seg_cs16;
   615  	seg_cs32.selector = X86_SEL_CS32;
   616  	seg_cs32.db = 1;
   617  
   618  	struct kvm_segment seg_ds32 = seg_ds16;
   619  	seg_ds32.selector = X86_SEL_DS32;
   620  	seg_ds32.db = 1;
   621  
   622  	struct kvm_segment seg_cs32_cpl3 = seg_cs32;
   623  	seg_cs32_cpl3.selector = X86_SEL_CS32_CPL3;
   624  	seg_cs32_cpl3.dpl = 3;
   625  
   626  	struct kvm_segment seg_ds32_cpl3 = seg_ds32;
   627  	seg_ds32_cpl3.selector = X86_SEL_DS32_CPL3;
   628  	seg_ds32_cpl3.dpl = 3;
   629  
   630  	struct kvm_segment seg_cs64 = seg_cs16;
   631  	seg_cs64.selector = X86_SEL_CS64;
   632  	seg_cs64.l = 1;
   633  
   634  	struct kvm_segment seg_ds64 = seg_ds32;
   635  	seg_ds64.selector = X86_SEL_DS64;
   636  
   637  	struct kvm_segment seg_cs64_cpl3 = seg_cs64;
   638  	seg_cs64_cpl3.selector = X86_SEL_CS64_CPL3;
   639  	seg_cs64_cpl3.dpl = 3;
   640  
   641  	struct kvm_segment seg_ds64_cpl3 = seg_ds64;
   642  	seg_ds64_cpl3.selector = X86_SEL_DS64_CPL3;
   643  	seg_ds64_cpl3.dpl = 3;
   644  
   645  	struct kvm_segment seg_tss32;
   646  	memset(&seg_tss32, 0, sizeof(seg_tss32));
   647  	seg_tss32.selector = X86_SEL_TSS32;
   648  	seg_tss32.type = 9;
   649  	seg_tss32.base = X86_ADDR_VAR_TSS32;
   650  	seg_tss32.limit = 0x1ff;
   651  	seg_tss32.present = 1;
   652  	seg_tss32.dpl = 0;
   653  	seg_tss32.s = 0;
   654  	seg_tss32.g = 0;
   655  	seg_tss32.db = 0;
   656  	seg_tss32.l = 0;
   657  
   658  	struct kvm_segment seg_tss32_2 = seg_tss32;
   659  	seg_tss32_2.selector = X86_SEL_TSS32_2;
   660  	seg_tss32_2.base = X86_ADDR_VAR_TSS32_2;
   661  
   662  	struct kvm_segment seg_tss32_cpl3 = seg_tss32;
   663  	seg_tss32_cpl3.selector = X86_SEL_TSS32_CPL3;
   664  	seg_tss32_cpl3.base = X86_ADDR_VAR_TSS32_CPL3;
   665  
   666  	struct kvm_segment seg_tss32_vm86 = seg_tss32;
   667  	seg_tss32_vm86.selector = X86_SEL_TSS32_VM86;
   668  	seg_tss32_vm86.base = X86_ADDR_VAR_TSS32_VM86;
   669  
   670  	struct kvm_segment seg_tss16 = seg_tss32;
   671  	seg_tss16.selector = X86_SEL_TSS16;
   672  	seg_tss16.base = X86_ADDR_VAR_TSS16;
   673  	seg_tss16.limit = 0xff;
   674  	seg_tss16.type = 1;
   675  
   676  	struct kvm_segment seg_tss16_2 = seg_tss16;
   677  	seg_tss16_2.selector = X86_SEL_TSS16_2;
   678  	seg_tss16_2.base = X86_ADDR_VAR_TSS16_2;
   679  	seg_tss16_2.dpl = 0;
   680  
   681  	struct kvm_segment seg_tss16_cpl3 = seg_tss16;
   682  	seg_tss16_cpl3.selector = X86_SEL_TSS16_CPL3;
   683  	seg_tss16_cpl3.base = X86_ADDR_VAR_TSS16_CPL3;
   684  	seg_tss16_cpl3.dpl = 3;
   685  
   686  	struct kvm_segment seg_tss64 = seg_tss32;
   687  	seg_tss64.selector = X86_SEL_TSS64;
   688  	seg_tss64.base = X86_ADDR_VAR_TSS64;
   689  	seg_tss64.limit = 0x1ff;
   690  
   691  	struct kvm_segment seg_tss64_cpl3 = seg_tss64;
   692  	seg_tss64_cpl3.selector = X86_SEL_TSS64_CPL3;
   693  	seg_tss64_cpl3.base = X86_ADDR_VAR_TSS64_CPL3;
   694  	seg_tss64_cpl3.dpl = 3;
   695  
   696  	struct kvm_segment seg_cgate16;
   697  	memset(&seg_cgate16, 0, sizeof(seg_cgate16));
   698  	seg_cgate16.selector = X86_SEL_CGATE16;
   699  	seg_cgate16.type = 4;
   700  	seg_cgate16.base = X86_SEL_CS16 | (2 << 16); // selector + param count
   701  	seg_cgate16.limit = X86_ADDR_VAR_USER_CODE2; // entry offset
   702  	seg_cgate16.present = 1;
   703  	seg_cgate16.dpl = 0;
   704  	seg_cgate16.s = 0;
   705  	seg_cgate16.g = 0;
   706  	seg_cgate16.db = 0;
   707  	seg_cgate16.l = 0;
   708  	seg_cgate16.avl = 0;
   709  
   710  	struct kvm_segment seg_tgate16 = seg_cgate16;
   711  	seg_tgate16.selector = X86_SEL_TGATE16;
   712  	seg_tgate16.type = 3;
   713  	seg_cgate16.base = X86_SEL_TSS16_2;
   714  	seg_tgate16.limit = 0;
   715  
   716  	struct kvm_segment seg_cgate32 = seg_cgate16;
   717  	seg_cgate32.selector = X86_SEL_CGATE32;
   718  	seg_cgate32.type = 12;
   719  	seg_cgate32.base = X86_SEL_CS32 | (2 << 16); // selector + param count
   720  
   721  	struct kvm_segment seg_tgate32 = seg_cgate32;
   722  	seg_tgate32.selector = X86_SEL_TGATE32;
   723  	seg_tgate32.type = 11;
   724  	seg_tgate32.base = X86_SEL_TSS32_2;
   725  	seg_tgate32.limit = 0;
   726  
   727  	struct kvm_segment seg_cgate64 = seg_cgate16;
   728  	seg_cgate64.selector = X86_SEL_CGATE64;
   729  	seg_cgate64.type = 12;
   730  	seg_cgate64.base = X86_SEL_CS64;
   731  
   732  	int kvmfd = open("/dev/kvm", O_RDWR);
   733  	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
   734  	memset(buf, 0, sizeof(buf));
   735  	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
   736  	cpuid->nent = 128;
   737  	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
   738  	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
   739  	close(kvmfd);
   740  
   741  	const char* text_prefix = 0;
   742  	int text_prefix_size = 0;
   743  	char* host_text = host_mem + X86_ADDR_TEXT;
   744  
   745  	if (text_type == 8) {
   746  		if (flags & KVM_SETUP_SMM) {
   747  			if (flags & KVM_SETUP_PROTECTED) {
   748  				sregs.cs = seg_cs16;
   749  				sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
   750  				sregs.cr0 |= X86_CR0_PE;
   751  			} else {
   752  				sregs.cs.selector = 0;
   753  				sregs.cs.base = 0;
   754  			}
   755  
   756  			*(host_mem + X86_ADDR_TEXT) = 0xf4; // hlt for rsm
   757  			host_text = host_mem + 0x8000;
   758  
   759  			ioctl(cpufd, KVM_SMI, 0);
   760  		} else if (flags & KVM_SETUP_VIRT86) {
   761  			sregs.cs = seg_cs32;
   762  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
   763  			sregs.cr0 |= X86_CR0_PE;
   764  			sregs.efer |= X86_EFER_SCE;
   765  
   766  			setup_syscall_msrs(cpufd, X86_SEL_CS32, X86_SEL_CS32_CPL3);
   767  			setup_32bit_idt(&sregs, host_mem, guest_mem);
   768  
   769  			if (flags & KVM_SETUP_PAGING) {
   770  				uint64 pd_addr = guest_mem + X86_ADDR_PD;
   771  				uint64* pd = (uint64*)(host_mem + X86_ADDR_PD);
   772  				// A single 4MB page to cover the memory region
   773  				pd[0] = X86_PDE32_PRESENT | X86_PDE32_RW | X86_PDE32_USER | X86_PDE32_PS;
   774  				sregs.cr3 = pd_addr;
   775  				sregs.cr4 |= X86_CR4_PSE;
   776  
   777  				text_prefix = kvm_asm32_paged_vm86;
   778  				text_prefix_size = sizeof(kvm_asm32_paged_vm86) - 1;
   779  			} else {
   780  				text_prefix = kvm_asm32_vm86;
   781  				text_prefix_size = sizeof(kvm_asm32_vm86) - 1;
   782  			}
   783  		} else {
   784  			sregs.cs.selector = 0;
   785  			sregs.cs.base = 0;
   786  		}
   787  	} else if (text_type == 16) {
   788  		if (flags & KVM_SETUP_CPL3) {
   789  			sregs.cs = seg_cs16;
   790  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
   791  
   792  			text_prefix = kvm_asm16_cpl3;
   793  			text_prefix_size = sizeof(kvm_asm16_cpl3) - 1;
   794  		} else {
   795  			sregs.cr0 |= X86_CR0_PE;
   796  			sregs.cs = seg_cs16;
   797  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
   798  		}
   799  	} else if (text_type == 32) {
   800  		sregs.cr0 |= X86_CR0_PE;
   801  		sregs.efer |= X86_EFER_SCE;
   802  
   803  		setup_syscall_msrs(cpufd, X86_SEL_CS32, X86_SEL_CS32_CPL3);
   804  		setup_32bit_idt(&sregs, host_mem, guest_mem);
   805  
   806  		if (flags & KVM_SETUP_SMM) {
   807  			sregs.cs = seg_cs32;
   808  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
   809  
   810  			*(host_mem + X86_ADDR_TEXT) = 0xf4; // hlt for rsm
   811  			host_text = host_mem + 0x8000;
   812  
   813  			ioctl(cpufd, KVM_SMI, 0);
   814  		} else if (flags & KVM_SETUP_PAGING) {
   815  			sregs.cs = seg_cs32;
   816  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
   817  
   818  			uint64 pd_addr = guest_mem + X86_ADDR_PD;
   819  			uint64* pd = (uint64*)(host_mem + X86_ADDR_PD);
   820  			// A single 4MB page to cover the memory region
   821  			pd[0] = X86_PDE32_PRESENT | X86_PDE32_RW | X86_PDE32_USER | X86_PDE32_PS;
   822  			sregs.cr3 = pd_addr;
   823  			sregs.cr4 |= X86_CR4_PSE;
   824  
   825  			text_prefix = kvm_asm32_paged;
   826  			text_prefix_size = sizeof(kvm_asm32_paged) - 1;
   827  		} else if (flags & KVM_SETUP_CPL3) {
   828  			sregs.cs = seg_cs32_cpl3;
   829  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32_cpl3;
   830  		} else {
   831  			sregs.cs = seg_cs32;
   832  			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
   833  		}
   834  	} else {
   835  		sregs.efer |= X86_EFER_LME | X86_EFER_SCE;
   836  		sregs.cr0 |= X86_CR0_PE;
   837  
   838  		setup_syscall_msrs(cpufd, X86_SEL_CS64, X86_SEL_CS64_CPL3);
   839  		setup_64bit_idt(&sregs, host_mem, guest_mem);
   840  
   841  		sregs.cs = seg_cs32;
   842  		sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
   843  
   844  		uint64 pml4_addr = guest_mem + X86_ADDR_PML4;
   845  		uint64* pml4 = (uint64*)(host_mem + X86_ADDR_PML4);
   846  		uint64 pdpt_addr = guest_mem + X86_ADDR_PDP;
   847  		uint64* pdpt = (uint64*)(host_mem + X86_ADDR_PDP);
   848  		uint64 pd_addr = guest_mem + X86_ADDR_PD;
   849  		uint64* pd = (uint64*)(host_mem + X86_ADDR_PD);
   850  		pml4[0] = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER | pdpt_addr;
   851  		pdpt[0] = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER | pd_addr;
   852  		pd[0] = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER | X86_PDE64_PS;
   853  		sregs.cr3 = pml4_addr;
   854  		sregs.cr4 |= X86_CR4_PAE;
   855  
   856  		if (flags & KVM_SETUP_VM) {
   857  			sregs.cr0 |= X86_CR0_NE;
   858  
   859  			*((uint64*)(host_mem + X86_ADDR_VAR_VMXON_PTR)) = X86_ADDR_VAR_VMXON;
   860  			*((uint64*)(host_mem + X86_ADDR_VAR_VMCS_PTR)) = X86_ADDR_VAR_VMCS;
   861  			memcpy(host_mem + X86_ADDR_VAR_VMEXIT_CODE, kvm_asm64_vm_exit, sizeof(kvm_asm64_vm_exit) - 1);
   862  			*((uint64*)(host_mem + X86_ADDR_VAR_VMEXIT_PTR)) = X86_ADDR_VAR_VMEXIT_CODE;
   863  
   864  			text_prefix = kvm_asm64_init_vm;
   865  			text_prefix_size = sizeof(kvm_asm64_init_vm) - 1;
   866  		} else if (flags & KVM_SETUP_CPL3) {
   867  			text_prefix = kvm_asm64_cpl3;
   868  			text_prefix_size = sizeof(kvm_asm64_cpl3) - 1;
   869  		} else {
   870  			text_prefix = kvm_asm64_enable_long;
   871  			text_prefix_size = sizeof(kvm_asm64_enable_long) - 1;
   872  		}
   873  	}
   874  
   875  	struct tss16 tss16;
   876  	memset(&tss16, 0, sizeof(tss16));
   877  	tss16.ss0 = tss16.ss1 = tss16.ss2 = X86_SEL_DS16;
   878  	tss16.sp0 = tss16.sp1 = tss16.sp2 = X86_ADDR_STACK0;
   879  	tss16.ip = X86_ADDR_VAR_USER_CODE2;
   880  	tss16.flags = (1 << 1);
   881  	tss16.cs = X86_SEL_CS16;
   882  	tss16.es = tss16.ds = tss16.ss = X86_SEL_DS16;
   883  	tss16.ldt = X86_SEL_LDT;
   884  	struct tss16* tss16_addr = (struct tss16*)(host_mem + seg_tss16_2.base);
   885  	memcpy(tss16_addr, &tss16, sizeof(tss16));
   886  
   887  	memset(&tss16, 0, sizeof(tss16));
   888  	tss16.ss0 = tss16.ss1 = tss16.ss2 = X86_SEL_DS16;
   889  	tss16.sp0 = tss16.sp1 = tss16.sp2 = X86_ADDR_STACK0;
   890  	tss16.ip = X86_ADDR_VAR_USER_CODE2;
   891  	tss16.flags = (1 << 1);
   892  	tss16.cs = X86_SEL_CS16_CPL3;
   893  	tss16.es = tss16.ds = tss16.ss = X86_SEL_DS16_CPL3;
   894  	tss16.ldt = X86_SEL_LDT;
   895  	struct tss16* tss16_cpl3_addr = (struct tss16*)(host_mem + seg_tss16_cpl3.base);
   896  	memcpy(tss16_cpl3_addr, &tss16, sizeof(tss16));
   897  
   898  	struct tss32 tss32;
   899  	memset(&tss32, 0, sizeof(tss32));
   900  	tss32.ss0 = tss32.ss1 = tss32.ss2 = X86_SEL_DS32;
   901  	tss32.sp0 = tss32.sp1 = tss32.sp2 = X86_ADDR_STACK0;
   902  	tss32.ip = X86_ADDR_VAR_USER_CODE;
   903  	tss32.flags = (1 << 1) | (1 << 17);
   904  	tss32.ldt = X86_SEL_LDT;
   905  	tss32.cr3 = sregs.cr3;
   906  	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
   907  	struct tss32* tss32_addr = (struct tss32*)(host_mem + seg_tss32_vm86.base);
   908  	memcpy(tss32_addr, &tss32, sizeof(tss32));
   909  
   910  	memset(&tss32, 0, sizeof(tss32));
   911  	tss32.ss0 = tss32.ss1 = tss32.ss2 = X86_SEL_DS32;
   912  	tss32.sp0 = tss32.sp1 = tss32.sp2 = X86_ADDR_STACK0;
   913  	tss32.ip = X86_ADDR_VAR_USER_CODE;
   914  	tss32.flags = (1 << 1);
   915  	tss32.cr3 = sregs.cr3;
   916  	tss32.es = tss32.ds = tss32.ss = tss32.gs = tss32.fs = X86_SEL_DS32;
   917  	tss32.cs = X86_SEL_CS32;
   918  	tss32.ldt = X86_SEL_LDT;
   919  	tss32.cr3 = sregs.cr3;
   920  	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
   921  	struct tss32* tss32_cpl3_addr = (struct tss32*)(host_mem + seg_tss32_2.base);
   922  	memcpy(tss32_cpl3_addr, &tss32, sizeof(tss32));
   923  
   924  	struct tss64 tss64;
   925  	memset(&tss64, 0, sizeof(tss64));
   926  	tss64.rsp[0] = X86_ADDR_STACK0;
   927  	tss64.rsp[1] = X86_ADDR_STACK0;
   928  	tss64.rsp[2] = X86_ADDR_STACK0;
   929  	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
   930  	struct tss64* tss64_addr = (struct tss64*)(host_mem + seg_tss64.base);
   931  	memcpy(tss64_addr, &tss64, sizeof(tss64));
   932  
   933  	memset(&tss64, 0, sizeof(tss64));
   934  	tss64.rsp[0] = X86_ADDR_STACK0;
   935  	tss64.rsp[1] = X86_ADDR_STACK0;
   936  	tss64.rsp[2] = X86_ADDR_STACK0;
   937  	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
   938  	struct tss64* tss64_cpl3_addr = (struct tss64*)(host_mem + seg_tss64_cpl3.base);
   939  	memcpy(tss64_cpl3_addr, &tss64, sizeof(tss64));
   940  
   941  	if (text_size > 1000)
   942  		text_size = 1000;
   943  	if (text_prefix) {
   944  		memcpy(host_text, text_prefix, text_prefix_size);
   945  		// Replace 0xbadc0de in LJMP with offset of a next instruction.
   946  		void* patch = memmem(host_text, text_prefix_size, "\xde\xc0\xad\x0b", 4);
   947  		if (patch)
   948  			*((uint32*)patch) = guest_mem + X86_ADDR_TEXT + ((char*)patch - host_text) + 6;
   949  		uint16 magic = X86_PREFIX_SIZE;
   950  		patch = memmem(host_text, text_prefix_size, &magic, sizeof(magic));
   951  		if (patch)
   952  			*((uint16*)patch) = guest_mem + X86_ADDR_TEXT + text_prefix_size;
   953  	}
   954  	memcpy((void*)(host_text + text_prefix_size), text, text_size);
   955  	*(host_text + text_prefix_size + text_size) = 0xf4; // hlt
   956  
   957  	memcpy(host_mem + X86_ADDR_VAR_USER_CODE, text, text_size);
   958  	*(host_mem + X86_ADDR_VAR_USER_CODE + text_size) = 0xf4; // hlt
   959  
   960  	*(host_mem + X86_ADDR_VAR_HLT) = 0xf4; // hlt
   961  	memcpy(host_mem + X86_ADDR_VAR_SYSRET, "\x0f\x07\xf4", 3);
   962  	memcpy(host_mem + X86_ADDR_VAR_SYSEXIT, "\x0f\x35\xf4", 3);
   963  
   964  	*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_FLD) = 0;
   965  	*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_VAL) = 0;
   966  
   967  	if (opt_count > 2)
   968  		opt_count = 2;
   969  	for (uintptr_t i = 0; i < opt_count; i++) {
   970  		uint64 typ = opt_array_ptr[i].typ;
   971  		uint64 val = opt_array_ptr[i].val;
   972  		switch (typ % 9) {
   973  		case 0:
   974  			sregs.cr0 ^= val & (X86_CR0_MP | X86_CR0_EM | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | X86_CR0_NW | X86_CR0_CD);
   975  			break;
   976  		case 1:
   977  			sregs.cr4 ^= val & (X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | X86_CR4_MCE | X86_CR4_PGE | X86_CR4_PCE |
   978  					    X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT | X86_CR4_UMIP | X86_CR4_VMXE | X86_CR4_SMXE | X86_CR4_FSGSBASE | X86_CR4_PCIDE |
   979  					    X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
   980  			break;
   981  		case 2:
   982  			sregs.efer ^= val & (X86_EFER_SCE | X86_EFER_NXE | X86_EFER_SVME | X86_EFER_LMSLE | X86_EFER_FFXSR | X86_EFER_TCE);
   983  			break;
   984  		case 3:
   985  			val &= ((1 << 8) | (1 << 9) | (1 << 10) | (1 << 12) | (1 << 13) | (1 << 14) |
   986  				(1 << 15) | (1 << 18) | (1 << 19) | (1 << 20) | (1 << 21));
   987  			regs.rflags ^= val;
   988  			tss16_addr->flags ^= val;
   989  			tss16_cpl3_addr->flags ^= val;
   990  			tss32_addr->flags ^= val;
   991  			tss32_cpl3_addr->flags ^= val;
   992  			break;
   993  		case 4:
   994  			seg_cs16.type = val & 0xf;
   995  			seg_cs32.type = val & 0xf;
   996  			seg_cs64.type = val & 0xf;
   997  			break;
   998  		case 5:
   999  			seg_cs16_cpl3.type = val & 0xf;
  1000  			seg_cs32_cpl3.type = val & 0xf;
  1001  			seg_cs64_cpl3.type = val & 0xf;
  1002  			break;
  1003  		case 6:
  1004  			seg_ds16.type = val & 0xf;
  1005  			seg_ds32.type = val & 0xf;
  1006  			seg_ds64.type = val & 0xf;
  1007  			break;
  1008  		case 7:
  1009  			seg_ds16_cpl3.type = val & 0xf;
  1010  			seg_ds32_cpl3.type = val & 0xf;
  1011  			seg_ds64_cpl3.type = val & 0xf;
  1012  			break;
  1013  		case 8:
  1014  			*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_FLD) = (val & 0xffff);
  1015  			*(uint64*)(host_mem + X86_ADDR_VAR_VMWRITE_VAL) = (val >> 16);
  1016  			break;
  1017  		default:
  1018  			fail("bad kvm setup opt");
  1019  		}
  1020  	}
  1021  	regs.rflags |= 2; // bit 1 is always set
  1022  
  1023  	fill_segment_descriptor(gdt, ldt, &seg_ldt);
  1024  	fill_segment_descriptor(gdt, ldt, &seg_cs16);
  1025  	fill_segment_descriptor(gdt, ldt, &seg_ds16);
  1026  	fill_segment_descriptor(gdt, ldt, &seg_cs16_cpl3);
  1027  	fill_segment_descriptor(gdt, ldt, &seg_ds16_cpl3);
  1028  	fill_segment_descriptor(gdt, ldt, &seg_cs32);
  1029  	fill_segment_descriptor(gdt, ldt, &seg_ds32);
  1030  	fill_segment_descriptor(gdt, ldt, &seg_cs32_cpl3);
  1031  	fill_segment_descriptor(gdt, ldt, &seg_ds32_cpl3);
  1032  	fill_segment_descriptor(gdt, ldt, &seg_cs64);
  1033  	fill_segment_descriptor(gdt, ldt, &seg_ds64);
  1034  	fill_segment_descriptor(gdt, ldt, &seg_cs64_cpl3);
  1035  	fill_segment_descriptor(gdt, ldt, &seg_ds64_cpl3);
  1036  	fill_segment_descriptor(gdt, ldt, &seg_tss32);
  1037  	fill_segment_descriptor(gdt, ldt, &seg_tss32_2);
  1038  	fill_segment_descriptor(gdt, ldt, &seg_tss32_cpl3);
  1039  	fill_segment_descriptor(gdt, ldt, &seg_tss32_vm86);
  1040  	fill_segment_descriptor(gdt, ldt, &seg_tss16);
  1041  	fill_segment_descriptor(gdt, ldt, &seg_tss16_2);
  1042  	fill_segment_descriptor(gdt, ldt, &seg_tss16_cpl3);
  1043  	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64);
  1044  	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64_cpl3);
  1045  	fill_segment_descriptor(gdt, ldt, &seg_cgate16);
  1046  	fill_segment_descriptor(gdt, ldt, &seg_tgate16);
  1047  	fill_segment_descriptor(gdt, ldt, &seg_cgate32);
  1048  	fill_segment_descriptor(gdt, ldt, &seg_tgate32);
  1049  	fill_segment_descriptor_dword(gdt, ldt, &seg_cgate64);
  1050  
  1051  	if (ioctl(cpufd, KVM_SET_SREGS, &sregs))
  1052  		return -1;
  1053  	if (ioctl(cpufd, KVM_SET_REGS, &regs))
  1054  		return -1;
  1055  	return 0;
  1056  }
  1057  #endif
  1058  
  1059  #if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
  1060  
  1061  #define RFLAGS_1_BIT (1ULL << 1)
  1062  #define RFLAGS_IF_BIT (1ULL << 9)
  1063  
  1064  static void reset_cpu_regs(int cpufd, int cpu_id, size_t text_size)
  1065  {
  1066  	struct kvm_regs regs;
  1067  	memset(&regs, 0, sizeof(regs));
  1068  
  1069  	// RFLAGS.1 must be 1, RFLAGS.IF enables interrupts.
  1070  	regs.rflags |= RFLAGS_1_BIT | RFLAGS_IF_BIT;
  1071  	// PC points to the relative offset of guest_main() within the guest code.
  1072  	regs.rip = executor_fn_guest_addr(guest_main);
  1073  	regs.rsp = X86_SYZOS_ADDR_STACK0;
  1074  	// Pass parameters to guest_main().
  1075  	regs.rdi = text_size;
  1076  	regs.rsi = cpu_id;
  1077  	ioctl(cpufd, KVM_SET_REGS, &regs);
  1078  }
  1079  
  1080  static void install_user_code(struct kvm_syz_vm* vm, int cpufd, int cpu_id, const void* text, size_t text_size)
  1081  {
  1082  	if ((cpu_id < 0) || (cpu_id >= KVM_MAX_VCPU))
  1083  		return;
  1084  	if (text_size > KVM_PAGE_SIZE)
  1085  		text_size = KVM_PAGE_SIZE;
  1086  	void* target = (void*)((uint64)vm->user_text + (KVM_PAGE_SIZE * cpu_id));
  1087  	memcpy(target, text, text_size);
  1088  	setup_gdt_ldt_pg(vm, cpufd);
  1089  	setup_cpuid(cpufd);
  1090  	reset_cpu_regs(cpufd, cpu_id, text_size);
  1091  }
  1092  #endif
  1093  
  1094  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm
  1095  struct addr_size {
  1096  	void* addr;
  1097  	size_t size;
  1098  };
  1099  
  1100  static struct addr_size alloc_guest_mem(struct addr_size* free, size_t size)
  1101  {
  1102  	struct addr_size ret = {.addr = NULL, .size = 0};
  1103  
  1104  	if (free->size < size)
  1105  		return ret;
  1106  	ret.addr = free->addr;
  1107  	ret.size = size;
  1108  	free->addr = (void*)((char*)free->addr + size);
  1109  	free->size -= size;
  1110  	return ret;
  1111  }
  1112  
  1113  // Call KVM_SET_USER_MEMORY_REGION for the given pages.
  1114  static void vm_set_user_memory_region(int vmfd, uint32 slot, uint32 flags, uint64 guest_phys_addr, uint64 memory_size, uint64 userspace_addr)
  1115  {
  1116  	struct kvm_userspace_memory_region memreg;
  1117  	memreg.slot = slot;
  1118  	memreg.flags = flags;
  1119  	memreg.guest_phys_addr = guest_phys_addr;
  1120  	memreg.memory_size = memory_size;
  1121  	memreg.userspace_addr = userspace_addr;
  1122  	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
  1123  }
  1124  
  1125  static void install_syzos_code(void* host_mem, size_t mem_size)
  1126  {
  1127  	size_t size = (char*)&__stop_guest - (char*)&__start_guest;
  1128  	if (size > mem_size)
  1129  		fail("SyzOS size exceeds guest memory");
  1130  	memcpy(host_mem, &__start_guest, size);
  1131  }
  1132  
  1133  static void setup_vm(int vmfd, struct kvm_syz_vm* vm)
  1134  {
  1135  	struct addr_size allocator = {.addr = vm->host_mem, .size = vm->total_pages * KVM_PAGE_SIZE};
  1136  	int slot = 0; // Slot numbers do not matter, they just have to be different.
  1137  
  1138  	for (size_t i = 0; i < sizeof(syzos_mem_regions) / sizeof(syzos_mem_regions[0]); i++) {
  1139  		const struct mem_region* r = &syzos_mem_regions[i];
  1140  		if (r->flags & MEM_REGION_FLAG_NO_HOST_MEM)
  1141  			continue;
  1142  		struct addr_size next = alloc_guest_mem(&allocator, r->pages * KVM_PAGE_SIZE);
  1143  		uint32 flags = 0;
  1144  		if (r->flags & MEM_REGION_FLAG_DIRTY_LOG)
  1145  			flags |= KVM_MEM_LOG_DIRTY_PAGES;
  1146  		if (r->flags & MEM_REGION_FLAG_READONLY)
  1147  			flags |= KVM_MEM_READONLY;
  1148  		if (r->flags & MEM_REGION_FLAG_USER_CODE)
  1149  			vm->user_text = next.addr;
  1150  		if (r->flags & MEM_REGION_FLAG_GPA0)
  1151  			vm->gpa0_mem = next.addr;
  1152  		if (r->flags & MEM_REGION_FLAG_EXECUTOR_CODE)
  1153  			install_syzos_code(next.addr, next.size);
  1154  		vm_set_user_memory_region(vmfd, slot++, flags, r->gpa, next.size, (uintptr_t)next.addr);
  1155  	}
  1156  
  1157  	// Map the remaining pages at an unused address.
  1158  	struct addr_size next = alloc_guest_mem(&allocator, allocator.size);
  1159  	vm_set_user_memory_region(vmfd, slot++, 0, X86_SYZOS_ADDR_UNUSED, next.size, (uintptr_t)next.addr);
  1160  }
  1161  #endif
  1162  
  1163  #if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm
  1164  static long syz_kvm_setup_syzos_vm(volatile long a0, volatile long a1)
  1165  {
  1166  	const int vmfd = a0;
  1167  	void* host_mem = (void*)a1;
  1168  	struct kvm_syz_vm* ret = (struct kvm_syz_vm*)host_mem;
  1169  	ret->host_mem = (void*)((uint64)host_mem + KVM_PAGE_SIZE);
  1170  	ret->total_pages = KVM_GUEST_PAGES - 1;
  1171  	setup_vm(vmfd, ret);
  1172  	ret->vmfd = vmfd;
  1173  	ret->next_cpu_id = 0;
  1174  	return (long)ret;
  1175  }
  1176  #endif
  1177  
  1178  #if SYZ_EXECUTOR || __NR_syz_kvm_add_vcpu
  1179  static long syz_kvm_add_vcpu(volatile long a0, volatile long a1)
  1180  {
  1181  	struct kvm_syz_vm* vm = (struct kvm_syz_vm*)a0;
  1182  	struct kvm_text* utext = (struct kvm_text*)a1;
  1183  	const void* text = utext->text;
  1184  	size_t text_size = utext->size;
  1185  
  1186  	if (!vm) {
  1187  		errno = EINVAL;
  1188  		return -1;
  1189  	}
  1190  	if (vm->next_cpu_id == KVM_MAX_VCPU) {
  1191  		errno = ENOMEM;
  1192  		return -1;
  1193  	}
  1194  	int cpu_id = vm->next_cpu_id;
  1195  	int cpufd = ioctl(vm->vmfd, KVM_CREATE_VCPU, cpu_id);
  1196  	if (cpufd == -1)
  1197  		return -1;
  1198  	// Only increment next_cpu_id if CPU creation succeeded.
  1199  	vm->next_cpu_id++;
  1200  	install_user_code(vm, cpufd, cpu_id, text, text_size);
  1201  	return cpufd;
  1202  }
  1203  #endif
  1204  
  1205  #if SYZ_EXECUTOR || __NR_syz_kvm_assert_syzos_uexit
  1206  static long syz_kvm_assert_syzos_uexit(volatile long a0, volatile long a1)
  1207  {
  1208  	struct kvm_run* run = (struct kvm_run*)a0;
  1209  	uint64 expect = a1;
  1210  
  1211  	if (!run || (run->exit_reason != KVM_EXIT_MMIO) || (run->mmio.phys_addr != X86_SYZOS_ADDR_UEXIT)) {
  1212  		errno = EINVAL;
  1213  		return -1;
  1214  	}
  1215  
  1216  	if ((((uint64*)(run->mmio.data))[0]) != expect) {
  1217  		errno = EDOM;
  1218  		return -1;
  1219  	}
  1220  	return 0;
  1221  }
  1222  #endif
  1223  
  1224  #endif // EXECUTOR_COMMON_KVM_AMD64_H