github.com/google/syzkaller@v0.0.0-20240517125934-c0f1611a36d6/executor/common_kvm_amd64.h

// Copyright 2017 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

// This file is shared between executor and csource package.

// Implementation of syz_kvm_setup_cpu pseudo-syscall.
// See Intel Software Developer’s Manual Volume 3: System Programming Guide
// for details on what happens here.

#include "kvm.h"
#include "kvm_amd64.S.h"

#ifndef KVM_SMI
#define KVM_SMI _IO(KVMIO, 0xb7)
#endif

#define CR0_PE 1
#define CR0_MP (1 << 1)
#define CR0_EM (1 << 2)
#define CR0_TS (1 << 3)
#define CR0_ET (1 << 4)
#define CR0_NE (1 << 5)
#define CR0_WP (1 << 16)
#define CR0_AM (1 << 18)
#define CR0_NW (1 << 29)
#define CR0_CD (1 << 30)
#define CR0_PG (1 << 31)

#define CR4_VME 1
#define CR4_PVI (1 << 1)
#define CR4_TSD (1 << 2)
#define CR4_DE (1 << 3)
#define CR4_PSE (1 << 4)
#define CR4_PAE (1 << 5)
#define CR4_MCE (1 << 6)
#define CR4_PGE (1 << 7)
#define CR4_PCE (1 << 8)
#define CR4_OSFXSR (1 << 9)
#define CR4_OSXMMEXCPT (1 << 10)
#define CR4_UMIP (1 << 11)
#define CR4_VMXE (1 << 13)
#define CR4_SMXE (1 << 14)
#define CR4_FSGSBASE (1 << 16)
#define CR4_PCIDE (1 << 17)
#define CR4_OSXSAVE (1 << 18)
#define CR4_SMEP (1 << 20)
#define CR4_SMAP (1 << 21)
#define CR4_PKE (1 << 22)

#define EFER_SCE 1
#define EFER_LME (1 << 8)
#define EFER_LMA (1 << 10)
#define EFER_NXE (1 << 11)
#define EFER_SVME (1 << 12)
#define EFER_LMSLE (1 << 13)
#define EFER_FFXSR (1 << 14)
#define EFER_TCE (1 << 15)

// 32-bit page directory entry bits
#define PDE32_PRESENT 1
#define PDE32_RW (1 << 1)
#define PDE32_USER (1 << 2)
#define PDE32_PS (1 << 7)

// 64-bit page table entry bits (common to PML4E/PDPTE/PDE/PTE)
#define PDE64_PRESENT 1
#define PDE64_RW (1 << 1)
#define PDE64_USER (1 << 2)
#define PDE64_ACCESSED (1 << 5)
#define PDE64_DIRTY (1 << 6)
#define PDE64_PS (1 << 7)
#define PDE64_G (1 << 8)

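// For illustration (a sketch, not used directly here): combining the flag
// bits with a zero base address,
//   PDE64_PRESENT | PDE64_RW | PDE64_USER | PDE64_PS == 0x87
// gives a PDE that identity-maps a single 2MB large page at guest physical 0,
// which is exactly what the long-mode setup below writes into pd[0].
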
struct tss16 {
	uint16 prev;
	uint16 sp0;
	uint16 ss0;
	uint16 sp1;
	uint16 ss1;
	uint16 sp2;
	uint16 ss2;
	uint16 ip;
	uint16 flags;
	uint16 ax;
	uint16 cx;
	uint16 dx;
	uint16 bx;
	uint16 sp;
	uint16 bp;
	uint16 si;
	uint16 di;
	uint16 es;
	uint16 cs;
	uint16 ss;
	uint16 ds;
	uint16 ldt;
} __attribute__((packed));

struct tss32 {
	uint16 prev, prevh;
	uint32 sp0;
	uint16 ss0, ss0h;
	uint32 sp1;
	uint16 ss1, ss1h;
	uint32 sp2;
	uint16 ss2, ss2h;
	uint32 cr3;
	uint32 ip;
	uint32 flags;
	uint32 ax;
	uint32 cx;
	uint32 dx;
	uint32 bx;
	uint32 sp;
	uint32 bp;
	uint32 si;
	uint32 di;
	uint16 es, esh;
	uint16 cs, csh;
	uint16 ss, ssh;
	uint16 ds, dsh;
	uint16 fs, fsh;
	uint16 gs, gsh;
	uint16 ldt, ldth;
	uint16 trace;
	uint16 io_bitmap;
} __attribute__((packed));

struct tss64 {
	uint32 reserved0;
	uint64 rsp[3];
	uint64 reserved1;
	uint64 ist[7];
	uint64 reserved2;
	uint32 reserved3;
	uint32 io_bitmap;
} __attribute__((packed));

static void fill_segment_descriptor(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	uint16 index = seg->selector >> 3;
	uint64 limit = seg->g ? seg->limit >> 12 : seg->limit;
	uint64 sd = (limit & 0xffff) | // limit[15:0]
		    (seg->base & 0xffffff) << 16 | // base[23:0]
		    (uint64)seg->type << 40 | (uint64)seg->s << 44 | (uint64)seg->dpl << 45 |
		    (uint64)seg->present << 47 |
		    (limit & 0xf0000ULL) << 32 | // limit[19:16] -> descriptor bits 51:48
		    (uint64)seg->avl << 52 | (uint64)seg->l << 53 | (uint64)seg->db << 54 |
		    (uint64)seg->g << 55 |
		    (seg->base & 0xff000000ULL) << 32; // base[31:24] -> descriptor bits 63:56
	dt[index] = sd;
	lt[index] = sd;
}
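
// Worked example (a sketch using the seg_cs32 values configured below):
// base=0, limit=0xfffff, g=0, type=11, s=1, dpl=0, present=1, db=1 packs to
//   sd == 0x004f9b000000ffff
// i.e. limit[15:0]=0xffff, access byte 0x9b, flags nibble 0x4, limit[19:16]=0xf.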

static void fill_segment_descriptor_dword(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	fill_segment_descriptor(dt, lt, seg);
	uint16 index = seg->selector >> 3;
	dt[index + 1] = 0;
	lt[index + 1] = 0;
}
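
// In long mode, system-segment descriptors (TSS/LDT) and gates are 16 bytes
// wide; the second quadword holds base[63:32] (or offset[63:32] for gates)
// plus reserved bits. Every base used here fits in 32 bits, so zeroing the
// second slot is sufficient.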

static void setup_syscall_msrs(int cpufd, uint16 sel_cs, uint16 sel_cs_cpl3)
{
	char buf[sizeof(struct kvm_msrs) + 5 * sizeof(struct kvm_msr_entry)];
	memset(buf, 0, sizeof(buf));
	struct kvm_msrs* msrs = (struct kvm_msrs*)buf;
	struct kvm_msr_entry* entries = msrs->entries;
	msrs->nmsrs = 5;
	entries[0].index = MSR_IA32_SYSENTER_CS;
	entries[0].data = sel_cs;
	entries[1].index = MSR_IA32_SYSENTER_ESP;
	entries[1].data = ADDR_STACK0;
	entries[2].index = MSR_IA32_SYSENTER_EIP;
	entries[2].data = ADDR_VAR_SYSEXIT;
	entries[3].index = MSR_IA32_STAR;
	entries[3].data = ((uint64)sel_cs << 32) | ((uint64)sel_cs_cpl3 << 48);
	entries[4].index = MSR_IA32_LSTAR;
	entries[4].data = ADDR_VAR_SYSRET;
	ioctl(cpufd, KVM_SET_MSRS, msrs);
}
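
// For reference: STAR[47:32] supplies the SYSCALL selectors (CS = value,
// SS = value + 8) and STAR[63:48] the SYSRET selectors (64-bit CS =
// value + 16, SS = value + 8), which is why the CPL0 and CPL3 code
// selectors are packed into one MSR above.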

static void setup_32bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	for (int i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = i << 3;
		switch (i % 6) {
		case 0:
			// 16-bit interrupt gate
			gate.type = 6;
			gate.base = SEL_CS16;
			break;
		case 1:
			// 16-bit trap gate
			gate.type = 7;
			gate.base = SEL_CS16;
			break;
		case 2:
			// 16-bit task gate
			gate.type = 3;
			gate.base = SEL_TGATE16;
			break;
		case 3:
			// 32-bit interrupt gate
			gate.type = 14;
			gate.base = SEL_CS32;
			break;
		case 4:
			// 32-bit trap gate
			gate.type = 15;
			gate.base = SEL_CS32;
			break;
		case 5:
			// 32-bit task gate
			gate.type = 11;
			gate.base = SEL_TGATE32;
			break;
		}
		gate.limit = guest_mem + ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor(idt, idt, &gate);
	}
}

static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	for (int i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = (i * 2) << 3;
		gate.type = (i & 1) ? 14 : 15; // interrupt or trap gate
		gate.base = SEL_CS64;
		gate.limit = guest_mem + ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor_dword(idt, idt, &gate);
	}
}
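
// Note: 64-bit IDT gates are 16 bytes (the high quadword carries
// offset[63:32]), hence fill_segment_descriptor_dword and the (i * 2) << 3
// numbering above: each gate consumes two 8-byte slots in the table.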

struct kvm_text {
	uintptr_t typ;
	const void* text;
	uintptr_t size;
};

struct kvm_opt {
	uint64 typ;
	uint64 val;
};

#define KVM_SETUP_PAGING (1 << 0)
#define KVM_SETUP_PAE (1 << 1)
#define KVM_SETUP_PROTECTED (1 << 2)
#define KVM_SETUP_CPL3 (1 << 3)
#define KVM_SETUP_VIRT86 (1 << 4)
#define KVM_SETUP_SMM (1 << 5)
#define KVM_SETUP_VM (1 << 6)
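
// Illustrative example (not exhaustive): flags = KVM_SETUP_PAGING | KVM_SETUP_CPL3
// asks for a paged guest running its text at CPL3. The branches below test
// these bits in a fixed priority order per text type, so not every
// combination takes effect.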

// syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts])
static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4, volatile long a5, volatile long a6, volatile long a7)
{
	const int vmfd = a0;
	const int cpufd = a1;
	char* const host_mem = (char*)a2;
	const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3;
	const uintptr_t text_count = a4;
	const uintptr_t flags = a5;
	const struct kvm_opt* const opt_array_ptr = (struct kvm_opt*)a6;
	uintptr_t opt_count = a7;

	const uintptr_t page_size = 4 << 10;
	const uintptr_t ioapic_page = 10;
	const uintptr_t guest_mem_size = 24 * page_size;
	const uintptr_t guest_mem = 0;

	(void)text_count; // the fuzzer can spoof the count, and we need just one text, so ignore text_count
	int text_type = text_array_ptr[0].typ;
	const void* text = text_array_ptr[0].text;
	uintptr_t text_size = text_array_ptr[0].size;

	for (uintptr_t i = 0; i < guest_mem_size / page_size; i++) {
		struct kvm_userspace_memory_region memreg;
		memreg.slot = i;
		memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY
		memreg.guest_phys_addr = guest_mem + i * page_size;
		if (i == ioapic_page)
			memreg.guest_phys_addr = 0xfec00000; // default IOAPIC MMIO base
		memreg.memory_size = page_size;
		memreg.userspace_addr = (uintptr_t)host_mem + i * page_size;
		ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
	}
	// SMRAM: bits 16-31 of the slot number select the memory address space,
	// and address space 1 is SMM on x86.
	struct kvm_userspace_memory_region memreg;
	memreg.slot = 1 + (1 << 16);
	memreg.flags = 0;
	memreg.guest_phys_addr = 0x30000;
	memreg.memory_size = 64 << 10;
	memreg.userspace_addr = (uintptr_t)host_mem;
	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);

	struct kvm_sregs sregs;
	if (ioctl(cpufd, KVM_GET_SREGS, &sregs))
		return -1;

	struct kvm_regs regs;
	memset(&regs, 0, sizeof(regs));
	regs.rip = guest_mem + ADDR_TEXT;
	regs.rsp = ADDR_STACK0;

	sregs.gdt.base = guest_mem + ADDR_GDT;
	sregs.gdt.limit = 256 * sizeof(uint64) - 1;
	uint64* gdt = (uint64*)(host_mem + sregs.gdt.base);

	struct kvm_segment seg_ldt;
	memset(&seg_ldt, 0, sizeof(seg_ldt));
	seg_ldt.selector = SEL_LDT;
	seg_ldt.type = 2;
	seg_ldt.base = guest_mem + ADDR_LDT;
	seg_ldt.limit = 256 * sizeof(uint64) - 1;
	seg_ldt.present = 1;
	seg_ldt.dpl = 0;
	seg_ldt.s = 0;
	seg_ldt.g = 0;
	seg_ldt.db = 1;
	seg_ldt.l = 0;
	sregs.ldt = seg_ldt;
	uint64* ldt = (uint64*)(host_mem + sregs.ldt.base);

	struct kvm_segment seg_cs16;
	memset(&seg_cs16, 0, sizeof(seg_cs16));
	seg_cs16.selector = SEL_CS16;
	seg_cs16.type = 11;
	seg_cs16.base = 0;
	seg_cs16.limit = 0xfffff;
	seg_cs16.present = 1;
	seg_cs16.dpl = 0;
	seg_cs16.s = 1;
	seg_cs16.g = 0;
	seg_cs16.db = 0;
	seg_cs16.l = 0;

	struct kvm_segment seg_ds16 = seg_cs16;
	seg_ds16.selector = SEL_DS16;
	seg_ds16.type = 3;

	struct kvm_segment seg_cs16_cpl3 = seg_cs16;
	seg_cs16_cpl3.selector = SEL_CS16_CPL3;
	seg_cs16_cpl3.dpl = 3;

	struct kvm_segment seg_ds16_cpl3 = seg_ds16;
	seg_ds16_cpl3.selector = SEL_DS16_CPL3;
	seg_ds16_cpl3.dpl = 3;

	struct kvm_segment seg_cs32 = seg_cs16;
	seg_cs32.selector = SEL_CS32;
	seg_cs32.db = 1;

	struct kvm_segment seg_ds32 = seg_ds16;
	seg_ds32.selector = SEL_DS32;
	seg_ds32.db = 1;

	struct kvm_segment seg_cs32_cpl3 = seg_cs32;
	seg_cs32_cpl3.selector = SEL_CS32_CPL3;
	seg_cs32_cpl3.dpl = 3;

	struct kvm_segment seg_ds32_cpl3 = seg_ds32;
	seg_ds32_cpl3.selector = SEL_DS32_CPL3;
	seg_ds32_cpl3.dpl = 3;

	struct kvm_segment seg_cs64 = seg_cs16;
	seg_cs64.selector = SEL_CS64;
	seg_cs64.l = 1;

	struct kvm_segment seg_ds64 = seg_ds32;
	seg_ds64.selector = SEL_DS64;

	struct kvm_segment seg_cs64_cpl3 = seg_cs64;
	seg_cs64_cpl3.selector = SEL_CS64_CPL3;
	seg_cs64_cpl3.dpl = 3;

	struct kvm_segment seg_ds64_cpl3 = seg_ds64;
	seg_ds64_cpl3.selector = SEL_DS64_CPL3;
	seg_ds64_cpl3.dpl = 3;

	struct kvm_segment seg_tss32;
	memset(&seg_tss32, 0, sizeof(seg_tss32));
	seg_tss32.selector = SEL_TSS32;
	seg_tss32.type = 9;
	seg_tss32.base = ADDR_VAR_TSS32;
	seg_tss32.limit = 0x1ff;
	seg_tss32.present = 1;
	seg_tss32.dpl = 0;
	seg_tss32.s = 0;
	seg_tss32.g = 0;
	seg_tss32.db = 0;
	seg_tss32.l = 0;

	struct kvm_segment seg_tss32_2 = seg_tss32;
	seg_tss32_2.selector = SEL_TSS32_2;
	seg_tss32_2.base = ADDR_VAR_TSS32_2;

	struct kvm_segment seg_tss32_cpl3 = seg_tss32;
	seg_tss32_cpl3.selector = SEL_TSS32_CPL3;
	seg_tss32_cpl3.base = ADDR_VAR_TSS32_CPL3;

	struct kvm_segment seg_tss32_vm86 = seg_tss32;
	seg_tss32_vm86.selector = SEL_TSS32_VM86;
	seg_tss32_vm86.base = ADDR_VAR_TSS32_VM86;

	struct kvm_segment seg_tss16 = seg_tss32;
	seg_tss16.selector = SEL_TSS16;
	seg_tss16.base = ADDR_VAR_TSS16;
	seg_tss16.limit = 0xff;
	seg_tss16.type = 1;

	struct kvm_segment seg_tss16_2 = seg_tss16;
	seg_tss16_2.selector = SEL_TSS16_2;
	seg_tss16_2.base = ADDR_VAR_TSS16_2;
	seg_tss16_2.dpl = 0;

	struct kvm_segment seg_tss16_cpl3 = seg_tss16;
	seg_tss16_cpl3.selector = SEL_TSS16_CPL3;
	seg_tss16_cpl3.base = ADDR_VAR_TSS16_CPL3;
	seg_tss16_cpl3.dpl = 3;

	struct kvm_segment seg_tss64 = seg_tss32;
	seg_tss64.selector = SEL_TSS64;
	seg_tss64.base = ADDR_VAR_TSS64;
	seg_tss64.limit = 0x1ff;

	struct kvm_segment seg_tss64_cpl3 = seg_tss64;
	seg_tss64_cpl3.selector = SEL_TSS64_CPL3;
	seg_tss64_cpl3.base = ADDR_VAR_TSS64_CPL3;
	seg_tss64_cpl3.dpl = 3;

	struct kvm_segment seg_cgate16;
	memset(&seg_cgate16, 0, sizeof(seg_cgate16));
	seg_cgate16.selector = SEL_CGATE16;
	seg_cgate16.type = 4;
	seg_cgate16.base = SEL_CS16 | (2 << 16); // selector + param count
	seg_cgate16.limit = ADDR_VAR_USER_CODE2; // entry offset
	seg_cgate16.present = 1;
	seg_cgate16.dpl = 0;
	seg_cgate16.s = 0;
	seg_cgate16.g = 0;
	seg_cgate16.db = 0;
	seg_cgate16.l = 0;
	seg_cgate16.avl = 0;

	struct kvm_segment seg_tgate16 = seg_cgate16;
	seg_tgate16.selector = SEL_TGATE16;
	seg_tgate16.type = 3;
	seg_tgate16.base = SEL_TSS16_2;
	seg_tgate16.limit = 0;

	struct kvm_segment seg_cgate32 = seg_cgate16;
	seg_cgate32.selector = SEL_CGATE32;
	seg_cgate32.type = 12;
	seg_cgate32.base = SEL_CS32 | (2 << 16); // selector + param count

	struct kvm_segment seg_tgate32 = seg_cgate32;
	seg_tgate32.selector = SEL_TGATE32;
	seg_tgate32.type = 11;
	seg_tgate32.base = SEL_TSS32_2;
	seg_tgate32.limit = 0;

	struct kvm_segment seg_cgate64 = seg_cgate16;
	seg_cgate64.selector = SEL_CGATE64;
	seg_cgate64.type = 12;
	seg_cgate64.base = SEL_CS64;

	// Mirror the host-supported CPUID into the vCPU so guest code sees a
	// realistic feature set.
	int kvmfd = open("/dev/kvm", O_RDWR);
	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
	memset(buf, 0, sizeof(buf));
	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
	cpuid->nent = 128;
	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
	close(kvmfd);

	const char* text_prefix = 0;
	int text_prefix_size = 0;
	char* host_text = host_mem + ADDR_TEXT;

	if (text_type == 8) {
		if (flags & KVM_SETUP_SMM) {
			if (flags & KVM_SETUP_PROTECTED) {
				sregs.cs = seg_cs16;
				sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
				sregs.cr0 |= CR0_PE;
			} else {
				sregs.cs.selector = 0;
				sregs.cs.base = 0;
			}

			*(host_mem + ADDR_TEXT) = 0xf4; // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_VIRT86) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
			sregs.cr0 |= CR0_PE;
			sregs.efer |= EFER_SCE;

			setup_syscall_msrs(cpufd, SEL_CS32, SEL_CS32_CPL3);
			setup_32bit_idt(&sregs, host_mem, guest_mem);

			if (flags & KVM_SETUP_PAGING) {
				uint64 pd_addr = guest_mem + ADDR_PD;
				uint64* pd = (uint64*)(host_mem + ADDR_PD);
				// A single 4MB page to cover the memory region
				pd[0] = PDE32_PRESENT | PDE32_RW | PDE32_USER | PDE32_PS;
				sregs.cr3 = pd_addr;
				sregs.cr4 |= CR4_PSE;

				text_prefix = kvm_asm32_paged_vm86;
				text_prefix_size = sizeof(kvm_asm32_paged_vm86) - 1;
			} else {
				text_prefix = kvm_asm32_vm86;
				text_prefix_size = sizeof(kvm_asm32_vm86) - 1;
			}
		} else {
			sregs.cs.selector = 0;
			sregs.cs.base = 0;
		}
	} else if (text_type == 16) {
		if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;

			text_prefix = kvm_asm16_cpl3;
			text_prefix_size = sizeof(kvm_asm16_cpl3) - 1;
		} else {
			sregs.cr0 |= CR0_PE;
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
		}
	} else if (text_type == 32) {
		sregs.cr0 |= CR0_PE;
		sregs.efer |= EFER_SCE;

		setup_syscall_msrs(cpufd, SEL_CS32, SEL_CS32_CPL3);
		setup_32bit_idt(&sregs, host_mem, guest_mem);

		if (flags & KVM_SETUP_SMM) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			*(host_mem + ADDR_TEXT) = 0xf4; // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_PAGING) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			uint64 pd_addr = guest_mem + ADDR_PD;
			uint64* pd = (uint64*)(host_mem + ADDR_PD);
			// A single 4MB page to cover the memory region
			pd[0] = PDE32_PRESENT | PDE32_RW | PDE32_USER | PDE32_PS;
			sregs.cr3 = pd_addr;
			sregs.cr4 |= CR4_PSE;

			text_prefix = kvm_asm32_paged;
			text_prefix_size = sizeof(kvm_asm32_paged) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs32_cpl3;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32_cpl3;
		} else {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
		}
	} else {
		sregs.efer |= EFER_LME | EFER_SCE;
		sregs.cr0 |= CR0_PE;

		setup_syscall_msrs(cpufd, SEL_CS64, SEL_CS64_CPL3);
		setup_64bit_idt(&sregs, host_mem, guest_mem);

		// Start in 32-bit protected mode; the text prefix switches to long mode.
		sregs.cs = seg_cs32;
		sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

		uint64 pml4_addr = guest_mem + ADDR_PML4;
		uint64* pml4 = (uint64*)(host_mem + ADDR_PML4);
		uint64 pdpt_addr = guest_mem + ADDR_PDP;
		uint64* pdpt = (uint64*)(host_mem + ADDR_PDP);
		uint64 pd_addr = guest_mem + ADDR_PD;
		uint64* pd = (uint64*)(host_mem + ADDR_PD);
		pml4[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | pdpt_addr;
		pdpt[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | pd_addr;
		// A single 2MB page to cover the memory region
		pd[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | PDE64_PS;
		sregs.cr3 = pml4_addr;
		sregs.cr4 |= CR4_PAE;
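
		// Resulting walk for guest virtual address 0 (a sketch):
		// CR3 -> pml4[0] -> pdpt[0] -> pd[0]; PS in pd[0] terminates the
		// walk at a 2MB identity page, which covers all 24 pages of
		// guest memory.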

		if (flags & KVM_SETUP_VM) {
			sregs.cr0 |= CR0_NE;

			*((uint64*)(host_mem + ADDR_VAR_VMXON_PTR)) = ADDR_VAR_VMXON;
			*((uint64*)(host_mem + ADDR_VAR_VMCS_PTR)) = ADDR_VAR_VMCS;
			memcpy(host_mem + ADDR_VAR_VMEXIT_CODE, kvm_asm64_vm_exit, sizeof(kvm_asm64_vm_exit) - 1);
			*((uint64*)(host_mem + ADDR_VAR_VMEXIT_PTR)) = ADDR_VAR_VMEXIT_CODE;

			text_prefix = kvm_asm64_init_vm;
			text_prefix_size = sizeof(kvm_asm64_init_vm) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			text_prefix = kvm_asm64_cpl3;
			text_prefix_size = sizeof(kvm_asm64_cpl3) - 1;
		} else {
			text_prefix = kvm_asm64_enable_long;
			text_prefix_size = sizeof(kvm_asm64_enable_long) - 1;
		}
	}

	struct tss16 tss16;
	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = ADDR_STACK0;
	tss16.ip = ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = SEL_CS16;
	tss16.es = tss16.ds = tss16.ss = SEL_DS16;
	tss16.ldt = SEL_LDT;
	struct tss16* tss16_addr = (struct tss16*)(host_mem + seg_tss16_2.base);
	memcpy(tss16_addr, &tss16, sizeof(tss16));

	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = ADDR_STACK0;
	tss16.ip = ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = SEL_CS16_CPL3;
	tss16.es = tss16.ds = tss16.ss = SEL_DS16_CPL3;
	tss16.ldt = SEL_LDT;
	struct tss16* tss16_cpl3_addr = (struct tss16*)(host_mem + seg_tss16_cpl3.base);
	memcpy(tss16_cpl3_addr, &tss16, sizeof(tss16));

	struct tss32 tss32;
	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = ADDR_STACK0;
	tss32.ip = ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1) | (1 << 17); // reserved bit 1, plus VM: enters virtual-8086 mode
	tss32.ldt = SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_addr = (struct tss32*)(host_mem + seg_tss32_vm86.base);
	memcpy(tss32_addr, &tss32, sizeof(tss32));

	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = ADDR_STACK0;
	tss32.ip = ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1);
	tss32.es = tss32.ds = tss32.ss = tss32.gs = tss32.fs = SEL_DS32;
	tss32.cs = SEL_CS32;
	tss32.ldt = SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_cpl3_addr = (struct tss32*)(host_mem + seg_tss32_2.base);
	memcpy(tss32_cpl3_addr, &tss32, sizeof(tss32));

	struct tss64 tss64;
	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = ADDR_STACK0;
	tss64.rsp[1] = ADDR_STACK0;
	tss64.rsp[2] = ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_addr = (struct tss64*)(host_mem + seg_tss64.base);
	memcpy(tss64_addr, &tss64, sizeof(tss64));

	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = ADDR_STACK0;
	tss64.rsp[1] = ADDR_STACK0;
	tss64.rsp[2] = ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_cpl3_addr = (struct tss64*)(host_mem + seg_tss64_cpl3.base);
	memcpy(tss64_cpl3_addr, &tss64, sizeof(tss64));

	if (text_size > 1000)
		text_size = 1000;
	if (text_prefix) {
		memcpy(host_text, text_prefix, text_prefix_size);
		// Replace 0x0badc0de in LJMP with the offset of the next instruction.
		void* patch = memmem(host_text, text_prefix_size, "\xde\xc0\xad\x0b", 4);
		if (patch)
			*((uint32*)patch) = guest_mem + ADDR_TEXT + ((char*)patch - host_text) + 6;
		uint16 magic = PREFIX_SIZE;
		patch = memmem(host_text, text_prefix_size, &magic, sizeof(magic));
		if (patch)
			*((uint16*)patch) = guest_mem + ADDR_TEXT + text_prefix_size;
	}
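	// The two patches above rewrite placeholders in the assembly prefix from
	// kvm_amd64.S.h: the 32-bit 0x0badc0de operand becomes the guest address
	// of the instruction after the far jump, and the 16-bit PREFIX_SIZE
	// marker becomes the guest address where the user-supplied text begins.
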
	memcpy((void*)(host_text + text_prefix_size), text, text_size);
	*(host_text + text_prefix_size + text_size) = 0xf4; // hlt

	memcpy(host_mem + ADDR_VAR_USER_CODE, text, text_size);
	*(host_mem + ADDR_VAR_USER_CODE + text_size) = 0xf4; // hlt

	*(host_mem + ADDR_VAR_HLT) = 0xf4; // hlt
	memcpy(host_mem + ADDR_VAR_SYSRET, "\x0f\x07\xf4", 3); // sysret; hlt
	memcpy(host_mem + ADDR_VAR_SYSEXIT, "\x0f\x35\xf4", 3); // sysexit; hlt

	*(uint64*)(host_mem + ADDR_VAR_VMWRITE_FLD) = 0;
	*(uint64*)(host_mem + ADDR_VAR_VMWRITE_VAL) = 0;

	if (opt_count > 2)
		opt_count = 2;
	for (uintptr_t i = 0; i < opt_count; i++) {
		uint64 typ = opt_array_ptr[i].typ;
		uint64 val = opt_array_ptr[i].val;
		switch (typ % 9) {
		case 0:
			sregs.cr0 ^= val & (CR0_MP | CR0_EM | CR0_ET | CR0_NE | CR0_WP | CR0_AM | CR0_NW | CR0_CD);
			break;
		case 1:
			sregs.cr4 ^= val & (CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | CR4_MCE | CR4_PGE | CR4_PCE |
					    CR4_OSFXSR | CR4_OSXMMEXCPT | CR4_UMIP | CR4_VMXE | CR4_SMXE | CR4_FSGSBASE | CR4_PCIDE |
					    CR4_OSXSAVE | CR4_SMEP | CR4_SMAP | CR4_PKE);
			break;
		case 2:
			sregs.efer ^= val & (EFER_SCE | EFER_NXE | EFER_SVME | EFER_LMSLE | EFER_FFXSR | EFER_TCE);
			break;
		case 3:
			val &= ((1 << 8) | (1 << 9) | (1 << 10) | (1 << 12) | (1 << 13) | (1 << 14) |
				(1 << 15) | (1 << 18) | (1 << 19) | (1 << 20) | (1 << 21));
			regs.rflags ^= val;
			tss16_addr->flags ^= val;
			tss16_cpl3_addr->flags ^= val;
			tss32_addr->flags ^= val;
			tss32_cpl3_addr->flags ^= val;
			break;
		case 4:
			seg_cs16.type = val & 0xf;
			seg_cs32.type = val & 0xf;
			seg_cs64.type = val & 0xf;
			break;
		case 5:
			seg_cs16_cpl3.type = val & 0xf;
			seg_cs32_cpl3.type = val & 0xf;
			seg_cs64_cpl3.type = val & 0xf;
			break;
		case 6:
			seg_ds16.type = val & 0xf;
			seg_ds32.type = val & 0xf;
			seg_ds64.type = val & 0xf;
			break;
		case 7:
			seg_ds16_cpl3.type = val & 0xf;
			seg_ds32_cpl3.type = val & 0xf;
			seg_ds64_cpl3.type = val & 0xf;
			break;
		case 8:
			*(uint64*)(host_mem + ADDR_VAR_VMWRITE_FLD) = (val & 0xffff);
			*(uint64*)(host_mem + ADDR_VAR_VMWRITE_VAL) = (val >> 16);
			break;
		default:
			fail("bad kvm setup opt");
		}
	}
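	// Example (illustrative): an opt pair {typ = 0, val = CR0_CD} toggles the
	// cache-disable bit in CR0, and {typ = 2, val = EFER_NXE} toggles
	// no-execute support in EFER.
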
	regs.rflags |= 2; // bit 1 is always set

	fill_segment_descriptor(gdt, ldt, &seg_ldt);
	fill_segment_descriptor(gdt, ldt, &seg_cs16);
	fill_segment_descriptor(gdt, ldt, &seg_ds16);
	fill_segment_descriptor(gdt, ldt, &seg_cs16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs32);
	fill_segment_descriptor(gdt, ldt, &seg_ds32);
	fill_segment_descriptor(gdt, ldt, &seg_cs32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs64);
	fill_segment_descriptor(gdt, ldt, &seg_ds64);
	fill_segment_descriptor(gdt, ldt, &seg_cs64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_vm86);
	fill_segment_descriptor(gdt, ldt, &seg_tss16);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_cpl3);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cgate16);
	fill_segment_descriptor(gdt, ldt, &seg_tgate16);
	fill_segment_descriptor(gdt, ldt, &seg_cgate32);
	fill_segment_descriptor(gdt, ldt, &seg_tgate32);
	fill_segment_descriptor_dword(gdt, ldt, &seg_cgate64);

	if (ioctl(cpufd, KVM_SET_SREGS, &sregs))
		return -1;
	if (ioctl(cpufd, KVM_SET_REGS, &regs))
		return -1;
	return 0;
}