github.com/google/syzkaller@v0.0.0-20240517125934-c0f1611a36d6/executor/common_kvm_ppc64.h (about) 1 // Copyright 2020 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 // This file is shared between executor and csource package. 5 6 // Implementation of syz_kvm_setup_cpu pseudo-syscall. 7 8 #include "kvm_ppc64le.S.h" 9 10 #define BOOK3S_INTERRUPT_SYSTEM_RESET 0x100 11 #define BOOK3S_INTERRUPT_MACHINE_CHECK 0x200 12 #define BOOK3S_INTERRUPT_DATA_STORAGE 0x300 13 #define BOOK3S_INTERRUPT_DATA_SEGMENT 0x380 14 #define BOOK3S_INTERRUPT_INST_STORAGE 0x400 15 #define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 16 #define BOOK3S_INTERRUPT_EXTERNAL 0x500 17 #define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502 18 #define BOOK3S_INTERRUPT_ALIGNMENT 0x600 19 #define BOOK3S_INTERRUPT_PROGRAM 0x700 20 #define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 21 #define BOOK3S_INTERRUPT_DECREMENTER 0x900 22 #define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980 23 #define BOOK3S_INTERRUPT_DOORBELL 0xa00 24 #define BOOK3S_INTERRUPT_SYSCALL 0xc00 25 #define BOOK3S_INTERRUPT_TRACE 0xd00 26 #define BOOK3S_INTERRUPT_H_DATA_STORAGE 0xe00 27 #define BOOK3S_INTERRUPT_H_INST_STORAGE 0xe20 28 #define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40 29 #define BOOK3S_INTERRUPT_HMI 0xe60 30 #define BOOK3S_INTERRUPT_H_DOORBELL 0xe80 31 #define BOOK3S_INTERRUPT_H_VIRT 0xea0 32 #define BOOK3S_INTERRUPT_PERFMON 0xf00 33 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 34 #define BOOK3S_INTERRUPT_VSX 0xf40 35 #define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60 36 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80 37 38 #define BITS_PER_LONG 64 39 #define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) 40 #define PPC_BIT(bit) (1ULL << PPC_BITLSHIFT(bit)) 41 #define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) 42 43 #define RADIX_PTE_INDEX_SIZE 5 // size: 8B << 5 = 256B, maps 2^5 x 64K = 2MB 44 #define RADIX_PMD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 
2MB = 1GB 45 #define RADIX_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 1GB = 512GB 46 #define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB 47 48 #define cpu_to_be32(x) __builtin_bswap32(x) 49 #define cpu_to_be64(x) __builtin_bswap64(x) 50 #define be64_to_cpu(x) __builtin_bswap64(x) 51 52 #define LPCR_ILE PPC_BIT(38) 53 #define LPCR_UPRT PPC_BIT(41) // Use Process Table 54 #define LPCR_EVIRT PPC_BIT(42) // Enhanced Virtualisation 55 #define LPCR_HR PPC_BIT(43) // Host Radix 56 #ifndef KVM_REG_PPC_LPCR_64 57 #define KVM_REG_PPC_LPCR_64 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb5) 58 #endif 59 60 #define PRTB_SIZE_SHIFT 12 // log2((64 << 10) / 16) 61 #define PATB_GR (1UL << 63) // guest uses radix; must match HR 62 #define PATB_HR (1UL << 63) 63 #define PRTB_MASK 0x0ffffffffffff000UL 64 65 #define ALIGNUP(p, q) ((void*)(((unsigned long)(p) + (q)-1) & ~((q)-1))) 66 #define MAX(a, b) (((a) > (b)) ? (a) : (b)) 67 68 #ifndef KVM_REG_PPC_DEC_EXPIRY 69 #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) 70 #endif 71 72 #ifndef KVM_PPC_CONFIGURE_V3_MMU 73 // Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 74 #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) 75 76 // For KVM_PPC_CONFIGURE_V3_MMU 77 struct kvm_ppc_mmuv3_cfg { 78 __u64 flags; 79 __u64 process_table; // second doubleword of partition table entry 80 }; 81 82 // Flag values for KVM_PPC_CONFIGURE_V3_MMU 83 #define KVM_PPC_MMUV3_RADIX 1 // 1 = radix mode, 0 = HPT 84 #define KVM_PPC_MMUV3_GTSE 2 // global translation shootdown enb 85 #endif 86 87 #ifndef KVM_CAP_PPC_NESTED_HV 88 #define KVM_CAP_PPC_NESTED_HV 160 89 #endif 90 91 struct kvm_text { 92 uintptr_t typ; 93 const void* text; 94 uintptr_t size; 95 }; 96 97 static int kvmppc_define_rtas_kernel_token(int vmfd, unsigned token, const char* func) 98 { 99 struct kvm_rtas_token_args args; 100 101 args.token = token; 102 strncpy(args.name, func, sizeof(args.name) - 1); 103 
104 return ioctl(vmfd, KVM_PPC_RTAS_DEFINE_TOKEN, &args); 105 } 106 107 static int kvmppc_get_one_reg(int cpufd, uint64 id, void* target) 108 { 109 struct kvm_one_reg reg = {.id = id, .addr = (uintptr_t)target}; 110 111 return ioctl(cpufd, KVM_GET_ONE_REG, ®); 112 } 113 114 static int kvmppc_set_one_reg(int cpufd, uint64 id, void* target) 115 { 116 struct kvm_one_reg reg = {.id = id, .addr = (uintptr_t)target}; 117 118 return ioctl(cpufd, KVM_SET_ONE_REG, ®); 119 } 120 121 static int kvm_vcpu_enable_cap(int cpufd, uint32 capability) 122 { 123 struct kvm_enable_cap cap = { 124 .cap = capability, 125 }; 126 return ioctl(cpufd, KVM_ENABLE_CAP, &cap); 127 } 128 129 static int kvm_vm_enable_cap(int vmfd, uint32 capability, uint64 p1, uint64 p2) 130 { 131 struct kvm_enable_cap cap = { 132 .cap = capability, 133 .flags = 0, 134 .args = {p1, p2}, 135 }; 136 return ioctl(vmfd, KVM_ENABLE_CAP, &cap); 137 } 138 139 static void dump_text(const char* mem, unsigned start, unsigned cw, uint32 debug_inst_opcode) 140 { 141 #ifdef DEBUG 142 printf("Text @%x: ", start); 143 144 for (unsigned i = 0; i < cw; ++i) { 145 uint32 w = ((uint32*)(mem + start))[i]; 146 147 printf(" %08x", w); 148 if (debug_inst_opcode && debug_inst_opcode == w) 149 break; 150 } 151 152 printf("\n"); 153 #endif 154 } 155 156 // Flags 157 #define KVM_SETUP_PPC64_LE (1 << 0) // Little endian 158 #define KVM_SETUP_PPC64_IR (1 << 1) // Paging for instructions 159 #define KVM_SETUP_PPC64_DR (1 << 2) // Paging for data 160 #define KVM_SETUP_PPC64_PR (1 << 3) // Run with MSR_PR (==usermode) 161 #define KVM_SETUP_PPC64_PID1 (1 << 4) // Set PID=1 i.e. 
// syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags_ppc64], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts])
//
// Prepares a PPC64 KVM-HV vCPU for fuzzing: maps 24 guest pages, fills the
// Book3S exception vectors with the software-breakpoint opcode (so any guest
// exception exits to the host), optionally builds a radix page table, copies
// the fuzzer-provided text into guest memory, and enables hcalls/RTAS tokens.
// Returns 0 on success, -1 on any ioctl failure.
static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4, volatile long a5, volatile long a6, volatile long a7)
{
	const int vmfd = a0;
	const int cpufd = a1;
	char* const host_mem = (char*)a2;
	const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3;
	const uintptr_t text_count = a4;
	uintptr_t flags = a5;
	const uintptr_t page_size = 0x10000; // SYZ_PAGE_SIZE
	const uintptr_t guest_mem_size = 24 * page_size; // vma[24] from dev_kvm.txt
	unsigned long gpa_off = 0;
	uint32 debug_inst_opcode = 0;

	(void)text_count; // fuzzer can spoof count and we need just 1 text, so ignore text_count
	const void* text = 0;
	uintptr_t text_size = 0;
	uint64 pid = 0;
	uint64 lpcr = 0;
	// NONFAILING: the fuzzer may pass an unmapped pointer; these reads must
	// not crash the executor.
	NONFAILING(text = text_array_ptr[0].text);
	NONFAILING(text_size = text_array_ptr[0].size);

	if (kvm_vcpu_enable_cap(cpufd, KVM_CAP_PPC_PAPR))
		return -1;

	if (kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_NESTED_HV, true, 0))
		return -1;

	// Back each guest page with one memslot of host memory, identity-mapped
	// at guest physical address i * page_size.
	for (uintptr_t i = 0; i < guest_mem_size / page_size; i++) {
		struct kvm_userspace_memory_region memreg;
		memreg.slot = i;
		memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES but not KVM_MEM_READONLY
		memreg.guest_phys_addr = i * page_size;
		memreg.memory_size = page_size;
		memreg.userspace_addr = (uintptr_t)host_mem + i * page_size;
		if (ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg))
			return -1;
	}

	struct kvm_regs regs;
	struct kvm_sregs sregs;
	if (ioctl(cpufd, KVM_GET_SREGS, &sregs))
		return -1;
	if (ioctl(cpufd, KVM_GET_REGS, &regs))
		return -1;

	// MSR bits below use IBM numbering (bit 0 = MSB), matching PPC_BIT().
	regs.msr = PPC_BIT(0); // MSR_SF == Sixty Four == 64bit
	if (flags & KVM_SETUP_PPC64_LE)
		regs.msr |= PPC_BIT(63); // Little endian

	// PR == "problem state" == non priveledged == userspace
	if (flags & KVM_SETUP_PPC64_PR) {
		regs.msr |= PPC_BIT(49);
		// When PR=1, the hardware enforces IR and DR as well.
		flags |= KVM_SETUP_PPC64_IR | KVM_SETUP_PPC64_DR | KVM_SETUP_PPC64_PID1;
	}

	if (flags & KVM_SETUP_PPC64_IR)
		regs.msr |= PPC_BIT(58); // IR - MMU=on for instructions
	if (flags & KVM_SETUP_PPC64_DR)
		regs.msr |= PPC_BIT(59); // DR - MMU=on for data
	if (flags & KVM_SETUP_PPC64_PID1)
		pid = 1;

	// KVM HV on POWER is hard to force to exit, it will bounce between
	// the fault handlers in KVM and the VM. Forcing all exception
	// vectors to do software debug breakpoint ensures the exit from KVM.
	if (kvmppc_get_one_reg(cpufd, KVM_REG_PPC_DEBUG_INST, &debug_inst_opcode))
		return -1;

// Write one 32bit instruction at guest physical offset x.
#define VEC(x) (*((uint32*)(host_mem + (x))))
	VEC(BOOK3S_INTERRUPT_SYSTEM_RESET) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_MACHINE_CHECK) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_DATA_STORAGE) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_DATA_SEGMENT) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_INST_STORAGE) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_INST_SEGMENT) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_EXTERNAL) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_EXTERNAL_HV) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_ALIGNMENT) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_PROGRAM) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_FP_UNAVAIL) = debug_inst_opcode;
	// The decrementer vector gets a real handler (recharges DEC so the guest
	// keeps running) followed by a breakpoint; "- 1" drops the blob's
	// trailing NUL since kvm_ppc64_recharge_dec is a string-literal blob.
	memcpy(host_mem + BOOK3S_INTERRUPT_DECREMENTER, kvm_ppc64_recharge_dec, sizeof(kvm_ppc64_recharge_dec) - 1);
	VEC(BOOK3S_INTERRUPT_DECREMENTER + sizeof(kvm_ppc64_recharge_dec) - 1) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_HV_DECREMENTER) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_DOORBELL) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_SYSCALL) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_TRACE) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_H_DATA_STORAGE) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_H_INST_STORAGE) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_H_EMUL_ASSIST) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_HMI) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_H_DOORBELL) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_H_VIRT) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_PERFMON) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_ALTIVEC) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_VSX) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_FAC_UNAVAIL) = debug_inst_opcode;
	VEC(BOOK3S_INTERRUPT_H_FAC_UNAVAIL) = debug_inst_opcode;

	// Make the breakpoint instructions above actually cause a KVM exit.
	struct kvm_guest_debug dbg = {0};
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;

	if (ioctl(cpufd, KVM_SET_GUEST_DEBUG, &dbg))
		return -1;

	// Exception vector occupy 128K, including "System Call Vectored"
	gpa_off = 128 << 10;

	// Set up a radix page table, the hash mode is not supported
	if (flags & (KVM_SETUP_PPC64_IR | KVM_SETUP_PPC64_DR)) {
		uintptr_t process_tb_off = gpa_off;
		unsigned long process_tb_size = 1UL << (PRTB_SIZE_SHIFT + 4);
		struct prtb_entry {
			__be64 prtb0;
			__be64 prtb1;
		}* process_tb = (struct prtb_entry*)(host_mem + gpa_off);

		// 0xcc poison so unused process-table entries are clearly invalid.
		memset(process_tb, 0xcc, process_tb_size);

		// PRTB_SIZE_SHIFT is defined to use 64K for the process table
		gpa_off += process_tb_size;

		unsigned long *pgd, *pud, *pmd, *pte, i;

		// Create 4 level page table, just like Linux does for PAGE_SIZE==64K,
		// put each level to a separate page including the last level which won't
		// need more than as we only allocate 24 pages for the entire VM.
		uintptr_t pgd_off = gpa_off;
		pgd = (unsigned long*)(host_mem + pgd_off);
		gpa_off += page_size;
		uintptr_t pud_off = gpa_off;
		pud = (unsigned long*)(host_mem + pud_off);
		gpa_off += page_size;
		uintptr_t pmd_off = gpa_off;
		pmd = (unsigned long*)(host_mem + pmd_off);
		gpa_off += page_size;
		uintptr_t pte_off = gpa_off;
		pte = (unsigned long*)(host_mem + pte_off);
		gpa_off += page_size;

		memset(pgd, 0, page_size);
		memset(pud, 0, page_size);
		memset(pmd, 0, page_size);
		memset(pte, 0, page_size);
		// Each non-leaf entry: valid bit + physical address of the next
		// level + log2 of that level's entry count (radix directory format).
		pgd[0] = cpu_to_be64(PPC_BIT(0) | // Valid
				     (pud_off & PPC_BITMASK(4, 55)) |
				     RADIX_PUD_INDEX_SIZE);
		pud[0] = cpu_to_be64(PPC_BIT(0) | // Valid
				     (pmd_off & PPC_BITMASK(4, 55)) |
				     RADIX_PMD_INDEX_SIZE);
		pmd[0] = cpu_to_be64(PPC_BIT(0) | // Valid
				     (pte_off & PPC_BITMASK(4, 55)) |
				     RADIX_PTE_INDEX_SIZE);

		// Map all 24 pages and allow write+execute for better coverage.
		for (i = 0; i < 24; ++i)
			pte[i] = cpu_to_be64(PPC_BIT(0) | // Valid
					     PPC_BIT(1) | // Leaf
					     ((i * page_size) & PPC_BITMASK(7, 51)) |
					     PPC_BIT(55) | // Reference
					     PPC_BIT(56) | // Change
					     PPC_BIT(61) | // Read permitted
					     PPC_BIT(62) | // Write permitted
					     PPC_BIT(63)); // Execute permitted

		// RTS (Radix Tree Size) encodes max_shift - 31, split across two
		// fields of the process-table entry.
		const long max_shift = 52;
		const unsigned long rts = (max_shift - 31) & 0x1f;
		const unsigned long rts1 = (rts >> 3) << PPC_BITLSHIFT(2);
		const unsigned long rts2 = (rts & 7) << PPC_BITLSHIFT(58);

		process_tb[0].prtb0 = cpu_to_be64(PATB_HR | rts1 | pgd_off | rts2 | RADIX_PGD_INDEX_SIZE);
		// When running with PID=1, point that process-table slot at the
		// same page table as PID 0.
		if (pid)
			process_tb[pid].prtb0 = cpu_to_be64(PATB_HR | rts1 | pgd_off | rts2 | RADIX_PGD_INDEX_SIZE);

		// PATB_GR is not in the spec but KVM HV wants it for some reason
		struct kvm_ppc_mmuv3_cfg cfg = {
		    .flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
		    .process_table = (process_tb_off & PRTB_MASK) | (PRTB_SIZE_SHIFT - 12) | PATB_GR,
		};
		if (ioctl(vmfd, KVM_PPC_CONFIGURE_V3_MMU, &cfg))
			return -1;

		lpcr |= LPCR_UPRT | LPCR_HR;
#ifdef DEBUG
		printf("MMUv3: flags=%lx %016lx\n", cfg.flags, cfg.process_table);
		printf("PTRB0=%016lx PGD0=%016lx PUD0=%016lx PMD0=%016lx\n",
		       be64_to_cpu((unsigned long)process_tb[0].prtb0), be64_to_cpu((unsigned long)pgd[0]),
		       be64_to_cpu((unsigned long)pud[0]), be64_to_cpu((unsigned long)pmd[0]));
		printf("PTEs @%lx:\n %016lx %016lx %016lx %016lx\n %016lx %016lx %016lx %016lx\n",
		       pte_off,
		       be64_to_cpu((unsigned long)pte[0]), be64_to_cpu((unsigned long)pte[1]),
		       be64_to_cpu((unsigned long)pte[2]), be64_to_cpu((unsigned long)pte[3]),
		       be64_to_cpu((unsigned long)pte[4]), be64_to_cpu((unsigned long)pte[5]),
		       be64_to_cpu((unsigned long)pte[6]), be64_to_cpu((unsigned long)pte[7]));
#endif
	}

	// Place the fuzzer text after the vectors (and page tables, if any) and
	// start execution there.
	memcpy(host_mem + gpa_off, text, text_size);
	regs.pc = gpa_off;

	// Append a breakpoint at the next 4-byte boundary past the text so the
	// guest exits cleanly when it runs off the end.
	uintptr_t end_of_text = gpa_off + ((text_size + 3) & ~3);
	memcpy(host_mem + end_of_text, &debug_inst_opcode, sizeof(debug_inst_opcode));

	// The code generator produces little endian instructions so swap bytes here
	if (!(flags & KVM_SETUP_PPC64_LE)) {
		uint32* p = (uint32*)(host_mem + gpa_off);
		for (unsigned long i = 0; i < text_size / sizeof(*p); ++i)
			p[i] = cpu_to_be32(p[i]);

		p = (uint32*)(host_mem + BOOK3S_INTERRUPT_DECREMENTER);
		for (unsigned long i = 0; i < sizeof(kvm_ppc64_recharge_dec) / sizeof(*p); ++i)
			p[i] = cpu_to_be32(p[i]);
	} else {
		// PPC by default calls exception handlers in big endian unless ILE
		lpcr |= LPCR_ILE;
	}

	if (ioctl(cpufd, KVM_SET_SREGS, &sregs))
		return -1;
	if (ioctl(cpufd, KVM_SET_REGS, &regs))
		return -1;
	if (kvmppc_set_one_reg(cpufd, KVM_REG_PPC_LPCR_64, &lpcr))
		return -1;
	if (kvmppc_set_one_reg(cpufd, KVM_REG_PPC_PID, &pid))
		return -1;

	// Hypercalls need to be enable so we enable them all here to
	// allow fuzzing
#define MAX_HCALL 0x450
	// Hcall numbers are multiples of 4; best-effort, failures ignored.
	for (unsigned hcall = 4; hcall < MAX_HCALL; hcall += 4)
		kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_ENABLE_HCALL, hcall, 1);

	for (unsigned hcall = 0xf000; hcall < 0xf810; hcall += 4)
		kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_ENABLE_HCALL, hcall, 1);

	for (unsigned hcall = 0xef00; hcall < 0xef20; hcall += 4)
		kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_ENABLE_HCALL, hcall, 1);

	// Only a few of many RTAS calls are actually in the KVM and the rest
	// are handled in QEMU, enable the KVM handling for those 4 here.
	kvmppc_define_rtas_kernel_token(vmfd, 1, "ibm,set-xive");
	kvmppc_define_rtas_kernel_token(vmfd, 2, "ibm,get-xive");
	kvmppc_define_rtas_kernel_token(vmfd, 3, "ibm,int-on");
	kvmppc_define_rtas_kernel_token(vmfd, 4, "ibm,int-off");

	dump_text(host_mem, regs.pc, 8, debug_inst_opcode);
	dump_text(host_mem, BOOK3S_INTERRUPT_DECREMENTER, 16, debug_inst_opcode);

	// Start with a large decrementer value so the guest is not interrupted
	// immediately.
	uint64 decr = 0x7fffffff;
	if (kvmppc_set_one_reg(cpufd, KVM_REG_PPC_DEC_EXPIRY, &decr))
		return -1;

	return 0;
}