gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/sysmsg/sighandler_amd64.c

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#define _GNU_SOURCE
#include <asm/prctl.h>
#include <asm/unistd_64.h>
#include <errno.h>
#include <linux/audit.h>
#include <linux/futex.h>
#include <linux/unistd.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/prctl.h>
#include <sys/ucontext.h>

#include "atomic.h"
#include "sysmsg.h"
#include "sysmsg_offsets.h"
#include "sysmsg_offsets_amd64.h"

// TODO(b/271631387): These globals are shared between AMD64 and ARM64; move to
// sysmsg_lib.c.
struct arch_state __export_arch_state;
uint64_t __export_stub_start;

long __syscall(long n, long a1, long a2, long a3, long a4, long a5, long a6) {
  unsigned long ret;
  register long r10 __asm__("r10") = a4;
  register long r8 __asm__("r8") = a5;
  register long r9 __asm__("r9") = a6;
  __asm__ __volatile__("syscall"
                       : "=a"(ret)
                       : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8),
                         "r"(r9)
                       : "rcx", "r11", "memory");
  return ret;
}
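
// Note: __syscall follows the standard Linux x86-64 syscall convention: the
// syscall number goes in rax, the six arguments in rdi, rsi, rdx, r10, r8 and
// r9, and the syscall instruction clobbers rcx and r11 (hence the clobber
// list above). Purely as an illustration (not code used by the stub), a raw
// exit(0) through this wrapper would be:
//
//   __syscall(__NR_exit, 0, 0, 0, 0, 0, 0);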

long sys_futex(uint32_t *addr, int op, int val, struct __kernel_timespec *tv,
               uint32_t *addr2, int val3) {
  return __syscall(__NR_futex, (long)addr, (long)op, (long)val, (long)tv,
                   (long)addr2, (long)val3);
}

union csgsfs {
  uint64_t csgsfs;  // REG_CSGSFS
  struct {
    uint16_t cs;
    uint16_t gs;
    uint16_t fs;
    uint16_t ss;
  };
};

static void gregs_to_ptregs(ucontext_t *ucontext,
                            struct user_regs_struct *ptregs) {
  union csgsfs csgsfs = {.csgsfs = ucontext->uc_mcontext.gregs[REG_CSGSFS]};

  // Set all registers except:
  //   * fs_base and gs_base, because they can only be changed by arch_prctl;
  //   * DS and ES, which are not used on x86_64.
  ptregs->r15 = ucontext->uc_mcontext.gregs[REG_R15];
  ptregs->r14 = ucontext->uc_mcontext.gregs[REG_R14];
  ptregs->r13 = ucontext->uc_mcontext.gregs[REG_R13];
  ptregs->r12 = ucontext->uc_mcontext.gregs[REG_R12];
  ptregs->rbp = ucontext->uc_mcontext.gregs[REG_RBP];
  ptregs->rbx = ucontext->uc_mcontext.gregs[REG_RBX];
  ptregs->r11 = ucontext->uc_mcontext.gregs[REG_R11];
  ptregs->r10 = ucontext->uc_mcontext.gregs[REG_R10];
  ptregs->r9 = ucontext->uc_mcontext.gregs[REG_R9];
  ptregs->r8 = ucontext->uc_mcontext.gregs[REG_R8];
  ptregs->rax = ucontext->uc_mcontext.gregs[REG_RAX];
  ptregs->rcx = ucontext->uc_mcontext.gregs[REG_RCX];
  ptregs->rdx = ucontext->uc_mcontext.gregs[REG_RDX];
  ptregs->rsi = ucontext->uc_mcontext.gregs[REG_RSI];
  ptregs->rdi = ucontext->uc_mcontext.gregs[REG_RDI];
  ptregs->rip = ucontext->uc_mcontext.gregs[REG_RIP];
  ptregs->eflags = ucontext->uc_mcontext.gregs[REG_EFL];
  ptregs->rsp = ucontext->uc_mcontext.gregs[REG_RSP];

  ptregs->cs = csgsfs.cs;
  ptregs->ss = csgsfs.ss;
  ptregs->fs = csgsfs.fs;
  ptregs->gs = csgsfs.gs;
}

static void ptregs_to_gregs(ucontext_t *ucontext,
                            struct user_regs_struct *ptregs) {
  union csgsfs csgsfs = {.csgsfs = ucontext->uc_mcontext.gregs[REG_CSGSFS]};

  ucontext->uc_mcontext.gregs[REG_R15] = ptregs->r15;
  ucontext->uc_mcontext.gregs[REG_R14] = ptregs->r14;
  ucontext->uc_mcontext.gregs[REG_R13] = ptregs->r13;
  ucontext->uc_mcontext.gregs[REG_R12] = ptregs->r12;
  ucontext->uc_mcontext.gregs[REG_RBP] = ptregs->rbp;
  ucontext->uc_mcontext.gregs[REG_RBX] = ptregs->rbx;
  ucontext->uc_mcontext.gregs[REG_R11] = ptregs->r11;
  ucontext->uc_mcontext.gregs[REG_R10] = ptregs->r10;
  ucontext->uc_mcontext.gregs[REG_R9] = ptregs->r9;
  ucontext->uc_mcontext.gregs[REG_R8] = ptregs->r8;
  ucontext->uc_mcontext.gregs[REG_RAX] = ptregs->rax;
  ucontext->uc_mcontext.gregs[REG_RCX] = ptregs->rcx;
  ucontext->uc_mcontext.gregs[REG_RDX] = ptregs->rdx;
  ucontext->uc_mcontext.gregs[REG_RSI] = ptregs->rsi;
  ucontext->uc_mcontext.gregs[REG_RDI] = ptregs->rdi;
  ucontext->uc_mcontext.gregs[REG_RIP] = ptregs->rip;
  ucontext->uc_mcontext.gregs[REG_EFL] = ptregs->eflags;
  ucontext->uc_mcontext.gregs[REG_RSP] = ptregs->rsp;

  csgsfs.cs = ptregs->cs;
  csgsfs.ss = ptregs->ss;
  csgsfs.fs = ptregs->fs;
  csgsfs.gs = ptregs->gs;

  ucontext->uc_mcontext.gregs[REG_CSGSFS] = csgsfs.csgsfs;
}

// get_fsbase returns the current thread's fsbase value.
static uint64_t get_fsbase(void) {
  uint64_t fsbase;
  if (__export_arch_state.fsgsbase) {
    asm volatile("rdfsbase %0" : "=r"(fsbase));
  } else {
    int ret =
        __syscall(__NR_arch_prctl, ARCH_GET_FS, (long)&fsbase, 0, 0, 0, 0);
    if (ret) {
      panic(STUB_ERROR_ARCH_PRCTL, ret);
    }
  }
  return fsbase;
}

// set_fsbase sets the current thread's fsbase to the given value.
static void set_fsbase(uint64_t fsbase) {
  if (__export_arch_state.fsgsbase) {
    asm volatile("wrfsbase %0" : : "r"(fsbase) : "memory");
  } else {
    int ret = __syscall(__NR_arch_prctl, ARCH_SET_FS, fsbase, 0, 0, 0, 0);
    if (ret) {
      panic(STUB_ERROR_ARCH_PRCTL, ret);
    }
  }
}
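
// Note: RDFSBASE/WRFSBASE are only usable when the CPU supports the FSGSBASE
// extension and the kernel exposes it to user space (Linux 5.9+);
// __export_arch_state.fsgsbase is presumably set by the sentry based on that
// detection, and the slower arch_prctl() syscall is the fallback path.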

// switch_context_amd64 is a wrapper around switch_context() which performs
// checks specific to amd64.
struct thread_context *switch_context_amd64(
    struct sysmsg *sysmsg, struct thread_context *ctx,
    enum context_state new_context_state) {
  struct thread_context *old_ctx = sysmsg->context;

  for (;;) {
    ctx = switch_context(sysmsg, ctx, new_context_state);

    // After setting THREAD_STATE_NONE, syshandler can be interrupted by
    // SIGCHLD. In this case, we consider that the current context contains
    // the actual state and sighandler can take control of it.
    atomic_store(&sysmsg->state, THREAD_STATE_NONE);
    if (atomic_load(&ctx->interrupt) != 0) {
      atomic_store(&sysmsg->state, THREAD_STATE_PREP);
      // This context got interrupted while it was waiting in the queue.
      // Set up all the necessary bits to let the sentry know this context has
      // switched back because of it.
      atomic_store(&ctx->interrupt, 0);
      new_context_state = CONTEXT_STATE_FAULT;
      ctx->signo = SIGCHLD;
      ctx->siginfo.si_signo = SIGCHLD;
      ctx->ptregs.orig_rax = -1;
    } else {
      break;
    }
  }
  if (old_ctx != ctx || ctx->last_thread_id != sysmsg->thread_id) {
    ctx->fpstate_changed = 1;
  }
  return ctx;
}

static void prep_fpstate_for_sigframe(void *buf, uint32_t user_size,
                                      bool use_xsave);
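
// Rough sketch of the thread-state values used below (inferred from this
// file, not an authoritative description of the systrap protocol):
//   - THREAD_STATE_INITIALIZING: the stub thread does not have a context yet.
//   - THREAD_STATE_PREP: syshandler (or switch_context_amd64 on the interrupt
//     path) is mutating sysmsg, so SIGCHLD interrupts are postponed.
//   - THREAD_STATE_NONE: the context contains the actual state and sighandler
//     may take control of it.
// ctx->fpstate_changed is set whenever the context may resume on a different
// stub thread, so the FPU state is reloaded from ctx->fpstate rather than
// from whatever is live in the host registers.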

void __export_sighandler(int signo, siginfo_t *siginfo, void *_ucontext) {
  ucontext_t *ucontext = _ucontext;
  void *sp = sysmsg_sp();
  struct sysmsg *sysmsg = sysmsg_addr(sp);

  if (sysmsg != sysmsg->self) panic(STUB_ERROR_BAD_SYSMSG, 0);
  int32_t thread_state = atomic_load(&sysmsg->state);
  if (thread_state == THREAD_STATE_INITIALIZING) {
    // This thread was interrupted before it even had a context.
    return;
  }

  struct thread_context *ctx = sysmsg->context;

  // If the current thread is in syshandler, an interrupt has to be postponed,
  // because sysmsg can't be changed.
  if (signo == SIGCHLD && thread_state != THREAD_STATE_NONE) {
    return;
  }

  // Handle faults in syshandler.
  if ((signo == SIGSEGV || signo == SIGBUS) && sysmsg->fault_jump) {
    ucontext->uc_mcontext.gregs[REG_RIP] += sysmsg->fault_jump;
    sysmsg->fault_jump = 0;
    return;
  }

  long fs_base = get_fsbase();

  ctx->signo = signo;
  ctx->siginfo = *siginfo;
  // syshandler sets THREAD_STATE_NONE right before it starts resuming a
  // context. It means the context contains the actual state, and the state of
  // the stub thread is incomplete.
  if (signo != SIGCHLD ||
      ucontext->uc_mcontext.gregs[REG_RIP] < __export_stub_start) {
    ctx->ptregs.fs_base = fs_base;
    gregs_to_ptregs(ucontext, &ctx->ptregs);
    memcpy(ctx->fpstate, (uint8_t *)ucontext->uc_mcontext.fpregs,
           __export_arch_state.fp_len);

    atomic_store(&ctx->fpstate_changed, 0);
  }

  enum context_state ctx_state = CONTEXT_STATE_INVALID;

  switch (signo) {
    case SIGSYS: {
      ctx_state = CONTEXT_STATE_SYSCALL;

      // Check whether this syscall can be replaced with a function call.
      // If the syscall site is the sequence "mov sysno, %eax; syscall", it
      // can be replaced with a function call, which is much faster.
      // See pkg/sentry/usertrap for more details.
      if (siginfo->si_arch == AUDIT_ARCH_X86_64) {
        uint8_t *rip = (uint8_t *)ctx->ptregs.rip;
        // FIXME(b/144063246): Even if all five bytes before the syscall
        // instruction match the "mov sysno, %eax" instruction, they can be
        // part of a longer instruction. There is no easy way to decode x86
        // instructions in reverse.
        uint64_t syscall_code_int[2];
        uint8_t *syscall_code = (uint8_t *)&syscall_code_int[0];

        // We need to read the 5 bytes before the syscall instruction, but
        // they are not aligned, so we can't read them atomically. Read them
        // twice: if the second copy does not contain the FAULT_OPCODE, the
        // first copy is in a consistent state.
        for (int i = 0; i < 2; i++) {
          // fault_jump is set to the size of "movq (%rbx), %rax", which is 3
          // bytes.
          atomic_store(&sysmsg->fault_jump, 3);
          asm volatile("movq (%1), %0\n"
                       : "=a"(syscall_code_int[i])
                       : "b"(rip - 8)
                       : "cc", "memory");
          atomic_store(&sysmsg->fault_jump, 0);
        }
        // The mov instruction is 5 bytes: b8 <sysno, 4 bytes>.
        // The syscall instruction is 2 bytes: 0f 05.
        uint32_t sysno = *(uint32_t *)(syscall_code + 2);
        int need_trap = *(syscall_code + 6) == 0x0f &&  // syscall
                        *(syscall_code + 7) == 0x05 &&
                        *(syscall_code + 1) == 0xb8 &&  // mov sysno, %eax
                        sysno == siginfo->si_syscall &&
                        sysno == ctx->ptregs.rax;

        // Restart the syscall if it has been patched by another thread. When
        // a syscall site is replaced with a function call, all threads have
        // to enter it via the function call. Otherwise the syscall would not
        // be restarted properly if it were interrupted by a signal.
        syscall_code = (uint8_t *)&syscall_code_int[1];
        uint8_t syscall_opcode = *(syscall_code + 6);

        // A binary patch is built so that the first byte of the syscall
        // instruction is changed to the invalid instruction. If we hit this
        // case, another thread has patched this syscall site and we need to
        // restart it.
        if (syscall_opcode == FAULT_OPCODE) {
          ucontext->uc_mcontext.gregs[REG_RIP] -= 7;
          return;
        }

        if (need_trap) {
          // This syscall site can be replaced with a function call.
          ctx_state = CONTEXT_STATE_SYSCALL_NEED_TRAP;
        }
      }
      ctx->ptregs.orig_rax = ctx->ptregs.rax;
      ctx->ptregs.rax = (unsigned long)-ENOSYS;
      if (siginfo->si_arch != AUDIT_ARCH_X86_64)
        // gVisor doesn't support x32 system calls, so change the syscall
        // number so that it returns ENOSYS.
        ctx->ptregs.orig_rax += 0x86000000;
      break;
    }
    case SIGCHLD:
    case SIGSEGV:
    case SIGBUS:
    case SIGFPE:
    case SIGTRAP:
    case SIGILL:
      ctx->ptregs.orig_rax = -1;
      ctx_state = CONTEXT_STATE_FAULT;
      break;
    default:
      return;
  }

  ctx = switch_context_amd64(sysmsg, ctx, ctx_state);
  if (fs_base != ctx->ptregs.fs_base) {
    set_fsbase(ctx->ptregs.fs_base);
  }

  if (atomic_load(&ctx->fpstate_changed)) {
    prep_fpstate_for_sigframe(
        ctx->fpstate, __export_arch_state.fp_len,
        __export_arch_state.xsave_mode != XSAVE_MODE_FXSAVE);
    ucontext->uc_mcontext.fpregs = (void *)ctx->fpstate;
  }
  ptregs_to_gregs(ucontext, &ctx->ptregs);
}
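
// As an illustration of the pattern checked in the SIGSYS case above, a
// patchable syscall site looks like this (sysno 39/getpid is just an
// arbitrary example):
//
//   b8 27 00 00 00   mov $0x27, %eax
//   0f 05            syscall
//
// The saved rip points just past the syscall instruction, so the 8-byte read
// at rip-8 covers one unrelated byte, the 5-byte mov and the 2-byte syscall,
// and rewinding rip by 7 restarts execution from the mov.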

void __syshandler() {
  struct sysmsg *sysmsg;
  asm volatile("movq %%gs:0, %0\n" : "=r"(sysmsg) : :);
  // THREAD_STATE_PREP is set to postpone interrupts. See __export_sighandler
  // for more details.
  int state = atomic_load(&sysmsg->state);
  if (state != THREAD_STATE_PREP) panic(STUB_ERROR_BAD_THREAD_STATE, 0);

  struct thread_context *ctx = sysmsg->context;

  enum context_state ctx_state = CONTEXT_STATE_SYSCALL_TRAP;
  ctx->signo = SIGSYS;
  ctx->siginfo.si_addr = 0;
  ctx->siginfo.si_syscall = ctx->ptregs.rax;
  ctx->ptregs.rax = (unsigned long)-ENOSYS;

  long fs_base = get_fsbase();
  ctx->ptregs.fs_base = fs_base;

  ctx = switch_context_amd64(sysmsg, ctx, ctx_state);
  // switch_context_amd64 changed sysmsg->state to THREAD_STATE_NONE, so from
  // this point we can only resume the current process; all other actions are
  // prohibited.

  if (fs_base != ctx->ptregs.fs_base) {
    set_fsbase(ctx->ptregs.fs_base);
  }
}

void __export_start(struct sysmsg *sysmsg, void *_ucontext) {
  init_new_thread();

  asm volatile("movq %%gs:0, %0\n" : "=r"(sysmsg) : :);
  if (sysmsg->self != sysmsg) {
    panic(STUB_ERROR_BAD_SYSMSG, 0);
  }

  struct thread_context *ctx =
      switch_context_amd64(sysmsg, NULL, CONTEXT_STATE_INVALID);

  restore_state(sysmsg, ctx, _ucontext);
}

// asm_restore_state is implemented in syshandler_amd64.S.
void asm_restore_state();

// On x86, restore_state jumps straight to user code and does not return.
void restore_state(struct sysmsg *sysmsg, struct thread_context *ctx,
                   void *unused) {
  set_fsbase(ctx->ptregs.fs_base);
  asm_restore_state();
}
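
// verify_offsets_amd64 cross-checks the offsetof_thread_context_ptregs_*
// constants from sysmsg_offsets_amd64.h against the actual layout of struct
// thread_context and struct user_regs_struct (those constants are presumably
// what the assembly in syshandler_amd64.S uses to access the saved
// registers). BUILD_BUG_ON turns any mismatch into a compile-time error.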
void verify_offsets_amd64() {
#define PTREGS_OFFSET offsetof(struct thread_context, ptregs)
  BUILD_BUG_ON(offsetof_thread_context_ptregs != PTREGS_OFFSET);
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r15 !=
               (offsetof(struct user_regs_struct, r15) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r14 !=
               (offsetof(struct user_regs_struct, r14) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r13 !=
               (offsetof(struct user_regs_struct, r13) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r12 !=
               (offsetof(struct user_regs_struct, r12) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rbp !=
               (offsetof(struct user_regs_struct, rbp) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rbx !=
               (offsetof(struct user_regs_struct, rbx) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r11 !=
               (offsetof(struct user_regs_struct, r11) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r10 !=
               (offsetof(struct user_regs_struct, r10) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r9 !=
               (offsetof(struct user_regs_struct, r9) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_r8 !=
               (offsetof(struct user_regs_struct, r8) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rax !=
               (offsetof(struct user_regs_struct, rax) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rcx !=
               (offsetof(struct user_regs_struct, rcx) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rdx !=
               (offsetof(struct user_regs_struct, rdx) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rsi !=
               (offsetof(struct user_regs_struct, rsi) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rdi !=
               (offsetof(struct user_regs_struct, rdi) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_orig_rax !=
               (offsetof(struct user_regs_struct, orig_rax) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rip !=
               (offsetof(struct user_regs_struct, rip) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_cs !=
               (offsetof(struct user_regs_struct, cs) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_eflags !=
               (offsetof(struct user_regs_struct, eflags) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_rsp !=
               (offsetof(struct user_regs_struct, rsp) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_ss !=
               (offsetof(struct user_regs_struct, ss) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_fs_base !=
               (offsetof(struct user_regs_struct, fs_base) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_gs_base !=
               (offsetof(struct user_regs_struct, gs_base) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_ds !=
               (offsetof(struct user_regs_struct, ds) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_es !=
               (offsetof(struct user_regs_struct, es) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_fs !=
               (offsetof(struct user_regs_struct, fs) + PTREGS_OFFSET));
  BUILD_BUG_ON(offsetof_thread_context_ptregs_gs !=
               (offsetof(struct user_regs_struct, gs) + PTREGS_OFFSET));
#undef PTREGS_OFFSET
}

// asm/sigcontext.h conflicts with signal.h.
struct __fpx_sw_bytes {
  uint32_t magic1;
  uint32_t extended_size;
  uint64_t xfeatures;
  uint32_t xstate_size;
  uint32_t padding[7];
};

struct __fpstate {
  uint16_t cwd;
  uint16_t swd;
  uint16_t twd;
  uint16_t fop;
  uint64_t rip;
  uint64_t rdp;
  uint32_t mxcsr;
  uint32_t mxcsr_mask;
  uint32_t st_space[32];
  uint32_t xmm_space[64];
  uint32_t reserved2[12];
  struct __fpx_sw_bytes sw_reserved;
};

// The kernel expects to see some additional info in an FPU state. More details
// can be found in arch/x86/kernel/fpu/signal.c:check_xstate_in_sigframe.
static void prep_fpstate_for_sigframe(void *buf, uint32_t user_size,
                                      bool use_xsave) {
  struct __fpstate *fpstate = buf;
  struct __fpx_sw_bytes *sw_bytes = &fpstate->sw_reserved;

  sw_bytes->magic1 = FP_XSTATE_MAGIC1;
  sw_bytes->extended_size = user_size + FP_XSTATE_MAGIC2_SIZE;
  sw_bytes->xfeatures = ~(0ULL) ^ (XCR0_DISABLED_MASK);
  sw_bytes->xstate_size = user_size;
  *(uint32_t *)(buf + user_size) = use_xsave ? FP_XSTATE_MAGIC2 : 0;
}
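
// For reference, the layout produced above (matching, as far as this sketch
// goes, what arch/x86/kernel/fpu/signal.c:check_xstate_in_sigframe expects):
//
//   buf + 0          legacy FXSAVE area (512 bytes), with sw_reserved at its
//                    tail describing the extended state (magic1, sizes,
//                    xfeatures)
//   ...              XSAVE header and extended area, when use_xsave
//   buf + user_size  FP_XSTATE_MAGIC2 trailer word (or 0 for plain FXSAVE)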