// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64 || 386
// +build amd64 386

package arch

import (
	"fmt"
	"io"

	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/cpuid"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/sentry/arch/fpu"
	rpb "github.com/metacubex/gvisor/pkg/sentry/arch/registers_go_proto"
)

// Registers represents the CPU registers for this architecture.
//
// +stateify savable
type Registers struct {
	linux.PtraceRegs
}

// System-related constants for x86.
const (
	// SyscallWidth is the width of syscall, sysenter, and int 80 instructions.
	SyscallWidth = 2
)

// EFLAGS register bits.
const (
	// eflagsCF is the mask for the carry flag.
	eflagsCF = uint64(1) << 0
	// eflagsPF is the mask for the parity flag.
	eflagsPF = uint64(1) << 2
	// eflagsAF is the mask for the auxiliary carry flag.
	eflagsAF = uint64(1) << 4
	// eflagsZF is the mask for the zero flag.
	eflagsZF = uint64(1) << 6
	// eflagsSF is the mask for the sign flag.
	eflagsSF = uint64(1) << 7
	// eflagsTF is the mask for the trap flag.
	eflagsTF = uint64(1) << 8
	// eflagsIF is the mask for the interrupt flag.
	eflagsIF = uint64(1) << 9
	// eflagsDF is the mask for the direction flag.
	eflagsDF = uint64(1) << 10
	// eflagsOF is the mask for the overflow flag.
	eflagsOF = uint64(1) << 11
	// eflagsIOPL is the mask for the I/O privilege level.
	eflagsIOPL = uint64(3) << 12
	// eflagsNT is the mask for the nested task bit.
	eflagsNT = uint64(1) << 14
	// eflagsRF is the mask for the resume flag.
	eflagsRF = uint64(1) << 16
	// eflagsVM is the mask for the virtual mode bit.
	eflagsVM = uint64(1) << 17
	// eflagsAC is the mask for the alignment check / access control bit.
	eflagsAC = uint64(1) << 18
	// eflagsVIF is the mask for the virtual interrupt flag.
	eflagsVIF = uint64(1) << 19
	// eflagsVIP is the mask for the virtual interrupt pending bit.
	eflagsVIP = uint64(1) << 20
	// eflagsID is the mask for the CPUID detection bit.
	eflagsID = uint64(1) << 21

	// eflagsPtraceMutable is the mask for the set of EFLAGS that may be
	// changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to
	// Linux's FLAG_MASK.
	eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT

	// eflagsRestorable is the mask for the set of EFLAGS that may be changed by
	// SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS.
	eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF
)

// Segment selectors. See arch/x86/include/asm/segment.h.
const (
	userCS   = 0x33 // guest ring 3 code selector
	user32CS = 0x23 // guest ring 3 32 bit code selector
	userDS   = 0x2b // guest ring 3 data selector

	_FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector
	_GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector
)

var (
	// TrapInstruction is the x86 trap instruction (INT3, 0xcc).
	TrapInstruction = [1]byte{0xcc}

	// CPUIDInstruction is the x86 CPUID instruction.
	CPUIDInstruction = [2]byte{0xf, 0xa2}

	// X86TrapFlag is an exported const for use by other packages.
	// It is the same bit as eflagsTF above.
	X86TrapFlag uint64 = (1 << 8)
)

// Proto returns a protobuf representation of the system registers in State.
func (s State) Proto() *rpb.Registers {
	regs := &rpb.AMD64Registers{
		Rax:     s.Regs.Rax,
		Rbx:     s.Regs.Rbx,
		Rcx:     s.Regs.Rcx,
		Rdx:     s.Regs.Rdx,
		Rsi:     s.Regs.Rsi,
		Rdi:     s.Regs.Rdi,
		Rsp:     s.Regs.Rsp,
		Rbp:     s.Regs.Rbp,
		R8:      s.Regs.R8,
		R9:      s.Regs.R9,
		R10:     s.Regs.R10,
		R11:     s.Regs.R11,
		R12:     s.Regs.R12,
		R13:     s.Regs.R13,
		R14:     s.Regs.R14,
		R15:     s.Regs.R15,
		Rip:     s.Regs.Rip,
		Rflags:  s.Regs.Eflags,
		OrigRax: s.Regs.Orig_rax,
		Cs:      s.Regs.Cs,
		Ds:      s.Regs.Ds,
		Es:      s.Regs.Es,
		Fs:      s.Regs.Fs,
		Gs:      s.Regs.Gs,
		Ss:      s.Regs.Ss,
		FsBase:  s.Regs.Fs_base,
		GsBase:  s.Regs.Gs_base,
	}
	return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}}
}

// Fork creates and returns an identical copy of the state.
func (s *State) Fork() State {
	return State{
		Regs:    s.Regs,
		fpState: s.fpState.Fork(),
	}
}

// StateData implements Context.StateData.
func (s *State) StateData() *State {
	return s
}

// SingleStep implements Context.SingleStep. It reports whether the trap
// flag is currently set in EFLAGS.
func (s *State) SingleStep() bool {
	return s.Regs.Eflags&X86TrapFlag != 0
}

// SetSingleStep enables single stepping.
func (s *State) SetSingleStep() {
	// Set the trap flag.
	s.Regs.Eflags |= X86TrapFlag
}

// ClearSingleStep disables single stepping.
func (s *State) ClearSingleStep() {
	// Clear the trap flag.
	s.Regs.Eflags &= ^X86TrapFlag
}

// RegisterMap returns a map of all registers.
func (s *State) RegisterMap() (map[string]uintptr, error) {
	return map[string]uintptr{
		"R15":      uintptr(s.Regs.R15),
		"R14":      uintptr(s.Regs.R14),
		"R13":      uintptr(s.Regs.R13),
		"R12":      uintptr(s.Regs.R12),
		"Rbp":      uintptr(s.Regs.Rbp),
		"Rbx":      uintptr(s.Regs.Rbx),
		"R11":      uintptr(s.Regs.R11),
		"R10":      uintptr(s.Regs.R10),
		"R9":       uintptr(s.Regs.R9),
		"R8":       uintptr(s.Regs.R8),
		"Rax":      uintptr(s.Regs.Rax),
		"Rcx":      uintptr(s.Regs.Rcx),
		"Rdx":      uintptr(s.Regs.Rdx),
		"Rsi":      uintptr(s.Regs.Rsi),
		"Rdi":      uintptr(s.Regs.Rdi),
		"Orig_rax": uintptr(s.Regs.Orig_rax),
		"Rip":      uintptr(s.Regs.Rip),
		"Cs":       uintptr(s.Regs.Cs),
		"Eflags":   uintptr(s.Regs.Eflags),
		"Rsp":      uintptr(s.Regs.Rsp),
		"Ss":       uintptr(s.Regs.Ss),
		"Fs_base":  uintptr(s.Regs.Fs_base),
		"Gs_base":  uintptr(s.Regs.Gs_base),
		"Ds":       uintptr(s.Regs.Ds),
		"Es":       uintptr(s.Regs.Es),
		"Fs":       uintptr(s.Regs.Fs),
		"Gs":       uintptr(s.Regs.Gs),
	}, nil
}

// PtraceGetRegs implements Context.PtraceGetRegs. It writes the (fixed-up,
// see ptraceGetRegs) register set to dst and returns the number of bytes
// written.
func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
	regs := s.ptraceGetRegs()
	n, err := regs.WriteTo(dst)
	return int(n), err
}

// ptraceGetRegs returns a copy of s.Regs with the fs/gs selectors adjusted
// to match what a Linux <4.7 tracee would report (see below).
func (s *State) ptraceGetRegs() Registers {
	regs := s.Regs
	// As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base
	// addresses using reserved descriptors in the GDT instead of the MSRs,
	// with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These
	// values are actually visible in struct user_regs_struct::fs/gs;
	// arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct
	// thread_struct::fsindex/gsindex.
	//
	// We always use fs == gs == 0 when fs_base/gs_base is in use, for
	// simplicity.
	//
	// Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via
	// arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a
	// 32-bit value and fsindex/gsindex indicates that this optimization is
	// in use, as well as the reverse case of setting fs/gs to
	// FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the
	// same in PtraceSetRegs.)
	//
	// TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux
	// doesn't have this behavior anymore.
	if regs.Fs == 0 && regs.Fs_base <= 0xffffffff {
		regs.Fs = _FS_TLS_SEL
	}
	if regs.Gs == 0 && regs.Gs_base <= 0xffffffff {
		regs.Gs = _GS_TLS_SEL
	}
	return regs
}

// ptraceRegistersSize is the wire size of linux.PtraceRegs in bytes.
var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()

// PtraceSetRegs implements Context.PtraceSetRegs. It reads a full register
// set from src, validates it the way Linux's putreg() would, and installs it
// as the task's registers. Returns the number of bytes consumed on success.
func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
	var regs Registers
	buf := make([]byte, ptraceRegistersSize)
	if _, err := io.ReadFull(src, buf); err != nil {
		return 0, err
	}
	regs.UnmarshalUnsafe(buf)
	// Truncate segment registers to 16 bits.
	regs.Cs = uint64(uint16(regs.Cs))
	regs.Ds = uint64(uint16(regs.Ds))
	regs.Es = uint64(uint16(regs.Es))
	regs.Fs = uint64(uint16(regs.Fs))
	regs.Gs = uint64(uint16(regs.Gs))
	regs.Ss = uint64(uint16(regs.Ss))
	// In Linux this validation is via arch/x86/kernel/ptrace.c:putreg().
	// CS and SS must be user selectors; DS/ES/FS/GS may additionally be 0
	// (the null selector).
	if !isUserSegmentSelector(regs.Cs) {
		return 0, unix.EIO
	}
	if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) {
		return 0, unix.EIO
	}
	if regs.Es != 0 && !isUserSegmentSelector(regs.Es) {
		return 0, unix.EIO
	}
	if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) {
		return 0, unix.EIO
	}
	if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) {
		return 0, unix.EIO
	}
	if !isUserSegmentSelector(regs.Ss) {
		return 0, unix.EIO
	}
	if !isValidSegmentBase(regs.Fs_base) {
		return 0, unix.EIO
	}
	if !isValidSegmentBase(regs.Gs_base) {
		return 0, unix.EIO
	}
	// CS and SS are validated, but changes to them are otherwise silently
	// ignored on amd64.
	regs.Cs = s.Regs.Cs
	regs.Ss = s.Regs.Ss
	// fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux.
	if regs.Fs_base != s.Regs.Fs_base {
		regs.Fs = 0
	}
	if regs.Gs_base != s.Regs.Gs_base {
		regs.Gs = 0
	}
	// Ignore "stale" TLS segment selectors for FS and GS. See comment in
	// ptraceGetRegs.
	if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 {
		regs.Fs = 0
	}
	if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 {
		regs.Gs = 0
	}
	// Only the ptrace-mutable EFLAGS bits may be changed by the tracer; all
	// other flag bits are preserved from the current state.
	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
	s.Regs = regs
	return ptraceRegistersSize, nil
}

// isUserSegmentSelector returns true if the given segment selector specifies a
// privilege level of 3 (USER_RPL).
func isUserSegmentSelector(reg uint64) bool {
	return reg&3 == 3
}

// isValidSegmentBase returns true if the given segment base specifies a
// canonical user address.
func isValidSegmentBase(reg uint64) bool {
	return reg < uint64(maxAddr64)
}

// Register sets defined in include/uapi/linux/elf.h.
const (
	_NT_PRSTATUS   = 1
	_NT_PRFPREG    = 2
	_NT_X86_XSTATE = 0x202
)

// PtraceGetRegSet implements Context.PtraceGetRegSet.
func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int, fs cpuid.FeatureSet) (int, error) {
	// Dispatch on the ELF note type (see include/uapi/linux/elf.h).
	switch regset {
	case _NT_PRSTATUS:
		if maxlen < ptraceRegistersSize {
			return 0, linuxerr.EFAULT
		}
		return s.PtraceGetRegs(dst)
	case _NT_PRFPREG:
		return s.fpState.PtraceGetFPRegs(dst, maxlen)
	case _NT_X86_XSTATE:
		return s.fpState.PtraceGetXstateRegs(dst, maxlen, fs)
	default:
		return 0, linuxerr.EINVAL
	}
}

// PtraceSetRegSet implements Context.PtraceSetRegSet.
func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int, fs cpuid.FeatureSet) (int, error) {
	// Dispatch mirrors PtraceGetRegSet.
	switch regset {
	case _NT_PRSTATUS:
		if maxlen < ptraceRegistersSize {
			return 0, linuxerr.EFAULT
		}
		return s.PtraceSetRegs(src)
	case _NT_PRFPREG:
		return s.fpState.PtraceSetFPRegs(src, maxlen)
	case _NT_X86_XSTATE:
		return s.fpState.PtraceSetXstateRegs(src, maxlen, fs)
	default:
		return 0, linuxerr.EINVAL
	}
}

// FullRestore indicates whether a full restore is required.
func (s *State) FullRestore() bool {
	// A fast system call return is possible only if
	//
	// * RCX matches the instruction pointer.
	// * R11 matches our flags value.
	// * Usermode does not expect to set either the resume flag or the
	//   virtual mode flags (unlikely.)
	// * CS and SS are set to the standard selectors.
	//
	// That is, SYSRET results in the correct final state.
	fastRestore := s.Regs.Rcx == s.Regs.Rip &&
		s.Regs.Eflags == s.Regs.R11 &&
		(s.Regs.Eflags&eflagsRF == 0) &&
		(s.Regs.Eflags&eflagsVM == 0) &&
		s.Regs.Cs == userCS &&
		s.Regs.Ss == userDS
	return !fastRestore
}

// New returns a new architecture context.
387 func New(arch Arch) *Context64 { 388 switch arch { 389 case AMD64: 390 return &Context64{ 391 State{ 392 fpState: fpu.NewState(), 393 // Set initial registers for compatibility with Linux 394 // (as done in arch/x86/kernel/process_64.c:start_thread()). 395 Regs: Registers{ 396 PtraceRegs: linux.PtraceRegs{ 397 Eflags: eflagsIF, 398 Cs: userCS, 399 Ss: userDS, 400 }, 401 }, 402 }, 403 } 404 } 405 panic(fmt.Sprintf("unknown architecture %v", arch)) 406 }