// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64 386

package arch

import (
	"fmt"
	"io"

	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
	rpb "github.com/SagerNet/gvisor/pkg/sentry/arch/registers_go_proto"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// Registers represents the CPU registers for this architecture.
//
// +stateify savable
type Registers struct {
	linux.PtraceRegs
}

// System-related constants for x86.
const (
	// SyscallWidth is the width of syscall, sysenter, and int 80 instructions.
	SyscallWidth = 2
)

// EFLAGS register bits.
const (
	// eflagsCF is the mask for the carry flag.
	eflagsCF = uint64(1) << 0
	// eflagsPF is the mask for the parity flag.
	eflagsPF = uint64(1) << 2
	// eflagsAF is the mask for the auxiliary carry flag.
	eflagsAF = uint64(1) << 4
	// eflagsZF is the mask for the zero flag.
	eflagsZF = uint64(1) << 6
	// eflagsSF is the mask for the sign flag.
	eflagsSF = uint64(1) << 7
	// eflagsTF is the mask for the trap flag.
	eflagsTF = uint64(1) << 8
	// eflagsIF is the mask for the interrupt flag.
	eflagsIF = uint64(1) << 9
	// eflagsDF is the mask for the direction flag.
	eflagsDF = uint64(1) << 10
	// eflagsOF is the mask for the overflow flag.
	eflagsOF = uint64(1) << 11
	// eflagsIOPL is the mask for the I/O privilege level (2 bits).
	eflagsIOPL = uint64(3) << 12
	// eflagsNT is the mask for the nested task bit.
	eflagsNT = uint64(1) << 14
	// eflagsRF is the mask for the resume flag.
	eflagsRF = uint64(1) << 16
	// eflagsVM is the mask for the virtual mode bit.
	eflagsVM = uint64(1) << 17
	// eflagsAC is the mask for the alignment check / access control bit.
	eflagsAC = uint64(1) << 18
	// eflagsVIF is the mask for the virtual interrupt flag.
	eflagsVIF = uint64(1) << 19
	// eflagsVIP is the mask for the virtual interrupt pending bit.
	eflagsVIP = uint64(1) << 20
	// eflagsID is the mask for the CPUID detection bit.
	eflagsID = uint64(1) << 21

	// eflagsPtraceMutable is the mask for the set of EFLAGS that may be
	// changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to
	// Linux's FLAG_MASK.
	eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT

	// eflagsRestorable is the mask for the set of EFLAGS that may be changed by
	// SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS.
	eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF
)

// Segment selectors. See arch/x86/include/asm/segment.h.
94 const ( 95 userCS = 0x33 // guest ring 3 code selector 96 user32CS = 0x23 // guest ring 3 32 bit code selector 97 userDS = 0x2b // guest ring 3 data selector 98 99 _FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector 100 _GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector 101 ) 102 103 var ( 104 // TrapInstruction is the x86 trap instruction. 105 TrapInstruction = [1]byte{0xcc} 106 107 // CPUIDInstruction is the x86 CPUID instruction. 108 CPUIDInstruction = [2]byte{0xf, 0xa2} 109 110 // X86TrapFlag is an exported const for use by other packages. 111 X86TrapFlag uint64 = (1 << 8) 112 ) 113 114 // Proto returns a protobuf representation of the system registers in State. 115 func (s State) Proto() *rpb.Registers { 116 regs := &rpb.AMD64Registers{ 117 Rax: s.Regs.Rax, 118 Rbx: s.Regs.Rbx, 119 Rcx: s.Regs.Rcx, 120 Rdx: s.Regs.Rdx, 121 Rsi: s.Regs.Rsi, 122 Rdi: s.Regs.Rdi, 123 Rsp: s.Regs.Rsp, 124 Rbp: s.Regs.Rbp, 125 R8: s.Regs.R8, 126 R9: s.Regs.R9, 127 R10: s.Regs.R10, 128 R11: s.Regs.R11, 129 R12: s.Regs.R12, 130 R13: s.Regs.R13, 131 R14: s.Regs.R14, 132 R15: s.Regs.R15, 133 Rip: s.Regs.Rip, 134 Rflags: s.Regs.Eflags, 135 OrigRax: s.Regs.Orig_rax, 136 Cs: s.Regs.Cs, 137 Ds: s.Regs.Ds, 138 Es: s.Regs.Es, 139 Fs: s.Regs.Fs, 140 Gs: s.Regs.Gs, 141 Ss: s.Regs.Ss, 142 FsBase: s.Regs.Fs_base, 143 GsBase: s.Regs.Gs_base, 144 } 145 return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}} 146 } 147 148 // Fork creates and returns an identical copy of the state. 149 func (s *State) Fork() State { 150 return State{ 151 Regs: s.Regs, 152 fpState: s.fpState.Fork(), 153 FeatureSet: s.FeatureSet, 154 } 155 } 156 157 // StateData implements Context.StateData. 158 func (s *State) StateData() *State { 159 return s 160 } 161 162 // CPUIDEmulate emulates a cpuid instruction. 
func (s *State) CPUIDEmulate(l log.Logger) {
	// CPUID takes its leaf/subleaf arguments in the low 32 bits of
	// RAX/RCX; the emulated results go back into RAX/RBX/RCX/RDX with the
	// upper 32 bits cleared.
	argax := uint32(s.Regs.Rax)
	argcx := uint32(s.Regs.Rcx)
	ax, bx, cx, dx := s.FeatureSet.EmulateID(argax, argcx)
	s.Regs.Rax = uint64(ax)
	s.Regs.Rbx = uint64(bx)
	s.Regs.Rcx = uint64(cx)
	s.Regs.Rdx = uint64(dx)
	l.Debugf("CPUID(%x,%x): %x %x %x %x", argax, argcx, ax, bx, cx, dx)
}

// SingleStep implements Context.SingleStep.
func (s *State) SingleStep() bool {
	return s.Regs.Eflags&X86TrapFlag != 0
}

// SetSingleStep enables single stepping.
func (s *State) SetSingleStep() {
	// Set the trap flag.
	s.Regs.Eflags |= X86TrapFlag
}

// ClearSingleStep disables single stepping.
func (s *State) ClearSingleStep() {
	// Clear the trap flag.
	s.Regs.Eflags &= ^X86TrapFlag
}

// RegisterMap returns a map of all registers.
func (s *State) RegisterMap() (map[string]uintptr, error) {
	// Keys mirror the Registers field names (i.e. Linux's
	// struct user_regs_struct layout).
	return map[string]uintptr{
		"R15": uintptr(s.Regs.R15),
		"R14": uintptr(s.Regs.R14),
		"R13": uintptr(s.Regs.R13),
		"R12": uintptr(s.Regs.R12),
		"Rbp": uintptr(s.Regs.Rbp),
		"Rbx": uintptr(s.Regs.Rbx),
		"R11": uintptr(s.Regs.R11),
		"R10": uintptr(s.Regs.R10),
		"R9":  uintptr(s.Regs.R9),
		"R8":  uintptr(s.Regs.R8),
		"Rax": uintptr(s.Regs.Rax),
		"Rcx": uintptr(s.Regs.Rcx),
		"Rdx": uintptr(s.Regs.Rdx),
		"Rsi": uintptr(s.Regs.Rsi),
		"Rdi": uintptr(s.Regs.Rdi),
		"Orig_rax": uintptr(s.Regs.Orig_rax),
		"Rip": uintptr(s.Regs.Rip),
		"Cs":  uintptr(s.Regs.Cs),
		"Eflags": uintptr(s.Regs.Eflags),
		"Rsp": uintptr(s.Regs.Rsp),
		"Ss":  uintptr(s.Regs.Ss),
		"Fs_base": uintptr(s.Regs.Fs_base),
		"Gs_base": uintptr(s.Regs.Gs_base),
		"Ds":  uintptr(s.Regs.Ds),
		"Es":  uintptr(s.Regs.Es),
		"Fs":  uintptr(s.Regs.Fs),
		"Gs":  uintptr(s.Regs.Gs),
	}, nil
}

// PtraceGetRegs implements Context.PtraceGetRegs.
func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
	regs := s.ptraceGetRegs()
	n, err := regs.WriteTo(dst)
	return int(n), err
}

// ptraceGetRegs returns a copy of s.Regs with segment state fixed up for
// export via ptrace(PTRACE_GETREGS).
func (s *State) ptraceGetRegs() Registers {
	regs := s.Regs
	// These may not be initialized.
	if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 {
		regs.Eflags = eflagsIF
		regs.Cs = userCS
		regs.Ss = userDS
	}
	// As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base
	// addresses using reserved descriptors in the GDT instead of the MSRs,
	// with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These
	// values are actually visible in struct user_regs_struct::fs/gs;
	// arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct
	// thread_struct::fsindex/gsindex.
	//
	// We always use fs == gs == 0 when fs_base/gs_base is in use, for
	// simplicity.
	//
	// Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via
	// arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a
	// 32-bit value and fsindex/gsindex indicates that this optimization is
	// in use, as well as the reverse case of setting fs/gs to
	// FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the
	// same in PtraceSetRegs.)
	//
	// TODO(github.com/SagerNet/issue/168): Remove this fixup since newer Linux
	// doesn't have this behavior anymore.
	if regs.Fs == 0 && regs.Fs_base <= 0xffffffff {
		regs.Fs = _FS_TLS_SEL
	}
	if regs.Gs == 0 && regs.Gs_base <= 0xffffffff {
		regs.Gs = _GS_TLS_SEL
	}
	return regs
}

// ptraceRegistersSize is the serialized size in bytes of linux.PtraceRegs.
var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()

// PtraceSetRegs implements Context.PtraceSetRegs.
func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
	var regs Registers
	buf := make([]byte, ptraceRegistersSize)
	if _, err := io.ReadFull(src, buf); err != nil {
		return 0, err
	}
	regs.UnmarshalUnsafe(buf)
	// Truncate segment registers to 16 bits.
	regs.Cs = uint64(uint16(regs.Cs))
	regs.Ds = uint64(uint16(regs.Ds))
	regs.Es = uint64(uint16(regs.Es))
	regs.Fs = uint64(uint16(regs.Fs))
	regs.Gs = uint64(uint16(regs.Gs))
	regs.Ss = uint64(uint16(regs.Ss))
	// In Linux this validation is via arch/x86/kernel/ptrace.c:putreg().
	// CS and SS must always be user selectors; DS/ES/FS/GS may
	// alternatively be 0 (unused).
	if !isUserSegmentSelector(regs.Cs) {
		return 0, unix.EIO
	}
	if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) {
		return 0, unix.EIO
	}
	if regs.Es != 0 && !isUserSegmentSelector(regs.Es) {
		return 0, unix.EIO
	}
	if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) {
		return 0, unix.EIO
	}
	if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) {
		return 0, unix.EIO
	}
	if !isUserSegmentSelector(regs.Ss) {
		return 0, unix.EIO
	}
	if !isValidSegmentBase(regs.Fs_base) {
		return 0, unix.EIO
	}
	if !isValidSegmentBase(regs.Gs_base) {
		return 0, unix.EIO
	}
	// CS and SS are validated, but changes to them are otherwise silently
	// ignored on amd64.
	regs.Cs = s.Regs.Cs
	regs.Ss = s.Regs.Ss
	// fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux.
	if regs.Fs_base != s.Regs.Fs_base {
		regs.Fs = 0
	}
	if regs.Gs_base != s.Regs.Gs_base {
		regs.Gs = 0
	}
	// Ignore "stale" TLS segment selectors for FS and GS. See comment in
	// ptraceGetRegs.
	if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 {
		regs.Fs = 0
	}
	if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 {
		regs.Gs = 0
	}
	// Only ptrace-mutable EFLAGS bits may be changed; every other bit keeps
	// its current value (mirrors Linux's FLAG_MASK handling).
	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
	s.Regs = regs
	return ptraceRegistersSize, nil
}

// isUserSegmentSelector returns true if the given segment selector specifies a
// privilege level of 3 (USER_RPL).
func isUserSegmentSelector(reg uint64) bool {
	return reg&3 == 3
}

// isValidSegmentBase returns true if the given segment base specifies a
// canonical user address.
func isValidSegmentBase(reg uint64) bool {
	return reg < uint64(maxAddr64)
}

// Register sets defined in include/uapi/linux/elf.h.
const (
	_NT_PRSTATUS   = 1
	_NT_PRFPREG    = 2
	_NT_X86_XSTATE = 0x202
)

// PtraceGetRegSet implements Context.PtraceGetRegSet.
func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
	switch regset {
	case _NT_PRSTATUS:
		// General-purpose registers require room for the full set.
		if maxlen < ptraceRegistersSize {
			return 0, syserror.EFAULT
		}
		return s.PtraceGetRegs(dst)
	case _NT_PRFPREG:
		return s.fpState.PtraceGetFPRegs(dst, maxlen)
	case _NT_X86_XSTATE:
		return s.fpState.PtraceGetXstateRegs(dst, maxlen, s.FeatureSet)
	default:
		return 0, linuxerr.EINVAL
	}
}

// PtraceSetRegSet implements Context.PtraceSetRegSet.
func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
	switch regset {
	case _NT_PRSTATUS:
		if maxlen < ptraceRegistersSize {
			return 0, syserror.EFAULT
		}
		return s.PtraceSetRegs(src)
	case _NT_PRFPREG:
		return s.fpState.PtraceSetFPRegs(src, maxlen)
	case _NT_X86_XSTATE:
		return s.fpState.PtraceSetXstateRegs(src, maxlen, s.FeatureSet)
	default:
		return 0, linuxerr.EINVAL
	}
}

// FullRestore indicates whether a full restore is required.
387 func (s *State) FullRestore() bool { 388 // A fast system call return is possible only if 389 // 390 // * RCX matches the instruction pointer. 391 // * R11 matches our flags value. 392 // * Usermode does not expect to set either the resume flag or the 393 // virtual mode flags (unlikely.) 394 // * CS and SS are set to the standard selectors. 395 // 396 // That is, SYSRET results in the correct final state. 397 fastRestore := s.Regs.Rcx == s.Regs.Rip && 398 s.Regs.Eflags == s.Regs.R11 && 399 (s.Regs.Eflags&eflagsRF == 0) && 400 (s.Regs.Eflags&eflagsVM == 0) && 401 s.Regs.Cs == userCS && 402 s.Regs.Ss == userDS 403 return !fastRestore 404 } 405 406 // New returns a new architecture context. 407 func New(arch Arch, fs *cpuid.FeatureSet) Context { 408 switch arch { 409 case AMD64: 410 return &context64{ 411 State{ 412 fpState: fpu.NewState(), 413 FeatureSet: fs, 414 }, 415 []fpu.State(nil), 416 } 417 } 418 panic(fmt.Sprintf("unknown architecture %v", arch)) 419 }