github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/arch/arch_x86.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build amd64 || 386 16 // +build amd64 386 17 18 package arch 19 20 import ( 21 "fmt" 22 "io" 23 24 "golang.org/x/sys/unix" 25 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 26 "github.com/nicocha30/gvisor-ligolo/pkg/cpuid" 27 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 28 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch/fpu" 29 rpb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch/registers_go_proto" 30 ) 31 32 // Registers represents the CPU registers for this architecture. 33 // 34 // +stateify savable 35 type Registers struct { 36 linux.PtraceRegs 37 } 38 39 // System-related constants for x86. 40 const ( 41 // SyscallWidth is the width of syscall, sysenter, and int 80 insturctions. 42 SyscallWidth = 2 43 ) 44 45 // EFLAGS register bits. 46 const ( 47 // eflagsCF is the mask for the carry flag. 48 eflagsCF = uint64(1) << 0 49 // eflagsPF is the mask for the parity flag. 50 eflagsPF = uint64(1) << 2 51 // eflagsAF is the mask for the auxiliary carry flag. 52 eflagsAF = uint64(1) << 4 53 // eflagsZF is the mask for the zero flag. 54 eflagsZF = uint64(1) << 6 55 // eflagsSF is the mask for the sign flag. 56 eflagsSF = uint64(1) << 7 57 // eflagsTF is the mask for the trap flag. 58 eflagsTF = uint64(1) << 8 59 // eflagsIF is the mask for the interrupt flag. 60 eflagsIF = uint64(1) << 9 61 // eflagsDF is the mask for the direction flag. 62 eflagsDF = uint64(1) << 10 63 // eflagsOF is the mask for the overflow flag. 64 eflagsOF = uint64(1) << 11 65 // eflagsIOPL is the mask for the I/O privilege level. 66 eflagsIOPL = uint64(3) << 12 67 // eflagsNT is the mask for the nested task bit. 68 eflagsNT = uint64(1) << 14 69 // eflagsRF is the mask for the resume flag. 70 eflagsRF = uint64(1) << 16 71 // eflagsVM is the mask for the virtual mode bit. 72 eflagsVM = uint64(1) << 17 73 // eflagsAC is the mask for the alignment check / access control bit. 74 eflagsAC = uint64(1) << 18 75 // eflagsVIF is the mask for the virtual interrupt flag. 76 eflagsVIF = uint64(1) << 19 77 // eflagsVIP is the mask for the virtual interrupt pending bit. 78 eflagsVIP = uint64(1) << 20 79 // eflagsID is the mask for the CPUID detection bit. 80 eflagsID = uint64(1) << 21 81 82 // eflagsPtraceMutable is the mask for the set of EFLAGS that may be 83 // changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to 84 // Linux's FLAG_MASK. 85 eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT 86 87 // eflagsRestorable is the mask for the set of EFLAGS that may be changed by 88 // SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS. 89 eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF 90 ) 91 92 // Segment selectors. See arch/x86/include/asm/segment.h. 93 const ( 94 userCS = 0x33 // guest ring 3 code selector 95 user32CS = 0x23 // guest ring 3 32 bit code selector 96 userDS = 0x2b // guest ring 3 data selector 97 98 _FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector 99 _GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector 100 ) 101 102 var ( 103 // TrapInstruction is the x86 trap instruction. 104 TrapInstruction = [1]byte{0xcc} 105 106 // CPUIDInstruction is the x86 CPUID instruction. 107 CPUIDInstruction = [2]byte{0xf, 0xa2} 108 109 // X86TrapFlag is an exported const for use by other packages. 110 X86TrapFlag uint64 = (1 << 8) 111 ) 112 113 // Proto returns a protobuf representation of the system registers in State. 114 func (s State) Proto() *rpb.Registers { 115 regs := &rpb.AMD64Registers{ 116 Rax: s.Regs.Rax, 117 Rbx: s.Regs.Rbx, 118 Rcx: s.Regs.Rcx, 119 Rdx: s.Regs.Rdx, 120 Rsi: s.Regs.Rsi, 121 Rdi: s.Regs.Rdi, 122 Rsp: s.Regs.Rsp, 123 Rbp: s.Regs.Rbp, 124 R8: s.Regs.R8, 125 R9: s.Regs.R9, 126 R10: s.Regs.R10, 127 R11: s.Regs.R11, 128 R12: s.Regs.R12, 129 R13: s.Regs.R13, 130 R14: s.Regs.R14, 131 R15: s.Regs.R15, 132 Rip: s.Regs.Rip, 133 Rflags: s.Regs.Eflags, 134 OrigRax: s.Regs.Orig_rax, 135 Cs: s.Regs.Cs, 136 Ds: s.Regs.Ds, 137 Es: s.Regs.Es, 138 Fs: s.Regs.Fs, 139 Gs: s.Regs.Gs, 140 Ss: s.Regs.Ss, 141 FsBase: s.Regs.Fs_base, 142 GsBase: s.Regs.Gs_base, 143 } 144 return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}} 145 } 146 147 // Fork creates and returns an identical copy of the state. 148 func (s *State) Fork() State { 149 return State{ 150 Regs: s.Regs, 151 fpState: s.fpState.Fork(), 152 } 153 } 154 155 // StateData implements Context.StateData. 156 func (s *State) StateData() *State { 157 return s 158 } 159 160 // SingleStep implements Context.SingleStep. 161 func (s *State) SingleStep() bool { 162 return s.Regs.Eflags&X86TrapFlag != 0 163 } 164 165 // SetSingleStep enables single stepping. 166 func (s *State) SetSingleStep() { 167 // Set the trap flag. 168 s.Regs.Eflags |= X86TrapFlag 169 } 170 171 // ClearSingleStep enables single stepping. 172 func (s *State) ClearSingleStep() { 173 // Clear the trap flag. 174 s.Regs.Eflags &= ^X86TrapFlag 175 } 176 177 // RegisterMap returns a map of all registers. 178 func (s *State) RegisterMap() (map[string]uintptr, error) { 179 return map[string]uintptr{ 180 "R15": uintptr(s.Regs.R15), 181 "R14": uintptr(s.Regs.R14), 182 "R13": uintptr(s.Regs.R13), 183 "R12": uintptr(s.Regs.R12), 184 "Rbp": uintptr(s.Regs.Rbp), 185 "Rbx": uintptr(s.Regs.Rbx), 186 "R11": uintptr(s.Regs.R11), 187 "R10": uintptr(s.Regs.R10), 188 "R9": uintptr(s.Regs.R9), 189 "R8": uintptr(s.Regs.R8), 190 "Rax": uintptr(s.Regs.Rax), 191 "Rcx": uintptr(s.Regs.Rcx), 192 "Rdx": uintptr(s.Regs.Rdx), 193 "Rsi": uintptr(s.Regs.Rsi), 194 "Rdi": uintptr(s.Regs.Rdi), 195 "Orig_rax": uintptr(s.Regs.Orig_rax), 196 "Rip": uintptr(s.Regs.Rip), 197 "Cs": uintptr(s.Regs.Cs), 198 "Eflags": uintptr(s.Regs.Eflags), 199 "Rsp": uintptr(s.Regs.Rsp), 200 "Ss": uintptr(s.Regs.Ss), 201 "Fs_base": uintptr(s.Regs.Fs_base), 202 "Gs_base": uintptr(s.Regs.Gs_base), 203 "Ds": uintptr(s.Regs.Ds), 204 "Es": uintptr(s.Regs.Es), 205 "Fs": uintptr(s.Regs.Fs), 206 "Gs": uintptr(s.Regs.Gs), 207 }, nil 208 } 209 210 // PtraceGetRegs implements Context.PtraceGetRegs. 211 func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { 212 regs := s.ptraceGetRegs() 213 n, err := regs.WriteTo(dst) 214 return int(n), err 215 } 216 217 func (s *State) ptraceGetRegs() Registers { 218 regs := s.Regs 219 // These may not be initialized. 220 if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 { 221 regs.Eflags = eflagsIF 222 regs.Cs = userCS 223 regs.Ss = userDS 224 } 225 // As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base 226 // addresses using reserved descriptors in the GDT instead of the MSRs, 227 // with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These 228 // values are actually visible in struct user_regs_struct::fs/gs; 229 // arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct 230 // thread_struct::fsindex/gsindex. 231 // 232 // We always use fs == gs == 0 when fs_base/gs_base is in use, for 233 // simplicity. 234 // 235 // Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via 236 // arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a 237 // 32-bit value and fsindex/gsindex indicates that this optimization is 238 // in use, as well as the reverse case of setting fs/gs to 239 // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the 240 // same in PtraceSetRegs.) 241 // 242 // TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux 243 // doesn't have this behavior anymore. 244 if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { 245 regs.Fs = _FS_TLS_SEL 246 } 247 if regs.Gs == 0 && regs.Gs_base <= 0xffffffff { 248 regs.Gs = _GS_TLS_SEL 249 } 250 return regs 251 } 252 253 var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() 254 255 // PtraceSetRegs implements Context.PtraceSetRegs. 256 func (s *State) PtraceSetRegs(src io.Reader) (int, error) { 257 var regs Registers 258 buf := make([]byte, ptraceRegistersSize) 259 if _, err := io.ReadFull(src, buf); err != nil { 260 return 0, err 261 } 262 regs.UnmarshalUnsafe(buf) 263 // Truncate segment registers to 16 bits. 264 regs.Cs = uint64(uint16(regs.Cs)) 265 regs.Ds = uint64(uint16(regs.Ds)) 266 regs.Es = uint64(uint16(regs.Es)) 267 regs.Fs = uint64(uint16(regs.Fs)) 268 regs.Gs = uint64(uint16(regs.Gs)) 269 regs.Ss = uint64(uint16(regs.Ss)) 270 // In Linux this validation is via arch/x86/kernel/ptrace.c:putreg(). 271 if !isUserSegmentSelector(regs.Cs) { 272 return 0, unix.EIO 273 } 274 if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) { 275 return 0, unix.EIO 276 } 277 if regs.Es != 0 && !isUserSegmentSelector(regs.Es) { 278 return 0, unix.EIO 279 } 280 if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) { 281 return 0, unix.EIO 282 } 283 if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) { 284 return 0, unix.EIO 285 } 286 if !isUserSegmentSelector(regs.Ss) { 287 return 0, unix.EIO 288 } 289 if !isValidSegmentBase(regs.Fs_base) { 290 return 0, unix.EIO 291 } 292 if !isValidSegmentBase(regs.Gs_base) { 293 return 0, unix.EIO 294 } 295 // CS and SS are validated, but changes to them are otherwise silently 296 // ignored on amd64. 297 regs.Cs = s.Regs.Cs 298 regs.Ss = s.Regs.Ss 299 // fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux. 300 if regs.Fs_base != s.Regs.Fs_base { 301 regs.Fs = 0 302 } 303 if regs.Gs_base != s.Regs.Gs_base { 304 regs.Gs = 0 305 } 306 // Ignore "stale" TLS segment selectors for FS and GS. See comment in 307 // ptraceGetRegs. 308 if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 { 309 regs.Fs = 0 310 } 311 if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 { 312 regs.Gs = 0 313 } 314 regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) 315 s.Regs = regs 316 return ptraceRegistersSize, nil 317 } 318 319 // isUserSegmentSelector returns true if the given segment selector specifies a 320 // privilege level of 3 (USER_RPL). 321 func isUserSegmentSelector(reg uint64) bool { 322 return reg&3 == 3 323 } 324 325 // isValidSegmentBase returns true if the given segment base specifies a 326 // canonical user address. 327 func isValidSegmentBase(reg uint64) bool { 328 return reg < uint64(maxAddr64) 329 } 330 331 // Register sets defined in include/uapi/linux/elf.h. 332 const ( 333 _NT_PRSTATUS = 1 334 _NT_PRFPREG = 2 335 _NT_X86_XSTATE = 0x202 336 ) 337 338 // PtraceGetRegSet implements Context.PtraceGetRegSet. 339 func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int, fs cpuid.FeatureSet) (int, error) { 340 switch regset { 341 case _NT_PRSTATUS: 342 if maxlen < ptraceRegistersSize { 343 return 0, linuxerr.EFAULT 344 } 345 return s.PtraceGetRegs(dst) 346 case _NT_PRFPREG: 347 return s.fpState.PtraceGetFPRegs(dst, maxlen) 348 case _NT_X86_XSTATE: 349 return s.fpState.PtraceGetXstateRegs(dst, maxlen, fs) 350 default: 351 return 0, linuxerr.EINVAL 352 } 353 } 354 355 // PtraceSetRegSet implements Context.PtraceSetRegSet. 356 func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int, fs cpuid.FeatureSet) (int, error) { 357 switch regset { 358 case _NT_PRSTATUS: 359 if maxlen < ptraceRegistersSize { 360 return 0, linuxerr.EFAULT 361 } 362 return s.PtraceSetRegs(src) 363 case _NT_PRFPREG: 364 return s.fpState.PtraceSetFPRegs(src, maxlen) 365 case _NT_X86_XSTATE: 366 return s.fpState.PtraceSetXstateRegs(src, maxlen, fs) 367 default: 368 return 0, linuxerr.EINVAL 369 } 370 } 371 372 // FullRestore indicates whether a full restore is required. 373 func (s *State) FullRestore() bool { 374 // A fast system call return is possible only if 375 // 376 // * RCX matches the instruction pointer. 377 // * R11 matches our flags value. 378 // * Usermode does not expect to set either the resume flag or the 379 // virtual mode flags (unlikely.) 380 // * CS and SS are set to the standard selectors. 381 // 382 // That is, SYSRET results in the correct final state. 383 fastRestore := s.Regs.Rcx == s.Regs.Rip && 384 s.Regs.Eflags == s.Regs.R11 && 385 (s.Regs.Eflags&eflagsRF == 0) && 386 (s.Regs.Eflags&eflagsVM == 0) && 387 s.Regs.Cs == userCS && 388 s.Regs.Ss == userDS 389 return !fastRestore 390 } 391 392 // New returns a new architecture context. 393 func New(arch Arch) *Context64 { 394 switch arch { 395 case AMD64: 396 return &Context64{ 397 State{ 398 fpState: fpu.NewState(), 399 }, 400 } 401 } 402 panic(fmt.Sprintf("unknown architecture %v", arch)) 403 }