github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/arch/arch_x86.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build amd64 || 386
    16  // +build amd64 386
    17  
    18  package arch
    19  
    20  import (
    21  	"fmt"
    22  	"io"
    23  
    24  	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
    25  	"github.com/ttpreport/gvisor-ligolo/pkg/cpuid"
    26  	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
    27  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/arch/fpu"
    28  	rpb "github.com/ttpreport/gvisor-ligolo/pkg/sentry/arch/registers_go_proto"
    29  	"golang.org/x/sys/unix"
    30  )
    31  
// Registers represents the CPU registers for this architecture.
//
// It embeds linux.PtraceRegs, so the register fields of that type (Rax, Rip,
// Eflags, Cs, Fs_base, ...) are accessed directly on a Registers value.
//
// +stateify savable
type Registers struct {
	linux.PtraceRegs
}
    38  
// System-related constants for x86.
const (
	// SyscallWidth is the width of the syscall, sysenter, and int 0x80
	// instructions.
	SyscallWidth = 2
)
    44  
// EFLAGS register bits. Each constant is a uint64 mask selecting one flag
// (or, for eflagsIOPL, a two-bit field) of the EFLAGS/RFLAGS register.
const (
	// eflagsCF is the mask for the carry flag (bit 0).
	eflagsCF = uint64(1) << 0
	// eflagsPF is the mask for the parity flag (bit 2).
	eflagsPF = uint64(1) << 2
	// eflagsAF is the mask for the auxiliary carry flag (bit 4).
	eflagsAF = uint64(1) << 4
	// eflagsZF is the mask for the zero flag (bit 6).
	eflagsZF = uint64(1) << 6
	// eflagsSF is the mask for the sign flag (bit 7).
	eflagsSF = uint64(1) << 7
	// eflagsTF is the mask for the trap flag (bit 8).
	eflagsTF = uint64(1) << 8
	// eflagsIF is the mask for the interrupt flag (bit 9).
	eflagsIF = uint64(1) << 9
	// eflagsDF is the mask for the direction flag (bit 10).
	eflagsDF = uint64(1) << 10
	// eflagsOF is the mask for the overflow flag (bit 11).
	eflagsOF = uint64(1) << 11
	// eflagsIOPL is the mask for the I/O privilege level (bits 12-13).
	eflagsIOPL = uint64(3) << 12
	// eflagsNT is the mask for the nested task bit (bit 14).
	eflagsNT = uint64(1) << 14
	// eflagsRF is the mask for the resume flag (bit 16).
	eflagsRF = uint64(1) << 16
	// eflagsVM is the mask for the virtual mode bit (bit 17).
	eflagsVM = uint64(1) << 17
	// eflagsAC is the mask for the alignment check / access control bit
	// (bit 18).
	eflagsAC = uint64(1) << 18
	// eflagsVIF is the mask for the virtual interrupt flag (bit 19).
	eflagsVIF = uint64(1) << 19
	// eflagsVIP is the mask for the virtual interrupt pending bit (bit 20).
	eflagsVIP = uint64(1) << 20
	// eflagsID is the mask for the CPUID detection bit (bit 21).
	eflagsID = uint64(1) << 21

	// eflagsPtraceMutable is the mask for the set of EFLAGS that may be
	// changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to
	// Linux's FLAG_MASK. Bits outside this mask keep their current value when
	// PtraceSetRegs installs a new register set.
	eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT

	// eflagsRestorable is the mask for the set of EFLAGS that may be changed by
	// SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS.
	eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF
)
    91  
// Segment selectors. See arch/x86/include/asm/segment.h.
//
// All of these values have their low two bits set, i.e. they satisfy
// isUserSegmentSelector.
const (
	userCS   = 0x33 // guest ring 3 code selector
	user32CS = 0x23 // guest ring 3 32 bit code selector
	userDS   = 0x2b // guest ring 3 data selector

	// The TLS selectors are reported by ptraceGetRegs and treated as "stale"
	// by PtraceSetRegs; see the comments in those functions.
	_FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector
	_GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector
)
   101  
var (
	// TrapInstruction is the x86 trap instruction (the single byte 0xcc).
	TrapInstruction = [1]byte{0xcc}

	// CPUIDInstruction is the x86 CPUID instruction (the two bytes 0x0f,
	// 0xa2).
	CPUIDInstruction = [2]byte{0xf, 0xa2}

	// X86TrapFlag is the EFLAGS trap flag mask (bit 8, the same value as
	// eflagsTF), exported for use by other packages. Note that it is declared
	// as a variable, not a constant.
	X86TrapFlag uint64 = (1 << 8)
)
   112  
   113  // Proto returns a protobuf representation of the system registers in State.
   114  func (s State) Proto() *rpb.Registers {
   115  	regs := &rpb.AMD64Registers{
   116  		Rax:     s.Regs.Rax,
   117  		Rbx:     s.Regs.Rbx,
   118  		Rcx:     s.Regs.Rcx,
   119  		Rdx:     s.Regs.Rdx,
   120  		Rsi:     s.Regs.Rsi,
   121  		Rdi:     s.Regs.Rdi,
   122  		Rsp:     s.Regs.Rsp,
   123  		Rbp:     s.Regs.Rbp,
   124  		R8:      s.Regs.R8,
   125  		R9:      s.Regs.R9,
   126  		R10:     s.Regs.R10,
   127  		R11:     s.Regs.R11,
   128  		R12:     s.Regs.R12,
   129  		R13:     s.Regs.R13,
   130  		R14:     s.Regs.R14,
   131  		R15:     s.Regs.R15,
   132  		Rip:     s.Regs.Rip,
   133  		Rflags:  s.Regs.Eflags,
   134  		OrigRax: s.Regs.Orig_rax,
   135  		Cs:      s.Regs.Cs,
   136  		Ds:      s.Regs.Ds,
   137  		Es:      s.Regs.Es,
   138  		Fs:      s.Regs.Fs,
   139  		Gs:      s.Regs.Gs,
   140  		Ss:      s.Regs.Ss,
   141  		FsBase:  s.Regs.Fs_base,
   142  		GsBase:  s.Regs.Gs_base,
   143  	}
   144  	return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}}
   145  }
   146  
   147  // Fork creates and returns an identical copy of the state.
   148  func (s *State) Fork() State {
   149  	return State{
   150  		Regs:    s.Regs,
   151  		fpState: s.fpState.Fork(),
   152  	}
   153  }
   154  
// StateData implements Context.StateData.
//
// It returns the receiver itself rather than a copy, so the caller shares the
// same State.
func (s *State) StateData() *State {
	return s
}
   159  
   160  // SingleStep implements Context.SingleStep.
   161  func (s *State) SingleStep() bool {
   162  	return s.Regs.Eflags&X86TrapFlag != 0
   163  }
   164  
   165  // SetSingleStep enables single stepping.
   166  func (s *State) SetSingleStep() {
   167  	// Set the trap flag.
   168  	s.Regs.Eflags |= X86TrapFlag
   169  }
   170  
   171  // ClearSingleStep enables single stepping.
   172  func (s *State) ClearSingleStep() {
   173  	// Clear the trap flag.
   174  	s.Regs.Eflags &= ^X86TrapFlag
   175  }
   176  
   177  // RegisterMap returns a map of all registers.
   178  func (s *State) RegisterMap() (map[string]uintptr, error) {
   179  	return map[string]uintptr{
   180  		"R15":      uintptr(s.Regs.R15),
   181  		"R14":      uintptr(s.Regs.R14),
   182  		"R13":      uintptr(s.Regs.R13),
   183  		"R12":      uintptr(s.Regs.R12),
   184  		"Rbp":      uintptr(s.Regs.Rbp),
   185  		"Rbx":      uintptr(s.Regs.Rbx),
   186  		"R11":      uintptr(s.Regs.R11),
   187  		"R10":      uintptr(s.Regs.R10),
   188  		"R9":       uintptr(s.Regs.R9),
   189  		"R8":       uintptr(s.Regs.R8),
   190  		"Rax":      uintptr(s.Regs.Rax),
   191  		"Rcx":      uintptr(s.Regs.Rcx),
   192  		"Rdx":      uintptr(s.Regs.Rdx),
   193  		"Rsi":      uintptr(s.Regs.Rsi),
   194  		"Rdi":      uintptr(s.Regs.Rdi),
   195  		"Orig_rax": uintptr(s.Regs.Orig_rax),
   196  		"Rip":      uintptr(s.Regs.Rip),
   197  		"Cs":       uintptr(s.Regs.Cs),
   198  		"Eflags":   uintptr(s.Regs.Eflags),
   199  		"Rsp":      uintptr(s.Regs.Rsp),
   200  		"Ss":       uintptr(s.Regs.Ss),
   201  		"Fs_base":  uintptr(s.Regs.Fs_base),
   202  		"Gs_base":  uintptr(s.Regs.Gs_base),
   203  		"Ds":       uintptr(s.Regs.Ds),
   204  		"Es":       uintptr(s.Regs.Es),
   205  		"Fs":       uintptr(s.Regs.Fs),
   206  		"Gs":       uintptr(s.Regs.Gs),
   207  	}, nil
   208  }
   209  
   210  // PtraceGetRegs implements Context.PtraceGetRegs.
   211  func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
   212  	regs := s.ptraceGetRegs()
   213  	n, err := regs.WriteTo(dst)
   214  	return int(n), err
   215  }
   216  
   217  func (s *State) ptraceGetRegs() Registers {
   218  	regs := s.Regs
   219  	// These may not be initialized.
   220  	if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 {
   221  		regs.Eflags = eflagsIF
   222  		regs.Cs = userCS
   223  		regs.Ss = userDS
   224  	}
   225  	// As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base
   226  	// addresses using reserved descriptors in the GDT instead of the MSRs,
   227  	// with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These
   228  	// values are actually visible in struct user_regs_struct::fs/gs;
   229  	// arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct
   230  	// thread_struct::fsindex/gsindex.
   231  	//
   232  	// We always use fs == gs == 0 when fs_base/gs_base is in use, for
   233  	// simplicity.
   234  	//
   235  	// Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via
   236  	// arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a
   237  	// 32-bit value and fsindex/gsindex indicates that this optimization is
   238  	// in use, as well as the reverse case of setting fs/gs to
   239  	// FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the
   240  	// same in PtraceSetRegs.)
   241  	//
   242  	// TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux
   243  	// doesn't have this behavior anymore.
   244  	if regs.Fs == 0 && regs.Fs_base <= 0xffffffff {
   245  		regs.Fs = _FS_TLS_SEL
   246  	}
   247  	if regs.Gs == 0 && regs.Gs_base <= 0xffffffff {
   248  		regs.Gs = _GS_TLS_SEL
   249  	}
   250  	return regs
   251  }
   252  
// ptraceRegistersSize is the value reported by linux.PtraceRegs.SizeBytes. It
// is the number of bytes PtraceSetRegs reads from its source, and the minimum
// length that PtraceGetRegSet/PtraceSetRegSet accept for _NT_PRSTATUS.
var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
   254  
   255  // PtraceSetRegs implements Context.PtraceSetRegs.
   256  func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
   257  	var regs Registers
   258  	buf := make([]byte, ptraceRegistersSize)
   259  	if _, err := io.ReadFull(src, buf); err != nil {
   260  		return 0, err
   261  	}
   262  	regs.UnmarshalUnsafe(buf)
   263  	// Truncate segment registers to 16 bits.
   264  	regs.Cs = uint64(uint16(regs.Cs))
   265  	regs.Ds = uint64(uint16(regs.Ds))
   266  	regs.Es = uint64(uint16(regs.Es))
   267  	regs.Fs = uint64(uint16(regs.Fs))
   268  	regs.Gs = uint64(uint16(regs.Gs))
   269  	regs.Ss = uint64(uint16(regs.Ss))
   270  	// In Linux this validation is via arch/x86/kernel/ptrace.c:putreg().
   271  	if !isUserSegmentSelector(regs.Cs) {
   272  		return 0, unix.EIO
   273  	}
   274  	if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) {
   275  		return 0, unix.EIO
   276  	}
   277  	if regs.Es != 0 && !isUserSegmentSelector(regs.Es) {
   278  		return 0, unix.EIO
   279  	}
   280  	if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) {
   281  		return 0, unix.EIO
   282  	}
   283  	if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) {
   284  		return 0, unix.EIO
   285  	}
   286  	if !isUserSegmentSelector(regs.Ss) {
   287  		return 0, unix.EIO
   288  	}
   289  	if !isValidSegmentBase(regs.Fs_base) {
   290  		return 0, unix.EIO
   291  	}
   292  	if !isValidSegmentBase(regs.Gs_base) {
   293  		return 0, unix.EIO
   294  	}
   295  	// CS and SS are validated, but changes to them are otherwise silently
   296  	// ignored on amd64.
   297  	regs.Cs = s.Regs.Cs
   298  	regs.Ss = s.Regs.Ss
   299  	// fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux.
   300  	if regs.Fs_base != s.Regs.Fs_base {
   301  		regs.Fs = 0
   302  	}
   303  	if regs.Gs_base != s.Regs.Gs_base {
   304  		regs.Gs = 0
   305  	}
   306  	// Ignore "stale" TLS segment selectors for FS and GS. See comment in
   307  	// ptraceGetRegs.
   308  	if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 {
   309  		regs.Fs = 0
   310  	}
   311  	if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 {
   312  		regs.Gs = 0
   313  	}
   314  	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
   315  	s.Regs = regs
   316  	return ptraceRegistersSize, nil
   317  }
   318  
   319  // isUserSegmentSelector returns true if the given segment selector specifies a
   320  // privilege level of 3 (USER_RPL).
   321  func isUserSegmentSelector(reg uint64) bool {
   322  	return reg&3 == 3
   323  }
   324  
   325  // isValidSegmentBase returns true if the given segment base specifies a
   326  // canonical user address.
   327  func isValidSegmentBase(reg uint64) bool {
   328  	return reg < uint64(maxAddr64)
   329  }
   330  
// Register sets defined in include/uapi/linux/elf.h. These are the regset
// identifiers accepted by PtraceGetRegSet and PtraceSetRegSet.
const (
	// _NT_PRSTATUS selects the general purpose registers.
	_NT_PRSTATUS = 1
	// _NT_PRFPREG selects the floating point registers.
	_NT_PRFPREG = 2
	// _NT_X86_XSTATE selects the extended (xstate) registers.
	_NT_X86_XSTATE = 0x202
)
   337  
   338  // PtraceGetRegSet implements Context.PtraceGetRegSet.
   339  func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int, fs cpuid.FeatureSet) (int, error) {
   340  	switch regset {
   341  	case _NT_PRSTATUS:
   342  		if maxlen < ptraceRegistersSize {
   343  			return 0, linuxerr.EFAULT
   344  		}
   345  		return s.PtraceGetRegs(dst)
   346  	case _NT_PRFPREG:
   347  		return s.fpState.PtraceGetFPRegs(dst, maxlen)
   348  	case _NT_X86_XSTATE:
   349  		return s.fpState.PtraceGetXstateRegs(dst, maxlen, fs)
   350  	default:
   351  		return 0, linuxerr.EINVAL
   352  	}
   353  }
   354  
   355  // PtraceSetRegSet implements Context.PtraceSetRegSet.
   356  func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int, fs cpuid.FeatureSet) (int, error) {
   357  	switch regset {
   358  	case _NT_PRSTATUS:
   359  		if maxlen < ptraceRegistersSize {
   360  			return 0, linuxerr.EFAULT
   361  		}
   362  		return s.PtraceSetRegs(src)
   363  	case _NT_PRFPREG:
   364  		return s.fpState.PtraceSetFPRegs(src, maxlen)
   365  	case _NT_X86_XSTATE:
   366  		return s.fpState.PtraceSetXstateRegs(src, maxlen, fs)
   367  	default:
   368  		return 0, linuxerr.EINVAL
   369  	}
   370  }
   371  
   372  // FullRestore indicates whether a full restore is required.
   373  func (s *State) FullRestore() bool {
   374  	// A fast system call return is possible only if
   375  	//
   376  	//	* RCX matches the instruction pointer.
   377  	//	* R11 matches our flags value.
   378  	//	* Usermode does not expect to set either the resume flag or the
   379  	//   virtual mode flags (unlikely.)
   380  	//	* CS and SS are set to the standard selectors.
   381  	//
   382  	// That is, SYSRET results in the correct final state.
   383  	fastRestore := s.Regs.Rcx == s.Regs.Rip &&
   384  		s.Regs.Eflags == s.Regs.R11 &&
   385  		(s.Regs.Eflags&eflagsRF == 0) &&
   386  		(s.Regs.Eflags&eflagsVM == 0) &&
   387  		s.Regs.Cs == userCS &&
   388  		s.Regs.Ss == userDS
   389  	return !fastRestore
   390  }
   391  
   392  // New returns a new architecture context.
   393  func New(arch Arch) *Context64 {
   394  	switch arch {
   395  	case AMD64:
   396  		return &Context64{
   397  			State{
   398  				fpState: fpu.NewState(),
   399  			},
   400  		}
   401  	}
   402  	panic(fmt.Sprintf("unknown architecture %v", arch))
   403  }