github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/arch/arch_x86.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // +build amd64 386
    16  
    17  package arch
    18  
    19  import (
    20  	"fmt"
    21  	"io"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    25  	"github.com/SagerNet/gvisor/pkg/cpuid"
    26  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    27  	"github.com/SagerNet/gvisor/pkg/log"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
    29  	rpb "github.com/SagerNet/gvisor/pkg/sentry/arch/registers_go_proto"
    30  	"github.com/SagerNet/gvisor/pkg/syserror"
    31  )
    32  
// Registers represents the CPU registers for this architecture.
//
// It embeds linux.PtraceRegs, so the individual register fields (Rax, Rip,
// Eflags, Cs, Fs_base, ...) are promoted and can be accessed directly on a
// Registers value.
//
// +stateify savable
type Registers struct {
	linux.PtraceRegs
}
    39  
// System-related constants for x86.
const (
	// SyscallWidth is the width, in bytes, of the syscall, sysenter, and
	// int 0x80 instructions.
	SyscallWidth = 2
)
    45  
    46  // EFLAGS register bits.
    47  const (
    48  	// eflagsCF is the mask for the carry flag.
    49  	eflagsCF = uint64(1) << 0
    50  	// eflagsPF is the mask for the parity flag.
    51  	eflagsPF = uint64(1) << 2
    52  	// eflagsAF is the mask for the auxiliary carry flag.
    53  	eflagsAF = uint64(1) << 4
    54  	// eflagsZF is the mask for the zero flag.
    55  	eflagsZF = uint64(1) << 6
    56  	// eflagsSF is the mask for the sign flag.
    57  	eflagsSF = uint64(1) << 7
    58  	// eflagsTF is the mask for the trap flag.
    59  	eflagsTF = uint64(1) << 8
    60  	// eflagsIF is the mask for the interrupt flag.
    61  	eflagsIF = uint64(1) << 9
    62  	// eflagsDF is the mask for the direction flag.
    63  	eflagsDF = uint64(1) << 10
    64  	// eflagsOF is the mask for the overflow flag.
    65  	eflagsOF = uint64(1) << 11
    66  	// eflagsIOPL is the mask for the I/O privilege level.
    67  	eflagsIOPL = uint64(3) << 12
    68  	// eflagsNT is the mask for the nested task bit.
    69  	eflagsNT = uint64(1) << 14
    70  	// eflagsRF is the mask for the resume flag.
    71  	eflagsRF = uint64(1) << 16
    72  	// eflagsVM is the mask for the virtual mode bit.
    73  	eflagsVM = uint64(1) << 17
    74  	// eflagsAC is the mask for the alignment check / access control bit.
    75  	eflagsAC = uint64(1) << 18
    76  	// eflagsVIF is the mask for the virtual interrupt flag.
    77  	eflagsVIF = uint64(1) << 19
    78  	// eflagsVIP is the mask for the virtual interrupt pending bit.
    79  	eflagsVIP = uint64(1) << 20
    80  	// eflagsID is the mask for the CPUID detection bit.
    81  	eflagsID = uint64(1) << 21
    82  
    83  	// eflagsPtraceMutable is the mask for the set of EFLAGS that may be
    84  	// changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to
    85  	// Linux's FLAG_MASK.
    86  	eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT
    87  
    88  	// eflagsRestorable is the mask for the set of EFLAGS that may be changed by
    89  	// SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS.
    90  	eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF
    91  )
    92  
// Segment selectors. See arch/x86/include/asm/segment.h.
//
// The low two bits of every selector below are set, i.e. each one requests
// privilege level 3 and therefore satisfies isUserSegmentSelector.
const (
	userCS   = 0x33 // guest ring 3 code selector
	user32CS = 0x23 // guest ring 3 32 bit code selector
	userDS   = 0x2b // guest ring 3 data selector

	// The TLS selectors are only used by the ptrace register fixups in
	// ptraceGetRegs and PtraceSetRegs.
	_FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector
	_GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector
)
   102  
var (
	// TrapInstruction is the x86 trap instruction (the single byte 0xcc).
	TrapInstruction = [1]byte{0xcc}

	// CPUIDInstruction is the x86 CPUID instruction (the two bytes 0x0f,
	// 0xa2).
	CPUIDInstruction = [2]byte{0xf, 0xa2}

	// X86TrapFlag is the EFLAGS trap flag mask (bit 8, the same value as
	// eflagsTF), exported for use by other packages. Note that it is
	// declared as a variable rather than a constant.
	X86TrapFlag uint64 = (1 << 8)
)
   113  
   114  // Proto returns a protobuf representation of the system registers in State.
   115  func (s State) Proto() *rpb.Registers {
   116  	regs := &rpb.AMD64Registers{
   117  		Rax:     s.Regs.Rax,
   118  		Rbx:     s.Regs.Rbx,
   119  		Rcx:     s.Regs.Rcx,
   120  		Rdx:     s.Regs.Rdx,
   121  		Rsi:     s.Regs.Rsi,
   122  		Rdi:     s.Regs.Rdi,
   123  		Rsp:     s.Regs.Rsp,
   124  		Rbp:     s.Regs.Rbp,
   125  		R8:      s.Regs.R8,
   126  		R9:      s.Regs.R9,
   127  		R10:     s.Regs.R10,
   128  		R11:     s.Regs.R11,
   129  		R12:     s.Regs.R12,
   130  		R13:     s.Regs.R13,
   131  		R14:     s.Regs.R14,
   132  		R15:     s.Regs.R15,
   133  		Rip:     s.Regs.Rip,
   134  		Rflags:  s.Regs.Eflags,
   135  		OrigRax: s.Regs.Orig_rax,
   136  		Cs:      s.Regs.Cs,
   137  		Ds:      s.Regs.Ds,
   138  		Es:      s.Regs.Es,
   139  		Fs:      s.Regs.Fs,
   140  		Gs:      s.Regs.Gs,
   141  		Ss:      s.Regs.Ss,
   142  		FsBase:  s.Regs.Fs_base,
   143  		GsBase:  s.Regs.Gs_base,
   144  	}
   145  	return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}}
   146  }
   147  
   148  // Fork creates and returns an identical copy of the state.
   149  func (s *State) Fork() State {
   150  	return State{
   151  		Regs:       s.Regs,
   152  		fpState:    s.fpState.Fork(),
   153  		FeatureSet: s.FeatureSet,
   154  	}
   155  }
   156  
// StateData implements Context.StateData.
//
// It returns the receiver itself rather than a copy, so the caller observes
// (and may modify) the live state.
func (s *State) StateData() *State {
	return s
}
   161  
   162  // CPUIDEmulate emulates a cpuid instruction.
   163  func (s *State) CPUIDEmulate(l log.Logger) {
   164  	argax := uint32(s.Regs.Rax)
   165  	argcx := uint32(s.Regs.Rcx)
   166  	ax, bx, cx, dx := s.FeatureSet.EmulateID(argax, argcx)
   167  	s.Regs.Rax = uint64(ax)
   168  	s.Regs.Rbx = uint64(bx)
   169  	s.Regs.Rcx = uint64(cx)
   170  	s.Regs.Rdx = uint64(dx)
   171  	l.Debugf("CPUID(%x,%x): %x %x %x %x", argax, argcx, ax, bx, cx, dx)
   172  }
   173  
   174  // SingleStep implements Context.SingleStep.
   175  func (s *State) SingleStep() bool {
   176  	return s.Regs.Eflags&X86TrapFlag != 0
   177  }
   178  
   179  // SetSingleStep enables single stepping.
   180  func (s *State) SetSingleStep() {
   181  	// Set the trap flag.
   182  	s.Regs.Eflags |= X86TrapFlag
   183  }
   184  
   185  // ClearSingleStep enables single stepping.
   186  func (s *State) ClearSingleStep() {
   187  	// Clear the trap flag.
   188  	s.Regs.Eflags &= ^X86TrapFlag
   189  }
   190  
   191  // RegisterMap returns a map of all registers.
   192  func (s *State) RegisterMap() (map[string]uintptr, error) {
   193  	return map[string]uintptr{
   194  		"R15":      uintptr(s.Regs.R15),
   195  		"R14":      uintptr(s.Regs.R14),
   196  		"R13":      uintptr(s.Regs.R13),
   197  		"R12":      uintptr(s.Regs.R12),
   198  		"Rbp":      uintptr(s.Regs.Rbp),
   199  		"Rbx":      uintptr(s.Regs.Rbx),
   200  		"R11":      uintptr(s.Regs.R11),
   201  		"R10":      uintptr(s.Regs.R10),
   202  		"R9":       uintptr(s.Regs.R9),
   203  		"R8":       uintptr(s.Regs.R8),
   204  		"Rax":      uintptr(s.Regs.Rax),
   205  		"Rcx":      uintptr(s.Regs.Rcx),
   206  		"Rdx":      uintptr(s.Regs.Rdx),
   207  		"Rsi":      uintptr(s.Regs.Rsi),
   208  		"Rdi":      uintptr(s.Regs.Rdi),
   209  		"Orig_rax": uintptr(s.Regs.Orig_rax),
   210  		"Rip":      uintptr(s.Regs.Rip),
   211  		"Cs":       uintptr(s.Regs.Cs),
   212  		"Eflags":   uintptr(s.Regs.Eflags),
   213  		"Rsp":      uintptr(s.Regs.Rsp),
   214  		"Ss":       uintptr(s.Regs.Ss),
   215  		"Fs_base":  uintptr(s.Regs.Fs_base),
   216  		"Gs_base":  uintptr(s.Regs.Gs_base),
   217  		"Ds":       uintptr(s.Regs.Ds),
   218  		"Es":       uintptr(s.Regs.Es),
   219  		"Fs":       uintptr(s.Regs.Fs),
   220  		"Gs":       uintptr(s.Regs.Gs),
   221  	}, nil
   222  }
   223  
   224  // PtraceGetRegs implements Context.PtraceGetRegs.
   225  func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
   226  	regs := s.ptraceGetRegs()
   227  	n, err := regs.WriteTo(dst)
   228  	return int(n), err
   229  }
   230  
   231  func (s *State) ptraceGetRegs() Registers {
   232  	regs := s.Regs
   233  	// These may not be initialized.
   234  	if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 {
   235  		regs.Eflags = eflagsIF
   236  		regs.Cs = userCS
   237  		regs.Ss = userDS
   238  	}
   239  	// As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base
   240  	// addresses using reserved descriptors in the GDT instead of the MSRs,
   241  	// with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These
   242  	// values are actually visible in struct user_regs_struct::fs/gs;
   243  	// arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct
   244  	// thread_struct::fsindex/gsindex.
   245  	//
   246  	// We always use fs == gs == 0 when fs_base/gs_base is in use, for
   247  	// simplicity.
   248  	//
   249  	// Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via
   250  	// arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a
   251  	// 32-bit value and fsindex/gsindex indicates that this optimization is
   252  	// in use, as well as the reverse case of setting fs/gs to
   253  	// FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the
   254  	// same in PtraceSetRegs.)
   255  	//
   256  	// TODO(github.com/SagerNet/issue/168): Remove this fixup since newer Linux
   257  	// doesn't have this behavior anymore.
   258  	if regs.Fs == 0 && regs.Fs_base <= 0xffffffff {
   259  		regs.Fs = _FS_TLS_SEL
   260  	}
   261  	if regs.Gs == 0 && regs.Gs_base <= 0xffffffff {
   262  		regs.Gs = _GS_TLS_SEL
   263  	}
   264  	return regs
   265  }
   266  
// ptraceRegistersSize is the size reported by linux.PtraceRegs.SizeBytes. It
// is used as the read buffer size in PtraceSetRegs and as the minimum length
// accepted for the _NT_PRSTATUS register set.
//
// SizeBytes is invoked on a nil pointer here, so it presumably does not
// dereference its receiver.
var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes()
   268  
   269  // PtraceSetRegs implements Context.PtraceSetRegs.
   270  func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
   271  	var regs Registers
   272  	buf := make([]byte, ptraceRegistersSize)
   273  	if _, err := io.ReadFull(src, buf); err != nil {
   274  		return 0, err
   275  	}
   276  	regs.UnmarshalUnsafe(buf)
   277  	// Truncate segment registers to 16 bits.
   278  	regs.Cs = uint64(uint16(regs.Cs))
   279  	regs.Ds = uint64(uint16(regs.Ds))
   280  	regs.Es = uint64(uint16(regs.Es))
   281  	regs.Fs = uint64(uint16(regs.Fs))
   282  	regs.Gs = uint64(uint16(regs.Gs))
   283  	regs.Ss = uint64(uint16(regs.Ss))
   284  	// In Linux this validation is via arch/x86/kernel/ptrace.c:putreg().
   285  	if !isUserSegmentSelector(regs.Cs) {
   286  		return 0, unix.EIO
   287  	}
   288  	if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) {
   289  		return 0, unix.EIO
   290  	}
   291  	if regs.Es != 0 && !isUserSegmentSelector(regs.Es) {
   292  		return 0, unix.EIO
   293  	}
   294  	if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) {
   295  		return 0, unix.EIO
   296  	}
   297  	if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) {
   298  		return 0, unix.EIO
   299  	}
   300  	if !isUserSegmentSelector(regs.Ss) {
   301  		return 0, unix.EIO
   302  	}
   303  	if !isValidSegmentBase(regs.Fs_base) {
   304  		return 0, unix.EIO
   305  	}
   306  	if !isValidSegmentBase(regs.Gs_base) {
   307  		return 0, unix.EIO
   308  	}
   309  	// CS and SS are validated, but changes to them are otherwise silently
   310  	// ignored on amd64.
   311  	regs.Cs = s.Regs.Cs
   312  	regs.Ss = s.Regs.Ss
   313  	// fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux.
   314  	if regs.Fs_base != s.Regs.Fs_base {
   315  		regs.Fs = 0
   316  	}
   317  	if regs.Gs_base != s.Regs.Gs_base {
   318  		regs.Gs = 0
   319  	}
   320  	// Ignore "stale" TLS segment selectors for FS and GS. See comment in
   321  	// ptraceGetRegs.
   322  	if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 {
   323  		regs.Fs = 0
   324  	}
   325  	if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 {
   326  		regs.Gs = 0
   327  	}
   328  	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
   329  	s.Regs = regs
   330  	return ptraceRegistersSize, nil
   331  }
   332  
   333  // isUserSegmentSelector returns true if the given segment selector specifies a
   334  // privilege level of 3 (USER_RPL).
   335  func isUserSegmentSelector(reg uint64) bool {
   336  	return reg&3 == 3
   337  }
   338  
   339  // isValidSegmentBase returns true if the given segment base specifies a
   340  // canonical user address.
   341  func isValidSegmentBase(reg uint64) bool {
   342  	return reg < uint64(maxAddr64)
   343  }
   344  
// Register sets defined in include/uapi/linux/elf.h.
//
// These are the regset values accepted by PtraceGetRegSet and
// PtraceSetRegSet.
const (
	// _NT_PRSTATUS selects the general-purpose registers handled by
	// PtraceGetRegs and PtraceSetRegs.
	_NT_PRSTATUS   = 1
	// _NT_PRFPREG selects the floating point registers handled by the
	// fpState PtraceGetFPRegs and PtraceSetFPRegs methods.
	_NT_PRFPREG    = 2
	// _NT_X86_XSTATE selects the extended state handled by the fpState
	// PtraceGetXstateRegs and PtraceSetXstateRegs methods.
	_NT_X86_XSTATE = 0x202
)
   351  
   352  // PtraceGetRegSet implements Context.PtraceGetRegSet.
   353  func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
   354  	switch regset {
   355  	case _NT_PRSTATUS:
   356  		if maxlen < ptraceRegistersSize {
   357  			return 0, syserror.EFAULT
   358  		}
   359  		return s.PtraceGetRegs(dst)
   360  	case _NT_PRFPREG:
   361  		return s.fpState.PtraceGetFPRegs(dst, maxlen)
   362  	case _NT_X86_XSTATE:
   363  		return s.fpState.PtraceGetXstateRegs(dst, maxlen, s.FeatureSet)
   364  	default:
   365  		return 0, linuxerr.EINVAL
   366  	}
   367  }
   368  
   369  // PtraceSetRegSet implements Context.PtraceSetRegSet.
   370  func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
   371  	switch regset {
   372  	case _NT_PRSTATUS:
   373  		if maxlen < ptraceRegistersSize {
   374  			return 0, syserror.EFAULT
   375  		}
   376  		return s.PtraceSetRegs(src)
   377  	case _NT_PRFPREG:
   378  		return s.fpState.PtraceSetFPRegs(src, maxlen)
   379  	case _NT_X86_XSTATE:
   380  		return s.fpState.PtraceSetXstateRegs(src, maxlen, s.FeatureSet)
   381  	default:
   382  		return 0, linuxerr.EINVAL
   383  	}
   384  }
   385  
   386  // FullRestore indicates whether a full restore is required.
   387  func (s *State) FullRestore() bool {
   388  	// A fast system call return is possible only if
   389  	//
   390  	// * RCX matches the instruction pointer.
   391  	// * R11 matches our flags value.
   392  	// * Usermode does not expect to set either the resume flag or the
   393  	//   virtual mode flags (unlikely.)
   394  	// * CS and SS are set to the standard selectors.
   395  	//
   396  	// That is, SYSRET results in the correct final state.
   397  	fastRestore := s.Regs.Rcx == s.Regs.Rip &&
   398  		s.Regs.Eflags == s.Regs.R11 &&
   399  		(s.Regs.Eflags&eflagsRF == 0) &&
   400  		(s.Regs.Eflags&eflagsVM == 0) &&
   401  		s.Regs.Cs == userCS &&
   402  		s.Regs.Ss == userDS
   403  	return !fastRestore
   404  }
   405  
   406  // New returns a new architecture context.
   407  func New(arch Arch, fs *cpuid.FeatureSet) Context {
   408  	switch arch {
   409  	case AMD64:
   410  		return &context64{
   411  			State{
   412  				fpState:    fpu.NewState(),
   413  				FeatureSet: fs,
   414  			},
   415  			[]fpu.State(nil),
   416  		}
   417  	}
   418  	panic(fmt.Sprintf("unknown architecture %v", arch))
   419  }