github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/arch/arch_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package arch

import (
	"bytes"
	"fmt"
	"math/rand"

	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/marshal"
	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"golang.org/x/sys/unix"
)

// Host specifies the host architecture.
const Host = AMD64

// These constants come directly from Linux.
const (
	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
	// for a 64-bit process.
	maxAddr64 hostarch.Addr = (1 << 47) - hostarch.PageSize

	// maxStackRand64 is the maximum randomization to apply to the stack.
	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
	maxStackRand64 = 16 << 30 // 16 GiB

	// maxMmapRand64 is the maximum randomization to apply to the mmap
	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
	maxMmapRand64 = (1 << 28) * hostarch.PageSize

	// minGap64 is the minimum gap to leave at the top of the address space
	// for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
	minGap64 = (128 << 20) + maxStackRand64

	// preferredPIELoadAddr is the standard Linux position-independent
	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
	//
	// The Platform {Min,Max}UserAddress() may preclude loading at this
	// address. See other preferredFoo comments below.
	preferredPIELoadAddr hostarch.Addr = maxAddr64 / 3 * 2
)
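
// As a quick sanity sketch (editorial, not normative), with 4 KiB pages
// the constants above work out to:
//
//	maxAddr64            = 0x800000000000 - 0x1000 = 0x7ffffffff000
//	maxMmapRand64        = (1 << 28) * 4096        = 1 << 40 (1 TiB)
//	minGap64             = 0x8000000 + 0x400000000 = 0x408000000
//	preferredPIELoadAddr = 0x7ffffffff000 / 3 * 2  = 0x555555554aaa
//
// 0x5555... is the familiar region where Linux loads PIE binaries on
// x86-64.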

// These constants are selected as heuristics to help make the Platform's
// potentially limited address space conform as closely to Linux as possible.
const (
	// Select a preferred minimum TopDownBase address.
	//
	// Some applications (TSAN and other *SANs) are very particular about
	// the way the Linux mmap allocator lays out the address space.
	//
	// TSAN in particular expects top down allocations to be made in the
	// range [0x7e8000000000, 0x800000000000).
	//
	// The minimum TopDownBase on Linux would be:
	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
	//
	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
	//
	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
	// allocations below TopDownBase.
	//
	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
	//
	// Of course, there is no hard minimum to allocation; an allocator can
	// search all the way from TopDownBase to Min. However, TSAN declared
	// their range "good enough".
	//
	// We would like to pick a TopDownBase such that it is unlikely that an
	// allocator will select an address below TSAN's minimum. We achieve
	// this by trying to leave a sizable gap below TopDownBase.
	//
	// This is all "preferred" because the layout min/max address may not
	// allow us to select such a TopDownBase, in which case we have to fall
	// back to a layout that TSAN may not be happy with.
	preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000
	preferredAllocationGap                 = 128 << 30 // 128 GiB
	preferredTopDownBaseMin                = preferredTopDownAllocMin + preferredAllocationGap

	// minMmapRand64 is the smallest we are willing to make the
	// randomization to stay above preferredTopDownBaseMin.
	minMmapRand64 = (1 << 26) * hostarch.PageSize
)
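
// Spelling out the heuristics above (an editorial sketch, again assuming
// 4 KiB pages):
//
//	preferredAllocationGap  = 128 << 30        = 0x2000000000 (128 GiB)
//	preferredTopDownBaseMin = 0x7e8000000000 + 0x2000000000
//	                        = 0x7ea000000000
//	minMmapRand64           = (1 << 26) * 4096 = 1 << 38 (256 GiB)
//
// So a layout honoring the preference keeps TopDownBase at or above
// 0x7ea000000000, leaving at least 128 GiB of headroom over TSAN's
// 0x7e8000000000 floor.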

// context64 represents an AMD64 context.
//
// +stateify savable
type context64 struct {
	State
	sigFPState []fpu.State // fpstate to be restored on sigreturn.
}

// Arch implements Context.Arch.
func (c *context64) Arch() Arch {
	return AMD64
}

// copySigFPState returns a copy of the signal floating point state stack,
// forking each saved fpu.State.
func (c *context64) copySigFPState() []fpu.State {
	var sigfps []fpu.State
	for _, s := range c.sigFPState {
		sigfps = append(sigfps, s.Fork())
	}
	return sigfps
}

// FloatingPointData implements Context.FloatingPointData.
func (c *context64) FloatingPointData() *fpu.State {
	return &c.State.fpState
}

// Fork returns an exact copy of this context.
func (c *context64) Fork() Context {
	return &context64{
		State:      c.State.Fork(),
		sigFPState: c.copySigFPState(),
	}
}

// Return returns the current syscall return value.
func (c *context64) Return() uintptr {
	return uintptr(c.Regs.Rax)
}

// SetReturn sets the syscall return value.
func (c *context64) SetReturn(value uintptr) {
	c.Regs.Rax = uint64(value)
}

// IP returns the current instruction pointer.
func (c *context64) IP() uintptr {
	return uintptr(c.Regs.Rip)
}

// SetIP sets the current instruction pointer.
func (c *context64) SetIP(value uintptr) {
	c.Regs.Rip = uint64(value)
}

// Stack returns the current stack pointer.
func (c *context64) Stack() uintptr {
	return uintptr(c.Regs.Rsp)
}

// SetStack sets the current stack pointer.
func (c *context64) SetStack(value uintptr) {
	c.Regs.Rsp = uint64(value)
}

// TLS returns the current TLS pointer.
func (c *context64) TLS() uintptr {
	return uintptr(c.Regs.Fs_base)
}

// SetTLS sets the current TLS pointer. Returns false if value is invalid.
func (c *context64) SetTLS(value uintptr) bool {
	if !isValidSegmentBase(uint64(value)) {
		return false
	}

	c.Regs.Fs = 0
	c.Regs.Fs_base = uint64(value)
	return true
}

// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
	c.Regs.R10 = uint64(value)
}

// Native returns the native type for the given val.
func (c *context64) Native(val uintptr) marshal.Marshallable {
	v := primitive.Uint64(val)
	return &v
}

// Value returns the generic val for the given native type.
func (c *context64) Value(val marshal.Marshallable) uintptr {
	return uintptr(*val.(*primitive.Uint64))
}
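
// An illustrative round trip through Native/Value (editorial sketch):
//
//	m := c.Native(0xdead) // *primitive.Uint64, ready for marshalling
//	v := c.Value(m)       // uintptr(0xdead) again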

// Width returns the byte width of this architecture.
func (c *context64) Width() uint {
	return 8
}

// FeatureSet returns the FeatureSet in use.
func (c *context64) FeatureSet() *cpuid.FeatureSet {
	return c.State.FeatureSet
}

// mmapRand returns a random adjustment for randomizing an mmap layout.
func mmapRand(max uint64) hostarch.Addr {
	return hostarch.Addr(rand.Int63n(int64(max))).RoundDown()
}
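
// For example (editorial note), mmapRand(maxMmapRand64) yields a
// page-aligned offset in [0, 1 TiB), while mmapRand(uint64(minMmapRand64))
// stays within [0, 256 GiB).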

// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
func (c *context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) {
	min, ok := min.RoundUp()
	if !ok {
		return MmapLayout{}, unix.EINVAL
	}
	if max > maxAddr64 {
		max = maxAddr64
	}
	max = max.RoundDown()

	if min > max {
		return MmapLayout{}, unix.EINVAL
	}

	stackSize := r.Get(limits.Stack)

	// MAX_GAP in Linux.
	maxGap := (max / 6) * 5
	gap := hostarch.Addr(stackSize.Cur)
	if gap < minGap64 {
		gap = minGap64
	}
	if gap > maxGap {
		gap = maxGap
	}
	defaultDir := MmapTopDown
	if stackSize.Cur == limits.Infinity {
		defaultDir = MmapBottomUp
	}

	topDownMin := max - gap - maxMmapRand64
	maxRand := hostarch.Addr(maxMmapRand64)
	if topDownMin < preferredTopDownBaseMin {
		// Try to keep TopDownBase above preferredTopDownBaseMin by
		// shrinking maxRand.
		maxAdjust := maxRand - minMmapRand64
		needAdjust := preferredTopDownBaseMin - topDownMin
		if needAdjust <= maxAdjust {
			maxRand -= needAdjust
		}
	}

	rnd := mmapRand(uint64(maxRand))
	l := MmapLayout{
		MinAddr: min,
		MaxAddr: max,
		// TASK_UNMAPPED_BASE in Linux.
		BottomUpBase:     (max/3 + rnd).RoundDown(),
		TopDownBase:      (max - gap - rnd).RoundDown(),
		DefaultDirection: defaultDir,
		// We may have reduced the maximum randomization to keep
		// TopDownBase above preferredTopDownBaseMin while maintaining
		// our stack gap. Stack allocations must use that max
		// randomization to avoid eating into the gap.
		MaxStackRand: uint64(maxRand),
	}

	// Final sanity check on the layout.
	if !l.Valid() {
		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
	}

	return l, nil
}
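
// A minimal usage sketch (editorial; assumes the caller holds a
// *limits.LimitSet, e.g. from limits.NewLimitSet()):
//
//	var c context64
//	l, err := c.NewMmapLayout(0, maxAddr64, lim)
//	if err != nil {
//		return err // min/max were unusable, e.g. min > max
//	}
//	// l.TopDownBase sits gap+rnd below l.MaxAddr, and stack placement
//	// may consume up to l.MaxStackRand bytes of randomization.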

// PIELoadAddress implements Context.PIELoadAddress.
func (c *context64) PIELoadAddress(l MmapLayout) hostarch.Addr {
	base := preferredPIELoadAddr
	max, ok := base.AddLength(maxMmapRand64)
	if !ok {
		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
	}

	if max > l.MaxAddr {
		// preferredPIELoadAddr won't fit; fall back to the standard
		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
		//
		// Don't bother trying to shrink the randomization for now.
		base = l.TopDownBase / 3 * 2
	}

	return base + mmapRand(maxMmapRand64)
}
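
// Concretely (editorial sketch): with the whole canonical lower half
// available, max = 0x555555554aaa + 1 TiB = 0x565555554aaa, which is
// below maxAddr64, so the preferred base is kept. A Platform with a
// smaller MaxAddr instead gets TopDownBase / 3 * 2 plus the same
// randomization.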

// userStructSize is the size in bytes of Linux's struct user on amd64.
const userStructSize = 928

// PtracePeekUser implements Context.PtracePeekUser.
func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
	if addr&7 != 0 || addr >= userStructSize {
		return nil, unix.EIO
	}
	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
	// u_debugreg, returning 0 or silently no-oping for other fields
	// respectively.
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		return c.Native(uintptr(hostarch.ByteOrder.Uint64(buf[addr:]))), nil
	}
	// Note: x86 debug registers are missing.
	return c.Native(0), nil
}

// PtracePokeUser implements Context.PtracePokeUser.
func (c *context64) PtracePokeUser(addr, data uintptr) error {
	if addr&7 != 0 || addr >= userStructSize {
		return unix.EIO
	}
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		hostarch.ByteOrder.PutUint64(buf[addr:], uint64(data))
		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
		return err
	}
	// Note: x86 debug registers are missing.
	return nil
}
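
// An illustrative peek/poke of RAX (editorial sketch, assuming offsets
// into struct user mirror Linux's user_regs_struct, in which rax is the
// 11th 8-byte slot after r15, r14, r13, r12, rbp, rbx, r11, r10, r9 and
// r8, i.e. byte offset 80):
//
//	v, err := c.PtracePeekUser(80) // read Rax
//	err = c.PtracePokeUser(80, 42) // set Rax to 42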