github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/arch/arch_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64
// +build amd64

package arch

import (
	"bytes"
	"fmt"
	"math/rand"

	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/marshal"
	"github.com/MerlinKodo/gvisor/pkg/marshal/primitive"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch/fpu"
	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
	"golang.org/x/sys/unix"
)

// Host specifies the host architecture.
const Host = AMD64

// These constants come directly from Linux.
const (
	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
	// for a 64-bit process.
	maxAddr64 hostarch.Addr = (1 << 47) - hostarch.PageSize

	// maxStackRand64 is the maximum randomization to apply to the stack.
	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
	maxStackRand64 = 16 << 30 // 16 GB

	// maxMmapRand64 is the maximum randomization to apply to the mmap
	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
	maxMmapRand64 = (1 << 28) * hostarch.PageSize

	// minGap64 is the minimum gap to leave at the top of the address space
	// for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
	minGap64 = (128 << 20) + maxStackRand64

	// preferredPIELoadAddr is the standard Linux position-independent
	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
	//
	// The Platform {Min,Max}UserAddress() may preclude loading at this
	// address. See other preferredFoo comments below.
	preferredPIELoadAddr hostarch.Addr = maxAddr64 / 3 * 2
)
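
// exampleAddressSpaceConstants is a hedged illustration added alongside this
// file (not part of the original source): it prints the concrete values the
// constants above work out to. The hex values in the trailing comments are
// plain arithmetic, shown for orientation only.
func exampleAddressSpaceConstants() {
	fmt.Printf("maxAddr64            = %#x\n", uint64(maxAddr64))            // 0x7ffffffff000 = 2^47 - 4 KiB
	fmt.Printf("maxMmapRand64        = %#x\n", uint64(maxMmapRand64))        // 0x10000000000 = 1 TiB
	fmt.Printf("minGap64             = %#x\n", uint64(minGap64))             // 0x408000000 = 128 MiB + 16 GiB
	fmt.Printf("preferredPIELoadAddr = %#x\n", uint64(preferredPIELoadAddr)) // 0x555555554aaa
}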

// These constants are selected as heuristics to help make the Platform's
// potentially limited address space conform as closely to Linux as possible.
const (
	// Select a preferred minimum TopDownBase address.
	//
	// Some applications (TSAN and other *SANs) are very particular about
	// the way the Linux mmap allocator lays out the address space.
	//
	// TSAN in particular expects top down allocations to be made in the
	// range [0x7e8000000000, 0x800000000000).
	//
	// The minimum TopDownBase on Linux would be:
	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
	//
	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
	//
	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
	// allocations below TopDownBase.
	//
	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
	//
	// Of course, there is no hard minimum to allocation; an allocator can
	// search all the way from TopDownBase to Min. However, TSAN declared
	// their range "good enough".
	//
	// We would like to pick a TopDownBase such that it is unlikely that an
	// allocator will select an address below TSAN's minimum. We achieve
	// this by trying to leave a sizable gap below TopDownBase.
	//
	// This is all "preferred" because the layout min/max address may not
	// allow us to select such a TopDownBase, in which case we have to fall
	// back to a layout that TSAN may not be happy with.
	preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000
	preferredAllocationGap                 = 128 << 30 // 128 GB
	preferredTopDownBaseMin                = preferredTopDownAllocMin + preferredAllocationGap

	// minMmapRand64 is the smallest we are willing to make the mmap
	// randomization while still keeping TopDownBase above
	// preferredTopDownBaseMin.
	minMmapRand64 = (1 << 26) * hostarch.PageSize
)
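
// Hedged arithmetic note added alongside this file (not in the original
// source): preferredTopDownBaseMin works out to 0x7e8000000000 + (128 << 30)
// = 0x7ea000000000, and minMmapRand64 to (1 << 26) * 4096 = 1 << 38
// (256 GiB), a quarter of maxMmapRand64's 1 TiB.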

// Context64 represents an AMD64 context.
//
// +stateify savable
type Context64 struct {
	State
}

// Arch implements Context.Arch.
func (c *Context64) Arch() Arch {
	return AMD64
}

// FloatingPointData returns the state of the floating-point unit.
func (c *Context64) FloatingPointData() *fpu.State {
	return &c.State.fpState
}

// Fork returns an exact copy of this context.
func (c *Context64) Fork() *Context64 {
	return &Context64{
		State: c.State.Fork(),
	}
}

// Return returns the current syscall return value.
func (c *Context64) Return() uintptr {
	return uintptr(c.Regs.Rax)
}

// SetReturn sets the syscall return value.
func (c *Context64) SetReturn(value uintptr) {
	c.Regs.Rax = uint64(value)
}

// IP returns the current instruction pointer.
func (c *Context64) IP() uintptr {
	return uintptr(c.Regs.Rip)
}

// SetIP sets the current instruction pointer.
func (c *Context64) SetIP(value uintptr) {
	c.Regs.Rip = uint64(value)
}

// Stack returns the current stack pointer.
func (c *Context64) Stack() uintptr {
	return uintptr(c.Regs.Rsp)
}

// SetStack sets the current stack pointer.
func (c *Context64) SetStack(value uintptr) {
	c.Regs.Rsp = uint64(value)
}

// TLS returns the current TLS pointer.
func (c *Context64) TLS() uintptr {
	return uintptr(c.Regs.Fs_base)
}

// SetTLS sets the current TLS pointer. Returns false if value is invalid.
func (c *Context64) SetTLS(value uintptr) bool {
	if !isValidSegmentBase(uint64(value)) {
		return false
	}

	c.Regs.Fs = 0
	c.Regs.Fs_base = uint64(value)
	return true
}

// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
func (c *Context64) SetOldRSeqInterruptedIP(value uintptr) {
	c.Regs.R10 = uint64(value)
}

// Native returns the native type for the given val.
func (c *Context64) Native(val uintptr) marshal.Marshallable {
	v := primitive.Uint64(val)
	return &v
}

// Value returns the generic val for the given native type.
func (c *Context64) Value(val marshal.Marshallable) uintptr {
	return uintptr(*val.(*primitive.Uint64))
}

// Width returns the byte width of this architecture.
func (c *Context64) Width() uint {
	return 8
}

// mmapRand returns a random adjustment for randomizing an mmap layout. The
// result is rounded down to remain page-aligned.
func mmapRand(max uint64) hostarch.Addr {
	return hostarch.Addr(rand.Int63n(int64(max))).RoundDown()
}

// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
func (c *Context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) {
	min, ok := min.RoundUp()
	if !ok {
		return MmapLayout{}, unix.EINVAL
	}
	if max > maxAddr64 {
		max = maxAddr64
	}
	max = max.RoundDown()

	if min > max {
		return MmapLayout{}, unix.EINVAL
	}

	stackSize := r.Get(limits.Stack)

	// MAX_GAP in Linux.
	maxGap := (max / 6) * 5
	gap := hostarch.Addr(stackSize.Cur)
	if gap < minGap64 {
		gap = minGap64
	}
	if gap > maxGap {
		gap = maxGap
	}
	defaultDir := MmapTopDown
	if stackSize.Cur == limits.Infinity {
		defaultDir = MmapBottomUp
	}

	topDownMin := max - gap - maxMmapRand64
	maxRand := hostarch.Addr(maxMmapRand64)
	if topDownMin < preferredTopDownBaseMin {
		// Try to keep TopDownBase above preferredTopDownBaseMin by
		// shrinking maxRand.
		maxAdjust := maxRand - minMmapRand64
		needAdjust := preferredTopDownBaseMin - topDownMin
		if needAdjust <= maxAdjust {
			maxRand -= needAdjust
		}
	}

	rnd := mmapRand(uint64(maxRand))
	l := MmapLayout{
		MinAddr: min,
		MaxAddr: max,
		// TASK_UNMAPPED_BASE in Linux.
		BottomUpBase:     (max/3 + rnd).RoundDown(),
		TopDownBase:      (max - gap - rnd).RoundDown(),
		DefaultDirection: defaultDir,
		// We may have reduced the maximum randomization to keep
		// TopDownBase above preferredTopDownBaseMin while maintaining
		// our stack gap. Stack allocations must use that max
		// randomization to avoid eating into the gap.
		MaxStackRand: uint64(maxRand),
	}

	// Final sanity check on the layout.
	if !l.Valid() {
		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
	}

	return l, nil
}
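
// exampleNewMmapLayout is a hedged sketch added alongside this file (not part
// of the original source), assuming limits.NewLimitSet and
// (*limits.LimitSet).SetUnchecked behave as elsewhere in this tree. It walks
// the layout computation for the full 47-bit address space with the common
// 8 MiB stack rlimit.
func exampleNewMmapLayout(c *Context64) {
	ls := limits.NewLimitSet()
	ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8 << 20, Max: limits.Infinity})

	l, err := c.NewMmapLayout(0, maxAddr64, ls)
	if err != nil {
		panic(err)
	}

	// With an 8 MiB stack, gap clamps up to minGap64, so TopDownBase lands
	// in [max - gap - maxMmapRand64, max - gap], i.e. at or above
	// 0x7efbf7fff000. That is already above preferredTopDownBaseMin
	// (0x7ea000000000), so no shrinking of the randomization is needed.
	fmt.Printf("layout: %+v\n", l)
}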

// PIELoadAddress implements Context.PIELoadAddress.
func (c *Context64) PIELoadAddress(l MmapLayout) hostarch.Addr {
	base := preferredPIELoadAddr
	max, ok := base.AddLength(maxMmapRand64)
	if !ok {
		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
	}

	if max > l.MaxAddr {
		// preferredPIELoadAddr won't fit; fall back to the standard
		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
		//
		// Don't bother trying to shrink the randomization for now.
		base = l.TopDownBase / 3 * 2
	}

	return base + mmapRand(maxMmapRand64)
}
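
// Hedged arithmetic note added alongside this file (not in the original
// source): with the full address space, base + maxMmapRand64 =
// 0x555555554aaa + 0x10000000000 = 0x565555554aaa, which is below maxAddr64,
// so the TSAN-unfriendly fallback above only triggers when the Platform caps
// MaxAddr well under the usual 47-bit limit.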

// userStructSize is the size in bytes of Linux's struct user on amd64.
const userStructSize = 928

// PtracePeekUser implements Context.PtracePeekUser.
func (c *Context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
	if addr&7 != 0 || addr >= userStructSize {
		return nil, unix.EIO
	}
	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
	// u_debugreg, returning 0 or silently no-oping for other fields
	// respectively.
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		return c.Native(uintptr(hostarch.ByteOrder.Uint64(buf[addr:]))), nil
	}
	// Note: x86 debug registers are missing.
	return c.Native(0), nil
}

// PtracePokeUser implements Context.PtracePokeUser.
func (c *Context64) PtracePokeUser(addr, data uintptr) error {
	if addr&7 != 0 || addr >= userStructSize {
		return unix.EIO
	}
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		hostarch.ByteOrder.PutUint64(buf[addr:], uint64(data))
		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
		return err
	}
	// Note: x86 debug registers are missing.
	return nil
}
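
// examplePtracePeekRIP is a hedged sketch added alongside this file (not part
// of the original source): it reads RIP through the struct-user interface
// above. On amd64, rip is the 17th 8-byte field of user_regs_struct, so it
// sits at byte offset 16*8 = 128, well within ptraceRegistersSize.
func examplePtracePeekRIP(c *Context64) uintptr {
	v, err := c.PtracePeekUser(16 * 8) // offset of regs.rip within struct user
	if err != nil {
		panic(err)
	}
	return c.Value(v) // equals c.IP()
}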