github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/arch/arch_amd64.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build amd64 16 // +build amd64 17 18 package arch 19 20 import ( 21 "bytes" 22 "fmt" 23 "math/rand" 24 25 "golang.org/x/sys/unix" 26 "github.com/MerlinKodo/gvisor/pkg/hostarch" 27 "github.com/MerlinKodo/gvisor/pkg/marshal" 28 "github.com/MerlinKodo/gvisor/pkg/marshal/primitive" 29 "github.com/MerlinKodo/gvisor/pkg/sentry/arch/fpu" 30 "github.com/MerlinKodo/gvisor/pkg/sentry/limits" 31 ) 32 33 // Host specifies the host architecture. 34 const Host = AMD64 35 36 // These constants come directly from Linux. 37 const ( 38 // maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux 39 // for a 64-bit process. 40 maxAddr64 hostarch.Addr = (1 << 47) - hostarch.PageSize 41 42 // maxStackRand64 is the maximum randomization to apply to the stack. 43 // It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux. 44 maxStackRand64 = 16 << 30 // 16 GB 45 46 // maxMmapRand64 is the maximum randomization to apply to the mmap 47 // layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux. 48 maxMmapRand64 = (1 << 28) * hostarch.PageSize 49 50 // minGap64 is the minimum gap to leave at the top of the address space 51 // for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux. 52 minGap64 = (128 << 20) + maxStackRand64 53 54 // preferredPIELoadAddr is the standard Linux position-independent 55 // executable base load address. It is ELF_ET_DYN_BASE in Linux. 56 // 57 // The Platform {Min,Max}UserAddress() may preclude loading at this 58 // address. See other preferredFoo comments below. 59 preferredPIELoadAddr hostarch.Addr = maxAddr64 / 3 * 2 60 ) 61 62 // These constants are selected as heuristics to help make the Platform's 63 // potentially limited address space conform as closely to Linux as possible. 64 const ( 65 // Select a preferred minimum TopDownBase address. 66 // 67 // Some applications (TSAN and other *SANs) are very particular about 68 // the way the Linux mmap allocator layouts out the address space. 69 // 70 // TSAN in particular expects top down allocations to be made in the 71 // range [0x7e8000000000, 0x800000000000). 72 // 73 // The minimum TopDownBase on Linux would be: 74 // 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000. 75 // 76 // (minGap64 because TSAN uses a small RLIMIT_STACK.) 77 // 78 // 0x7e8000000000 is selected arbitrarily by TSAN to leave room for 79 // allocations below TopDownBase. 80 // 81 // N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all 82 // the way down to 0x10007fff8000, and MSAN down to 0x700000000000. 83 // 84 // Of course, there is no hard minimum to allocation; an allocator can 85 // search all the way from TopDownBase to Min. However, TSAN declared 86 // their range "good enough". 87 // 88 // We would like to pick a TopDownBase such that it is unlikely that an 89 // allocator will select an address below TSAN's minimum. We achieve 90 // this by trying to leave a sizable gap below TopDownBase. 91 // 92 // This is all "preferred" because the layout min/max address may not 93 // allow us to select such a TopDownBase, in which case we have to fall 94 // back to a layout that TSAN may not be happy with. 95 preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000 96 preferredAllocationGap = 128 << 30 // 128 GB 97 preferredTopDownBaseMin = preferredTopDownAllocMin + preferredAllocationGap 98 99 // minMmapRand64 is the smallest we are willing to make the 100 // randomization to stay above preferredTopDownBaseMin. 101 minMmapRand64 = (1 << 26) * hostarch.PageSize 102 ) 103 104 // Context64 represents an AMD64 context. 105 // 106 // +stateify savable 107 type Context64 struct { 108 State 109 } 110 111 // Arch implements Context.Arch. 112 func (c *Context64) Arch() Arch { 113 return AMD64 114 } 115 116 // FloatingPointData returns the state of the floating-point unit. 117 func (c *Context64) FloatingPointData() *fpu.State { 118 return &c.State.fpState 119 } 120 121 // Fork returns an exact copy of this context. 122 func (c *Context64) Fork() *Context64 { 123 return &Context64{ 124 State: c.State.Fork(), 125 } 126 } 127 128 // Return returns the current syscall return value. 129 func (c *Context64) Return() uintptr { 130 return uintptr(c.Regs.Rax) 131 } 132 133 // SetReturn sets the syscall return value. 134 func (c *Context64) SetReturn(value uintptr) { 135 c.Regs.Rax = uint64(value) 136 } 137 138 // IP returns the current instruction pointer. 139 func (c *Context64) IP() uintptr { 140 return uintptr(c.Regs.Rip) 141 } 142 143 // SetIP sets the current instruction pointer. 144 func (c *Context64) SetIP(value uintptr) { 145 c.Regs.Rip = uint64(value) 146 } 147 148 // Stack returns the current stack pointer. 149 func (c *Context64) Stack() uintptr { 150 return uintptr(c.Regs.Rsp) 151 } 152 153 // SetStack sets the current stack pointer. 154 func (c *Context64) SetStack(value uintptr) { 155 c.Regs.Rsp = uint64(value) 156 } 157 158 // TLS returns the current TLS pointer. 159 func (c *Context64) TLS() uintptr { 160 return uintptr(c.Regs.Fs_base) 161 } 162 163 // SetTLS sets the current TLS pointer. Returns false if value is invalid. 164 func (c *Context64) SetTLS(value uintptr) bool { 165 if !isValidSegmentBase(uint64(value)) { 166 return false 167 } 168 169 c.Regs.Fs = 0 170 c.Regs.Fs_base = uint64(value) 171 return true 172 } 173 174 // SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. 175 func (c *Context64) SetOldRSeqInterruptedIP(value uintptr) { 176 c.Regs.R10 = uint64(value) 177 } 178 179 // Native returns the native type for the given val. 180 func (c *Context64) Native(val uintptr) marshal.Marshallable { 181 v := primitive.Uint64(val) 182 return &v 183 } 184 185 // Value returns the generic val for the given native type. 186 func (c *Context64) Value(val marshal.Marshallable) uintptr { 187 return uintptr(*val.(*primitive.Uint64)) 188 } 189 190 // Width returns the byte width of this architecture. 191 func (c *Context64) Width() uint { 192 return 8 193 } 194 195 // mmapRand returns a random adjustment for randomizing an mmap layout. 196 func mmapRand(max uint64) hostarch.Addr { 197 return hostarch.Addr(rand.Int63n(int64(max))).RoundDown() 198 } 199 200 // NewMmapLayout implements Context.NewMmapLayout consistently with Linux. 201 func (c *Context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) { 202 min, ok := min.RoundUp() 203 if !ok { 204 return MmapLayout{}, unix.EINVAL 205 } 206 if max > maxAddr64 { 207 max = maxAddr64 208 } 209 max = max.RoundDown() 210 211 if min > max { 212 return MmapLayout{}, unix.EINVAL 213 } 214 215 stackSize := r.Get(limits.Stack) 216 217 // MAX_GAP in Linux. 218 maxGap := (max / 6) * 5 219 gap := hostarch.Addr(stackSize.Cur) 220 if gap < minGap64 { 221 gap = minGap64 222 } 223 if gap > maxGap { 224 gap = maxGap 225 } 226 defaultDir := MmapTopDown 227 if stackSize.Cur == limits.Infinity { 228 defaultDir = MmapBottomUp 229 } 230 231 topDownMin := max - gap - maxMmapRand64 232 maxRand := hostarch.Addr(maxMmapRand64) 233 if topDownMin < preferredTopDownBaseMin { 234 // Try to keep TopDownBase above preferredTopDownBaseMin by 235 // shrinking maxRand. 236 maxAdjust := maxRand - minMmapRand64 237 needAdjust := preferredTopDownBaseMin - topDownMin 238 if needAdjust <= maxAdjust { 239 maxRand -= needAdjust 240 } 241 } 242 243 rnd := mmapRand(uint64(maxRand)) 244 l := MmapLayout{ 245 MinAddr: min, 246 MaxAddr: max, 247 // TASK_UNMAPPED_BASE in Linux. 248 BottomUpBase: (max/3 + rnd).RoundDown(), 249 TopDownBase: (max - gap - rnd).RoundDown(), 250 DefaultDirection: defaultDir, 251 // We may have reduced the maximum randomization to keep 252 // TopDownBase above preferredTopDownBaseMin while maintaining 253 // our stack gap. Stack allocations must use that max 254 // randomization to avoiding eating into the gap. 255 MaxStackRand: uint64(maxRand), 256 } 257 258 // Final sanity check on the layout. 259 if !l.Valid() { 260 panic(fmt.Sprintf("Invalid MmapLayout: %+v", l)) 261 } 262 263 return l, nil 264 } 265 266 // PIELoadAddress implements Context.PIELoadAddress. 267 func (c *Context64) PIELoadAddress(l MmapLayout) hostarch.Addr { 268 base := preferredPIELoadAddr 269 max, ok := base.AddLength(maxMmapRand64) 270 if !ok { 271 panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base)) 272 } 273 274 if max > l.MaxAddr { 275 // preferredPIELoadAddr won't fit; fall back to the standard 276 // Linux behavior of 2/3 of TopDownBase. TSAN won't like this. 277 // 278 // Don't bother trying to shrink the randomization for now. 279 base = l.TopDownBase / 3 * 2 280 } 281 282 return base + mmapRand(maxMmapRand64) 283 } 284 285 // userStructSize is the size in bytes of Linux's struct user on amd64. 286 const userStructSize = 928 287 288 // PtracePeekUser implements Context.PtracePeekUser. 289 func (c *Context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { 290 if addr&7 != 0 || addr >= userStructSize { 291 return nil, unix.EIO 292 } 293 // PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and 294 // u_debugreg, returning 0 or silently no-oping for other fields 295 // respectively. 296 if addr < uintptr(ptraceRegistersSize) { 297 regs := c.ptraceGetRegs() 298 buf := make([]byte, regs.SizeBytes()) 299 regs.MarshalUnsafe(buf) 300 return c.Native(uintptr(hostarch.ByteOrder.Uint64(buf[addr:]))), nil 301 } 302 // Note: x86 debug registers are missing. 303 return c.Native(0), nil 304 } 305 306 // PtracePokeUser implements Context.PtracePokeUser. 307 func (c *Context64) PtracePokeUser(addr, data uintptr) error { 308 if addr&7 != 0 || addr >= userStructSize { 309 return unix.EIO 310 } 311 if addr < uintptr(ptraceRegistersSize) { 312 regs := c.ptraceGetRegs() 313 buf := make([]byte, regs.SizeBytes()) 314 regs.MarshalUnsafe(buf) 315 hostarch.ByteOrder.PutUint64(buf[addr:], uint64(data)) 316 _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) 317 return err 318 } 319 // Note: x86 debug registers are missing. 320 return nil 321 }