github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/arch/arch_amd64.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package arch

import (
	"bytes"
	"fmt"
	"math/rand"

	"golang.org/x/sys/unix"

	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/marshal"
	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
)

// Host specifies the host architecture.
const Host = AMD64

// These constants come directly from Linux.
const (
	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
	// for a 64-bit process.
	maxAddr64 hostarch.Addr = (1 << 47) - hostarch.PageSize

	// maxStackRand64 is the maximum randomization to apply to the stack.
	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
	maxStackRand64 = 16 << 30 // 16 GB

	// maxMmapRand64 is the maximum randomization to apply to the mmap
	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
	maxMmapRand64 = (1 << 28) * hostarch.PageSize

	// minGap64 is the minimum gap to leave at the top of the address space
	// for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
	minGap64 = (128 << 20) + maxStackRand64

	// preferredPIELoadAddr is the standard Linux position-independent
	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
	//
	// The Platform {Min,Max}UserAddress() may preclude loading at this
	// address. See other preferredFoo comments below.
	preferredPIELoadAddr hostarch.Addr = maxAddr64 / 3 * 2
)
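// For reference, the concrete values implied by the definitions above,
// assuming hostarch.PageSize == 4096 (illustrative arithmetic, derived from
// the constants rather than stated in the original source):
//
//	maxAddr64      = 0x7ffffffff000 (2^47 bytes, less one page)
//	maxStackRand64 = 16 GiB
//	maxMmapRand64  = 2^40 bytes (1 TiB)
//	minGap64       = 128 MiB + 16 GiB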
// These constants are selected as heuristics to help make the Platform's
// potentially limited address space conform as closely to Linux as possible.
const (
	// Select a preferred minimum TopDownBase address.
	//
	// Some applications (TSAN and other *SANs) are very particular about
	// the way the Linux mmap allocator lays out the address space.
	//
	// TSAN in particular expects top down allocations to be made in the
	// range [0x7e8000000000, 0x800000000000).
	//
	// The minimum TopDownBase on Linux would be:
	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
	//
	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
	//
	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
	// allocations below TopDownBase.
	//
	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
	//
	// Of course, there is no hard minimum to allocation; an allocator can
	// search all the way from TopDownBase to Min. However, TSAN declared
	// their range "good enough".
	//
	// We would like to pick a TopDownBase such that it is unlikely that an
	// allocator will select an address below TSAN's minimum. We achieve
	// this by trying to leave a sizable gap below TopDownBase.
	//
	// This is all "preferred" because the layout min/max address may not
	// allow us to select such a TopDownBase, in which case we have to fall
	// back to a layout that TSAN may not be happy with.
	preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000
	preferredAllocationGap                 = 128 << 30 // 128 GB
	preferredTopDownBaseMin                = preferredTopDownAllocMin + preferredAllocationGap

	// minMmapRand64 is the smallest we are willing to make the
	// randomization to stay above preferredTopDownBaseMin.
	minMmapRand64 = (1 << 26) * hostarch.PageSize
)

// context64 represents an AMD64 context.
//
// +stateify savable
type context64 struct {
	State
	sigFPState []fpu.State // fpstate to be restored on sigreturn.
}

// Arch implements Context.Arch.
func (c *context64) Arch() Arch {
	return AMD64
}

func (c *context64) copySigFPState() []fpu.State {
	var sigfps []fpu.State
	for _, s := range c.sigFPState {
		sigfps = append(sigfps, s.Fork())
	}
	return sigfps
}

func (c *context64) FloatingPointData() *fpu.State {
	return &c.State.fpState
}

// Fork returns an exact copy of this context.
func (c *context64) Fork() Context {
	return &context64{
		State:      c.State.Fork(),
		sigFPState: c.copySigFPState(),
	}
}

// Return returns the current syscall return value.
func (c *context64) Return() uintptr {
	return uintptr(c.Regs.Rax)
}

// SetReturn sets the syscall return value.
func (c *context64) SetReturn(value uintptr) {
	c.Regs.Rax = uint64(value)
}

// IP returns the current instruction pointer.
func (c *context64) IP() uintptr {
	return uintptr(c.Regs.Rip)
}

// SetIP sets the current instruction pointer.
func (c *context64) SetIP(value uintptr) {
	c.Regs.Rip = uint64(value)
}

// Stack returns the current stack pointer.
func (c *context64) Stack() uintptr {
	return uintptr(c.Regs.Rsp)
}

// SetStack sets the current stack pointer.
func (c *context64) SetStack(value uintptr) {
	c.Regs.Rsp = uint64(value)
}

// TLS returns the current TLS pointer.
func (c *context64) TLS() uintptr {
	return uintptr(c.Regs.Fs_base)
}

// SetTLS sets the current TLS pointer. Returns false if value is invalid.
func (c *context64) SetTLS(value uintptr) bool {
	if !isValidSegmentBase(uint64(value)) {
		return false
	}

	c.Regs.Fs = 0
	c.Regs.Fs_base = uint64(value)
	return true
}

// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
	c.Regs.R10 = uint64(value)
}

// Native returns the native type for the given val.
func (c *context64) Native(val uintptr) marshal.Marshallable {
	v := primitive.Uint64(val)
	return &v
}

// Value returns the generic val for the given native type.
func (c *context64) Value(val marshal.Marshallable) uintptr {
	return uintptr(*val.(*primitive.Uint64))
}

// Width returns the byte width of this architecture.
func (c *context64) Width() uint {
	return 8
}
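// nativeRoundTrip is a hypothetical helper, included here only to illustrate
// how Native and Value pair up: on amd64 a uintptr travels through the
// arch-neutral marshal.Marshallable interface as a *primitive.Uint64 and
// converts back losslessly. It is a sketch, not part of the Context API.
func nativeRoundTrip(c *context64, v uintptr) bool {
	m := c.Native(v)       // wrap v as a *primitive.Uint64
	return c.Value(m) == v // recover the original uintptr
}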
// FeatureSet returns the FeatureSet in use.
func (c *context64) FeatureSet() *cpuid.FeatureSet {
	return c.State.FeatureSet
}

// mmapRand returns a random adjustment for randomizing an mmap layout.
func mmapRand(max uint64) hostarch.Addr {
	return hostarch.Addr(rand.Int63n(int64(max))).RoundDown()
}

// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
func (c *context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) {
	min, ok := min.RoundUp()
	if !ok {
		return MmapLayout{}, unix.EINVAL
	}
	if max > maxAddr64 {
		max = maxAddr64
	}
	max = max.RoundDown()

	if min > max {
		return MmapLayout{}, unix.EINVAL
	}

	stackSize := r.Get(limits.Stack)

	// MAX_GAP in Linux.
	maxGap := (max / 6) * 5
	gap := hostarch.Addr(stackSize.Cur)
	if gap < minGap64 {
		gap = minGap64
	}
	if gap > maxGap {
		gap = maxGap
	}
	defaultDir := MmapTopDown
	if stackSize.Cur == limits.Infinity {
		defaultDir = MmapBottomUp
	}

	topDownMin := max - gap - maxMmapRand64
	maxRand := hostarch.Addr(maxMmapRand64)
	if topDownMin < preferredTopDownBaseMin {
		// Try to keep TopDownBase above preferredTopDownBaseMin by
		// shrinking maxRand.
		maxAdjust := maxRand - minMmapRand64
		needAdjust := preferredTopDownBaseMin - topDownMin
		if needAdjust <= maxAdjust {
			maxRand -= needAdjust
		}
	}

	rnd := mmapRand(uint64(maxRand))
	l := MmapLayout{
		MinAddr: min,
		MaxAddr: max,
		// TASK_UNMAPPED_BASE in Linux.
		BottomUpBase:     (max/3 + rnd).RoundDown(),
		TopDownBase:      (max - gap - rnd).RoundDown(),
		DefaultDirection: defaultDir,
		// We may have reduced the maximum randomization to keep
		// TopDownBase above preferredTopDownBaseMin while maintaining
		// our stack gap. Stack allocations must use that max
		// randomization to avoid eating into the gap.
		MaxStackRand: uint64(maxRand),
	}

	// Final sanity check on the layout.
	if !l.Valid() {
		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
	}

	return l, nil
}
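// exampleLayout is a hypothetical usage sketch of NewMmapLayout, not part of
// the original file: it builds the default mmap layout for a fresh amd64
// task over the full canonical userspace range. NewMmapLayout clamps max to
// maxAddr64 and, per the check above, flips the default direction to
// bottom-up when the stack rlimit is unlimited.
func exampleLayout(c *context64, ls *limits.LimitSet) (MmapLayout, error) {
	return c.NewMmapLayout(0, maxAddr64, ls)
}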
// PIELoadAddress implements Context.PIELoadAddress.
func (c *context64) PIELoadAddress(l MmapLayout) hostarch.Addr {
	base := preferredPIELoadAddr
	max, ok := base.AddLength(maxMmapRand64)
	if !ok {
		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
	}

	if max > l.MaxAddr {
		// preferredPIELoadAddr won't fit; fall back to the standard
		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
		//
		// Don't bother trying to shrink the randomization for now.
		base = l.TopDownBase / 3 * 2
	}

	return base + mmapRand(maxMmapRand64)
}

// userStructSize is the size in bytes of Linux's struct user on amd64.
const userStructSize = 928

// PtracePeekUser implements Context.PtracePeekUser.
func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
	if addr&7 != 0 || addr >= userStructSize {
		return nil, unix.EIO
	}
	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
	// u_debugreg, returning 0 or silently no-oping for other fields
	// respectively.
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		return c.Native(uintptr(hostarch.ByteOrder.Uint64(buf[addr:]))), nil
	}
	// Note: x86 debug registers are missing.
	return c.Native(0), nil
}

// PtracePokeUser implements Context.PtracePokeUser.
func (c *context64) PtracePokeUser(addr, data uintptr) error {
	if addr&7 != 0 || addr >= userStructSize {
		return unix.EIO
	}
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		hostarch.ByteOrder.PutUint64(buf[addr:], uint64(data))
		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
		return err
	}
	// Note: x86 debug registers are missing.
	return nil
}
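// examplePokeRIP is a hypothetical illustration of the PtracePeekUser /
// PtracePokeUser offset convention, not part of the original file. On amd64,
// struct user begins with user_regs_struct, and rip is assumed here to be
// its 17th 8-byte slot (offset 16*8 = 128); offsets must be 8-byte aligned
// and below userStructSize or the calls fail with EIO.
func examplePokeRIP(c *context64, ip uintptr) error {
	const ripOffset = 16 * 8 // assumed offset of rip within user_regs_struct
	return c.PtracePokeUser(ripOffset, ip)
}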