github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/sys_mmap.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"bytes"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/mm"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// Brk implements linux syscall brk(2).
func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr, _ := t.MemoryManager().Brk(t, args[0].Pointer())
	// "However, the actual Linux system call returns the new program break on
	// success. On failure, the system call returns the current break." -
	// brk(2)
	return uintptr(addr), nil, nil
}

// LINT.IfChange

// Mmap implements linux syscall mmap(2).
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	prot := args[2].Int()
	flags := args[3].Int()
	fd := args[4].Int()
	fixed := flags&linux.MAP_FIXED != 0
	private := flags&linux.MAP_PRIVATE != 0
	shared := flags&linux.MAP_SHARED != 0
	anon := flags&linux.MAP_ANONYMOUS != 0
	map32bit := flags&linux.MAP_32BIT != 0

	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
	if private == shared {
		return 0, nil, linuxerr.EINVAL
	}

	opts := memmap.MMapOpts{
		Length:   args[1].Uint64(),
		Offset:   args[5].Uint64(),
		Addr:     args[0].Pointer(),
		Fixed:    fixed,
		Unmap:    fixed,
		Map32Bit: map32bit,
		Private:  private,
		Perms: hostarch.AccessType{
			Read:    linux.PROT_READ&prot != 0,
			Write:   linux.PROT_WRITE&prot != 0,
			Execute: linux.PROT_EXEC&prot != 0,
		},
		MaxPerms:  hostarch.AnyAccess,
		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
		Precommit: linux.MAP_POPULATE&flags != 0,
	}
	if linux.MAP_LOCKED&flags != 0 {
		opts.MLockMode = memmap.MLockEager
	}
	defer func() {
		if opts.MappingIdentity != nil {
			opts.MappingIdentity.DecRef(t)
		}
	}()

	if !anon {
		// Convert the passed FD to a file reference.
		file := t.GetFile(fd)
		if file == nil {
			return 0, nil, linuxerr.EBADF
		}
		defer file.DecRef(t)

		flags := file.Flags()
		// mmap unconditionally requires that the FD is readable.
		if !flags.Read {
			return 0, nil, linuxerr.EACCES
		}
		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
		if shared && !flags.Write {
			opts.MaxPerms.Write = false
		}

		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	} else if shared {
		// Back shared anonymous mappings with a special mappable.
		opts.Offset = 0
		m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel())
		if err != nil {
			return 0, nil, err
		}
		opts.MappingIdentity = m // transfers ownership of m to opts
		opts.Mappable = m
	}

	rv, err := t.MemoryManager().MMap(t, opts)
	return uintptr(rv), nil, err
}

// LINT.ThenChange(vfs2/mmap.go)

// Munmap implements linux syscall munmap(2).
func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
}

// Mremap implements linux syscall mremap(2).
func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	oldSize := args[1].Uint64()
	newSize := args[2].Uint64()
	flags := args[3].Uint64()
	newAddr := args[4].Pointer()

	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	mayMove := flags&linux.MREMAP_MAYMOVE != 0
	fixed := flags&linux.MREMAP_FIXED != 0
	var moveMode mm.MRemapMoveMode
	switch {
	case !mayMove && !fixed:
		moveMode = mm.MRemapNoMove
	case mayMove && !fixed:
		moveMode = mm.MRemapMayMove
	case mayMove && fixed:
		moveMode = mm.MRemapMustMove
	case !mayMove && fixed:
		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
		// specified." - mremap(2)
		return 0, nil, linuxerr.EINVAL
	}

	rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
		Move:    moveMode,
		NewAddr: newAddr,
	})
	return uintptr(rv), nil, err
}

// Mprotect implements linux syscall mprotect(2).
func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	length := args[1].Uint64()
	prot := args[2].Int()
	err := t.MemoryManager().MProtect(args[0].Pointer(), length, hostarch.AccessType{
		Read:    linux.PROT_READ&prot != 0,
		Write:   linux.PROT_WRITE&prot != 0,
		Execute: linux.PROT_EXEC&prot != 0,
	}, linux.PROT_GROWSDOWN&prot != 0)
	return 0, nil, err
}

// Madvise implements linux syscall madvise(2).
func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := uint64(args[1].SizeT())
	adv := args[2].Int()

	// "The Linux implementation requires that the address addr be
	// page-aligned, and allows length to be zero." - madvise(2)
	if addr.RoundDown() != addr {
		return 0, nil, linuxerr.EINVAL
	}
	if length == 0 {
		return 0, nil, nil
	}
	// Not explicitly stated: length need not be page-aligned.
	lenAddr, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, nil, linuxerr.EINVAL
	}
	length = uint64(lenAddr)

	switch adv {
	case linux.MADV_DONTNEED:
		return 0, nil, t.MemoryManager().Decommit(addr, length)
	case linux.MADV_DOFORK:
		return 0, nil, t.MemoryManager().SetDontFork(addr, length, false)
	case linux.MADV_DONTFORK:
		return 0, nil, t.MemoryManager().SetDontFork(addr, length, true)
	case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
		fallthrough
	case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
		fallthrough
	case linux.MADV_DONTDUMP, linux.MADV_DODUMP:
		// TODO(b/72045799): Core dumping isn't implemented, so these are
		// no-ops.
		fallthrough
	case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
		// Do nothing, we totally ignore the suggestions above.
		return 0, nil, nil
	case linux.MADV_REMOVE:
		// These "suggestions" have application-visible side effects, so we
		// have to indicate that we don't support them.
		return 0, nil, syserror.ENOSYS
	case linux.MADV_HWPOISON:
		// Only privileged processes are allowed to poison pages.
		return 0, nil, linuxerr.EPERM
	default:
		// If adv is not a valid value tell the caller.
		return 0, nil, linuxerr.EINVAL
	}
}

// Mincore implements the syscall mincore(2).
func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	vec := args[2].Pointer()

	if addr != addr.RoundDown() {
		return 0, nil, linuxerr.EINVAL
	}
	// "The length argument need not be a multiple of the page size, but since
	// residency information is returned for whole pages, length is effectively
	// rounded up to the next multiple of the page size." - mincore(2)
	la, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, nil, syserror.ENOMEM
	}
	ar, ok := addr.ToRange(uint64(la))
	if !ok {
		return 0, nil, syserror.ENOMEM
	}

	// Pretend that all mapped pages are "resident in core".
	mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
	// "ENOMEM: addr to addr + length contained unmapped memory."
	if mapped != uint64(la) {
		return 0, nil, syserror.ENOMEM
	}
	resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize))
	_, err := t.CopyOutBytes(vec, resident)
	return 0, nil, err
}

// Msync implements Linux syscall msync(2).
func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
	// permits a call to msync() that specifies neither of these flags, with
	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
	// msync(2)
	if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	sync := flags&linux.MS_SYNC != 0
	if sync && flags&linux.MS_ASYNC != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
		Sync:       sync,
		Invalidate: flags&linux.MS_INVALIDATE != 0,
	})
	// MSync calls fsync, the same interrupt conversion rules apply, see
	// mm/msync.c, fsync POSIX.1-2008.
	return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}

// Mlock implements linux syscall mlock(2).
func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()

	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
}

// Mlock2 implements linux syscall mlock2(2).
func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	if flags&^(linux.MLOCK_ONFAULT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MLOCK_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
}

// Munlock implements linux syscall munlock(2).
func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()

	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
}

// Mlockall implements linux syscall mlockall(2).
func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()

	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MCL_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: flags&linux.MCL_CURRENT != 0,
		Future:  flags&linux.MCL_FUTURE != 0,
		Mode:    mode,
	})
}

// Munlockall implements linux syscall munlockall(2).
func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: true,
		Future:  true,
		Mode:    memmap.MLockNone,
	})
}