gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/syscalls/linux/sys_mmap.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"bytes"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/mm"
)

// Brk implements Linux syscall brk(2).
func Brk(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr, _ := t.MemoryManager().Brk(t, args[0].Pointer())
	// "However, the actual Linux system call returns the new program break on
	// success. On failure, the system call returns the current break." -
	// brk(2)
	return uintptr(addr), nil, nil
}
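// The following helper is an illustrative sketch, not part of the original
// file: given the return convention quoted in Brk above (new break on
// success, unchanged current break on failure), a hypothetical caller-side
// check can detect failure by comparing the returned break against the
// requested one. The name brkFailed is an assumption for illustration only.
func brkFailed(requested, returned hostarch.Addr) bool {
	// On success, Brk returns the requested address; on failure, it returns
	// the unchanged current break, so a mismatch signals failure (assuming
	// the request differed from the current break to begin with).
	return returned != requested
}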
// Mmap implements Linux syscall mmap(2).
func Mmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	prot := args[2].Int()
	flags := args[3].Int()
	fd := args[4].Int()
	fixed := flags&linux.MAP_FIXED != 0
	private := flags&linux.MAP_PRIVATE != 0
	shared := flags&linux.MAP_SHARED != 0
	anon := flags&linux.MAP_ANONYMOUS != 0
	map32bit := flags&linux.MAP_32BIT != 0

	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
	if private == shared {
		return 0, nil, linuxerr.EINVAL
	}

	opts := memmap.MMapOpts{
		Length:   args[1].Uint64(),
		Offset:   args[5].Uint64(),
		Addr:     args[0].Pointer(),
		Fixed:    fixed,
		Unmap:    fixed,
		Map32Bit: map32bit,
		Private:  private,
		Perms: hostarch.AccessType{
			Read:    linux.PROT_READ&prot != 0,
			Write:   linux.PROT_WRITE&prot != 0,
			Execute: linux.PROT_EXEC&prot != 0,
		},
		MaxPerms:  hostarch.AnyAccess,
		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
	}
	if linux.MAP_POPULATE&flags != 0 {
		opts.PlatformEffect = memmap.PlatformEffectCommit
	}
	if linux.MAP_LOCKED&flags != 0 {
		opts.MLockMode = memmap.MLockEager
	}
	defer func() {
		if opts.MappingIdentity != nil {
			opts.MappingIdentity.DecRef(t)
		}
	}()

	if !anon {
		// Convert the passed FD to a file reference.
		file := t.GetFile(fd)
		if file == nil {
			return 0, nil, linuxerr.EBADF
		}
		defer file.DecRef(t)

		// mmap unconditionally requires that the FD is readable.
		if !file.IsReadable() {
			return 0, nil, linuxerr.EACCES
		}
		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
		if shared && !file.IsWritable() {
			opts.MaxPerms.Write = false
		}

		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	} else if shared {
		// Back shared anonymous mappings with an anonymous tmpfs file.
		opts.Offset = 0
		file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
		if err != nil {
			return 0, nil, err
		}
		defer file.DecRef(t)
		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	}

	rv, err := t.MemoryManager().MMap(t, opts)
	return uintptr(rv), nil, err
}

// Munmap implements Linux syscall munmap(2).
func Munmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
}

// Mremap implements Linux syscall mremap(2).
func Mremap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	oldSize := args[1].Uint64()
	newSize := args[2].Uint64()
	flags := args[3].Uint64()
	newAddr := args[4].Pointer()

	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	mayMove := flags&linux.MREMAP_MAYMOVE != 0
	fixed := flags&linux.MREMAP_FIXED != 0
	var moveMode mm.MRemapMoveMode
	switch {
	case !mayMove && !fixed:
		moveMode = mm.MRemapNoMove
	case mayMove && !fixed:
		moveMode = mm.MRemapMayMove
	case mayMove && fixed:
		moveMode = mm.MRemapMustMove
	case !mayMove && fixed:
		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
		// specified." - mremap(2)
		return 0, nil, linuxerr.EINVAL
	}

	rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
		Move:    moveMode,
		NewAddr: newAddr,
	})
	return uintptr(rv), nil, err
}

// Mprotect implements Linux syscall mprotect(2).
func Mprotect(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	length := args[1].Uint64()
	prot := args[2].Int()
	err := t.MemoryManager().MProtect(args[0].Pointer(), length, hostarch.AccessType{
		Read:    linux.PROT_READ&prot != 0,
		Write:   linux.PROT_WRITE&prot != 0,
		Execute: linux.PROT_EXEC&prot != 0,
	}, linux.PROT_GROWSDOWN&prot != 0)
	return 0, nil, err
}
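// Illustrative sketch, not part of the original file: Mmap and Mprotect
// above both decode PROT_* bits into a hostarch.AccessType with the same
// three-way test. A hypothetical shared helper (the name protToAccess is an
// assumption for illustration) would read:
func protToAccess(prot int32) hostarch.AccessType {
	return hostarch.AccessType{
		Read:    prot&linux.PROT_READ != 0,
		Write:   prot&linux.PROT_WRITE != 0,
		Execute: prot&linux.PROT_EXEC != 0,
	}
}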
// Madvise implements Linux syscall madvise(2).
func Madvise(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := uint64(args[1].SizeT())
	adv := args[2].Int()

	// "The Linux implementation requires that the address addr be
	// page-aligned, and allows length to be zero." - madvise(2)
	if addr.RoundDown() != addr {
		return 0, nil, linuxerr.EINVAL
	}
	if length == 0 {
		return 0, nil, nil
	}
	// Not explicitly stated: length need not be page-aligned.
	lenAddr, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, nil, linuxerr.EINVAL
	}
	length = uint64(lenAddr)

	switch adv {
	case linux.MADV_DONTNEED:
		return 0, nil, t.MemoryManager().Decommit(addr, length)
	case linux.MADV_DOFORK:
		return 0, nil, t.MemoryManager().SetDontFork(addr, length, false)
	case linux.MADV_DONTFORK:
		return 0, nil, t.MemoryManager().SetDontFork(addr, length, true)
	case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
		fallthrough
	case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
		fallthrough
	case linux.MADV_DONTDUMP, linux.MADV_DODUMP:
		// TODO(b/72045799): Core dumping isn't implemented, so these are
		// no-ops.
		fallthrough
	case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
		// The advice values above are pure hints with no application-visible
		// effects, so ignoring them entirely is a valid implementation.
		return 0, nil, nil
	case linux.MADV_REMOVE:
		// This "suggestion" has application-visible side effects, so we
		// have to indicate that we don't support it.
		return 0, nil, linuxerr.ENOSYS
	case linux.MADV_HWPOISON:
		// Only privileged processes are allowed to poison pages.
		return 0, nil, linuxerr.EPERM
	default:
		// If adv is not a valid value, tell the caller.
		return 0, nil, linuxerr.EINVAL
	}
}

// Mincore implements Linux syscall mincore(2).
func Mincore(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	vec := args[2].Pointer()

	if addr != addr.RoundDown() {
		return 0, nil, linuxerr.EINVAL
	}
	// "The length argument need not be a multiple of the page size, but since
	// residency information is returned for whole pages, length is effectively
	// rounded up to the next multiple of the page size." - mincore(2)
	la, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, nil, linuxerr.ENOMEM
	}
	ar, ok := addr.ToRange(uint64(la))
	if !ok {
		return 0, nil, linuxerr.ENOMEM
	}

	// Pretend that all mapped pages are "resident in core".
	mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
	// "ENOMEM: addr to addr + length contained unmapped memory." - mincore(2)
	if mapped != uint64(la) {
		return 0, nil, linuxerr.ENOMEM
	}
	resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize))
	_, err := t.CopyOutBytes(vec, resident)
	return 0, nil, err
}

// Msync implements Linux syscall msync(2).
func Msync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
	// permits a call to msync() that specifies neither of these flags, with
	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
	// msync(2)
	if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	sync := flags&linux.MS_SYNC != 0
	if sync && flags&linux.MS_ASYNC != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
		Sync:       sync,
		Invalidate: flags&linux.MS_INVALIDATE != 0,
	})
	// MSync calls fsync, so the same interrupt conversion rules apply; see
	// Linux's mm/msync.c and fsync in POSIX.1-2008.
	return 0, nil, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
}

// Mlock implements Linux syscall mlock(2).
func Mlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()

	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
}
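// Illustrative sketch, not part of the original file: the vector that
// Mincore above copies out contains one byte per page of the rounded-up
// range. A hypothetical helper (the name mincoreVecLen is an assumption for
// illustration) makes the sizing arithmetic explicit:
func mincoreVecLen(length uint64) (int, bool) {
	la, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		// Rounding overflowed the address space, mirroring Mincore's ENOMEM.
		return 0, false
	}
	return int(uint64(la) / hostarch.PageSize), true
}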
// Mlock2 implements Linux syscall mlock2(2).
func Mlock2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	if flags&^(linux.MLOCK_ONFAULT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MLOCK_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
}

// Munlock implements Linux syscall munlock(2).
func Munlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()

	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
}

// Mlockall implements Linux syscall mlockall(2).
func Mlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()

	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MCL_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: flags&linux.MCL_CURRENT != 0,
		Future:  flags&linux.MCL_FUTURE != 0,
		Mode:    mode,
	})
}

// Munlockall implements Linux syscall munlockall(2).
func Munlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: true,
		Future:  true,
		Mode:    memmap.MLockNone,
	})
}
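// Illustrative sketch, not part of the original file: Mlock2 and Mlockall
// above map their respective ONFAULT flags onto the same pair of lock modes.
// A hypothetical shared helper (the name mlockModeFor is an assumption for
// illustration) captures that mapping:
func mlockModeFor(onFault bool) memmap.MLockMode {
	if onFault {
		return memmap.MLockLazy // lock pages only as they are faulted in
	}
	return memmap.MLockEager // populate and lock the entire range immediately
}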