github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/syscalls/linux/sys_mmap.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"bytes"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/mm"
)

// Brk implements Linux syscall brk(2).
func Brk(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr, _ := t.MemoryManager().Brk(t, args[0].Pointer())
	// "However, the actual Linux system call returns the new program break on
	// success. On failure, the system call returns the current break." -
	// brk(2)
	return uintptr(addr), nil, nil
}

// Mmap implements Linux syscall mmap(2).
func Mmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	prot := args[2].Int()
	flags := args[3].Int()
	fd := args[4].Int()
	fixed := flags&linux.MAP_FIXED != 0
	private := flags&linux.MAP_PRIVATE != 0
	shared := flags&linux.MAP_SHARED != 0
	anon := flags&linux.MAP_ANONYMOUS != 0
	map32bit := flags&linux.MAP_32BIT != 0

	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
	if private == shared {
		return 0, nil, linuxerr.EINVAL
	}

	opts := memmap.MMapOpts{
		Length:   args[1].Uint64(),
		Offset:   args[5].Uint64(),
		Addr:     args[0].Pointer(),
		Fixed:    fixed,
		Unmap:    fixed,
		Map32Bit: map32bit,
		Private:  private,
		Perms: hostarch.AccessType{
			Read:    linux.PROT_READ&prot != 0,
			Write:   linux.PROT_WRITE&prot != 0,
			Execute: linux.PROT_EXEC&prot != 0,
		},
		MaxPerms:  hostarch.AnyAccess,
		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
		Precommit: linux.MAP_POPULATE&flags != 0,
	}
	if linux.MAP_LOCKED&flags != 0 {
		opts.MLockMode = memmap.MLockEager
	}
	defer func() {
		if opts.MappingIdentity != nil {
			opts.MappingIdentity.DecRef(t)
		}
	}()

	if !anon {
		// Convert the passed FD to a file reference.
		file := t.GetFile(fd)
		if file == nil {
			return 0, nil, linuxerr.EBADF
		}
		defer file.DecRef(t)

		// mmap unconditionally requires that the FD is readable.
		if !file.IsReadable() {
			return 0, nil, linuxerr.EACCES
		}
		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
		if shared && !file.IsWritable() {
			opts.MaxPerms.Write = false
		}

		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	} else if shared {
		// Back shared anonymous mappings with an anonymous tmpfs file.
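		// The file offset is meaningless for an anonymous mapping
		// (mmap(2): with MAP_ANONYMOUS "the offset argument should be
		// zero"), so clear it before the zero file below is created and
		// configured as the mapping's shared backing store.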
		opts.Offset = 0
		file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
		if err != nil {
			return 0, nil, err
		}
		defer file.DecRef(t)
		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	}

	rv, err := t.MemoryManager().MMap(t, opts)
	return uintptr(rv), nil, err
}

// Munmap implements Linux syscall munmap(2).
func Munmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
}

// Mremap implements Linux syscall mremap(2).
func Mremap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	oldSize := args[1].Uint64()
	newSize := args[2].Uint64()
	flags := args[3].Uint64()
	newAddr := args[4].Pointer()

	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	mayMove := flags&linux.MREMAP_MAYMOVE != 0
	fixed := flags&linux.MREMAP_FIXED != 0
	var moveMode mm.MRemapMoveMode
	switch {
	case !mayMove && !fixed:
		moveMode = mm.MRemapNoMove
	case mayMove && !fixed:
		moveMode = mm.MRemapMayMove
	case mayMove && fixed:
		moveMode = mm.MRemapMustMove
	case !mayMove && fixed:
		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
		// specified." - mremap(2)
		return 0, nil, linuxerr.EINVAL
	}

	rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
		Move:    moveMode,
		NewAddr: newAddr,
	})
	return uintptr(rv), nil, err
}

// Mprotect implements Linux syscall mprotect(2).
func Mprotect(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	length := args[1].Uint64()
	prot := args[2].Int()
	err := t.MemoryManager().MProtect(args[0].Pointer(), length, hostarch.AccessType{
		Read:    linux.PROT_READ&prot != 0,
		Write:   linux.PROT_WRITE&prot != 0,
		Execute: linux.PROT_EXEC&prot != 0,
	}, linux.PROT_GROWSDOWN&prot != 0)
	return 0, nil, err
}

// Madvise implements Linux syscall madvise(2).
func Madvise(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := uint64(args[1].SizeT())
	adv := args[2].Int()

	// "The Linux implementation requires that the address addr be
	// page-aligned, and allows length to be zero." - madvise(2)
	if addr.RoundDown() != addr {
		return 0, nil, linuxerr.EINVAL
	}
	if length == 0 {
		return 0, nil, nil
	}
	// Not explicitly stated: length need not be page-aligned.
	lenAddr, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, nil, linuxerr.EINVAL
	}
	length = uint64(lenAddr)

	switch adv {
	case linux.MADV_DONTNEED:
		return 0, nil, t.MemoryManager().Decommit(addr, length)
	case linux.MADV_DOFORK:
		return 0, nil, t.MemoryManager().SetDontFork(addr, length, false)
	case linux.MADV_DONTFORK:
		return 0, nil, t.MemoryManager().SetDontFork(addr, length, true)
	case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
		fallthrough
	case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
		fallthrough
	case linux.MADV_DONTDUMP, linux.MADV_DODUMP:
		// TODO(b/72045799): Core dumping isn't implemented, so these are
		// no-ops.
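		// Since the sentry never produces core dumps, ignoring
		// MADV_DONTDUMP and MADV_DODUMP (like the hugepage and KSM hints
		// above) has no effect that an application can observe.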
		fallthrough
	case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
		// Do nothing; we totally ignore the suggestions above.
		return 0, nil, nil
	case linux.MADV_REMOVE:
		// These "suggestions" have application-visible side effects, so we
		// have to indicate that we don't support them.
		return 0, nil, linuxerr.ENOSYS
	case linux.MADV_HWPOISON:
		// Only privileged processes are allowed to poison pages.
		return 0, nil, linuxerr.EPERM
	default:
		// If adv is not a valid value, tell the caller.
		return 0, nil, linuxerr.EINVAL
	}
}

// Mincore implements Linux syscall mincore(2).
func Mincore(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	vec := args[2].Pointer()

	if addr != addr.RoundDown() {
		return 0, nil, linuxerr.EINVAL
	}
	// "The length argument need not be a multiple of the page size, but since
	// residency information is returned for whole pages, length is effectively
	// rounded up to the next multiple of the page size." - mincore(2)
	la, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, nil, linuxerr.ENOMEM
	}
	ar, ok := addr.ToRange(uint64(la))
	if !ok {
		return 0, nil, linuxerr.ENOMEM
	}

	// Pretend that all mapped pages are "resident in core".
	mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
	// "ENOMEM: addr to addr + length contained unmapped memory."
	if mapped != uint64(la) {
		return 0, nil, linuxerr.ENOMEM
	}
	resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize))
	_, err := t.CopyOutBytes(vec, resident)
	return 0, nil, err
}

// Msync implements Linux syscall msync(2).
func Msync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
	// permits a call to msync() that specifies neither of these flags, with
	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
	// msync(2)
	if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	sync := flags&linux.MS_SYNC != 0
	if sync && flags&linux.MS_ASYNC != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
		Sync:       sync,
		Invalidate: flags&linux.MS_INVALIDATE != 0,
	})
	// MSync calls fsync, so the same interrupt conversion rules apply; see
	// mm/msync.c and fsync in POSIX.1-2008.
	return 0, nil, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
}

// Mlock implements Linux syscall mlock(2).
func Mlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()

	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
}
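
// mlock2(2) generalizes mlock(2): with flags == 0 the two behave
// identically (eager locking), while MLOCK_ONFAULT asks that pages be
// locked only as they are faulted in, which the sentry models as
// memmap.MLockLazy below. An illustrative userspace call (not taken from
// this repository):
//
//	mlock2(addr, length, MLOCK_ONFAULT);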

// Mlock2 implements Linux syscall mlock2(2).
func Mlock2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()
	flags := args[2].Int()

	if flags&^(linux.MLOCK_ONFAULT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MLOCK_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
}

// Munlock implements Linux syscall munlock(2).
func Munlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].SizeT()

	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
}

// Mlockall implements Linux syscall mlockall(2).
func Mlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()

	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	mode := memmap.MLockEager
	if flags&linux.MCL_ONFAULT != 0 {
		mode = memmap.MLockLazy
	}
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: flags&linux.MCL_CURRENT != 0,
		Future:  flags&linux.MCL_FUTURE != 0,
		Mode:    mode,
	})
}

// Munlockall implements Linux syscall munlockall(2).
func Munlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
		Current: true,
		Future:  true,
		Mode:    memmap.MLockNone,
	})
}
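
// Note that munlockall(2) takes no arguments: it unlocks all pages
// currently mapped into the process's address space and also clears any
// MCL_FUTURE setting, which is why Munlockall above unconditionally passes
// Current and Future with memmap.MLockNone rather than inspecting flags.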