github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_mmap.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"bytes"
    19  
    20  	"github.com/metacubex/gvisor/pkg/abi/linux"
    21  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    22  	"github.com/metacubex/gvisor/pkg/hostarch"
    23  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    24  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs"
    25  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    26  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    27  	"github.com/metacubex/gvisor/pkg/sentry/mm"
    28  )
    29  
    30  // Brk implements linux syscall brk(2).
    31  func Brk(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    32  	addr, _ := t.MemoryManager().Brk(t, args[0].Pointer())
    33  	// "However, the actual Linux system call returns the new program break on
    34  	// success. On failure, the system call returns the current break." -
    35  	// brk(2)
    36  	return uintptr(addr), nil, nil
    37  }
    38  
    39  // Mmap implements Linux syscall mmap(2).
    40  func Mmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    41  	prot := args[2].Int()
    42  	flags := args[3].Int()
    43  	fd := args[4].Int()
    44  	fixed := flags&linux.MAP_FIXED != 0
    45  	private := flags&linux.MAP_PRIVATE != 0
    46  	shared := flags&linux.MAP_SHARED != 0
    47  	anon := flags&linux.MAP_ANONYMOUS != 0
    48  	map32bit := flags&linux.MAP_32BIT != 0
    49  
    50  	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
    51  	if private == shared {
    52  		return 0, nil, linuxerr.EINVAL
    53  	}
    54  
    55  	opts := memmap.MMapOpts{
    56  		Length:   args[1].Uint64(),
    57  		Offset:   args[5].Uint64(),
    58  		Addr:     args[0].Pointer(),
    59  		Fixed:    fixed,
    60  		Unmap:    fixed,
    61  		Map32Bit: map32bit,
    62  		Private:  private,
    63  		Perms: hostarch.AccessType{
    64  			Read:    linux.PROT_READ&prot != 0,
    65  			Write:   linux.PROT_WRITE&prot != 0,
    66  			Execute: linux.PROT_EXEC&prot != 0,
    67  		},
    68  		MaxPerms:  hostarch.AnyAccess,
    69  		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
    70  	}
    71  	if linux.MAP_POPULATE&flags != 0 {
    72  		opts.PlatformEffect = memmap.PlatformEffectCommit
    73  	}
    74  	if linux.MAP_LOCKED&flags != 0 {
    75  		opts.MLockMode = memmap.MLockEager
    76  	}
    77  	defer func() {
    78  		if opts.MappingIdentity != nil {
    79  			opts.MappingIdentity.DecRef(t)
    80  		}
    81  	}()
    82  
    83  	if !anon {
    84  		// Convert the passed FD to a file reference.
    85  		file := t.GetFile(fd)
    86  		if file == nil {
    87  			return 0, nil, linuxerr.EBADF
    88  		}
    89  		defer file.DecRef(t)
    90  
    91  		// mmap unconditionally requires that the FD is readable.
    92  		if !file.IsReadable() {
    93  			return 0, nil, linuxerr.EACCES
    94  		}
    95  		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
    96  		if shared && !file.IsWritable() {
    97  			opts.MaxPerms.Write = false
    98  		}
    99  
   100  		if err := file.ConfigureMMap(t, &opts); err != nil {
   101  			return 0, nil, err
   102  		}
   103  	} else if shared {
   104  		// Back shared anonymous mappings with an anonymous tmpfs file.
   105  		opts.Offset = 0
   106  		file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
   107  		if err != nil {
   108  			return 0, nil, err
   109  		}
   110  		defer file.DecRef(t)
   111  		if err := file.ConfigureMMap(t, &opts); err != nil {
   112  			return 0, nil, err
   113  		}
   114  	}
   115  
   116  	rv, err := t.MemoryManager().MMap(t, opts)
   117  	return uintptr(rv), nil, err
   118  }
   119  
   120  // Munmap implements linux syscall munmap(2).
   121  func Munmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   122  	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
   123  }
   124  
   125  // Mremap implements linux syscall mremap(2).
   126  func Mremap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   127  	oldAddr := args[0].Pointer()
   128  	oldSize := args[1].Uint64()
   129  	newSize := args[2].Uint64()
   130  	flags := args[3].Uint64()
   131  	newAddr := args[4].Pointer()
   132  
   133  	if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 {
   134  		return 0, nil, linuxerr.EINVAL
   135  	}
   136  	mayMove := flags&linux.MREMAP_MAYMOVE != 0
   137  	fixed := flags&linux.MREMAP_FIXED != 0
   138  	var moveMode mm.MRemapMoveMode
   139  	switch {
   140  	case !mayMove && !fixed:
   141  		moveMode = mm.MRemapNoMove
   142  	case mayMove && !fixed:
   143  		moveMode = mm.MRemapMayMove
   144  	case mayMove && fixed:
   145  		moveMode = mm.MRemapMustMove
   146  	case !mayMove && fixed:
   147  		// "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be
   148  		// specified." - mremap(2)
   149  		return 0, nil, linuxerr.EINVAL
   150  	}
   151  
   152  	rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{
   153  		Move:    moveMode,
   154  		NewAddr: newAddr,
   155  	})
   156  	return uintptr(rv), nil, err
   157  }
   158  
   159  // Mprotect implements linux syscall mprotect(2).
   160  func Mprotect(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   161  	length := args[1].Uint64()
   162  	prot := args[2].Int()
   163  	err := t.MemoryManager().MProtect(args[0].Pointer(), length, hostarch.AccessType{
   164  		Read:    linux.PROT_READ&prot != 0,
   165  		Write:   linux.PROT_WRITE&prot != 0,
   166  		Execute: linux.PROT_EXEC&prot != 0,
   167  	}, linux.PROT_GROWSDOWN&prot != 0)
   168  	return 0, nil, err
   169  }
   170  
   171  // Madvise implements linux syscall madvise(2).
   172  func Madvise(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   173  	addr := args[0].Pointer()
   174  	length := uint64(args[1].SizeT())
   175  	adv := args[2].Int()
   176  
   177  	// "The Linux implementation requires that the address addr be
   178  	// page-aligned, and allows length to be zero." - madvise(2)
   179  	if addr.RoundDown() != addr {
   180  		return 0, nil, linuxerr.EINVAL
   181  	}
   182  	if length == 0 {
   183  		return 0, nil, nil
   184  	}
   185  	// Not explicitly stated: length need not be page-aligned.
   186  	lenAddr, ok := hostarch.Addr(length).RoundUp()
   187  	if !ok {
   188  		return 0, nil, linuxerr.EINVAL
   189  	}
   190  	length = uint64(lenAddr)
   191  
   192  	switch adv {
   193  	case linux.MADV_DONTNEED:
   194  		return 0, nil, t.MemoryManager().Decommit(addr, length)
   195  	case linux.MADV_DOFORK:
   196  		return 0, nil, t.MemoryManager().SetDontFork(addr, length, false)
   197  	case linux.MADV_DONTFORK:
   198  		return 0, nil, t.MemoryManager().SetDontFork(addr, length, true)
   199  	case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
   200  		fallthrough
   201  	case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
   202  		fallthrough
   203  	case linux.MADV_DONTDUMP, linux.MADV_DODUMP:
   204  		// TODO(b/72045799): Core dumping isn't implemented, so these are
   205  		// no-ops.
   206  		fallthrough
   207  	case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
   208  		// Do nothing, we totally ignore the suggestions above.
   209  		return 0, nil, nil
   210  	case linux.MADV_REMOVE:
   211  		// These "suggestions" have application-visible side effects, so we
   212  		// have to indicate that we don't support them.
   213  		return 0, nil, linuxerr.ENOSYS
   214  	case linux.MADV_HWPOISON:
   215  		// Only privileged processes are allowed to poison pages.
   216  		return 0, nil, linuxerr.EPERM
   217  	default:
   218  		// If adv is not a valid value tell the caller.
   219  		return 0, nil, linuxerr.EINVAL
   220  	}
   221  }
   222  
   223  // Mincore implements the syscall mincore(2).
   224  func Mincore(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   225  	addr := args[0].Pointer()
   226  	length := args[1].SizeT()
   227  	vec := args[2].Pointer()
   228  
   229  	if addr != addr.RoundDown() {
   230  		return 0, nil, linuxerr.EINVAL
   231  	}
   232  	// "The length argument need not be a multiple of the page size, but since
   233  	// residency information is returned for whole pages, length is effectively
   234  	// rounded up to the next multiple of the page size." - mincore(2)
   235  	la, ok := hostarch.Addr(length).RoundUp()
   236  	if !ok {
   237  		return 0, nil, linuxerr.ENOMEM
   238  	}
   239  	ar, ok := addr.ToRange(uint64(la))
   240  	if !ok {
   241  		return 0, nil, linuxerr.ENOMEM
   242  	}
   243  
   244  	// Pretend that all mapped pages are "resident in core".
   245  	mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
   246  	// "ENOMEM: addr to addr + length contained unmapped memory."
   247  	if mapped != uint64(la) {
   248  		return 0, nil, linuxerr.ENOMEM
   249  	}
   250  	resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize))
   251  	_, err := t.CopyOutBytes(vec, resident)
   252  	return 0, nil, err
   253  }
   254  
   255  // Msync implements Linux syscall msync(2).
   256  func Msync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   257  	addr := args[0].Pointer()
   258  	length := args[1].SizeT()
   259  	flags := args[2].Int()
   260  
   261  	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
   262  	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
   263  	// permits a call to msync() that specifies neither of these flags, with
   264  	// semantics that are (currently) equivalent to specifying MS_ASYNC." -
   265  	// msync(2)
   266  	if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 {
   267  		return 0, nil, linuxerr.EINVAL
   268  	}
   269  	sync := flags&linux.MS_SYNC != 0
   270  	if sync && flags&linux.MS_ASYNC != 0 {
   271  		return 0, nil, linuxerr.EINVAL
   272  	}
   273  	err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
   274  		Sync:       sync,
   275  		Invalidate: flags&linux.MS_INVALIDATE != 0,
   276  	})
   277  	// MSync calls fsync, the same interrupt conversion rules apply, see
   278  	// mm/msync.c, fsync POSIX.1-2008.
   279  	return 0, nil, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS)
   280  }
   281  
   282  // Mlock implements linux syscall mlock(2).
   283  func Mlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   284  	addr := args[0].Pointer()
   285  	length := args[1].SizeT()
   286  
   287  	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
   288  }
   289  
   290  // Mlock2 implements linux syscall mlock2(2).
   291  func Mlock2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   292  	addr := args[0].Pointer()
   293  	length := args[1].SizeT()
   294  	flags := args[2].Int()
   295  
   296  	if flags&^(linux.MLOCK_ONFAULT) != 0 {
   297  		return 0, nil, linuxerr.EINVAL
   298  	}
   299  
   300  	mode := memmap.MLockEager
   301  	if flags&linux.MLOCK_ONFAULT != 0 {
   302  		mode = memmap.MLockLazy
   303  	}
   304  	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
   305  }
   306  
   307  // Munlock implements linux syscall munlock(2).
   308  func Munlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   309  	addr := args[0].Pointer()
   310  	length := args[1].SizeT()
   311  
   312  	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
   313  }
   314  
   315  // Mlockall implements linux syscall mlockall(2).
   316  func Mlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   317  	flags := args[0].Int()
   318  
   319  	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
   320  		return 0, nil, linuxerr.EINVAL
   321  	}
   322  
   323  	mode := memmap.MLockEager
   324  	if flags&linux.MCL_ONFAULT != 0 {
   325  		mode = memmap.MLockLazy
   326  	}
   327  	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
   328  		Current: flags&linux.MCL_CURRENT != 0,
   329  		Future:  flags&linux.MCL_FUTURE != 0,
   330  		Mode:    mode,
   331  	})
   332  }
   333  
   334  // Munlockall implements linux syscall munlockall(2).
   335  func Munlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   336  	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
   337  		Current: true,
   338  		Future:  true,
   339  		Mode:    memmap.MLockNone,
   340  	})
   341  }