github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/syscalls.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"fmt"
    19  	mrand "math/rand"
    20  	"sync/atomic"
    21  
    22  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    23  	"github.com/SagerNet/gvisor/pkg/context"
    24  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    25  	"github.com/SagerNet/gvisor/pkg/hostarch"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/futex"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/limits"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    30  	"github.com/SagerNet/gvisor/pkg/syserror"
    31  )
    32  
    33  // HandleUserFault handles an application page fault. sp is the faulting
    34  // application thread's stack pointer.
    35  //
    36  // Preconditions: mm.as != nil.
    37  func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr, at hostarch.AccessType, sp hostarch.Addr) error {
    38  	ar, ok := addr.RoundDown().ToRange(hostarch.PageSize)
    39  	if !ok {
    40  		return syserror.EFAULT
    41  	}
    42  
    43  	// Don't bother trying existingPMAsLocked; in most cases, if we did have
    44  	// existing pmas, we wouldn't have faulted.
    45  
    46  	// Ensure that we have a usable vma. Here and below, since we are only
    47  	// asking for a single page, there is no possibility of partial success,
    48  	// and any error is immediately fatal.
    49  	mm.mappingMu.RLock()
    50  	vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
    51  	if err != nil {
    52  		mm.mappingMu.RUnlock()
    53  		return err
    54  	}
    55  
    56  	// Ensure that we have a usable pma.
    57  	mm.activeMu.Lock()
    58  	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at)
    59  	mm.mappingMu.RUnlock()
    60  	if err != nil {
    61  		mm.activeMu.Unlock()
    62  		return err
    63  	}
    64  
    65  	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
    66  	// anymore.
    67  	mm.activeMu.DowngradeLock()
    68  
    69  	// Map the faulted page into the active AddressSpace.
    70  	err = mm.mapASLocked(pseg, ar, false)
    71  	mm.activeMu.RUnlock()
    72  	return err
    73  }
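
        // A minimal sketch of a typical call site: a platform's fault handler
        // forwards the faulting address, the attempted access type, and the
        // stack pointer, and turns any error into a signal for the faulting
        // task. faultAddr, wasWrite, and sp are assumed names used only for
        // illustration.
        //
        //	at := hostarch.AccessType{Read: !wasWrite, Write: wasWrite}
        //	if err := mm.HandleUserFault(ctx, faultAddr, at, sp); err != nil {
        //		// Deliver SIGSEGV/SIGBUS to the faulting task based on err.
        //	}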
    74  
    75  // MMap establishes a memory mapping.
    76  func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) {
    77  	if opts.Length == 0 {
    78  		return 0, linuxerr.EINVAL
    79  	}
    80  	length, ok := hostarch.Addr(opts.Length).RoundUp()
    81  	if !ok {
    82  		return 0, syserror.ENOMEM
    83  	}
    84  	opts.Length = uint64(length)
    85  
    86  	if opts.Mappable != nil {
    87  		// Offset must be aligned.
    88  		if hostarch.Addr(opts.Offset).RoundDown() != hostarch.Addr(opts.Offset) {
    89  			return 0, linuxerr.EINVAL
    90  		}
    91  		// Offset + length must not overflow.
    92  		if end := opts.Offset + opts.Length; end < opts.Offset {
    93  			return 0, syserror.ENOMEM
    94  		}
    95  	} else {
    96  		opts.Offset = 0
    97  	}
    98  
    99  	if opts.Addr.RoundDown() != opts.Addr {
   100  		// MAP_FIXED requires addr to be page-aligned; non-fixed mappings
   101  		// don't.
   102  		if opts.Fixed {
   103  			return 0, linuxerr.EINVAL
   104  		}
   105  		opts.Addr = opts.Addr.RoundDown()
   106  	}
   107  
   108  	if !opts.MaxPerms.SupersetOf(opts.Perms) {
   109  		return 0, linuxerr.EACCES
   110  	}
   111  	if opts.Unmap && !opts.Fixed {
   112  		return 0, linuxerr.EINVAL
   113  	}
   114  	if opts.GrowsDown && opts.Mappable != nil {
   115  		return 0, linuxerr.EINVAL
   116  	}
   117  
   118  	// Get the new vma.
   119  	mm.mappingMu.Lock()
   120  	if opts.MLockMode < mm.defMLockMode {
   121  		opts.MLockMode = mm.defMLockMode
   122  	}
   123  	vseg, ar, err := mm.createVMALocked(ctx, opts)
   124  	if err != nil {
   125  		mm.mappingMu.Unlock()
   126  		return 0, err
   127  	}
   128  
   129  	// TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new
   130  	// vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
   131  	// to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
   132  	// mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
   133  	// populate_vma_page_range(). Confirm this behavior.
   134  	switch {
   135  	case opts.Precommit || opts.MLockMode == memmap.MLockEager:
   136  		// Get pmas and map with precommit as requested.
   137  		mm.populateVMAAndUnlock(ctx, vseg, ar, true)
   138  
   139  	case opts.Mappable == nil && length <= privateAllocUnit:
   140  		// NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope
   141  		// that doing so will save on future page faults. We only do this for
   142  		// anonymous mappings, since otherwise the cost of
   143  		// memmap.Mappable.Translate is unknown; and only for small mappings,
   144  		// to avoid needing to allocate large amounts of memory that we may
   145  		// subsequently need to checkpoint.
   146  		mm.populateVMAAndUnlock(ctx, vseg, ar, false)
   147  
   148  	default:
   149  		mm.mappingMu.Unlock()
   150  	}
   151  
   152  	return ar.Start, nil
   153  }
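
        // A minimal usage sketch built only from fields referenced above: an
        // anonymous private mapping, analogous to
        // mmap(NULL, n, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0).
        // n is an assumed length supplied by the caller.
        //
        //	addr, err := mm.MMap(ctx, memmap.MMapOpts{
        //		Length:   n, // rounded up to a page multiple by MMap
        //		Perms:    hostarch.ReadWrite,
        //		MaxPerms: hostarch.AnyAccess,
        //		Private:  true, // nil Mappable => anonymous memory
        //	})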
   154  
   155  // populateVMA obtains pmas for addresses in ar in the given vma, and maps them
   156  // into mm.as if it is active.
   157  //
   158  // Preconditions:
   159  // * mm.mappingMu must be locked.
   160  // * vseg.Range().IsSupersetOf(ar).
   161  func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) {
   162  	if !vseg.ValuePtr().effectivePerms.Any() {
   163  		// Linux doesn't populate inaccessible pages. See
   164  		// mm/gup.c:populate_vma_page_range.
   165  		return
   166  	}
   167  
   168  	mm.activeMu.Lock()
   169  	// Can't defer mm.activeMu.Unlock(); see below.
   170  
   171  	// Even if we get new pmas, we can't actually map them if we don't have an
   172  	// AddressSpace.
   173  	if mm.as == nil {
   174  		mm.activeMu.Unlock()
   175  		return
   176  	}
   177  
   178  	// Ensure that we have usable pmas.
   179  	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess)
   180  	if err != nil {
   181  		// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
   182  		// mm/gup.c:mm_populate(). If it matters, we'll get it again when
   183  		// userspace actually tries to use the failing page.
   184  		mm.activeMu.Unlock()
   185  		return
   186  	}
   187  
   188  	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
   189  	// anymore.
   190  	mm.activeMu.DowngradeLock()
   191  
   192  	// As above, errors are silently ignored.
   193  	mm.mapASLocked(pseg, ar, precommit)
   194  	mm.activeMu.RUnlock()
   195  }
   196  
   197  // populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
   198  // unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
   199  // preferable to populateVMA since it unlocks mm.mappingMu before performing
   200  // expensive operations that don't require it to be locked.
   201  //
   202  // Preconditions:
   203  // * mm.mappingMu must be locked for writing.
   204  // * vseg.Range().IsSupersetOf(ar).
   205  //
   206  // Postconditions: mm.mappingMu will be unlocked.
   207  // +checklocksrelease:mm.mappingMu
   208  func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) {
   209  	// See populateVMA above for commentary.
   210  	if !vseg.ValuePtr().effectivePerms.Any() {
   211  		mm.mappingMu.Unlock()
   212  		return
   213  	}
   214  
   215  	mm.activeMu.Lock()
   216  
   217  	if mm.as == nil {
   218  		mm.activeMu.Unlock()
   219  		mm.mappingMu.Unlock()
   220  		return
   221  	}
   222  
   223  	// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
   224  	// isn't needed at all for mapASLocked.
   225  	mm.mappingMu.DowngradeLock()
   226  	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess)
   227  	mm.mappingMu.RUnlock()
   228  	if err != nil {
   229  		mm.activeMu.Unlock()
   230  		return
   231  	}
   232  
   233  	mm.activeMu.DowngradeLock()
   234  	mm.mapASLocked(pseg, ar, precommit)
   235  	mm.activeMu.RUnlock()
   236  }
   237  
   238  // MapStack allocates the initial process stack.
   239  func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, error) {
   240  	// maxStackSize is the maximum supported process stack size in bytes.
   241  	//
   242  	// This limit exists because stack growing isn't implemented, so the entire
   243  	// process stack must be mapped up-front.
   244  	const maxStackSize = 128 << 20
   245  
   246  	stackSize := limits.FromContext(ctx).Get(limits.Stack)
   247  	r, ok := hostarch.Addr(stackSize.Cur).RoundUp()
   248  	sz := uint64(r)
   249  	if !ok {
   250  		// RLIM_INFINITY rounds up to 0.
   251  		sz = linux.DefaultStackSoftLimit
   252  	} else if sz > maxStackSize {
   253  		ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
   254  		sz = maxStackSize
   255  	} else if sz == 0 {
   256  		return hostarch.AddrRange{}, syserror.ENOMEM
   257  	}
   258  	szaddr := hostarch.Addr(sz)
   259  	ctx.Debugf("Allocating stack with size of %v bytes", sz)
   260  
   261  	// Determine the stack's desired location. Unlike Linux, address
   262  	// randomization can't be disabled.
   263  	stackEnd := mm.layout.MaxAddr - hostarch.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
   264  	if stackEnd < szaddr {
   265  		return hostarch.AddrRange{}, syserror.ENOMEM
   266  	}
   267  	stackStart := stackEnd - szaddr
   268  	mm.mappingMu.Lock()
   269  	defer mm.mappingMu.Unlock()
   270  	_, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
   271  		Length:    sz,
   272  		Addr:      stackStart,
   273  		Perms:     hostarch.ReadWrite,
   274  		MaxPerms:  hostarch.AnyAccess,
   275  		Private:   true,
   276  		GrowsDown: true,
   277  		MLockMode: mm.defMLockMode,
   278  		Hint:      "[stack]",
   279  	})
   280  	return ar, err
   281  }
   282  
   283  // MUnmap implements the semantics of Linux's munmap(2).
   284  func (mm *MemoryManager) MUnmap(ctx context.Context, addr hostarch.Addr, length uint64) error {
   285  	if addr != addr.RoundDown() {
   286  		return linuxerr.EINVAL
   287  	}
   288  	if length == 0 {
   289  		return linuxerr.EINVAL
   290  	}
   291  	la, ok := hostarch.Addr(length).RoundUp()
   292  	if !ok {
   293  		return linuxerr.EINVAL
   294  	}
   295  	ar, ok := addr.ToRange(uint64(la))
   296  	if !ok {
   297  		return linuxerr.EINVAL
   298  	}
   299  
   300  	mm.mappingMu.Lock()
   301  	defer mm.mappingMu.Unlock()
   302  	mm.unmapLocked(ctx, ar)
   303  	return nil
   304  }
   305  
   306  // MRemapOpts specifies options to MRemap.
   307  type MRemapOpts struct {
   308  	// Move controls whether MRemap moves the remapped mapping to a new address.
   309  	Move MRemapMoveMode
   310  
   311  	// NewAddr is the new address for the remapping. NewAddr is ignored unless
   312  	// Move is MMRemapMustMove.
   313  	NewAddr hostarch.Addr
   314  }
   315  
   316  // MRemapMoveMode controls MRemap's moving behavior.
   317  type MRemapMoveMode int
   318  
   319  const (
   320  	// MRemapNoMove prevents MRemap from moving the remapped mapping.
   321  	MRemapNoMove MRemapMoveMode = iota
   322  
   323  	// MRemapMayMove allows MRemap to move the remapped mapping.
   324  	MRemapMayMove
   325  
   326  	// MRemapMustMove requires MRemap to move the remapped mapping to
   327  	// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
   328  	// range.
   329  	MRemapMustMove
   330  )
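
        // A sketch of how a syscall layer might derive MRemapOpts from the
        // mremap(2) flags MREMAP_MAYMOVE and MREMAP_FIXED; the flag constants
        // are assumed to be exposed by the abi/linux package under their Linux
        // names, and flags/newAddr are assumed caller-provided values.
        //
        //	opts := MRemapOpts{Move: MRemapNoMove}
        //	if flags&linux.MREMAP_MAYMOVE != 0 {
        //		opts.Move = MRemapMayMove
        //	}
        //	if flags&linux.MREMAP_FIXED != 0 {
        //		opts.Move = MRemapMustMove
        //		opts.NewAddr = newAddr
        //	}
        //	movedAddr, err := mm.MRemap(ctx, oldAddr, oldSize, newSize, opts)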
   331  
   332  // MRemap implements the semantics of Linux's mremap(2).
   333  func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (hostarch.Addr, error) {
   334  	// "Note that old_address has to be page aligned." - mremap(2)
   335  	if oldAddr.RoundDown() != oldAddr {
   336  		return 0, linuxerr.EINVAL
   337  	}
   338  
   339  	// Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
   340  	// valid size. However, new_size can't be 0 after rounding.
   341  	oldSizeAddr, _ := hostarch.Addr(oldSize).RoundUp()
   342  	oldSize = uint64(oldSizeAddr)
   343  	newSizeAddr, ok := hostarch.Addr(newSize).RoundUp()
   344  	if !ok || newSizeAddr == 0 {
   345  		return 0, linuxerr.EINVAL
   346  	}
   347  	newSize = uint64(newSizeAddr)
   348  
   349  	oldEnd, ok := oldAddr.AddLength(oldSize)
   350  	if !ok {
   351  		return 0, linuxerr.EINVAL
   352  	}
   353  
   354  	mm.mappingMu.Lock()
   355  	defer mm.mappingMu.Unlock()
   356  
   357  	// All cases require that a vma exists at oldAddr.
   358  	vseg := mm.vmas.FindSegment(oldAddr)
   359  	if !vseg.Ok() {
   360  		return 0, syserror.EFAULT
   361  	}
   362  
   363  	// Behavior matrix:
   364  	//
   365  	// Move     | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize
   366  	// ---------+-------------+-------------------+-------------------+------------------
   367  	//   NoMove | ENOMEM [1]  | Grow in-place     | No-op             | Shrink in-place
   368  	//  MayMove | Copy [1]    | Grow in-place or  | No-op             | Shrink in-place
   369  	//          |             |   move            |                   |
   370  	// MustMove | Copy        | Move and grow     | Move              | Shrink and move
   371  	//
   372  	// [1] In-place growth is impossible because the vma at oldAddr already
   373  	// occupies at least part of the destination. Thus the NoMove case always
   374  	// fails and the MayMove case always falls back to copying.
   375  
   376  	if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
   377  		// Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
   378  		// mremap in Linux does not check mm/mlock.c:can_do_mlock() and
   379  		// therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
   380  		// !CAP_IPC_LOCK.
   381  		mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
   382  		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
   383  			if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
   384  				return 0, linuxerr.EAGAIN
   385  			}
   386  		}
   387  	}
   388  
   389  	if opts.Move != MRemapMustMove {
   390  		// Handle no-ops and in-place shrinking. These cases don't care if
   391  		// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
   392  		// (aside from oldAddr).
   393  		if newSize <= oldSize {
   394  			if newSize < oldSize {
   395  				// If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
   396  				// either.
   397  				newEnd := oldAddr + hostarch.Addr(newSize)
   398  				mm.unmapLocked(ctx, hostarch.AddrRange{newEnd, oldEnd})
   399  			}
   400  			return oldAddr, nil
   401  		}
   402  
   403  		// Handle in-place growing.
   404  
   405  		// Check that oldEnd maps to the same vma as oldAddr.
   406  		if vseg.End() < oldEnd {
   407  			return 0, syserror.EFAULT
   408  		}
   409  		// "Grow" the existing vma by creating a new mergeable one.
   410  		vma := vseg.ValuePtr()
   411  		var newOffset uint64
   412  		if vma.mappable != nil {
   413  			newOffset = vseg.mappableRange().End
   414  		}
   415  		vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
   416  			Length:          newSize - oldSize,
   417  			MappingIdentity: vma.id,
   418  			Mappable:        vma.mappable,
   419  			Offset:          newOffset,
   420  			Addr:            oldEnd,
   421  			Fixed:           true,
   422  			Perms:           vma.realPerms,
   423  			MaxPerms:        vma.maxPerms,
   424  			Private:         vma.private,
   425  			GrowsDown:       vma.growsDown,
   426  			MLockMode:       vma.mlockMode,
   427  			Hint:            vma.hint,
   428  		})
   429  		if err == nil {
   430  			if vma.mlockMode == memmap.MLockEager {
   431  				mm.populateVMA(ctx, vseg, ar, true)
   432  			}
   433  			return oldAddr, nil
   434  		}
   435  		// In-place growth failed. In the MRemapMayMove case, fall through to
   436  		// copying/moving below.
   437  		if opts.Move == MRemapNoMove {
   438  			return 0, err
   439  		}
   440  	}
   441  
   442  	// Find a location for the new mapping.
   443  	var newAR hostarch.AddrRange
   444  	switch opts.Move {
   445  	case MRemapMayMove:
   446  		newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
   447  		if err != nil {
   448  			return 0, err
   449  		}
   450  		newAR, _ = newAddr.ToRange(newSize)
   451  
   452  	case MRemapMustMove:
   453  		newAddr := opts.NewAddr
   454  		if newAddr.RoundDown() != newAddr {
   455  			return 0, linuxerr.EINVAL
   456  		}
   457  		var ok bool
   458  		newAR, ok = newAddr.ToRange(newSize)
   459  		if !ok {
   460  			return 0, linuxerr.EINVAL
   461  		}
   462  		if (hostarch.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
   463  			return 0, linuxerr.EINVAL
   464  		}
   465  
   466  		// Check that the new region is valid.
   467  		_, err := mm.findAvailableLocked(newSize, findAvailableOpts{
   468  			Addr:  newAddr,
   469  			Fixed: true,
   470  			Unmap: true,
   471  		})
   472  		if err != nil {
   473  			return 0, err
   474  		}
   475  
   476  		// Unmap any mappings at the destination.
   477  		mm.unmapLocked(ctx, newAR)
   478  
   479  		// If the sizes specify shrinking, unmap everything between the new and
   480  		// old sizes at the source. Unmapping before the following checks is
   481  		// correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(),
   482  		// vma_to_resize().
   483  		if newSize < oldSize {
   484  			oldNewEnd := oldAddr + hostarch.Addr(newSize)
   485  			mm.unmapLocked(ctx, hostarch.AddrRange{oldNewEnd, oldEnd})
   486  			oldEnd = oldNewEnd
   487  		}
   488  
   489  		// unmapLocked may have invalidated vseg; look it up again.
   490  		vseg = mm.vmas.FindSegment(oldAddr)
   491  	}
   492  
   493  	oldAR := hostarch.AddrRange{oldAddr, oldEnd}
   494  
   495  	// Check that oldEnd maps to the same vma as oldAddr.
   496  	if vseg.End() < oldEnd {
   497  		return 0, syserror.EFAULT
   498  	}
   499  
   500  	// Check against RLIMIT_AS.
   501  	newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   502  	if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
   503  		return 0, syserror.ENOMEM
   504  	}
   505  
   506  	if vma := vseg.ValuePtr(); vma.mappable != nil {
   507  		// Check that offset+length does not overflow.
   508  		if vma.off+uint64(newAR.Length()) < vma.off {
   509  			return 0, linuxerr.EINVAL
   510  		}
   511  		// Inform the Mappable, if any, of the new mapping.
   512  		if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil {
   513  			return 0, err
   514  		}
   515  	}
   516  
   517  	if oldSize == 0 {
   518  		// Handle copying.
   519  		//
   520  		// We can't use createVMALocked because it calls Mappable.AddMapping,
   521  		// whereas we've already called Mappable.CopyMapping (which is
   522  		// consistent with Linux). Call vseg.Value() (rather than
   523  		// vseg.ValuePtr()) to make a copy of the vma.
   524  		vma := vseg.Value()
   525  		if vma.mappable != nil {
   526  			vma.off = vseg.mappableOffsetAt(oldAR.Start)
   527  		}
   528  		if vma.id != nil {
   529  			vma.id.IncRef()
   530  		}
   531  		vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
   532  		mm.usageAS += uint64(newAR.Length())
   533  		if vma.isPrivateDataLocked() {
   534  			mm.dataAS += uint64(newAR.Length())
   535  		}
   536  		if vma.mlockMode != memmap.MLockNone {
   537  			mm.lockedAS += uint64(newAR.Length())
   538  			if vma.mlockMode == memmap.MLockEager {
   539  				mm.populateVMA(ctx, vseg, newAR, true)
   540  			}
   541  		}
   542  		return newAR.Start, nil
   543  	}
   544  
   545  	// Handle moving.
   546  	//
   547  	// Remove the existing vma before inserting the new one to minimize
   548  	// iterator invalidation. We do this directly (instead of calling
   549  	// removeVMAsLocked) because:
   550  	//
   551  	// 1. We can't drop the reference on vma.id, which will be transferred to
   552  	// the new vma.
   553  	//
   554  	// 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
   555  	// oldAR, so calling RemoveMapping could cause us to miss an invalidation
   556  	// overlapping oldAR.
   557  	//
   558  	// Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the
   559  	// vma.
   560  	vseg = mm.vmas.Isolate(vseg, oldAR)
   561  	vma := vseg.Value()
   562  	mm.vmas.Remove(vseg)
   563  	vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
   564  	mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   565  	if vma.isPrivateDataLocked() {
   566  		mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   567  	}
   568  	if vma.mlockMode != memmap.MLockNone {
   569  		mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   570  	}
   571  
   572  	// Move pmas. This is technically optional for non-private pmas, which
   573  	// could just go through memmap.Mappable.Translate again, but it's required
   574  	// for private pmas.
   575  	mm.activeMu.Lock()
   576  	mm.movePMAsLocked(oldAR, newAR)
   577  	mm.activeMu.Unlock()
   578  
   579  	// Now that pmas have been moved to newAR, we can notify vma.mappable that
   580  	// oldAR is no longer mapped.
   581  	if vma.mappable != nil {
   582  		vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked())
   583  	}
   584  
   585  	if vma.mlockMode == memmap.MLockEager {
   586  		mm.populateVMA(ctx, vseg, newAR, true)
   587  	}
   588  
   589  	return newAR.Start, nil
   590  }
   591  
   592  // MProtect implements the semantics of Linux's mprotect(2).
   593  func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms hostarch.AccessType, growsDown bool) error {
   594  	if addr.RoundDown() != addr {
   595  		return linuxerr.EINVAL
   596  	}
   597  	if length == 0 {
   598  		return nil
   599  	}
   600  	rlength, ok := hostarch.Addr(length).RoundUp()
   601  	if !ok {
   602  		return syserror.ENOMEM
   603  	}
   604  	ar, ok := addr.ToRange(uint64(rlength))
   605  	if !ok {
   606  		return syserror.ENOMEM
   607  	}
   608  	effectivePerms := realPerms.Effective()
   609  
   610  	mm.mappingMu.Lock()
   611  	defer mm.mappingMu.Unlock()
   612  	// Non-growsDown mprotect requires that all of ar is mapped, and stops at
   613  	// the first non-empty gap. growsDown mprotect requires that the first vma
   614  	// be growsDown, but does not require it to extend all the way to ar.Start;
   615  	// vmas after the first must be contiguous but need not be growsDown, like
   616  	// the non-growsDown case.
   617  	vseg := mm.vmas.LowerBoundSegment(ar.Start)
   618  	if !vseg.Ok() {
   619  		return syserror.ENOMEM
   620  	}
   621  	if growsDown {
   622  		if !vseg.ValuePtr().growsDown {
   623  			return linuxerr.EINVAL
   624  		}
   625  		if ar.End <= vseg.Start() {
   626  			return syserror.ENOMEM
   627  		}
   628  		ar.Start = vseg.Start()
   629  	} else {
   630  		if ar.Start < vseg.Start() {
   631  			return syserror.ENOMEM
   632  		}
   633  	}
   634  
   635  	mm.activeMu.Lock()
   636  	defer mm.activeMu.Unlock()
   637  	defer func() {
   638  		mm.vmas.MergeRange(ar)
   639  		mm.vmas.MergeAdjacent(ar)
   640  		mm.pmas.MergeRange(ar)
   641  		mm.pmas.MergeAdjacent(ar)
   642  	}()
   643  	pseg := mm.pmas.LowerBoundSegment(ar.Start)
   644  	var didUnmapAS bool
   645  	for {
   646  		// Check for permission validity before splitting vmas, for consistency
   647  		// with Linux.
   648  		if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
   649  			return linuxerr.EACCES
   650  		}
   651  		vseg = mm.vmas.Isolate(vseg, ar)
   652  
   653  		// Update vma permissions.
   654  		vma := vseg.ValuePtr()
   655  		vmaLength := vseg.Range().Length()
   656  		if vma.isPrivateDataLocked() {
   657  			mm.dataAS -= uint64(vmaLength)
   658  		}
   659  
   660  		vma.realPerms = realPerms
   661  		vma.effectivePerms = effectivePerms
   662  		if vma.isPrivateDataLocked() {
   663  			mm.dataAS += uint64(vmaLength)
   664  		}
   665  
   666  		// Propagate vma permission changes to pmas.
   667  		for pseg.Ok() && pseg.Start() < vseg.End() {
   668  			if pseg.Range().Overlaps(vseg.Range()) {
   669  				pseg = mm.pmas.Isolate(pseg, vseg.Range())
   670  				pma := pseg.ValuePtr()
   671  				if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS {
   672  					// Unmap all of ar, not just vseg.Range(), to minimize host
   673  					// syscalls.
   674  					mm.unmapASLocked(ar)
   675  					didUnmapAS = true
   676  				}
   677  				pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms)
   678  				if pma.needCOW {
   679  					pma.effectivePerms.Write = false
   680  				}
   681  			}
   682  			pseg = pseg.NextSegment()
   683  		}
   684  
   685  		// Continue to the next vma.
   686  		if ar.End <= vseg.End() {
   687  			return nil
   688  		}
   689  		vseg, _ = vseg.NextNonEmpty()
   690  		if !vseg.Ok() {
   691  			return syserror.ENOMEM
   692  		}
   693  	}
   694  }
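
        // A sketch of deriving MProtect's arguments from mprotect(2)'s prot
        // bits; the PROT_* constants are assumed to be exposed by the
        // abi/linux package, and prot/addr/length are assumed caller values.
        //
        //	realPerms := hostarch.AccessType{
        //		Read:    prot&linux.PROT_READ != 0,
        //		Write:   prot&linux.PROT_WRITE != 0,
        //		Execute: prot&linux.PROT_EXEC != 0,
        //	}
        //	err := mm.MProtect(addr, length, realPerms, prot&linux.PROT_GROWSDOWN != 0)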
   695  
   696  // BrkSetup sets mm's brk address to addr and its brk size to 0.
   697  func (mm *MemoryManager) BrkSetup(ctx context.Context, addr hostarch.Addr) {
   698  	mm.mappingMu.Lock()
   699  	defer mm.mappingMu.Unlock()
   700  	// Unmap the existing brk.
   701  	if mm.brk.Length() != 0 {
   702  		mm.unmapLocked(ctx, mm.brk)
   703  	}
   704  	mm.brk = hostarch.AddrRange{addr, addr}
   705  }
   706  
   707  // Brk implements the semantics of Linux's brk(2), except that it returns an
   708  // error on failure.
   709  func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch.Addr, error) {
   710  	mm.mappingMu.Lock()
   711  	// Can't defer mm.mappingMu.Unlock(); see below.
   712  
   713  	if addr < mm.brk.Start {
   714  		addr = mm.brk.End
   715  		mm.mappingMu.Unlock()
   716  		return addr, linuxerr.EINVAL
   717  	}
   718  
   719  	// TODO(github.com/SagerNet/issue/156): This enforces RLIMIT_DATA, but is
   720  	// slightly more permissive than the usual data limit. In particular,
   721  	// this only limits the size of the heap; a true RLIMIT_DATA limits the
   722  	// size of heap + data + bss. The segment sizes need to be plumbed from
   723  	// the loader package to fully enforce RLIMIT_DATA.
   724  	if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
   725  		addr = mm.brk.End
   726  		mm.mappingMu.Unlock()
   727  		return addr, syserror.ENOMEM
   728  	}
   729  
   730  	oldbrkpg, _ := mm.brk.End.RoundUp()
   731  	newbrkpg, ok := addr.RoundUp()
   732  	if !ok {
   733  		addr = mm.brk.End
   734  		mm.mappingMu.Unlock()
   735  		return addr, syserror.EFAULT
   736  	}
   737  
   738  	switch {
   739  	case oldbrkpg < newbrkpg:
   740  		vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
   741  			Length: uint64(newbrkpg - oldbrkpg),
   742  			Addr:   oldbrkpg,
   743  			Fixed:  true,
   744  			// Compare Linux's
   745  			// arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
   746  			Perms:    hostarch.ReadWrite,
   747  			MaxPerms: hostarch.AnyAccess,
   748  			Private:  true,
   749  			// Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
   750  			// mm->def_flags.
   751  			MLockMode: mm.defMLockMode,
   752  			Hint:      "[heap]",
   753  		})
   754  		if err != nil {
   755  			addr = mm.brk.End
   756  			mm.mappingMu.Unlock()
   757  			return addr, err
   758  		}
   759  		mm.brk.End = addr
   760  		if mm.defMLockMode == memmap.MLockEager {
   761  			mm.populateVMAAndUnlock(ctx, vseg, ar, true)
   762  		} else {
   763  			mm.mappingMu.Unlock()
   764  		}
   765  
   766  	case newbrkpg < oldbrkpg:
   767  		mm.unmapLocked(ctx, hostarch.AddrRange{newbrkpg, oldbrkpg})
   768  		fallthrough
   769  
   770  	default:
   771  		mm.brk.End = addr
   772  		mm.mappingMu.Unlock()
   773  	}
   774  
   775  	return addr, nil
   776  }
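
        // Because brk(2) reports failure only by returning the old program
        // break, a caller would typically discard the error and hand whatever
        // address Brk returns back to userspace; a minimal sketch:
        //
        //	newBrk, _ := mm.Brk(ctx, addr)
        //	// newBrk, not the error, is what the application sees as brk(2)'s result.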
   777  
   778  // MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
   779  // depending on mode.
   780  func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length uint64, mode memmap.MLockMode) error {
   781  	// Linux allows this to overflow.
   782  	la, _ := hostarch.Addr(length + addr.PageOffset()).RoundUp()
   783  	ar, ok := addr.RoundDown().ToRange(uint64(la))
   784  	if !ok {
   785  		return linuxerr.EINVAL
   786  	}
   787  
   788  	mm.mappingMu.Lock()
   789  	// Can't defer mm.mappingMu.Unlock(); see below.
   790  
   791  	if mode != memmap.MLockNone {
   792  		// Check against RLIMIT_MEMLOCK.
   793  		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
   794  			mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
   795  			if mlockLimit == 0 {
   796  				mm.mappingMu.Unlock()
   797  				return linuxerr.EPERM
   798  			}
   799  			if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
   800  				mm.mappingMu.Unlock()
   801  				return syserror.ENOMEM
   802  			}
   803  		}
   804  	}
   805  
   806  	// Check this after RLIMIT_MEMLOCK for consistency with Linux.
   807  	if ar.Length() == 0 {
   808  		mm.mappingMu.Unlock()
   809  		return nil
   810  	}
   811  
   812  	// Apply the new mlock mode to vmas.
   813  	var unmapped bool
   814  	vseg := mm.vmas.FindSegment(ar.Start)
   815  	for {
   816  		if !vseg.Ok() {
   817  			unmapped = true
   818  			break
   819  		}
   820  		vseg = mm.vmas.Isolate(vseg, ar)
   821  		vma := vseg.ValuePtr()
   822  		prevMode := vma.mlockMode
   823  		vma.mlockMode = mode
   824  		if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
   825  			mm.lockedAS += uint64(vseg.Range().Length())
   826  		} else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
   827  			mm.lockedAS -= uint64(vseg.Range().Length())
   828  		}
   829  		if ar.End <= vseg.End() {
   830  			break
   831  		}
   832  		vseg, _ = vseg.NextNonEmpty()
   833  	}
   834  	mm.vmas.MergeRange(ar)
   835  	mm.vmas.MergeAdjacent(ar)
   836  	if unmapped {
   837  		mm.mappingMu.Unlock()
   838  		return syserror.ENOMEM
   839  	}
   840  
   841  	if mode == memmap.MLockEager {
   842  		// Ensure that we have usable pmas. Since we didn't return ENOMEM
   843  		// above, ar must be fully covered by vmas, so we can just use
   844  		// NextSegment below.
   845  		mm.activeMu.Lock()
   846  		mm.mappingMu.DowngradeLock()
   847  		for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
   848  			if !vseg.ValuePtr().effectivePerms.Any() {
   849  				// Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
   850  				// case, which is converted to ENOMEM by mlock.
   851  				mm.activeMu.Unlock()
   852  				mm.mappingMu.RUnlock()
   853  				return syserror.ENOMEM
   854  			}
   855  			_, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), hostarch.NoAccess)
   856  			if err != nil {
   857  				mm.activeMu.Unlock()
   858  				mm.mappingMu.RUnlock()
   859  				// Linux: mm/mlock.c:__mlock_posix_error_return()
   860  				if linuxerr.Equals(linuxerr.EFAULT, err) {
   861  					return syserror.ENOMEM
   862  				}
   863  				if linuxerr.Equals(linuxerr.ENOMEM, err) {
   864  					return linuxerr.EAGAIN
   865  				}
   866  				return err
   867  			}
   868  		}
   869  
   870  		// Map pmas into the active AddressSpace, if we have one.
   871  		mm.mappingMu.RUnlock()
   872  		if mm.as != nil {
   873  			mm.activeMu.DowngradeLock()
   874  			err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
   875  			mm.activeMu.RUnlock()
   876  			if err != nil {
   877  				return err
   878  			}
   879  		} else {
   880  			mm.activeMu.Unlock()
   881  		}
   882  	} else {
   883  		mm.mappingMu.Unlock()
   884  	}
   885  
   886  	return nil
   887  }
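
        // A sketch of choosing the MLockMode for mlock(2)/mlock2(2); the
        // MLOCK_ONFAULT flag constant and memmap.MLockLazy are assumptions
        // about the surrounding packages.
        //
        //	mode := memmap.MLockEager
        //	if flags&linux.MLOCK_ONFAULT != 0 {
        //		mode = memmap.MLockLazy
        //	}
        //	err := mm.MLock(ctx, addr, length, mode)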
   888  
   889  // MLockAllOpts holds options to MLockAll.
   890  type MLockAllOpts struct {
   891  	// If Current is true, change the memory-locking behavior of all mappings
   892  	// to Mode. If Future is true, upgrade the memory-locking behavior of all
   893  	// future mappings to Mode. At least one of Current or Future must be true.
   894  	Current bool
   895  	Future  bool
   896  	Mode    memmap.MLockMode
   897  }
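
        // A sketch of how mlockall(2) flags might populate MLockAllOpts; the
        // MCL_* constants and memmap.MLockLazy are assumptions about the
        // surrounding packages.
        //
        //	opts := MLockAllOpts{
        //		Current: flags&linux.MCL_CURRENT != 0,
        //		Future:  flags&linux.MCL_FUTURE != 0,
        //		Mode:    memmap.MLockEager,
        //	}
        //	if flags&linux.MCL_ONFAULT != 0 {
        //		opts.Mode = memmap.MLockLazy
        //	}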
   898  
   899  // MLockAll implements the semantics of Linux's mlockall()/munlockall(),
   900  // depending on opts.
   901  func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
   902  	if !opts.Current && !opts.Future {
   903  		return linuxerr.EINVAL
   904  	}
   905  
   906  	mm.mappingMu.Lock()
   907  	// Can't defer mm.mappingMu.Unlock(); see below.
   908  
   909  	if opts.Current {
   910  		if opts.Mode != memmap.MLockNone {
   911  			// Check against RLIMIT_MEMLOCK.
   912  			if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
   913  				mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
   914  				if mlockLimit == 0 {
   915  					mm.mappingMu.Unlock()
   916  					return linuxerr.EPERM
   917  				}
   918  				if uint64(mm.vmas.Span()) > mlockLimit {
   919  					mm.mappingMu.Unlock()
   920  					return syserror.ENOMEM
   921  				}
   922  			}
   923  		}
   924  		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
   925  			vma := vseg.ValuePtr()
   926  			prevMode := vma.mlockMode
   927  			vma.mlockMode = opts.Mode
   928  			if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
   929  				mm.lockedAS += uint64(vseg.Range().Length())
   930  			} else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
   931  				mm.lockedAS -= uint64(vseg.Range().Length())
   932  			}
   933  		}
   934  	}
   935  
   936  	if opts.Future {
   937  		mm.defMLockMode = opts.Mode
   938  	}
   939  
   940  	if opts.Current && opts.Mode == memmap.MLockEager {
   941  		// Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
   942  		// ignores the return value of __mm_populate(), so all errors below are
   943  		// ignored.
   944  		//
   945  		// Try to get usable pmas.
   946  		mm.activeMu.Lock()
   947  		mm.mappingMu.DowngradeLock()
   948  		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
   949  			if vseg.ValuePtr().effectivePerms.Any() {
   950  				mm.getPMAsLocked(ctx, vseg, vseg.Range(), hostarch.NoAccess)
   951  			}
   952  		}
   953  
   954  		// Map all pmas into the active AddressSpace, if we have one.
   955  		mm.mappingMu.RUnlock()
   956  		if mm.as != nil {
   957  			mm.activeMu.DowngradeLock()
   958  			mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
   959  			mm.activeMu.RUnlock()
   960  		} else {
   961  			mm.activeMu.Unlock()
   962  		}
   963  	} else {
   964  		mm.mappingMu.Unlock()
   965  	}
   966  	return nil
   967  }
   968  
   969  // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
   970  func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint64, error) {
   971  	mm.mappingMu.RLock()
   972  	defer mm.mappingMu.RUnlock()
   973  	vseg := mm.vmas.FindSegment(addr)
   974  	if !vseg.Ok() {
   975  		return 0, 0, syserror.EFAULT
   976  	}
   977  	vma := vseg.ValuePtr()
   978  	return vma.numaPolicy, vma.numaNodemask, nil
   979  }
   980  
   981  // SetNumaPolicy implements the semantics of Linux's mbind().
   982  func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
   983  	if !addr.IsPageAligned() {
   984  		return linuxerr.EINVAL
   985  	}
   986  	// Linux allows this to overflow.
   987  	la, _ := hostarch.Addr(length).RoundUp()
   988  	ar, ok := addr.ToRange(uint64(la))
   989  	if !ok {
   990  		return linuxerr.EINVAL
   991  	}
   992  	if ar.Length() == 0 {
   993  		return nil
   994  	}
   995  
   996  	mm.mappingMu.Lock()
   997  	defer mm.mappingMu.Unlock()
   998  	defer func() {
   999  		mm.vmas.MergeRange(ar)
  1000  		mm.vmas.MergeAdjacent(ar)
  1001  	}()
  1002  	vseg := mm.vmas.LowerBoundSegment(ar.Start)
  1003  	lastEnd := ar.Start
  1004  	for {
  1005  		if !vseg.Ok() || lastEnd < vseg.Start() {
  1006  			// "EFAULT: ... there was an unmapped hole in the specified memory
  1007  			// range specified [sic] by addr and len." - mbind(2)
  1008  			return syserror.EFAULT
  1009  		}
  1010  		vseg = mm.vmas.Isolate(vseg, ar)
  1011  		vma := vseg.ValuePtr()
  1012  		vma.numaPolicy = policy
  1013  		vma.numaNodemask = nodemask
  1014  		lastEnd = vseg.End()
  1015  		if ar.End <= lastEnd {
  1016  			return nil
  1017  		}
  1018  		vseg, _ = vseg.NextNonEmpty()
  1019  	}
  1020  }
  1021  
  1022  // SetDontFork implements the semantics of madvise MADV_DONTFORK.
  1023  func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork bool) error {
  1024  	ar, ok := addr.ToRange(length)
  1025  	if !ok {
  1026  		return linuxerr.EINVAL
  1027  	}
  1028  
  1029  	mm.mappingMu.Lock()
  1030  	defer mm.mappingMu.Unlock()
  1031  	defer func() {
  1032  		mm.vmas.MergeRange(ar)
  1033  		mm.vmas.MergeAdjacent(ar)
  1034  	}()
  1035  
  1036  	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
  1037  		vseg = mm.vmas.Isolate(vseg, ar)
  1038  		vma := vseg.ValuePtr()
  1039  		vma.dontfork = dontfork
  1040  	}
  1041  
  1042  	if mm.vmas.SpanRange(ar) != ar.Length() {
  1043  		return syserror.ENOMEM
  1044  	}
  1045  	return nil
  1046  }
  1047  
  1048  // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
  1049  func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error {
  1050  	ar, ok := addr.ToRange(length)
  1051  	if !ok {
  1052  		return linuxerr.EINVAL
  1053  	}
  1054  
  1055  	mm.mappingMu.RLock()
  1056  	defer mm.mappingMu.RUnlock()
  1057  	mm.activeMu.Lock()
  1058  	defer mm.activeMu.Unlock()
  1059  
  1060  	// This is invalidateLocked(invalidatePrivate=true, invalidateShared=true),
  1061  	// with the additional wrinkle that we must refuse to invalidate pmas under
  1062  	// mlocked vmas.
  1063  	var didUnmapAS bool
  1064  	pseg := mm.pmas.LowerBoundSegment(ar.Start)
  1065  	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
  1066  		vma := vseg.ValuePtr()
  1067  		if vma.mlockMode != memmap.MLockNone {
  1068  			return linuxerr.EINVAL
  1069  		}
  1070  		vsegAR := vseg.Range().Intersect(ar)
  1071  		// pseg should already correspond to either this vma or a later one,
  1072  		// since there can't be a pma without a corresponding vma.
  1073  		if checkInvariants {
  1074  			if pseg.Ok() && pseg.End() <= vsegAR.Start {
  1075  				panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
  1076  			}
  1077  		}
  1078  		for pseg.Ok() && pseg.Start() < vsegAR.End {
  1079  			pseg = mm.pmas.Isolate(pseg, vsegAR)
  1080  			pma := pseg.ValuePtr()
  1081  			if !didUnmapAS {
  1082  				// Unmap all of ar, not just pseg.Range(), to minimize host
  1083  				// syscalls. AddressSpace mappings must be removed before
  1084  				// mm.decPrivateRef().
  1085  				mm.unmapASLocked(ar)
  1086  				didUnmapAS = true
  1087  			}
  1088  			if pma.private {
  1089  				mm.decPrivateRef(pseg.fileRange())
  1090  			}
  1091  			pma.file.DecRef(pseg.fileRange())
  1092  			mm.removeRSSLocked(pseg.Range())
  1093  			pseg = mm.pmas.Remove(pseg).NextSegment()
  1094  		}
  1095  	}
  1096  
  1097  	// "If there are some parts of the specified address space that are not
  1098  	// mapped, the Linux version of madvise() ignores them and applies the call
  1099  	// to the rest (but returns ENOMEM from the system call, as it should)." -
  1100  	// madvise(2)
  1101  	if mm.vmas.SpanRange(ar) != ar.Length() {
  1102  		return syserror.ENOMEM
  1103  	}
  1104  	return nil
  1105  }
  1106  
  1107  // MSyncOpts holds options to MSync.
  1108  type MSyncOpts struct {
  1109  	// Sync has the semantics of MS_SYNC.
  1110  	Sync bool
  1111  
  1112  	// Invalidate has the semantics of MS_INVALIDATE.
  1113  	Invalidate bool
  1114  }
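
        // A sketch of mapping msync(2) flags onto MSyncOpts; the MS_* constants
        // are assumed to be exposed by the abi/linux package. MS_ASYNC sets
        // neither field in this sketch.
        //
        //	err := mm.MSync(ctx, addr, length, MSyncOpts{
        //		Sync:       flags&linux.MS_SYNC != 0,
        //		Invalidate: flags&linux.MS_INVALIDATE != 0,
        //	})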
  1115  
  1116  // MSync implements the semantics of Linux's msync().
  1117  func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length uint64, opts MSyncOpts) error {
  1118  	if addr != addr.RoundDown() {
  1119  		return linuxerr.EINVAL
  1120  	}
  1121  	if length == 0 {
  1122  		return nil
  1123  	}
  1124  	la, ok := hostarch.Addr(length).RoundUp()
  1125  	if !ok {
  1126  		return syserror.ENOMEM
  1127  	}
  1128  	ar, ok := addr.ToRange(uint64(la))
  1129  	if !ok {
  1130  		return syserror.ENOMEM
  1131  	}
  1132  
  1133  	mm.mappingMu.RLock()
  1134  	// Can't defer mm.mappingMu.RUnlock(); see below.
  1135  	vseg := mm.vmas.LowerBoundSegment(ar.Start)
  1136  	if !vseg.Ok() {
  1137  		mm.mappingMu.RUnlock()
  1138  		return syserror.ENOMEM
  1139  	}
  1140  	var unmapped bool
  1141  	lastEnd := ar.Start
  1142  	for {
  1143  		if !vseg.Ok() {
  1144  			mm.mappingMu.RUnlock()
  1145  			unmapped = true
  1146  			break
  1147  		}
  1148  		if lastEnd < vseg.Start() {
  1149  			unmapped = true
  1150  		}
  1151  		lastEnd = vseg.End()
  1152  		vma := vseg.ValuePtr()
  1153  		if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
  1154  			mm.mappingMu.RUnlock()
  1155  			return linuxerr.EBUSY
  1156  		}
  1157  		// It's only possible to have dirtied the Mappable through a shared
  1158  		// mapping. Don't check if the mapping is writable, because mprotect
  1159  		// may have changed this, and also because Linux doesn't.
  1160  		if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
  1161  			// We can't call memmap.MappingIdentity.Msync while holding
  1162  			// mm.mappingMu since it may take fs locks that precede it in the
  1163  			// lock order.
  1164  			id.IncRef()
  1165  			mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
  1166  			mm.mappingMu.RUnlock()
  1167  			err := id.Msync(ctx, mr)
  1168  			id.DecRef(ctx)
  1169  			if err != nil {
  1170  				return err
  1171  			}
  1172  			if lastEnd >= ar.End {
  1173  				break
  1174  			}
  1175  			mm.mappingMu.RLock()
  1176  			vseg = mm.vmas.LowerBoundSegment(lastEnd)
  1177  		} else {
  1178  			if lastEnd >= ar.End {
  1179  				mm.mappingMu.RUnlock()
  1180  				break
  1181  			}
  1182  			vseg = vseg.NextSegment()
  1183  		}
  1184  	}
  1185  
  1186  	if unmapped {
  1187  		return syserror.ENOMEM
  1188  	}
  1189  	return nil
  1190  }
  1191  
  1192  // GetSharedFutexKey is used by kernel.Task.GetSharedKey.
  1193  func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr hostarch.Addr) (futex.Key, error) {
  1194  	ar, ok := addr.ToRange(4) // sizeof(int32).
  1195  	if !ok {
  1196  		return futex.Key{}, syserror.EFAULT
  1197  	}
  1198  
  1199  	mm.mappingMu.RLock()
  1200  	defer mm.mappingMu.RUnlock()
  1201  	vseg, _, err := mm.getVMAsLocked(ctx, ar, hostarch.Read, false)
  1202  	if err != nil {
  1203  		return futex.Key{}, err
  1204  	}
  1205  	vma := vseg.ValuePtr()
  1206  
  1207  	if vma.private {
  1208  		return futex.Key{
  1209  			Kind:   futex.KindSharedPrivate,
  1210  			Offset: uint64(addr),
  1211  		}, nil
  1212  	}
  1213  
  1214  	if vma.id != nil {
  1215  		vma.id.IncRef()
  1216  	}
  1217  	return futex.Key{
  1218  		Kind:            futex.KindSharedMappable,
  1219  		Mappable:        vma.mappable,
  1220  		MappingIdentity: vma.id,
  1221  		Offset:          vseg.mappableOffsetAt(addr),
  1222  	}, nil
  1223  }
  1224  
  1225  // VirtualMemorySize returns the combined length in bytes of all mappings in
  1226  // mm.
  1227  func (mm *MemoryManager) VirtualMemorySize() uint64 {
  1228  	mm.mappingMu.RLock()
  1229  	defer mm.mappingMu.RUnlock()
  1230  	return mm.usageAS
  1231  }
  1232  
  1233  // VirtualMemorySizeRange returns the combined length in bytes of all mappings
  1234  // in ar in mm.
  1235  func (mm *MemoryManager) VirtualMemorySizeRange(ar hostarch.AddrRange) uint64 {
  1236  	mm.mappingMu.RLock()
  1237  	defer mm.mappingMu.RUnlock()
  1238  	return uint64(mm.vmas.SpanRange(ar))
  1239  }
  1240  
  1241  // ResidentSetSize returns the value advertised as mm's RSS in bytes.
  1242  func (mm *MemoryManager) ResidentSetSize() uint64 {
  1243  	mm.activeMu.RLock()
  1244  	defer mm.activeMu.RUnlock()
  1245  	return mm.curRSS
  1246  }
  1247  
  1248  // MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
  1249  func (mm *MemoryManager) MaxResidentSetSize() uint64 {
  1250  	mm.activeMu.RLock()
  1251  	defer mm.activeMu.RUnlock()
  1252  	return mm.maxRSS
  1253  }
  1254  
  1255  // VirtualDataSize returns the size of private data segments in mm.
  1256  func (mm *MemoryManager) VirtualDataSize() uint64 {
  1257  	mm.mappingMu.RLock()
  1258  	defer mm.mappingMu.RUnlock()
  1259  	return mm.dataAS
  1260  }
  1261  
  1262  // EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to
  1263  // return true.
  1264  func (mm *MemoryManager) EnableMembarrierPrivate() {
  1265  	atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1)
  1266  }
  1267  
  1268  // IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has
  1269  // previously been called.
  1270  func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool {
  1271  	return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0
  1272  }
  1273  
  1274  // EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to
  1275  // return true.
  1276  func (mm *MemoryManager) EnableMembarrierRSeq() {
  1277  	atomic.StoreUint32(&mm.membarrierRSeqEnabled, 1)
  1278  }
  1279  
  1280  // IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has
  1281  // previously been called.
  1282  func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool {
  1283  	return atomic.LoadUint32(&mm.membarrierRSeqEnabled) != 0
  1284  }