github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/mm/syscalls.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"fmt"
    19  	mrand "math/rand"
    20  
    21  	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
    22  	"github.com/ttpreport/gvisor-ligolo/pkg/context"
    23  	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
    24  	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
    25  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth"
    26  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/futex"
    27  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/limits"
    28  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/memmap"
    29  )
    30  
    31  // HandleUserFault handles an application page fault. sp is the faulting
    32  // application thread's stack pointer.
    33  //
    34  // Preconditions: mm.as != nil.
    35  func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr, at hostarch.AccessType, sp hostarch.Addr) error {
    36  	ar, ok := addr.RoundDown().ToRange(hostarch.PageSize)
    37  	if !ok {
    38  		return linuxerr.EFAULT
    39  	}
    40  
    41  	// Don't bother trying existingPMAsLocked; in most cases, if we did have
    42  	// existing pmas, we wouldn't have faulted.
    43  
    44  	// Ensure that we have a usable vma. Here and below, since we are only
    45  	// asking for a single page, there is no possibility of partial success,
    46  	// and any error is immediately fatal.
    47  	mm.mappingMu.RLock()
    48  	vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
    49  	if err != nil {
    50  		mm.mappingMu.RUnlock()
    51  		return err
    52  	}
    53  
    54  	// Ensure that we have a usable pma.
    55  	mm.activeMu.Lock()
    56  	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at)
    57  	mm.mappingMu.RUnlock()
    58  	if err != nil {
    59  		mm.activeMu.Unlock()
    60  		return err
    61  	}
    62  
    63  	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
    64  	// anymore.
    65  	mm.activeMu.DowngradeLock()
    66  
    67  	// Map the faulted page into the active AddressSpace.
    68  	err = mm.mapASLocked(pseg, ar, false)
    69  	mm.activeMu.RUnlock()
    70  	return err
    71  }
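
// Illustrative sketch (not code used by the sentry itself): a platform-level
// fault handler would typically forward an application page fault along these
// lines, where ctx, faultAddr, sp and the access bits come from the trap path
// (wasWrite and wasExec are hypothetical names):
//
//	at := hostarch.AccessType{Read: true, Write: wasWrite, Execute: wasExec}
//	if err := mm.HandleUserFault(ctx, faultAddr, at, sp); err != nil {
//		// The caller is responsible for delivering SIGSEGV/SIGBUS to the
//		// faulting task.
//	}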
    72  
    73  // MMap establishes a memory mapping.
    74  func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) {
    75  	if opts.Length == 0 {
    76  		return 0, linuxerr.EINVAL
    77  	}
    78  	length, ok := hostarch.Addr(opts.Length).RoundUp()
    79  	if !ok {
    80  		return 0, linuxerr.ENOMEM
    81  	}
    82  	opts.Length = uint64(length)
    83  
    84  	if opts.Mappable != nil {
    85  		// Offset must be aligned.
    86  		if hostarch.Addr(opts.Offset).RoundDown() != hostarch.Addr(opts.Offset) {
    87  			return 0, linuxerr.EINVAL
    88  		}
    89  		// Offset + length must not overflow.
    90  		if end := opts.Offset + opts.Length; end < opts.Offset {
    91  			return 0, linuxerr.EOVERFLOW
    92  		}
    93  	} else {
    94  		opts.Offset = 0
    95  	}
    96  
    97  	if opts.Addr.RoundDown() != opts.Addr {
    98  		// MAP_FIXED requires addr to be page-aligned; non-fixed mappings
    99  		// don't.
   100  		if opts.Fixed {
   101  			return 0, linuxerr.EINVAL
   102  		}
   103  		opts.Addr = opts.Addr.RoundDown()
   104  	}
   105  
   106  	if !opts.MaxPerms.SupersetOf(opts.Perms) {
   107  		return 0, linuxerr.EACCES
   108  	}
   109  	if opts.Unmap && !opts.Fixed {
   110  		return 0, linuxerr.EINVAL
   111  	}
   112  	if opts.GrowsDown && opts.Mappable != nil {
   113  		return 0, linuxerr.EINVAL
   114  	}
   115  
   116  	// Get the new vma.
   117  	var droppedIDs []memmap.MappingIdentity
   118  	mm.mappingMu.Lock()
   119  	if opts.MLockMode < mm.defMLockMode {
   120  		opts.MLockMode = mm.defMLockMode
   121  	}
   122  	vseg, ar, droppedIDs, err := mm.createVMALocked(ctx, opts, droppedIDs)
   123  	if err != nil {
   124  		mm.mappingMu.Unlock()
   125  		return 0, err
   126  	}
   127  
   128  	// TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new
   129  	// vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
   130  	// to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
   131  	// mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
   132  	// populate_vma_page_range(). Confirm this behavior.
   133  	switch {
   134  	case opts.Precommit || opts.MLockMode == memmap.MLockEager:
   135  		// Get pmas and map with precommit as requested.
   136  		mm.populateVMAAndUnlock(ctx, vseg, ar, true)
   137  
   138  	case opts.Mappable == nil && length <= privateAllocUnit:
   139  		// NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope
   140  		// that doing so will save on future page faults. We only do this for
   141  		// anonymous mappings, since otherwise the cost of
   142  		// memmap.Mappable.Translate is unknown; and only for small mappings,
   143  		// to avoid needing to allocate large amounts of memory that we may
   144  		// subsequently need to checkpoint.
   145  		mm.populateVMAAndUnlock(ctx, vseg, ar, false)
   146  
   147  	default:
   148  		mm.mappingMu.Unlock()
   149  	}
   150  
   151  	for _, id := range droppedIDs {
   152  		id.DecRef(ctx)
   153  	}
   154  
   155  	return ar.Start, nil
   156  }
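
// Illustrative usage (a sketch; ctx and mm are assumed to come from the
// calling task): an anonymous MAP_PRIVATE-style mapping of three pages.
// Length is rounded up to a page multiple and an address is chosen
// automatically since Fixed is unset:
//
//	addr, err := mm.MMap(ctx, memmap.MMapOpts{
//		Length:   3 * hostarch.PageSize,
//		Perms:    hostarch.ReadWrite,
//		MaxPerms: hostarch.AnyAccess,
//		Private:  true,
//	})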
   157  
   158  // populateVMA obtains pmas for addresses in ar in the given vma, and maps them
   159  // into mm.as if it is active.
   160  //
   161  // Preconditions:
   162  //   - mm.mappingMu must be locked.
   163  //   - vseg.Range().IsSupersetOf(ar).
   164  func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) {
   165  	if !vseg.ValuePtr().effectivePerms.Any() {
   166  		// Linux doesn't populate inaccessible pages. See
   167  		// mm/gup.c:populate_vma_page_range.
   168  		return
   169  	}
   170  
   171  	mm.activeMu.Lock()
   172  	// Can't defer mm.activeMu.Unlock(); see below.
   173  
   174  	// Even if we get new pmas, we can't actually map them if we don't have an
   175  	// AddressSpace.
   176  	if mm.as == nil {
   177  		mm.activeMu.Unlock()
   178  		return
   179  	}
   180  
   181  	// Ensure that we have usable pmas.
   182  	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess)
   183  	if err != nil {
   184  		// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
   185  		// mm/gup.c:mm_populate(). If it matters, we'll get it again when
   186  		// userspace actually tries to use the failing page.
   187  		mm.activeMu.Unlock()
   188  		return
   189  	}
   190  
   191  	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
   192  	// anymore.
   193  	mm.activeMu.DowngradeLock()
   194  
   195  	// As above, errors are silently ignored.
   196  	mm.mapASLocked(pseg, ar, precommit)
   197  	mm.activeMu.RUnlock()
   198  }
   199  
   200  // populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
   201  // unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
   202  // preferable to populateVMA since it unlocks mm.mappingMu before performing
   203  // expensive operations that don't require it to be locked.
   204  //
   205  // Preconditions:
   206  //   - mm.mappingMu must be locked for writing.
   207  //   - vseg.Range().IsSupersetOf(ar).
   208  //
   209  // Postconditions: mm.mappingMu will be unlocked.
   210  // +checklocksrelease:mm.mappingMu
   211  func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) {
   212  	// See populateVMA above for commentary.
   213  	if !vseg.ValuePtr().effectivePerms.Any() {
   214  		mm.mappingMu.Unlock()
   215  		return
   216  	}
   217  
   218  	mm.activeMu.Lock()
   219  
   220  	if mm.as == nil {
   221  		mm.activeMu.Unlock()
   222  		mm.mappingMu.Unlock()
   223  		return
   224  	}
   225  
   226  	// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
   227  	// isn't needed at all for mapASLocked.
   228  	mm.mappingMu.DowngradeLock()
   229  	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess)
   230  	mm.mappingMu.RUnlock()
   231  	if err != nil {
   232  		mm.activeMu.Unlock()
   233  		return
   234  	}
   235  
   236  	mm.activeMu.DowngradeLock()
   237  	mm.mapASLocked(pseg, ar, precommit)
   238  	mm.activeMu.RUnlock()
   239  }
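
// The calling pattern expected by populateVMAAndUnlock (and used by MMap
// above and Brk below) is, roughly:
//
//	mm.mappingMu.Lock()
//	vseg, ar, droppedIDs, err := mm.createVMALocked(ctx, opts, droppedIDs)
//	...
//	mm.populateVMAAndUnlock(ctx, vseg, ar, precommit) // releases mm.mappingMu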
   240  
   241  // MapStack allocates the initial process stack.
   242  func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, error) {
   243  	// maxStackSize is the maximum supported process stack size in bytes.
   244  	//
   245  	// This limit exists because stack growing isn't implemented, so the entire
   246  	// process stack must be mapped up-front.
   247  	const maxStackSize = 128 << 20
   248  
   249  	stackSize := limits.FromContext(ctx).Get(limits.Stack)
   250  	r, ok := hostarch.Addr(stackSize.Cur).RoundUp()
   251  	sz := uint64(r)
   252  	if !ok {
   253  		// RLIM_INFINITY rounds up to 0.
   254  		sz = linux.DefaultStackSoftLimit
   255  	} else if sz > maxStackSize {
   256  		ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
   257  		sz = maxStackSize
   258  	} else if sz == 0 {
   259  		return hostarch.AddrRange{}, linuxerr.ENOMEM
   260  	}
   261  	szaddr := hostarch.Addr(sz)
   262  	ctx.Debugf("Allocating stack with size of %v bytes", sz)
   263  
   264  	// Determine the stack's desired location. Unlike Linux, address
   265  	// randomization can't be disabled.
   266  	stackEnd := mm.layout.MaxAddr - hostarch.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
   267  	if stackEnd < szaddr {
   268  		return hostarch.AddrRange{}, linuxerr.ENOMEM
   269  	}
   270  	stackStart := stackEnd - szaddr
   271  	var droppedIDs []memmap.MappingIdentity
   272  	var ar hostarch.AddrRange
   273  	var err error
   274  	mm.mappingMu.Lock()
   275  	_, ar, droppedIDs, err = mm.createVMALocked(ctx, memmap.MMapOpts{
   276  		Length:    sz,
   277  		Addr:      stackStart,
   278  		Perms:     hostarch.ReadWrite,
   279  		MaxPerms:  hostarch.AnyAccess,
   280  		Private:   true,
   281  		GrowsDown: true,
   282  		MLockMode: mm.defMLockMode,
   283  		Hint:      "[stack]",
   284  	}, droppedIDs)
   285  	mm.mappingMu.Unlock()
   286  	for _, id := range droppedIDs {
   287  		id.DecRef(ctx)
   288  	}
   289  	return ar, err
   290  }
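
// Worked example of the sizing above: if RLIMIT_STACK is RLIM_INFINITY,
// rounding up overflows and the stack falls back to
// linux.DefaultStackSoftLimit bytes; if RLIMIT_STACK is 1 GiB, the stack is
// capped at maxStackSize = 128 << 20 bytes = 128 MiB.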
   291  
   292  // MUnmap implements the semantics of Linux's munmap(2).
   293  func (mm *MemoryManager) MUnmap(ctx context.Context, addr hostarch.Addr, length uint64) error {
   294  	if addr != addr.RoundDown() {
   295  		return linuxerr.EINVAL
   296  	}
   297  	if length == 0 {
   298  		return linuxerr.EINVAL
   299  	}
   300  	la, ok := hostarch.Addr(length).RoundUp()
   301  	if !ok {
   302  		return linuxerr.EINVAL
   303  	}
   304  	ar, ok := addr.ToRange(uint64(la))
   305  	if !ok {
   306  		return linuxerr.EINVAL
   307  	}
   308  
   309  	var droppedIDs []memmap.MappingIdentity
   310  	mm.mappingMu.Lock()
   311  	_, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs)
   312  	mm.mappingMu.Unlock()
   313  
   314  	for _, id := range droppedIDs {
   315  		id.DecRef(ctx)
   316  	}
   317  
   318  	return nil
   319  }
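
// Illustrative usage (a sketch): unmapping two pages at a caller-supplied,
// page-aligned addr, roughly munmap(addr, 2*PAGE_SIZE):
//
//	err := mm.MUnmap(ctx, addr, 2*hostarch.PageSize)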
   320  
   321  // MRemapOpts specifies options to MRemap.
   322  type MRemapOpts struct {
   323  	// Move controls whether MRemap moves the remapped mapping to a new address.
   324  	Move MRemapMoveMode
   325  
   326  	// NewAddr is the new address for the remapping. NewAddr is ignored unless
    327  	// Move is MRemapMustMove.
   328  	NewAddr hostarch.Addr
   329  }
   330  
   331  // MRemapMoveMode controls MRemap's moving behavior.
   332  type MRemapMoveMode int
   333  
   334  const (
   335  	// MRemapNoMove prevents MRemap from moving the remapped mapping.
   336  	MRemapNoMove MRemapMoveMode = iota
   337  
   338  	// MRemapMayMove allows MRemap to move the remapped mapping.
   339  	MRemapMayMove
   340  
   341  	// MRemapMustMove requires MRemap to move the remapped mapping to
   342  	// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
   343  	// range.
   344  	MRemapMustMove
   345  )
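
// For reference, the syscall layer translates mremap(2) flags into these
// modes roughly as follows (a sketch, assuming the MREMAP_* constants from
// the abi/linux package and caller-supplied flags and newAddr):
//
//	opts := MRemapOpts{Move: MRemapNoMove}
//	if flags&linux.MREMAP_MAYMOVE != 0 {
//		opts.Move = MRemapMayMove
//	}
//	if flags&linux.MREMAP_FIXED != 0 {
//		opts.Move = MRemapMustMove
//		opts.NewAddr = newAddr
//	}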
   346  
   347  // MRemap implements the semantics of Linux's mremap(2).
   348  func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (hostarch.Addr, error) {
   349  	// "Note that old_address has to be page aligned." - mremap(2)
   350  	if oldAddr.RoundDown() != oldAddr {
   351  		return 0, linuxerr.EINVAL
   352  	}
   353  
   354  	// Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
   355  	// valid size. However, new_size can't be 0 after rounding.
   356  	oldSizeAddr, _ := hostarch.Addr(oldSize).RoundUp()
   357  	oldSize = uint64(oldSizeAddr)
   358  	newSizeAddr, ok := hostarch.Addr(newSize).RoundUp()
   359  	if !ok || newSizeAddr == 0 {
   360  		return 0, linuxerr.EINVAL
   361  	}
   362  	newSize = uint64(newSizeAddr)
   363  
   364  	oldEnd, ok := oldAddr.AddLength(oldSize)
   365  	if !ok {
   366  		return 0, linuxerr.EINVAL
   367  	}
   368  
   369  	var droppedIDs []memmap.MappingIdentity
   370  	// This must run after mm.mappingMu.Unlock().
   371  	defer func() {
   372  		for _, id := range droppedIDs {
   373  			id.DecRef(ctx)
   374  		}
   375  	}()
   376  
   377  	mm.mappingMu.Lock()
   378  	defer mm.mappingMu.Unlock()
   379  
   380  	// All cases require that a vma exists at oldAddr.
   381  	vseg := mm.vmas.FindSegment(oldAddr)
   382  	if !vseg.Ok() {
   383  		return 0, linuxerr.EFAULT
   384  	}
   385  
   386  	// Behavior matrix:
   387  	//
   388  	// Move     | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize
   389  	// ---------+-------------+-------------------+-------------------+------------------
   390  	//   NoMove | ENOMEM [1]  | Grow in-place     | No-op             | Shrink in-place
   391  	//  MayMove | Copy [1]    | Grow in-place or  | No-op             | Shrink in-place
   392  	//          |             |   move            |                   |
   393  	// MustMove | Copy        | Move and grow     | Move              | Shrink and move
   394  	//
   395  	// [1] In-place growth is impossible because the vma at oldAddr already
   396  	// occupies at least part of the destination. Thus the NoMove case always
   397  	// fails and the MayMove case always falls back to copying.
   398  
   399  	if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
   400  		// Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
   401  		// mremap in Linux does not check mm/mlock.c:can_do_mlock() and
   402  		// therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
   403  		// !CAP_IPC_LOCK.
   404  		mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
   405  		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
   406  			if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
   407  				return 0, linuxerr.EAGAIN
   408  			}
   409  		}
   410  	}
   411  
   412  	if opts.Move != MRemapMustMove {
   413  		// Handle no-ops and in-place shrinking. These cases don't care if
   414  		// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
   415  		// (aside from oldAddr).
   416  		if newSize <= oldSize {
   417  			if newSize < oldSize {
   418  				// If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
   419  				// either.
   420  				newEnd := oldAddr + hostarch.Addr(newSize)
   421  				_, droppedIDs = mm.unmapLocked(ctx, hostarch.AddrRange{newEnd, oldEnd}, droppedIDs)
   422  			}
   423  			return oldAddr, nil
   424  		}
   425  
   426  		// Handle in-place growing.
   427  
   428  		// Check that oldEnd maps to the same vma as oldAddr.
   429  		if vseg.End() < oldEnd {
   430  			return 0, linuxerr.EFAULT
   431  		}
   432  		// "Grow" the existing vma by creating a new mergeable one.
   433  		vma := vseg.ValuePtr()
   434  		var newOffset uint64
   435  		if vma.mappable != nil {
   436  			newOffset = vseg.mappableRange().End
   437  		}
   438  		var vseg vmaIterator
   439  		var ar hostarch.AddrRange
   440  		var err error
   441  		vseg, ar, droppedIDs, err = mm.createVMALocked(ctx, memmap.MMapOpts{
   442  			Length:          newSize - oldSize,
   443  			MappingIdentity: vma.id,
   444  			Mappable:        vma.mappable,
   445  			Offset:          newOffset,
   446  			Addr:            oldEnd,
   447  			Fixed:           true,
   448  			Perms:           vma.realPerms,
   449  			MaxPerms:        vma.maxPerms,
   450  			Private:         vma.private,
   451  			GrowsDown:       vma.growsDown,
   452  			MLockMode:       vma.mlockMode,
   453  			Hint:            vma.hint,
   454  		}, droppedIDs)
   455  		if err == nil {
   456  			if vma.mlockMode == memmap.MLockEager {
   457  				mm.populateVMA(ctx, vseg, ar, true)
   458  			}
   459  			return oldAddr, nil
   460  		}
   461  		// In-place growth failed. In the MRemapMayMove case, fall through to
   462  		// copying/moving below.
   463  		if opts.Move == MRemapNoMove {
   464  			return 0, err
   465  		}
   466  	}
   467  
   468  	// Find a location for the new mapping.
   469  	var newAR hostarch.AddrRange
   470  	switch opts.Move {
   471  	case MRemapMayMove:
   472  		newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
   473  		if err != nil {
   474  			return 0, err
   475  		}
   476  		newAR, _ = newAddr.ToRange(newSize)
   477  
   478  	case MRemapMustMove:
   479  		newAddr := opts.NewAddr
   480  		if newAddr.RoundDown() != newAddr {
   481  			return 0, linuxerr.EINVAL
   482  		}
   483  		var ok bool
   484  		newAR, ok = newAddr.ToRange(newSize)
   485  		if !ok {
   486  			return 0, linuxerr.EINVAL
   487  		}
   488  		if (hostarch.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
   489  			return 0, linuxerr.EINVAL
   490  		}
   491  
   492  		// Check that the new region is valid.
   493  		_, err := mm.findAvailableLocked(newSize, findAvailableOpts{
   494  			Addr:  newAddr,
   495  			Fixed: true,
   496  			Unmap: true,
   497  		})
   498  		if err != nil {
   499  			return 0, err
   500  		}
   501  
   502  		// Unmap any mappings at the destination.
   503  		_, droppedIDs = mm.unmapLocked(ctx, newAR, droppedIDs)
   504  
   505  		// If the sizes specify shrinking, unmap everything between the new and
   506  		// old sizes at the source. Unmapping before the following checks is
   507  		// correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(),
   508  		// vma_to_resize().
   509  		if newSize < oldSize {
   510  			oldNewEnd := oldAddr + hostarch.Addr(newSize)
   511  			_, droppedIDs = mm.unmapLocked(ctx, hostarch.AddrRange{oldNewEnd, oldEnd}, droppedIDs)
   512  			oldEnd = oldNewEnd
   513  		}
   514  
   515  		// unmapLocked may have invalidated vseg; look it up again.
   516  		vseg = mm.vmas.FindSegment(oldAddr)
   517  	}
   518  
   519  	oldAR := hostarch.AddrRange{oldAddr, oldEnd}
   520  
   521  	// Check that oldEnd maps to the same vma as oldAddr.
   522  	if vseg.End() < oldEnd {
   523  		return 0, linuxerr.EFAULT
   524  	}
   525  
   526  	// Check against RLIMIT_AS.
   527  	newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   528  	if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
   529  		return 0, linuxerr.ENOMEM
   530  	}
   531  
   532  	if vma := vseg.ValuePtr(); vma.mappable != nil {
   533  		// Check that offset+length does not overflow.
   534  		if vma.off+uint64(newAR.Length()) < vma.off {
   535  			return 0, linuxerr.EINVAL
   536  		}
   537  		// Inform the Mappable, if any, of the new mapping.
   538  		if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil {
   539  			return 0, err
   540  		}
   541  	}
   542  
   543  	if oldSize == 0 {
   544  		// Handle copying.
   545  		//
   546  		// We can't use createVMALocked because it calls Mappable.AddMapping,
   547  		// whereas we've already called Mappable.CopyMapping (which is
   548  		// consistent with Linux).
   549  		vma := vseg.ValuePtr().copy()
   550  		if vma.mappable != nil {
   551  			vma.off = vseg.mappableOffsetAt(oldAR.Start)
   552  		}
   553  		if vma.id != nil {
   554  			vma.id.IncRef()
   555  		}
   556  		vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
   557  		mm.usageAS += uint64(newAR.Length())
   558  		if vma.isPrivateDataLocked() {
   559  			mm.dataAS += uint64(newAR.Length())
   560  		}
   561  		if vma.mlockMode != memmap.MLockNone {
   562  			mm.lockedAS += uint64(newAR.Length())
   563  			if vma.mlockMode == memmap.MLockEager {
   564  				mm.populateVMA(ctx, vseg, newAR, true)
   565  			}
   566  		}
   567  		return newAR.Start, nil
   568  	}
   569  
   570  	// Handle moving.
   571  	//
   572  	// Remove the existing vma before inserting the new one to minimize
   573  	// iterator invalidation. We do this directly (instead of calling
   574  	// removeVMAsLocked) because:
   575  	//
   576  	// 1. We can't drop the reference on vma.id, which will be transferred to
   577  	// the new vma.
   578  	//
   579  	// 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
   580  	// oldAR, so calling RemoveMapping could cause us to miss an invalidation
   581  	// overlapping oldAR.
   582  	vseg = mm.vmas.Isolate(vseg, oldAR)
   583  	vma := vseg.ValuePtr().copy()
   584  	mm.vmas.Remove(vseg)
   585  	vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
   586  	mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   587  	if vma.isPrivateDataLocked() {
   588  		mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   589  	}
   590  	if vma.mlockMode != memmap.MLockNone {
   591  		mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
   592  	}
   593  
   594  	// Move pmas. This is technically optional for non-private pmas, which
   595  	// could just go through memmap.Mappable.Translate again, but it's required
   596  	// for private pmas.
   597  	mm.activeMu.Lock()
   598  	mm.movePMAsLocked(oldAR, newAR)
   599  	mm.activeMu.Unlock()
   600  
   601  	// Now that pmas have been moved to newAR, we can notify vma.mappable that
   602  	// oldAR is no longer mapped.
   603  	if vma.mappable != nil {
   604  		vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked())
   605  	}
   606  
   607  	if vma.mlockMode == memmap.MLockEager {
   608  		mm.populateVMA(ctx, vseg, newAR, true)
   609  	}
   610  
   611  	return newAR.Start, nil
   612  }
   613  
   614  // MProtect implements the semantics of Linux's mprotect(2).
   615  func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms hostarch.AccessType, growsDown bool) error {
   616  	if addr.RoundDown() != addr {
   617  		return linuxerr.EINVAL
   618  	}
   619  	if length == 0 {
   620  		return nil
   621  	}
   622  	rlength, ok := hostarch.Addr(length).RoundUp()
   623  	if !ok {
   624  		return linuxerr.ENOMEM
   625  	}
   626  	ar, ok := addr.ToRange(uint64(rlength))
   627  	if !ok {
   628  		return linuxerr.ENOMEM
   629  	}
   630  	effectivePerms := realPerms.Effective()
   631  
   632  	mm.mappingMu.Lock()
   633  	defer mm.mappingMu.Unlock()
   634  	// Non-growsDown mprotect requires that all of ar is mapped, and stops at
   635  	// the first non-empty gap. growsDown mprotect requires that the first vma
   636  	// be growsDown, but does not require it to extend all the way to ar.Start;
   637  	// vmas after the first must be contiguous but need not be growsDown, like
   638  	// the non-growsDown case.
   639  	vseg := mm.vmas.LowerBoundSegment(ar.Start)
   640  	if !vseg.Ok() {
   641  		return linuxerr.ENOMEM
   642  	}
   643  	if growsDown {
   644  		if !vseg.ValuePtr().growsDown {
   645  			return linuxerr.EINVAL
   646  		}
   647  		if ar.End <= vseg.Start() {
   648  			return linuxerr.ENOMEM
   649  		}
   650  		ar.Start = vseg.Start()
   651  	} else {
   652  		if ar.Start < vseg.Start() {
   653  			return linuxerr.ENOMEM
   654  		}
   655  	}
   656  
   657  	mm.activeMu.Lock()
   658  	defer mm.activeMu.Unlock()
   659  	defer func() {
   660  		mm.vmas.MergeRange(ar)
   661  		mm.vmas.MergeAdjacent(ar)
   662  		mm.pmas.MergeRange(ar)
   663  		mm.pmas.MergeAdjacent(ar)
   664  	}()
   665  	pseg := mm.pmas.LowerBoundSegment(ar.Start)
   666  	var didUnmapAS bool
   667  	for {
   668  		// Check for permission validity before splitting vmas, for consistency
   669  		// with Linux.
   670  		if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
   671  			return linuxerr.EACCES
   672  		}
   673  		vseg = mm.vmas.Isolate(vseg, ar)
   674  
   675  		// Update vma permissions.
   676  		vma := vseg.ValuePtr()
   677  		vmaLength := vseg.Range().Length()
   678  		if vma.isPrivateDataLocked() {
   679  			mm.dataAS -= uint64(vmaLength)
   680  		}
   681  
   682  		vma.realPerms = realPerms
   683  		vma.effectivePerms = effectivePerms
   684  		if vma.isPrivateDataLocked() {
   685  			mm.dataAS += uint64(vmaLength)
   686  		}
   687  
   688  		// Propagate vma permission changes to pmas.
   689  		for pseg.Ok() && pseg.Start() < vseg.End() {
   690  			if pseg.Range().Overlaps(vseg.Range()) {
   691  				pseg = mm.pmas.Isolate(pseg, vseg.Range())
   692  				pma := pseg.ValuePtr()
   693  				if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS {
   694  					// Unmap all of ar, not just vseg.Range(), to minimize host
   695  					// syscalls.
   696  					mm.unmapASLocked(ar)
   697  					didUnmapAS = true
   698  				}
   699  				pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms)
   700  				if pma.needCOW {
   701  					pma.effectivePerms.Write = false
   702  				}
   703  			}
   704  			pseg = pseg.NextSegment()
   705  		}
   706  
   707  		// Continue to the next vma.
   708  		if ar.End <= vseg.End() {
   709  			return nil
   710  		}
   711  		vseg, _ = vseg.NextNonEmpty()
   712  		if !vseg.Ok() {
   713  			return linuxerr.ENOMEM
   714  		}
   715  	}
   716  }
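
// Illustrative usage (a sketch): making two pages read-only, roughly
// mprotect(addr, 2*PAGE_SIZE, PROT_READ):
//
//	err := mm.MProtect(addr, 2*hostarch.PageSize, hostarch.Read, false /* growsDown */)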
   717  
   718  // BrkSetup sets mm's brk address to addr and its brk size to 0.
   719  func (mm *MemoryManager) BrkSetup(ctx context.Context, addr hostarch.Addr) {
   720  	var droppedIDs []memmap.MappingIdentity
   721  	mm.mappingMu.Lock()
   722  	// Unmap the existing brk.
   723  	if mm.brk.Length() != 0 {
   724  		_, droppedIDs = mm.unmapLocked(ctx, mm.brk, droppedIDs)
   725  	}
   726  	mm.brk = hostarch.AddrRange{addr, addr}
   727  	mm.mappingMu.Unlock()
   728  	for _, id := range droppedIDs {
   729  		id.DecRef(ctx)
   730  	}
   731  }
   732  
   733  // Brk implements the semantics of Linux's brk(2), except that it returns an
   734  // error on failure.
   735  func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch.Addr, error) {
   736  	mm.mappingMu.Lock()
   737  	// Can't defer mm.mappingMu.Unlock(); see below.
   738  
   739  	if addr < mm.brk.Start {
   740  		addr = mm.brk.End
   741  		mm.mappingMu.Unlock()
   742  		return addr, linuxerr.EINVAL
   743  	}
   744  
   745  	// TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is
   746  	// slightly more permissive than the usual data limit. In particular,
   747  	// this only limits the size of the heap; a true RLIMIT_DATA limits the
   748  	// size of heap + data + bss. The segment sizes need to be plumbed from
   749  	// the loader package to fully enforce RLIMIT_DATA.
   750  	if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
   751  		addr = mm.brk.End
   752  		mm.mappingMu.Unlock()
   753  		return addr, linuxerr.ENOMEM
   754  	}
   755  
   756  	oldbrkpg, _ := mm.brk.End.RoundUp()
   757  	newbrkpg, ok := addr.RoundUp()
   758  	if !ok {
   759  		addr = mm.brk.End
   760  		mm.mappingMu.Unlock()
   761  		return addr, linuxerr.EFAULT
   762  	}
   763  
   764  	var vseg vmaIterator
   765  	var ar hostarch.AddrRange
   766  	var err error
   767  
   768  	var droppedIDs []memmap.MappingIdentity
   769  	// This must run after mm.mappingMu.Unlock().
   770  	defer func() {
   771  		for _, id := range droppedIDs {
   772  			id.DecRef(ctx)
   773  		}
   774  	}()
   775  
   776  	switch {
   777  	case oldbrkpg < newbrkpg:
   778  		vseg, ar, droppedIDs, err = mm.createVMALocked(ctx, memmap.MMapOpts{
   779  			Length: uint64(newbrkpg - oldbrkpg),
   780  			Addr:   oldbrkpg,
   781  			Fixed:  true,
   782  			// Compare Linux's
   783  			// arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
   784  			Perms:    hostarch.ReadWrite,
   785  			MaxPerms: hostarch.AnyAccess,
   786  			Private:  true,
   787  			// Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
   788  			// mm->def_flags.
   789  			MLockMode: mm.defMLockMode,
   790  			Hint:      "[heap]",
   791  		}, droppedIDs)
   792  		if err != nil {
   793  			addr = mm.brk.End
   794  			mm.mappingMu.Unlock()
   795  			return addr, err
   796  		}
   797  		mm.brk.End = addr
   798  		if mm.defMLockMode == memmap.MLockEager {
   799  			mm.populateVMAAndUnlock(ctx, vseg, ar, true)
   800  		} else {
   801  			mm.mappingMu.Unlock()
   802  		}
   803  
   804  	case newbrkpg < oldbrkpg:
   805  		_, droppedIDs = mm.unmapLocked(ctx, hostarch.AddrRange{newbrkpg, oldbrkpg}, droppedIDs)
   806  		fallthrough
   807  
   808  	default:
   809  		mm.brk.End = addr
   810  		mm.mappingMu.Unlock()
   811  	}
   812  
   813  	return addr, nil
   814  }
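
// Illustrative usage (a sketch): growing the heap by one page. Passing an
// address below mm.brk.Start returns the current break (along with EINVAL,
// which a brk(2) emulation would ignore), mirroring Linux's brk(0) idiom:
//
//	cur, _ := mm.Brk(ctx, 0)                       // query the current break
//	end, err := mm.Brk(ctx, cur+hostarch.PageSize) // extend it by one page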
   815  
   816  // MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
   817  // depending on mode.
   818  func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length uint64, mode memmap.MLockMode) error {
   819  	// Linux allows this to overflow.
   820  	la, _ := hostarch.Addr(length + addr.PageOffset()).RoundUp()
   821  	ar, ok := addr.RoundDown().ToRange(uint64(la))
   822  	if !ok {
   823  		return linuxerr.EINVAL
   824  	}
   825  
   826  	mm.mappingMu.Lock()
   827  	// Can't defer mm.mappingMu.Unlock(); see below.
   828  
   829  	if mode != memmap.MLockNone {
   830  		// Check against RLIMIT_MEMLOCK.
   831  		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
   832  			mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
   833  			if mlockLimit == 0 {
   834  				mm.mappingMu.Unlock()
   835  				return linuxerr.EPERM
   836  			}
   837  			if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
   838  				mm.mappingMu.Unlock()
   839  				return linuxerr.ENOMEM
   840  			}
   841  		}
   842  	}
   843  
   844  	// Check this after RLIMIT_MEMLOCK for consistency with Linux.
   845  	if ar.Length() == 0 {
   846  		mm.mappingMu.Unlock()
   847  		return nil
   848  	}
   849  
   850  	// Apply the new mlock mode to vmas.
   851  	var unmapped bool
   852  	vseg := mm.vmas.FindSegment(ar.Start)
   853  	for {
   854  		if !vseg.Ok() {
   855  			unmapped = true
   856  			break
   857  		}
   858  		vseg = mm.vmas.Isolate(vseg, ar)
   859  		vma := vseg.ValuePtr()
   860  		prevMode := vma.mlockMode
   861  		vma.mlockMode = mode
   862  		if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
   863  			mm.lockedAS += uint64(vseg.Range().Length())
   864  		} else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
   865  			mm.lockedAS -= uint64(vseg.Range().Length())
   866  		}
   867  		if ar.End <= vseg.End() {
   868  			break
   869  		}
   870  		vseg, _ = vseg.NextNonEmpty()
   871  	}
   872  	mm.vmas.MergeRange(ar)
   873  	mm.vmas.MergeAdjacent(ar)
   874  	if unmapped {
   875  		mm.mappingMu.Unlock()
   876  		return linuxerr.ENOMEM
   877  	}
   878  
   879  	if mode == memmap.MLockEager {
   880  		// Ensure that we have usable pmas. Since we didn't return ENOMEM
   881  		// above, ar must be fully covered by vmas, so we can just use
   882  		// NextSegment below.
   883  		mm.activeMu.Lock()
   884  		mm.mappingMu.DowngradeLock()
   885  		for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
   886  			if !vseg.ValuePtr().effectivePerms.Any() {
   887  				// Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
   888  				// case, which is converted to ENOMEM by mlock.
   889  				mm.activeMu.Unlock()
   890  				mm.mappingMu.RUnlock()
   891  				return linuxerr.ENOMEM
   892  			}
   893  			_, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), hostarch.NoAccess)
   894  			if err != nil {
   895  				mm.activeMu.Unlock()
   896  				mm.mappingMu.RUnlock()
   897  				// Linux: mm/mlock.c:__mlock_posix_error_return()
   898  				if linuxerr.Equals(linuxerr.EFAULT, err) {
   899  					return linuxerr.ENOMEM
   900  				}
   901  				if linuxerr.Equals(linuxerr.ENOMEM, err) {
   902  					return linuxerr.EAGAIN
   903  				}
   904  				return err
   905  			}
   906  		}
   907  
   908  		// Map pmas into the active AddressSpace, if we have one.
   909  		mm.mappingMu.RUnlock()
   910  		if mm.as != nil {
   911  			mm.activeMu.DowngradeLock()
   912  			err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
   913  			mm.activeMu.RUnlock()
   914  			if err != nil {
   915  				return err
   916  			}
   917  		} else {
   918  			mm.activeMu.Unlock()
   919  		}
   920  	} else {
   921  		mm.mappingMu.Unlock()
   922  	}
   923  
   924  	return nil
   925  }
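
// Illustrative usage (a sketch): the memmap.MLockMode argument selects among
// the mlock-family semantics, e.g.:
//
//	err := mm.MLock(ctx, addr, length, memmap.MLockEager) // roughly mlock(addr, length)
//	err = mm.MLock(ctx, addr, length, memmap.MLockNone)   // roughly munlock(addr, length)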
   926  
   927  // MLockAllOpts holds options to MLockAll.
   928  type MLockAllOpts struct {
   929  	// If Current is true, change the memory-locking behavior of all mappings
   930  	// to Mode. If Future is true, upgrade the memory-locking behavior of all
   931  	// future mappings to Mode. At least one of Current or Future must be true.
   932  	Current bool
   933  	Future  bool
   934  	Mode    memmap.MLockMode
   935  }
   936  
   937  // MLockAll implements the semantics of Linux's mlockall()/munlockall(),
   938  // depending on opts.
   939  func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
   940  	if !opts.Current && !opts.Future {
   941  		return linuxerr.EINVAL
   942  	}
   943  
   944  	mm.mappingMu.Lock()
   945  	// Can't defer mm.mappingMu.Unlock(); see below.
   946  
   947  	if opts.Current {
   948  		if opts.Mode != memmap.MLockNone {
   949  			// Check against RLIMIT_MEMLOCK.
   950  			if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
   951  				mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
   952  				if mlockLimit == 0 {
   953  					mm.mappingMu.Unlock()
   954  					return linuxerr.EPERM
   955  				}
   956  				if uint64(mm.vmas.Span()) > mlockLimit {
   957  					mm.mappingMu.Unlock()
   958  					return linuxerr.ENOMEM
   959  				}
   960  			}
   961  		}
   962  		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
   963  			vma := vseg.ValuePtr()
   964  			prevMode := vma.mlockMode
   965  			vma.mlockMode = opts.Mode
   966  			if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
   967  				mm.lockedAS += uint64(vseg.Range().Length())
   968  			} else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
   969  				mm.lockedAS -= uint64(vseg.Range().Length())
   970  			}
   971  		}
   972  	}
   973  
   974  	if opts.Future {
   975  		mm.defMLockMode = opts.Mode
   976  	}
   977  
   978  	if opts.Current && opts.Mode == memmap.MLockEager {
   979  		// Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
   980  		// ignores the return value of __mm_populate(), so all errors below are
   981  		// ignored.
   982  		//
   983  		// Try to get usable pmas.
   984  		mm.activeMu.Lock()
   985  		mm.mappingMu.DowngradeLock()
   986  		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
   987  			if vseg.ValuePtr().effectivePerms.Any() {
   988  				mm.getPMAsLocked(ctx, vseg, vseg.Range(), hostarch.NoAccess)
   989  			}
   990  		}
   991  
   992  		// Map all pmas into the active AddressSpace, if we have one.
   993  		mm.mappingMu.RUnlock()
   994  		if mm.as != nil {
   995  			mm.activeMu.DowngradeLock()
   996  			mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
   997  			mm.activeMu.RUnlock()
   998  		} else {
   999  			mm.activeMu.Unlock()
  1000  		}
  1001  	} else {
  1002  		mm.mappingMu.Unlock()
  1003  	}
  1004  	return nil
  1005  }
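
// Illustrative usage (a sketch): munlockall() corresponds to clearing the
// mlock mode for both current and future mappings:
//
//	err := mm.MLockAll(ctx, MLockAllOpts{
//		Current: true,
//		Future:  true,
//		Mode:    memmap.MLockNone,
//	})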
  1006  
  1007  // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
  1008  func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint64, error) {
  1009  	mm.mappingMu.RLock()
  1010  	defer mm.mappingMu.RUnlock()
  1011  	vseg := mm.vmas.FindSegment(addr)
  1012  	if !vseg.Ok() {
  1013  		return 0, 0, linuxerr.EFAULT
  1014  	}
  1015  	vma := vseg.ValuePtr()
  1016  	return vma.numaPolicy, vma.numaNodemask, nil
  1017  }
  1018  
  1019  // SetNumaPolicy implements the semantics of Linux's mbind().
  1020  func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
  1021  	if !addr.IsPageAligned() {
  1022  		return linuxerr.EINVAL
  1023  	}
  1024  	// Linux allows this to overflow.
  1025  	la, _ := hostarch.Addr(length).RoundUp()
  1026  	ar, ok := addr.ToRange(uint64(la))
  1027  	if !ok {
  1028  		return linuxerr.EINVAL
  1029  	}
  1030  	if ar.Length() == 0 {
  1031  		return nil
  1032  	}
  1033  
  1034  	mm.mappingMu.Lock()
  1035  	defer mm.mappingMu.Unlock()
  1036  	defer func() {
  1037  		mm.vmas.MergeRange(ar)
  1038  		mm.vmas.MergeAdjacent(ar)
  1039  	}()
  1040  	vseg := mm.vmas.LowerBoundSegment(ar.Start)
  1041  	lastEnd := ar.Start
  1042  	for {
  1043  		if !vseg.Ok() || lastEnd < vseg.Start() {
  1044  			// "EFAULT: ... there was an unmapped hole in the specified memory
  1045  			// range specified [sic] by addr and len." - mbind(2)
  1046  			return linuxerr.EFAULT
  1047  		}
  1048  		vseg = mm.vmas.Isolate(vseg, ar)
  1049  		vma := vseg.ValuePtr()
  1050  		vma.numaPolicy = policy
  1051  		vma.numaNodemask = nodemask
  1052  		lastEnd = vseg.End()
  1053  		if ar.End <= lastEnd {
  1054  			return nil
  1055  		}
  1056  		vseg, _ = vseg.NextNonEmpty()
  1057  	}
  1058  }
  1059  
  1060  // SetDontFork implements the semantics of madvise MADV_DONTFORK.
  1061  func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork bool) error {
  1062  	ar, ok := addr.ToRange(length)
  1063  	if !ok {
  1064  		return linuxerr.EINVAL
  1065  	}
  1066  
  1067  	mm.mappingMu.Lock()
  1068  	defer mm.mappingMu.Unlock()
  1069  	defer func() {
  1070  		mm.vmas.MergeRange(ar)
  1071  		mm.vmas.MergeAdjacent(ar)
  1072  	}()
  1073  
  1074  	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
  1075  		vseg = mm.vmas.Isolate(vseg, ar)
  1076  		vma := vseg.ValuePtr()
  1077  		vma.dontfork = dontfork
  1078  	}
  1079  
  1080  	if mm.vmas.SpanRange(ar) != ar.Length() {
  1081  		return linuxerr.ENOMEM
  1082  	}
  1083  	return nil
  1084  }
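
// Illustrative usage (a sketch): madvise(MADV_DONTFORK) and madvise(MADV_DOFORK)
// reduce to setting or clearing the flag on the range:
//
//	err := mm.SetDontFork(addr, length, true)  // MADV_DONTFORK
//	err = mm.SetDontFork(addr, length, false)  // MADV_DOFORK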
  1085  
  1086  // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
  1087  func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error {
  1088  	ar, ok := addr.ToRange(length)
  1089  	if !ok {
  1090  		return linuxerr.EINVAL
  1091  	}
  1092  
  1093  	mm.mappingMu.RLock()
  1094  	defer mm.mappingMu.RUnlock()
  1095  	mm.activeMu.Lock()
  1096  	defer mm.activeMu.Unlock()
  1097  
  1098  	// This is invalidateLocked(invalidatePrivate=true, invalidateShared=true),
  1099  	// with the additional wrinkle that we must refuse to invalidate pmas under
  1100  	// mlocked vmas.
  1101  	var didUnmapAS bool
  1102  	pseg := mm.pmas.LowerBoundSegment(ar.Start)
  1103  	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
  1104  		vma := vseg.ValuePtr()
  1105  		if vma.mlockMode != memmap.MLockNone {
  1106  			return linuxerr.EINVAL
  1107  		}
  1108  		vsegAR := vseg.Range().Intersect(ar)
  1109  		// pseg should already correspond to either this vma or a later one,
  1110  		// since there can't be a pma without a corresponding vma.
  1111  		if checkInvariants {
  1112  			if pseg.Ok() && pseg.End() <= vsegAR.Start {
  1113  				panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
  1114  			}
  1115  		}
  1116  		for pseg.Ok() && pseg.Start() < vsegAR.End {
  1117  			pseg = mm.pmas.Isolate(pseg, vsegAR)
  1118  			pma := pseg.ValuePtr()
  1119  			if !didUnmapAS {
  1120  				// Unmap all of ar, not just pseg.Range(), to minimize host
  1121  				// syscalls. AddressSpace mappings must be removed before
  1122  				// mm.decPrivateRef().
  1123  				mm.unmapASLocked(ar)
  1124  				didUnmapAS = true
  1125  			}
  1126  			if pma.private {
  1127  				mm.decPrivateRef(pseg.fileRange())
  1128  			}
  1129  			pma.file.DecRef(pseg.fileRange())
  1130  			mm.removeRSSLocked(pseg.Range())
  1131  			pseg = mm.pmas.Remove(pseg).NextSegment()
  1132  		}
  1133  	}
  1134  
  1135  	// "If there are some parts of the specified address space that are not
  1136  	// mapped, the Linux version of madvise() ignores them and applies the call
  1137  	// to the rest (but returns ENOMEM from the system call, as it should)." -
  1138  	// madvise(2)
  1139  	if mm.vmas.SpanRange(ar) != ar.Length() {
  1140  		return linuxerr.ENOMEM
  1141  	}
  1142  	return nil
  1143  }
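
// Illustrative usage (a sketch): madvise(MADV_DONTNEED) on a page-aligned
// range reduces to:
//
//	err := mm.Decommit(addr, length)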
  1144  
  1145  // MSyncOpts holds options to MSync.
  1146  type MSyncOpts struct {
  1147  	// Sync has the semantics of MS_SYNC.
  1148  	Sync bool
  1149  
  1150  	// Invalidate has the semantics of MS_INVALIDATE.
  1151  	Invalidate bool
  1152  }
  1153  
  1154  // MSync implements the semantics of Linux's msync().
  1155  func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length uint64, opts MSyncOpts) error {
  1156  	if addr != addr.RoundDown() {
  1157  		return linuxerr.EINVAL
  1158  	}
  1159  	if length == 0 {
  1160  		return nil
  1161  	}
  1162  	la, ok := hostarch.Addr(length).RoundUp()
  1163  	if !ok {
  1164  		return linuxerr.ENOMEM
  1165  	}
  1166  	ar, ok := addr.ToRange(uint64(la))
  1167  	if !ok {
  1168  		return linuxerr.ENOMEM
  1169  	}
  1170  
  1171  	mm.mappingMu.RLock()
  1172  	// Can't defer mm.mappingMu.RUnlock(); see below.
  1173  	vseg := mm.vmas.LowerBoundSegment(ar.Start)
  1174  	if !vseg.Ok() {
  1175  		mm.mappingMu.RUnlock()
  1176  		return linuxerr.ENOMEM
  1177  	}
  1178  	var unmapped bool
  1179  	lastEnd := ar.Start
  1180  	for {
  1181  		if !vseg.Ok() {
  1182  			mm.mappingMu.RUnlock()
  1183  			unmapped = true
  1184  			break
  1185  		}
  1186  		if lastEnd < vseg.Start() {
  1187  			unmapped = true
  1188  		}
  1189  		lastEnd = vseg.End()
  1190  		vma := vseg.ValuePtr()
  1191  		if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
  1192  			mm.mappingMu.RUnlock()
  1193  			return linuxerr.EBUSY
  1194  		}
  1195  		// It's only possible to have dirtied the Mappable through a shared
  1196  		// mapping. Don't check if the mapping is writable, because mprotect
  1197  		// may have changed this, and also because Linux doesn't.
  1198  		if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
  1199  			// We can't call memmap.MappingIdentity.Msync while holding
  1200  			// mm.mappingMu since it may take fs locks that precede it in the
  1201  			// lock order.
  1202  			id.IncRef()
  1203  			mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
  1204  			mm.mappingMu.RUnlock()
  1205  			err := id.Msync(ctx, mr)
  1206  			id.DecRef(ctx)
  1207  			if err != nil {
  1208  				return err
  1209  			}
  1210  			if lastEnd >= ar.End {
  1211  				break
  1212  			}
  1213  			mm.mappingMu.RLock()
  1214  			vseg = mm.vmas.LowerBoundSegment(lastEnd)
  1215  		} else {
  1216  			if lastEnd >= ar.End {
  1217  				mm.mappingMu.RUnlock()
  1218  				break
  1219  			}
  1220  			vseg = vseg.NextSegment()
  1221  		}
  1222  	}
  1223  
  1224  	if unmapped {
  1225  		return linuxerr.ENOMEM
  1226  	}
  1227  	return nil
  1228  }
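
// Illustrative usage (a sketch): the MS_* flags of msync(2) map onto
// MSyncOpts, e.g. msync(addr, length, MS_SYNC|MS_INVALIDATE) becomes:
//
//	err := mm.MSync(ctx, addr, length, MSyncOpts{Sync: true, Invalidate: true})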
  1229  
  1230  // GetSharedFutexKey is used by kernel.Task.GetSharedKey.
  1231  func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr hostarch.Addr) (futex.Key, error) {
  1232  	ar, ok := addr.ToRange(4) // sizeof(int32).
  1233  	if !ok {
  1234  		return futex.Key{}, linuxerr.EFAULT
  1235  	}
  1236  
  1237  	mm.mappingMu.RLock()
  1238  	defer mm.mappingMu.RUnlock()
  1239  	vseg, _, err := mm.getVMAsLocked(ctx, ar, hostarch.Read, false)
  1240  	if err != nil {
  1241  		return futex.Key{}, err
  1242  	}
  1243  	vma := vseg.ValuePtr()
  1244  
  1245  	if vma.private {
  1246  		return futex.Key{
  1247  			Kind:   futex.KindSharedPrivate,
  1248  			Offset: uint64(addr),
  1249  		}, nil
  1250  	}
  1251  
  1252  	if vma.id != nil {
  1253  		vma.id.IncRef()
  1254  	}
  1255  	return futex.Key{
  1256  		Kind:            futex.KindSharedMappable,
  1257  		Mappable:        vma.mappable,
  1258  		MappingIdentity: vma.id,
  1259  		Offset:          vseg.mappableOffsetAt(addr),
  1260  	}, nil
  1261  }
  1262  
  1263  // VirtualMemorySize returns the combined length in bytes of all mappings in
  1264  // mm.
  1265  func (mm *MemoryManager) VirtualMemorySize() uint64 {
  1266  	mm.mappingMu.RLock()
  1267  	defer mm.mappingMu.RUnlock()
  1268  	return mm.usageAS
  1269  }
  1270  
  1271  // VirtualMemorySizeRange returns the combined length in bytes of all mappings
  1272  // in ar in mm.
  1273  func (mm *MemoryManager) VirtualMemorySizeRange(ar hostarch.AddrRange) uint64 {
  1274  	mm.mappingMu.RLock()
  1275  	defer mm.mappingMu.RUnlock()
  1276  	return uint64(mm.vmas.SpanRange(ar))
  1277  }
  1278  
  1279  // ResidentSetSize returns the value advertised as mm's RSS in bytes.
  1280  func (mm *MemoryManager) ResidentSetSize() uint64 {
  1281  	mm.activeMu.RLock()
  1282  	defer mm.activeMu.RUnlock()
  1283  	return mm.curRSS
  1284  }
  1285  
  1286  // MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
  1287  func (mm *MemoryManager) MaxResidentSetSize() uint64 {
  1288  	mm.activeMu.RLock()
  1289  	defer mm.activeMu.RUnlock()
  1290  	return mm.maxRSS
  1291  }
  1292  
  1293  // VirtualDataSize returns the size of private data segments in mm.
  1294  func (mm *MemoryManager) VirtualDataSize() uint64 {
  1295  	mm.mappingMu.RLock()
  1296  	defer mm.mappingMu.RUnlock()
  1297  	return mm.dataAS
  1298  }
  1299  
  1300  // EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to
  1301  // return true.
  1302  func (mm *MemoryManager) EnableMembarrierPrivate() {
  1303  	mm.membarrierPrivateEnabled.Store(1)
  1304  }
  1305  
  1306  // IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has
  1307  // previously been called.
  1308  func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool {
  1309  	return mm.membarrierPrivateEnabled.Load() != 0
  1310  }
  1311  
  1312  // EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to
  1313  // return true.
  1314  func (mm *MemoryManager) EnableMembarrierRSeq() {
  1315  	mm.membarrierRSeqEnabled.Store(1)
  1316  }
  1317  
  1318  // IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has
  1319  // previously been called.
  1320  func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool {
  1321  	return mm.membarrierRSeqEnabled.Load() != 0
  1322  }
  1323  
   1324  // FindVMAByName finds the first vma within ar whose hint matches the given hint and returns its start address and file offset.
  1325  func (mm *MemoryManager) FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error) {
  1326  	mm.mappingMu.RLock()
  1327  	defer mm.mappingMu.RUnlock()
  1328  
  1329  	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok(); vseg = vseg.NextSegment() {
  1330  		start := vseg.Start()
  1331  		if !ar.Contains(start) {
  1332  			break
  1333  		}
  1334  		vma := vseg.ValuePtr()
  1335  
  1336  		if vma.hint == hint {
  1337  			return start, vma.off, nil
  1338  		}
  1339  	}
   1340  	return 0, 0, fmt.Errorf("could not find %q in %v", hint, ar)
  1341  }
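
// Illustrative usage (a sketch; ar is a caller-supplied address range to
// search): locating the heap created by Brk via its "[heap]" hint:
//
//	start, off, err := mm.FindVMAByName(ar, "[heap]")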