github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/pma.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/context"
    21  	"github.com/SagerNet/gvisor/pkg/hostarch"
    22  	"github.com/SagerNet/gvisor/pkg/safecopy"
    23  	"github.com/SagerNet/gvisor/pkg/safemem"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    26  	"github.com/SagerNet/gvisor/pkg/syserror"
    27  )
    28  
    29  // existingPMAsLocked checks that pmas exist for all addresses in ar, and
    30  // support access of type (at, ignorePermissions). If so, it returns an
    31  // iterator to the pma containing ar.Start. Otherwise it returns a terminal
    32  // iterator.
    33  //
    34  // Preconditions:
    35  // * mm.activeMu must be locked.
    36  // * ar.Length() != 0.
    37  func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
    38  	if checkInvariants {
    39  		if !ar.WellFormed() || ar.Length() == 0 {
    40  			panic(fmt.Sprintf("invalid ar: %v", ar))
    41  		}
    42  	}
    43  
    44  	first := mm.pmas.FindSegment(ar.Start)
    45  	pseg := first
    46  	for pseg.Ok() {
    47  		pma := pseg.ValuePtr()
    48  		perms := pma.effectivePerms
    49  		if ignorePermissions {
    50  			perms = pma.maxPerms
    51  		}
    52  		if !perms.SupersetOf(at) {
    53  			return pmaIterator{}
    54  		}
    55  		if needInternalMappings && pma.internalMappings.IsEmpty() {
    56  			return pmaIterator{}
    57  		}
    58  
    59  		if ar.End <= pseg.End() {
    60  			return first
    61  		}
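         		// If the gap after pseg is non-empty, NextNonEmpty returns a
         		// terminal segment iterator, ending the loop at the first hole
         		// in the pmas before ar.End.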
    62  		pseg, _ = pseg.NextNonEmpty()
    63  	}
    64  
    65  	// Ran out of pmas before reaching ar.End.
    66  	return pmaIterator{}
    67  }
    68  
    69  // existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
    70  // and support access of type (at, ignorePermissions).
    71  //
    72  // Preconditions: mm.activeMu must be locked.
    73  func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
    74  	for ; !ars.IsEmpty(); ars = ars.Tail() {
    75  		if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
    76  			return false
    77  		}
    78  	}
    79  	return true
    80  }
    81  
    82  // getPMAsLocked ensures that pmas exist for all addresses in ar, and support
    83  // access of type at. It returns:
    84  //
    85  // - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
    86  // the iterator is unspecified.
    87  //
    88  // - An iterator to the gap after the last pma containing an address in ar. If
    89  // pmas exist for no addresses in ar, the iterator is to a gap that begins
    90  // before ar.Start.
    91  //
    92  // - An error that is non-nil if pmas exist for only a subset of ar.
    93  //
    94  // Preconditions:
    95  // * mm.mappingMu must be locked.
    96  // * mm.activeMu must be locked for writing.
    97  // * ar.Length() != 0.
    98  // * vseg.Range().Contains(ar.Start).
    99  // * vmas must exist for all addresses in ar, and support accesses of type at
   100  //   (i.e. permission checks must have been performed against vmas).
   101  func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
   102  	if checkInvariants {
   103  		if !ar.WellFormed() || ar.Length() == 0 {
   104  			panic(fmt.Sprintf("invalid ar: %v", ar))
   105  		}
   106  		if !vseg.Ok() {
   107  			panic("terminal vma iterator")
   108  		}
   109  		if !vseg.Range().Contains(ar.Start) {
   110  			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
   111  		}
   112  	}
   113  
   114  	// Page-align ar so that all AddrRanges are aligned.
   115  	end, ok := ar.End.RoundUp()
   116  	var alignerr error
   117  	if !ok {
   118  		end = ar.End.RoundDown()
   119  		alignerr = syserror.EFAULT
   120  	}
   121  	ar = hostarch.AddrRange{ar.Start.RoundDown(), end}
   122  
   123  	pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
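         	// If the gap returned by getPMAsInternalLocked begins at or
         	// before ar.Start, no pma covers any address in ar, so there is
         	// no pma containing ar.Start to return.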
   124  	if pend.Start() <= ar.Start {
   125  		return pmaIterator{}, pend, perr
   126  	}
   127  	// getPMAsInternalLocked may not have returned pstart due to iterator
   128  	// invalidation.
   129  	if !pstart.Ok() {
   130  		pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
   131  	}
   132  	if perr != nil {
   133  		return pstart, pend, perr
   134  	}
   135  	return pstart, pend, alignerr
   136  }
   137  
   138  // getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
   139  // support access of type at. It returns the subset of ars for which pmas
   140  // exist. If this is not equal to ars, it returns a non-nil error explaining
   141  // why.
   142  //
   143  // Preconditions:
   144  // * mm.mappingMu must be locked.
   145  // * mm.activeMu must be locked for writing.
   146  // * vmas must exist for all addresses in ars, and support accesses of type at
   147  //   (i.e. permission checks must have been performed against vmas).
   148  func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) {
   149  	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
   150  		ar := arsit.Head()
   151  		if ar.Length() == 0 {
   152  			continue
   153  		}
   154  		if checkInvariants {
   155  			if !ar.WellFormed() {
   156  				panic(fmt.Sprintf("invalid ar: %v", ar))
   157  			}
   158  		}
   159  
   160  		// Page-align ar so that all AddrRanges are aligned.
   161  		end, ok := ar.End.RoundUp()
   162  		var alignerr error
   163  		if !ok {
   164  			end = ar.End.RoundDown()
   165  			alignerr = syserror.EFAULT
   166  		}
   167  		ar = hostarch.AddrRange{ar.Start.RoundDown(), end}
   168  
   169  		_, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
   170  		if perr != nil {
   171  			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
   172  		}
   173  		if alignerr != nil {
   174  			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
   175  		}
   176  	}
   177  
   178  	return ars, nil
   179  }
   180  
   181  // getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
   182  // exceptions:
   183  //
   184  // - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
   185  // is, the returned iterator may be terminal, even if a pma that contains
   186  // ar.Start exists). Returning this iterator on a best-effort basis allows
   187  // callers that require it to use it when it's cheaply available, while also
   188  // avoiding the overhead of retrieving it when it's not.
   189  //
   190  // - getPMAsInternalLocked additionally requires that ar is page-aligned.
   191  //
   192  // getPMAsInternalLocked is an implementation helper for getPMAsLocked and
   193  // getVecPMAsLocked; other clients should call one of those instead.
   194  func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
   195  	if checkInvariants {
   196  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   197  			panic(fmt.Sprintf("invalid ar: %v", ar))
   198  		}
   199  		if !vseg.Ok() {
   200  			panic("terminal vma iterator")
   201  		}
   202  		if !vseg.Range().Contains(ar.Start) {
   203  			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
   204  		}
   205  	}
   206  
   207  	mf := mm.mfp.MemoryFile()
   208  	// Limit the range we allocate to ar, aligned to privateAllocUnit.
   209  	maskAR := privateAligned(ar)
   210  	didUnmapAS := false
   211  	// The range in which we iterate vmas and pmas is still limited to ar, to
   212  	// ensure that we don't allocate or COW-break a pma we don't need.
   213  	pseg, pgap := mm.pmas.Find(ar.Start)
   214  	pstart := pseg
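         	// In the loop below, at most one of pseg and pgap is non-terminal
         	// at a time: pgap marks a hole that still needs a pma, while pseg
         	// marks an existing pma that may need to be copied or
         	// retranslated.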
   215  	for {
   216  		// Get pmas for this vma.
   217  		vsegAR := vseg.Range().Intersect(ar)
   218  		vma := vseg.ValuePtr()
   219  	pmaLoop:
   220  		for {
   221  			switch {
   222  			case pgap.Ok() && pgap.Start() < vsegAR.End:
   223  				// Need a pma here.
   224  				optAR := vseg.Range().Intersect(pgap.Range())
   225  				if checkInvariants {
   226  					if optAR.Length() == 0 {
   227  						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
   228  					}
   229  				}
   230  				if vma.mappable == nil {
   231  					// Private anonymous mappings get pmas by allocating.
   232  					allocAR := optAR.Intersect(maskAR)
   233  					fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous)
   234  					if err != nil {
   235  						return pstart, pgap, err
   236  					}
   237  					if checkInvariants {
   238  						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
   239  							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
   240  						}
   241  					}
   242  					mm.addRSSLocked(allocAR)
   243  					mm.incPrivateRef(fr)
   244  					mf.IncRef(fr)
   245  					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
   246  						file:           mf,
   247  						off:            fr.Start,
   248  						translatePerms: hostarch.AnyAccess,
   249  						effectivePerms: vma.effectivePerms,
   250  						maxPerms:       vma.maxPerms,
   251  						// Since we just allocated this memory and have the
   252  						// only reference, the new pma does not need
   253  						// copy-on-write.
   254  						private: true,
   255  					}).NextNonEmpty()
   256  					pstart = pmaIterator{} // iterators invalidated
   257  				} else {
   258  					// Other mappings get pmas by translating.
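         					// optMR is the largest mappable range Translate may
         					// cover (to amortize future faults); reqMR is the
         					// subrange it must cover for this access.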
   259  					optMR := vseg.mappableRangeOf(optAR)
   260  					reqAR := optAR.Intersect(ar)
   261  					reqMR := vseg.mappableRangeOf(reqAR)
   262  					perms := at
   263  					if vma.private {
   264  						// This pma will be copy-on-write; don't require write
   265  						// permission, but do require read permission to
   266  						// facilitate the copy.
   267  						//
   268  						// If at.Write is true, we will need to break
   269  						// copy-on-write immediately, which occurs after
   270  						// translation below.
   271  						perms.Read = true
   272  						perms.Write = false
   273  					}
   274  					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
   275  					if checkInvariants {
   276  						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
   277  							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
   278  						}
   279  					}
   280  					// Install a pma for each translation.
   281  					if len(ts) == 0 {
   282  						return pstart, pgap, err
   283  					}
   284  					pstart = pmaIterator{} // iterators invalidated
   285  					for _, t := range ts {
   286  						newpmaAR := vseg.addrRangeOf(t.Source)
   287  						newpma := pma{
   288  							file:           t.File,
   289  							off:            t.Offset,
   290  							translatePerms: t.Perms,
   291  							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
   292  							maxPerms:       vma.maxPerms.Intersect(t.Perms),
   293  						}
   294  						if vma.private {
   295  							newpma.effectivePerms.Write = false
   296  							newpma.maxPerms.Write = false
   297  							newpma.needCOW = true
   298  						}
   299  						mm.addRSSLocked(newpmaAR)
   300  						t.File.IncRef(t.FileRange())
   301  						// This is valid because memmap.Mappable.Translate is
   302  						// required to return Translations in increasing
   303  						// Translation.Source order.
   304  						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
   305  						pgap = pseg.NextGap()
   306  					}
   307  					// The error returned by Translate is only significant if
   308  					// it occurred before ar.End.
   309  					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
   310  						return pstart, pgap, err
   311  					}
   312  					// Rewind pseg to the first pma inserted and continue the
   313  					// loop to check if we need to break copy-on-write.
   314  					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
   315  					continue
   316  				}
   317  
   318  			case pseg.Ok() && pseg.Start() < vsegAR.End:
   319  				oldpma := pseg.ValuePtr()
   320  				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
   321  					// Break copy-on-write by copying.
   322  					if checkInvariants {
   323  						if !oldpma.maxPerms.Read {
   324  							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
   325  						}
   326  					}
   327  					// The majority of copy-on-write breaks on executable pages
   328  					// come from:
   329  					//
   330  					// - The ELF loader, which must zero out bytes on the last
   331  					// page of each segment after the end of the segment.
   332  					//
   333  					// - gdb's use of ptrace to insert breakpoints.
   334  					//
   335  					// Neither of these cases has enough spatial locality to
   336  					// benefit from copying nearby pages, so if the vma is
   337  					// executable, only copy the pages required.
   338  					var copyAR hostarch.AddrRange
   339  					if vseg.ValuePtr().effectivePerms.Execute {
   340  						copyAR = pseg.Range().Intersect(ar)
   341  					} else {
   342  						copyAR = pseg.Range().Intersect(maskAR)
   343  					}
   344  					// Get internal mappings from the pma to copy from.
   345  					if err := pseg.getInternalMappingsLocked(); err != nil {
   346  						return pstart, pseg.PrevGap(), err
   347  					}
   348  					// Copy contents.
   349  					fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)})
   350  					if _, ok := err.(safecopy.BusError); ok {
   351  						// If we got SIGBUS during the copy, deliver SIGBUS to
   352  						// userspace (instead of SIGSEGV) if we're breaking
   353  						// copy-on-write due to application page fault.
   354  						err = &memmap.BusError{err}
   355  					}
   356  					if fr.Length() == 0 {
   357  						return pstart, pseg.PrevGap(), err
   358  					}
   359  					// Unmap all of maskAR, not just copyAR, to minimize host
   360  					// syscalls. AddressSpace mappings must be removed before
   361  					// mm.decPrivateRef().
   362  					if !didUnmapAS {
   363  						mm.unmapASLocked(maskAR)
   364  						didUnmapAS = true
   365  					}
   366  					// Replace the pma with a copy in the part of the address
   367  					// range where copying was successful. This doesn't change
   368  					// RSS.
   369  					copyAR.End = copyAR.Start + hostarch.Addr(fr.Length())
   370  					if copyAR != pseg.Range() {
   371  						pseg = mm.pmas.Isolate(pseg, copyAR)
   372  						pstart = pmaIterator{} // iterators invalidated
   373  					}
   374  					oldpma = pseg.ValuePtr()
   375  					if oldpma.private {
   376  						mm.decPrivateRef(pseg.fileRange())
   377  					}
   378  					oldpma.file.DecRef(pseg.fileRange())
   379  					mm.incPrivateRef(fr)
   380  					mf.IncRef(fr)
   381  					oldpma.file = mf
   382  					oldpma.off = fr.Start
   383  					oldpma.translatePerms = hostarch.AnyAccess
   384  					oldpma.effectivePerms = vma.effectivePerms
   385  					oldpma.maxPerms = vma.maxPerms
   386  					oldpma.needCOW = false
   387  					oldpma.private = true
   388  					oldpma.internalMappings = safemem.BlockSeq{}
   389  					// Try to merge the pma with its neighbors.
   390  					if prev := pseg.PrevSegment(); prev.Ok() {
   391  						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
   392  							pseg = merged
   393  							pstart = pmaIterator{} // iterators invalidated
   394  						}
   395  					}
   396  					if next := pseg.NextSegment(); next.Ok() {
   397  						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
   398  							pseg = merged
   399  							pstart = pmaIterator{} // iterators invalidated
   400  						}
   401  					}
   402  					// The error returned by AllocateAndFill is only
   403  					// significant if it occurred before ar.End.
   404  					if err != nil && pseg.End() < ar.End {
   405  						return pstart, pseg.NextGap(), err
   406  					}
   407  					// Ensure pseg and pgap are correct for the next iteration
   408  					// of the loop.
   409  					pseg, pgap = pseg.NextNonEmpty()
   410  				} else if !oldpma.translatePerms.SupersetOf(at) {
   411  					// Get new pmas (with sufficient permissions) by calling
   412  					// memmap.Mappable.Translate again.
   413  					if checkInvariants {
   414  						if oldpma.private {
   415  							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
   416  						}
   417  					}
   418  					// Allow the entire pma to be replaced.
   419  					optAR := pseg.Range()
   420  					optMR := vseg.mappableRangeOf(optAR)
   421  					reqAR := optAR.Intersect(ar)
   422  					reqMR := vseg.mappableRangeOf(reqAR)
   423  					perms := oldpma.translatePerms.Union(at)
   424  					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
   425  					if checkInvariants {
   426  						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
   427  							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
   428  						}
   429  					}
   430  					// Remove the part of the existing pma covered by new
   431  					// Translations, then insert new pmas. This doesn't change
   432  					// RSS. Note that we don't need to call unmapASLocked: any
   433  					// existing AddressSpace mappings are still valid (though
   434  					// less permissive than the new pmas indicate) until
   435  					// Invalidate is called, and will be replaced by future
   436  					// calls to mapASLocked.
   437  					if len(ts) == 0 {
   438  						return pstart, pseg.PrevGap(), err
   439  					}
   440  					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
   441  					transAR := vseg.addrRangeOf(transMR)
   442  					pseg = mm.pmas.Isolate(pseg, transAR)
   443  					pseg.ValuePtr().file.DecRef(pseg.fileRange())
   444  					pgap = mm.pmas.Remove(pseg)
   445  					pstart = pmaIterator{} // iterators invalidated
   446  					for _, t := range ts {
   447  						newpmaAR := vseg.addrRangeOf(t.Source)
   448  						newpma := pma{
   449  							file:           t.File,
   450  							off:            t.Offset,
   451  							translatePerms: t.Perms,
   452  							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
   453  							maxPerms:       vma.maxPerms.Intersect(t.Perms),
   454  						}
   455  						if vma.private {
   456  							newpma.effectivePerms.Write = false
   457  							newpma.maxPerms.Write = false
   458  							newpma.needCOW = true
   459  						}
   460  						t.File.IncRef(t.FileRange())
   461  						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
   462  						pgap = pseg.NextGap()
   463  					}
   464  					// The error returned by Translate is only significant if
   465  					// it occurred before ar.End.
   466  					if err != nil && pseg.End() < ar.End {
   467  						return pstart, pgap, err
   468  					}
   469  					// Ensure pseg and pgap are correct for the next iteration
   470  					// of the loop.
   471  					if pgap.Range().Length() == 0 {
   472  						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
   473  					} else {
   474  						pseg = pmaIterator{}
   475  					}
   476  				} else {
   477  					// We have a usable pma; continue.
   478  					pseg, pgap = pseg.NextNonEmpty()
   479  				}
   480  
   481  			default:
   482  				break pmaLoop
   483  			}
   484  		}
   485  		// Go to the next vma.
   486  		if ar.End <= vseg.End() {
   487  			if pgap.Ok() {
   488  				return pstart, pgap, nil
   489  			}
   490  			return pstart, pseg.PrevGap(), nil
   491  		}
   492  		vseg = vseg.NextSegment()
   493  	}
   494  }
   495  
   496  const (
   497  	// When memory is allocated for a private pma, align the allocated address
   498  	// range to a privateAllocUnit boundary when possible. Larger values of
   499  	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
   500  	// to be mapped, but may result in larger amounts of wasted memory in the
   501  	// presence of fragmentation. privateAllocUnit must be a power-of-2
   502  	// multiple of hostarch.PageSize.
   503  	privateAllocUnit = hostarch.HugePageSize
   504  
   505  	privateAllocMask = privateAllocUnit - 1
   506  )
   507  
   508  func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange {
   509  	aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End}
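         	// Rounding ar.End up to a privateAllocUnit boundary can overflow
         	// and wrap past the top of the address space; only extend the
         	// range if the rounded end did not wrap below ar.End.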
   510  	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
   511  		aligned.End = end
   512  	}
   513  	if checkInvariants {
   514  		if !aligned.IsSupersetOf(ar) {
   515  			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
   516  		}
   517  	}
   518  	return aligned
   519  }
   520  
   521  // isPMACopyOnWriteLocked returns true if the contents of the pma represented
   522  // by pseg must be copied to a new private pma to be written to.
   523  //
   524  // If the pma is a copy-on-write private pma, and holds the only reference on
   525  // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
   526  // and update the pma to indicate that it does not require copy-on-write.
   527  //
   528  // Preconditions:
   529  // * vseg.Range().IsSupersetOf(pseg.Range()).
   530  // * mm.mappingMu must be locked.
   531  // * mm.activeMu must be locked for writing.
   532  func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
   533  	pma := pseg.ValuePtr()
   534  	if !pma.needCOW {
   535  		return false
   536  	}
   537  	if !pma.private {
   538  		return true
   539  	}
   540  	// If we have the only reference on private memory to be copied, just take
   541  	// ownership of it instead of copying. If we do hold the only reference,
   542  	// additional references can only be taken by mm.Fork(), which is excluded
   543  	// by mm.activeMu, so this isn't racy.
   544  	mm.privateRefs.mu.Lock()
   545  	defer mm.privateRefs.mu.Unlock()
   546  	fr := pseg.fileRange()
   547  	// This check relies on mm.privateRefs.refs being kept fully merged.
   548  	rseg := mm.privateRefs.refs.FindSegment(fr.Start)
   549  	if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
   550  		pma.needCOW = false
   551  		// pma.private => pma.translatePerms == hostarch.AnyAccess
   552  		vma := vseg.ValuePtr()
   553  		pma.effectivePerms = vma.effectivePerms
   554  		pma.maxPerms = vma.maxPerms
   555  		return false
   556  	}
   557  	return true
   558  }
   559  
   560  // Invalidate implements memmap.MappingSpace.Invalidate.
   561  func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
   562  	if checkInvariants {
   563  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   564  			panic(fmt.Sprintf("invalid ar: %v", ar))
   565  		}
   566  	}
   567  
   568  	mm.activeMu.Lock()
   569  	defer mm.activeMu.Unlock()
   570  	if mm.captureInvalidations {
   571  		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
   572  		return
   573  	}
   574  	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
   575  }
   576  
   577  // invalidateLocked removes pmas and AddressSpace mappings of those pmas for
   578  // addresses in ar.
   579  //
   580  // Preconditions:
   581  // * mm.activeMu must be locked for writing.
   582  // * ar.Length() != 0.
   583  // * ar must be page-aligned.
   584  func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) {
   585  	if checkInvariants {
   586  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   587  			panic(fmt.Sprintf("invalid ar: %v", ar))
   588  		}
   589  	}
   590  
   591  	var didUnmapAS bool
   592  	pseg := mm.pmas.LowerBoundSegment(ar.Start)
   593  	for pseg.Ok() && pseg.Start() < ar.End {
   594  		pma := pseg.ValuePtr()
   595  		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
   596  			pseg = mm.pmas.Isolate(pseg, ar)
   597  			pma = pseg.ValuePtr()
   598  			if !didUnmapAS {
   599  				// Unmap all of ar, not just pseg.Range(), to minimize host
   600  				// syscalls. AddressSpace mappings must be removed before
   601  				// mm.decPrivateRef().
   602  				mm.unmapASLocked(ar)
   603  				didUnmapAS = true
   604  			}
   605  			if pma.private {
   606  				mm.decPrivateRef(pseg.fileRange())
   607  			}
   608  			mm.removeRSSLocked(pseg.Range())
   609  			pma.file.DecRef(pseg.fileRange())
   610  			pseg = mm.pmas.Remove(pseg).NextSegment()
   611  		} else {
   612  			pseg = pseg.NextSegment()
   613  		}
   614  	}
   615  }
   616  
   617  // Pin returns the memmap.File ranges currently mapped by addresses in ar in
   618  // mm, acquiring a reference on the returned ranges which the caller must
   619  // release by calling Unpin. If not all addresses are mapped, Pin returns a
   620  // non-nil error. Note that Pin may return both a non-empty slice of
   621  // PinnedRanges and a non-nil error.
   622  //
   623  // Pin does not prevent mapped ranges from changing, making it unsuitable for
   624  // most I/O. It should only be used in contexts that would use get_user_pages()
   625  // in the Linux kernel.
   626  //
   627  // Preconditions:
   628  // * ar.Length() != 0.
   629  // * ar must be page-aligned.
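         //
         // Illustrative usage (a sketch, not code from this package): a caller
         // outside package mm that needs stable file ranges for ar might do
         //
         //	prs, err := memMgr.Pin(ctx, ar, hostarch.Read, false /* ignorePermissions */)
         //	defer mm.Unpin(prs) // drop references even if Pin only partially succeeded
         //	if err != nil {
         //		return err
         //	}
         //	// ... read or write via prs[i].File and prs[i].FileRange() ...
         //
         // where memMgr is a *MemoryManager and mm refers to this package.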
   630  func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
   631  	if checkInvariants {
   632  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   633  			panic(fmt.Sprintf("invalid ar: %v", ar))
   634  		}
   635  	}
   636  
   637  	// Ensure that we have usable vmas.
   638  	mm.mappingMu.RLock()
   639  	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
   640  	if vendaddr := vend.Start(); vendaddr < ar.End {
   641  		if vendaddr <= ar.Start {
   642  			mm.mappingMu.RUnlock()
   643  			return nil, verr
   644  		}
   645  		ar.End = vendaddr
   646  	}
   647  
   648  	// Ensure that we have usable pmas.
   649  	mm.activeMu.Lock()
   650  	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
   651  	mm.mappingMu.RUnlock()
   652  	if pendaddr := pend.Start(); pendaddr < ar.End {
   653  		if pendaddr <= ar.Start {
   654  			mm.activeMu.Unlock()
   655  			return nil, perr
   656  		}
   657  		ar.End = pendaddr
   658  	}
   659  
   660  	// Gather pmas.
   661  	var prs []PinnedRange
   662  	for pseg.Ok() && pseg.Start() < ar.End {
   663  		psar := pseg.Range().Intersect(ar)
   664  		f := pseg.ValuePtr().file
   665  		fr := pseg.fileRangeOf(psar)
   666  		f.IncRef(fr)
   667  		prs = append(prs, PinnedRange{
   668  			Source: psar,
   669  			File:   f,
   670  			Offset: fr.Start,
   671  		})
   672  		pseg = pseg.NextSegment()
   673  	}
   674  	mm.activeMu.Unlock()
   675  
   676  	// Return the first error in order of progress through ar.
   677  	if perr != nil {
   678  		return prs, perr
   679  	}
   680  	return prs, verr
   681  }
   682  
    683  // PinnedRange is returned by MemoryManager.Pin.
   684  type PinnedRange struct {
   685  	// Source is the corresponding range of addresses.
   686  	Source hostarch.AddrRange
   687  
   688  	// File is the mapped file.
   689  	File memmap.File
   690  
   691  	// Offset is the offset into File at which this PinnedRange begins.
   692  	Offset uint64
   693  }
   694  
   695  // FileRange returns the memmap.File offsets mapped by pr.
   696  func (pr PinnedRange) FileRange() memmap.FileRange {
   697  	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
   698  }
   699  
   700  // Unpin releases the reference held by prs.
   701  func Unpin(prs []PinnedRange) {
   702  	for i := range prs {
   703  		prs[i].File.DecRef(prs[i].FileRange())
   704  	}
   705  }
   706  
   707  // movePMAsLocked moves all pmas in oldAR to newAR.
   708  //
   709  // Preconditions:
   710  // * mm.activeMu must be locked for writing.
   711  // * oldAR.Length() != 0.
   712  // * oldAR.Length() <= newAR.Length().
   713  // * !oldAR.Overlaps(newAR).
   714  // * mm.pmas.IsEmptyRange(newAR).
   715  // * oldAR and newAR must be page-aligned.
   716  func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) {
   717  	if checkInvariants {
   718  		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
   719  			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
   720  		}
   721  		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
   722  			panic(fmt.Sprintf("invalid newAR: %v", newAR))
   723  		}
   724  		if oldAR.Length() > newAR.Length() {
   725  			panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
   726  		}
   727  		if oldAR.Overlaps(newAR) {
   728  			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
   729  		}
   730  		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
   731  	}
   732  
   733  	type movedPMA struct {
   734  		oldAR hostarch.AddrRange
   735  		pma   pma
   736  	}
   737  	var movedPMAs []movedPMA
   738  	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
   739  	for pseg.Ok() && pseg.Start() < oldAR.End {
   740  		pseg = mm.pmas.Isolate(pseg, oldAR)
   741  		movedPMAs = append(movedPMAs, movedPMA{
   742  			oldAR: pseg.Range(),
   743  			pma:   pseg.Value(),
   744  		})
   745  		pseg = mm.pmas.Remove(pseg).NextSegment()
   746  		// No RSS change is needed since we're re-inserting the same pmas
   747  		// below.
   748  	}
   749  
   750  	off := newAR.Start - oldAR.Start
   751  	pgap := mm.pmas.FindGap(newAR.Start)
   752  	for i := range movedPMAs {
   753  		mpma := &movedPMAs[i]
   754  		pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
   755  		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
   756  	}
   757  
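         	// Remove any stale host (AddressSpace) mappings of the old range;
         	// mappings for the pmas now at newAR are established later (e.g.
         	// on fault) by mapASLocked.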
   758  	mm.unmapASLocked(oldAR)
   759  }
   760  
   761  // getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
   762  // cached internal mappings. It returns:
   763  //
   764  // - An iterator to the gap after the last pma with internal mappings
   765  // containing an address in ar. If internal mappings exist for no addresses in
   766  // ar, the iterator is to a gap that begins before ar.Start.
   767  //
   768  // - An error that is non-nil if internal mappings exist for only a subset of
   769  // ar.
   770  //
   771  // Preconditions:
   772  // * mm.activeMu must be locked for writing.
   773  // * pseg.Range().Contains(ar.Start).
   774  // * pmas must exist for all addresses in ar.
   775  // * ar.Length() != 0.
   776  //
   777  // Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
   778  // into mm.pmas.
   779  func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) (pmaGapIterator, error) {
   780  	if checkInvariants {
   781  		if !ar.WellFormed() || ar.Length() == 0 {
   782  			panic(fmt.Sprintf("invalid ar: %v", ar))
   783  		}
   784  		if !pseg.Range().Contains(ar.Start) {
   785  			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
   786  		}
   787  	}
   788  
   789  	for {
   790  		if err := pseg.getInternalMappingsLocked(); err != nil {
   791  			return pseg.PrevGap(), err
   792  		}
   793  		if ar.End <= pseg.End() {
   794  			return pseg.NextGap(), nil
   795  		}
   796  		pseg, _ = pseg.NextNonEmpty()
   797  	}
   798  }
   799  
   800  // getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
   801  // have cached internal mappings. It returns the subset of ars for which
   802  // internal mappings exist. If this is not equal to ars, it returns a non-nil
   803  // error explaining why.
   804  //
   805  // Preconditions:
   806  // * mm.activeMu must be locked for writing.
   807  // * pmas must exist for all addresses in ar.
   808  //
   809  // Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
   810  // into mm.pmas.
   811  func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars hostarch.AddrRangeSeq) (hostarch.AddrRangeSeq, error) {
   812  	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
   813  		ar := arsit.Head()
   814  		if ar.Length() == 0 {
   815  			continue
   816  		}
   817  		if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
   818  			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
   819  		}
   820  	}
   821  	return ars, nil
   822  }
   823  
   824  // internalMappingsLocked returns internal mappings for addresses in ar.
   825  //
   826  // Preconditions:
   827  // * mm.activeMu must be locked.
   828  // * Internal mappings must have been previously established for all addresses
   829  //   in ar.
   830  // * ar.Length() != 0.
   831  // * pseg.Range().Contains(ar.Start).
   832  func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq {
   833  	if checkInvariants {
   834  		if !ar.WellFormed() || ar.Length() == 0 {
   835  			panic(fmt.Sprintf("invalid ar: %v", ar))
   836  		}
   837  		if !pseg.Range().Contains(ar.Start) {
   838  			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
   839  		}
   840  	}
   841  
   842  	if ar.End <= pseg.End() {
   843  		// Since only one pma is involved, we can use pma.internalMappings
   844  		// directly, avoiding a slice allocation.
   845  		offset := uint64(ar.Start - pseg.Start())
   846  		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
   847  	}
   848  
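         	// ar spans multiple pmas; collect each pma's cached blocks,
         	// restricted to its intersection with ar, into a single BlockSeq.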
   849  	var ims []safemem.Block
   850  	for {
   851  		pr := pseg.Range().Intersect(ar)
   852  		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
   853  			ims = append(ims, pims.Head())
   854  		}
   855  		if ar.End <= pseg.End() {
   856  			break
   857  		}
   858  		pseg = pseg.NextSegment()
   859  	}
   860  	return safemem.BlockSeqFromSlice(ims)
   861  }
   862  
   863  // vecInternalMappingsLocked returns internal mappings for addresses in ars.
   864  //
   865  // Preconditions:
   866  // * mm.activeMu must be locked.
   867  // * Internal mappings must have been previously established for all addresses
   868  //   in ars.
   869  func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq {
   870  	var ims []safemem.Block
   871  	for ; !ars.IsEmpty(); ars = ars.Tail() {
   872  		ar := ars.Head()
   873  		if ar.Length() == 0 {
   874  			continue
   875  		}
   876  		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
   877  			ims = append(ims, pims.Head())
   878  		}
   879  	}
   880  	return safemem.BlockSeqFromSlice(ims)
   881  }
   882  
   883  // incPrivateRef acquires a reference on private pages in fr.
   884  func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) {
   885  	mm.privateRefs.mu.Lock()
   886  	defer mm.privateRefs.mu.Unlock()
   887  	refSet := &mm.privateRefs.refs
   888  	seg, gap := refSet.Find(fr.Start)
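         	// Walk fr, incrementing the count of existing reference segments
         	// and inserting count-1 segments into gaps. MergeAdjacent then
         	// restores the fully-merged invariant that isPMACopyOnWriteLocked
         	// relies on.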
   889  	for {
   890  		switch {
   891  		case seg.Ok() && seg.Start() < fr.End:
   892  			seg = refSet.Isolate(seg, fr)
   893  			seg.SetValue(seg.Value() + 1)
   894  			seg, gap = seg.NextNonEmpty()
   895  		case gap.Ok() && gap.Start() < fr.End:
   896  			seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty()
   897  		default:
   898  			refSet.MergeAdjacent(fr)
   899  			return
   900  		}
   901  	}
   902  }
   903  
   904  // decPrivateRef releases a reference on private pages in fr.
   905  func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) {
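         	// Ranges whose reference count drops to zero are collected in
         	// freed and released to the MemoryFile only after privateRefs.mu
         	// is unlocked.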
   906  	var freed []memmap.FileRange
   907  
   908  	mm.privateRefs.mu.Lock()
   909  	refSet := &mm.privateRefs.refs
   910  	seg := refSet.LowerBoundSegment(fr.Start)
   911  	for seg.Ok() && seg.Start() < fr.End {
   912  		seg = refSet.Isolate(seg, fr)
   913  		if old := seg.Value(); old == 1 {
   914  			freed = append(freed, seg.Range())
   915  			seg = refSet.Remove(seg).NextSegment()
   916  		} else {
   917  			seg.SetValue(old - 1)
   918  			seg = seg.NextSegment()
   919  		}
   920  	}
   921  	refSet.MergeAdjacent(fr)
   922  	mm.privateRefs.mu.Unlock()
   923  
   924  	mf := mm.mfp.MemoryFile()
   925  	for _, fr := range freed {
   926  		mf.DecRef(fr)
   927  	}
   928  }
   929  
   930  // addRSSLocked updates the current and maximum resident set size of a
   931  // MemoryManager to reflect the insertion of a pma at ar.
   932  //
   933  // Preconditions: mm.activeMu must be locked for writing.
   934  func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) {
   935  	mm.curRSS += uint64(ar.Length())
   936  	if mm.curRSS > mm.maxRSS {
   937  		mm.maxRSS = mm.curRSS
   938  	}
   939  }
   940  
   941  // removeRSSLocked updates the current resident set size of a MemoryManager to
   942  // reflect the removal of a pma at ar.
   943  //
   944  // Preconditions: mm.activeMu must be locked for writing.
   945  func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) {
   946  	mm.curRSS -= uint64(ar.Length())
   947  }
   948  
   949  // pmaSetFunctions implements segment.Functions for pmaSet.
   950  type pmaSetFunctions struct{}
   951  
   952  func (pmaSetFunctions) MinKey() hostarch.Addr {
   953  	return 0
   954  }
   955  
   956  func (pmaSetFunctions) MaxKey() hostarch.Addr {
   957  	return ^hostarch.Addr(0)
   958  }
   959  
   960  func (pmaSetFunctions) ClearValue(pma *pma) {
   961  	pma.file = nil
   962  	pma.internalMappings = safemem.BlockSeq{}
   963  }
   964  
   965  func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) {
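         	// pmas can only merge if they map adjacent offsets of the same
         	// file with identical permissions, copy-on-write requirement, and
         	// privateness.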
   966  	if pma1.file != pma2.file ||
   967  		pma1.off+uint64(ar1.Length()) != pma2.off ||
   968  		pma1.translatePerms != pma2.translatePerms ||
   969  		pma1.effectivePerms != pma2.effectivePerms ||
   970  		pma1.maxPerms != pma2.maxPerms ||
   971  		pma1.needCOW != pma2.needCOW ||
   972  		pma1.private != pma2.private {
   973  		return pma{}, false
   974  	}
   975  
   976  	// Discard internal mappings instead of trying to merge them, since merging
   977  	// them requires an allocation and getting them again from the
   978  	// memmap.File might not.
   979  	pma1.internalMappings = safemem.BlockSeq{}
   980  	return pma1, true
   981  }
   982  
   983  func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) {
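         	// The left piece keeps [ar.Start, split); the right piece's file
         	// offset, and its cached internal mappings if any, are advanced
         	// past the left piece's length.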
   984  	newlen1 := uint64(split - ar.Start)
   985  	p2 := p
   986  	p2.off += newlen1
   987  	if !p.internalMappings.IsEmpty() {
   988  		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
   989  		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
   990  	}
   991  	return p, p2
   992  }
   993  
   994  // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
   995  // so by scanning linearly backward from pgap.
   996  //
   997  // Preconditions:
   998  // * mm.activeMu must be locked.
   999  // * addr <= pgap.Start().
  1000  func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator {
  1001  	if checkInvariants {
  1002  		if !pgap.Ok() {
  1003  			panic("terminal pma iterator")
  1004  		}
  1005  		if addr > pgap.Start() {
  1006  			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
  1007  		}
  1008  	}
  1009  	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
  1010  	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
  1011  	// start of a range containing only a single PMA.
  1012  	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
  1013  		return pseg
  1014  	}
  1015  	return mm.pmas.UpperBoundSegment(addr)
  1016  }
  1017  
  1018  // getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
  1019  // non-empty.
  1020  //
  1021  // Preconditions: mm.activeMu must be locked for writing.
  1022  func (pseg pmaIterator) getInternalMappingsLocked() error {
  1023  	pma := pseg.ValuePtr()
  1024  	if pma.internalMappings.IsEmpty() {
  1025  		// This must use maxPerms (instead of perms) because some permission
  1026  		// constraints are only visible to vmas; for example, mappings of
  1027  		// read-only files have vma.maxPerms.Write unset, but this may not be
  1028  		// visible to the memmap.Mappable.
  1029  		perms := pma.maxPerms
  1030  		// We will never execute application code through an internal mapping.
  1031  		perms.Execute = false
  1032  		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
  1033  		if err != nil {
  1034  			return err
  1035  		}
  1036  		pma.internalMappings = ims
  1037  	}
  1038  	return nil
  1039  }
  1040  
  1041  func (pseg pmaIterator) fileRange() memmap.FileRange {
  1042  	return pseg.fileRangeOf(pseg.Range())
  1043  }
  1044  
  1045  // Preconditions:
  1046  // * pseg.Range().IsSupersetOf(ar).
   1047  // * ar.Length() != 0.
  1048  func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange {
  1049  	if checkInvariants {
  1050  		if !pseg.Ok() {
  1051  			panic("terminal pma iterator")
  1052  		}
  1053  		if !ar.WellFormed() || ar.Length() == 0 {
  1054  			panic(fmt.Sprintf("invalid ar: %v", ar))
  1055  		}
  1056  		if !pseg.Range().IsSupersetOf(ar) {
  1057  			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
  1058  		}
  1059  	}
  1060  
  1061  	pma := pseg.ValuePtr()
  1062  	pstart := pseg.Start()
  1063  	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
  1064  }