gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/mm/pma.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"fmt"
    19  	"sync"
    20  	"sync/atomic"
    21  
    22  	"gvisor.dev/gvisor/pkg/context"
    23  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    24  	"gvisor.dev/gvisor/pkg/hostarch"
    25  	"gvisor.dev/gvisor/pkg/safecopy"
    26  	"gvisor.dev/gvisor/pkg/safemem"
    27  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    28  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    29  	"gvisor.dev/gvisor/pkg/sentry/usage"
    30  )
    31  
    32  // existingPMAsLocked checks that pmas exist for all addresses in ar, and
    33  // support access of type (at, ignorePermissions). If so, it returns an
    34  // iterator to the pma containing ar.Start. Otherwise it returns a terminal
    35  // iterator.
    36  //
    37  // Preconditions:
    38  //   - mm.activeMu must be locked.
    39  //   - ar.Length() != 0.
    40  func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
    41  	if checkInvariants {
    42  		if !ar.WellFormed() || ar.Length() == 0 {
    43  			panic(fmt.Sprintf("invalid ar: %v", ar))
    44  		}
    45  	}
    46  
    47  	first := mm.pmas.FindSegment(ar.Start)
    48  	pseg := first
    49  	for pseg.Ok() {
    50  		pma := pseg.ValuePtr()
    51  		perms := pma.effectivePerms
    52  		if ignorePermissions {
    53  			perms = pma.maxPerms
    54  		}
    55  		if !perms.SupersetOf(at) {
    56  			return pmaIterator{}
    57  		}
    58  		if needInternalMappings && pma.internalMappings.IsEmpty() {
    59  			return pmaIterator{}
    60  		}
    61  
    62  		if ar.End <= pseg.End() {
    63  			return first
    64  		}
    65  		pseg, _ = pseg.NextNonEmpty()
    66  	}
    67  
    68  	// Ran out of pmas before reaching ar.End.
    69  	return pmaIterator{}
    70  }
    71  
    72  // existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
    73  // and support access of type (at, ignorePermissions).
    74  //
    75  // Preconditions: mm.activeMu must be locked.
    76  func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
    77  	for ; !ars.IsEmpty(); ars = ars.Tail() {
    78  		if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
    79  			return false
    80  		}
    81  	}
    82  	return true
    83  }
    84  
    85  // getPMAsLocked ensures that pmas exist for all addresses in ar, and support
    86  // access of type at. It returns:
    87  //
    88  //   - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
    89  //     the iterator is unspecified.
    90  //
    91  //   - An iterator to the gap after the last pma containing an address in ar. If
    92  //     pmas exist for no addresses in ar, the iterator is to a gap that begins
    93  //     before ar.Start.
    94  //
    95  //   - An error that is non-nil if pmas exist for only a subset of ar.
    96  //
    97  // Preconditions:
    98  //   - mm.mappingMu must be locked.
    99  //   - mm.activeMu must be locked for writing.
   100  //   - ar.Length() != 0.
   101  //   - vseg.Range().Contains(ar.Start).
   102  //   - vmas must exist for all addresses in ar, and support accesses of type at
   103  //     (i.e. permission checks must have been performed against vmas).
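         //
         // A typical caller treats a returned gap that starts before ar.End as
         // partial success and truncates ar accordingly. A sketch of the pattern,
         // mirroring Pin below (locking elided):
         //
         //	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
         //	if pendaddr := pend.Start(); pendaddr < ar.End {
         //		if pendaddr <= ar.Start {
         //			return nil, perr // no pmas exist for any address in ar
         //		}
         //		ar.End = pendaddr // pmas exist for only a prefix of ar
         //	}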
   104  func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
   105  	if checkInvariants {
   106  		if !ar.WellFormed() || ar.Length() == 0 {
   107  			panic(fmt.Sprintf("invalid ar: %v", ar))
   108  		}
   109  		if !vseg.Ok() {
   110  			panic("terminal vma iterator")
   111  		}
   112  		if !vseg.Range().Contains(ar.Start) {
   113  			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
   114  		}
   115  	}
   116  
   117  	// Page-align ar so that all AddrRanges are aligned.
   118  	end, ok := ar.End.RoundUp()
   119  	var alignerr error
   120  	if !ok {
   121  		end = ar.End.RoundDown()
   122  		alignerr = linuxerr.EFAULT
   123  	}
   124  	ar = hostarch.AddrRange{ar.Start.RoundDown(), end}
   125  
   126  	pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
   127  	if pend.Start() <= ar.Start {
   128  		return pmaIterator{}, pend, perr
   129  	}
   130  	// getPMAsInternalLocked may not have returned pstart due to iterator
   131  	// invalidation.
   132  	if !pstart.Ok() {
   133  		pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
   134  	}
   135  	if perr != nil {
   136  		return pstart, pend, perr
   137  	}
   138  	return pstart, pend, alignerr
   139  }
   140  
   141  // getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
   142  // support access of type at. It returns the subset of ars for which pmas
   143  // exist. If this is not equal to ars, it returns a non-nil error explaining
   144  // why.
   145  //
   146  // Preconditions:
   147  //   - mm.mappingMu must be locked.
   148  //   - mm.activeMu must be locked for writing.
   149  //   - vmas must exist for all addresses in ars, and support accesses of type at
   150  //     (i.e. permission checks must have been performed against vmas).
   151  func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) {
   152  	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
   153  		ar := arsit.Head()
   154  		if ar.Length() == 0 {
   155  			continue
   156  		}
   157  		if checkInvariants {
   158  			if !ar.WellFormed() {
   159  				panic(fmt.Sprintf("invalid ar: %v", ar))
   160  			}
   161  		}
   162  
   163  		// Page-align ar so that all AddrRanges are aligned.
   164  		end, ok := ar.End.RoundUp()
   165  		var alignerr error
   166  		if !ok {
   167  			end = ar.End.RoundDown()
   168  			alignerr = linuxerr.EFAULT
   169  		}
   170  		ar = hostarch.AddrRange{ar.Start.RoundDown(), end}
   171  
   172  		_, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
   173  		if perr != nil {
   174  			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
   175  		}
   176  		if alignerr != nil {
   177  			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
   178  		}
   179  	}
   180  
   181  	return ars, nil
   182  }
   183  
   184  // getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
   185  // exceptions:
   186  //
   187  //   - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
   188  //     is, the returned iterator may be terminal, even if a pma that contains
   189  //     ar.Start exists). Returning this iterator on a best-effort basis allows
   190  //     callers that require it to use it when it's cheaply available, while also
   191  //     avoiding the overhead of retrieving it when it's not.
   192  //
   193  //   - getPMAsInternalLocked additionally requires that ar is page-aligned.
   194  //     getPMAsInternalLocked is an implementation helper for getPMAsLocked and
   195  //     getVecPMAsLocked; other clients should call one of those instead.
   196  func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
   197  	if checkInvariants {
   198  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   199  			panic(fmt.Sprintf("invalid ar: %v", ar))
   200  		}
   201  		if !vseg.Ok() {
   202  			panic("terminal vma iterator")
   203  		}
   204  		if !vseg.Range().Contains(ar.Start) {
   205  			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
   206  		}
   207  	}
   208  	var pfdrs *pendingFileDecRefs
   209  	defer func() { // must be a closure to avoid evaluating pfdrs immediately
   210  		pfdrs.Cleanup()
   211  	}()
   212  	var unmapAR hostarch.AddrRange
   213  	defer func() {
   214  		mm.unmapASLocked(unmapAR)
   215  	}()
   216  
   217  	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
   218  	opts := pgalloc.AllocOpts{Kind: usage.Anonymous, Dir: pgalloc.BottomUp, MemCgID: memCgID}
   219  	vma := vseg.ValuePtr()
   220  	if uintptr(ar.Start) < atomic.LoadUintptr(&vma.lastFault) {
   221  		// Detect cases where memory is accessed downwards and change memory file
   222  		// allocation order to increase the chances that pages are coalesced.
   223  		opts.Dir = pgalloc.TopDown
   224  	}
   225  	atomic.StoreUintptr(&vma.lastFault, uintptr(ar.Start))
   226  
   227  	// Limit the range we allocate to ar, aligned to privateAllocUnit.
   228  	maskAR := privateAligned(ar)
   229  	// The range in which we iterate vmas and pmas is still limited to ar, to
   230  	// ensure that we don't allocate or COW-break a pma we don't need.
   231  	pseg, pgap := mm.pmas.Find(ar.Start)
   232  	pstart := pseg
   233  	for {
   234  		// Get pmas for this vma.
   235  		vsegAR := vseg.Range().Intersect(ar)
   236  		vma := vseg.ValuePtr()
   237  	pmaLoop:
   238  		for {
   239  			switch {
   240  			case pgap.Ok() && pgap.Start() < vsegAR.End:
   241  				// Need a pma here.
   242  				optAR := vseg.Range().Intersect(pgap.Range())
   243  				if checkInvariants {
   244  					if optAR.Length() == 0 {
   245  						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
   246  					}
   247  				}
   248  				if vma.mappable == nil {
   249  					// Private anonymous mappings get pmas by allocating.
   250  					allocAR := optAR.Intersect(maskAR)
   251  					fr, err := mm.mf.Allocate(uint64(allocAR.Length()), opts)
   252  					if err != nil {
   253  						return pstart, pgap, err
   254  					}
   255  					if checkInvariants {
   256  						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
   257  							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
   258  						}
   259  					}
   260  					mm.addRSSLocked(allocAR)
   261  					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
   262  						file:           mm.mf,
   263  						off:            fr.Start,
   264  						translatePerms: hostarch.AnyAccess,
   265  						effectivePerms: vma.effectivePerms,
   266  						maxPerms:       vma.maxPerms,
   267  						// Since we just allocated this memory and have the
   268  						// only reference, the new pma does not need
   269  						// copy-on-write.
   270  						private: true,
   271  					}).NextNonEmpty()
   272  					pstart = pmaIterator{} // iterators invalidated
   273  				} else {
   274  					// Other mappings get pmas by translating.
   275  					optMR := vseg.mappableRangeOf(optAR)
   276  					reqAR := optAR.Intersect(ar)
   277  					reqMR := vseg.mappableRangeOf(reqAR)
   278  					perms := at
   279  					if vma.private {
   280  						// This pma will be copy-on-write; don't require write
   281  						// permission, but do require read permission to
   282  						// facilitate the copy.
   283  						//
   284  						// If at.Write is true, we will need to break
   285  						// copy-on-write immediately, which occurs after
   286  						// translation below.
   287  						perms.Read = true
   288  						perms.Write = false
   289  					}
   290  					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
   291  					if checkInvariants {
   292  						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
   293  							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
   294  						}
   295  					}
   296  					// Install a pma for each translation.
   297  					if len(ts) == 0 {
   298  						return pstart, pgap, err
   299  					}
   300  					pstart = pmaIterator{} // iterators invalidated
   301  					for _, t := range ts {
   302  						newpmaAR := vseg.addrRangeOf(t.Source)
   303  						newpma := pma{
   304  							file:           t.File,
   305  							off:            t.Offset,
   306  							translatePerms: t.Perms,
   307  							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
   308  							maxPerms:       vma.maxPerms.Intersect(t.Perms),
   309  						}
   310  						if vma.private {
   311  							newpma.effectivePerms.Write = false
   312  							newpma.maxPerms.Write = false
   313  							newpma.needCOW = true
   314  						}
   315  						mm.addRSSLocked(newpmaAR)
   316  						t.File.IncRef(t.FileRange(), memCgID)
   317  						// This is valid because memmap.Mappable.Translate is
   318  						// required to return Translations in increasing
   319  						// Translation.Source order.
   320  						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
   321  						pgap = pseg.NextGap()
   322  					}
   323  					// The error returned by Translate is only significant if
   324  					// it occurred before ar.End.
   325  					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
   326  						return pstart, pgap, err
   327  					}
   328  					// Rewind pseg to the first pma inserted and continue the
   329  					// loop to check if we need to break copy-on-write.
   330  					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
   331  					continue
   332  				}
   333  
   334  			case pseg.Ok() && pseg.Start() < vsegAR.End:
   335  				oldpma := pseg.ValuePtr()
   336  				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
   337  					// Break copy-on-write by copying.
   338  					if checkInvariants {
   339  						if !oldpma.maxPerms.Read {
   340  							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
   341  						}
   342  					}
   343  					var copyAR hostarch.AddrRange
   344  					if vma := vseg.ValuePtr(); vma.effectivePerms.Execute {
   345  						// The majority of copy-on-write breaks on executable
   346  						// pages come from:
   347  						//
   348  						//	- The ELF loader, which must zero out bytes on the
   349  						//		last page of each segment after the end of the
   350  						//		segment.
   351  						//
   352  						//	- gdb's use of ptrace to insert breakpoints.
   353  						//
   354  						// Neither of these cases has enough spatial locality
   355  						// to benefit from copying nearby pages, so if the vma
   356  						// is executable, only copy the pages required.
   357  						copyAR = pseg.Range().Intersect(ar)
   358  					} else if vma.growsDown {
   359  						// In most cases, the new process will not use most of
   360  						// its stack before exiting or invoking execve(); it is
   361  						// especially unlikely to return very far down its call
   362  						// stack, since async-signal-safety concerns in
   363  						// multithreaded programs prevent the new process from
   364  						// being able to do much. So only copy up to one page
   365  						// before and after the pages required.
   366  						stackMaskAR := ar
   367  						if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start {
   368  							stackMaskAR.Start = newStart
   369  						}
   370  						if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End {
   371  							stackMaskAR.End = newEnd
   372  						}
   373  						copyAR = pseg.Range().Intersect(stackMaskAR)
   374  					} else {
   375  						copyAR = pseg.Range().Intersect(maskAR)
   376  					}
   377  					// Get internal mappings from the pma to copy from.
   378  					if err := pseg.getInternalMappingsLocked(); err != nil {
   379  						return pstart, pseg.PrevGap(), err
   380  					}
   381  					// Copy contents.
   382  					reader := safemem.BlockSeqReader{Blocks: mm.internalMappingsLocked(pseg, copyAR)}
   383  					fr, err := mm.mf.Allocate(uint64(copyAR.Length()), pgalloc.AllocOpts{
   384  						Kind:       usage.Anonymous,
   385  						Mode:       pgalloc.AllocateAndWritePopulate,
   386  						MemCgID:    memCgID,
   387  						ReaderFunc: reader.ReadToBlocks,
   388  					})
   389  					if _, ok := err.(safecopy.BusError); ok {
   390  						// If we got SIGBUS during the copy, deliver SIGBUS to
   391  						// userspace (instead of SIGSEGV) if we're breaking
   392  						// copy-on-write due to application page fault.
   393  						err = &memmap.BusError{err}
   394  					}
   395  					if fr.Length() == 0 {
   396  						return pstart, pseg.PrevGap(), err
   397  					}
   398  					// Replace the pma with a copy in the part of the address
   399  					// range where copying was successful. This doesn't change
   400  					// RSS.
   401  					copyAR.End = copyAR.Start + hostarch.Addr(fr.Length())
   402  					if copyAR != pseg.Range() {
   403  						pseg = mm.pmas.Isolate(pseg, copyAR)
   404  						pstart = pmaIterator{} // iterators invalidated
   405  					}
   406  					oldpma = pseg.ValuePtr()
   407  					unmapAR = joinAddrRanges(unmapAR, copyAR)
   408  					pfdrs = appendPendingFileDecRef(pfdrs, oldpma.file, pseg.fileRange())
   409  					oldpma.file = mm.mf
   410  					oldpma.off = fr.Start
   411  					oldpma.translatePerms = hostarch.AnyAccess
   412  					oldpma.effectivePerms = vma.effectivePerms
   413  					oldpma.maxPerms = vma.maxPerms
   414  					oldpma.needCOW = false
   415  					oldpma.private = true
   416  					oldpma.internalMappings = safemem.BlockSeq{}
   417  					// Try to merge the pma with its neighbors.
   418  					if prev := pseg.PrevSegment(); prev.Ok() {
   419  						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
   420  							pseg = merged
   421  							pstart = pmaIterator{} // iterators invalidated
   422  						}
   423  					}
   424  					if next := pseg.NextSegment(); next.Ok() {
   425  						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
   426  							pseg = merged
   427  							pstart = pmaIterator{} // iterators invalidated
   428  						}
   429  					}
    430  					// The error returned by Allocate is only
   431  					// significant if it occurred before ar.End.
   432  					if err != nil && pseg.End() < ar.End {
   433  						return pstart, pseg.NextGap(), err
   434  					}
   435  					// Ensure pseg and pgap are correct for the next iteration
   436  					// of the loop.
   437  					pseg, pgap = pseg.NextNonEmpty()
   438  				} else if !oldpma.translatePerms.SupersetOf(at) {
   439  					// Get new pmas (with sufficient permissions) by calling
   440  					// memmap.Mappable.Translate again.
   441  					if checkInvariants {
   442  						if oldpma.private {
   443  							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
   444  						}
   445  					}
   446  					// Allow the entire pma to be replaced.
   447  					optAR := pseg.Range()
   448  					optMR := vseg.mappableRangeOf(optAR)
   449  					reqAR := optAR.Intersect(ar)
   450  					reqMR := vseg.mappableRangeOf(reqAR)
   451  					perms := oldpma.translatePerms.Union(at)
   452  					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
   453  					if checkInvariants {
   454  						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
   455  							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
   456  						}
   457  					}
   458  					// Remove the part of the existing pma covered by new
   459  					// Translations, then insert new pmas. This doesn't change
   460  					// RSS.
   461  					if len(ts) == 0 {
   462  						return pstart, pseg.PrevGap(), err
   463  					}
   464  					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
   465  					transAR := vseg.addrRangeOf(transMR)
   466  					pseg = mm.pmas.Isolate(pseg, transAR)
   467  					unmapAR = joinAddrRanges(unmapAR, transAR)
   468  					pfdrs = appendPendingFileDecRef(pfdrs, pseg.ValuePtr().file, pseg.fileRange())
   469  					pgap = mm.pmas.Remove(pseg)
   470  					pstart = pmaIterator{} // iterators invalidated
   471  					for _, t := range ts {
   472  						newpmaAR := vseg.addrRangeOf(t.Source)
   473  						newpma := pma{
   474  							file:           t.File,
   475  							off:            t.Offset,
   476  							translatePerms: t.Perms,
   477  							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
   478  							maxPerms:       vma.maxPerms.Intersect(t.Perms),
   479  						}
   480  						if vma.private {
   481  							newpma.effectivePerms.Write = false
   482  							newpma.maxPerms.Write = false
   483  							newpma.needCOW = true
   484  						}
   485  						t.File.IncRef(t.FileRange(), memCgID)
   486  						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
   487  						pgap = pseg.NextGap()
   488  					}
   489  					// The error returned by Translate is only significant if
   490  					// it occurred before ar.End.
   491  					if err != nil && pseg.End() < ar.End {
   492  						return pstart, pgap, err
   493  					}
   494  					// Ensure pseg and pgap are correct for the next iteration
   495  					// of the loop.
   496  					if pgap.Range().Length() == 0 {
   497  						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
   498  					} else {
   499  						pseg = pmaIterator{}
   500  					}
   501  				} else {
   502  					// We have a usable pma; continue.
   503  					pseg, pgap = pseg.NextNonEmpty()
   504  				}
   505  
   506  			default:
   507  				break pmaLoop
   508  			}
   509  		}
   510  		// Go to the next vma.
   511  		if ar.End <= vseg.End() {
   512  			if pgap.Ok() {
   513  				return pstart, pgap, nil
   514  			}
   515  			return pstart, pseg.PrevGap(), nil
   516  		}
   517  		vseg = vseg.NextSegment()
   518  	}
   519  }
   520  
   521  const (
   522  	// When memory is allocated for a private pma, align the allocated address
   523  	// range to a privateAllocUnit boundary when possible. Larger values of
   524  	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
   525  	// to be mapped, but may result in larger amounts of wasted memory in the
   526  	// presence of fragmentation. privateAllocUnit must be a power-of-2
   527  	// multiple of hostarch.PageSize.
   528  	privateAllocUnit = hostarch.HugePageSize
   529  
   530  	privateAllocMask = privateAllocUnit - 1
   531  )
   532  
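         // privateAligned returns ar expanded to privateAllocUnit boundaries: Start
         // is rounded down, and End is rounded up unless doing so would overflow. As
         // a worked example, assuming privateAllocUnit is 2 MiB (hostarch.HugePageSize
         // on platforms with 4 KiB base pages), [0x201000, 0x202000) becomes
         // [0x200000, 0x400000), so a later fault in the same 2 MiB unit may be
         // satisfied by an already-allocated pma rather than a new allocation.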
   533  func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange {
   534  	aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End}
   535  	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
   536  		aligned.End = end
   537  	}
   538  	if checkInvariants {
   539  		if !aligned.IsSupersetOf(ar) {
   540  			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
   541  		}
   542  	}
   543  	return aligned
   544  }
   545  
   546  // isPMACopyOnWriteLocked returns true if the contents of the pma represented
   547  // by pseg must be copied to a new private pma to be written to.
   548  //
   549  // If the pma is a copy-on-write private pma, and holds the only reference on
   550  // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
   551  // and update the pma to indicate that it does not require copy-on-write.
   552  //
   553  // Preconditions:
   554  //   - vseg.Range().IsSupersetOf(pseg.Range()).
   555  //   - mm.mappingMu must be locked.
   556  //   - mm.activeMu must be locked for writing.
   557  func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
   558  	pma := pseg.ValuePtr()
   559  	if !pma.needCOW {
   560  		return false
   561  	}
   562  	if !pma.private {
   563  		return true
   564  	}
   565  	// If we have the only reference on private memory to be copied, just take
   566  	// ownership of it instead of copying. If we do hold the only reference,
   567  	// additional references can only be taken by mm.Fork(), which is excluded
   568  	// by mm.activeMu, so this isn't racy.
   569  	if mm.mf.HasUniqueRef(pseg.fileRange()) {
   570  		pma.needCOW = false
   571  		// pma.private => pma.translatePerms == hostarch.AnyAccess
   572  		vma := vseg.ValuePtr()
   573  		pma.effectivePerms = vma.effectivePerms
   574  		pma.maxPerms = vma.maxPerms
   575  		return false
   576  	}
   577  	return true
   578  }
   579  
   580  // Invalidate implements memmap.MappingSpace.Invalidate.
   581  func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
   582  	if checkInvariants {
   583  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   584  			panic(fmt.Sprintf("invalid ar: %v", ar))
   585  		}
   586  	}
   587  
   588  	mm.activeMu.Lock()
   589  	defer mm.activeMu.Unlock()
   590  	if mm.captureInvalidations {
   591  		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
   592  		return
   593  	}
   594  	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
   595  }
   596  
   597  // invalidateLocked removes pmas and AddressSpace mappings of those pmas for
   598  // addresses in ar.
   599  //
   600  // Preconditions:
   601  //   - mm.activeMu must be locked for writing.
   602  //   - ar.Length() != 0.
   603  //   - ar must be page-aligned.
   604  func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) {
   605  	if checkInvariants {
   606  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   607  			panic(fmt.Sprintf("invalid ar: %v", ar))
   608  		}
   609  	}
   610  
   611  	var didUnmapAS bool
   612  	pseg := mm.pmas.LowerBoundSegment(ar.Start)
   613  	for pseg.Ok() && pseg.Start() < ar.End {
   614  		pma := pseg.ValuePtr()
   615  		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
   616  			pseg = mm.pmas.Isolate(pseg, ar)
   617  			pma = pseg.ValuePtr()
   618  			if !didUnmapAS {
   619  				// Unmap all of ar, not just pseg.Range(), to minimize host
   620  				// syscalls. AddressSpace mappings must be removed before
   621  				// pma.file.DecRef().
   622  				//
   623  				// Note that we do more than just ar here, and extrapolate
   624  				// to the end of any previous region that we may have mapped.
   625  				// This is done to ensure that lower layers can fully invalidate
   626  				// intermediate pagetable pages during the unmap.
   627  				var unmapAR hostarch.AddrRange
   628  				if prev := pseg.PrevSegment(); prev.Ok() {
   629  					unmapAR.Start = prev.End()
   630  				} else {
   631  					unmapAR.Start = mm.layout.MinAddr
   632  				}
   633  				if last := mm.pmas.LowerBoundSegment(ar.End); last.Ok() {
   634  					if last.Start() < ar.End {
   635  						unmapAR.End = ar.End
   636  					} else {
   637  						unmapAR.End = last.Start()
   638  					}
   639  				} else {
   640  					unmapAR.End = mm.layout.MaxAddr
   641  				}
   642  				mm.unmapASLocked(unmapAR)
   643  				didUnmapAS = true
   644  			}
   645  			mm.removeRSSLocked(pseg.Range())
   646  			pma.file.DecRef(pseg.fileRange())
   647  			pseg = mm.pmas.Remove(pseg).NextSegment()
   648  		} else {
   649  			pseg = pseg.NextSegment()
   650  		}
   651  	}
   652  }
   653  
   654  // Pin returns the memmap.File ranges currently mapped by addresses in ar in
   655  // mm, acquiring a reference on the returned ranges which the caller must
   656  // release by calling Unpin. If not all addresses are mapped, Pin returns a
   657  // non-nil error. Note that Pin may return both a non-empty slice of
   658  // PinnedRanges and a non-nil error.
   659  //
   660  // Pin does not prevent mapped ranges from changing, making it unsuitable for
   661  // most I/O. It should only be used in contexts that would use get_user_pages()
   662  // in the Linux kernel.
   663  //
   664  // Preconditions:
   665  //   - ar.Length() != 0.
   666  //   - ar must be page-aligned.
   667  func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
   668  	if checkInvariants {
   669  		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
   670  			panic(fmt.Sprintf("invalid ar: %v", ar))
   671  		}
   672  	}
   673  
   674  	// Ensure that we have usable vmas.
   675  	mm.mappingMu.RLock()
   676  	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
   677  	if vendaddr := vend.Start(); vendaddr < ar.End {
   678  		if vendaddr <= ar.Start {
   679  			mm.mappingMu.RUnlock()
   680  			return nil, verr
   681  		}
   682  		ar.End = vendaddr
   683  	}
   684  
   685  	// Ensure that we have usable pmas.
   686  	mm.activeMu.Lock()
   687  	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
   688  	mm.mappingMu.RUnlock()
   689  	if pendaddr := pend.Start(); pendaddr < ar.End {
   690  		if pendaddr <= ar.Start {
   691  			mm.activeMu.Unlock()
   692  			return nil, perr
   693  		}
   694  		ar.End = pendaddr
   695  	}
   696  
   697  	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
   698  	// Gather pmas.
   699  	var prs []PinnedRange
   700  	for pseg.Ok() && pseg.Start() < ar.End {
   701  		psar := pseg.Range().Intersect(ar)
   702  		f := pseg.ValuePtr().file
   703  		fr := pseg.fileRangeOf(psar)
   704  		f.IncRef(fr, memCgID)
   705  		prs = append(prs, PinnedRange{
   706  			Source: psar,
   707  			File:   f,
   708  			Offset: fr.Start,
   709  		})
   710  		pseg = pseg.NextSegment()
   711  	}
   712  	mm.activeMu.Unlock()
   713  
   714  	// Return the first error in order of progress through ar.
   715  	if perr != nil {
   716  		return prs, perr
   717  	}
   718  	return prs, verr
   719  }
   720  
    721  // PinnedRange describes a pinned range of addresses and the memmap.File
    721  // range it maps; PinnedRanges are returned by MemoryManager.Pin.
   722  type PinnedRange struct {
   723  	// Source is the corresponding range of addresses.
   724  	Source hostarch.AddrRange
   725  
   726  	// File is the mapped file.
   727  	File memmap.File
   728  
   729  	// Offset is the offset into File at which this PinnedRange begins.
   730  	Offset uint64
   731  }
   732  
   733  // FileRange returns the memmap.File offsets mapped by pr.
   734  func (pr PinnedRange) FileRange() memmap.FileRange {
   735  	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
   736  }
   737  
   738  // Unpin releases the reference held by prs.
   739  func Unpin(prs []PinnedRange) {
   740  	for i := range prs {
   741  		prs[i].File.DecRef(prs[i].FileRange())
   742  	}
   743  }
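
         // A minimal usage sketch for Pin and Unpin (illustrative only; consume is a
         // hypothetical caller-provided function, and error handling is simplified;
         // recall that Pin may return both pinned ranges and a non-nil error):
         //
         //	prs, err := mm.Pin(ctx, ar, hostarch.Read, false /* ignorePermissions */)
         //	defer Unpin(prs)
         //	for _, pr := range prs {
         //		consume(pr.File, pr.FileRange())
         //	}
         //	if err != nil {
         //		return err
         //	}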
   744  
   745  // movePMAsLocked moves all pmas in oldAR to newAR.
   746  //
   747  // Preconditions:
   748  //   - mm.activeMu must be locked for writing.
   749  //   - oldAR.Length() != 0.
   750  //   - oldAR.Length() <= newAR.Length().
   751  //   - !oldAR.Overlaps(newAR).
   752  //   - mm.pmas.IsEmptyRange(newAR).
   753  //   - oldAR and newAR must be page-aligned.
   754  func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) {
   755  	if checkInvariants {
   756  		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
   757  			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
   758  		}
   759  		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
   760  			panic(fmt.Sprintf("invalid newAR: %v", newAR))
   761  		}
   762  		if oldAR.Length() > newAR.Length() {
   763  			panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
   764  		}
   765  		if oldAR.Overlaps(newAR) {
   766  			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
   767  		}
   768  		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
   769  	}
   770  
   771  	type movedPMA struct {
   772  		oldAR hostarch.AddrRange
   773  		pma   pma
   774  	}
   775  	var movedPMAs []movedPMA
   776  	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
   777  	for pseg.Ok() && pseg.Start() < oldAR.End {
   778  		pseg = mm.pmas.Isolate(pseg, oldAR)
   779  		movedPMAs = append(movedPMAs, movedPMA{
   780  			oldAR: pseg.Range(),
   781  			pma:   pseg.Value(),
   782  		})
   783  		pseg = mm.pmas.Remove(pseg).NextSegment()
   784  		// No RSS change is needed since we're re-inserting the same pmas
   785  		// below.
   786  	}
   787  
   788  	off := newAR.Start - oldAR.Start
   789  	pgap := mm.pmas.FindGap(newAR.Start)
   790  	for i := range movedPMAs {
   791  		mpma := &movedPMAs[i]
   792  		pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
   793  		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
   794  	}
   795  
   796  	mm.unmapASLocked(oldAR)
   797  }
   798  
   799  // internalMappingsLocked returns cached internal mappings for addresses in ar.
   800  //
   801  // Preconditions:
   802  //   - mm.activeMu must be locked.
   803  //   - While mm.activeMu was locked, a call to
   804  //     existingPMAsLocked(needInternalMappings=true) succeeded for all
   805  //     addresses in ar.
   806  //   - ar.Length() != 0.
   807  //   - pseg.Range().Contains(ar.Start).
   808  func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq {
   809  	if checkInvariants {
   810  		if !ar.WellFormed() || ar.Length() == 0 {
   811  			panic(fmt.Sprintf("invalid ar: %v", ar))
   812  		}
   813  		if !pseg.Range().Contains(ar.Start) {
   814  			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
   815  		}
   816  	}
   817  
   818  	if ar.End <= pseg.End() {
   819  		// Since only one pma is involved, we can use pma.internalMappings
   820  		// directly, avoiding a slice allocation.
   821  		offset := uint64(ar.Start - pseg.Start())
   822  		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
   823  	}
   824  
   825  	var ims []safemem.Block
   826  	for {
   827  		pr := pseg.Range().Intersect(ar)
   828  		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
   829  			ims = append(ims, pims.Head())
   830  		}
   831  		if ar.End <= pseg.End() {
   832  			break
   833  		}
   834  		pseg = pseg.NextSegment()
   835  	}
   836  	return safemem.BlockSeqFromSlice(ims)
   837  }
   838  
   839  // vecInternalMappingsLocked returns cached internal mappings for addresses in
   840  // ars.
   841  //
   842  // Preconditions:
   843  //   - mm.activeMu must be locked.
   844  //   - While mm.activeMu was locked, a call to
   845  //     existingVecPMAsLocked(needInternalMappings=true) succeeded for all
   846  //     addresses in ars.
   847  func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq {
   848  	var ims []safemem.Block
   849  	for ; !ars.IsEmpty(); ars = ars.Tail() {
   850  		ar := ars.Head()
   851  		if ar.Length() == 0 {
   852  			continue
   853  		}
   854  		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
   855  			ims = append(ims, pims.Head())
   856  		}
   857  	}
   858  	return safemem.BlockSeqFromSlice(ims)
   859  }
   860  
   861  // addRSSLocked updates the current and maximum resident set size of a
   862  // MemoryManager to reflect the insertion of a pma at ar.
   863  //
   864  // Preconditions: mm.activeMu must be locked for writing.
   865  func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) {
   866  	mm.curRSS += uint64(ar.Length())
   867  	if mm.curRSS > mm.maxRSS {
   868  		mm.maxRSS = mm.curRSS
   869  	}
   870  }
   871  
   872  // removeRSSLocked updates the current resident set size of a MemoryManager to
   873  // reflect the removal of a pma at ar.
   874  //
   875  // Preconditions: mm.activeMu must be locked for writing.
   876  func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) {
   877  	mm.curRSS -= uint64(ar.Length())
   878  }
   879  
   880  // pmaSetFunctions implements segment.Functions for pmaSet.
   881  type pmaSetFunctions struct{}
   882  
   883  func (pmaSetFunctions) MinKey() hostarch.Addr {
   884  	return 0
   885  }
   886  
   887  func (pmaSetFunctions) MaxKey() hostarch.Addr {
   888  	return ^hostarch.Addr(0)
   889  }
   890  
   891  func (pmaSetFunctions) ClearValue(pma *pma) {
   892  	pma.file = nil
   893  	pma.internalMappings = safemem.BlockSeq{}
   894  }
   895  
   896  func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) {
   897  	if pma1.file != pma2.file ||
   898  		pma1.off+uint64(ar1.Length()) != pma2.off ||
   899  		pma1.translatePerms != pma2.translatePerms ||
   900  		pma1.effectivePerms != pma2.effectivePerms ||
   901  		pma1.maxPerms != pma2.maxPerms ||
   902  		pma1.needCOW != pma2.needCOW ||
   903  		pma1.private != pma2.private {
   904  		return pma{}, false
   905  	}
   906  
   907  	// Discard internal mappings instead of trying to merge them, since merging
   908  	// them requires an allocation and getting them again from the
   909  	// memmap.File might not.
   910  	pma1.internalMappings = safemem.BlockSeq{}
   911  	return pma1, true
   912  }
   913  
   914  func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) {
   915  	newlen1 := uint64(split - ar.Start)
   916  	p2 := p
   917  	p2.off += newlen1
   918  	if !p.internalMappings.IsEmpty() {
   919  		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
   920  		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
   921  	}
   922  	return p, p2
   923  }
   924  
   925  // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
   926  // so by scanning linearly backward from pgap.
   927  //
   928  // Preconditions:
   929  //   - mm.activeMu must be locked.
   930  //   - addr <= pgap.Start().
   931  func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator {
   932  	if checkInvariants {
   933  		if !pgap.Ok() {
   934  			panic("terminal pma iterator")
   935  		}
   936  		if addr > pgap.Start() {
   937  			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
   938  		}
   939  	}
   940  	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
   941  	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
   942  	// start of a range containing only a single PMA.
   943  	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
   944  		return pseg
   945  	}
   946  	return mm.pmas.UpperBoundSegment(addr)
   947  }
   948  
   949  // getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
   950  // non-empty.
   951  //
   952  // Preconditions: mm.activeMu must be locked for writing.
   953  func (pseg pmaIterator) getInternalMappingsLocked() error {
   954  	pma := pseg.ValuePtr()
   955  	if pma.internalMappings.IsEmpty() {
   956  		// This must use maxPerms (instead of perms) because some permission
   957  		// constraints are only visible to vmas; for example, mappings of
   958  		// read-only files have vma.maxPerms.Write unset, but this may not be
   959  		// visible to the memmap.Mappable.
   960  		perms := pma.maxPerms
   961  		// We will never execute application code through an internal mapping.
   962  		perms.Execute = false
   963  		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
   964  		if err != nil {
   965  			return err
   966  		}
   967  		pma.internalMappings = ims
   968  	}
   969  	return nil
   970  }
   971  
   972  func (pseg pmaIterator) fileRange() memmap.FileRange {
   973  	return pseg.fileRangeOf(pseg.Range())
   974  }
   975  
   976  // Preconditions:
   977  //   - pseg.Range().IsSupersetOf(ar).
    978  //   - ar.Length() != 0.
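         //
         // As a worked example with hypothetical values: if pseg.Range() is
         // [0x400000, 0x404000) and pma.off is 0x10000, then
         // pseg.fileRangeOf(hostarch.AddrRange{0x401000, 0x402000}) returns
         // memmap.FileRange{0x11000, 0x12000}.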
   979  func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange {
   980  	if checkInvariants {
   981  		if !pseg.Ok() {
   982  			panic("terminal pma iterator")
   983  		}
   984  		if !ar.WellFormed() || ar.Length() == 0 {
   985  			panic(fmt.Sprintf("invalid ar: %v", ar))
   986  		}
   987  		if !pseg.Range().IsSupersetOf(ar) {
   988  			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
   989  		}
   990  	}
   991  
   992  	pma := pseg.ValuePtr()
   993  	pstart := pseg.Start()
   994  	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
   995  }
   996  
   997  // joinAddrRanges returns the smallest hostarch.AddrRange that is a superset of
    998  // both ar1 and ar2. If either ar1 or ar2 has length 0, joinAddrRanges returns
   999  // the other range. If both ar1 and ar2 have length 0, joinAddrRanges returns
  1000  // an unspecified range with length 0.
  1001  func joinAddrRanges(ar1, ar2 hostarch.AddrRange) hostarch.AddrRange {
  1002  	if ar1.Length() == 0 {
  1003  		return ar2
  1004  	}
  1005  	if ar2.Length() == 0 {
  1006  		return ar1
  1007  	}
  1008  	ar := ar1
  1009  	if ar.Start > ar2.Start {
  1010  		ar.Start = ar2.Start
  1011  	}
  1012  	if ar.End < ar2.End {
  1013  		ar.End = ar2.End
  1014  	}
  1015  	if checkInvariants {
  1016  		if !ar.IsSupersetOf(ar1) || !ar.IsSupersetOf(ar2) {
  1017  			panic(fmt.Sprintf("%v is not a superset of both %v and %v", ar, ar1, ar2))
  1018  		}
  1019  	}
  1020  	return ar
  1021  }
  1022  
  1023  // pendingFileDecRefs accumulates released memmap.FileRange references so that
  1024  // calls to memmap.File.DecRef() can occur without holding locks.
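         //
         // The usage pattern, as in getPMAsInternalLocked above, is to accumulate
         // pending DecRefs while locks are held and defer Cleanup (via a closure, so
         // that pfdrs is not evaluated before it has been assigned):
         //
         //	var pfdrs *pendingFileDecRefs
         //	defer func() { pfdrs.Cleanup() }()
         //	...
         //	pfdrs = appendPendingFileDecRef(pfdrs, file, fr)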
  1025  type pendingFileDecRefs struct {
  1026  	slice []pendingFileDecRef
  1027  }
  1028  
  1029  type pendingFileDecRef struct {
  1030  	file memmap.File
  1031  	fr   memmap.FileRange
  1032  }
  1033  
  1034  var pendingFileDecRefsPool = sync.Pool{
  1035  	New: func() any {
  1036  		return &pendingFileDecRefs{}
  1037  	},
  1038  }
  1039  
  1040  func appendPendingFileDecRef(pfdrs *pendingFileDecRefs, file memmap.File, fr memmap.FileRange) *pendingFileDecRefs {
  1041  	if pfdrs == nil {
  1042  		pfdrs = pendingFileDecRefsPool.Get().(*pendingFileDecRefs)
  1043  	}
  1044  	pfdrs.slice = append(pfdrs.slice, pendingFileDecRef{file, fr})
  1045  	return pfdrs
  1046  }
  1047  
  1048  // Cleanup releases all references accumulated by pfdrs and releases ownership
  1049  // of pfdrs. pfdrs may be nil.
  1050  //
  1051  // Preconditions: No AddressSpace ranges may be awaiting unmapping (since such
   1052  // ranges may refer to memmap.File pages that will be dropped).
  1053  func (pfdrs *pendingFileDecRefs) Cleanup() {
  1054  	if pfdrs == nil {
  1055  		return
  1056  	}
  1057  	for i := range pfdrs.slice {
  1058  		pfdr := &pfdrs.slice[i]
  1059  		pfdr.file.DecRef(pfdr.fr)
  1060  		pfdr.file = nil // allow GC
  1061  	}
  1062  	pfdrs.slice = pfdrs.slice[:0]
  1063  	pendingFileDecRefsPool.Put(pfdrs)
  1064  }