github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/pgalloc/pgalloc.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package pgalloc contains the page allocator subsystem, which manages memory
    16  // that may be mapped into application address spaces.
    17  //
    18  // Lock order:
    19  //
    20  // pgalloc.MemoryFile.mu
    21  //   pgalloc.MemoryFile.mappingsMu
    22  package pgalloc
    23  
    24  import (
    25  	"fmt"
    26  	"math"
    27  	"os"
    28  	"sync/atomic"
    29  	"time"
    30  
    31  	"golang.org/x/sys/unix"
    32  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    33  	"github.com/SagerNet/gvisor/pkg/context"
    34  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    35  	"github.com/SagerNet/gvisor/pkg/hostarch"
    36  	"github.com/SagerNet/gvisor/pkg/log"
    37  	"github.com/SagerNet/gvisor/pkg/safemem"
    38  	"github.com/SagerNet/gvisor/pkg/sentry/hostmm"
    39  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    40  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    41  	"github.com/SagerNet/gvisor/pkg/sync"
    42  	"github.com/SagerNet/gvisor/pkg/syserror"
    43  )
    44  
    45  // MemoryFile is a memmap.File whose pages may be allocated to arbitrary
    46  // users.
    47  type MemoryFile struct {
    48  	// opts holds options passed to NewMemoryFile. opts is immutable.
    49  	opts MemoryFileOpts
    50  
    51  	// MemoryFile owns a single backing file, which is modeled as follows:
    52  	//
    53  	// Each page in the file can be committed or uncommitted. A page is
    54  	// committed if the host kernel is spending resources to store its contents
    55  	// and uncommitted otherwise. This definition includes pages that the host
    56  	// kernel has swapped; this is intentional, to ensure that accounting does
    57  	// not change even if host kernel swapping behavior changes, and that
    58  	// memory used by pseudo-swap mechanisms like zswap is still accounted.
    59  	//
    60  	// The initial contents of uncommitted pages are implicitly zero bytes. A
    61  	// read or write to the contents of an uncommitted page causes it to be
    62  	// committed. This is the only event that can cause a uncommitted page to
    63  	// committed. This is the only event that can cause an uncommitted page to
    64  	//
    65  	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
    66  	// pages to be uncommitted. This is the only event that can cause a
    67  	// committed page to be uncommitted.
    68  	//
    69  	// Memory accounting is based on identifying the set of committed pages.
    70  	// Since we do not have direct access to the MMU, tracking reads and writes
    71  	// to uncommitted pages to detect commitment would introduce additional
    72  	// page faults, which would be prohibitively expensive. Instead, we query
    73  	// the host kernel to determine which pages are committed.
    74  
    75  	// file is the backing file. The file pointer is immutable.
    76  	file *os.File
    77  
    78  	mu sync.Mutex
    79  
    80  	// usage maps each page in the file to metadata for that page. Pages for
    81  	// which no segment exists in usage are both unallocated (not in use) and
    82  	// uncommitted.
    83  	//
    84  	// Since usage stores usageInfo objects by value, clients should usually
    85  	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
    86  	// pointer to the usageInfo rather than a copy.
    87  	//
    88  	// usage must be kept maximally merged (that is, there should never be two
    89  	// adjacent segments with the same values). At least markReclaimed depends
    90  	// on this property.
    91  	//
    92  	// usage is protected by mu.
    93  	usage usageSet
    94  
    95  	// The UpdateUsage function scans all segments with knownCommitted set
    96  	// to false, sees which pages are committed and creates corresponding
    97  	// segments with knownCommitted set to true.
    98  	//
    99  	// In order to avoid unnecessary scans, usageExpected tracks the total
   100  	// file blocks expected. This is used to elide the scan when this
   101  	// matches the underlying file blocks.
   102  	//
   103  	// To track swapped pages, usageSwapped tracks the discrepancy between
   104  	// what is observed in core and what is reported by the file. When
   105  	// usageSwapped is non-zero, a sweep will be performed at least every
   106  	// second. The start of the last sweep is recorded in usageLast.
   107  	//
   108  	// All usage attributes are protected by mu.
   109  	usageExpected uint64
   110  	usageSwapped  uint64
   111  	usageLast     time.Time
   112  
   113  	// fileSize is the size of the backing memory file in bytes. fileSize is
   114  	// always a power-of-two multiple of chunkSize.
   115  	//
   116  	// fileSize is protected by mu.
   117  	fileSize int64
   118  
   119  	// Pages from the backing file are mapped into the local address space at
   120  	// the granularity of large pieces called chunks. mappings is a []uintptr
   121  	// that stores, for each chunk, the start address of a mapping of that
   122  	// chunk in the current process' address space, or 0 if no such mapping
   123  	// exists. Once a chunk is mapped, it is never remapped or unmapped until
   124  	// the MemoryFile is destroyed.
   125  	//
   126  	// Mutating the mappings slice or its contents requires both holding
   127  	// mappingsMu and using atomic memory operations. (The slice is mutated
   128  	// whenever the file is expanded. Per the above, the only permitted
   129  	// mutation of the slice's contents is the assignment of a mapping to a
   130  	// chunk that was previously unmapped.) Reading the slice or its contents
   131  	// only requires *either* holding mappingsMu or using atomic memory
   132  	// operations. This allows MemoryFile.MapInternal to avoid locking in the
   133  	// common case where chunk mappings already exist.
   134  	mappingsMu sync.Mutex
   135  	mappings   atomic.Value
   136  
   137  	// destroyed is set by Destroy to instruct the reclaimer goroutine to
   138  	// release resources and exit. destroyed is protected by mu.
   139  	destroyed bool
   140  
   141  	// reclaimable is true if usage may contain reclaimable pages. reclaimable
   142  	// is protected by mu.
   143  	reclaimable bool
   144  
   145  	// reclaim is the collection of regions for reclaim. reclaim is protected
   146  	// by mu.
   147  	reclaim reclaimSet
   148  
   149  	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
   150  	// transitions from false to true.
   151  	reclaimCond sync.Cond
   152  
   153  	// evictable maps EvictableMemoryUsers to eviction state.
   154  	//
   155  	// evictable is protected by mu.
   156  	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo
   157  
   158  	// evictionWG counts the number of goroutines currently performing evictions.
   159  	evictionWG sync.WaitGroup
   160  
   161  	// stopNotifyPressure stops memory cgroup pressure level
   162  	// notifications used to drive eviction. stopNotifyPressure is
   163  	// immutable.
   164  	stopNotifyPressure func()
   165  }
   166  
   167  // MemoryFileOpts provides options to NewMemoryFile.
   168  type MemoryFileOpts struct {
   169  	// DelayedEviction controls the extent to which the MemoryFile may delay
   170  	// eviction of evictable allocations.
   171  	DelayedEviction DelayedEvictionType
   172  
   173  	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
   174  	// notifications to determine when eviction is necessary. This option has
   175  	// no effect unless DelayedEviction is DelayedEvictionEnabled.
   176  	UseHostMemcgPressure bool
   177  
   178  	// If ManualZeroing is true, the MemoryFile must not assume that new pages
   179  	// obtained from the host are zero-filled and must instead manually zero
   180  	// newly-allocated pages.
   181  	ManualZeroing bool
   182  }
   183  
   184  // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
   185  type DelayedEvictionType int
   186  
   187  const (
   188  	// DelayedEvictionDefault has unspecified behavior.
   189  	DelayedEvictionDefault DelayedEvictionType = iota
   190  
   191  	// DelayedEvictionDisabled requires that evictable allocations are evicted
   192  	// as soon as possible.
   193  	DelayedEvictionDisabled
   194  
   195  	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
   196  	// evictable allocations until doing so is considered necessary to avoid
   197  	// performance degradation due to host memory pressure, or OOM kills.
   198  	//
   199  	// As of this writing, the behavior of DelayedEvictionEnabled depends on
   200  	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
   201  	//
   202  	// - If UseHostMemcgPressure is true, evictions are delayed until memory
   203  	// pressure is indicated.
   204  	//
   205  	// - Otherwise, evictions are only delayed until the reclaimer goroutine
   206  	// is out of work (pages to reclaim).
   207  	DelayedEvictionEnabled
   208  
   209  	// DelayedEvictionManual requires that evictable allocations are only
   210  	// evicted when MemoryFile.StartEvictions() is called. This is extremely
   211  	// dangerous outside of tests.
   212  	DelayedEvictionManual
   213  )
   214  
   215  // usageInfo tracks usage information.
   216  //
   217  // +stateify savable
   218  type usageInfo struct {
   219  	// kind is the usage kind.
   220  	kind usage.MemoryKind
   221  
   222  	// knownCommitted is true if the tracked region is definitely committed.
   223  	// (If it is false, the tracked region may or may not be committed.)
   224  	knownCommitted bool
   225  
   226  	refs uint64
   227  }
   228  
   229  // canCommit returns true if the tracked region can be committed.
   230  func (u *usageInfo) canCommit() bool {
   231  	// refs must be greater than 0 because we assume that reclaimable pages
   232  	// (that aren't already known to be committed) are not committed. This
   233  	// isn't necessarily true, even after the reclaimer does Decommit(),
   234  	// because the kernel may subsequently back the hugepage-sized region
   235  	// containing the decommitted page with a hugepage. However, it's
   236  	// consistent with our treatment of unallocated pages, which have the same
   237  	// property.
   238  	return !u.knownCommitted && u.refs != 0
   239  }
   240  
   241  // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
   242  // may be asked to deallocate that memory in the presence of memory pressure.
   243  type EvictableMemoryUser interface {
   244  	// Evict requests that the EvictableMemoryUser deallocate memory used by
   245  	// er, which was registered as evictable by a previous call to
   246  	// MemoryFile.MarkEvictable.
   247  	//
   248  	// Evict is not required to deallocate memory. In particular, since pgalloc
   249  	// must call Evict without holding locks to avoid circular lock ordering,
   250  	// it is possible that the passed range has already been marked as
   251  	// unevictable by a racing call to MemoryFile.MarkUnevictable.
   252  	// Implementations of EvictableMemoryUser must detect such races and handle
   253  	// them by making Evict have no effect on unevictable ranges.
   254  	//
   255  	// After a call to Evict, the MemoryFile will consider the evicted range
   256  	// unevictable (i.e. it will not call Evict on the same range again) until
   257  	// informed otherwise by a subsequent call to MarkEvictable.
   258  	Evict(ctx context.Context, er EvictableRange)
   259  }
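
        // A minimal sketch of how a caller might satisfy this contract. It is not
        // part of pgalloc: the type and field names below are illustrative only.
        // The important detail is that Evict tolerates ranges that a racing
        // MarkUnevictable (or the user's own release path) has already dropped from
        // the user's bookkeeping.
        type exampleEvictableUser struct {
        	mu sync.Mutex
        	f  *MemoryFile

        	// frames maps per-user offsets to allocated file ranges.
        	frames map[uint64]memmap.FileRange
        }

        // Evict implements EvictableMemoryUser.Evict.
        func (u *exampleEvictableUser) Evict(ctx context.Context, er EvictableRange) {
        	u.mu.Lock()
        	defer u.mu.Unlock()
        	for off := er.Start; off < er.End; off += hostarch.PageSize {
        		fr, ok := u.frames[off]
        		if !ok {
        			// The offset may have been marked unevictable (or released) by a
        			// racing caller; per the contract above, just skip it.
        			continue
        		}
        		delete(u.frames, off)
        		u.f.DecRef(fr)
        	}
        }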
   260  
   261  // An EvictableRange represents a range of uint64 offsets in an
   262  // EvictableMemoryUser.
   263  //
   264  // In practice, most EvictableMemoryUsers will probably be implementations of
   265  // memmap.Mappable, and EvictableRange therefore corresponds to
   266  // memmap.MappableRange. However, this package cannot depend on the memmap
   267  // package, since doing so would create a circular dependency.
   268  //
   269  // type EvictableRange <generated using go_generics>
   270  
   271  // evictableMemoryUserInfo is the value type of MemoryFile.evictable.
   272  type evictableMemoryUserInfo struct {
   273  	// ranges tracks all evictable ranges for the given user.
   274  	ranges evictableRangeSet
   275  
   276  	// If evicting is true, there is a goroutine currently evicting all
   277  	// evictable ranges for this user.
   278  	evicting bool
   279  }
   280  
   281  const (
   282  	chunkShift = 30
   283  	chunkSize  = 1 << chunkShift // 1 GB
   284  	chunkMask  = chunkSize - 1
   285  
   286  	// maxPage is the highest 64-bit page.
   287  	maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
   288  )
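
        // An illustrative helper (not used elsewhere in the package) showing how a
        // file offset decomposes into a chunk index and an offset within that chunk;
        // this mirrors the arithmetic in forEachMappingSlice and getChunkMapping.
        func chunkIndexAndOffset(off uint64) (chunk int, chunkOff uint64) {
        	return int(off >> chunkShift), off & chunkMask
        }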
   289  
   290  // NewMemoryFile creates a MemoryFile backed by the given file. If
   291  // NewMemoryFile succeeds, ownership of file is transferred to the returned
   292  // MemoryFile.
   293  func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
   294  	switch opts.DelayedEviction {
   295  	case DelayedEvictionDefault:
   296  		opts.DelayedEviction = DelayedEvictionEnabled
   297  	case DelayedEvictionDisabled, DelayedEvictionManual:
   298  		opts.UseHostMemcgPressure = false
   299  	case DelayedEvictionEnabled:
   300  		// ok
   301  	default:
   302  		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
   303  	}
   304  
   305  	// Truncate the file to 0 bytes first to ensure that it's empty.
   306  	if err := file.Truncate(0); err != nil {
   307  		return nil, err
   308  	}
   309  	f := &MemoryFile{
   310  		opts:      opts,
   311  		file:      file,
   312  		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
   313  	}
   314  	f.mappings.Store(make([]uintptr, 0))
   315  	f.reclaimCond.L = &f.mu
   316  
   317  	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
   318  		stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
   319  			f.mu.Lock()
   320  			startedAny := f.startEvictionsLocked()
   321  			f.mu.Unlock()
   322  			if startedAny {
   323  				log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
   324  			}
   325  		}, "low")
   326  		if err != nil {
   327  			return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
   328  		}
   329  		f.stopNotifyPressure = stop
   330  	}
   331  
   332  	go f.runReclaim() // S/R-SAFE: f.mu
   333  
   334  	// The Linux kernel contains an optional feature called "Integrity
   335  	// Measurement Architecture" (IMA). If IMA is enabled, it will checksum
   336  	// binaries the first time they are mapped PROT_EXEC. This is bad news for
   337  	// executable pages mapped from our backing file, which can grow to
   338  	// terabytes in (sparse) size. If IMA attempts to checksum a file that
   339  	// large, it will allocate all of the sparse pages and quickly exhaust all
   340  	// memory.
   341  	//
   342  	// Work around IMA by immediately creating a temporary PROT_EXEC mapping,
   343  	// while the backing file is still small. IMA will ignore any future
   344  	// mappings.
   345  	m, _, errno := unix.Syscall6(
   346  		unix.SYS_MMAP,
   347  		0,
   348  		hostarch.PageSize,
   349  		unix.PROT_EXEC,
   350  		unix.MAP_SHARED,
   351  		file.Fd(),
   352  		0)
   353  	if errno != 0 {
   354  		// This isn't fatal (IMA may not even be in use). Log the error, but
   355  		// don't return it.
   356  		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
   357  	} else {
   358  		if _, _, errno := unix.Syscall(
   359  			unix.SYS_MUNMAP,
   360  			m,
   361  			hostarch.PageSize,
   362  			0); errno != 0 {
   363  			panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
   364  		}
   365  	}
   366  
   367  	return f, nil
   368  }
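
        // A construction sketch (illustrative only; the function and file names are
        // hypothetical): back a MemoryFile with an anonymous memfd. Callers may of
        // course pass any other host file with the required semantics.
        func newMemoryFileFromMemfd(opts MemoryFileOpts) (*MemoryFile, error) {
        	fd, err := unix.MemfdCreate("pgalloc-example", unix.MFD_CLOEXEC)
        	if err != nil {
        		return nil, fmt.Errorf("memfd_create failed: %v", err)
        	}
        	file := os.NewFile(uintptr(fd), "pgalloc-example")
        	mf, err := NewMemoryFile(file, opts)
        	if err != nil {
        		file.Close()
        		return nil, err
        	}
        	return mf, nil
        }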
   369  
   370  // Destroy releases all resources used by f.
   371  //
   372  // Preconditions: All pages allocated by f have been freed.
   373  //
   374  // Postconditions: None of f's methods may be called after Destroy.
   375  func (f *MemoryFile) Destroy() {
   376  	f.mu.Lock()
   377  	defer f.mu.Unlock()
   378  	f.destroyed = true
   379  	f.reclaimCond.Signal()
   380  }
   381  
   382  // Allocate returns a range of initially-zeroed pages of the given length with
   383  // the given accounting kind and a single reference held by the caller. When
   384  // the last reference on an allocated page is released, ownership of the page
   385  // is returned to the MemoryFile, allowing it to be returned by a future call
   386  // to Allocate.
   387  //
   388  // Preconditions: length must be page-aligned and non-zero.
   389  func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) {
   390  	if length == 0 || length%hostarch.PageSize != 0 {
   391  		panic(fmt.Sprintf("invalid allocation length: %#x", length))
   392  	}
   393  
   394  	f.mu.Lock()
   395  	defer f.mu.Unlock()
   396  
   397  	// Align hugepage-and-larger allocations on hugepage boundaries to try
   398  	// to take advantage of hugetmpfs.
   399  	alignment := uint64(hostarch.PageSize)
   400  	if length >= hostarch.HugePageSize {
   401  		alignment = hostarch.HugePageSize
   402  	}
   403  
   404  	// Find a range in the underlying file.
   405  	fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
   406  	if !ok {
   407  		return memmap.FileRange{}, syserror.ENOMEM
   408  	}
   409  
   410  	// Expand the file if needed.
   411  	if int64(fr.End) > f.fileSize {
   412  		// Round the new file size up to be chunk-aligned.
   413  		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
   414  		if err := f.file.Truncate(newFileSize); err != nil {
   415  			return memmap.FileRange{}, err
   416  		}
   417  		f.fileSize = newFileSize
   418  		f.mappingsMu.Lock()
   419  		oldMappings := f.mappings.Load().([]uintptr)
   420  		newMappings := make([]uintptr, newFileSize>>chunkShift)
   421  		copy(newMappings, oldMappings)
   422  		f.mappings.Store(newMappings)
   423  		f.mappingsMu.Unlock()
   424  	}
   425  
   426  	if f.opts.ManualZeroing {
   427  		if err := f.manuallyZero(fr); err != nil {
   428  			return memmap.FileRange{}, err
   429  		}
   430  	}
   431  	// Mark selected pages as in use.
   432  	if !f.usage.Add(fr, usageInfo{
   433  		kind: kind,
   434  		refs: 1,
   435  	}) {
   436  		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
   437  	}
   438  
   439  	return fr, nil
   440  }
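
        // A lifecycle sketch (illustrative only; allocateScratchPage is hypothetical):
        // allocate a single anonymous page and hand the caller's reference back to
        // the MemoryFile when it is no longer needed. usage.Anonymous is an existing
        // accounting kind.
        func allocateScratchPage(f *MemoryFile) error {
        	fr, err := f.Allocate(hostarch.PageSize, usage.Anonymous)
        	if err != nil {
        		return err
        	}
        	// ... use fr, e.g. through f.MapInternal(fr, hostarch.ReadWrite) ...
        	//
        	// Dropping the last reference makes the pages reclaimable and returns
        	// ownership to the MemoryFile.
        	f.DecRef(fr)
        	return nil
        }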
   441  
   442  // findAvailableRange returns an available range in the usageSet.
   443  //
   444  // Note that scanning for available slots takes place from the end of the file
   445  // backwards first, then forwards. This heuristic has important consequences for
   446  // how sequential mappings can be merged in the host VMAs, given that addresses
   447  // for both application and sentry mappings are allocated top-down (from higher
   448  // to lower addresses). The file is also grown exponentially in order to create
   449  // space for mappings to be allocated downwards.
   450  //
   451  // Precondition: alignment must be a power of 2.
   452  func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
   453  	alignmentMask := alignment - 1
   454  
   455  	// Search for space in existing gaps, starting at the current end of the
   456  	// file and working backward.
   457  	lastGap := usage.LastGap()
   458  	gap := lastGap
   459  	for {
   460  		end := gap.End()
   461  		if end > uint64(fileSize) {
   462  			end = uint64(fileSize)
   463  		}
   464  
   465  		// Try to allocate from the end of this gap, with the start of the
   466  		// allocated range aligned down to alignment.
   467  		unalignedStart := end - length
   468  		if unalignedStart > end {
   469  			// Negative overflow: this and all preceding gaps are too small to
   470  			// accommodate length.
   471  			break
   472  		}
   473  		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
   474  			return memmap.FileRange{start, start + length}, true
   475  		}
   476  
   477  		gap = gap.PrevLargeEnoughGap(length)
   478  		if !gap.Ok() {
   479  			break
   480  		}
   481  	}
   482  
   483  	// Check that it's possible to fit this allocation at the end of a file of any size.
   484  	min := lastGap.Start()
   485  	min = (min + alignmentMask) &^ alignmentMask
   486  	if min+length < min {
   487  		// Overflow: allocation would exceed the range of uint64.
   488  		return memmap.FileRange{}, false
   489  	}
   490  
   491  	// Determine the minimum file size required to fit this allocation at its end.
   492  	for {
   493  		newFileSize := 2 * fileSize
   494  		if newFileSize <= fileSize {
   495  			if fileSize != 0 {
   496  				// Overflow: allocation would exceed the range of int64.
   497  				return memmap.FileRange{}, false
   498  			}
   499  			newFileSize = chunkSize
   500  		}
   501  		fileSize = newFileSize
   502  
   503  		unalignedStart := uint64(fileSize) - length
   504  		if unalignedStart > uint64(fileSize) {
   505  			// Negative overflow: fileSize is still inadequate.
   506  			continue
   507  		}
   508  		if start := unalignedStart &^ alignmentMask; start >= min {
   509  			return memmap.FileRange{start, start + length}, true
   510  		}
   511  	}
   512  }
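
        // A concrete illustration of the search above (not used by the package): with
        // an empty usage set and a chunkSize-sized file, a single page lands at the
        // very top of the file, reflecting the top-down placement described above.
        func exampleTopDownPlacement() memmap.FileRange {
        	var set usageSet // the zero value is an empty set
        	fr, ok := findAvailableRange(&set, chunkSize, hostarch.PageSize, hostarch.PageSize)
        	if !ok {
        		panic("allocation unexpectedly failed")
        	}
        	// fr == memmap.FileRange{chunkSize - hostarch.PageSize, chunkSize}
        	return fr
        }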
   513  
   514  // AllocateAndFill allocates memory of the given kind and fills it by calling
   515  // r.ReadToBlocks() repeatedly until either length bytes are read or a non-nil
   516  // error is returned. It returns the memory filled by r, truncated down to the
   517  // nearest page. If this is shorter than length bytes due to an error returned
   518  // by r.ReadToBlocks(), it returns that error.
   519  //
   520  // Preconditions:
   521  // * length > 0.
   522  // * length must be page-aligned.
   523  func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
   524  	fr, err := f.Allocate(length, kind)
   525  	if err != nil {
   526  		return memmap.FileRange{}, err
   527  	}
   528  	dsts, err := f.MapInternal(fr, hostarch.Write)
   529  	if err != nil {
   530  		f.DecRef(fr)
   531  		return memmap.FileRange{}, err
   532  	}
   533  	n, err := safemem.ReadFullToBlocks(r, dsts)
   534  	un := uint64(hostarch.Addr(n).RoundDown())
   535  	if un < length {
   536  		// Free unused memory and update fr to contain only the memory that is
   537  		// still allocated.
   538  		f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
   539  		fr.End = fr.Start + un
   540  	}
   541  	return fr, err
   542  }
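
        // A fill sketch (illustrative only; allocateFromBytes is hypothetical): copy a
        // page-aligned byte slice into freshly allocated pages, using
        // safemem.BlockSeqReader as the safemem.Reader. len(data) must satisfy
        // AllocateAndFill's preconditions on length.
        func allocateFromBytes(f *MemoryFile, kind usage.MemoryKind, data []byte) (memmap.FileRange, error) {
        	r := safemem.BlockSeqReader{Blocks: safemem.BlockSeqOf(safemem.BlockFromSafeSlice(data))}
        	return f.AllocateAndFill(uint64(len(data)), kind, r)
        }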
   543  
   544  // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
   545  const (
   546  	_FALLOC_FL_KEEP_SIZE  = 1
   547  	_FALLOC_FL_PUNCH_HOLE = 2
   548  )
   549  
   550  // Decommit releases resources associated with maintaining the contents of the
   551  // given pages. If Decommit succeeds, future accesses of the decommitted pages
   552  // will read zeroes.
   553  //
   554  // Preconditions: fr.Length() > 0.
   555  func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
   556  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   557  		panic(fmt.Sprintf("invalid range: %v", fr))
   558  	}
   559  
   560  	if f.opts.ManualZeroing {
   561  		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
   562  		// effect.
   563  		if err := f.manuallyZero(fr); err != nil {
   564  			return err
   565  		}
   566  	} else {
   567  		if err := f.decommitFile(fr); err != nil {
   568  			return err
   569  		}
   570  	}
   571  
   572  	f.markDecommitted(fr)
   573  	return nil
   574  }
   575  
   576  func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
   577  	return f.forEachMappingSlice(fr, func(bs []byte) {
   578  		for i := range bs {
   579  			bs[i] = 0
   580  		}
   581  	})
   582  }
   583  
   584  func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
   585  	// "After a successful call, subsequent reads from this range will
   586  	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
   587  	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
   588  	return unix.Fallocate(
   589  		int(f.file.Fd()),
   590  		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
   591  		int64(fr.Start),
   592  		int64(fr.Length()))
   593  }
   594  
   595  func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
   596  	f.mu.Lock()
   597  	defer f.mu.Unlock()
   598  	// Since we're changing the knownCommitted attribute, we need to merge
   599  	// across the entire range to ensure that the usage tree is minimal.
   600  	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
   601  		val := seg.ValuePtr()
   602  		if val.knownCommitted {
   603  			// Drop the usageExpected appropriately.
   604  			amount := seg.Range().Length()
   605  			usage.MemoryAccounting.Dec(amount, val.kind)
   606  			f.usageExpected -= amount
   607  			val.knownCommitted = false
   608  		}
   609  	})
   610  	if gap.Ok() {
   611  		panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
   612  	}
   613  	f.usage.MergeRange(fr)
   614  }
   615  
   616  // IncRef implements memmap.File.IncRef.
   617  func (f *MemoryFile) IncRef(fr memmap.FileRange) {
   618  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   619  		panic(fmt.Sprintf("invalid range: %v", fr))
   620  	}
   621  
   622  	f.mu.Lock()
   623  	defer f.mu.Unlock()
   624  
   625  	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
   626  		seg.ValuePtr().refs++
   627  	})
   628  	if gap.Ok() {
   629  		panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
   630  	}
   631  
   632  	f.usage.MergeAdjacent(fr)
   633  }
   634  
   635  // DecRef implements memmap.File.DecRef.
   636  func (f *MemoryFile) DecRef(fr memmap.FileRange) {
   637  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   638  		panic(fmt.Sprintf("invalid range: %v", fr))
   639  	}
   640  
   641  	var freed bool
   642  
   643  	f.mu.Lock()
   644  	defer f.mu.Unlock()
   645  
   646  	for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() {
   647  		seg = f.usage.Isolate(seg, fr)
   648  		val := seg.ValuePtr()
   649  		if val.refs == 0 {
   650  			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
   651  		}
   652  		val.refs--
   653  		if val.refs == 0 {
   654  			f.reclaim.Add(seg.Range(), reclaimSetValue{})
   655  			freed = true
   656  			// Reclassify memory as System, until it's freed by the reclaim
   657  			// goroutine.
   658  			if val.knownCommitted {
   659  				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind)
   660  			}
   661  			val.kind = usage.System
   662  		}
   663  	}
   664  	f.usage.MergeAdjacent(fr)
   665  
   666  	if freed {
   667  		f.reclaimable = true
   668  		f.reclaimCond.Signal()
   669  	}
   670  }
   671  
   672  // MapInternal implements memmap.File.MapInternal.
   673  func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
   674  	if !fr.WellFormed() || fr.Length() == 0 {
   675  		panic(fmt.Sprintf("invalid range: %v", fr))
   676  	}
   677  	if at.Execute {
   678  		return safemem.BlockSeq{}, linuxerr.EACCES
   679  	}
   680  
   681  	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
   682  	if chunks == 1 {
   683  		// Avoid an unnecessary slice allocation.
   684  		var seq safemem.BlockSeq
   685  		err := f.forEachMappingSlice(fr, func(bs []byte) {
   686  			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
   687  		})
   688  		return seq, err
   689  	}
   690  	blocks := make([]safemem.Block, 0, chunks)
   691  	err := f.forEachMappingSlice(fr, func(bs []byte) {
   692  		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
   693  	})
   694  	return safemem.BlockSeqFromSlice(blocks), err
   695  }
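
        // A read-back sketch (illustrative only; readRange is hypothetical): copy the
        // contents of an allocated range into an ordinary byte slice through an
        // internal mapping, using safemem.CopySeq.
        func readRange(f *MemoryFile, fr memmap.FileRange) ([]byte, error) {
        	srcs, err := f.MapInternal(fr, hostarch.Read)
        	if err != nil {
        		return nil, err
        	}
        	buf := make([]byte, fr.Length())
        	dsts := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
        	if _, err := safemem.CopySeq(dsts, srcs); err != nil {
        		return nil, err
        	}
        	return buf, nil
        }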
   696  
   697  // forEachMappingSlice invokes fn on a sequence of byte slices that
   698  // collectively map all bytes in fr.
   699  func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
   700  	mappings := f.mappings.Load().([]uintptr)
   701  	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
   702  		chunk := int(chunkStart >> chunkShift)
   703  		m := atomic.LoadUintptr(&mappings[chunk])
   704  		if m == 0 {
   705  			var err error
   706  			mappings, m, err = f.getChunkMapping(chunk)
   707  			if err != nil {
   708  				return err
   709  			}
   710  		}
   711  		startOff := uint64(0)
   712  		if chunkStart < fr.Start {
   713  			startOff = fr.Start - chunkStart
   714  		}
   715  		endOff := uint64(chunkSize)
   716  		if chunkStart+chunkSize > fr.End {
   717  			endOff = fr.End - chunkStart
   718  		}
   719  		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
   720  	}
   721  	return nil
   722  }
   723  
   724  func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
   725  	f.mappingsMu.Lock()
   726  	defer f.mappingsMu.Unlock()
   727  	// Another thread may have replaced f.mappings altogether due to file
   728  	// expansion.
   729  	mappings := f.mappings.Load().([]uintptr)
   730  	// Another thread may have already mapped the chunk.
   731  	if m := mappings[chunk]; m != 0 {
   732  		return mappings, m, nil
   733  	}
   734  	m, _, errno := unix.Syscall6(
   735  		unix.SYS_MMAP,
   736  		0,
   737  		chunkSize,
   738  		unix.PROT_READ|unix.PROT_WRITE,
   739  		unix.MAP_SHARED,
   740  		f.file.Fd(),
   741  		uintptr(chunk<<chunkShift))
   742  	if errno != 0 {
   743  		return nil, 0, errno
   744  	}
   745  	atomic.StoreUintptr(&mappings[chunk], m)
   746  	return mappings, m, nil
   747  }
   748  
   749  // MarkEvictable allows f to request memory deallocation by calling
   750  // user.Evict(er) in the future.
   751  //
   752  // Redundantly marking an already-evictable range as evictable has no effect.
   753  func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
   754  	f.mu.Lock()
   755  	defer f.mu.Unlock()
   756  	info, ok := f.evictable[user]
   757  	if !ok {
   758  		info = &evictableMemoryUserInfo{}
   759  		f.evictable[user] = info
   760  	}
   761  	gap := info.ranges.LowerBoundGap(er.Start)
   762  	for gap.Ok() && gap.Start() < er.End {
   763  		gapER := gap.Range().Intersect(er)
   764  		if gapER.Length() == 0 {
   765  			gap = gap.NextGap()
   766  			continue
   767  		}
   768  		gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
   769  	}
   770  	if !info.evicting {
   771  		switch f.opts.DelayedEviction {
   772  		case DelayedEvictionDisabled:
   773  			// Kick off eviction immediately.
   774  			f.startEvictionGoroutineLocked(user, info)
   775  		case DelayedEvictionEnabled:
   776  			if !f.opts.UseHostMemcgPressure {
   777  				// Ensure that the reclaimer goroutine is running, so that it
   778  				// can start eviction when necessary.
   779  				f.reclaimCond.Signal()
   780  			}
   781  		}
   782  	}
   783  }
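
        // A registration sketch (illustrative only), tying this to the
        // exampleEvictableUser type sketched alongside the EvictableMemoryUser
        // interface above: mark a populated range evictable, and unmark it before the
        // owner touches it again.
        func registerForEviction(f *MemoryFile, u *exampleEvictableUser, er EvictableRange) {
        	f.MarkEvictable(u, er)
        	// ... later, before reusing the range:
        	// f.MarkUnevictable(u, er)
        }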
   784  
   785  // MarkUnevictable informs f that user no longer considers er to be evictable,
   786  // so the MemoryFile should no longer call user.Evict(er). Note that, per
   787  // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
   788  // called even after MarkUnevictable returns due to race conditions, and
   789  // implementations of EvictableMemoryUser must handle this possibility.
   790  //
   791  // Redundantly marking an already-unevictable range as unevictable has no
   792  // effect.
   793  func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
   794  	f.mu.Lock()
   795  	defer f.mu.Unlock()
   796  	info, ok := f.evictable[user]
   797  	if !ok {
   798  		return
   799  	}
   800  	seg := info.ranges.LowerBoundSegment(er.Start)
   801  	for seg.Ok() && seg.Start() < er.End {
   802  		seg = info.ranges.Isolate(seg, er)
   803  		seg = info.ranges.Remove(seg).NextSegment()
   804  	}
   805  	// We can only remove info if there's no eviction goroutine running on its
   806  	// behalf.
   807  	if !info.evicting && info.ranges.IsEmpty() {
   808  		delete(f.evictable, user)
   809  	}
   810  }
   811  
   812  // MarkAllUnevictable informs f that user no longer considers any offsets to be
   813  // evictable. It otherwise has the same semantics as MarkUnevictable.
   814  func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
   815  	f.mu.Lock()
   816  	defer f.mu.Unlock()
   817  	info, ok := f.evictable[user]
   818  	if !ok {
   819  		return
   820  	}
   821  	info.ranges.RemoveAll()
   822  	// We can only remove info if there's no eviction goroutine running on its
   823  	// behalf.
   824  	if !info.evicting {
   825  		delete(f.evictable, user)
   826  	}
   827  }
   828  
   829  // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
   830  // evictable memory, such that it may be advantageous to cache data in
   831  // evictable memory. The value returned by ShouldCacheEvictable may change
   832  // between calls.
   833  func (f *MemoryFile) ShouldCacheEvictable() bool {
   834  	return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
   835  }
   836  
   837  // UpdateUsage ensures that the memory usage statistics in
   838  // usage.MemoryAccounting are up to date.
   839  func (f *MemoryFile) UpdateUsage() error {
   840  	f.mu.Lock()
   841  	defer f.mu.Unlock()
   842  
   843  	// If the underlying usage matches what the usage tree already
   844  	// represents, then we can just avoid the entire scan (we know it's
   845  	// accurate).
   846  	currentUsage, err := f.TotalUsage()
   847  	if err != nil {
   848  		return err
   849  	}
   850  	if currentUsage == f.usageExpected && f.usageSwapped == 0 {
   851  		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
   852  		return nil
   853  	}
   854  	// If the current usage matches the expected usage plus the swapped
   855  	// amount, then ensure a scan takes place at least every second
   856  	// (when requested).
   857  	if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
   858  		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
   859  		return nil
   860  	}
   861  	// Linux updates usage values at CONFIG_HZ.
   862  	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
   863  		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
   864  		return nil
   865  	}
   866  
   867  	f.usageLast = time.Now()
   868  	err = f.updateUsageLocked(currentUsage, mincore)
   869  	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
   870  		currentUsage, f.usageExpected, f.usageSwapped)
   871  	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
   872  	return err
   873  }
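
        // A polling sketch (illustrative only; pollUsage and its stop channel are
        // hypothetical): callers that want usage.MemoryAccounting to stay reasonably
        // current can invoke UpdateUsage periodically; the interval here is arbitrary,
        // and UpdateUsage itself rate-limits scans as described above.
        func pollUsage(f *MemoryFile, stop <-chan struct{}) {
        	ticker := time.NewTicker(time.Second)
        	defer ticker.Stop()
        	for {
        		select {
        		case <-stop:
        			return
        		case <-ticker.C:
        			if err := f.UpdateUsage(); err != nil {
        				log.Warningf("UpdateUsage failed: %v", err)
        			}
        		}
        	}
        }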
   874  
   875  // updateUsageLocked attempts to detect commitment of previously-uncommitted
   876  // pages by invoking checkCommitted, which is a function that, for each page i
   877  // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
   878  //
   879  // Precondition: f.mu must be held; it may be unlocked and reacquired.
   880  // +checklocks:f.mu
   881  func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
   882  	// Track if anything changed to elide the merge. In the common case, we
   883  	// expect all segments to be committed and no merge to occur.
   884  	changedAny := false
   885  	defer func() {
   886  		if changedAny {
   887  			f.usage.MergeAll()
   888  		}
   889  
   890  		// Adjust the swap usage to reflect reality.
   891  		if f.usageExpected < currentUsage {
   892  			// Since no pages may be marked decommitted while we hold mu, we
   893  			// know that usage may have only increased since we got the last
   894  			// current usage. Therefore, if usageExpected is still short of
   895  			// currentUsage, we must assume that the difference is in pages
   896  			// that have been swapped.
   897  			newUsageSwapped := currentUsage - f.usageExpected
   898  			if f.usageSwapped < newUsageSwapped {
   899  				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System)
   900  			} else {
   901  				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System)
   902  			}
   903  			f.usageSwapped = newUsageSwapped
   904  		} else if f.usageSwapped != 0 {
   905  			// We have more usage accounted for than the file itself.
   906  			// That's fine, we probably caught a race where pages were
   907  			// being committed while the below loop was running. Just
   908  			// report the higher number that we found and ignore swap.
   909  			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System)
   910  			f.usageSwapped = 0
   911  		}
   912  	}()
   913  
   914  	// Reused mincore buffer, will generally be <= 4096 bytes.
   915  	var buf []byte
   916  
   917  	// Iterate over all usage data. There will only be usage segments
   918  	// present when there is an associated reference.
   919  	for seg := f.usage.FirstSegment(); seg.Ok(); {
   920  		if !seg.ValuePtr().canCommit() {
   921  			seg = seg.NextSegment()
   922  			continue
   923  		}
   924  
   925  		// Get the range for this segment. As we visit each mapping
   926  		// slice below, r.Start is advanced past it.
   927  		r := seg.Range()
   928  
   929  		var checkErr error
   930  		err := f.forEachMappingSlice(r,
   931  			func(s []byte) {
   932  				if checkErr != nil {
   933  					return
   934  				}
   935  
   936  				// Ensure that we have sufficient buffer for the call
   937  				// (one byte per page). The length of each slice must
   938  				// be page-aligned.
   939  				bufLen := len(s) / hostarch.PageSize
   940  				if len(buf) < bufLen {
   941  					buf = make([]byte, bufLen)
   942  				}
   943  
   944  				// Query for new pages in core.
   945  				// NOTE(b/165896008): mincore (which is passed as checkCommitted)
   946  				// by f.UpdateUsage() might take a really long time. So unlock f.mu
   947  				// while checkCommitted runs.
   948  				f.mu.Unlock() // +checklocksforce
   949  				err := checkCommitted(s, buf)
   950  				f.mu.Lock()
   951  				if err != nil {
   952  					checkErr = err
   953  					return
   954  				}
   955  
   956  				// Scan each page and switch out segments.
   957  				seg := f.usage.LowerBoundSegment(r.Start)
   958  				for i := 0; i < bufLen; {
   959  					if buf[i]&0x1 == 0 {
   960  						i++
   961  						continue
   962  					}
   963  					// Scan to the end of this committed range.
   964  					j := i + 1
   965  					for ; j < bufLen; j++ {
   966  						if buf[j]&0x1 == 0 {
   967  							break
   968  						}
   969  					}
   970  					committedFR := memmap.FileRange{
   971  						Start: r.Start + uint64(i*hostarch.PageSize),
   972  						End:   r.Start + uint64(j*hostarch.PageSize),
   973  					}
   974  					// Advance seg to committedFR.Start.
   975  					for seg.Ok() && seg.End() < committedFR.Start {
   976  						seg = seg.NextSegment()
   977  					}
   978  					// Mark pages overlapping committedFR as committed.
   979  					for seg.Ok() && seg.Start() < committedFR.End {
   980  						if seg.ValuePtr().canCommit() {
   981  							seg = f.usage.Isolate(seg, committedFR)
   982  							seg.ValuePtr().knownCommitted = true
   983  							amount := seg.Range().Length()
   984  							usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind)
   985  							f.usageExpected += amount
   986  							changedAny = true
   987  						}
   988  						seg = seg.NextSegment()
   989  					}
   990  					// Continue scanning for committed pages.
   991  					i = j + 1
   992  				}
   993  
   994  				// Advance r.Start.
   995  				r.Start += uint64(len(s))
   996  			})
   997  		if checkErr != nil {
   998  			return checkErr
   999  		}
  1000  		if err != nil {
  1001  			return err
  1002  		}
  1003  
  1004  		// Continue with the first segment after r.End.
  1005  		seg = f.usage.LowerBoundSegment(r.End)
  1006  	}
  1007  
  1008  	return nil
  1009  }
  1010  
  1011  // TotalUsage returns an aggregate usage for all memory statistics except
  1012  // Mapped (which is external to MemoryFile). This is generally much cheaper
  1013  // than UpdateUsage, but will not provide a fine-grained breakdown.
  1014  func (f *MemoryFile) TotalUsage() (uint64, error) {
  1015  	// Stat the underlying file to discover the underlying usage. stat(2)
  1016  	// always reports the allocated block count in units of 512 bytes. This
  1017  	// includes pages in the page cache and swapped pages.
  1018  	var stat unix.Stat_t
  1019  	if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
  1020  		return 0, err
  1021  	}
  1022  	return uint64(stat.Blocks * 512), nil
  1023  }
  1024  
  1025  // TotalSize returns the current size of the backing file in bytes, which is an
  1026  // upper bound on the amount of memory that can currently be allocated from the
  1027  // MemoryFile. The value returned by TotalSize is permitted to change.
  1028  func (f *MemoryFile) TotalSize() uint64 {
  1029  	f.mu.Lock()
  1030  	defer f.mu.Unlock()
  1031  	return uint64(f.fileSize)
  1032  }
  1033  
  1034  // File returns the backing file.
  1035  func (f *MemoryFile) File() *os.File {
  1036  	return f.file
  1037  }
  1038  
  1039  // FD implements memmap.File.FD.
  1040  func (f *MemoryFile) FD() int {
  1041  	return int(f.file.Fd())
  1042  }
  1043  
  1044  // String implements fmt.Stringer.String.
  1045  //
  1046  // Note that because f.String locks f.mu, calling f.String internally
  1047  // (including indirectly through the fmt package) risks recursive locking.
  1048  // Within the pgalloc package, use f.usage directly instead.
  1049  func (f *MemoryFile) String() string {
  1050  	f.mu.Lock()
  1051  	defer f.mu.Unlock()
  1052  	return f.usage.String()
  1053  }
  1054  
  1055  // runReclaim implements the reclaimer goroutine, which continuously decommits
  1056  // reclaimable pages in order to reduce memory usage and make them available
  1057  // for allocation.
  1058  func (f *MemoryFile) runReclaim() {
  1059  	for {
  1060  		// N.B. We must call f.markReclaimed on the returned FileRange.
  1061  		fr, ok := f.findReclaimable()
  1062  		if !ok {
  1063  			break
  1064  		}
  1065  
  1066  		if f.opts.ManualZeroing {
  1067  			// If ManualZeroing is in effect, only hugepage-aligned regions may
  1068  			// be safely passed to decommitFile. Pages will be zeroed on
  1069  			// reallocation, so we don't need to perform any manual zeroing
  1070  			// here, whether or not decommitFile succeeds.
  1071  			if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok {
  1072  				if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr {
  1073  					decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)}
  1074  					if err := f.decommitFile(decommitFR); err != nil {
  1075  						log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err)
  1076  					}
  1077  				}
  1078  			}
  1079  		} else {
  1080  			if err := f.decommitFile(fr); err != nil {
  1081  				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
  1082  				// Zero the pages manually. This won't reduce memory usage, but at
  1083  				// least ensures that the pages will be zero when reallocated.
  1084  				if err := f.manuallyZero(fr); err != nil {
  1085  					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
  1086  				}
  1087  			}
  1088  		}
  1089  		f.markDecommitted(fr)
  1090  		f.markReclaimed(fr)
  1091  	}
  1092  
  1093  	// We only get here if findReclaimable finds f.destroyed set and returns
  1094  	// false.
  1095  	f.mu.Lock()
  1096  	if !f.destroyed {
  1097  		f.mu.Unlock()
  1098  		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
  1099  	}
  1100  	f.file.Close()
  1101  	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
  1102  	// that has possibly been reassigned.
  1103  	f.file = nil
  1104  	f.mappingsMu.Lock()
  1105  	defer f.mappingsMu.Unlock()
  1106  	mappings := f.mappings.Load().([]uintptr)
  1107  	for i, m := range mappings {
  1108  		if m != 0 {
  1109  			_, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0)
  1110  			if errno != 0 {
  1111  				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
  1112  			}
  1113  		}
  1114  	}
  1115  	// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
  1116  	f.mappings.Store([]uintptr{})
  1117  	f.mu.Unlock()
  1118  
  1119  	// This must be called without holding f.mu to avoid circular lock
  1120  	// ordering.
  1121  	if f.stopNotifyPressure != nil {
  1122  		f.stopNotifyPressure()
  1123  	}
  1124  }
  1125  
  1126  // findReclaimable finds memory that has been marked for reclaim.
  1127  //
  1128  // Note that the returned range will be removed from tracking. It
  1129  // must be reclaimed (removed from f.usage) at this point.
  1130  func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
  1131  	f.mu.Lock()
  1132  	defer f.mu.Unlock()
  1133  	for {
  1134  		for {
  1135  			if f.destroyed {
  1136  				return memmap.FileRange{}, false
  1137  			}
  1138  			if f.reclaimable {
  1139  				break
  1140  			}
  1141  			if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
  1142  				// No work to do. Evict any pending evictable allocations to
  1143  				// get more reclaimable pages before going to sleep.
  1144  				f.startEvictionsLocked()
  1145  			}
  1146  			f.reclaimCond.Wait()
  1147  		}
  1148  		// Allocate works from the back of the file inwards, so reclaim
  1149  		// preserves this order to minimize the cost of the search.
  1150  		if seg := f.reclaim.LastSegment(); seg.Ok() {
  1151  			fr := seg.Range()
  1152  			f.reclaim.Remove(seg)
  1153  			return fr, true
  1154  		}
  1155  		// Nothing is reclaimable.
  1156  		f.reclaimable = false
  1157  	}
  1158  }
  1159  
  1160  func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
  1161  	f.mu.Lock()
  1162  	defer f.mu.Unlock()
  1163  	seg := f.usage.FindSegment(fr.Start)
  1164  	// All of fr should be mapped to a single uncommitted reclaimable
  1165  	// segment accounted to System.
  1166  	if !seg.Ok() {
  1167  		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
  1168  	}
  1169  	if !seg.Range().IsSupersetOf(fr) {
  1170  		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
  1171  	}
  1172  	if got, want := seg.Value(), (usageInfo{
  1173  		kind:           usage.System,
  1174  		knownCommitted: false,
  1175  		refs:           0,
  1176  	}); got != want {
  1177  		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
  1178  	}
  1179  	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
  1180  	// the caller of markReclaimed may not have decommitted it, so we can
  1181  	// only mark fr as reclaimed.
  1182  	f.usage.Remove(f.usage.Isolate(seg, fr))
  1183  }
  1184  
  1185  // StartEvictions requests that f evict all evictable allocations. It does not
  1186  // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
  1187  func (f *MemoryFile) StartEvictions() {
  1188  	f.mu.Lock()
  1189  	defer f.mu.Unlock()
  1190  	f.startEvictionsLocked()
  1191  }
  1192  
  1193  // Preconditions: f.mu must be locked.
  1194  func (f *MemoryFile) startEvictionsLocked() bool {
  1195  	startedAny := false
  1196  	for user, info := range f.evictable {
  1197  		// Don't start multiple goroutines to evict the same user's
  1198  		// allocations.
  1199  		if !info.evicting {
  1200  			f.startEvictionGoroutineLocked(user, info)
  1201  			startedAny = true
  1202  		}
  1203  	}
  1204  	return startedAny
  1205  }
  1206  
  1207  // Preconditions:
  1208  // * info == f.evictable[user].
  1209  // * !info.evicting.
  1210  // * f.mu must be locked.
  1211  func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
  1212  	info.evicting = true
  1213  	f.evictionWG.Add(1)
  1214  	go func() { // S/R-SAFE: f.evictionWG
  1215  		defer f.evictionWG.Done()
  1216  		for {
  1217  			f.mu.Lock()
  1218  			info, ok := f.evictable[user]
  1219  			if !ok {
  1220  				// This shouldn't happen: only this goroutine is permitted
  1221  				// to delete this entry.
  1222  				f.mu.Unlock()
  1223  				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
  1224  			}
  1225  			if info.ranges.IsEmpty() {
  1226  				delete(f.evictable, user)
  1227  				f.mu.Unlock()
  1228  				return
  1229  			}
  1230  			// Evict from the end of info.ranges, under the assumption that
  1231  			// if ranges in user start being used again (and are
  1232  			// consequently marked unevictable), such uses are more likely
  1233  			// to start from the beginning of user.
  1234  			seg := info.ranges.LastSegment()
  1235  			er := seg.Range()
  1236  			info.ranges.Remove(seg)
  1237  			// user.Evict() must be called without holding f.mu to avoid
  1238  			// circular lock ordering.
  1239  			f.mu.Unlock()
  1240  			user.Evict(context.Background(), er)
  1241  		}
  1242  	}()
  1243  }
  1244  
  1245  // WaitForEvictions blocks until f is no longer evicting any evictable
  1246  // allocations.
  1247  func (f *MemoryFile) WaitForEvictions() {
  1248  	f.evictionWG.Wait()
  1249  }
  1250  
  1251  type usageSetFunctions struct{}
  1252  
  1253  func (usageSetFunctions) MinKey() uint64 {
  1254  	return 0
  1255  }
  1256  
  1257  func (usageSetFunctions) MaxKey() uint64 {
  1258  	return math.MaxUint64
  1259  }
  1260  
  1261  func (usageSetFunctions) ClearValue(val *usageInfo) {
  1262  }
  1263  
  1264  func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
  1265  	return val1, val1 == val2
  1266  }
  1267  
  1268  func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
  1269  	return val, val
  1270  }
  1271  
  1272  // evictableRangeSetValue is the value type of evictableRangeSet.
  1273  type evictableRangeSetValue struct{}
  1274  
  1275  type evictableRangeSetFunctions struct{}
  1276  
  1277  func (evictableRangeSetFunctions) MinKey() uint64 {
  1278  	return 0
  1279  }
  1280  
  1281  func (evictableRangeSetFunctions) MaxKey() uint64 {
  1282  	return math.MaxUint64
  1283  }
  1284  
  1285  func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
  1286  }
  1287  
  1288  func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
  1289  	return evictableRangeSetValue{}, true
  1290  }
  1291  
  1292  func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
  1293  	return evictableRangeSetValue{}, evictableRangeSetValue{}
  1294  }
  1295  
  1296  // reclaimSetValue is the value type of reclaimSet.
  1297  type reclaimSetValue struct{}
  1298  
  1299  type reclaimSetFunctions struct{}
  1300  
  1301  func (reclaimSetFunctions) MinKey() uint64 {
  1302  	return 0
  1303  }
  1304  
  1305  func (reclaimSetFunctions) MaxKey() uint64 {
  1306  	return math.MaxUint64
  1307  }
  1308  
  1309  func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
  1310  }
  1311  
  1312  func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
  1313  	return reclaimSetValue{}, true
  1314  }
  1315  
  1316  func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
  1317  	return reclaimSetValue{}, reclaimSetValue{}
  1318  }