github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/pgalloc/pgalloc.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package pgalloc contains the page allocator subsystem, which manages memory
    16  // that may be mapped into application address spaces.
    17  //
    18  // Lock order:
    19  //
    20  //	 pgalloc.MemoryFile.mu
    21  //		pgalloc.MemoryFile.mappingsMu
    22  package pgalloc
    23  
    24  import (
    25  	"fmt"
    26  	"math"
    27  	"os"
    28  	"sync/atomic"
    29  	"time"
    30  
    31  	"golang.org/x/sys/unix"
    32  	"github.com/metacubex/gvisor/pkg/abi/linux"
    33  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    34  	"github.com/metacubex/gvisor/pkg/context"
    35  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    36  	"github.com/metacubex/gvisor/pkg/hostarch"
    37  	"github.com/metacubex/gvisor/pkg/log"
    38  	"github.com/metacubex/gvisor/pkg/safemem"
    39  	"github.com/metacubex/gvisor/pkg/sentry/hostmm"
    40  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    41  	"github.com/metacubex/gvisor/pkg/sentry/usage"
    42  	"github.com/metacubex/gvisor/pkg/sync"
    43  )
    44  
    45  // Direction describes how to allocate offsets from MemoryFile.
    46  type Direction int
    47  
    48  const (
    49  	// BottomUp allocates offsets in increasing order.
    50  	BottomUp Direction = iota
    51  	// TopDown allocates offsets in decreasing order.
    52  	TopDown
    53  )
    54  
    55  // String implements fmt.Stringer.
    56  func (d Direction) String() string {
    57  	switch d {
    58  	case BottomUp:
    59  		return "up"
    60  	case TopDown:
    61  		return "down"
    62  	}
    63  	panic(fmt.Sprintf("invalid direction: %d", d))
    64  }
    65  
    66  // MemoryFile is a memmap.File whose pages may be allocated to arbitrary
    67  // users.
    68  type MemoryFile struct {
    69  	// opts holds options passed to NewMemoryFile. opts is immutable.
    70  	opts MemoryFileOpts
    71  
    72  	// MemoryFile owns a single backing file, which is modeled as follows:
    73  	//
    74  	// Each page in the file can be committed or uncommitted. A page is
    75  	// committed if the host kernel is spending resources to store its contents
    76  	// and uncommitted otherwise. This definition includes pages that the host
    77  	// kernel has swapped; this is intentional, to ensure that accounting does
    78  	// not change even if host kernel swapping behavior changes, and that
    79  	// memory used by pseudo-swap mechanisms like zswap is still accounted.
    80  	//
    81  	// The initial contents of uncommitted pages are implicitly zero bytes. A
    82  	// read or write to the contents of an uncommitted page causes it to be
    83  	// committed. This is the only event that can cause an uncommitted page to
    84  	// be committed.
    85  	//
    86  	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
    87  	// pages to be uncommitted. This is the only event that can cause a
    88  	// committed page to be uncommitted.
    89  	//
    90  	// Memory accounting is based on identifying the set of committed pages.
    91  	// Since we do not have direct access to the MMU, tracking reads and writes
    92  	// to uncommitted pages to detect commitment would introduce additional
    93  	// page faults, which would be prohibitively expensive. Instead, we query
    94  	// the host kernel to determine which pages are committed.
    95  
    96  	// file is the backing file. The file pointer is immutable.
    97  	file *os.File
    98  
    99  	mu memoryFileMutex
   100  
   101  	// usage maps each page in the file to metadata for that page. Pages for
   102  	// which no segment exists in usage are both unallocated (not in use) and
   103  	// uncommitted.
   104  	//
   105  	// Since usage stores usageInfo objects by value, clients should usually
   106  	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
   107  	// pointer to the usageInfo rather than a copy.
   108  	//
   109  	// usage must be kept maximally merged (that is, there should never be two
   110  	// adjacent segments with the same values). At least markReclaimed depends
   111  	// on this property.
   112  	//
   113  	// usage is protected by mu.
   114  	usage usageSet
   115  
   116  	// The UpdateUsage function scans all segments with knownCommitted set
   117  	// to false, sees which pages are committed and creates corresponding
   118  	// segments with knownCommitted set to true.
   119  	//
   120  	// In order to avoid unnecessary scans, usageExpected tracks the total
   121  	// file blocks expected. This is used to elide the scan when this
   122  	// matches the underlying file blocks.
   123  	//
   124  	// To track swapped pages, usageSwapped tracks the discrepancy between
   125  	// what is observed in core and what is reported by the file. When
   126  	// usageSwapped is non-zero, a sweep will be performed at least every
   127  	// second. The start of the last sweep is recorded in usageLast.
   128  	//
   129  	// All usage attributes are protected by mu.
   130  	usageExpected uint64
   131  	usageSwapped  uint64
   132  	usageLast     time.Time
   133  
   134  	// fileSize is the size of the backing memory file in bytes. fileSize is
   135  	// always a power-of-two multiple of chunkSize.
   136  	//
   137  	// fileSize is protected by mu.
   138  	fileSize int64
   139  
   140  	// Pages from the backing file are mapped into the local address space on
   141  	// the granularity of large pieces called chunks. mappings is a []uintptr
   142  	// that stores, for each chunk, the start address of a mapping of that
   143  	// chunk in the current process' address space, or 0 if no such mapping
   144  	// exists. Once a chunk is mapped, it is never remapped or unmapped until
   145  	// the MemoryFile is destroyed.
   146  	//
   147  	// Mutating the mappings slice or its contents requires both holding
   148  	// mappingsMu and using atomic memory operations. (The slice is mutated
   149  	// whenever the file is expanded. Per the above, the only permitted
   150  	// mutation of the slice's contents is the assignment of a mapping to a
   151  	// chunk that was previously unmapped.) Reading the slice or its contents
   152  	// only requires *either* holding mappingsMu or using atomic memory
   153  	// operations. This allows MemoryFile.MapInternal to avoid locking in the
   154  	// common case where chunk mappings already exist.
   155  	mappingsMu mappingsMutex
   156  	mappings   atomic.Pointer[[]uintptr]
   157  
   158  	// destroyed is set by Destroy to instruct the reclaimer goroutine to
   159  	// release resources and exit. destroyed is protected by mu.
   160  	destroyed bool
   161  
   162  	// reclaimable is true if usage may contain reclaimable pages. reclaimable
   163  	// is protected by mu.
   164  	reclaimable bool
   165  
   166  	// reclaim is the collection of regions for reclaim. reclaim is protected
   167  	// by mu.
   168  	reclaim reclaimSet
   169  
   170  	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
   171  	// transitions from false to true.
   172  	reclaimCond sync.Cond
   173  
   174  	// evictable maps EvictableMemoryUsers to eviction state.
   175  	//
   176  	// evictable is protected by mu.
   177  	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo
   178  
   179  	// evictionWG counts the number of goroutines currently performing evictions.
   180  	evictionWG sync.WaitGroup
   181  
   182  	// stopNotifyPressure stops memory cgroup pressure level
   183  	// notifications used to drive eviction. stopNotifyPressure is
   184  	// immutable.
   185  	stopNotifyPressure func()
   186  
   187  	// savable is true if this MemoryFile will be saved via SaveTo() during
   188  	// the kernel's SaveTo operation. savable is protected by mu.
   189  	savable bool
   190  }
   191  
   192  // MemoryFileOpts provides options to NewMemoryFile.
   193  type MemoryFileOpts struct {
   194  	// DelayedEviction controls the extent to which the MemoryFile may delay
   195  	// eviction of evictable allocations.
   196  	DelayedEviction DelayedEvictionType
   197  
   198  	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
   199  	// notifications to determine when eviction is necessary. This option has
   200  	// no effect unless DelayedEviction is DelayedEvictionEnabled.
   201  	UseHostMemcgPressure bool
   202  
   203  	// DecommitOnDestroy indicates whether the entire host file should be
   204  	// decommitted on destruction. This is appropriate for host filesystem based
   205  	// files that need to be explicitly cleaned up to release disk space.
   206  	DecommitOnDestroy bool
   207  
   208  	// If ManualZeroing is true, MemoryFile must not assume that new pages
   209  	// obtained from the host are zero-filled, and must therefore manually
   210  	// zero newly-allocated pages.
   211  	ManualZeroing bool
   212  
   213  	// If DisableIMAWorkAround is true, NewMemoryFile will not call
   214  	// IMAWorkAroundForMemFile().
   215  	DisableIMAWorkAround bool
   216  
   217  	// DiskBackedFile indicates that the MemoryFile is backed by a file on disk.
   218  	DiskBackedFile bool
   219  
   220  	// RestoreID is an opaque string used to reassociate the MemoryFile with its
   221  	// replacement during restore.
   222  	RestoreID string
   223  }
   224  
   225  // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
   226  type DelayedEvictionType int
   227  
   228  const (
   229  	// DelayedEvictionDefault has unspecified behavior.
   230  	DelayedEvictionDefault DelayedEvictionType = iota
   231  
   232  	// DelayedEvictionDisabled requires that evictable allocations are evicted
   233  	// as soon as possible.
   234  	DelayedEvictionDisabled
   235  
   236  	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
   237  	// evictable allocations until doing so is considered necessary to avoid
   238  	// performance degradation due to host memory pressure, or OOM kills.
   239  	//
   240  	// As of this writing, the behavior of DelayedEvictionEnabled depends on
   241  	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
   242  	//
   243  	//	- If UseHostMemcgPressure is true, evictions are delayed until memory
   244  	//		pressure is indicated.
   245  	//
   246  	//	- Otherwise, evictions are only delayed until the reclaimer goroutine
   247  	//		is out of work (pages to reclaim).
   248  	DelayedEvictionEnabled
   249  
   250  	// DelayedEvictionManual requires that evictable allocations are only
   251  	// evicted when MemoryFile.StartEvictions() is called. This is extremely
   252  	// dangerous outside of tests.
   253  	DelayedEvictionManual
   254  )
   255  
   256  // usageInfo tracks usage information.
   257  //
   258  // +stateify savable
   259  type usageInfo struct {
   260  	// kind is the usage kind.
   261  	kind usage.MemoryKind
   262  
   263  	// knownCommitted is true if the tracked region is definitely committed.
   264  	// (If it is false, the tracked region may or may not be committed.)
   265  	knownCommitted bool
   266  
   267  	refs uint64
   268  
   269  	// memCgID is the memory cgroup id to which this page is committed.
   270  	memCgID uint32
   271  }
   272  
   273  // canCommit returns true if the tracked region can be committed.
   274  func (u *usageInfo) canCommit() bool {
   275  	// refs must be greater than 0 because we assume that reclaimable pages
   276  	// (that aren't already known to be committed) are not committed. This
   277  	// isn't necessarily true, even after the reclaimer does Decommit(),
   278  	// because the kernel may subsequently back the hugepage-sized region
   279  	// containing the decommitted page with a hugepage. However, it's
   280  	// consistent with our treatment of unallocated pages, which have the same
   281  	// property.
   282  	return !u.knownCommitted && u.refs != 0
   283  }
   284  
   285  // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
   286  // may be asked to deallocate that memory in the presence of memory pressure.
   287  type EvictableMemoryUser interface {
   288  	// Evict requests that the EvictableMemoryUser deallocate memory used by
   289  	// er, which was registered as evictable by a previous call to
   290  	// MemoryFile.MarkEvictable.
   291  	//
   292  	// Evict is not required to deallocate memory. In particular, since pgalloc
   293  	// must call Evict without holding locks to avoid circular lock ordering,
   294  	// it is possible that the passed range has already been marked as
   295  	// unevictable by a racing call to MemoryFile.MarkUnevictable.
   296  	// Implementations of EvictableMemoryUser must detect such races and handle
   297  	// them by making Evict have no effect on unevictable ranges.
   298  	//
   299  	// After a call to Evict, the MemoryFile will consider the evicted range
   300  	// unevictable (i.e. it will not call Evict on the same range again) until
   301  	// informed otherwise by a subsequent call to MarkEvictable.
   302  	Evict(ctx context.Context, er EvictableRange)
   303  }
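
// As a hedged illustration only (hypotheticalCache is not a real gVisor type,
// and er is assumed page-aligned here), a minimal EvictableMemoryUser might
// release its backing allocation when asked:
//
//	type hypotheticalCache struct {
//		mf *MemoryFile
//		fr memmap.FileRange // backing allocation for offsets [0, fr.Length())
//	}
//
//	// Evict drops the pages backing er. Racing MarkUnevictable calls must be
//	// tolerated, per the interface contract above.
//	func (c *hypotheticalCache) Evict(ctx context.Context, er EvictableRange) {
//		c.mf.DecRef(memmap.FileRange{c.fr.Start + er.Start, c.fr.Start + er.End})
//	}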
   304  
   305  // An EvictableRange represents a range of uint64 offsets in an
   306  // EvictableMemoryUser.
   307  //
   308  // In practice, most EvictableMemoryUsers will probably be implementations of
   309  // memmap.Mappable, and EvictableRange therefore corresponds to
   310  // memmap.MappableRange. However, this package cannot depend on the memmap
   311  // package, since doing so would create a circular dependency.
   312  //
   313  // type EvictableRange <generated using go_generics>
   314  
   315  // evictableMemoryUserInfo is the value type of MemoryFile.evictable.
   316  type evictableMemoryUserInfo struct {
   317  	// ranges tracks all evictable ranges for the given user.
   318  	ranges evictableRangeSet
   319  
   320  	// If evicting is true, there is a goroutine currently evicting all
   321  	// evictable ranges for this user.
   322  	evicting bool
   323  }
   324  
   325  const (
   326  	chunkShift = 30
   327  	chunkSize  = 1 << chunkShift // 1 GB
   328  	chunkMask  = chunkSize - 1
   329  
   330  	// maxPage is the highest 64-bit page.
   331  	maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
   332  )
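
// As an illustrative sketch of the chunk arithmetic used throughout this file
// (off is a hypothetical file offset, not a variable defined here):
//
//	chunk := int(off >> chunkShift) // index of the 1 GB chunk containing off
//	chunkStart := off &^ chunkMask  // first offset of that chunk
//	within := off & chunkMask       // byte offset of off within the chunk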
   333  
   334  // NewMemoryFile creates a MemoryFile backed by the given file. If
   335  // NewMemoryFile succeeds, ownership of file is transferred to the returned
   336  // MemoryFile.
   337  func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
   338  	switch opts.DelayedEviction {
   339  	case DelayedEvictionDefault:
   340  		opts.DelayedEviction = DelayedEvictionEnabled
   341  	case DelayedEvictionDisabled, DelayedEvictionManual:
   342  		opts.UseHostMemcgPressure = false
   343  	case DelayedEvictionEnabled:
   344  		// ok
   345  	default:
   346  		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
   347  	}
   348  
   349  	// Truncate the file to 0 bytes first to ensure that it's empty.
   350  	if err := file.Truncate(0); err != nil {
   351  		return nil, err
   352  	}
   353  	f := &MemoryFile{
   354  		opts:      opts,
   355  		file:      file,
   356  		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
   357  	}
   358  	f.mappings.Store(&[]uintptr{})
   359  	f.reclaimCond.L = &f.mu
   360  
   361  	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
   362  		stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
   363  			f.mu.Lock()
   364  			startedAny := f.startEvictionsLocked()
   365  			f.mu.Unlock()
   366  			if startedAny {
   367  				log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
   368  			}
   369  		}, "low")
   370  		if err != nil {
   371  			return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
   372  		}
   373  		f.stopNotifyPressure = stop
   374  	}
   375  
   376  	go f.runReclaim() // S/R-SAFE: f.mu
   377  
   378  	if !opts.DisableIMAWorkAround {
   379  		IMAWorkAroundForMemFile(file.Fd())
   380  	}
   381  	return f, nil
   382  }
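
// As a hedged usage sketch (the memfd name and option values below are
// illustrative, not requirements of this package), a caller outside pgalloc
// might construct a MemoryFile backed by an anonymous memfd roughly as
// follows:
//
//	fd, err := unix.MemfdCreate("illustrative-memory-file", 0)
//	if err != nil {
//		return nil, err
//	}
//	mf, err := pgalloc.NewMemoryFile(
//		os.NewFile(uintptr(fd), "illustrative-memory-file"),
//		pgalloc.MemoryFileOpts{DelayedEviction: pgalloc.DelayedEvictionEnabled})
//	if err != nil {
//		return nil, err
//	}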
   383  
   384  // IMAWorkAroundForMemFile works around IMA by immediately creating a temporary
   385  // PROT_EXEC mapping, while the backing file is still small. IMA will ignore
   386  // any future mappings.
   387  //
   388  // The Linux kernel contains an optional feature called "Integrity
   389  // Measurement Architecture" (IMA). If IMA is enabled, it will checksum
   390  // binaries the first time they are mapped PROT_EXEC. This is bad news for
   391  // executable pages mapped from our backing file, which can grow to
   392  // terabytes in (sparse) size. If IMA attempts to checksum a file that
   393  // large, it will allocate all of the sparse pages and quickly exhaust all
   394  // memory.
   395  func IMAWorkAroundForMemFile(fd uintptr) {
   396  	m, _, errno := unix.Syscall6(
   397  		unix.SYS_MMAP,
   398  		0,
   399  		hostarch.PageSize,
   400  		unix.PROT_EXEC,
   401  		unix.MAP_SHARED,
   402  		fd,
   403  		0)
   404  	if errno != 0 {
   405  		// This isn't fatal (IMA may not even be in use). Log the error, but
   406  		// don't return it.
   407  		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
   408  	} else {
   409  		if _, _, errno := unix.Syscall(
   410  			unix.SYS_MUNMAP,
   411  			m,
   412  			hostarch.PageSize,
   413  			0); errno != 0 {
   414  			panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
   415  		}
   416  	}
   417  }
   418  
   419  // Destroy releases all resources used by f.
   420  //
   421  // Preconditions: All pages allocated by f have been freed.
   422  //
   423  // Postconditions: None of f's methods may be called after Destroy.
   424  func (f *MemoryFile) Destroy() {
   425  	f.mu.Lock()
   426  	defer f.mu.Unlock()
   427  	f.destroyed = true
   428  	f.reclaimCond.Signal()
   429  }
   430  
   431  // AllocationMode provides a way to inform the pgalloc API how to allocate
   432  // memory and pages on the host.
   433  // A page will exist in one of the following incremental states:
   434  //  1. Allocated: A page is allocated if it was returned by Allocate() and its
   435  //     reference count hasn't dropped to 0 since then.
   436  //  2. Committed: As described in MemoryFile documentation above, a page is
   437  //     committed if the host kernel is spending resources to store its
   438  //     contents. A committed page is implicitly allocated.
   439  //  3. Populated: A page is populated for reading/writing in a page table
   440  //     hierarchy if it has a page table entry that permits reading/writing
   441  //     respectively. A populated page is implicitly committed, since the page
   442  //     table entry needs a physical page to point to, but not vice versa.
   443  type AllocationMode int
   444  
   445  const (
   446  	// AllocateOnly indicates that pages need to only be allocated.
   447  	AllocateOnly AllocationMode = iota
   448  	// AllocateAndCommit indicates that pages need to be committed, in addition
   449  	// to being allocated.
   450  	AllocateAndCommit
   451  	// AllocateAndWritePopulate indicates that writable pages should ideally be
   452  	// populated in the page table, in addition to being allocated. This is a
   453  	// suggestion, not a requirement.
   454  	AllocateAndWritePopulate
   455  )
   456  
   457  // AllocOpts are options used in MemoryFile.Allocate.
   458  type AllocOpts struct {
   459  	// Kind is the memory kind to be used for accounting.
   460  	Kind usage.MemoryKind
   461  	// Dir indicates the direction in which offsets are allocated.
   462  	Dir Direction
   463  	// MemCgID is the memory cgroup ID; the zero value indicates that the
   464  	// memory will not be accounted to any cgroup.
   465  	MemCgID uint32
   466  	// Mode allows the callers to select how the pages are allocated in the
   467  	// MemoryFile. Callers that will fill the allocated memory by writing to it
   468  	// should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers
   469  	// that will fill the allocated memory by invoking host system calls should
   470  	// pass AllocateOnly.
   471  	Mode AllocationMode
   472  	// If ReaderFunc is provided, the allocated memory is filled by calling it
   473  	// repeatedly until either length bytes have been read or a non-nil error
   474  	// is returned. Allocate then returns the allocated range truncated down
   475  	// to the nearest page; if this is shorter than length bytes due to an
   476  	// error returned by ReaderFunc, the partially filled fr and error are returned.
   477  	ReaderFunc safemem.ReaderFunc
   478  }
   479  
   480  // Allocate returns a range of initially-zeroed pages of the given length with
   481  // the given accounting kind and a single reference held by the caller. When
   482  // the last reference on an allocated page is released, ownership of the page
   483  // is returned to the MemoryFile, allowing it to be returned by a future call
   484  // to Allocate.
   485  //
   486  // Preconditions: length must be page-aligned and non-zero.
   487  func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) {
   488  	fr, err := f.allocate(length, &opts)
   489  	if err != nil {
   490  		return memmap.FileRange{}, err
   491  	}
   492  	var dsts safemem.BlockSeq
   493  	switch opts.Mode {
   494  	case AllocateOnly: // Allocation is handled above. Nothing more to do.
   495  	case AllocateAndCommit:
   496  		if err := f.commitFile(fr); err != nil {
   497  			f.DecRef(fr)
   498  			return memmap.FileRange{}, err
   499  		}
   500  	case AllocateAndWritePopulate:
   501  		dsts, err = f.MapInternal(fr, hostarch.Write)
   502  		if err != nil {
   503  			f.DecRef(fr)
   504  			return memmap.FileRange{}, err
   505  		}
   506  		if canPopulate() {
   507  			rem := dsts
   508  			for {
   509  				if !tryPopulate(rem.Head()) {
   510  					break
   511  				}
   512  				rem = rem.Tail()
   513  				if rem.IsEmpty() {
   514  					break
   515  				}
   516  			}
   517  		}
   518  	default:
   519  		panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode))
   520  	}
   521  	if opts.ReaderFunc != nil {
   522  		if dsts.IsEmpty() {
   523  			dsts, err = f.MapInternal(fr, hostarch.Write)
   524  			if err != nil {
   525  				f.DecRef(fr)
   526  				return memmap.FileRange{}, err
   527  			}
   528  		}
   529  		n, err := safemem.ReadFullToBlocks(opts.ReaderFunc, dsts)
   530  		un := uint64(hostarch.Addr(n).RoundDown())
   531  		if un < length {
   532  			// Free unused memory and update fr to contain only the memory that is
   533  			// still allocated.
   534  			f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
   535  			fr.End = fr.Start + un
   536  		}
   537  		if err != nil {
   538  			return fr, err
   539  		}
   540  	}
   541  	return fr, nil
   542  }
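
// As a hedged example of a typical call from outside pgalloc (the kind and
// length below are illustrative), a caller that will fill the memory by
// writing to it from the sentry might allocate as follows:
//
//	fr, err := mf.Allocate(4*hostarch.PageSize, pgalloc.AllocOpts{
//		Kind: usage.Anonymous,
//		Mode: pgalloc.AllocateAndWritePopulate,
//	})
//	if err != nil {
//		return err
//	}
//	// ... use fr ...
//	mf.DecRef(fr) // release the caller's reference when the memory is no longer needed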
   543  
   544  func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) {
   545  	if length == 0 || length%hostarch.PageSize != 0 {
   546  		panic(fmt.Sprintf("invalid allocation length: %#x", length))
   547  	}
   548  
   549  	f.mu.Lock()
   550  	defer f.mu.Unlock()
   551  
   552  	// Align hugepage-and-larger allocations on hugepage boundaries to try
   553  	// to take advantage of hugetmpfs.
   554  	alignment := uint64(hostarch.PageSize)
   555  	if length >= hostarch.HugePageSize {
   556  		alignment = hostarch.HugePageSize
   557  	}
   558  
   559  	// Find a range in the underlying file.
   560  	fr, ok := f.findAvailableRange(length, alignment, opts.Dir)
   561  	if !ok {
   562  		return memmap.FileRange{}, linuxerr.ENOMEM
   563  	}
   564  
   565  	// Expand the file if needed.
   566  	if int64(fr.End) > f.fileSize {
   567  		// Round the new file size up to be chunk-aligned.
   568  		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
   569  		if err := f.file.Truncate(newFileSize); err != nil {
   570  			return memmap.FileRange{}, err
   571  		}
   572  		f.fileSize = newFileSize
   573  		f.mappingsMu.Lock()
   574  		oldMappings := *f.mappings.Load()
   575  		newMappings := make([]uintptr, newFileSize>>chunkShift)
   576  		copy(newMappings, oldMappings)
   577  		f.mappings.Store(&newMappings)
   578  		f.mappingsMu.Unlock()
   579  	}
   580  
   581  	if f.opts.ManualZeroing {
   582  		if err := f.manuallyZero(fr); err != nil {
   583  			return memmap.FileRange{}, err
   584  		}
   585  	}
   586  	// Mark selected pages as in use.
   587  	f.usage.InsertRange(fr, usageInfo{
   588  		kind:    opts.Kind,
   589  		refs:    1,
   590  		memCgID: opts.MemCgID,
   591  	})
   592  
   593  	return fr, nil
   594  }
   595  
   596  // findAvailableRange returns an available range in the usageSet.
   597  //
   598  // Note that scanning for available slots starts at the end of the file and
   599  // proceeds backwards, then forwards. This heuristic has important consequences
   600  // for how sequential mappings can be merged in the host VMAs, given that
   601  // addresses for both application and sentry mappings are allocated top-down
   602  // (from higher to lower addresses). The file is also grown exponentially in
   603  // order to create space for mappings to be allocated downwards.
   604  //
   605  // Precondition: alignment must be a power of 2.
   606  func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) {
   607  	if dir == BottomUp {
   608  		return findAvailableRangeBottomUp(&f.usage, length, alignment)
   609  	}
   610  	return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment)
   611  }
   612  
   613  func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
   614  	alignmentMask := alignment - 1
   615  
   616  	// Search for space in existing gaps, starting at the current end of the
   617  	// file and working backward.
   618  	lastGap := usage.LastGap()
   619  	gap := lastGap
   620  	for {
   621  		end := gap.End()
   622  		if end > uint64(fileSize) {
   623  			end = uint64(fileSize)
   624  		}
   625  
   626  		// Try to allocate from the end of this gap, with the start of the
   627  		// allocated range aligned down to alignment.
   628  		unalignedStart := end - length
   629  		if unalignedStart > end {
   630  			// Negative overflow: this and all preceding gaps are too small to
   631  			// accommodate length.
   632  			break
   633  		}
   634  		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
   635  			return memmap.FileRange{start, start + length}, true
   636  		}
   637  
   638  		gap = gap.PrevLargeEnoughGap(length)
   639  		if !gap.Ok() {
   640  			break
   641  		}
   642  	}
   643  
   644  	// Check that it's possible to fit this allocation at the end of a file of any size.
   645  	min := lastGap.Start()
   646  	min = (min + alignmentMask) &^ alignmentMask
   647  	if min+length < min {
   648  		// Overflow: allocation would exceed the range of uint64.
   649  		return memmap.FileRange{}, false
   650  	}
   651  
   652  	// Determine the minimum file size required to fit this allocation at its end.
   653  	for {
   654  		newFileSize := 2 * fileSize
   655  		if newFileSize <= fileSize {
   656  			if fileSize != 0 {
   657  				// Overflow: allocation would exceed the range of int64.
   658  				return memmap.FileRange{}, false
   659  			}
   660  			newFileSize = chunkSize
   661  		}
   662  		fileSize = newFileSize
   663  
   664  		unalignedStart := uint64(fileSize) - length
   665  		if unalignedStart > uint64(fileSize) {
   666  			// Negative overflow: fileSize is still inadequate.
   667  			continue
   668  		}
   669  		if start := unalignedStart &^ alignmentMask; start >= min {
   670  			return memmap.FileRange{start, start + length}, true
   671  		}
   672  	}
   673  }
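
// As a worked illustration of the top-down search above (all numbers are
// hypothetical): with fileSize = 2 GB, an existing allocation covering
// [0, 1 GB), length = 4 MB, and alignment = 2 MB, the last gap is
// [1 GB, maxPage). end is clamped to fileSize (2 GB), so unalignedStart =
// 2 GB - 4 MB, which is already 2 MB-aligned and is >= the gap's start of
// 1 GB, so the allocation is placed at [2 GB - 4 MB, 2 GB).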
   674  
   675  func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) {
   676  	alignmentMask := alignment - 1
   677  	for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) {
   678  		// Align the start address and check if allocation still fits in the gap.
   679  		start := (gap.Start() + alignmentMask) &^ alignmentMask
   680  
   681  		// File offsets are int64s. Since length must be strictly positive, end
   682  		// cannot legitimately be 0.
   683  		end := start + length
   684  		if end < start || int64(end) <= 0 {
   685  			return memmap.FileRange{}, false
   686  		}
   687  		if end <= gap.End() {
   688  			return memmap.FileRange{start, end}, true
   689  		}
   690  	}
   691  
   692  	// NextLargeEnoughGap should have returned a gap at the end.
   693  	panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length))
   694  }
   695  
   696  var mlockDisabled atomicbitops.Uint32
   697  var madvPopulateWriteDisabled atomicbitops.Uint32
   698  
   699  func canPopulate() bool {
   700  	return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0
   701  }
   702  
   703  func tryPopulateMadv(b safemem.Block) bool {
   704  	if madvPopulateWriteDisabled.Load() != 0 {
   705  		return false
   706  	}
   707  	start, ok := hostarch.Addr(b.Addr()).RoundUp()
   708  	if !ok {
   709  		return true
   710  	}
   711  	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown()
   712  	bLen := end - start
   713  	// Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated.
   714  	// 1 syscall overhead >= 1 page fault overhead. This is because syscalls are
   715  	// susceptible to additional overheads like seccomp-bpf filters and auditing.
   716  	if start >= end || bLen <= hostarch.PageSize {
   717  		return true
   718  	}
   719  	_, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE)
   720  	if errno != 0 {
   721  		if errno == unix.EINVAL {
   722  			// EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14).
   723  			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
   724  		} else {
   725  			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
   726  		}
   727  		madvPopulateWriteDisabled.Store(1)
   728  		return false
   729  	}
   730  	return true
   731  }
   732  
   733  func tryPopulateMlock(b safemem.Block) bool {
   734  	if mlockDisabled.Load() != 0 {
   735  		return false
   736  	}
   737  	// Call mlock to populate pages, then munlock to cancel the mlock (but keep
   738  	// the pages populated). Only do so for hugepage-aligned address ranges to
   739  	// ensure that splitting the VMA in mlock doesn't split any existing
   740  	// hugepages. This assumes that two host syscalls, plus the MM overhead of
   741  	// mlock + munlock, is faster on average than trapping for
   742  	// HugePageSize/PageSize small page faults.
   743  	start, ok := hostarch.Addr(b.Addr()).HugeRoundUp()
   744  	if !ok {
   745  		return true
   746  	}
   747  	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown()
   748  	if start >= end {
   749  		return true
   750  	}
   751  	_, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0)
   752  	unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0)
   753  	if errno != 0 {
   754  		if errno == unix.ENOMEM || errno == unix.EPERM {
   755  			// These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or
   756  			// hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively.
   757  			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
   758  		} else {
   759  			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
   760  		}
   761  		mlockDisabled.Store(1)
   762  		return false
   763  	}
   764  	return true
   765  }
   766  
   767  func tryPopulate(b safemem.Block) bool {
   768  	// There are two approaches for populating writable pages:
   769  	// 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate
   770  	//    (prefault) page tables writable, faulting in all pages in the range
   771  	//    just as if manually writing to each page".
   772  	// 2. Call mlock to populate pages, then munlock to cancel the mlock (but
   773  	//    keep the pages populated).
   774  	//
   775  	// Prefer the madvise(MADV_POPULATE_WRITE) approach because:
   776  	// - Only requires 1 syscall, as opposed to 2 syscalls with the mlock approach.
   777  	// - It is faster because it doesn't have to modify vmas like mlock does.
   778  	// - It works for disk-backed memory mappings too. The mlock approach doesn't
   779  	//   work for disk-backed filesystems (e.g. ext4). This is because
   780  	//   mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable
   781  	//   MAP_SHARED mappings. For memory-backed (shmem) files,
   782  	//   mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so
   783  	//   the page table entries populated by a read fault are writable. For
   784  	//   disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is
   785  	//   true, so the page table entries populated by a read fault are read-only.
   786  	if tryPopulateMadv(b) {
   787  		return true
   788  	}
   789  	return tryPopulateMlock(b)
   790  }
   791  
   792  // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
   793  const (
   794  	_FALLOC_FL_KEEP_SIZE  = 1
   795  	_FALLOC_FL_PUNCH_HOLE = 2
   796  )
   797  
   798  // Decommit releases resources associated with maintaining the contents of the
   799  // given pages. If Decommit succeeds, future accesses of the decommitted pages
   800  // will read zeroes.
   801  //
   802  // Preconditions: fr.Length() > 0.
   803  func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
   804  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   805  		panic(fmt.Sprintf("invalid range: %v", fr))
   806  	}
   807  
   808  	if f.opts.ManualZeroing {
   809  		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
   810  		// effect.
   811  		if err := f.manuallyZero(fr); err != nil {
   812  			return err
   813  		}
   814  	} else {
   815  		if err := f.decommitFile(fr); err != nil {
   816  			return err
   817  		}
   818  	}
   819  
   820  	f.markDecommitted(fr)
   821  	return nil
   822  }
   823  
   824  func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
   825  	return f.forEachMappingSlice(fr, func(bs []byte) {
   826  		for i := range bs {
   827  			bs[i] = 0
   828  		}
   829  	})
   830  }
   831  
   832  func (f *MemoryFile) commitFile(fr memmap.FileRange) error {
   833  	// "The default operation (i.e., mode is zero) of fallocate() allocates the
   834  	// disk space within the range specified by offset and len." - fallocate(2)
   835  	return unix.Fallocate(
   836  		int(f.file.Fd()),
   837  		0, // mode
   838  		int64(fr.Start),
   839  		int64(fr.Length()))
   840  }
   841  
   842  func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
   843  	// "After a successful call, subsequent reads from this range will
   844  	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
   845  	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
   846  	return unix.Fallocate(
   847  		int(f.file.Fd()),
   848  		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
   849  		int64(fr.Start),
   850  		int64(fr.Length()))
   851  }
   852  
   853  func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
   854  	f.mu.Lock()
   855  	defer f.mu.Unlock()
   856  	// Since we're changing the knownCommitted attribute, we need to merge
   857  	// across the entire range to ensure that the usage tree is minimal.
   858  	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
   859  		val := seg.ValuePtr()
   860  		if val.knownCommitted {
   861  			// Drop the usageExpected appropriately.
   862  			amount := seg.Range().Length()
   863  			usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID)
   864  			f.usageExpected -= amount
   865  			val.knownCommitted = false
   866  		}
   867  		val.memCgID = 0
   868  		return true
   869  	})
   870  }
   871  
   872  // HasUniqueRef returns true if all pages in the given range have exactly one
   873  // reference. A return value of false is inherently racy, but if the caller
   874  // holds a reference on the given range and is preventing other goroutines from
   875  // copying it, then a return value of true is not racy.
   876  //
   877  // Preconditions: At least one reference must be held on all pages in fr.
   878  func (f *MemoryFile) HasUniqueRef(fr memmap.FileRange) bool {
   879  	f.mu.Lock()
   880  	defer f.mu.Unlock()
   881  	hasUniqueRef := true
   882  	f.usage.VisitFullRange(fr, func(seg usageIterator) bool {
   883  		if seg.ValuePtr().refs != 1 {
   884  			hasUniqueRef = false
   885  			return false
   886  		}
   887  		return true
   888  	})
   889  	return hasUniqueRef
   890  }
   891  
   892  // IncRef implements memmap.File.IncRef.
   893  func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) {
   894  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   895  		panic(fmt.Sprintf("invalid range: %v", fr))
   896  	}
   897  
   898  	f.mu.Lock()
   899  	defer f.mu.Unlock()
   900  
   901  	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
   902  		seg.ValuePtr().refs++
   903  		return true
   904  	})
   905  }
   906  
   907  // DecRef implements memmap.File.DecRef.
   908  func (f *MemoryFile) DecRef(fr memmap.FileRange) {
   909  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   910  		panic(fmt.Sprintf("invalid range: %v", fr))
   911  	}
   912  
   913  	var freed bool
   914  
   915  	f.mu.Lock()
   916  	defer f.mu.Unlock()
   917  
   918  	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
   919  		val := seg.ValuePtr()
   920  		if val.refs == 0 {
   921  			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
   922  		}
   923  		val.refs--
   924  		if val.refs == 0 {
   925  			f.reclaim.InsertRange(seg.Range(), reclaimSetValue{})
   926  			freed = true
   927  			// Reclassify memory as System, until it's freed by the reclaim
   928  			// goroutine.
   929  			if val.knownCommitted {
   930  				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID)
   931  			}
   932  			val.kind = usage.System
   933  		}
   934  		return true
   935  	})
   936  
   937  	if freed {
   938  		f.reclaimable = true
   939  		f.reclaimCond.Signal()
   940  	}
   941  }
   942  
   943  // MapInternal implements memmap.File.MapInternal.
   944  func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
   945  	if !fr.WellFormed() || fr.Length() == 0 {
   946  		panic(fmt.Sprintf("invalid range: %v", fr))
   947  	}
   948  	if at.Execute {
   949  		return safemem.BlockSeq{}, linuxerr.EACCES
   950  	}
   951  
   952  	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
   953  	if chunks == 1 {
   954  		// Avoid an unnecessary slice allocation.
   955  		var seq safemem.BlockSeq
   956  		err := f.forEachMappingSlice(fr, func(bs []byte) {
   957  			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
   958  		})
   959  		return seq, err
   960  	}
   961  	blocks := make([]safemem.Block, 0, chunks)
   962  	err := f.forEachMappingSlice(fr, func(bs []byte) {
   963  		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
   964  	})
   965  	return safemem.BlockSeqFromSlice(blocks), err
   966  }
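
// As a hedged sketch of filling an allocated range through MapInternal from
// outside pgalloc (buf and the error handling are illustrative):
//
//	dsts, err := mf.MapInternal(fr, hostarch.Write)
//	if err != nil {
//		return err
//	}
//	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
//	if _, err := safemem.CopySeq(dsts, src); err != nil {
//		return err
//	}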
   967  
   968  // forEachMappingSlice invokes fn on a sequence of byte slices that
   969  // collectively map all bytes in fr.
   970  func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
   971  	mappings := *f.mappings.Load()
   972  	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
   973  		chunk := int(chunkStart >> chunkShift)
   974  		m := atomic.LoadUintptr(&mappings[chunk])
   975  		if m == 0 {
   976  			var err error
   977  			mappings, m, err = f.getChunkMapping(chunk)
   978  			if err != nil {
   979  				return err
   980  			}
   981  		}
   982  		startOff := uint64(0)
   983  		if chunkStart < fr.Start {
   984  			startOff = fr.Start - chunkStart
   985  		}
   986  		endOff := uint64(chunkSize)
   987  		if chunkStart+chunkSize > fr.End {
   988  			endOff = fr.End - chunkStart
   989  		}
   990  		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
   991  	}
   992  	return nil
   993  }
   994  
   995  func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
   996  	f.mappingsMu.Lock()
   997  	defer f.mappingsMu.Unlock()
   998  	// Another thread may have replaced f.mappings altogether due to file
   999  	// expansion.
  1000  	mappings := *f.mappings.Load()
  1001  	// Another thread may have already mapped the chunk.
  1002  	if m := mappings[chunk]; m != 0 {
  1003  		return mappings, m, nil
  1004  	}
  1005  	m, _, errno := unix.Syscall6(
  1006  		unix.SYS_MMAP,
  1007  		0,
  1008  		chunkSize,
  1009  		unix.PROT_READ|unix.PROT_WRITE,
  1010  		unix.MAP_SHARED,
  1011  		f.file.Fd(),
  1012  		uintptr(chunk<<chunkShift))
  1013  	if errno != 0 {
  1014  		return nil, 0, errno
  1015  	}
  1016  	atomic.StoreUintptr(&mappings[chunk], m)
  1017  	return mappings, m, nil
  1018  }
  1019  
  1020  // MarkEvictable allows f to request memory deallocation by calling
  1021  // user.Evict(er) in the future.
  1022  //
  1023  // Redundantly marking an already-evictable range as evictable has no effect.
  1024  func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
  1025  	f.mu.Lock()
  1026  	defer f.mu.Unlock()
  1027  	info, ok := f.evictable[user]
  1028  	if !ok {
  1029  		info = &evictableMemoryUserInfo{}
  1030  		f.evictable[user] = info
  1031  	}
  1032  	gap := info.ranges.LowerBoundGap(er.Start)
  1033  	for gap.Ok() && gap.Start() < er.End {
  1034  		gapER := gap.Range().Intersect(er)
  1035  		if gapER.Length() == 0 {
  1036  			gap = gap.NextGap()
  1037  			continue
  1038  		}
  1039  		gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
  1040  	}
  1041  	if !info.evicting {
  1042  		switch f.opts.DelayedEviction {
  1043  		case DelayedEvictionDisabled:
  1044  			// Kick off eviction immediately.
  1045  			f.startEvictionGoroutineLocked(user, info)
  1046  		case DelayedEvictionEnabled:
  1047  			if !f.opts.UseHostMemcgPressure {
  1048  				// Ensure that the reclaimer goroutine is running, so that it
  1049  				// can start eviction when necessary.
  1050  				f.reclaimCond.Signal()
  1051  			}
  1052  		}
  1053  	}
  1054  }
  1055  
  1056  // MarkUnevictable informs f that user no longer considers er to be evictable,
  1057  // so the MemoryFile should no longer call user.Evict(er). Note that, per
  1058  // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
  1059  // called even after MarkUnevictable returns due to race conditions, and
  1060  // implementations of EvictableMemoryUser must handle this possibility.
  1061  //
  1062  // Redundantly marking an already-unevictable range as unevictable has no
  1063  // effect.
  1064  func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
  1065  	f.mu.Lock()
  1066  	defer f.mu.Unlock()
  1067  	info, ok := f.evictable[user]
  1068  	if !ok {
  1069  		return
  1070  	}
  1071  	seg := info.ranges.LowerBoundSegment(er.Start)
  1072  	for seg.Ok() && seg.Start() < er.End {
  1073  		seg = info.ranges.Isolate(seg, er)
  1074  		seg = info.ranges.Remove(seg).NextSegment()
  1075  	}
  1076  	// We can only remove info if there's no eviction goroutine running on its
  1077  	// behalf.
  1078  	if !info.evicting && info.ranges.IsEmpty() {
  1079  		delete(f.evictable, user)
  1080  	}
  1081  }
  1082  
  1083  // MarkAllUnevictable informs f that user no longer considers any offsets to be
  1084  // evictable. It otherwise has the same semantics as MarkUnevictable.
  1085  func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
  1086  	f.mu.Lock()
  1087  	defer f.mu.Unlock()
  1088  	info, ok := f.evictable[user]
  1089  	if !ok {
  1090  		return
  1091  	}
  1092  	info.ranges.RemoveAll()
  1093  	// We can only remove info if there's no eviction goroutine running on its
  1094  	// behalf.
  1095  	if !info.evicting {
  1096  		delete(f.evictable, user)
  1097  	}
  1098  }
  1099  
  1100  // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
  1101  // evictable memory, such that it may be advantageous to cache data in
  1102  // evictable memory. The value returned by ShouldCacheEvictable may change
  1103  // between calls.
  1104  func (f *MemoryFile) ShouldCacheEvictable() bool {
  1105  	return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
  1106  }
  1107  
  1108  // UpdateUsage ensures that the memory usage statistics in
  1109  // usage.MemoryAccounting are up to date. If memCgIDs is nil, all the pages
  1110  // will be scanned. Otherwise, only the pages belonging to the memory cgroup
  1111  // IDs in memCgIDs are scanned and their usage updated.
  1112  func (f *MemoryFile) UpdateUsage(memCgIDs map[uint32]struct{}) error {
  1113  	f.mu.Lock()
  1114  	defer f.mu.Unlock()
  1115  
  1116  	// If the underlying usage matches what the usage tree already
  1117  	// represents, then we can just avoid the entire scan (we know it's
  1118  	// accurate).
  1119  	currentUsage, err := f.TotalUsage()
  1120  	if err != nil {
  1121  		return err
  1122  	}
  1123  	if currentUsage == f.usageExpected && f.usageSwapped == 0 {
  1124  		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
  1125  		return nil
  1126  	}
  1127  	// If the current usage matches the expected but there's swap
  1128  	// accounting, then ensure a scan takes place at least every second
  1129  	// (when requested).
  1130  	if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
  1131  		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
  1132  		return nil
  1133  	}
  1134  
  1135  	// Linux updates usage values at CONFIG_HZ.
  1136  	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
  1137  		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
  1138  		return nil
  1139  	}
  1140  
  1141  	if memCgIDs == nil {
  1142  		f.usageLast = time.Now()
  1143  	}
  1144  	err = f.updateUsageLocked(currentUsage, memCgIDs, mincore)
  1145  	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
  1146  		currentUsage, f.usageExpected, f.usageSwapped)
  1147  	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
  1148  	return err
  1149  }
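
// As a hedged usage sketch (someMemCgID is a hypothetical cgroup ID): to
// refresh accounting for a single memory cgroup, a caller might do:
//
//	ids := map[uint32]struct{}{someMemCgID: {}}
//	if err := mf.UpdateUsage(ids); err != nil {
//		return err
//	}
//
// Passing a nil map instead scans all pages and refreshes all cgroups.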
  1150  
  1151  // updateUsageLocked attempts to detect commitment of previously-uncommitted
  1152  // pages by invoking checkCommitted, which is a function that, for each page i
  1153  // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
  1154  //
  1155  // Precondition: f.mu must be held; it may be unlocked and reacquired.
  1156  // +checklocks:f.mu
  1157  func (f *MemoryFile) updateUsageLocked(currentUsage uint64, memCgIDs map[uint32]struct{}, checkCommitted func(bs []byte, committed []byte) error) error {
  1158  	// Track if anything changed to elide the merge. In the common case, we
  1159  	// expect all segments to be committed and no merge to occur.
  1160  	changedAny := false
  1161  	defer func() {
  1162  		if changedAny {
  1163  			f.usage.MergeAll()
  1164  		}
  1165  
  1166  		// Adjust the swap usage to reflect reality.
  1167  		if f.usageExpected < currentUsage {
  1168  			// Since no pages may be marked decommitted while we hold mu, we
  1169  			// know that usage may have only increased since we got the last
  1170  			// current usage. Therefore, if usageExpected is still short of
  1171  			// currentUsage, we must assume that the difference is in pages
  1172  			// that have been swapped.
  1173  			newUsageSwapped := currentUsage - f.usageExpected
  1174  			if f.usageSwapped < newUsageSwapped {
  1175  				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0)
  1176  			} else {
  1177  				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0)
  1178  			}
  1179  			f.usageSwapped = newUsageSwapped
  1180  		} else if f.usageSwapped != 0 {
  1181  			// We have more usage accounted for than the file itself.
  1182  			// That's fine, we probably caught a race where pages were
  1183  			// being committed while the below loop was running. Just
  1184  			// report the higher number that we found and ignore swap.
  1185  			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0)
  1186  			f.usageSwapped = 0
  1187  		}
  1188  	}()
  1189  
  1190  	// Reused mincore buffer, will generally be <= 4096 bytes.
  1191  	var buf []byte
  1192  
  1193  	// Iterate over all usage data. There will only be usage segments
  1194  	// present when there is an associated reference.
  1195  	for seg := f.usage.FirstSegment(); seg.Ok(); {
  1196  		if !seg.ValuePtr().canCommit() {
  1197  			seg = seg.NextSegment()
  1198  			continue
  1199  		}
  1200  
  1201  		// Scan the pages of the given memCgID only. This will avoid scanning the
  1202  		// whole memory file when the memory usage is required only for a specific
  1203  		// cgroup. The total memory usage of all cgroups can be obtained when
  1204  		// memCgIDs is nil.
  1205  		if memCgIDs != nil {
  1206  			if _, ok := memCgIDs[seg.ValuePtr().memCgID]; !ok {
  1207  				seg = seg.NextSegment()
  1208  				continue
  1209  			}
  1210  		}
  1211  
  1212  		// Get the range for this segment. As we touch slices, the
  1213  		// Start value will be walked along.
  1214  		r := seg.Range()
  1215  
  1216  		var checkErr error
  1217  		err := f.forEachMappingSlice(r,
  1218  			func(s []byte) {
  1219  				if checkErr != nil {
  1220  					return
  1221  				}
  1222  
  1223  				// Ensure that we have sufficient buffer for the call
  1224  				// (one byte per page). The length of each slice must
  1225  				// be page-aligned.
  1226  				bufLen := len(s) / hostarch.PageSize
  1227  				if len(buf) < bufLen {
  1228  					buf = make([]byte, bufLen)
  1229  				}
  1230  
  1231  				// Query for new pages in core.
  1232  				// NOTE(b/165896008): mincore (which is passed as checkCommitted
  1233  				// by f.UpdateUsage()) might take a really long time. So unlock f.mu
  1234  				// while checkCommitted runs.
  1235  				f.mu.Unlock() // +checklocksforce
  1236  				err := checkCommitted(s, buf)
  1237  				f.mu.Lock()
  1238  				if err != nil {
  1239  					checkErr = err
  1240  					return
  1241  				}
  1242  
  1243  				// Scan each page and switch out segments.
  1244  				seg := f.usage.LowerBoundSegment(r.Start)
  1245  				for i := 0; i < bufLen; {
  1246  					if buf[i]&0x1 == 0 {
  1247  						i++
  1248  						continue
  1249  					}
  1250  					// Scan to the end of this committed range.
  1251  					j := i + 1
  1252  					for ; j < bufLen; j++ {
  1253  						if buf[j]&0x1 == 0 {
  1254  							break
  1255  						}
  1256  					}
  1257  					committedFR := memmap.FileRange{
  1258  						Start: r.Start + uint64(i*hostarch.PageSize),
  1259  						End:   r.Start + uint64(j*hostarch.PageSize),
  1260  					}
  1261  					// Advance seg to committedFR.Start.
  1262  					for seg.Ok() && seg.End() < committedFR.Start {
  1263  						seg = seg.NextSegment()
  1264  					}
  1265  					// Mark pages overlapping committedFR as committed.
  1266  					for seg.Ok() && seg.Start() < committedFR.End {
  1267  						if seg.ValuePtr().canCommit() {
  1268  							seg = f.usage.Isolate(seg, committedFR)
  1269  							seg.ValuePtr().knownCommitted = true
  1270  							amount := seg.Range().Length()
  1271  							usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID)
  1272  							f.usageExpected += amount
  1273  							changedAny = true
  1274  						}
  1275  						seg = seg.NextSegment()
  1276  					}
  1277  					// Continue scanning for committed pages.
  1278  					i = j + 1
  1279  				}
  1280  
  1281  				// Advance r.Start.
  1282  				r.Start += uint64(len(s))
  1283  			})
  1284  		if checkErr != nil {
  1285  			return checkErr
  1286  		}
  1287  		if err != nil {
  1288  			return err
  1289  		}
  1290  
  1291  		// Continue with the first segment after r.End.
  1292  		seg = f.usage.LowerBoundSegment(r.End)
  1293  	}
  1294  
  1295  	return nil
  1296  }
  1297  
  1298  // TotalUsage returns an aggregate usage for all memory statistics except
  1299  // Mapped (which is external to MemoryFile). This is generally much cheaper
  1300  // than UpdateUsage, but will not provide a fine-grained breakdown.
  1301  func (f *MemoryFile) TotalUsage() (uint64, error) {
  1302  	// Stat the underlying file to discover the underlying usage. stat(2)
  1303  	// always reports the allocated block count in units of 512 bytes. This
  1304  	// includes pages in the page cache and swapped pages.
  1305  	var stat unix.Stat_t
  1306  	if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
  1307  		return 0, err
  1308  	}
  1309  	return uint64(stat.Blocks * 512), nil
  1310  }
  1311  
  1312  // TotalSize returns the current size of the backing file in bytes, which is an
  1313  // upper bound on the amount of memory that can currently be allocated from the
  1314  // MemoryFile. The value returned by TotalSize is permitted to change.
  1315  func (f *MemoryFile) TotalSize() uint64 {
  1316  	f.mu.Lock()
  1317  	defer f.mu.Unlock()
  1318  	return uint64(f.fileSize)
  1319  }
  1320  
  1321  // File returns the backing file.
  1322  func (f *MemoryFile) File() *os.File {
  1323  	return f.file
  1324  }
  1325  
  1326  // FD implements memmap.File.FD.
  1327  func (f *MemoryFile) FD() int {
  1328  	return int(f.file.Fd())
  1329  }
  1330  
  1331  // IsDiskBacked returns true if f is backed by a file on disk.
  1332  func (f *MemoryFile) IsDiskBacked() bool {
  1333  	return f.opts.DiskBackedFile
  1334  }
  1335  
  1336  // String implements fmt.Stringer.String.
  1337  //
  1338  // Note that because f.String locks f.mu, calling f.String internally
  1339  // (including indirectly through the fmt package) risks recursive locking.
  1340  // Within the pgalloc package, use f.usage directly instead.
  1341  func (f *MemoryFile) String() string {
  1342  	f.mu.Lock()
  1343  	defer f.mu.Unlock()
  1344  	return f.usage.String()
  1345  }
  1346  
  1347  // runReclaim implements the reclaimer goroutine, which continuously decommits
  1348  // reclaimable pages in order to reduce memory usage and make them available
  1349  // for allocation.
  1350  func (f *MemoryFile) runReclaim() {
  1351  	for {
  1352  		// N.B. We must call f.markReclaimed on the returned FileRange.
  1353  		fr, ok := f.findReclaimable()
  1354  		if !ok {
  1355  			break
  1356  		}
  1357  
  1358  		if f.opts.ManualZeroing {
  1359  			// If ManualZeroing is in effect, only hugepage-aligned regions may
  1360  			// be safely passed to decommitFile. Pages will be zeroed on
  1361  			// reallocation, so we don't need to perform any manual zeroing
  1362  			// here, whether or not decommitFile succeeds.
  1363  			if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok {
  1364  				if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr {
  1365  					decommitFR := memmap.FileRange{Start: uint64(startAddr), End: uint64(endAddr)}
  1366  					if err := f.decommitFile(decommitFR); err != nil {
  1367  						log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err)
  1368  					}
  1369  				}
  1370  			}
  1371  		} else {
  1372  			if err := f.decommitFile(fr); err != nil {
  1373  				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
  1374  				// Zero the pages manually. This won't reduce memory usage, but at
  1375  				// least ensures that the pages will be zero when reallocated.
  1376  				if err := f.manuallyZero(fr); err != nil {
  1377  					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
  1378  				}
  1379  			}
  1380  		}
  1381  		f.markDecommitted(fr)
  1382  		f.markReclaimed(fr)
  1383  	}
  1384  
  1385  	// We only get here if findReclaimable finds f.destroyed set and returns
  1386  	// false.
  1387  	f.mu.Lock()
  1388  	if !f.destroyed {
  1389  		f.mu.Unlock()
  1390  		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
  1391  	}
  1392  	if f.opts.DecommitOnDestroy && f.fileSize > 0 {
  1393  		if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil {
  1394  			f.mu.Unlock()
  1395  			panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err))
  1396  		}
  1397  	}
  1398  	f.file.Close()
  1399  	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
  1400  	// that has possibly been reassigned.
  1401  	f.file = nil
  1402  	f.mappingsMu.Lock()
  1403  	defer f.mappingsMu.Unlock()
  1404  	mappings := *f.mappings.Load()
  1405  	for i, m := range mappings {
  1406  		if m != 0 {
  1407  			_, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0)
  1408  			if errno != 0 {
  1409  				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
  1410  			}
  1411  		}
  1412  	}
  1413  	// Similarly, invalidate f.mappings so that future attempts to use them fail.
  1414  	f.mappings.Store(nil)
  1415  	f.mu.Unlock()
  1416  
  1417  	// This must be called without holding f.mu to avoid circular lock
  1418  	// ordering.
  1419  	if f.stopNotifyPressure != nil {
  1420  		f.stopNotifyPressure()
  1421  	}
  1422  }
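
// The ManualZeroing branch above only decommits the hugepage-aligned interior
// of a reclaimed range. A sketch of that trimming step in isolation
// (hypothetical helper; the name is illustrative):
func hugeAlignedInteriorExample(fr memmap.FileRange) (memmap.FileRange, bool) {
	startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp()
	if !ok {
		// fr.Start rounds up past the end of the address space.
		return memmap.FileRange{}, false
	}
	endAddr := hostarch.Addr(fr.End).HugeRoundDown()
	if startAddr >= endAddr {
		// No hugepage-aligned interior; nothing can be decommitted safely.
		return memmap.FileRange{}, false
	}
	return memmap.FileRange{Start: uint64(startAddr), End: uint64(endAddr)}, true
}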
  1423  
  1424  // findReclaimable finds memory that has been marked for reclaim.
  1425  //
  1426  // Note that the returned range has already been removed from the reclaim
  1427  // set; the caller must reclaim it (remove it from f.usage).
  1428  func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
  1429  	f.mu.Lock()
  1430  	defer f.mu.Unlock()
  1431  	for {
  1432  		for {
  1433  			if f.destroyed {
  1434  				return memmap.FileRange{}, false
  1435  			}
  1436  			if f.reclaimable {
  1437  				break
  1438  			}
  1439  			if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
  1440  				// No work to do. Evict any pending evictable allocations to
  1441  				// get more reclaimable pages before going to sleep.
  1442  				f.startEvictionsLocked()
  1443  			}
  1444  			f.reclaimCond.Wait()
  1445  		}
  1446  		// Most allocations are done bottom-up, the main exceptions being
  1447  		// stacks and some allocators that work top-down. Reclaiming in the
  1448  		// same (bottom-up) order minimizes the cost of the search.
  1449  		if seg := f.reclaim.FirstSegment(); seg.Ok() {
  1450  			fr := seg.Range()
  1451  			f.reclaim.Remove(seg)
  1452  			return fr, true
  1453  		}
  1454  		// Nothing is reclaimable.
  1455  		f.reclaimable = false
  1456  	}
  1457  }
  1458  
  1459  func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
  1460  	f.mu.Lock()
  1461  	defer f.mu.Unlock()
  1462  	seg := f.usage.FindSegment(fr.Start)
  1463  	// All of fr should be mapped to a single uncommitted reclaimable
  1464  	// segment accounted to System.
  1465  	if !seg.Ok() {
  1466  		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
  1467  	}
  1468  	if !seg.Range().IsSupersetOf(fr) {
  1469  		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
  1470  	}
  1471  	if got, want := seg.Value(), (usageInfo{
  1472  		kind:           usage.System,
  1473  		knownCommitted: false,
  1474  		refs:           0,
  1475  		memCgID:        0,
  1476  	}); got != want {
  1477  		panic(fmt.Sprintf("reclaimed pages %v in segment %v have incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
  1478  	}
  1479  	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
  1480  	// the caller of markReclaimed may not have decommitted it, so we can
  1481  	// only mark fr as reclaimed.
  1482  	f.usage.Remove(f.usage.Isolate(seg, fr))
  1483  }
  1484  
  1485  // StartEvictions requests that f evict all evictable allocations. It does not
  1486  // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
  1487  func (f *MemoryFile) StartEvictions() {
  1488  	f.mu.Lock()
  1489  	defer f.mu.Unlock()
  1490  	f.startEvictionsLocked()
  1491  }
  1492  
  1493  // Preconditions: f.mu must be locked.
  1494  func (f *MemoryFile) startEvictionsLocked() bool {
  1495  	startedAny := false
  1496  	for user, info := range f.evictable {
  1497  		// Don't start multiple goroutines to evict the same user's
  1498  		// allocations.
  1499  		if !info.evicting {
  1500  			f.startEvictionGoroutineLocked(user, info)
  1501  			startedAny = true
  1502  		}
  1503  	}
  1504  	return startedAny
  1505  }
  1506  
  1507  // Preconditions:
  1508  //   - info == f.evictable[user].
  1509  //   - !info.evicting.
  1510  //   - f.mu must be locked.
  1511  func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
  1512  	info.evicting = true
  1513  	f.evictionWG.Add(1)
  1514  	go func() { // S/R-SAFE: f.evictionWG
  1515  		defer f.evictionWG.Done()
  1516  		for {
  1517  			f.mu.Lock()
  1518  			info, ok := f.evictable[user]
  1519  			if !ok {
  1520  				// This shouldn't happen: only this goroutine is permitted
  1521  				// to delete this entry.
  1522  				f.mu.Unlock()
  1523  				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
  1524  			}
  1525  			if info.ranges.IsEmpty() {
  1526  				delete(f.evictable, user)
  1527  				f.mu.Unlock()
  1528  				return
  1529  			}
  1530  			// Evict from the end of info.ranges, under the assumption that
  1531  			// if ranges in user start being used again (and are
  1532  			// consequently marked unevictable), such uses are more likely
  1533  			// to start from the beginning of user.
  1534  			seg := info.ranges.LastSegment()
  1535  			er := seg.Range()
  1536  			info.ranges.Remove(seg)
  1537  			// user.Evict() must be called without holding f.mu to avoid
  1538  			// circular lock ordering.
  1539  			f.mu.Unlock()
  1540  			user.Evict(context.Background(), er)
  1541  		}
  1542  	}()
  1543  }
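
// Eviction calls back into the registered EvictableMemoryUser with f.mu
// released. A minimal sketch of an implementation (hypothetical type, for
// illustration only; real users such as tmpfs release the referenced pages
// from their Evict method):
type loggingEvictableUserExample struct{}

// Evict logs the evicted range rather than releasing any memory.
func (loggingEvictableUserExample) Evict(ctx context.Context, er EvictableRange) {
	log.Infof("evicting evictable range [%#x, %#x)", er.Start, er.End)
}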
  1544  
  1545  // WaitForEvictions blocks until f is no longer evicting any evictable
  1546  // allocations.
  1547  func (f *MemoryFile) WaitForEvictions() {
  1548  	f.evictionWG.Wait()
  1549  }
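
// StartEvictions returns as soon as the eviction goroutines have been kicked
// off; WaitForEvictions blocks until they drain. A hypothetical caller that
// wants a synchronous eviction pass (sketch only):
func evictAllSyncExample(f *MemoryFile) {
	f.StartEvictions()
	f.WaitForEvictions()
}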
  1550  
  1551  type usageSetFunctions struct{}
  1552  
  1553  func (usageSetFunctions) MinKey() uint64 {
  1554  	return 0
  1555  }
  1556  
  1557  func (usageSetFunctions) MaxKey() uint64 {
  1558  	return math.MaxUint64
  1559  }
  1560  
  1561  func (usageSetFunctions) ClearValue(val *usageInfo) {
  1562  }
  1563  
  1564  func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
  1565  	return val1, val1 == val2
  1566  }
  1567  
  1568  func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
  1569  	return val, val
  1570  }
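
// Merge above allows adjacent usage segments to coalesce only when their
// usageInfo values are identical. For illustration (hypothetical values, not
// part of the package's tests or API):
func usageMergeExample() bool {
	a := usageInfo{kind: usage.System, knownCommitted: true, refs: 1, memCgID: 0}
	b := usageInfo{kind: usage.System, knownCommitted: true, refs: 1, memCgID: 0}
	_, ok := usageSetFunctions{}.Merge(memmap.FileRange{}, a, memmap.FileRange{}, b)
	return ok // true; differing kind, refs, or memCgID would prevent merging.
}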
  1571  
  1572  // evictableRangeSetValue is the value type of evictableRangeSet.
  1573  type evictableRangeSetValue struct{}
  1574  
  1575  type evictableRangeSetFunctions struct{}
  1576  
  1577  func (evictableRangeSetFunctions) MinKey() uint64 {
  1578  	return 0
  1579  }
  1580  
  1581  func (evictableRangeSetFunctions) MaxKey() uint64 {
  1582  	return math.MaxUint64
  1583  }
  1584  
  1585  func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
  1586  }
  1587  
  1588  func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
  1589  	return evictableRangeSetValue{}, true
  1590  }
  1591  
  1592  func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
  1593  	return evictableRangeSetValue{}, evictableRangeSetValue{}
  1594  }
  1595  
  1596  // reclaimSetValue is the value type of reclaimSet.
  1597  type reclaimSetValue struct{}
  1598  
  1599  type reclaimSetFunctions struct{}
  1600  
  1601  func (reclaimSetFunctions) MinKey() uint64 {
  1602  	return 0
  1603  }
  1604  
  1605  func (reclaimSetFunctions) MaxKey() uint64 {
  1606  	return math.MaxUint64
  1607  }
  1608  
  1609  func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
  1610  }
  1611  
  1612  func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
  1613  	return reclaimSetValue{}, true
  1614  }
  1615  
  1616  func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
  1617  	return reclaimSetValue{}, reclaimSetValue{}
  1618  }