gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/pgalloc/pgalloc.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package pgalloc contains the page allocator subsystem, which manages memory
    16  // that may be mapped into application address spaces.
    17  //
    18  // Lock order:
    19  //
    20  //	 pgalloc.MemoryFile.mu
    21  //		pgalloc.MemoryFile.mappingsMu
    22  package pgalloc
    23  
    24  import (
    25  	"fmt"
    26  	"math"
    27  	"os"
    28  	"sync/atomic"
    29  	"time"
    30  
    31  	"golang.org/x/sys/unix"
    32  	"gvisor.dev/gvisor/pkg/abi/linux"
    33  	"gvisor.dev/gvisor/pkg/atomicbitops"
    34  	"gvisor.dev/gvisor/pkg/context"
    35  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    36  	"gvisor.dev/gvisor/pkg/hostarch"
    37  	"gvisor.dev/gvisor/pkg/log"
    38  	"gvisor.dev/gvisor/pkg/safemem"
    39  	"gvisor.dev/gvisor/pkg/sentry/hostmm"
    40  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    41  	"gvisor.dev/gvisor/pkg/sentry/usage"
    42  	"gvisor.dev/gvisor/pkg/sync"
    43  )
    44  
    45  // Direction describes how to allocate offsets from MemoryFile.
    46  type Direction int
    47  
    48  const (
     49  	// BottomUp allocates offsets in increasing order.
    50  	BottomUp Direction = iota
     51  	// TopDown allocates offsets in decreasing order.
    52  	TopDown
    53  )
    54  
    55  // String implements fmt.Stringer.
    56  func (d Direction) String() string {
    57  	switch d {
    58  	case BottomUp:
    59  		return "up"
    60  	case TopDown:
    61  		return "down"
    62  	}
    63  	panic(fmt.Sprintf("invalid direction: %d", d))
    64  }
    65  
    66  // MemoryFile is a memmap.File whose pages may be allocated to arbitrary
    67  // users.
    68  type MemoryFile struct {
    69  	memmap.NoBufferedIOFallback
    70  
    71  	// opts holds options passed to NewMemoryFile. opts is immutable.
    72  	opts MemoryFileOpts
    73  
    74  	// MemoryFile owns a single backing file, which is modeled as follows:
    75  	//
    76  	// Each page in the file can be committed or uncommitted. A page is
    77  	// committed if the host kernel is spending resources to store its contents
    78  	// and uncommitted otherwise. This definition includes pages that the host
    79  	// kernel has swapped; this is intentional, to ensure that accounting does
    80  	// not change even if host kernel swapping behavior changes, and that
    81  	// memory used by pseudo-swap mechanisms like zswap is still accounted.
    82  	//
    83  	// The initial contents of uncommitted pages are implicitly zero bytes. A
    84  	// read or write to the contents of an uncommitted page causes it to be
     85  	// committed. This is the only event that can cause an uncommitted page to
    86  	// be committed.
    87  	//
    88  	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
    89  	// pages to be uncommitted. This is the only event that can cause a
    90  	// committed page to be uncommitted.
    91  	//
    92  	// Memory accounting is based on identifying the set of committed pages.
    93  	// Since we do not have direct access to the MMU, tracking reads and writes
    94  	// to uncommitted pages to detect commitment would introduce additional
    95  	// page faults, which would be prohibitively expensive. Instead, we query
    96  	// the host kernel to determine which pages are committed.
    97  
    98  	// file is the backing file. The file pointer is immutable.
    99  	file *os.File
   100  
   101  	mu memoryFileMutex
   102  
   103  	// usage maps each page in the file to metadata for that page. Pages for
   104  	// which no segment exists in usage are both unallocated (not in use) and
   105  	// uncommitted.
   106  	//
   107  	// Since usage stores usageInfo objects by value, clients should usually
   108  	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
   109  	// pointer to the usageInfo rather than a copy.
   110  	//
   111  	// usage must be kept maximally merged (that is, there should never be two
   112  	// adjacent segments with the same values). At least markReclaimed depends
   113  	// on this property.
   114  	//
   115  	// usage is protected by mu.
   116  	usage usageSet
   117  
   118  	// The UpdateUsage function scans all segments with knownCommitted set
    119  	// to false, determines which pages are committed, and creates corresponding
   120  	// segments with knownCommitted set to true.
   121  	//
    122  	// In order to avoid unnecessary scans, usageExpected tracks the expected
    123  	// total file blocks. The scan is elided when this matches the blocks
    124  	// actually used by the underlying file.
   125  	//
   126  	// To track swapped pages, usageSwapped tracks the discrepancy between
   127  	// what is observed in core and what is reported by the file. When
   128  	// usageSwapped is non-zero, a sweep will be performed at least every
   129  	// second. The start of the last sweep is recorded in usageLast.
   130  	//
    131  	// All usage attributes are protected by mu.
   132  	usageExpected uint64
   133  	usageSwapped  uint64
   134  	usageLast     time.Time
   135  
   136  	// fileSize is the size of the backing memory file in bytes. fileSize is
   137  	// always a power-of-two multiple of chunkSize.
   138  	//
   139  	// fileSize is protected by mu.
   140  	fileSize int64
   141  
   142  	// Pages from the backing file are mapped into the local address space on
   143  	// the granularity of large pieces called chunks. mappings is a []uintptr
   144  	// that stores, for each chunk, the start address of a mapping of that
   145  	// chunk in the current process' address space, or 0 if no such mapping
   146  	// exists. Once a chunk is mapped, it is never remapped or unmapped until
   147  	// the MemoryFile is destroyed.
   148  	//
   149  	// Mutating the mappings slice or its contents requires both holding
   150  	// mappingsMu and using atomic memory operations. (The slice is mutated
   151  	// whenever the file is expanded. Per the above, the only permitted
   152  	// mutation of the slice's contents is the assignment of a mapping to a
   153  	// chunk that was previously unmapped.) Reading the slice or its contents
   154  	// only requires *either* holding mappingsMu or using atomic memory
   155  	// operations. This allows MemoryFile.MapInternal to avoid locking in the
   156  	// common case where chunk mappings already exist.
   157  	mappingsMu mappingsMutex
   158  	mappings   atomic.Pointer[[]uintptr]
   159  
   160  	// destroyed is set by Destroy to instruct the reclaimer goroutine to
   161  	// release resources and exit. destroyed is protected by mu.
   162  	destroyed bool
   163  
   164  	// reclaimable is true if usage may contain reclaimable pages. reclaimable
   165  	// is protected by mu.
   166  	reclaimable bool
   167  
   168  	// reclaim is the collection of regions for reclaim. reclaim is protected
   169  	// by mu.
   170  	reclaim reclaimSet
   171  
   172  	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
   173  	// transitions from false to true.
   174  	reclaimCond sync.Cond
   175  
   176  	// evictable maps EvictableMemoryUsers to eviction state.
   177  	//
   178  	// evictable is protected by mu.
   179  	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo
   180  
   181  	// evictionWG counts the number of goroutines currently performing evictions.
   182  	evictionWG sync.WaitGroup
   183  
   184  	// stopNotifyPressure stops memory cgroup pressure level
   185  	// notifications used to drive eviction. stopNotifyPressure is
   186  	// immutable.
   187  	stopNotifyPressure func()
   188  
   189  	// savable is true if this MemoryFile will be saved via SaveTo() during
   190  	// the kernel's SaveTo operation. savable is protected by mu.
   191  	savable bool
   192  }
   193  
   194  // MemoryFileOpts provides options to NewMemoryFile.
   195  type MemoryFileOpts struct {
   196  	// DelayedEviction controls the extent to which the MemoryFile may delay
   197  	// eviction of evictable allocations.
   198  	DelayedEviction DelayedEvictionType
   199  
   200  	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
   201  	// notifications to determine when eviction is necessary. This option has
   202  	// no effect unless DelayedEviction is DelayedEvictionEnabled.
   203  	UseHostMemcgPressure bool
   204  
   205  	// DecommitOnDestroy indicates whether the entire host file should be
   206  	// decommitted on destruction. This is appropriate for host filesystem based
   207  	// files that need to be explicitly cleaned up to release disk space.
   208  	DecommitOnDestroy bool
   209  
   210  	// If ManualZeroing is true, MemoryFile must not assume that new pages
    211  	// obtained from the host are zero-filled, and must instead manually
    212  	// zero newly-allocated pages.
   213  	ManualZeroing bool
   214  
   215  	// If DisableIMAWorkAround is true, NewMemoryFile will not call
   216  	// IMAWorkAroundForMemFile().
   217  	DisableIMAWorkAround bool
   218  
   219  	// DiskBackedFile indicates that the MemoryFile is backed by a file on disk.
   220  	DiskBackedFile bool
   221  
   222  	// RestoreID is an opaque string used to reassociate the MemoryFile with its
   223  	// replacement during restore.
   224  	RestoreID string
   225  
    226  	// EnforceMaximumAllocatable is a flag that governs whether the MemoryFile
    227  	// limits the total size of its allocations to
    228  	// usage.MaximumAllocatableBytes.
   229  	EnforceMaximumAllocatable bool
   230  }
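
         // An illustrative configuration sketch (these particular choices are
         // assumptions, not prescribed by this package): a disk-backed MemoryFile
         // whose backing file is hole-punched on destruction to release disk space.
         //
         //	opts := MemoryFileOpts{
         //		DelayedEviction:   DelayedEvictionEnabled,
         //		DiskBackedFile:    true,
         //		DecommitOnDestroy: true,
         //	}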
   231  
   232  // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
   233  type DelayedEvictionType int
   234  
   235  const (
   236  	// DelayedEvictionDefault has unspecified behavior.
   237  	DelayedEvictionDefault DelayedEvictionType = iota
   238  
    239  	// DelayedEvictionDisabled requires that evictable allocations be evicted
    240  	// as soon as possible.
   241  	DelayedEvictionDisabled
   242  
   243  	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
   244  	// evictable allocations until doing so is considered necessary to avoid
   245  	// performance degradation due to host memory pressure, or OOM kills.
   246  	//
   247  	// As of this writing, the behavior of DelayedEvictionEnabled depends on
   248  	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
   249  	//
   250  	//	- If UseHostMemcgPressure is true, evictions are delayed until memory
   251  	//		pressure is indicated.
   252  	//
   253  	//	- Otherwise, evictions are only delayed until the reclaimer goroutine
   254  	//		is out of work (pages to reclaim).
   255  	DelayedEvictionEnabled
   256  
    257  	// DelayedEvictionManual requires that evictable allocations be evicted
    258  	// only when MemoryFile.StartEvictions() is called. This is extremely
   259  	// dangerous outside of tests.
   260  	DelayedEvictionManual
   261  )
   262  
   263  // usageInfo tracks usage information.
   264  //
   265  // +stateify savable
   266  type usageInfo struct {
   267  	// kind is the usage kind.
   268  	kind usage.MemoryKind
   269  
   270  	// knownCommitted is true if the tracked region is definitely committed.
   271  	// (If it is false, the tracked region may or may not be committed.)
   272  	knownCommitted bool
   273  
   274  	refs uint64
   275  
   276  	// memCgID is the memory cgroup id to which this page is committed.
   277  	memCgID uint32
   278  }
   279  
   280  // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
   281  // may be asked to deallocate that memory in the presence of memory pressure.
   282  type EvictableMemoryUser interface {
   283  	// Evict requests that the EvictableMemoryUser deallocate memory used by
   284  	// er, which was registered as evictable by a previous call to
   285  	// MemoryFile.MarkEvictable.
   286  	//
   287  	// Evict is not required to deallocate memory. In particular, since pgalloc
   288  	// must call Evict without holding locks to avoid circular lock ordering,
   289  	// it is possible that the passed range has already been marked as
   290  	// unevictable by a racing call to MemoryFile.MarkUnevictable.
   291  	// Implementations of EvictableMemoryUser must detect such races and handle
   292  	// them by making Evict have no effect on unevictable ranges.
   293  	//
   294  	// After a call to Evict, the MemoryFile will consider the evicted range
   295  	// unevictable (i.e. it will not call Evict on the same range again) until
   296  	// informed otherwise by a subsequent call to MarkEvictable.
   297  	Evict(ctx context.Context, er EvictableRange)
   298  }
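
         // A hedged implementation sketch (hypothetical; the pageCache type and its
         // fields are assumptions, not part of this package) showing how a user can
         // tolerate the Evict/MarkUnevictable race described above: offsets that were
         // already unmarked are simply absent from the cache, so Evict has no effect
         // on them.
         //
         //	type pageCache struct {
         //		mu     sync.Mutex
         //		mf     *MemoryFile
         //		cached map[uint64]memmap.FileRange // keyed by evictable offset
         //	}
         //
         //	func (c *pageCache) Evict(ctx context.Context, er EvictableRange) {
         //		c.mu.Lock()
         //		defer c.mu.Unlock()
         //		for off := er.Start; off < er.End; off += hostarch.PageSize {
         //			if fr, ok := c.cached[off]; ok {
         //				c.mf.DecRef(fr)
         //				delete(c.cached, off)
         //			}
         //		}
         //	}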
   299  
   300  // An EvictableRange represents a range of uint64 offsets in an
   301  // EvictableMemoryUser.
   302  //
   303  // In practice, most EvictableMemoryUsers will probably be implementations of
   304  // memmap.Mappable, and EvictableRange therefore corresponds to
   305  // memmap.MappableRange. However, this package cannot depend on the memmap
   306  // package, since doing so would create a circular dependency.
   307  //
   308  // type EvictableRange <generated using go_generics>
   309  
   310  // evictableMemoryUserInfo is the value type of MemoryFile.evictable.
   311  type evictableMemoryUserInfo struct {
   312  	// ranges tracks all evictable ranges for the given user.
   313  	ranges evictableRangeSet
   314  
   315  	// If evicting is true, there is a goroutine currently evicting all
   316  	// evictable ranges for this user.
   317  	evicting bool
   318  }
   319  
   320  const (
   321  	chunkShift = 30
   322  	chunkSize  = 1 << chunkShift // 1 GB
   323  	chunkMask  = chunkSize - 1
   324  
   325  	// maxPage is the highest 64-bit page.
   326  	maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
   327  )
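
         // As a concrete illustration of the chunk arithmetic (an added example, not
         // part of the original file): for a file offset off, the containing chunk
         // index and the offset within that chunk are
         //
         //	chunk := off >> chunkShift  // e.g. off = 0x6000_0000 gives chunk 1
         //	within := off & chunkMask   // e.g. off = 0x6000_0000 gives 0x2000_0000
         //
         // so chunk 1 covers file offsets [1 GiB, 2 GiB).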
   328  
   329  // NewMemoryFile creates a MemoryFile backed by the given file. If
   330  // NewMemoryFile succeeds, ownership of file is transferred to the returned
   331  // MemoryFile.
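         //
         // A minimal usage sketch (assumed typical caller-side code, not taken from
         // this file) backs the MemoryFile with an anonymous memfd:
         //
         //	fd, err := unix.MemfdCreate("memory-file", 0)
         //	if err != nil {
         //		return err
         //	}
         //	mf, err := NewMemoryFile(os.NewFile(uintptr(fd), "memory-file"), MemoryFileOpts{})
         //	if err != nil {
         //		return err
         //	}
         //	defer mf.Destroy()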
   332  func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
   333  	switch opts.DelayedEviction {
   334  	case DelayedEvictionDefault:
   335  		opts.DelayedEviction = DelayedEvictionEnabled
   336  	case DelayedEvictionDisabled, DelayedEvictionManual:
   337  		opts.UseHostMemcgPressure = false
   338  	case DelayedEvictionEnabled:
   339  		// ok
   340  	default:
   341  		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
   342  	}
   343  
   344  	// Truncate the file to 0 bytes first to ensure that it's empty.
   345  	if err := file.Truncate(0); err != nil {
   346  		return nil, err
   347  	}
   348  	f := &MemoryFile{
   349  		opts:      opts,
   350  		file:      file,
   351  		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
   352  	}
   353  	f.mappings.Store(&[]uintptr{})
   354  	f.reclaimCond.L = &f.mu
   355  
   356  	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
   357  		stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
   358  			f.mu.Lock()
   359  			startedAny := f.startEvictionsLocked()
   360  			f.mu.Unlock()
   361  			if startedAny {
   362  				log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
   363  			}
   364  		}, "low")
   365  		if err != nil {
   366  			return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
   367  		}
   368  		f.stopNotifyPressure = stop
   369  	}
   370  
   371  	go f.runReclaim() // S/R-SAFE: f.mu
   372  
   373  	if !opts.DisableIMAWorkAround {
   374  		IMAWorkAroundForMemFile(file.Fd())
   375  	}
   376  	return f, nil
   377  }
   378  
   379  // IMAWorkAroundForMemFile works around IMA by immediately creating a temporary
   380  // PROT_EXEC mapping, while the backing file is still small. IMA will ignore
   381  // any future mappings.
   382  //
   383  // The Linux kernel contains an optional feature called "Integrity
   384  // Measurement Architecture" (IMA). If IMA is enabled, it will checksum
   385  // binaries the first time they are mapped PROT_EXEC. This is bad news for
   386  // executable pages mapped from our backing file, which can grow to
   387  // terabytes in (sparse) size. If IMA attempts to checksum a file that
   388  // large, it will allocate all of the sparse pages and quickly exhaust all
   389  // memory.
   390  func IMAWorkAroundForMemFile(fd uintptr) {
   391  	m, _, errno := unix.Syscall6(
   392  		unix.SYS_MMAP,
   393  		0,
   394  		hostarch.PageSize,
   395  		unix.PROT_EXEC,
   396  		unix.MAP_SHARED,
   397  		fd,
   398  		0)
   399  	if errno != 0 {
   400  		// This isn't fatal (IMA may not even be in use). Log the error, but
   401  		// don't return it.
   402  		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
   403  	} else {
   404  		if _, _, errno := unix.Syscall(
   405  			unix.SYS_MUNMAP,
   406  			m,
   407  			hostarch.PageSize,
   408  			0); errno != 0 {
   409  			panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
   410  		}
   411  	}
   412  }
   413  
   414  // Destroy releases all resources used by f.
   415  //
   416  // Preconditions: All pages allocated by f have been freed.
   417  //
   418  // Postconditions: None of f's methods may be called after Destroy.
   419  func (f *MemoryFile) Destroy() {
   420  	f.mu.Lock()
   421  	defer f.mu.Unlock()
   422  	f.destroyed = true
   423  	f.reclaimCond.Signal()
   424  }
   425  
   426  // AllocationMode provides a way to inform the pgalloc API how to allocate
   427  // memory and pages on the host.
   428  // A page will exist in one of the following incremental states:
   429  //  1. Allocated: A page is allocated if it was returned by Allocate() and its
   430  //     reference count hasn't dropped to 0 since then.
   431  //  2. Committed: As described in MemoryFile documentation above, a page is
   432  //     committed if the host kernel is spending resources to store its
   433  //     contents. A committed page is implicitly allocated.
   434  //  3. Populated: A page is populated for reading/writing in a page table
   435  //     hierarchy if it has a page table entry that permits reading/writing
   436  //     respectively. A populated page is implicitly committed, since the page
   437  //     table entry needs a physical page to point to, but not vice versa.
   438  type AllocationMode int
   439  
   440  const (
    441  	// AllocateOnly indicates that pages need only be allocated.
   442  	AllocateOnly AllocationMode = iota
   443  	// AllocateAndCommit indicates that pages need to be committed, in addition
   444  	// to being allocated.
   445  	AllocateAndCommit
   446  	// AllocateAndWritePopulate indicates that writable pages should ideally be
   447  	// populated in the page table, in addition to being allocated. This is a
   448  	// suggestion, not a requirement.
   449  	AllocateAndWritePopulate
   450  )
   451  
   452  // AllocOpts are options used in MemoryFile.Allocate.
   453  type AllocOpts struct {
   454  	// Kind is the memory kind to be used for accounting.
   455  	Kind usage.MemoryKind
   456  	// Dir indicates the direction in which offsets are allocated.
   457  	Dir Direction
    458  	// MemCgID is the memory cgroup ID; a zero value indicates that
   459  	// the memory will not be accounted to any cgroup.
   460  	MemCgID uint32
    461  	// Mode allows callers to select how the pages are allocated in the
   462  	// MemoryFile. Callers that will fill the allocated memory by writing to it
   463  	// should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers
   464  	// that will fill the allocated memory by invoking host system calls should
   465  	// pass AllocateOnly.
   466  	Mode AllocationMode
   467  	// If ReaderFunc is provided, the allocated memory is filled by calling it
   468  	// repeatedly until either length bytes are read or a non-nil error is
    469  	// returned. Allocate then returns the allocated memory, truncated down to
    470  	// the nearest page. If this is shorter than length bytes due to an error
    471  	// returned by ReaderFunc, Allocate returns the partially filled fr and the error.
   472  	ReaderFunc safemem.ReaderFunc
   473  }
   474  
   475  // Allocate returns a range of initially-zeroed pages of the given length with
   476  // the given accounting kind and a single reference held by the caller. When
   477  // the last reference on an allocated page is released, ownership of the page
   478  // is returned to the MemoryFile, allowing it to be returned by a future call
   479  // to Allocate.
   480  //
   481  // Preconditions: length must be page-aligned and non-zero.
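         //
         // A hedged caller-side sketch (an assumption about typical use, not taken
         // from this file): allocate anonymous memory that the caller will fill by
         // writing to it.
         //
         //	fr, err := mf.Allocate(length, AllocOpts{
         //		Kind: usage.Anonymous,
         //		Dir:  TopDown,
         //		Mode: AllocateAndWritePopulate,
         //	})
         //	if err != nil {
         //		return err
         //	}
         //	// ... use fr ...
         //	mf.DecRef(fr) // drop the caller's reference when done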
   482  func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) {
   483  	fr, err := f.allocate(length, &opts)
   484  	if err != nil {
   485  		return memmap.FileRange{}, err
   486  	}
   487  	var dsts safemem.BlockSeq
   488  	switch opts.Mode {
   489  	case AllocateOnly: // Allocation is handled above. Nothing more to do.
   490  	case AllocateAndCommit:
   491  		if err := f.commitFile(fr); err != nil {
   492  			f.DecRef(fr)
   493  			return memmap.FileRange{}, err
   494  		}
   495  	case AllocateAndWritePopulate:
   496  		dsts, err = f.MapInternal(fr, hostarch.Write)
   497  		if err != nil {
   498  			f.DecRef(fr)
   499  			return memmap.FileRange{}, err
   500  		}
   501  		if canPopulate() {
   502  			rem := dsts
   503  			for {
   504  				if !tryPopulate(rem.Head()) {
   505  					break
   506  				}
   507  				rem = rem.Tail()
   508  				if rem.IsEmpty() {
   509  					break
   510  				}
   511  			}
   512  		}
   513  	default:
   514  		panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode))
   515  	}
   516  	if opts.ReaderFunc != nil {
   517  		if dsts.IsEmpty() {
   518  			dsts, err = f.MapInternal(fr, hostarch.Write)
   519  			if err != nil {
   520  				f.DecRef(fr)
   521  				return memmap.FileRange{}, err
   522  			}
   523  		}
   524  		n, err := safemem.ReadFullToBlocks(opts.ReaderFunc, dsts)
   525  		un := uint64(hostarch.Addr(n).RoundDown())
   526  		if un < length {
   527  			// Free unused memory and update fr to contain only the memory that is
   528  			// still allocated.
   529  			f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
   530  			fr.End = fr.Start + un
   531  		}
   532  		if err != nil {
   533  			return fr, err
   534  		}
   535  	}
   536  	return fr, nil
   537  }
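
         // An illustrative sketch of AllocOpts.ReaderFunc (assumed caller-side code,
         // not from this file): fill the newly allocated pages from an io.Reader by
         // adapting it with safemem.FromIOReader. The fill writes through an internal
         // mapping, so AllocateAndWritePopulate is used.
         //
         //	fr, err := mf.Allocate(length, AllocOpts{
         //		Kind:       usage.Anonymous,
         //		Mode:       AllocateAndWritePopulate,
         //		ReaderFunc: safemem.FromIOReader{Reader: r}.ReadToBlocks,
         //	})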
   538  
   539  func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) {
   540  	if length == 0 || length%hostarch.PageSize != 0 {
   541  		panic(fmt.Sprintf("invalid allocation length: %#x", length))
   542  	}
   543  
   544  	f.mu.Lock()
   545  	defer f.mu.Unlock()
   546  
   547  	if !f.hasSpaceToAllocate(length) {
   548  		log.Debugf("Enforcing memory limit on allocation of size %d, max is %d, already have %d", length, usage.MaximumAllocatableBytes, f.usageExpected)
   549  		return memmap.FileRange{}, linuxerr.ENOMEM
   550  	}
   551  
   552  	// Align hugepage-and-larger allocations on hugepage boundaries to try
   553  	// to take advantage of hugetmpfs.
   554  	alignment := uint64(hostarch.PageSize)
   555  	if length >= hostarch.HugePageSize {
   556  		alignment = hostarch.HugePageSize
   557  	}
   558  
   559  	// Find a range in the underlying file.
   560  	fr, ok := f.findAvailableRange(length, alignment, opts.Dir)
   561  	if !ok {
   562  		return memmap.FileRange{}, linuxerr.ENOMEM
   563  	}
   564  
   565  	// Expand the file if needed.
   566  	if int64(fr.End) > f.fileSize {
   567  		// Round the new file size up to be chunk-aligned.
   568  		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
   569  		if err := f.file.Truncate(newFileSize); err != nil {
   570  			return memmap.FileRange{}, err
   571  		}
   572  		f.fileSize = newFileSize
   573  		f.mappingsMu.Lock()
   574  		oldMappings := *f.mappings.Load()
   575  		newMappings := make([]uintptr, newFileSize>>chunkShift)
   576  		copy(newMappings, oldMappings)
   577  		f.mappings.Store(&newMappings)
   578  		f.mappingsMu.Unlock()
   579  	}
   580  
   581  	if f.opts.ManualZeroing {
   582  		if err := f.manuallyZero(fr); err != nil {
   583  			return memmap.FileRange{}, err
   584  		}
   585  	}
   586  	// Mark selected pages as in use.
   587  	f.usage.InsertRange(fr, usageInfo{
   588  		kind:    opts.Kind,
   589  		refs:    1,
   590  		memCgID: opts.MemCgID,
   591  	})
   592  
   593  	return fr, nil
   594  }
   595  
   596  func (f *MemoryFile) hasSpaceToAllocate(length uint64) bool {
   597  	if f.opts.EnforceMaximumAllocatable && usage.MaximumAllocatableBytes != 0 && ((f.usageExpected+length) > usage.MaximumAllocatableBytes || (f.usageExpected+length) < f.usageExpected) {
   598  		// f.usageExpected is not guaranteed to be correct because it is
   599  		// updated only when f.UpdateUsage is called periodically.
    600  		// To eliminate false positives, double-check against the exact
    601  		// measure; we care less about false negatives, and tolerating them
    602  		// avoids a host syscall via f.TotalUsage in the happy path.
   603  		exactUsage, err := f.TotalUsage()
   604  		if err != nil {
   605  			log.Warningf("Failed to fetch total usage for memory file: %v", err)
   606  			return false
   607  		}
   608  		if (exactUsage+length) > usage.MaximumAllocatableBytes || (exactUsage+length) < exactUsage {
   609  			return false
   610  		}
   611  	}
   612  	return true
   613  }
   614  
   615  // findAvailableRange returns an available range in the usageSet.
   616  //
    617  // Note that scanning for available slots starts at the end of the file and
    618  // proceeds backwards, then forwards. This heuristic has important
    619  // consequences for how sequential mappings can be merged in the host VMAs,
    620  // given that addresses for both application and sentry mappings are allocated
    621  // top-down (from higher to lower addresses). The file is also grown
    622  // exponentially in order to create space for mappings to be allocated downwards.
   623  //
   624  // Precondition: alignment must be a power of 2.
   625  func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) {
   626  	if dir == BottomUp {
   627  		return findAvailableRangeBottomUp(&f.usage, length, alignment)
   628  	}
   629  	return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment)
   630  }
   631  
   632  func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
   633  	alignmentMask := alignment - 1
   634  
   635  	// Search for space in existing gaps, starting at the current end of the
   636  	// file and working backward.
   637  	lastGap := usage.LastGap()
   638  	gap := lastGap
   639  	for {
   640  		end := gap.End()
   641  		if end > uint64(fileSize) {
   642  			end = uint64(fileSize)
   643  		}
   644  
   645  		// Try to allocate from the end of this gap, with the start of the
   646  		// allocated range aligned down to alignment.
   647  		unalignedStart := end - length
   648  		if unalignedStart > end {
   649  			// Negative overflow: this and all preceding gaps are too small to
   650  			// accommodate length.
   651  			break
   652  		}
   653  		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
   654  			return memmap.FileRange{start, start + length}, true
   655  		}
   656  
   657  		gap = gap.PrevLargeEnoughGap(length)
   658  		if !gap.Ok() {
   659  			break
   660  		}
   661  	}
   662  
   663  	// Check that it's possible to fit this allocation at the end of a file of any size.
   664  	min := lastGap.Start()
   665  	min = (min + alignmentMask) &^ alignmentMask
   666  	if min+length < min {
   667  		// Overflow: allocation would exceed the range of uint64.
   668  		return memmap.FileRange{}, false
   669  	}
   670  
   671  	// Determine the minimum file size required to fit this allocation at its end.
   672  	for {
   673  		newFileSize := 2 * fileSize
   674  		if newFileSize <= fileSize {
   675  			if fileSize != 0 {
   676  				// Overflow: allocation would exceed the range of int64.
   677  				return memmap.FileRange{}, false
   678  			}
   679  			newFileSize = chunkSize
   680  		}
   681  		fileSize = newFileSize
   682  
   683  		unalignedStart := uint64(fileSize) - length
   684  		if unalignedStart > uint64(fileSize) {
   685  			// Negative overflow: fileSize is still inadequate.
   686  			continue
   687  		}
   688  		if start := unalignedStart &^ alignmentMask; start >= min {
   689  			return memmap.FileRange{start, start + length}, true
   690  		}
   691  	}
   692  }
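
         // A worked example of the top-down alignment arithmetic above (illustrative
         // numbers only, not from the original file): with length = 0x3000 and
         // alignment = 0x200000 (2 MiB), a gap ending at end = 0x40000000 gives
         //
         //	unalignedStart = 0x40000000 - 0x3000    = 0x3fffd000
         //	start          = 0x3fffd000 &^ 0x1fffff = 0x3fe00000
         //
         // so the allocation would be placed at [0x3fe00000, 0x3fe03000), provided
         // the gap also starts at or below 0x3fe00000.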
   693  
   694  func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) {
   695  	alignmentMask := alignment - 1
   696  	for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) {
   697  		// Align the start address and check if allocation still fits in the gap.
   698  		start := (gap.Start() + alignmentMask) &^ alignmentMask
   699  
   700  		// File offsets are int64s. Since length must be strictly positive, end
   701  		// cannot legitimately be 0.
   702  		end := start + length
   703  		if end < start || int64(end) <= 0 {
   704  			return memmap.FileRange{}, false
   705  		}
   706  		if end <= gap.End() {
   707  			return memmap.FileRange{start, end}, true
   708  		}
   709  	}
   710  
   711  	// NextLargeEnoughGap should have returned a gap at the end.
   712  	panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length))
   713  }
   714  
   715  var mlockDisabled atomicbitops.Uint32
   716  var madvPopulateWriteDisabled atomicbitops.Uint32
   717  
   718  func canPopulate() bool {
   719  	return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0
   720  }
   721  
   722  func tryPopulateMadv(b safemem.Block) bool {
   723  	if madvPopulateWriteDisabled.Load() != 0 {
   724  		return false
   725  	}
   726  	start, ok := hostarch.Addr(b.Addr()).RoundUp()
   727  	if !ok {
   728  		return true
   729  	}
   730  	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown()
   731  	bLen := end - start
   732  	// Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated.
   733  	// 1 syscall overhead >= 1 page fault overhead. This is because syscalls are
   734  	// susceptible to additional overheads like seccomp-bpf filters and auditing.
   735  	if start >= end || bLen <= hostarch.PageSize {
   736  		return true
   737  	}
   738  	_, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE)
   739  	if errno != 0 {
   740  		if errno == unix.EINVAL {
   741  			// EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14).
   742  			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
   743  		} else {
   744  			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
   745  		}
   746  		madvPopulateWriteDisabled.Store(1)
   747  		return false
   748  	}
   749  	return true
   750  }
   751  
   752  func tryPopulateMlock(b safemem.Block) bool {
   753  	if mlockDisabled.Load() != 0 {
   754  		return false
   755  	}
   756  	// Call mlock to populate pages, then munlock to cancel the mlock (but keep
   757  	// the pages populated). Only do so for hugepage-aligned address ranges to
   758  	// ensure that splitting the VMA in mlock doesn't split any existing
   759  	// hugepages. This assumes that two host syscalls, plus the MM overhead of
   760  	// mlock + munlock, is faster on average than trapping for
   761  	// HugePageSize/PageSize small page faults.
   762  	start, ok := hostarch.Addr(b.Addr()).HugeRoundUp()
   763  	if !ok {
   764  		return true
   765  	}
   766  	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown()
   767  	if start >= end {
   768  		return true
   769  	}
   770  	_, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0)
   771  	unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0)
   772  	if errno != 0 {
   773  		if errno == unix.ENOMEM || errno == unix.EPERM {
   774  			// These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or
   775  			// hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively.
   776  			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
   777  		} else {
   778  			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
   779  		}
   780  		mlockDisabled.Store(1)
   781  		return false
   782  	}
   783  	return true
   784  }
   785  
   786  func tryPopulate(b safemem.Block) bool {
   787  	// There are two approaches for populating writable pages:
   788  	// 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate
   789  	//    (prefault) page tables writable, faulting in all pages in the range
    790  	//    just as if manually writing to each page".
   791  	// 2. Call mlock to populate pages, then munlock to cancel the mlock (but
   792  	//    keep the pages populated).
   793  	//
   794  	// Prefer the madvise(MADV_POPULATE_WRITE) approach because:
   795  	// - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach.
   796  	// - It is faster because it doesn't have to modify vmas like mlock does.
   797  	// - It works for disk-backed memory mappings too. The mlock approach doesn't
   798  	//   work for disk-backed filesystems (e.g. ext4). This is because
   799  	//   mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable
   800  	//   MAP_SHARED mappings. For memory-backed (shmem) files,
   801  	//   mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so
   802  	//   the page table entries populated by a read fault are writable. For
   803  	//   disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is
   804  	//   true, so the page table entries populated by a read fault are read-only.
   805  	if tryPopulateMadv(b) {
   806  		return true
   807  	}
   808  	return tryPopulateMlock(b)
   809  }
   810  
   811  // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
   812  const (
   813  	_FALLOC_FL_KEEP_SIZE  = 1
   814  	_FALLOC_FL_PUNCH_HOLE = 2
   815  )
   816  
   817  // Decommit releases resources associated with maintaining the contents of the
   818  // given pages. If Decommit succeeds, future accesses of the decommitted pages
   819  // will read zeroes.
   820  //
   821  // Preconditions: fr.Length() > 0.
   822  func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
   823  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   824  		panic(fmt.Sprintf("invalid range: %v", fr))
   825  	}
   826  
   827  	if f.opts.ManualZeroing {
   828  		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
   829  		// effect.
   830  		if err := f.manuallyZero(fr); err != nil {
   831  			return err
   832  		}
   833  	} else {
   834  		if err := f.decommitFile(fr); err != nil {
   835  			return err
   836  		}
   837  	}
   838  
   839  	f.markDecommitted(fr)
   840  	return nil
   841  }
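
         // An illustrative sketch of the decommit contract (assumed caller-side code,
         // not from this file): once Decommit succeeds, reads through an existing
         // mapping of the range observe zeroes.
         //
         //	fr, _ := mf.Allocate(hostarch.PageSize, AllocOpts{Kind: usage.Anonymous, Mode: AllocateAndWritePopulate})
         //	bs, _ := mf.MapInternal(fr, hostarch.Write)
         //	bs.Head().ToSlice()[0] = 1 // the first page is now committed
         //	_ = mf.Decommit(fr)        // subsequent reads of the page return zero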
   842  
   843  func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
   844  	return f.forEachMappingSlice(fr, func(bs []byte) {
   845  		clear(bs)
   846  	})
   847  }
   848  
   849  func (f *MemoryFile) commitFile(fr memmap.FileRange) error {
   850  	// "The default operation (i.e., mode is zero) of fallocate() allocates the
   851  	// disk space within the range specified by offset and len." - fallocate(2)
   852  	return unix.Fallocate(
   853  		int(f.file.Fd()),
   854  		0, // mode
   855  		int64(fr.Start),
   856  		int64(fr.Length()))
   857  }
   858  
   859  func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
   860  	// "After a successful call, subsequent reads from this range will
   861  	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
   862  	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
   863  	return unix.Fallocate(
   864  		int(f.file.Fd()),
   865  		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
   866  		int64(fr.Start),
   867  		int64(fr.Length()))
   868  }
   869  
   870  func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
   871  	f.mu.Lock()
   872  	defer f.mu.Unlock()
   873  	// Since we're changing the knownCommitted attribute, we need to merge
   874  	// across the entire range to ensure that the usage tree is minimal.
   875  	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
   876  		val := seg.ValuePtr()
   877  		if val.knownCommitted {
   878  			// Drop the usageExpected appropriately.
   879  			amount := seg.Range().Length()
   880  			usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID)
   881  			f.usageExpected -= amount
   882  			val.knownCommitted = false
   883  		}
   884  		val.memCgID = 0
   885  		return true
   886  	})
   887  }
   888  
   889  // HasUniqueRef returns true if all pages in the given range have exactly one
   890  // reference. A return value of false is inherently racy, but if the caller
   891  // holds a reference on the given range and is preventing other goroutines from
   892  // copying it, then a return value of true is not racy.
   893  //
   894  // Preconditions: At least one reference must be held on all pages in fr.
   895  func (f *MemoryFile) HasUniqueRef(fr memmap.FileRange) bool {
   896  	f.mu.Lock()
   897  	defer f.mu.Unlock()
   898  	hasUniqueRef := true
   899  	f.usage.VisitFullRange(fr, func(seg usageIterator) bool {
   900  		if seg.ValuePtr().refs != 1 {
   901  			hasUniqueRef = false
   902  			return false
   903  		}
   904  		return true
   905  	})
   906  	return hasUniqueRef
   907  }
   908  
   909  // IncRef implements memmap.File.IncRef.
   910  func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) {
   911  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   912  		panic(fmt.Sprintf("invalid range: %v", fr))
   913  	}
   914  
   915  	f.mu.Lock()
   916  	defer f.mu.Unlock()
   917  
   918  	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
   919  		seg.ValuePtr().refs++
   920  		return true
   921  	})
   922  }
   923  
   924  // DecRef implements memmap.File.DecRef.
   925  func (f *MemoryFile) DecRef(fr memmap.FileRange) {
   926  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   927  		panic(fmt.Sprintf("invalid range: %v", fr))
   928  	}
   929  
   930  	var freed bool
   931  
   932  	f.mu.Lock()
   933  	defer f.mu.Unlock()
   934  
   935  	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
   936  		val := seg.ValuePtr()
   937  		if val.refs == 0 {
   938  			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
   939  		}
   940  		val.refs--
   941  		if val.refs == 0 {
   942  			f.reclaim.InsertRange(seg.Range(), reclaimSetValue{})
   943  			freed = true
   944  			// Reclassify memory as System, until it's freed by the reclaim
   945  			// goroutine.
   946  			if val.knownCommitted {
   947  				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID)
   948  			}
   949  			val.kind = usage.System
   950  		}
   951  		return true
   952  	})
   953  
   954  	if freed {
   955  		f.reclaimable = true
   956  		f.reclaimCond.Signal()
   957  	}
   958  }
   959  
   960  // MapInternal implements memmap.File.MapInternal.
   961  func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
   962  	if !fr.WellFormed() || fr.Length() == 0 {
   963  		panic(fmt.Sprintf("invalid range: %v", fr))
   964  	}
   965  	if at.Execute {
   966  		return safemem.BlockSeq{}, linuxerr.EACCES
   967  	}
   968  
   969  	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
   970  	if chunks == 1 {
   971  		// Avoid an unnecessary slice allocation.
   972  		var seq safemem.BlockSeq
   973  		err := f.forEachMappingSlice(fr, func(bs []byte) {
   974  			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
   975  		})
   976  		return seq, err
   977  	}
   978  	blocks := make([]safemem.Block, 0, chunks)
   979  	err := f.forEachMappingSlice(fr, func(bs []byte) {
   980  		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
   981  	})
   982  	return safemem.BlockSeqFromSlice(blocks), err
   983  }
   984  
   985  // forEachMappingSlice invokes fn on a sequence of byte slices that
   986  // collectively map all bytes in fr.
   987  func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
   988  	mappings := *f.mappings.Load()
   989  	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
   990  		chunk := int(chunkStart >> chunkShift)
   991  		m := atomic.LoadUintptr(&mappings[chunk])
   992  		if m == 0 {
   993  			var err error
   994  			mappings, m, err = f.getChunkMapping(chunk)
   995  			if err != nil {
   996  				return err
   997  			}
   998  		}
   999  		startOff := uint64(0)
  1000  		if chunkStart < fr.Start {
  1001  			startOff = fr.Start - chunkStart
  1002  		}
  1003  		endOff := uint64(chunkSize)
  1004  		if chunkStart+chunkSize > fr.End {
  1005  			endOff = fr.End - chunkStart
  1006  		}
  1007  		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
  1008  	}
  1009  	return nil
  1010  }
  1011  
  1012  func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
  1013  	f.mappingsMu.Lock()
  1014  	defer f.mappingsMu.Unlock()
  1015  	// Another thread may have replaced f.mappings altogether due to file
  1016  	// expansion.
  1017  	mappings := *f.mappings.Load()
  1018  	// Another thread may have already mapped the chunk.
  1019  	if m := mappings[chunk]; m != 0 {
  1020  		return mappings, m, nil
  1021  	}
  1022  	m, _, errno := unix.Syscall6(
  1023  		unix.SYS_MMAP,
  1024  		0,
  1025  		chunkSize,
  1026  		unix.PROT_READ|unix.PROT_WRITE,
  1027  		unix.MAP_SHARED,
  1028  		f.file.Fd(),
  1029  		uintptr(chunk<<chunkShift))
  1030  	if errno != 0 {
  1031  		return nil, 0, errno
  1032  	}
  1033  	atomic.StoreUintptr(&mappings[chunk], m)
  1034  	return mappings, m, nil
  1035  }
  1036  
  1037  // MarkEvictable allows f to request memory deallocation by calling
  1038  // user.Evict(er) in the future.
  1039  //
  1040  // Redundantly marking an already-evictable range as evictable has no effect.
  1041  func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
  1042  	f.mu.Lock()
  1043  	defer f.mu.Unlock()
  1044  	info, ok := f.evictable[user]
  1045  	if !ok {
  1046  		info = &evictableMemoryUserInfo{}
  1047  		f.evictable[user] = info
  1048  	}
  1049  	gap := info.ranges.LowerBoundGap(er.Start)
  1050  	for gap.Ok() && gap.Start() < er.End {
  1051  		gapER := gap.Range().Intersect(er)
  1052  		if gapER.Length() == 0 {
  1053  			gap = gap.NextGap()
  1054  			continue
  1055  		}
  1056  		gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
  1057  	}
  1058  	if !info.evicting {
  1059  		switch f.opts.DelayedEviction {
  1060  		case DelayedEvictionDisabled:
  1061  			// Kick off eviction immediately.
  1062  			f.startEvictionGoroutineLocked(user, info)
  1063  		case DelayedEvictionEnabled:
  1064  			if !f.opts.UseHostMemcgPressure {
  1065  				// Ensure that the reclaimer goroutine is running, so that it
  1066  				// can start eviction when necessary.
  1067  				f.reclaimCond.Signal()
  1068  			}
  1069  		}
  1070  	}
  1071  }
  1072  
  1073  // MarkUnevictable informs f that user no longer considers er to be evictable,
  1074  // so the MemoryFile should no longer call user.Evict(er). Note that, per
  1075  // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
  1076  // called even after MarkUnevictable returns due to race conditions, and
  1077  // implementations of EvictableMemoryUser must handle this possibility.
  1078  //
  1079  // Redundantly marking an already-unevictable range as unevictable has no
  1080  // effect.
  1081  func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
  1082  	f.mu.Lock()
  1083  	defer f.mu.Unlock()
  1084  	info, ok := f.evictable[user]
  1085  	if !ok {
  1086  		return
  1087  	}
  1088  	seg := info.ranges.LowerBoundSegment(er.Start)
  1089  	for seg.Ok() && seg.Start() < er.End {
  1090  		seg = info.ranges.Isolate(seg, er)
  1091  		seg = info.ranges.Remove(seg).NextSegment()
  1092  	}
  1093  	// We can only remove info if there's no eviction goroutine running on its
  1094  	// behalf.
  1095  	if !info.evicting && info.ranges.IsEmpty() {
  1096  		delete(f.evictable, user)
  1097  	}
  1098  }
  1099  
  1100  // MarkAllUnevictable informs f that user no longer considers any offsets to be
  1101  // evictable. It otherwise has the same semantics as MarkUnevictable.
  1102  func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
  1103  	f.mu.Lock()
  1104  	defer f.mu.Unlock()
  1105  	info, ok := f.evictable[user]
  1106  	if !ok {
  1107  		return
  1108  	}
  1109  	info.ranges.RemoveAll()
  1110  	// We can only remove info if there's no eviction goroutine running on its
  1111  	// behalf.
  1112  	if !info.evicting {
  1113  		delete(f.evictable, user)
  1114  	}
  1115  }
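
         // A hedged lifecycle sketch tying the eviction hooks together (assumed
         // caller-side code; c is a hypothetical EvictableMemoryUser such as the
         // pageCache sketched above):
         //
         //	// After populating an offset that may be discarded under memory pressure:
         //	mf.MarkEvictable(c, EvictableRange{Start: off, End: off + hostarch.PageSize})
         //	// Before the cache needs the offset pinned again:
         //	mf.MarkUnevictable(c, EvictableRange{Start: off, End: off + hostarch.PageSize})
         //	// When the cache is destroyed:
         //	mf.MarkAllUnevictable(c)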
  1116  
  1117  // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
  1118  // evictable memory, such that it may be advantageous to cache data in
  1119  // evictable memory. The value returned by ShouldCacheEvictable may change
  1120  // between calls.
  1121  func (f *MemoryFile) ShouldCacheEvictable() bool {
  1122  	return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
  1123  }
  1124  
  1125  // UpdateUsage ensures that the memory usage statistics in
  1126  // usage.MemoryAccounting are up to date. If memCgIDs is nil, all the pages
   1127  // will be scanned. Otherwise, only the pages that belong to the memory cgroup
   1128  // IDs in memCgIDs will be scanned and their memory usage will be updated.
  1129  func (f *MemoryFile) UpdateUsage(memCgIDs map[uint32]struct{}) error {
  1130  	f.mu.Lock()
  1131  	defer f.mu.Unlock()
  1132  
   1133  	// If the underlying usage matches what the usage tree already
   1134  	// represents, then we can just avoid the entire scan (we know it's
  1135  	// accurate).
  1136  	currentUsage, err := f.TotalUsage()
  1137  	if err != nil {
  1138  		return err
  1139  	}
  1140  	if currentUsage == f.usageExpected && f.usageSwapped == 0 {
  1141  		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
  1142  		return nil
  1143  	}
  1144  	// If the current usage matches the expected but there's swap
  1145  	// accounting, then ensure a scan takes place at least every second
  1146  	// (when requested).
  1147  	if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
  1148  		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
  1149  		return nil
  1150  	}
  1151  
  1152  	// Linux updates usage values at CONFIG_HZ.
  1153  	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
  1154  		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
  1155  		return nil
  1156  	}
  1157  
  1158  	if memCgIDs == nil {
  1159  		f.usageLast = time.Now()
  1160  	}
  1161  	err = f.updateUsageLocked(currentUsage, memCgIDs, false /* alsoScanCommitted */, mincore)
  1162  	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
  1163  		currentUsage, f.usageExpected, f.usageSwapped)
  1164  	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
  1165  	return err
  1166  }
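
         // A hedged usage sketch (assumed caller-side code, not from this file):
         // refresh accounting for a single memory cgroup before reading totals from
         // usage.MemoryAccounting.
         //
         //	if err := mf.UpdateUsage(map[uint32]struct{}{memCgID: {}}); err != nil {
         //		return err
         //	}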
  1167  
  1168  // updateUsageLocked attempts to detect commitment of previously-uncommitted
  1169  // pages by invoking checkCommitted, and updates memory accounting to reflect
  1170  // newly-committed pages. If alsoScanCommitted is true, updateUsageLocked also
  1171  // attempts to detect decommitment of previously-committed pages; this is only
  1172  // used by save/restore, which optionally temporarily treats zeroed pages as
  1173  // decommitted in order to skip saving them.
  1174  //
  1175  // For each page i in bs, checkCommitted must set committed[i] to 1 if the page
  1176  // is committed and 0 otherwise. off is the offset at which bs begins.
  1177  // wasCommitted is true if the page was known-committed before the call to
  1178  // checkCommitted and false otherwise; wasCommitted can only be true if
  1179  // alsoScanCommitted is true.
  1180  //
  1181  // Precondition: f.mu must be held; it may be unlocked and reacquired.
  1182  // +checklocks:f.mu
  1183  func (f *MemoryFile) updateUsageLocked(currentUsage uint64, memCgIDs map[uint32]struct{}, alsoScanCommitted bool, checkCommitted func(bs []byte, committed []byte, off uint64, wasCommitted bool) error) error {
  1184  	// Track if anything changed to elide the merge. In the common case, we
  1185  	// expect all segments to be committed and no merge to occur.
  1186  	changedAny := false
  1187  	defer func() {
  1188  		if changedAny {
  1189  			f.usage.MergeAll()
  1190  		}
  1191  
  1192  		// Adjust the swap usage to reflect reality.
  1193  		if f.usageExpected < currentUsage {
  1194  			// Since no pages may be marked decommitted while we hold mu, we
  1195  			// know that usage may have only increased since we got the last
  1196  			// current usage. Therefore, if usageExpected is still short of
  1197  			// currentUsage, we must assume that the difference is in pages
  1198  			// that have been swapped.
  1199  			newUsageSwapped := currentUsage - f.usageExpected
  1200  			if f.usageSwapped < newUsageSwapped {
  1201  				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0)
  1202  			} else {
  1203  				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0)
  1204  			}
  1205  			f.usageSwapped = newUsageSwapped
  1206  		} else if f.usageSwapped != 0 {
  1207  			// We have more usage accounted for than the file itself.
  1208  			// That's fine, we probably caught a race where pages were
  1209  			// being committed while the below loop was running. Just
  1210  			// report the higher number that we found and ignore swap.
  1211  			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0)
  1212  			f.usageSwapped = 0
  1213  		}
  1214  	}()
  1215  
  1216  	// Reused mincore buffer, will generally be <= 4096 bytes.
  1217  	var buf []byte
  1218  
  1219  	// Iterate over all usage data. There will only be usage segments
  1220  	// present when there is an associated reference.
  1221  	for seg := f.usage.FirstSegment(); seg.Ok(); {
  1222  		if seg.ValuePtr().refs == 0 {
  1223  			// We assume that reclaimable pages (that aren't already known to
  1224  			// be committed) are not committed. This isn't necessarily true,
  1225  			// even after the reclaimer does Decommit(), because the kernel may
  1226  			// subsequently back the hugepage-sized region containing the
  1227  			// decommitted page with a hugepage. However, it's consistent with
  1228  			// our treatment of unallocated pages, which have the same
  1229  			// property.
  1230  			seg = seg.NextSegment()
  1231  			continue
  1232  		}
  1233  		wasCommitted := seg.ValuePtr().knownCommitted
  1234  		if !alsoScanCommitted && wasCommitted {
  1235  			seg = seg.NextSegment()
  1236  			continue
  1237  		}
  1238  
  1239  		// Scan the pages of the given memCgID only. This will avoid scanning the
  1240  		// whole memory file when the memory usage is required only for a specific
  1241  		// cgroup. The total memory usage of all cgroups can be obtained when the
  1242  		// memCgIDs is nil.
  1243  		if memCgIDs != nil {
  1244  			if _, ok := memCgIDs[seg.ValuePtr().memCgID]; !ok {
  1245  				seg = seg.NextSegment()
  1246  				continue
  1247  			}
  1248  		}
  1249  
  1250  		// Get the range for this segment. As we touch slices, the
  1251  		// Start value will be walked along.
  1252  		r := seg.Range()
  1253  
  1254  		var checkErr error
  1255  		err := f.forEachMappingSlice(r,
  1256  			func(s []byte) {
  1257  				if checkErr != nil {
  1258  					return
  1259  				}
  1260  
  1261  				// Ensure that we have sufficient buffer for the call
  1262  				// (one byte per page). The length of each slice must
  1263  				// be page-aligned.
  1264  				bufLen := len(s) / hostarch.PageSize
  1265  				if len(buf) < bufLen {
  1266  					buf = make([]byte, bufLen)
  1267  				}
  1268  
  1269  				// Query for new pages in core.
  1270  				// NOTE(b/165896008): mincore (which is passed as checkCommitted)
  1271  				// by f.UpdateUsage() might take a really long time. So unlock f.mu
  1272  				// while checkCommitted runs.
  1273  				f.mu.Unlock() // +checklocksforce
  1274  				err := checkCommitted(s, buf, r.Start, wasCommitted)
  1275  				f.mu.Lock()
  1276  				if err != nil {
  1277  					checkErr = err
  1278  					return
  1279  				}
  1280  
  1281  				// Scan each page and switch out segments. If wasCommitted is
  1282  				// false, then we are marking ranges that are now committed;
  1283  				// otherwise, we are marking ranges that are now uncommitted.
  1284  				unchangedVal := byte(0)
  1285  				if wasCommitted {
  1286  					unchangedVal = 1
  1287  				}
  1288  				seg := f.usage.LowerBoundSegment(r.Start)
  1289  				for i := 0; i < bufLen; {
  1290  					if buf[i]&0x1 == unchangedVal {
  1291  						i++
  1292  						continue
  1293  					}
  1294  					// Scan to the end of this changed range.
  1295  					j := i + 1
  1296  					for ; j < bufLen; j++ {
  1297  						if buf[j]&0x1 == unchangedVal {
  1298  							break
  1299  						}
  1300  					}
  1301  					changedFR := memmap.FileRange{
  1302  						Start: r.Start + uint64(i*hostarch.PageSize),
  1303  						End:   r.Start + uint64(j*hostarch.PageSize),
  1304  					}
  1305  					// Advance seg to changedFR.Start.
  1306  					for seg.Ok() && seg.End() <= changedFR.Start {
  1307  						seg = seg.NextSegment()
  1308  					}
  1309  					// Mark pages overlapping changedFR as committed or
  1310  					// decommitted.
  1311  					for seg.Ok() && seg.Start() < changedFR.End {
  1312  						if seg.ValuePtr().refs != 0 && seg.ValuePtr().knownCommitted == wasCommitted {
  1313  							seg = f.usage.Isolate(seg, changedFR)
  1314  							seg.ValuePtr().knownCommitted = !wasCommitted
  1315  							amount := seg.Range().Length()
  1316  							if wasCommitted {
  1317  								usage.MemoryAccounting.Dec(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID)
  1318  								f.usageExpected -= amount
  1319  							} else {
  1320  								usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID)
  1321  								f.usageExpected += amount
  1322  							}
  1323  							changedAny = true
  1324  						}
  1325  						seg = seg.NextSegment()
  1326  					}
  1327  					// Continue scanning for changed pages.
  1328  					i = j + 1
  1329  				}
  1330  
  1331  				// Advance r.Start.
  1332  				r.Start += uint64(len(s))
  1333  			})
  1334  		if checkErr != nil {
  1335  			return checkErr
  1336  		}
  1337  		if err != nil {
  1338  			return err
  1339  		}
  1340  
  1341  		// Continue with the first segment after r.End.
  1342  		seg = f.usage.LowerBoundSegment(r.End)
  1343  	}
  1344  
  1345  	return nil
  1346  }
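
        // The scan above depends on a checkCommitted callback that fills buf with
        // one byte per page of s, with bit 0 indicating whether the page is
        // committed. UpdateUsage passes mincore(2) for this purpose; the sketch
        // below is illustrative only (the parameter names and the exact signature
        // of the real callback may differ) and is not the implementation used by
        // this package.
        func exampleMincoreChecker(bs []byte, committed []byte, off uint64, wasCommitted bool) error {
        	// mincore(2) writes one byte per page of bs; bit 0 is set iff the
        	// page is resident in memory, i.e. committed in MemoryFile terms.
        	// off and wasCommitted are accepted only to match the callback's
        	// shape and are ignored here.
        	return unix.Mincore(bs, committed[:len(bs)/hostarch.PageSize])
        }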
  1347  
  1348  // TotalUsage returns an aggregate usage for all memory statistics except
  1349  // Mapped (which is external to MemoryFile). This is generally much cheaper
  1350  // than UpdateUsage, but will not provide a fine-grained breakdown.
  1351  func (f *MemoryFile) TotalUsage() (uint64, error) {
  1352  	// Stat the underlying file to discover its usage. stat(2) always
  1353  	// reports the allocated block count in units of 512 bytes. This
  1354  	// includes pages in the page cache and swapped pages.
  1355  	var stat unix.Stat_t
  1356  	if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
  1357  		return 0, err
  1358  	}
  1359  	return uint64(stat.Blocks * 512), nil
  1360  }
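
        // Illustrative only: since TotalUsage reports committed bytes while
        // TotalSize (below) reports the backing file's current size, a caller
        // could log both to gauge how much of the MemoryFile is resident.
        // logMemoryFileUsage is a hypothetical helper, not part of this
        // package's API.
        func logMemoryFileUsage(f *MemoryFile) {
        	committed, err := f.TotalUsage()
        	if err != nil {
        		log.Warningf("MemoryFile.TotalUsage failed: %v", err)
        		return
        	}
        	log.Infof("MemoryFile: %d of %d bytes committed", committed, f.TotalSize())
        }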
  1361  
  1362  // TotalSize returns the current size of the backing file in bytes, which is an
  1363  // upper bound on the amount of memory that can currently be allocated from the
  1364  // MemoryFile. The value returned by TotalSize is permitted to change.
  1365  func (f *MemoryFile) TotalSize() uint64 {
  1366  	f.mu.Lock()
  1367  	defer f.mu.Unlock()
  1368  	return uint64(f.fileSize)
  1369  }
  1370  
  1371  // File returns the backing file.
  1372  func (f *MemoryFile) File() *os.File {
  1373  	return f.file
  1374  }
  1375  
  1376  // FD implements memmap.File.FD.
  1377  func (f *MemoryFile) FD() int {
  1378  	return int(f.file.Fd())
  1379  }
  1380  
  1381  // IsDiskBacked returns true if f is backed by a file on disk.
  1382  func (f *MemoryFile) IsDiskBacked() bool {
  1383  	return f.opts.DiskBackedFile
  1384  }
  1385  
  1386  // String implements fmt.Stringer.String.
  1387  //
  1388  // Note that because f.String locks f.mu, calling f.String internally
  1389  // (including indirectly through the fmt package) risks recursive locking.
  1390  // Within the pgalloc package, use f.usage directly instead.
  1391  func (f *MemoryFile) String() string {
  1392  	f.mu.Lock()
  1393  	defer f.mu.Unlock()
  1394  	return f.usage.String()
  1395  }
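
        // Illustrative note on the recursive-locking caveat documented on String
        // above (sketch only, not code in this package):
        //
        //	f.mu.Lock()
        //	defer f.mu.Unlock()
        //	// Bad: fmt.Sprintf("%v", f) re-enters f.String, which tries to lock
        //	// f.mu again and self-deadlocks.
        //	// OK:  fmt.Sprintf("%v", &f.usage) formats the usage set without
        //	// re-acquiring f.mu.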
  1396  
  1397  // runReclaim implements the reclaimer goroutine, which continuously decommits
  1398  // reclaimable pages in order to reduce memory usage and make them available
  1399  // for allocation.
  1400  func (f *MemoryFile) runReclaim() {
  1401  	for {
  1402  		// N.B. We must call f.markReclaimed on the returned FileRange.
  1403  		fr, ok := f.findReclaimable()
  1404  		if !ok {
  1405  			break
  1406  		}
  1407  
  1408  		if f.opts.ManualZeroing {
  1409  			// If ManualZeroing is in effect, only hugepage-aligned regions may
  1410  			// be safely passed to decommitFile. Pages will be zeroed on
  1411  			// reallocation, so we don't need to perform any manual zeroing
  1412  			// here, whether or not decommitFile succeeds.
  1413  			if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok {
  1414  				if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr {
  1415  					decommitFR := memmap.FileRange{Start: uint64(startAddr), End: uint64(endAddr)}
  1416  					if err := f.decommitFile(decommitFR); err != nil {
  1417  						log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err)
  1418  					}
  1419  				}
  1420  			}
  1421  		} else {
  1422  			if err := f.decommitFile(fr); err != nil {
  1423  				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
  1424  				// Zero the pages manually. This won't reduce memory usage, but at
  1425  				// least ensures that the pages will be zero when reallocated.
  1426  				if err := f.manuallyZero(fr); err != nil {
  1427  					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
  1428  				}
  1429  			}
  1430  		}
  1431  		f.markDecommitted(fr)
  1432  		f.markReclaimed(fr)
  1433  	}
  1434  
  1435  	// We only get here if findReclaimable finds f.destroyed set and returns
  1436  	// false.
  1437  	f.mu.Lock()
  1438  	if !f.destroyed {
  1439  		f.mu.Unlock()
  1440  		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
  1441  	}
  1442  	if f.opts.DecommitOnDestroy && f.fileSize > 0 {
  1443  		if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil {
  1444  			f.mu.Unlock()
  1445  			panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err))
  1446  		}
  1447  	}
  1448  	f.file.Close()
  1449  	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
  1450  	// that has possibly been reassigned.
  1451  	f.file = nil
  1452  	f.mappingsMu.Lock()
  1453  	defer f.mappingsMu.Unlock()
  1454  	mappings := *f.mappings.Load()
  1455  	for i, m := range mappings {
  1456  		if m != 0 {
  1457  			_, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0)
  1458  			if errno != 0 {
  1459  				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
  1460  			}
  1461  		}
  1462  	}
  1463  	// Similarly, invalidate f.mappings.
  1464  	f.mappings.Store(nil)
  1465  	f.mu.Unlock()
  1466  
  1467  	// This must be called without holding f.mu to avoid circular lock
  1468  	// ordering.
  1469  	if f.stopNotifyPressure != nil {
  1470  		f.stopNotifyPressure()
  1471  	}
  1472  }
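
        // Illustrative only: how the ManualZeroing branch in runReclaim trims a
        // reclaimed range to its hugepage-aligned interior before decommitting,
        // assuming 2 MiB huge pages. The unaligned head and tail are skipped;
        // they stay committed and are zeroed on reallocation instead.
        //
        //	fr := memmap.FileRange{Start: 0x1ff000, End: 0x601000}
        //	startAddr, _ := hostarch.Addr(fr.Start).HugeRoundUp() // 0x200000
        //	endAddr := hostarch.Addr(fr.End).HugeRoundDown()      // 0x600000
        //	// Only [0x200000, 0x600000) is passed to decommitFile.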
  1473  
  1474  // findReclaimable finds memory that has been marked for reclaim.
  1475  //
  1476  // Note that the returned range will be removed from tracking. It
  1477  // must be reclaimed (removed from f.usage) at this point.
  1478  func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
  1479  	f.mu.Lock()
  1480  	defer f.mu.Unlock()
  1481  	for {
  1482  		for {
  1483  			if f.destroyed {
  1484  				return memmap.FileRange{}, false
  1485  			}
  1486  			if f.reclaimable {
  1487  				break
  1488  			}
  1489  			if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
  1490  				// No work to do. Evict any pending evictable allocations to
  1491  				// get more reclaimable pages before going to sleep.
  1492  				f.startEvictionsLocked()
  1493  			}
  1494  			f.reclaimCond.Wait()
  1495  		}
  1496  		// Most allocations are done upwards, with exceptions being stacks and some
  1497  		// allocators that allocate top-down. Reclaim preserves this order to
  1498  		// minimize the cost of the search.
  1499  		if seg := f.reclaim.FirstSegment(); seg.Ok() {
  1500  			fr := seg.Range()
  1501  			f.reclaim.Remove(seg)
  1502  			return fr, true
  1503  		}
  1504  		// Nothing is reclaimable.
  1505  		f.reclaimable = false
  1506  	}
  1507  }
  1508  
  1509  func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
  1510  	f.mu.Lock()
  1511  	defer f.mu.Unlock()
  1512  	seg := f.usage.FindSegment(fr.Start)
  1513  	// All of fr should be mapped to a single uncommitted reclaimable
  1514  	// segment accounted to System.
  1515  	if !seg.Ok() {
  1516  		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
  1517  	}
  1518  	if !seg.Range().IsSupersetOf(fr) {
  1519  		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
  1520  	}
  1521  	if got, want := seg.Value(), (usageInfo{
  1522  		kind:           usage.System,
  1523  		knownCommitted: false,
  1524  		refs:           0,
  1525  		memCgID:        0,
  1526  	}); got != want {
  1527  		panic(fmt.Sprintf("reclaimed pages %v in segment %v have incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
  1528  	}
  1529  	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
  1530  	// the caller of markReclaimed may not have decommitted it, so we can
  1531  	// only mark fr as reclaimed.
  1532  	f.usage.Remove(f.usage.Isolate(seg, fr))
  1533  }
  1534  
  1535  // StartEvictions requests that f evict all evictable allocations. It does not
  1536  // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
  1537  func (f *MemoryFile) StartEvictions() {
  1538  	f.mu.Lock()
  1539  	defer f.mu.Unlock()
  1540  	f.startEvictionsLocked()
  1541  }
  1542  
  1543  // Preconditions: f.mu must be locked.
  1544  func (f *MemoryFile) startEvictionsLocked() bool {
  1545  	startedAny := false
  1546  	for user, info := range f.evictable {
  1547  		// Don't start multiple goroutines to evict the same user's
  1548  		// allocations.
  1549  		if !info.evicting {
  1550  			f.startEvictionGoroutineLocked(user, info)
  1551  			startedAny = true
  1552  		}
  1553  	}
  1554  	return startedAny
  1555  }
  1556  
  1557  // Preconditions:
  1558  //   - info == f.evictable[user].
  1559  //   - !info.evicting.
  1560  //   - f.mu must be locked.
  1561  func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
  1562  	info.evicting = true
  1563  	f.evictionWG.Add(1)
  1564  	go func() { // S/R-SAFE: f.evictionWG
  1565  		defer f.evictionWG.Done()
  1566  		for {
  1567  			f.mu.Lock()
  1568  			info, ok := f.evictable[user]
  1569  			if !ok {
  1570  				// This shouldn't happen: only this goroutine is permitted
  1571  				// to delete this entry.
  1572  				f.mu.Unlock()
  1573  				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
  1574  			}
  1575  			if info.ranges.IsEmpty() {
  1576  				delete(f.evictable, user)
  1577  				f.mu.Unlock()
  1578  				return
  1579  			}
  1580  			// Evict from the end of info.ranges, under the assumption that
  1581  			// if ranges in user start being used again (and are
  1582  			// consequently marked unevictable), such uses are more likely
  1583  			// to start from the beginning of user.
  1584  			seg := info.ranges.LastSegment()
  1585  			er := seg.Range()
  1586  			info.ranges.Remove(seg)
  1587  			// user.Evict() must be called without holding f.mu to avoid
  1588  			// circular lock ordering.
  1589  			f.mu.Unlock()
  1590  			user.Evict(context.Background(), er)
  1591  		}
  1592  	}()
  1593  }
  1594  
  1595  // WaitForEvictions blocks until f is no longer evicting any evictable
  1596  // allocations.
  1597  func (f *MemoryFile) WaitForEvictions() {
  1598  	f.evictionWG.Wait()
  1599  }
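
        // Illustrative only: a caller that needs all evictable allocations to be
        // released before proceeding (for example, before saving the MemoryFile's
        // state) can pair the two calls:
        //
        //	f.StartEvictions()
        //	f.WaitForEvictions()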
  1600  
  1601  type usageSetFunctions struct{}
  1602  
  1603  func (usageSetFunctions) MinKey() uint64 {
  1604  	return 0
  1605  }
  1606  
  1607  func (usageSetFunctions) MaxKey() uint64 {
  1608  	return math.MaxUint64
  1609  }
  1610  
  1611  func (usageSetFunctions) ClearValue(val *usageInfo) {
  1612  }
  1613  
  1614  func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
  1615  	return val1, val1 == val2
  1616  }
  1617  
  1618  func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
  1619  	return val, val
  1620  }
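
        // Illustrative only: Merge above coalesces adjacent usage segments only
        // when their usageInfo values are identical. For example:
        //
        //	a := usageInfo{kind: usage.Anonymous, knownCommitted: true, refs: 1, memCgID: 2}
        //	b := a
        //	// Merge(_, a, _, b) returns (a, true), so the segments coalesce.
        //	b.knownCommitted = false
        //	// Merge now returns false, so the segments remain separate.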
  1621  
  1622  // evictableRangeSetValue is the value type of evictableRangeSet.
  1623  type evictableRangeSetValue struct{}
  1624  
  1625  type evictableRangeSetFunctions struct{}
  1626  
  1627  func (evictableRangeSetFunctions) MinKey() uint64 {
  1628  	return 0
  1629  }
  1630  
  1631  func (evictableRangeSetFunctions) MaxKey() uint64 {
  1632  	return math.MaxUint64
  1633  }
  1634  
  1635  func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
  1636  }
  1637  
  1638  func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
  1639  	return evictableRangeSetValue{}, true
  1640  }
  1641  
  1642  func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
  1643  	return evictableRangeSetValue{}, evictableRangeSetValue{}
  1644  }
  1645  
  1646  // reclaimSetValue is the value type of reclaimSet.
  1647  type reclaimSetValue struct{}
  1648  
  1649  type reclaimSetFunctions struct{}
  1650  
  1651  func (reclaimSetFunctions) MinKey() uint64 {
  1652  	return 0
  1653  }
  1654  
  1655  func (reclaimSetFunctions) MaxKey() uint64 {
  1656  	return math.MaxUint64
  1657  }
  1658  
  1659  func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
  1660  }
  1661  
  1662  func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
  1663  	return reclaimSetValue{}, true
  1664  }
  1665  
  1666  func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
  1667  	return reclaimSetValue{}, reclaimSetValue{}
  1668  }